In [1]:
DATA_PATH = "Data/"

In [2]:
# FASTA file reader
from collections import OrderedDict
from typing import Dict

NAME_SYMBOL = '>'


def parse_sequences(filename: str,
                    ordered: bool=False) -> Dict[str, str]:
    """
    Parses a FASTA-style text file of genome sequences into a dictionary.

    Every line that starts with NAME_SYMBOL opens a new record whose name is
    the remainder of that line; all lines up to the next header are
    concatenated into that record's sequence.

    Arguments:
      filename: str - The name of the file containing the genome info.
      ordered: bool - Set this to True if you want the result to be ordered.
    Returns:
      A dictionary mapping each record name to its full sequence string.
    Raises:
      KeyError - If a line appears before the first header line.
    """
    result = OrderedDict() if ordered else {}

    last_name = None
    with open(filename) as sequences:
        for line in sequences:
            # Remove only the line terminator. Slicing off the last character
            # unconditionally would eat a real symbol whenever the final line
            # of the file is not newline-terminated.
            line = line.rstrip('\n')
            if line.startswith(NAME_SYMBOL):
                last_name = line[len(NAME_SYMBOL):]
                result[last_name] = []
            else:
                result[last_name].append(line)

    # Collapse the per-record list of line fragments into one string.
    for name in result:
        result[name] = ''.join(result[name])

    return result

In [5]:
# numbers txt file reader
def parse_numbers(path):
    """
    Reads the first non-blank line of a text file as a list of integers.

    Blank (whitespace-only) lines are skipped. Nothing is returned
    explicitly (i.e. the result is None) when the file has no non-blank line.
    """
    with open(path, 'r') as handle:
        for raw_line in handle:
            tokens = raw_line.split()
            if tokens:
                # Only the first line that carries data is parsed.
                return [int(token) for token in tokens]

### Problem 11 - Mortal Fibonacci Rabbits 

Recall the definition of the Fibonacci numbers from “Rabbits and Recurrence Relations”, which followed the recurrence relation $F_{n}=F_{n−1}+F_{n−2}$ and assumed that each pair of rabbits reaches maturity in one month and produces a single pair of offspring (one male, one female) each subsequent month.

Our aim is to somehow modify this recurrence relation to achieve a dynamic programming solution in the case that all rabbits die out after a fixed number of months.

Given: Positive integers $n\leq100$ and $m\leq20$.

Return: The total number of pairs of rabbits that will remain after the $n$-th month if all rabbits live for $m$ months.

Sample Dataset

6 3

Sample Output

4


### Solution 11

In [6]:
def s11(n, m):
    """
    Mortal Fibonacci rabbits: number of rabbit pairs in month n when every
    pair lives for m months.

    The list index is the month number minus one, so the answer for month n
    is stored at index n - 1.
    """
    pairs = [1, 1]
    for month in range(2, n):
        # Plain Fibonacci growth before any deaths are taken into account.
        total = pairs[month - 1] + pairs[month - 2]
        if month == m:
            # The first death: the single founding pair.
            total -= 1
        elif month > m:
            # Subtract the pairs counted m + 1 entries back.
            total -= pairs[month - (m + 1)]
        pairs.append(total)
    return pairs[n - 1]

In [7]:
n, m = parse_numbers(DATA_PATH + 'rosalind_fibd.txt')
n, m

(99, 19)

In [8]:
s11(n, m)

218067459026532066738

### Problem 12 - Overlap Graphs

A graph whose nodes have all been labeled can be represented by an adjacency list, in which each row of the list contains the two node labels corresponding to a unique edge.

A directed graph (or digraph) is a graph containing directed edges, each of which has an orientation. That is, a directed edge is represented by an arrow instead of a line segment; the starting and ending nodes of an edge form its tail and head, respectively. The directed edge with tail $v$ and head $w$ is represented by $(v,w)$ (but not by $(w,v)$). A directed loop is a directed edge of the form $(v,v)$.

For a collection of strings and a positive integer $k$, the overlap graph for the strings is a directed graph $O_{k}$ in which each string is represented by a node, and string $s$ is connected to string $t$ with a directed edge when there is a length $k$ suffix of $s$ that matches a length $k$ prefix of $t$, as long as $s\neq t$; we demand $s\neq t$ to prevent directed loops in the overlap graph (although directed cycles may be present).

Given: A collection of DNA strings in FASTA format having total length at most 10 kbp.

Return: The adjacency list corresponding to $O_{3}$. You may return edges in any order.

### Solution 12

In [9]:
dic = {"Rosalind_0498":
"AAATAAA",
"Rosalind_2391":
"AAATTTT",
"Rosalind_2323":
"TTTTCCC",
"Rosalind_0442":
"AAATCCC",
"Rosalind_5013":
"GGGTGGG"}

In [10]:
def s12(fasta, k):
    """
    Prints the adjacency list of the overlap graph O_k.

    fasta maps a label to its string. A directed edge "tail head" is printed
    (one per line) whenever the last k characters of tail's string equal the
    first k characters of head's string and the two labels differ.
    """
    adjacency = [
        (tail, head)
        for tail in list(fasta.keys())
        for head in fasta
        if tail != head and fasta[tail][-k:] == fasta[head][:k]
    ]
    # Render every edge first, then print them one per line.
    rendered = [" ".join(edge) for edge in adjacency]
    for row in rendered:
        print(row)

In [11]:
s12(dic, 3)

Rosalind_0498 Rosalind_2391
Rosalind_0498 Rosalind_0442
Rosalind_2391 Rosalind_2323


In [12]:
FASTAfile = parse_sequences(DATA_PATH + 'rosalind_grph.txt')

In [13]:
s12(FASTAfile, 3)

Rosalind_2133 Rosalind_0551
Rosalind_9517 Rosalind_5383
Rosalind_9517 Rosalind_5123
Rosalind_8387 Rosalind_9238
Rosalind_8387 Rosalind_6394
Rosalind_2665 Rosalind_0356
Rosalind_2665 Rosalind_2672
Rosalind_5323 Rosalind_8497
Rosalind_1310 Rosalind_6278
Rosalind_7423 Rosalind_6278
Rosalind_6813 Rosalind_1310
Rosalind_6813 Rosalind_2311
Rosalind_6813 Rosalind_2202
Rosalind_6813 Rosalind_5542
Rosalind_6813 Rosalind_5532
Rosalind_0551 Rosalind_7460
Rosalind_3359 Rosalind_1575
Rosalind_3224 Rosalind_3026
Rosalind_1835 Rosalind_9096
Rosalind_1835 Rosalind_5724
Rosalind_1835 Rosalind_5609
Rosalind_1835 Rosalind_6755
Rosalind_7928 Rosalind_0356
Rosalind_7928 Rosalind_2672
Rosalind_4006 Rosalind_8100
Rosalind_4006 Rosalind_9303
Rosalind_4006 Rosalind_2752
Rosalind_1559 Rosalind_6555
Rosalind_1559 Rosalind_0679
Rosalind_1559 Rosalind_5499
Rosalind_1559 Rosalind_0151
Rosalind_6555 Rosalind_2665
Rosalind_6555 Rosalind_2092
Rosalind_6555 Rosalind_7784
Rosalind_7460 Rosalind_7423
Rosalind_7460 Rosali

### Problem 13 - Calculating Expected Offspring

For a random variable $X$ taking integer values between 1 and $n$, the expected value of $X$ is $E(X)=\sum_{k=1}^{n} k \times P_{r}(X=k)$. The expected value offers us a way of taking the long-term average of a random variable over a large number of trials.

As a motivating example, let $X$ be the number on a six-sided die. Over a large number of rolls, we should expect to obtain an average of 3.5 on the die (even though it's not possible to roll a 3.5). The formula for expected value confirms that $E(X)=\sum_{k=1}^{6} k \times P_{r}(X=k)=3.5$.

More generally, a random variable for which every one of a number of equally spaced outcomes has the same probability is called a uniform random variable (in the die example, this "equal spacing" is equal to 1). We can generalize our die example to find that if $X$ is a uniform random variable with minimum possible value a and maximum possible value $b$, then $E(X)=\frac{a + b}{2}$.  You may also wish to verify that for the dice example, if $Y$ is the random variable associated with the outcome of a second die roll, then $E(X+Y)=7$.

Given: Six nonnegative integers, each of which does not exceed 20,000. The integers correspond to the number of couples in a population possessing each genotype pairing for a given factor. In order, the six given integers represent the number of couples having the following genotypes:

AA-AA

AA-Aa

AA-aa

Aa-Aa

Aa-aa

aa-aa

Return: The expected number of offspring displaying the dominant phenotype in the next generation, under the assumption that every couple has exactly two offspring.

Sample Dataset

1 0 0 1 0 1

Sample Output

3.5


### Solution 13

In [14]:
#probabilities for a dominant phenotype
p = [1, 1, 1, 0.75, 0.5, 0]

In [15]:
gen = [1, 0, 0, 1, 0, 1,]

In [16]:
def s13(line, probabilities=(1, 1, 1, 0.75, 0.5, 0)):
    """
    Expected number of dominant-phenotype offspring in the next generation.

    Arguments:
      line: sequence of couple counts, one per genotype pairing, in the
        order AA-AA, AA-Aa, AA-aa, Aa-Aa, Aa-aa, aa-aa.
      probabilities: probability that a child of each pairing shows the
        dominant phenotype, in the same order. It is a parameter with a
        default rather than the notebook-level name `p`, because `p` is
        rebound to a different value later in the notebook.
    Returns:
      The expected offspring count, assuming every couple has two children.
    Raises:
      IndexError - If line has fewer entries than probabilities.
    """
    offspring = 0
    for i in range(len(probabilities)):
        # Two children per couple, each dominant with probability[i].
        offspring += 2 * line[i] * probabilities[i]
    return offspring

In [17]:
s13(gen)

3.5

In [18]:
generations = parse_numbers(DATA_PATH + 'rosalind_iev.txt')

In [19]:
generations

[16906, 16149, 18849, 18091, 16466, 16994]

In [20]:
s13(generations)

147410.5

### Problem 14 -  Finding a shared motif

A common substring of a collection of strings is a substring of every member of the collection. We say that a common substring is a longest common substring if there does not exist a longer common substring. For example, "CG" is a common substring of "ACGTACGT" and "AACCGTATA", but it is not as long as possible; in this case, "CGTA" is a longest common substring of "ACGTACGT" and "AACCGTATA".

Note that the longest common substring is not necessarily unique; for a simple example, "AA" and "CC" are both longest common substrings of "AACC" and "CCAA".

Given: A collection of $k$ $(k\leq100)$ DNA strings of length at most 1 kbp each in FASTA format.

Return: A longest common substring of the collection. (If multiple solutions exist, you may return any single solution.)

Sample Dataset

$\gt$Rosalind_1

GATTACA

$\gt$Rosalind_2

TAGACCA

$\gt$Rosalind_3

ATACA

Sample Output

AC

### Solution 14

In [21]:
dic = {'Rosalind_1': 'GATTACA',
       'Rosalind_2': 'TAGACCA',
       'Rosalind_3': 'ATACA'}

In [22]:
labels = list(dic.values())

In [23]:
def s14(fasta):
    """
    Prints a longest substring shared by every string in fasta.

    fasta maps labels to strings. Candidates are taken from the shortest
    string: for each start position the candidate is grown one character at
    a time for as long as it occurs in every string. When several substrings
    share the maximum length, the one with the latest start position in the
    shortest string is printed. An empty line is printed when fasta is empty
    or the strings have no character in common.
    """
    sequences = list(fasta.values())
    if not sequences:
        print('')
        return

    shortest = min(sequences, key=len)
    # Start from the empty string so that nothing is reported as common
    # unless it was actually found in every sequence.
    motif = ''

    for start in range(len(shortest)):
        # A candidate cannot run past the end of the shortest string.
        for length in range(1, len(shortest) - start + 1):
            candidate = shortest[start:start + length]
            shared = True
            for sequence in sequences:
                if candidate not in sequence:
                    shared = False
                    break
            if not shared:
                # A longer candidate from this start cannot be shared either.
                break
            if len(candidate) >= len(motif):
                motif = candidate
    print(motif)

In [24]:
s14(dic)

CA


In [25]:
fasta = parse_sequences(DATA_PATH + 'rosalind_lcsm.txt')

In [26]:
s14(fasta)

AGGTACCTCTCATGCGAAGGGTGAAAAGATCTCGACGTGAAGGCACCCCT


### Problem 15 - Independent Alleles

Two events A and B are independent if $P_r(A$ and $B)$ is equal to $P_r(A)×P_r(B)$. In other words, the events do not influence each other, so that we may simply calculate each of the individual probabilities separately and then multiply.

More generally, random variables $X$ and $Y$ are independent if whenever A and B are respective events for X and Y, A and B are independent (i.e., $P_r(A$ and $B)=P_r(A)×P_r(B))$.
As an example of how helpful independence can be for calculating probabilities, let $X$ and $Y$ represent the numbers showing on two six-sided dice. Intuitively, the number of pips showing on one die should not affect the number showing on the other die. If we want to find the probability that $X+Y$ is odd, then we don't need to draw a tree diagram and consider all possibilities. We simply first note that for $X+Y$ to be odd, either $X$ is even and $Y$ is odd or $X$ is odd and $Y$ is even. In terms of probability, $P_r(X+Y$ is odd$)=P_r(X$ is even and $Y$ is odd$)+P_r(X$ is odd and $Y$ is even$)$. Using independence, this becomes $[P_r(X$ is even$)×P_r(Y$ is odd$)]+[P_r(X$ is odd$)×P_r(Y$ is even$)]$, or $\left(\frac{1}{2}\right)^2+\left(\frac{1}{2}\right)^2=\frac{1}{2}$.

Given: Two positive integers $k (k\leq7)$ and $N (N\leq2^k)$. In this problem, we begin with Tom, who in the $0$-th generation has genotype Aa Bb. Tom has two children in the 1st generation, each of whom has two children, and so on. Each organism always mates with an organism having genotype Aa Bb.

Return: The probability that at least $N$ Aa Bb organisms will belong to the $k$-th generation of Tom's family tree (don't count the Aa Bb mates at each level). Assume that Mendel's second law holds for the factors.

Sample Dataset

2 1

Sample Output

0.684


### Solution 15

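The probability is derived from the general formula of the binomial distribution. Let $n = 2^k$ be the number of organisms in the $k$-th generation and let $i$ be the number of Aa Bb organisms among them:
$P(i) = C(n, i) \cdot p^{i} \cdot q^{\,n-i}$

where $p$ is the probability that one organism is Aa Bb, $q = 1 - p$, and $C(n, i)$ is the binomial coefficient:
$C(n, i) = \frac{n!}{i!\,(n - i)!}$

The answer is the sum of $P(i)$ for $i$ from $N$ to $n$.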

In [27]:
k = 1 
N = 2

In [28]:
# p - the probability that the offspring is Aa Bb (p(Aa)=1/2 and p(Bb)= 1/2)
p = 0.25
q = 1-p

In [29]:
import math

def s15(k, N, p=0.25):
    """
    Probability that at least N of the 2**k organisms in generation k are
    Aa Bb.

    Arguments:
      k: generation number; the generation holds 2**k organisms.
      N: minimum number of Aa Bb organisms required.
      p: probability that a single organism is Aa Bb. It is a parameter with
        a default rather than the notebook-level names `p`/`q`, because `p`
        is also bound to a list elsewhere in the notebook.
    Returns:
      The upper tail of the binomial distribution, summed from N to 2**k.
      The result is 0 when N exceeds 2**k.
    """
    q = 1 - p
    population = 2 ** k
    population_factorial = math.factorial(population)
    probability = 0
    for i in range(N, population + 1):
        # Integer division keeps the binomial coefficient exact.
        combination = population_factorial // (
            math.factorial(i) * math.factorial(population - i))
        probability += combination * (p ** i) * (q ** (population - i))
    return probability

In [30]:
s15(2, 1)

0.68359375

In [31]:
k, N = parse_numbers(DATA_PATH + 'rosalind_lia.txt')
k, N

(7, 30)

In [32]:
s15(k, N)

0.6905237918230707

### Problem 16 - Finding a Protein Motif

To allow for the presence of its varying forms, a protein motif is represented by a shorthand as follows: [XY] means "either X or Y" and {X} means "any amino acid except X." For example, the N-glycosylation motif is written as N{P}[ST]{P}.

You can see the complete description and features of a particular protein by its access ID "uniprot_id" in the UniProt database, by inserting the ID number into

http://www.uniprot.org/uniprot/uniprot_id

Alternatively, you can obtain a protein sequence in FASTA format by following

http://www.uniprot.org/uniprot/uniprot_id.fasta

For example, the data for protein B5ZC00 can be found at http://www.uniprot.org/uniprot/B5ZC00.

Given: At most 15 UniProt Protein Database access IDs.

Return: For each protein possessing the N-glycosylation motif, output its given access ID followed by a list of locations in the protein string where the motif can be found.

Sample Dataset

A2Z669
B5ZC00
P07204_TRBM_HUMAN
P20840_SAG1_YEAST

Sample Output

B5ZC00

85 118 142 306 395

P07204_TRBM_HUMAN

47 115 116 382 409

P20840_SAG1_YEAST

79 109 135 248 306 348 364 402 485 501 614

### Solution 16:

Creating all possible versions for the protein motif N{P}[ST]{P}

In [33]:
# One-letter codes of the 20 amino acids used throughout the notebook.
amino_acids = list('ACDEFHIGKLMNPQRSTVWY')
# Every amino acid except proline: the candidates for a {P} position.
P = [acid for acid in amino_acids if acid != 'P']

In [34]:
#all possible {P}{P} pairs: every ordered pair of non-proline amino acids
import itertools
# Iterating over P directly gives each pair exactly once and avoids
# rebinding the builtin name `all` for the rest of the notebook.
PP = [''.join(pair) for pair in itertools.product(P, repeat=2)]

In [35]:
#add "S" between {P}{P}
PPS = [pair[:1] + "S" + pair[1:] for pair in PP]
#add "T" between {P}{P}
PPT = [pair[:1] + "T" + pair[1:] for pair in PP]
#{P}[ST]{P}
# NOTE: PP is rebound here from two-letter pairs to three-letter cores, so
# this cell must not be re-run without first re-running the cell that
# builds the pairs.
PP = PPS + PPT

In [36]:
#Every concrete string matching N{P}[ST]{P}: "N" followed by each core in PP.
#dict.fromkeys removes duplicates while keeping first-seen order.
motifs = list(dict.fromkeys('N' + core for core in PP))
len(motifs)

722

Parsing the fasta files from the "UniProt Protein Database access IDs" into a dictionary using the IDs given in the text file.

In [37]:
from urllib.request import urlopen

def create_dictionary(ids_list):
    """
    Downloads the FASTA record of every UniProt ID and returns the sequences.

    Arguments:
      ids_list: iterable of ID strings, e.g. the lines of a text file.
        Surrounding whitespace (including the line terminator) is stripped
        and blank entries are skipped. The input is not modified.
    Returns:
      A dictionary mapping each stripped ID to its protein sequence, i.e.
      the FASTA record with its header line(s) removed and the remaining
      lines joined.
    """
    dictionary = {}
    for raw_id in ids_list:
        # strip() rather than [:-1]: the last line of a file may lack a
        # trailing newline, and slicing would then cut a real character.
        protein_id = raw_id.strip()
        if not protein_id:
            continue
        url = 'https://www.uniprot.org/uniprot/' + protein_id + '.fasta'
        # The context manager closes the HTTP response after reading.
        with urlopen(url) as response:
            fasta = response.read().decode("utf-8", "ignore")
        # Drop header lines by their leading '>' instead of splitting on a
        # specific "SV=<n>" marker, so any sequence version is handled.
        dictionary[protein_id] = ''.join(
            line.strip() for line in fasta.splitlines()
            if not line.startswith('>'))
    return dictionary

In [38]:
dictionary = create_dictionary(open(DATA_PATH + 'rosalind_mprt.txt').readlines())
dictionary

{'P05155_IC1_HUMAN': 'MASRLTLLTLLLLLLAGDRASSNPNATSSSSQDPESLQDRGEGKVATTVISKMLFVEPILEVSSLPTTNSTTNSATKITANTTDEPTTQPTTEPTTQPTIQPTQPTTQLPTDSPTQPTTGSFCPGPVTLCSDLESHSTEAVLGDALVDFSLKLYHAFSAMKKVETNMAFSPFSIASLLTQVLLGAGENTKTNLESILSYPKDFTCVHQALKGFTTKGVTSVSQIFHSPDLAIRDTFVNASRTLYSSSPRVLSNNSDANLELINTWVAKNTNNKISRLLDSLPSDTRLVLLNAIYLSAKWKTTFDPKKTRMEPFHFKNSVIKVPMMNSKKYPVAHFIDQTLKAKVGQLQLSHNLSLVILVPQNLKHRLEDMEQALSPSVFKAIMEKLEMSKFQPTLLTLPRIKVTTSQDMLSIMEKLEFFDFSYDLNLCGLTEDPDLQVSAMQHQTVLELTETGVEAAAASAISVARTLLVFEVQQPFLFVLWDQQHKFPVFMGRVYDPRA',
 'P13838_LEUK_RAT': 'WAQVVSQENLPNTMTMLPFTPNSESPSTSEALSTYSSIATVPVTEDPKESISPWGQTTAPASSIPLGTPELSSFFFTSAGASGNTPVPELTTSQEVSTEASLVLFPKSSGVASDPPVTITNPATSSAVASTSLETFKGTSAPPVTVTSSTMTSGPFVATTVSSETSGPPVTMATGSLGPSKETHGLSATIATSSGESSSVAGGTPVFSTKISTTSTPNPITTVPPRPGSSGMLLVSMLIALTVVLVLVALLLLWRQRQKRRTGALTLSRGGKRNGTVDAWAGPARVPDEEATTASGSGGNKSSGAPETDGSGQRPTLTTFFSRRKSRQGSVALEELKPGTGPNLKGEEEPLVGSEDEAVETPTSDGPQAKDGAAPQSL',
 'P00304_ARA3_AMBEL': 'GKVYLVGGPELGGWKLQSDPRAYALWSARQQFKTTDVLWFNFTTGEDSVA

In [39]:
#motif locations 
def s16a(s):
    """
    Returns the sorted 1-based start positions in s at which any string from
    the notebook-level list `motifs` occurs.
    """
    hits = [
        offset + 1
        for motif in motifs
        for offset in range(len(s) - len(motif) + 1)
        if s[offset:offset + len(motif)] == motif
    ]
    hits.sort()
    return hits

In [40]:
#formating results
def s16b(d):
    """
    Prints, for every entry of d that contains the motif, its ID followed by
    the space-separated motif locations on the next line.

    d maps IDs to protein strings; locations come from s16a. Entries with no
    location are omitted.
    """
    # Locate the motif in every protein before printing anything.
    locations = {protein_id: s16a(protein) for protein_id, protein in d.items()}
    # Keep only the proteins in which the motif was found.
    found = {protein_id: spots for protein_id, spots in locations.items() if spots != []}
    for protein_id, spots in found.items():
        print(protein_id)
        print(" ".join(str(spot) for spot in spots))

In [41]:
s16b(dictionary)

P05155_IC1_HUMAN
25 69 81 238 253 352
P13838_LEUK_RAT
274 300
P00304_ARA3_AMBEL
41
P37803
110
P25174
17 32 56 97 116 132 151 178 183 198 325 670
Q640N1
471 519 913 1030
Q7TMB8
209 291 328 442 607 672 831 858
P00744_PRTZ_BOVIN
59 191 289
P81428_FA10_TROCA
254
P02725_GLP_PIG
16 19 39
P43541
129
P01374_TNFB_HUMAN
96
P10646_TFPI_HUMAN
145 195 256
P01046_KNL1_BOVIN
47 87 168 169 197 204


### Problem 17 - Inferring mRNA from Protein

For positive integers $a$ and $n$, $a$ modulo $n$ (written $a \bmod n$ in shorthand) is the remainder when $a$ is divided by $n$. For example, $29 \bmod 11=7$ because $29=11 \times 2+7$.

Modular arithmetic is the study of addition, subtraction, multiplication, and division with respect to the modulo operation. We say that $a$ and $b$ are congruent modulo $n$ if $a \bmod n = b \bmod n$; in this case, we use the notation $a \equiv b \bmod n$.

Two useful facts in modular arithmetic are that if $a \equiv b \bmod n$ and $c \equiv d \bmod n$, then $a+c \equiv b+d \bmod n$ and $a \times c \equiv b \times d \bmod n$.

As you will see in this exercise, some Rosalind problems will ask for a (very large) integer solution modulo a smaller number to avoid the computational pitfalls that arise with storing such large numbers.

Given: A protein string of length at most 1000 aa.

Return: The total number of different RNA strings from which the protein could have been translated, modulo 1,000,000. (Don't neglect the importance of the stop codon in protein translation.)

Sample Dataset

MA

Sample Output

12


### Solution 17

In [42]:
# RNA codon table
rna_codon = {"UUU" : "F", "CUU" : "L", "AUU" : "I", "GUU" : "V",
           "UUC" : "F", "CUC" : "L", "AUC" : "I", "GUC" : "V",
           "UUA" : "L", "CUA" : "L", "AUA" : "I", "GUA" : "V",
           "UUG" : "L", "CUG" : "L", "AUG" : "M", "GUG" : "V",
           "UCU" : "S", "CCU" : "P", "ACU" : "T", "GCU" : "A",
           "UCC" : "S", "CCC" : "P", "ACC" : "T", "GCC" : "A",
           "UCA" : "S", "CCA" : "P", "ACA" : "T", "GCA" : "A",
           "UCG" : "S", "CCG" : "P", "ACG" : "T", "GCG" : "A",
           "UAU" : "Y", "CAU" : "H", "AAU" : "N", "GAU" : "D",
           "UAC" : "Y", "CAC" : "H", "AAC" : "N", "GAC" : "D",
           "UAA" : "STOP", "CAA" : "Q", "AAA" : "K", "GAA" : "E",
           "UAG" : "STOP", "CAG" : "Q", "AAG" : "K", "GAG" : "E",
           "UGU" : "C", "CGU" : "R", "AGU" : "S", "GGU" : "G",
           "UGC" : "C", "CGC" : "R", "AGC" : "S", "GGC" : "G",
           "UGA" : "STOP", "CGA" : "R", "AGA" : "R", "GGA" : "G",
           "UGG" : "W", "CGG" : "R", "AGG" : "R", "GGG" : "G" 
           }

In [43]:
def reverse_table(table):
    """
    Inverts a codon table.

    Arguments:
      table: dictionary mapping each codon to an amino acid letter or to the
        string "STOP".
    Returns:
      A dictionary mapping each amino acid found in table to the list of
      codons that encode it, in the order they appear in table. "STOP"
      entries are left out. Keys are inserted in sorted order.
    """
    codons_by_acid = {}
    # Read from the `table` argument so the function works for any table
    # passed in.
    for codon, acid in table.items():
        if acid == 'STOP':
            continue
        codons_by_acid.setdefault(acid, []).append(codon)
    # Sorted keys make the displayed result independent of table order.
    return {acid: codons_by_acid[acid] for acid in sorted(codons_by_acid)}

In [44]:
reverse_table(rna_codon)

{'A': ['GCU', 'GCC', 'GCA', 'GCG'],
 'C': ['UGU', 'UGC'],
 'D': ['GAU', 'GAC'],
 'E': ['GAA', 'GAG'],
 'F': ['UUU', 'UUC'],
 'H': ['CAU', 'CAC'],
 'I': ['AUU', 'AUC', 'AUA'],
 'G': ['GGU', 'GGC', 'GGA', 'GGG'],
 'K': ['AAA', 'AAG'],
 'L': ['CUU', 'CUC', 'UUA', 'CUA', 'UUG', 'CUG'],
 'M': ['AUG'],
 'N': ['AAU', 'AAC'],
 'P': ['CCU', 'CCC', 'CCA', 'CCG'],
 'Q': ['CAA', 'CAG'],
 'R': ['CGU', 'CGC', 'CGA', 'AGA', 'CGG', 'AGG'],
 'S': ['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC'],
 'T': ['ACU', 'ACC', 'ACA', 'ACG'],
 'V': ['GUU', 'GUC', 'GUA', 'GUG'],
 'W': ['UGG'],
 'Y': ['UAU', 'UAC']}

In [45]:
def s17(path):
    """
    Prints the number of RNA strings that translate into the protein stored
    on the first line of the file at path, modulo 1,000,000.

    The count is the product of the number of codons for every amino acid,
    times the number of STOP codons in rna_codon. An empty first line
    therefore yields just the number of STOP codons.
    """
    with open(path, 'r') as file:
        sequence = file.readline().rstrip()

    # Build the reverse lookup once instead of once per amino acid.
    codons_for = reverse_table(rna_codon)
    # Every RNA string must end with one of the STOP codons.
    stop_codons = sum(1 for acid in rna_codon.values() if acid == 'STOP')

    RNAs = stop_codons
    for acid in sequence:
        # Reducing at every step gives the same residue as reducing the
        # full product once, but keeps the intermediate value small.
        RNAs = (RNAs * len(codons_for[acid])) % 1000000

    print(RNAs % 1000000)

In [46]:
s17(DATA_PATH + 'rosalind_mrna.txt')

366144


### Problem 18 - Open Reading Frames

Either strand of a DNA double helix can serve as the coding strand for RNA transcription. Hence, a given DNA string implies six total reading frames, or ways in which the same region of DNA can be translated into amino acids: three reading frames result from reading the string itself, whereas three more result from reading its reverse complement.

An open reading frame (ORF) is one which starts from the start codon and ends by stop codon, without any other stop codons in between. Thus, a candidate protein string is derived by translating an open reading frame into amino acids until a stop codon is reached.

Given: A DNA string $s$ of length at most 1 kbp in FASTA format.

Return: Every distinct candidate protein string that can be translated from ORFs of $s$. Strings can be returned in any order.

Sample Dataset

$\gt$Rosalind_99

AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG

Sample Output

MLLGSFRLIPKETLIQVAGSSPCNLS

M

MGMTPRLGLESLLE

MTPRLGLESLLE

### Solution 18

In [47]:
import re

In [48]:
DNAstring = 'AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG'

In [49]:
def replace(string):
    """Returns the reverse complement of a DNA string (A<->T, C<->G)."""
    complement = str.maketrans('ACGT', 'TGCA')
    return string[::-1].translate(complement)
def s8(rna):
    """
    Translates an RNA string into a protein string using rna_codon.

    Codons are read from position 0 in steps of three. Translation ends at
    the first STOP codon or at the end of the string; an incomplete trailing
    codon of one or two bases is ignored.
    """
    protein_string = ""
    # The bound covers every complete codon, including the last one, so a
    # string that does not end in STOP is still translated in full.
    for i in range(0, len(rna) - len(rna) % 3, 3):
        amino_acid = rna_codon[rna[i:i+3]]
        if amino_acid == "STOP":
            break
        protein_string += amino_acid
    return protein_string

In [50]:
def s18(dna):
    '''Print every distinct candidate protein string that can be translated
    from an open reading frame of dna or of its reverse complement.'''
    # The lookahead lets overlapping matches be found; the lazy (?:...)*?
    # makes each match end at the first in-frame stop codon after an ATG.
    orf_regex = re.compile(r'(?=(ATG(?:...)*?(?:TAG|TGA|TAA)))')
    forward_orfs = orf_regex.findall(dna)
    reverse_orfs = orf_regex.findall(replace(dna))
    # Transcribe each ORF to RNA (T -> U) and translate it.
    proteins = [s8(orf.replace('T', 'U')) for orf in forward_orfs + reverse_orfs]
    # A set removes duplicates; the print order is therefore unspecified.
    for protein in set(proteins):
        print(protein)

In [51]:
s18(DNAstring)

MGMTPRLGLESLLE
MLLGSFRLIPKETLIQVAGSSPCNLS
M
MTPRLGLESLLE


In [52]:
s18(list(parse_sequences(DATA_PATH + 'rosalind_orf.txt').values())[0])

MNRPASKSRFSS
MVPFTSCATALRLARELPI
MAHDTLVIAIMSQLLTTDECTEQLWDVVLIHCGTILLGLQVD
MGILSLSSHDGL
MDEFGLRALFVTYRQKPYLGRWQDPGRPLGMISV
MHNLARLCMNRPAWSRSPRAPRPSG
MSHGRVWAKSSLRHLPSEALSW
MLGGSCII
MPLRPSLIK
MTFPSLLCRQTQRQIPLHAHILEYELILAASCRCRLNFSSFGQNEGLEGRL
MN
MHEPPSIKVAI
MGYDKSILLSLTVYLERIVQFYL
MNRPAWSRSPRAPRPSG
MGD
MYGTALGCGTYTLRDDPVGPSS
M
MHNLARLCMNRPASKSRFSS
MIPLLSRL
MGRSFDQ
MS
MISV
MPFEF
MSQLLTTDECTEQLWDVVLIHCGTILLGLQVD
MWYLYTAGRSCWAFKLIDGDPVFK
MVVGALNVGAPAVPLLVAKWAISC
MLTS
MQWNLPLGLATQK
MHEPPSMVPFTSCATALRLARELPI
MDCRGA


### Problem 19 - Enumerating Gene Orders

A permutation of length $n$ is an ordering of the positive integers $\{1,2,\ldots,n\}$. For example, $\pi=(5,3,2,1,4)$ is a permutation of length $5$.

Given: A positive integer $n\leq7$.

Return: The total number of permutations of length $n$, followed by a list of all such permutations (in any order).

Sample Dataset

3

Sample Output

6

1 2 3

1 3 2

2 1 3

2 3 1

3 1 2

3 2 1


In [53]:
def s19(num):
    """
    Prints the number of permutations of 1..num, followed by every
    permutation on its own line with the entries separated by single spaces.
    """
    # Generate the permutations once and reuse them for count and listing.
    orderings = list(itertools.permutations(range(1, num + 1)))
    print(len(orderings))
    for item in orderings:
        # Joining the whole tuple avoids a trailing space when num == 1.
        print(' '.join(map(str, item)))

In [54]:
s19(3)

6
1 2 3
1 3 2
2 1 3
2 3 1
3 1 2
3 2 1


In [55]:
s19(parse_numbers(DATA_PATH + 'rosalind_perm.txt')[0])

5040
1 2 3 4 5 6 7
1 2 3 4 5 7 6
1 2 3 4 6 5 7
1 2 3 4 6 7 5
1 2 3 4 7 5 6
1 2 3 4 7 6 5
1 2 3 5 4 6 7
1 2 3 5 4 7 6
1 2 3 5 6 4 7
1 2 3 5 6 7 4
1 2 3 5 7 4 6
1 2 3 5 7 6 4
1 2 3 6 4 5 7
1 2 3 6 4 7 5
1 2 3 6 5 4 7
1 2 3 6 5 7 4
1 2 3 6 7 4 5
1 2 3 6 7 5 4
1 2 3 7 4 5 6
1 2 3 7 4 6 5
1 2 3 7 5 4 6
1 2 3 7 5 6 4
1 2 3 7 6 4 5
1 2 3 7 6 5 4
1 2 4 3 5 6 7
1 2 4 3 5 7 6
1 2 4 3 6 5 7
1 2 4 3 6 7 5
1 2 4 3 7 5 6
1 2 4 3 7 6 5
1 2 4 5 3 6 7
1 2 4 5 3 7 6
1 2 4 5 6 3 7
1 2 4 5 6 7 3
1 2 4 5 7 3 6
1 2 4 5 7 6 3
1 2 4 6 3 5 7
1 2 4 6 3 7 5
1 2 4 6 5 3 7
1 2 4 6 5 7 3
1 2 4 6 7 3 5
1 2 4 6 7 5 3
1 2 4 7 3 5 6
1 2 4 7 3 6 5
1 2 4 7 5 3 6
1 2 4 7 5 6 3
1 2 4 7 6 3 5
1 2 4 7 6 5 3
1 2 5 3 4 6 7
1 2 5 3 4 7 6
1 2 5 3 6 4 7
1 2 5 3 6 7 4
1 2 5 3 7 4 6
1 2 5 3 7 6 4
1 2 5 4 3 6 7
1 2 5 4 3 7 6
1 2 5 4 6 3 7
1 2 5 4 6 7 3
1 2 5 4 7 3 6
1 2 5 4 7 6 3
1 2 5 6 3 4 7
1 2 5 6 3 7 4
1 2 5 6 4 3 7
1 2 5 6 4 7 3
1 2 5 6 7 3 4
1 2 5 6 7 4 3
1 2 5 7 3 4 6
1 2 5 7 3 6 4
1 2 5 7 4 3 6
1 2 5 7 4 6 3
1 2 5 7 6 3 4
1

1 7 5 6 2 3 4
1 7 5 6 2 4 3
1 7 5 6 3 2 4
1 7 5 6 3 4 2
1 7 5 6 4 2 3
1 7 5 6 4 3 2
1 7 6 2 3 4 5
1 7 6 2 3 5 4
1 7 6 2 4 3 5
1 7 6 2 4 5 3
1 7 6 2 5 3 4
1 7 6 2 5 4 3
1 7 6 3 2 4 5
1 7 6 3 2 5 4
1 7 6 3 4 2 5
1 7 6 3 4 5 2
1 7 6 3 5 2 4
1 7 6 3 5 4 2
1 7 6 4 2 3 5
1 7 6 4 2 5 3
1 7 6 4 3 2 5
1 7 6 4 3 5 2
1 7 6 4 5 2 3
1 7 6 4 5 3 2
1 7 6 5 2 3 4
1 7 6 5 2 4 3
1 7 6 5 3 2 4
1 7 6 5 3 4 2
1 7 6 5 4 2 3
1 7 6 5 4 3 2
2 1 3 4 5 6 7
2 1 3 4 5 7 6
2 1 3 4 6 5 7
2 1 3 4 6 7 5
2 1 3 4 7 5 6
2 1 3 4 7 6 5
2 1 3 5 4 6 7
2 1 3 5 4 7 6
2 1 3 5 6 4 7
2 1 3 5 6 7 4
2 1 3 5 7 4 6
2 1 3 5 7 6 4
2 1 3 6 4 5 7
2 1 3 6 4 7 5
2 1 3 6 5 4 7
2 1 3 6 5 7 4
2 1 3 6 7 4 5
2 1 3 6 7 5 4
2 1 3 7 4 5 6
2 1 3 7 4 6 5
2 1 3 7 5 4 6
2 1 3 7 5 6 4
2 1 3 7 6 4 5
2 1 3 7 6 5 4
2 1 4 3 5 6 7
2 1 4 3 5 7 6
2 1 4 3 6 5 7
2 1 4 3 6 7 5
2 1 4 3 7 5 6
2 1 4 3 7 6 5
2 1 4 5 3 6 7
2 1 4 5 3 7 6
2 1 4 5 6 3 7
2 1 4 5 6 7 3
2 1 4 5 7 3 6
2 1 4 5 7 6 3
2 1 4 6 3 5 7
2 1 4 6 3 7 5
2 1 4 6 5 3 7
2 1 4 6 5 7 3
2 1 4 6 7 3 5
2 1 4 

3 4 1 5 6 7 2
3 4 1 5 7 2 6
3 4 1 5 7 6 2
3 4 1 6 2 5 7
3 4 1 6 2 7 5
3 4 1 6 5 2 7
3 4 1 6 5 7 2
3 4 1 6 7 2 5
3 4 1 6 7 5 2
3 4 1 7 2 5 6
3 4 1 7 2 6 5
3 4 1 7 5 2 6
3 4 1 7 5 6 2
3 4 1 7 6 2 5
3 4 1 7 6 5 2
3 4 2 1 5 6 7
3 4 2 1 5 7 6
3 4 2 1 6 5 7
3 4 2 1 6 7 5
3 4 2 1 7 5 6
3 4 2 1 7 6 5
3 4 2 5 1 6 7
3 4 2 5 1 7 6
3 4 2 5 6 1 7
3 4 2 5 6 7 1
3 4 2 5 7 1 6
3 4 2 5 7 6 1
3 4 2 6 1 5 7
3 4 2 6 1 7 5
3 4 2 6 5 1 7
3 4 2 6 5 7 1
3 4 2 6 7 1 5
3 4 2 6 7 5 1
3 4 2 7 1 5 6
3 4 2 7 1 6 5
3 4 2 7 5 1 6
3 4 2 7 5 6 1
3 4 2 7 6 1 5
3 4 2 7 6 5 1
3 4 5 1 2 6 7
3 4 5 1 2 7 6
3 4 5 1 6 2 7
3 4 5 1 6 7 2
3 4 5 1 7 2 6
3 4 5 1 7 6 2
3 4 5 2 1 6 7
3 4 5 2 1 7 6
3 4 5 2 6 1 7
3 4 5 2 6 7 1
3 4 5 2 7 1 6
3 4 5 2 7 6 1
3 4 5 6 1 2 7
3 4 5 6 1 7 2
3 4 5 6 2 1 7
3 4 5 6 2 7 1
3 4 5 6 7 1 2
3 4 5 6 7 2 1
3 4 5 7 1 2 6
3 4 5 7 1 6 2
3 4 5 7 2 1 6
3 4 5 7 2 6 1
3 4 5 7 6 1 2
3 4 5 7 6 2 1
3 4 6 1 2 5 7
3 4 6 1 2 7 5
3 4 6 1 5 2 7
3 4 6 1 5 7 2
3 4 6 1 7 2 5
3 4 6 1 7 5 2
3 4 6 2 1 5 7
3 4 6 2 1 7 5
3 4 6 

4 6 3 1 2 7 5
4 6 3 1 5 2 7
4 6 3 1 5 7 2
4 6 3 1 7 2 5
4 6 3 1 7 5 2
4 6 3 2 1 5 7
4 6 3 2 1 7 5
4 6 3 2 5 1 7
4 6 3 2 5 7 1
4 6 3 2 7 1 5
4 6 3 2 7 5 1
4 6 3 5 1 2 7
4 6 3 5 1 7 2
4 6 3 5 2 1 7
4 6 3 5 2 7 1
4 6 3 5 7 1 2
4 6 3 5 7 2 1
4 6 3 7 1 2 5
4 6 3 7 1 5 2
4 6 3 7 2 1 5
4 6 3 7 2 5 1
4 6 3 7 5 1 2
4 6 3 7 5 2 1
4 6 5 1 2 3 7
4 6 5 1 2 7 3
4 6 5 1 3 2 7
4 6 5 1 3 7 2
4 6 5 1 7 2 3
4 6 5 1 7 3 2
4 6 5 2 1 3 7
4 6 5 2 1 7 3
4 6 5 2 3 1 7
4 6 5 2 3 7 1
4 6 5 2 7 1 3
4 6 5 2 7 3 1
4 6 5 3 1 2 7
4 6 5 3 1 7 2
4 6 5 3 2 1 7
4 6 5 3 2 7 1
4 6 5 3 7 1 2
4 6 5 3 7 2 1
4 6 5 7 1 2 3
4 6 5 7 1 3 2
4 6 5 7 2 1 3
4 6 5 7 2 3 1
4 6 5 7 3 1 2
4 6 5 7 3 2 1
4 6 7 1 2 3 5
4 6 7 1 2 5 3
4 6 7 1 3 2 5
4 6 7 1 3 5 2
4 6 7 1 5 2 3
4 6 7 1 5 3 2
4 6 7 2 1 3 5
4 6 7 2 1 5 3
4 6 7 2 3 1 5
4 6 7 2 3 5 1
4 6 7 2 5 1 3
4 6 7 2 5 3 1
4 6 7 3 1 2 5
4 6 7 3 1 5 2
4 6 7 3 2 1 5
4 6 7 3 2 5 1
4 6 7 3 5 1 2
4 6 7 3 5 2 1
4 6 7 5 1 2 3
4 6 7 5 1 3 2
4 6 7 5 2 1 3
4 6 7 5 2 3 1
4 6 7 5 3 1 2
4 6 7 5 3 2 1
4 7 1 

6 1 5 4 7 2 3
6 1 5 4 7 3 2
6 1 5 7 2 3 4
6 1 5 7 2 4 3
6 1 5 7 3 2 4
6 1 5 7 3 4 2
6 1 5 7 4 2 3
6 1 5 7 4 3 2
6 1 7 2 3 4 5
6 1 7 2 3 5 4
6 1 7 2 4 3 5
6 1 7 2 4 5 3
6 1 7 2 5 3 4
6 1 7 2 5 4 3
6 1 7 3 2 4 5
6 1 7 3 2 5 4
6 1 7 3 4 2 5
6 1 7 3 4 5 2
6 1 7 3 5 2 4
6 1 7 3 5 4 2
6 1 7 4 2 3 5
6 1 7 4 2 5 3
6 1 7 4 3 2 5
6 1 7 4 3 5 2
6 1 7 4 5 2 3
6 1 7 4 5 3 2
6 1 7 5 2 3 4
6 1 7 5 2 4 3
6 1 7 5 3 2 4
6 1 7 5 3 4 2
6 1 7 5 4 2 3
6 1 7 5 4 3 2
6 2 1 3 4 5 7
6 2 1 3 4 7 5
6 2 1 3 5 4 7
6 2 1 3 5 7 4
6 2 1 3 7 4 5
6 2 1 3 7 5 4
6 2 1 4 3 5 7
6 2 1 4 3 7 5
6 2 1 4 5 3 7
6 2 1 4 5 7 3
6 2 1 4 7 3 5
6 2 1 4 7 5 3
6 2 1 5 3 4 7
6 2 1 5 3 7 4
6 2 1 5 4 3 7
6 2 1 5 4 7 3
6 2 1 5 7 3 4
6 2 1 5 7 4 3
6 2 1 7 3 4 5
6 2 1 7 3 5 4
6 2 1 7 4 3 5
6 2 1 7 4 5 3
6 2 1 7 5 3 4
6 2 1 7 5 4 3
6 2 3 1 4 5 7
6 2 3 1 4 7 5
6 2 3 1 5 4 7
6 2 3 1 5 7 4
6 2 3 1 7 4 5
6 2 3 1 7 5 4
6 2 3 4 1 5 7
6 2 3 4 1 7 5
6 2 3 4 5 1 7
6 2 3 4 5 7 1
6 2 3 4 7 1 5
6 2 3 4 7 5 1
6 2 3 5 1 4 7
6 2 3 5 1 7 4
6 2 3 5 4 1 7
6 2 3 

7 4 1 3 5 2 6
7 4 1 3 5 6 2
7 4 1 3 6 2 5
7 4 1 3 6 5 2
7 4 1 5 2 3 6
7 4 1 5 2 6 3
7 4 1 5 3 2 6
7 4 1 5 3 6 2
7 4 1 5 6 2 3
7 4 1 5 6 3 2
7 4 1 6 2 3 5
7 4 1 6 2 5 3
7 4 1 6 3 2 5
7 4 1 6 3 5 2
7 4 1 6 5 2 3
7 4 1 6 5 3 2
7 4 2 1 3 5 6
7 4 2 1 3 6 5
7 4 2 1 5 3 6
7 4 2 1 5 6 3
7 4 2 1 6 3 5
7 4 2 1 6 5 3
7 4 2 3 1 5 6
7 4 2 3 1 6 5
7 4 2 3 5 1 6
7 4 2 3 5 6 1
7 4 2 3 6 1 5
7 4 2 3 6 5 1
7 4 2 5 1 3 6
7 4 2 5 1 6 3
7 4 2 5 3 1 6
7 4 2 5 3 6 1
7 4 2 5 6 1 3
7 4 2 5 6 3 1
7 4 2 6 1 3 5
7 4 2 6 1 5 3
7 4 2 6 3 1 5
7 4 2 6 3 5 1
7 4 2 6 5 1 3
7 4 2 6 5 3 1
7 4 3 1 2 5 6
7 4 3 1 2 6 5
7 4 3 1 5 2 6
7 4 3 1 5 6 2
7 4 3 1 6 2 5
7 4 3 1 6 5 2
7 4 3 2 1 5 6
7 4 3 2 1 6 5
7 4 3 2 5 1 6
7 4 3 2 5 6 1
7 4 3 2 6 1 5
7 4 3 2 6 5 1
7 4 3 5 1 2 6
7 4 3 5 1 6 2
7 4 3 5 2 1 6
7 4 3 5 2 6 1
7 4 3 5 6 1 2
7 4 3 5 6 2 1
7 4 3 6 1 2 5
7 4 3 6 1 5 2
7 4 3 6 2 1 5
7 4 3 6 2 5 1
7 4 3 6 5 1 2
7 4 3 6 5 2 1
7 4 5 1 2 3 6
7 4 5 1 2 6 3
7 4 5 1 3 2 6
7 4 5 1 3 6 2
7 4 5 1 6 2 3
7 4 5 1 6 3 2
7 4 5 2 1 3 6
7 4 5 

### Problem 20 - Calculating Protein Mass

In a weighted alphabet, every symbol is assigned a positive real number called a weight. A string formed from a weighted alphabet is called a weighted string, and its weight is equal to the sum of the weights of its symbols.

The standard weight assigned to each member of the 20-symbol amino acid alphabet is the monoisotopic mass of the corresponding amino acid.

Given: A protein string $P$ of length at most 1000 aa.

Return: The total weight of $P$. Consult the monoisotopic mass table.

Sample Dataset

SKADYEK

Sample Output

821.392


### Solution 20

In [56]:
def s20(protein_string):
    """
    Returns the total weight of a protein string, rounded to three decimals.

    The weight is the sum of the per-residue masses in the table below; a
    letter that is not in the table raises KeyError.
    """
    residue_mass = {
        'A': 71.03711,  'C': 103.00919, 'D': 115.02694, 'E': 129.04259,
        'F': 147.06841, 'G': 57.02146,  'H': 137.05891, 'I': 113.08406,
        'K': 128.09496, 'L': 113.08406, 'M': 131.04049, 'N': 114.04293,
        'P': 97.05276,  'Q': 128.05858, 'R': 156.10111, 'S': 87.03203,
        'T': 101.04768, 'V': 99.06841,  'W': 186.07931, 'Y': 163.06333,
    }
    total = sum(residue_mass[residue] for residue in protein_string)
    return round(total, 3)

In [57]:
s20('SKADYEK')

821.392

In [58]:
file = open(DATA_PATH + 'rosalind_prtm.txt', 'r')
protein = file.read()
file.close()

In [59]:
s20(protein[:-1])

102761.917