In [2]:
DATA_PATH = "Data/"

In [3]:
# FASTA file reader
from collections import OrderedDict
from typing import Dict

NAME_SYMBOL = '>'


def parse_sequences(filename: str,
                    ordered: bool=False) -> Dict[str, str]:
    """
    Parses a text file of genome sequences into a dictionary.
    Arguments:
      filename: str - The name of the file containing the genome info.
      ordered: bool - Set this to True if you want the result to be ordered.
    """
    result = OrderedDict() if ordered else {}

    last_name = None
    with open(filename) as sequences:
        for line in sequences:
            if line.startswith(NAME_SYMBOL):
                last_name = line[1:-1]
                result[last_name] = []
            else:
                result[last_name].append(line[:-1])

    for name in result:
        result[name] = ''.join(result[name])

    return result

In [1]:
def parse_numbers(path):
    """
    Parses a text file of numbers into a list
    """
    with open(path, 'r') as handle:
        for line in handle:
            if not line.strip():
                continue  # This skips blank lines

            values = [*map(int, line.split())]
            return values

### Problem 1  - Counting DNA Nucleotides

A string is simply an ordered collection of symbols selected from some alphabet and formed into a word; the length of a string is the number of symbols that it contains.

An example of a length 21 DNA string (whose alphabet contains the symbols 'A', 'C', 'G', and 'T') is "ATGCTTCAGAAAGGTCTTACG."

Given: A DNA string $s$ of length at most 1000 nt.

Return: Four integers (separated by spaces) counting the respective number of times that the symbols 'A', 'C', 'G', and 'T' occur in $s$.

Sample Dataset:

AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC

Sample Output:

20 12 17 21

### Solution 1

In [3]:
bases = ['A', 'C', 'G', 'T']

In [4]:
string = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC"
for i in bases:
    print(string. count(i), end=" ")

20 12 17 21 

In [5]:
file = open(DATA_PATH + 'rosalind_dna.txt', 'r')
dna = file.read()
for i in bases:
    print(dna. count(i), end=" ")

201 224 225 210 

### Problem 2 - Transcribing DNA into RNA

An RNA string is a string formed from the alphabet containing 'A', 'C', 'G', and 'U'.

Given a DNA string $t$ corresponding to a coding strand, its transcribed RNA string $u$ is formed by replacing all occurrences of 'T' in $t$ with 'U' in $u$.

Given: A DNA string $t$ having length at most 1000 nt.

Return: The transcribed RNA string of $t$.

Sample Dataset:

GATGGAACTTGACTACGTAAATT

Sample Output:

GAUGGAACUUGACUACGUAAAUU


### Solution 2:

In [6]:
string = "GATGGAACTTGACTACGTAAATT"
RNAstring = string.replace('T', 'U')
RNAstring

'GAUGGAACUUGACUACGUAAAUU'

In [7]:
file = open(DATA_PATH + 'rosalind_rna.txt', 'r')
dna = file.read()
rna = dna.replace('T', 'U')
rna

'AAGAAAGGACUCAAUACGCCCAUUUUUUACCAACGUGCAACUGACUGACCGUUUCACAGGUAUGCCUCUUUGAUGCACUGUCGGAAGAGAGGGACUACGAUGUGCUGGAAUGAUAUCACGUAUACCGUGCUGGUUUGACCAAGUCUUCCGUGUAGUACCUGAUGCGGUUCCCGGUAGUUAUUUGCUCUUCCGGUAAAGGCUUUUAGCAUUGUCCGUUUUUUUACUUCCCCGGAACGCGGCGCCCGAGAGAUGAAAAGUUUUGUCGACCCUCGGUCAGCAACGACGACCAAUCCAAGCUCUCUGAACUUAGCUUACUUAUAGAAACCCGUUGGACCCCGUCGUGCCAUUCACUACAGAUGCAACUUCUAAACCAGCGCACCUAGCAGAUAUGUUUACGCCUAUUGGAUAAUCGUUGAAUAUCUUGUUGGUGUCCGUCUAUGGUCCAAUGUGCGGUGGACCCCAUAACACUCUUACUUUUCAGCUUCUGAUUAAGCGGCGCGUCACGUAAUCGUUCAGACGGUUGCUCUUGUUUGCCCUAAGACCACCUAUUACUGCCCGCUCUGCUGAAACGCGAUAAGUCAUUCGAGACGUGUCUAUUCCAUGCGACAGAUAAUGCAUCGUCCUAUACCACAAGACGGGCCUAGCAGGGACGAAUGCUCGUAUACUCGACCACCAACCGUUUGUCGGGCGCACAAUCGUAAUGCUCUGCGGCCCUAUGCUGUGCAGAUCCCACUUGAUCUGUUGACAUGCAUCUCGAGGGAGAUUGAACCUUCCACCCUCCUAACCACCAGAAACCGCAAUCGGGUACACUCCGCCUCUGGACAGUAAAGGCGUUUCGCGUCCAGUCUGGCAAGACGGGUGCAGGUUGCUGGCUGCGAGUUCGUGUAAGAGCGUGGCUAUGCGCUCCGUGUACUGACAACUCCUACUGAAGCUUGGACUUACAGCGCUAUCGUAUAUGCCUGCUCCACGUGAUAUGGGUGUGAAUAAGGUUUAGGAGCG

### Problem 3 - Complementing a Strand of DNA

In DNA strings, symbols 'A' and 'T' are complements of each other, as are 'C' and 'G'.

The reverse complement of a DNA string $s$
is the string $s^{c}$ formed by reversing the symbols of $s$, then taking the complement of each symbol (e.g., the reverse complement of "GTCA" is "TGAC").

Given:
A DNA string $s$ of length at most 1000 bp.

Return: The reverse complement $s^{c}$
of s.

Sample Dataset:

AAAACCCGGT

Sample Output:

ACCGGGTTTT

### Solution 3:

In [8]:
tr = str.maketrans(dict(zip('ACGT', 'TGCA')))
def replace(string):
    return string.translate(tr)[::-1]

In [9]:
string = "AAAACCCGGT"
replace(string)

'ACCGGGTTTT'

In [10]:
file = open(DATA_PATH + 'rosalind_revc.txt', 'r')
dna = file.read()
compdna = replace(dna)
compdna

'\nTTTGTGTCTTACAACTGCTAGAGTCCTGATGGGTTTCGCAGGGTTGCGTTGTCGGTCGCGGCGGAATCAACGCGATACAGGATAGCGATTTCATCGAGTGAGCTCTTCAATTTGTAATAGGCCGACGTGAAACGACGAGATGAGCCTAGGAGCTACTATTTGTCCCTCAGTTAGTTTCCCTTCGCATAGCTTCGAGATAACGCGACCCTAAGGGTTGCTGCTGAGCCTGTTGAGGATTGCTGCCCCGTACACTCTGAGCCGTTAACTGATTCATCTGGGAGCGGGCCAAAGACGTACTCTCCAGACGATTTCTACATACAGACGAGGGGTTATCATAGTGTATGCCCCGTGAAGGGTTGCCGGTGAGCAACACTGCCCCCTGAACTTCGAACCATTTAATCGCCTAACCTAACGAACACGTTCCCGTAGAAGGGACCGGCGGAGGAGCGAGTCTCCGTTTTGTCGAAACAGTCGGGTAACAAACGTGCTGTATCCAGCGTCCTGACTTCTCTATAATCCAGCGCGGGCAAATCGCTTCTCAATGTCTTAGGTATGTTCACCAACAGTGCCTCGATATTGTCAGGCCGACGGATCTCAATAAGGTCATTGGACGAAGACCACGACATTATGCACACAGAAAACTTTTCCGGCGCCACGGCACTCCACCACTGCTGGGACGGTATCTAAGGGTTAACCGGGCTTAGGGATCAGCGTCACACGTTCAGTAGTGTTGAAGCTTAGATTTTGATTGGTGAAAGCCTCTTATCAACGCGTGCCATCCCCTACTGCAACCGTACCCTTTCATAAACGTTCCTACTAACTCTAGCGTAAGGGAAAGAAAAGAGAAGTAGGTCAATCTAGGATGTCAGCCCACTTAGGTGCATGAGCCGCGGTTACGTGAAGAAGTCAAAAGCGCGTACGACTTGTATCCTATGTTAAAACCGACGTGAAAATTCCATTATTTGGTGGTATGAAGTTCCAAGAGAAGC'

### Problem 4 - Computing GC Content

The GC-content of a DNA string is given by the percentage of symbols in the string that are 'C' or 'G'. For example, the GC-content of "AGCTATAG" is 37.5%. Note that the reverse complement of any DNA string has the same GC-content.

DNA strings must be labeled when they are consolidated into a database. A commonly used method of string labeling is called FASTA format. In this format, the string is introduced by a line that begins with '>', followed by some labeling information. Subsequent lines contain the string itself; the first line to begin with '>' indicates the label of the next string.

In Rosalind's implementation, a string in FASTA format will be labeled by the ID "Rosalind_xxxx", where "xxxx" denotes a four-digit code between 0000 and 9999.

Given: At most 10 DNA strings in FASTA format (of length at most 1 kbp each).

Return: The ID of the string having the highest GC-content, followed by the GC-content of that string. Rosalind allows for a default error of 0.001 in (0 mm)˜ all decimal answers unless otherwise stated; please see the note on absolute error below.

Sample Dataset:

$\gt$Rosalind_6404

CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG

$\gt$Rosalind_5959

CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC

$\gt$Rosalind_0808

CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
TGGGAACCTGCGGGCAGTAGGTGGAAT

Sample outcome:

Rosalind_0808

60.919540


### Solution 4:

In [11]:
dic = {'Rosalind_6404':'CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG',
      'Rosalind_5959':'CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC',
      'Rosalind_0808':'CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGACTGGGAACCTGCGGGCAGTAGGTGGAAT'
}

In [12]:
def s4(d):
    for key, value in d.items():
        d[key] = (value.count('C') + value.count('G'))*100/len(value)
    for i in max(d.items(), key = lambda x: x[1]):
        print(i)

In [13]:
s4(dic)

Rosalind_0808
60.91954022988506


In [14]:
FASTAfile = parse_sequences(DATA_PATH + 'rosalind_gc.txt')

In [15]:
s4(FASTAfile)

Rosalind_0007
51.81711606096131


### Problem 5 - Counting Point Mutations

Given two strings $s$ and $t$ of equal length, the Hamming distance between $s$ and $t$, denoted $d_{H}(s,t)$, is the number of corresponding symbols that differ in $s$ and $t$. See Figure 2.

Given: Two DNA strings $s$ and $t$ of equal length (not exceeding 1 kbp).

Return: The Hamming distance $d_{H}(s,t)$.

Sample Dataset:

GAGCCTACTAACGGGAT

CATCGTAATGACGGCCT

Sample Outcome:

7


### Solution 5:

In [16]:
string1 = 'GAGCCTACTAACGGGAT'
string2 = 'CATCGTAATGACGGCCT'

In [17]:
def s5(s1, s2):
    counter = 0
    differences = 0
    for i in s1:
        if i != s2[counter]:
            differences += 1
        counter += 1
    return differences

In [18]:
s5(string1, string2)

7

In [19]:
lines = open(DATA_PATH + 'rosalind_hamm.txt').readlines()
s5(lines[0], lines[1])

485

### Problem 6 - Rabbits and Recurrence Relations

A sequence is an ordered collection of objects (usually numbers), which are allowed to repeat. Sequences can be finite or infinite. Two examples are the finite sequence $(\pi,\sqrt2,0,\pi)$ and the infinite sequence of odd numbers $(1,3,5,7,9,…)$. We use the notation an to represent the n-th term of a sequence.

A recurrence relation is a way of defining the terms of a sequence with respect to the values of previous terms. In the case of Fibonacci's rabbits from the introduction, any given month will contain the rabbits that were alive the previous month, plus any new offspring. A key observation is that the number of offspring in any month is equal to the number of rabbits that were alive two months prior. As a result, if $F_{n}$
represents the number of rabbit pairs alive after the $n$-th month, then we obtain the Fibonacci sequence having terms Fn that are defined by the recurrence relation $F_{n}=F_{n-1}+F_{n-2}$ (with $F_{1}=F_{2}=1$ to initiate the sequence). Although the sequence bears Fibonacci's name, it was known to Indian mathematicians over two millennia ago.

When finding the $n$-th term of a sequence defined by a recurrence relation, we can simply use the recurrence relation to generate terms for progressively larger values of $n$. This problem introduces us to the computational technique of dynamic programming, which successively builds up solutions by using the answers to smaller cases.

Given: Positive integers $n≤40$ and $k≤5$.

Return: The total number of rabbit pairs that will be present after n
months, if we begin with 1 pair and in each generation, every pair of reproduction-age rabbits produces a litter of k rabbit pairs (instead of only 1 pair).

Sample Dataset:

5 3

Sample Output:

19


### Solution 6

In [8]:
def s6(n, k):
    # Following the equation: 
    rabbits = []
    rabbits.insert(0, 1)
    rabbits.insert(1, 1)
    for i in range(2, n):
        rabbits.insert(i, (rabbits[i-1]+ rabbits[i-2]* k))
    return rabbits[n-1]

In [21]:
s6(5, 3)

19

In [39]:
n, k = parse_numbers(DATA_PATH + 'rosalind_fib.txt')
n, k

(32, 2)

In [38]:
s6(n, k)

1431655765

### Problem 7 - Mendel's First Law

Probability is the mathematical study of randomly occurring phenomena. We will model such a phenomenon with a random variable, which is simply a variable that can take a number of different distinct outcomes depending on the result of an underlying random process.

For example, say that we have a bag containing 3 red balls and 2 blue balls. If we let $X$
represent the random variable corresponding to the color of a drawn ball, then the probability of each of the two outcomes is given by $Pr(X=red)= \frac{3}{5}$ and $Pr(X=blue)= \frac{2}{5}$.

Random variables can be combined to yield new random variables. Returning to the ball example, let $Y$
model the color of a second ball drawn from the bag (without replacing the first ball). The probability of Y being red depends on whether the first ball was red or blue. To represent all outcomes of $X$ and $Y$, we therefore use a probability tree diagram. This branching diagram represents all possible individual probabilities for $X$ and $Y$, with outcomes at the endpoints ("leaves") of the tree. The probability of any outcome is given by the product of probabilities along the path from the beginning of the tree; see Figure 2 for an illustrative example.

An event is simply a collection of outcomes. Because outcomes are distinct, the probability of an event can be written as the sum of the probabilities of its constituent outcomes. For our colored ball example, let $A$
be the event "$Y$ is blue." $Pr(A)$ is equal to the sum of the probabilities of two different outcomes: $Pr(X=blue$ and $Y=blue)$ + $Pr(X=red$ and $Y=blue)$, or $\frac{3}{10}+\frac{1}{10}=\frac{2}{5}$.

Given: Three positive integers $k$, $m$, and $n$, representing a population containing k+m+n organisms: k individuals are homozygous dominant for a factor, m are heterozygous, and n are homozygous recessive.

Return: The probability that two randomly selected mating organisms will produce an individual possessing a dominant allele (and thus displaying the dominant phenotype). Assume that any two organisms can mate.

Sample Dataset:

2 2 2

Sample Output:

0.78333


### Solution 7

In [41]:
from itertools import product

def s7(a, b, c):
    population = (['AA'] * a) + (['Aa'] * b) + (['aa'] * c)

    all_children = []
    for parent1 in population:
        # remove selected parent from population.
        chosen = population[:]
        chosen.remove(parent1)
        for parent2 in chosen:
            # get all possible children from 2 parents. Punnet square
            children = product(parent1, parent2)
            all_children.extend([''.join(child) for child in children])
    dominants = filter(lambda c: 'A' in c, all_children)
    return float(len(list(dominants))) / len(all_children)

In [42]:
k = 2
m = 2
n = 2

s7(k,m,n)

0.7833333333333333

In [45]:
k, m, n = parse_numbers(DATA_PATH + 'rosalind_iprb.txt')
k, m, n

(28, 16, 17)

In [46]:
s7(k,m,n)

0.8349726775956284

### Problem 8 - Translating RNA into Protein

The 20 commonly occurring amino acids are abbreviated by using 20 letters from the English alphabet (all letters except for B, J, O, U, X, and Z). Protein strings are constructed from these 20 symbols. Henceforth, the term genetic string will incorporate protein strings along with DNA strings and RNA strings.

The RNA codon table dictates the details regarding the encoding of specific codons into the amino acid alphabet.

Given: An RNA string s corresponding to a strand of mRNA (of length at most 10 kbp).

Return: The protein string encoded by s.

Sample Dataset:

AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA

Sample Output:

MAMAPRTEINSTRING


### Solution 8:

In [28]:
# RNA codon table
rna_codon = {"UUU" : "F", "CUU" : "L", "AUU" : "I", "GUU" : "V",
           "UUC" : "F", "CUC" : "L", "AUC" : "I", "GUC" : "V",
           "UUA" : "L", "CUA" : "L", "AUA" : "I", "GUA" : "V",
           "UUG" : "L", "CUG" : "L", "AUG" : "M", "GUG" : "V",
           "UCU" : "S", "CCU" : "P", "ACU" : "T", "GCU" : "A",
           "UCC" : "S", "CCC" : "P", "ACC" : "T", "GCC" : "A",
           "UCA" : "S", "CCA" : "P", "ACA" : "T", "GCA" : "A",
           "UCG" : "S", "CCG" : "P", "ACG" : "T", "GCG" : "A",
           "UAU" : "Y", "CAU" : "H", "AAU" : "N", "GAU" : "D",
           "UAC" : "Y", "CAC" : "H", "AAC" : "N", "GAC" : "D",
           "UAA" : "STOP", "CAA" : "Q", "AAA" : "K", "GAA" : "E",
           "UAG" : "STOP", "CAG" : "Q", "AAG" : "K", "GAG" : "E",
           "UGU" : "C", "CGU" : "R", "AGU" : "S", "GGU" : "G",
           "UGC" : "C", "CGC" : "R", "AGC" : "S", "GGC" : "G",
           "UGA" : "STOP", "CGA" : "R", "AGA" : "R", "GGA" : "G",
           "UGG" : "W", "CGG" : "R", "AGG" : "R", "GGG" : "G" 
           }

In [29]:
rna_string =  'AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA'

In [30]:
def s8(rna):
    protein_string = ""
    for i in range(0, len(rna)-(3+len(rna)%3), 3):
        if rna_codon[rna[i:i+3]] == "STOP" :
            break
        protein_string += rna_codon[rna[i:i+3]]
    return protein_string

In [31]:
s8(rna_string)

'MAMAPRTEINSTRING'

In [32]:
file = open(DATA_PATH + 'rosalind_prot.txt', 'r')
rna = file.read()
s8(rna)

'MLTAYHERAYKFVENIELKEQMCHAETPSILVVKRATVTCTVCVCVNSEPIAPTLTSPDKGYSFSADFVVKSILTPIIIDTNSDVLDRPFSSEQASRQAVERTLRCFQGAFLYGRISHGLSVHTRGFARTLYGYMCALLNTYLQVSRLTNGKFTTSFVRVVATLSHFIIFSVRSALQNTTRTQGVANWTFSKSSAVTVTSCYTTTSGHAFVPRGYIKGGVASCPSVEKQFSGRRRILPTYRRRGSQGNIHSPPRRRCLSTIVTMTVKDPDHLGSHGAPRHLGEGPIAFIIKVAPCTHRSPRWQRRQSVQWTDIFRLKRRTQGTTAAQIRHSNMLDLHRMQSGTQNPTVSAGSNRWTKTFRLGGTRHIVIHGPDGASKLIKGEGRLMILGSPPAGRERKPRPIIMRSQCPGHTVLYRRSIAHHHGRLRLSIYRTRTAPKLARRCWWRRLTELMRQRKPASSVRDLPHHKHCLCICLQVCVQFVSGLASRNSLSSLSPCQRLCKRTIWLCILRKSFITYSKRPPEPIFVFCLDSSIFRQADSSSTYCHATFSRCELMVDDKRILNIRVATAACKLMSYVRRHLPPPPLFRRGLTVRFDGASTPASLLLNRTDASGWARPTWIDGGVFSELPTVPHSGTSSAAGWLSNDQALRRGIGELSGDTHNSAKRRGTTCSDGDTRVTDQSTAVATHHVNWQHLTPYFGCPPLTCRGRLVGQRNKLHPASYTDASSNALGCWYDQCAVPIVASVRCASMVLESFAVMIDHGVTQDRTGETEDATLDLSISQGEGYRPRNSYVPIRGRNVSTLHIETPSYQEVPQPPRIHYKRAIYIQRNSIIARIGLTHRSIGSRRYSPKWMNLLTNTEGDTLCTEIKSPWRSSHSVVLVYPDATVCCTNFEGFFADSGHNPCVLSNAGLSVATPKPQISLQVNAHTDTSVVIPLTTGAPLEGAPNPVGVHTEDDQSFSLNYPVCEHSNGTWNLPSSLLRGVHQSQCSLSISCAFNAL

### Problem 9 - Finding a Motif in DNA

Given two strings $s$ and $t$, $t$ is a substring of $s$ if t is contained as a contiguous collection of symbols in $s$ (as a result, $t$ must be no longer than $s$).

The position of a symbol in a string is the total number of symbols found to its left, including itself (e.g., the positions of all occurrences of 'U' in "AUGCUUCAGAAAGGUCUUACG" are 2, 5, 6, 15, 17, and 18). The symbol at position i
of $s$ is denoted by $s$.

A substring of s can be represented as $s$, where $j$ and $k$ represent the starting and ending positions of the substring in $s$; for example, if $s$ = "AUGCUUCAGAAAGGUCUUACG", then $s$ = "UGCU".

The location of a substring $s$ is its beginning position j; note that $t$ will have multiple locations in $s$ if it occurs more than once as a substring of $s$ (see the Sample below).

Given: Two DNA strings $s$ and $t$(each of length at most 1 kbp).

Return: All locations of $t$ as a substring of $s$.

Sample Dataset:

GATATATGCATATACTT

ATAT

Sample Output:

2 4 10


### Solution 9

In [33]:
def s9(a, b):
    result = []
    for i in range(0, len(a) - len(b) + 1):
        if a[i:i+len(b)] == b:
            result.append(i+1)        
    for i in result:
        print(i, end=" ")

In [34]:
s1= 'GATATATGCATATACTT'

s2= 'ATAT'

In [35]:
s9(s1, s2)

2 4 10 

In [36]:
sequences = open(DATA_PATH + 'rosalind_subs.txt', 'r')
seq1 = sequences.readline().rstrip()
seq2 = sequences.readline().rstrip()

In [37]:
s9(seq1, seq2)

1 8 35 64 71 108 142 215 231 238 245 326 371 378 385 402 533 540 589 619 659 687 704 720 736 765 780 

### Problem 10 - Consensus and Profile

A matrix is a rectangular table of values divided into rows and columns. An $m×n$ matrix has m rows and n columns. Given a matrix $A$, we write Ai,j to indicate the value found at the intersection of row $i$ and column $j$.

Say that we have a collection of DNA strings, all having the same length $n$. Their profile matrix is a $4×n$ matrix $P$ in which $P_{1,j}$ represents the number of times that 'A' occurs in the jth position of one of the strings, $P_{2,j}$ represents the number of times that C occurs in the $j$th position, and so on (see below).

A consensus string $c$ is a string of length $n$ formed from our collection by taking the most common symbol at each position; the $j$-th symbol of $c$ therefore corresponds to the symbol having the maximum value in the $j$-th column of the profile matrix. Of course, there may be more than one most common symbol, leading to multiple possible consensus strings.

Given: A collection of at most 10 DNA strings of equal length (at most 1 kbp) in FASTA format.

Return: A consensus string and profile matrix for the collection. (If several possible consensus strings exist, then you may return any one of them.)

Sample Dataset:

$\gt$Rosalind_1

ATCCAGCT

$\gt$Rosalind_2

GGGCAACT

$\gt$Rosalind_3

ATGGATCT

$\gt$Rosalind_4

AAGCAACC

$\gt$Rosalind_5

TTGGAACT

$\gt$Rosalind_6

ATGCCATT

$\gt$Rosalind_7

ATGGCACT

Sample Output:

ATGCAACT

A: 5 1 0 0 5 5 0 0

C: 0 0 1 4 2 0 6 1

G: 1 1 6 3 0 1 0 0

T: 1 5 0 0 0 1 1 6

### Solution - 10:

In [38]:
dic = {'Rosalind_1':'ATCCAGCT',
       'Rosalind_2':'GGGCAACT',
       'Rosalind_3':'ATGGATCT',
       'Rosalind_4':'AAGCAACC',
       'Rosalind_5':'TTGGAACT',
       'Rosalind_6':'ATGCCATT',
       'Rosalind_7':'ATGGCACT'}

In [39]:
def s10(dictionary):
    #bases = ['A', 'C', 'G', 'T']
    strings = list(dictionary.values())
    strings2 = []
    for i in range(len(strings[0])):
        for a in range(len(strings)):
            strings2.append(strings[a][i])
    strings3 = []
    strings3 = [strings2[x:x+len(strings)] for x in range(0, len(strings2), len(strings))]
    strings4 = []
    for a in bases:
        for i in (strings3):
            strings4.append(i.count(a))
    chunks = [strings4[(i*len(strings4))//4:((i+1)*len(strings4))//4] for i in range(4)]
    for i in range(len(strings3)):
        print(sorted([ss for ss in set(strings3[i]) if len(ss)>0 and ss.istitle()], 
                   key=strings3[i].count, 
                   reverse=True)[0], end = '')
    print("")
    for a, b in zip(bases, range(4)):
        print(a + ": " + " ".join([str(x) for x in chunks[b]]))

In [40]:
s10(dic)

ATGCAACT
A: 5 1 0 0 5 5 0 0
C: 0 0 1 4 2 0 6 1
G: 1 1 6 3 0 1 0 0
T: 1 5 0 0 0 1 1 6


In [41]:
FASTA_file = parse_sequences(DATA_PATH + 'rosalind_cons.txt')

In [42]:
s10(FASTA_file)

TGACATAAACGAGCGCGATTATAACCTATTCTTAGCTGCTATGATATCCCTATGCAATTCCTATTACTACGCTAAAGATTTGGTAATTTTCTTCGGGAATTTGACAATGATTGTGTATCGCCGGTGTTTGAGAATTCGTTATGGCCGGTTGGTAACCCTGATTGTCGTATATGTTGCGCTCTAGTATGCGGGTACGAACGTCACATCAATGCTTCTTGAAAACATGCTGTTTCTTAACTCACCCCGCTATGTTTAATTACCTAGTTAGATAAGTAGATAGAAGGGGCGAGCCTATGCGGCATTCTAACTTTAGTTGCGATTTTCTTCTGGGAGTAGTTGTACTTGTAGGATGCACAACATTGCACAGACCTGGATATACATTAATGTGCTTCTTATACTGTGGCTGCCTTTTAGCGGGGGAGCGGGATCGTTCTCATAGACACTACATGAGCCTGGGGGATATGTCTTCGCGTATTCCGTATGTGTATGGGCGTGGACAATATCTTCGTACCGTTTAAAACGAGTCCCTAACGCTGTTATCCTGAAGGATGGTGTATATCAAGACGGAACTCTGATTGGAGCAATGATGGCAATTTGTTGGGAGAGAAACTTGAATGGTCGCTATGTTGTGTTTGTGTGATAGGTGGCGTCTTATGAGCTGAGAGCTCGCCGGTCATGGATGGTGCAACAGATTGTTAGTGTCGTGATTGGATAATAGTCGACCTCTATCCCTTGTCCGAAGAAATTAGTTCGGGTTTCACGCACTTCAACAGTTAGCCCGTTTAGAGTTGGATTTCGACACATGTCTCATTCTTCTGTTAGTAACTTCTTATAAGCTTTTTATCATCTTATCGGAGGCGGTGAAAGGCCTTACGATCATCAAGGTTTAGGGTCCTTTAGCTG
A: 3 2 5 3 4 3 4 4 4 2 2 4 1 3 3 2 3 4 3 1 3 2 4 4 1 0 1 3 2 1 2 3 1 5 0 1 1 3 2 2 4 2 2 4 3 4 1