In [12]:
import re
import numpy as np

In [13]:
def read_fasta(file_in):
    """Parse a FASTA file into parallel lists of record names and sequences.

    A line starting with '>' opens a new record; the text after the '>' is
    the record name. Every following line up to the next header is
    concatenated into that record's sequence. Lines that appear before the
    first header are discarded.

    Args:
        file_in: Path of the FASTA file to read.

    Returns:
        A tuple ``(list_name, list_dna)`` in which ``list_dna[i]`` is the
        sequence that belongs to ``list_name[i]``.
    """
    list_name = []
    list_dna = []
    chunks = None  # pieces of the current record; None until a header is seen
    with open(file_in, 'r') as f:
        for raw_line in f:
            # Remove only the line terminator. Slicing off the last character
            # unconditionally would eat a nucleotide when the final line of
            # the file has no trailing newline.
            line = raw_line.rstrip('\r\n')
            if line.startswith('>'):
                if chunks is not None:
                    list_dna.append(''.join(chunks))
                chunks = []
                list_name.append(line[1:])
            elif chunks is not None:
                chunks.append(line)
    if chunks is not None:
        list_dna.append(''.join(chunks))
    return (list_name, list_dna)

## Counting DNA Nucleotides

A string is simply an ordered collection of symbols selected from some alphabet and formed into a word; the length of a string is the number of symbols that it contains.

An example of a length 21 DNA string (whose alphabet contains the symbols 'A', 'C', 'G', and 'T') is "ATGCTTCAGAAAGGTCTTACG".

Given: A DNA string s of length at most 1000 nt.

Return: Four integers (separated by spaces) counting the respective number of times that the symbols 'A', 'C', 'G', and 'T' occur in s.

In [55]:
def count_nucleotides(st_dna):
    """Count the occurrences of each DNA nucleotide in a string.

    Args:
        st_dna: DNA string to inspect. Characters other than 'A', 'C', 'G'
            and 'T' (for example a trailing newline) are not counted.

    Returns:
        A list of four integers: the counts of 'A', 'C', 'G' and 'T', in
        that order.
    """
    return [st_dna.count(symbol) for symbol in 'ACGT']

In [56]:
def counting_dna_nucleotides(file_in, file_out):
    """Write the 'A', 'C', 'G' and 'T' counts of a DNA string to a file.

    The first line of ``file_in`` is taken as the DNA string. The four
    counts returned by ``count_nucleotides`` are written to ``file_out``
    separated by single spaces, with no trailing space or newline.

    Args:
        file_in: Path of the input file holding the DNA string.
        file_out: Path of the output file (overwritten).
    """
    with open(file_in, 'r') as f:
        st_dna = f.readline()
    count = count_nucleotides(st_dna)
    with open(file_out, 'w') as f:
        # Separate by position. Deciding on the separator by comparing each
        # value with the last one drops spaces whenever an earlier count
        # happens to equal the final count.
        f.write(' '.join(str(value) for value in count))

## Transcribing DNA into RNA

An RNA string is a string formed from the alphabet containing 'A', 'C', 'G', and 'U'.

Given a DNA string t corresponding to a coding strand, its transcribed RNA string u is formed by replacing all occurrences of 'T' in t with 'U' in u.

Given: A DNA string t having length at most 1000 nt.

Return: The transcribed RNA string of t.

In [53]:
def dna_into_rna(st_dna):
    """Transcribe a DNA string into RNA.

    Args:
        st_dna: DNA string of the coding strand.

    Returns:
        A copy of ``st_dna`` in which every 'T' has been replaced by 'U';
        all other characters are left as they are.
    """
    st_rna = st_dna.replace('T', 'U')
    return st_rna

In [54]:
def transcribing_dns_into_rna(file_in, file_out):
    """Transcribe the DNA string stored in a file and save the RNA string.

    Only the first line of ``file_in`` is read. It is passed through
    ``dna_into_rna`` and the result, with leading and trailing newline
    characters removed, is written to ``file_out``.

    Args:
        file_in: Path of the input file holding the DNA string.
        file_out: Path of the output file (overwritten).
    """
    with open(file_in, 'r') as source:
        first_line = source.readline()
    transcript = dna_into_rna(first_line)
    with open(file_out, 'w') as target:
        target.write(transcript.strip('\n'))

## Complementing a Strand of DNA

In DNA strings, symbols 'A' and 'T' are complements of each other, as are 'C' and 'G'.

The reverse complement of a DNA string s is the string $s^c$ formed by reversing the symbols of s, then taking the complement of each symbol (e.g., the reverse complement of "GTCA" is "TGAC").

Given: A DNA string s of length at most 1000 bp.

Return: The reverse complement $s^c$ of s.

In [30]:
def revers_compl_dna(file_in, file_out):
    """Write the reverse complement of a DNA string to a file.

    The first line of ``file_in`` is taken as the DNA string. Each 'A' is
    swapped with 'T' and each 'C' with 'G', the result is reversed, and the
    string is written to ``file_out`` without a trailing newline.
    Characters other than 'A', 'C', 'G' and 'T' are passed through
    unchanged.

    Args:
        file_in: Path of the input file holding the DNA string.
        file_out: Path of the output file (overwritten).
    """
    with open(file_in, 'r') as f:
        st_dna = f.readline().rstrip('\r\n')
    # A single translation pass swaps all four symbols at once, so no
    # temporary placeholder letters are needed (placeholders would corrupt
    # any input that already contains them, e.g. the ambiguity code 'N').
    complement = str.maketrans('ACGT', 'TGCA')
    with open(file_out, 'w') as f:
        f.write(st_dna.translate(complement)[::-1])

## Computing GC Content 

The GC-content of a DNA string is given by the percentage of symbols in the string that are 'C' or 'G'. For example, the GC-content of "AGCTATAG" is 37.5%. Note that the reverse complement of any DNA string has the same GC-content.

DNA strings must be labeled when they are consolidated into a database. A commonly used method of string labeling is called FASTA format. In this format, the string is introduced by a line that begins with '>', followed by some labeling information. Subsequent lines contain the string itself; the first line to begin with '>' indicates the label of the next string.

In Rosalind's implementation, a string in FASTA format will be labeled by the ID "Rosalind_xxxx", where "xxxx" denotes a four-digit code between 0000 and 9999.

Given: At most 10 DNA strings in FASTA format (of length at most 1 kbp each).

Return: The ID of the string having the highest GC-content, followed by the GC-content of that string. Rosalind allows for a default error of 0.001 in all decimal answers unless otherwise stated; please see the note on absolute error below.

In [6]:
def percenCG(st_dna):
    """Return the GC-content of a DNA string as a percentage.

    Args:
        st_dna: DNA string to inspect.

    Returns:
        The share of characters in ``st_dna`` that are 'C' or 'G', as a
        float between 0 and 100. An empty string yields 0.0.
    """
    n = len(st_dna)
    if n == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    m = st_dna.count('C') + st_dna.count('G')
    return m * 100. / n

def top_percenCG(file_in, file_out):
    """Find the FASTA record with the highest GC-content and save it.

    The records of ``file_in`` are read with ``read_fasta`` and scored with
    ``percenCG``. The name of the best record and its GC percentage are
    written to ``file_out`` on two lines. When several records share the
    highest score, the one that comes first in the file is reported.

    Args:
        file_in: Path of the FASTA input file.
        file_out: Path of the output file (overwritten).

    Raises:
        IndexError: If the input file contains no records.
    """
    names, sequences = read_fasta(file_in)
    best_index = 0
    best_score = percenCG(sequences[0])
    for index in range(1, len(names)):
        score = percenCG(sequences[index])
        # Strictly greater, so an earlier record keeps the lead on a tie.
        if score > best_score:
            best_index, best_score = index, score
    report = names[best_index] + '\n' + str(best_score)
    with open(file_out, 'w') as out:
        out.write(report)

## Finding a Motif in DNA

Given two strings s and t, t is a substring of s if t is contained as a contiguous collection of symbols in s (as a result, t must be no longer than s).

The position of a symbol in a string is the total number of symbols found to its left, including itself (e.g., the positions of all occurrences of 'U' in "AUGCUUCAGAAAGGUCUUACG" are 2, 5, 6, 15, 17, and 18). The symbol at position i of s is denoted by s[i].

A substring of s can be represented as s[j:k], where j and k represent the starting and ending positions of the substring in s; for example, if s = "AUGCUUCAGAAAGGUCUUACG", then s[2:5] = "UGCU".

The location of a substring s[j:k] is its beginning position j; note that t will have multiple locations in s if it occurs more than once as a substring of s (see the Sample below).

Given: Two DNA strings s and t (each of length at most 1 kbp).

Return: All locations of t as a substring of s.

In [60]:
def finding_motif_in_dna(file_in, file_out):
    """Write every location of a motif inside a DNA string to a file.

    The first line of ``file_in`` is the string ``s`` to search and the
    second line is the motif ``t``. The 1-based start positions of all
    occurrences of ``t`` in ``s`` (overlapping ones included) are written
    to ``file_out`` in increasing order, separated by single spaces. If
    the motif is empty or does not occur, the output file is left empty.

    Args:
        file_in: Path of the input file holding ``s`` and ``t``.
        file_out: Path of the output file (overwritten).
    """
    with open(file_in, 'r') as f:
        # Strip only line terminators; the last line of the file often has
        # none, and slicing off one character would shorten the motif.
        s = f.readline().rstrip('\r\n')
        t = f.readline().rstrip('\r\n')
    locations = []
    if t:  # an empty motif would match at every index
        start = s.find(t)
        while start != -1:
            locations.append(start + 1)  # convert to 1-based position
            # Resume one character further so overlapping hits are found.
            start = s.find(t, start + 1)
    with open(file_out, 'w') as f:
        f.write(' '.join(str(position) for position in locations))

## Consensus and Profile

A matrix is a rectangular table of values divided into rows and columns. An m×n matrix has m rows and n columns. Given a matrix A, we write $A_{i,j}$ to indicate the value found at the intersection of row i and column j.

Say that we have a collection of DNA strings, all having the same length n. Their profile matrix is a 4×n matrix P in which $P_{1,j}$ represents the number of times that 'A' occurs in the jth position of one of the strings, $P_{2,j}$ represents the number of times that 'C' occurs in the jth position, and so on (see below).

A consensus string c is a string of length n formed from our collection by taking the most common symbol at each position; the jth symbol of c therefore corresponds to the symbol having the maximum value in the j-th column of the profile matrix. Of course, there may be more than one most common symbol, leading to multiple possible consensus strings.

Given: A collection of at most 10 DNA strings of equal length (at most 1 kbp) in FASTA format.

Return: A consensus string and profile matrix for the collection. (If several possible consensus strings exist, then you may return any one of them.)

In [51]:
def consensus_profile(file_in, file_out):
    """Compute a consensus string and profile matrix for a FASTA file.

    All records of ``file_in`` are expected to have the same length n as
    the first record. The profile holds, for each of 'A', 'C', 'G' and 'T',
    how often that symbol occurs at each of the n positions. The consensus
    takes the most frequent symbol at each position; ties go to the symbol
    that comes first in the order A, C, G, T.

    ``file_out`` receives the consensus on the first line, followed by one
    line per symbol of the form ``A: 5 1 0 `` (each count is followed by a
    single space).

    Args:
        file_in: Path of the FASTA input file.
        file_out: Path of the output file (overwritten).

    Raises:
        IndexError: If the file has no records, or a record is shorter
            than the first one.
        KeyError: If a record contains a symbol other than A, C, G or T
            within its first n positions.
    """
    _, list_dna = read_fasta(file_in)

    n = len(list_dna[0])
    # Plain Python integers cannot overflow; fixed-width 8-bit counters
    # would wrap around silently once more than 127 sequences are counted.
    profile = {symbol: [0] * n for symbol in 'ACGT'}

    for line in list_dna:
        for i in range(n):
            profile[line[i]][i] += 1

    # max() returns the first maximal key in dict order (A, C, G, T).
    consensus = ''.join(
        max(profile, key=lambda symbol: profile[symbol][i]) for i in range(n)
    )

    with open(file_out, 'w') as f:
        f.write(consensus + '\n')
        for symbol, counts in profile.items():
            row = ''.join(str(val) + ' ' for val in counts)
            f.write(symbol + ': ' + row + '\n')

## RNA Splicing

After identifying the exons and introns of an RNA string, we only need to delete the introns and concatenate the exons to form a new string ready for translation.

Given: A DNA string s (of length at most 1 kbp) and a collection of substrings of s acting as introns. All strings are given in FASTA format.

Return: A protein string resulting from transcribing and translating the exons of s. (Note: Only one solution will exist for the dataset provided.)

In [62]:
# Codon table used by rna_to_protein: maps each three-letter RNA codon over
# the alphabet A, C, G, U (64 entries) to a one-letter amino-acid code.
# The three terminating codons UAA, UAG and UGA map to the string 'Stop'.
rnaToProtein = {'UUU': 'F', 'CUU': 'L', 'AUU': 'I', 'GUU': 'V',
'UUC': 'F',      'CUC': 'L',      'AUC': 'I',      'GUC': 'V',
'UUA': 'L',     'CUA': 'L',      'AUA': 'I',      'GUA': 'V',
'UUG': 'L',     'CUG': 'L',      'AUG': 'M',      'GUG': 'V',
'UCU': 'S',     'CCU': 'P',      'ACU': 'T',      'GCU': 'A',
'UCC': 'S',      'CCC': 'P',      'ACC': 'T',      'GCC': 'A',
'UCA': 'S',      'CCA': 'P',      'ACA': 'T',      'GCA': 'A',
'UCG': 'S',      'CCG': 'P',      'ACG': 'T',      'GCG': 'A',
'UAU': 'Y',      'CAU': 'H',      'AAU': 'N',      'GAU': 'D',
'UAC': 'Y',      'CAC': 'H',      'AAC': 'N',      'GAC': 'D',
'UAA': 'Stop',   'CAA': 'Q',      'AAA': 'K',      'GAA': 'E',
'UAG': 'Stop',   'CAG': 'Q',      'AAG': 'K',      'GAG': 'E',
'UGU': 'C',      'CGU': 'R',      'AGU': 'S',      'GGU': 'G',
'UGC': 'C',      'CGC': 'R',      'AGC': 'S',      'GGC': 'G',
'UGA': 'Stop',   'CGA': 'R',      'AGA': 'R',      'GGA': 'G',
'UGG': 'W',      'CGG': 'R',      'AGG': 'R',      'GGG': 'G' }

In [66]:
def rna_to_protein(st_rna):
    """Translate an RNA string into a protein string.

    Codons are read three characters at a time starting at index 0 and
    looked up in ``rnaToProtein``. Translation ends at the first codon
    that maps to 'Stop' or at the end of the string, whichever comes
    first. One or two leftover characters at the end that do not form a
    complete codon are ignored.

    Args:
        st_rna: RNA string to translate.

    Returns:
        The concatenated one-letter amino-acid codes.

    Raises:
        KeyError: If a complete codon is not present in ``rnaToProtein``.
    """
    amino_acids = []
    # Stop at len - 2 so that only full three-character codons are looked up.
    for start in range(0, len(st_rna) - 2, 3):
        amino_acid = rnaToProtein[st_rna[start:start + 3]]
        if amino_acid == 'Stop':
            break
        amino_acids.append(amino_acid)
    return ''.join(amino_acids)

In [71]:
def rna_splicing(file_in, file_out):
    """Remove introns from a DNA string and write the translated protein.

    The first FASTA record of ``file_in`` is the DNA string; every further
    record is an intron. Introns are processed in file order, and each one
    is cut out repeatedly until it no longer occurs in the string. The
    remaining string is transcribed with ``dna_into_rna``, translated with
    ``rna_to_protein`` and written to ``file_out``.

    Args:
        file_in: Path of the FASTA input file.
        file_out: Path of the output file (overwritten).

    Raises:
        IndexError: If the input file contains no records.
    """
    _, list_dna = read_fasta(file_in)
    st_dna = list_dna[0]
    introns = list_dna[1:]

    for intron in introns:
        if not intron:
            # An empty string is contained in every string, so trying to
            # remove it would never terminate.
            continue
        while intron in st_dna:
            st_dna = st_dna.replace(intron, '', 1)
    st_rna = dna_into_rna(st_dna)
    st_protein = rna_to_protein(st_rna)
    with open(file_out, 'w') as f:
        f.write(st_protein)

## Finding a Shared Motif

A common substring of a collection of strings is a substring of every member of the collection. We say that a common substring is a longest common substring if there does not exist a longer common substring. For example, "CG" is a common substring of "ACGTACGT" and "AACCGTATA", but it is not as long as possible; in this case, "CGTA" is a longest common substring of "ACGTACGT" and "AACCGTATA".

Note that the longest common substring is not necessarily unique; for a simple example, "AA" and "CC" are both longest common substrings of "AACC" and "CCAA".

Given: A collection of k (k≤100) DNA strings of length at most 1 kbp each in FASTA format.

Return: A longest common substring of the collection. (If multiple solutions exist, you may return any single solution.)

In [23]:
def _longest_common_substring(strings):
    """Return a longest substring shared by all strings ('' if none)."""
    if not strings:
        return ''
    shortest = min(strings, key=len)
    others = list(strings)
    others.remove(shortest)
    n = len(shortest)
    # Try candidate lengths from longest to shortest, down to a single
    # character, so the first hit is a longest common substring.
    for length in range(n, 0, -1):
        for start in range(n - length + 1):
            candidate = shortest[start:start + length]
            if all(candidate in other for other in others):
                return candidate
    return ''


def finding_shared_motif(file_in, file_out):
    """Write a longest common substring of all FASTA records to a file.

    Candidates are taken from the shortest record (a common substring
    cannot be longer than it) and tested against every other record. If
    several longest common substrings exist, the one found first is
    written. If the records share no character at all, or the file holds
    no records, the output file is left empty.

    Args:
        file_in: Path of the FASTA input file.
        file_out: Path of the output file (overwritten).
    """
    _, list_dna = read_fasta(file_in)
    with open(file_out, 'w') as f:
        f.write(_longest_common_substring(list_dna))