### Rosalind RNA Splicing
Given: A DNA string s (of length at most 1 kbp) and a collection of substrings of s acting as introns. All strings are given in FASTA format.

Return: A protein string resulting from transcribing and translating the exons of s. (Note: Only one solution will exist for the dataset provided.)

In [1]:

# RNA codon -> one-letter amino acid lookup (64 entries).
# The three stop codons (UAA, UAG, UGA) map to a single space " ".
#
# Keys are generated with the second base as the outermost loop, the third
# base next and the first base innermost, each running over U, C, A, G.
# The residue string below is written in that same order: one row per second
# base, and inside a row one group of four letters per third base.
codons = dict(zip(
    (
        first + second + third
        for second in "UCAG"
        for third in "UCAG"
        for first in "UCAG"
    ),
    "FLIVFLIVLLIVLLMV"    # second base U
    "SPTASPTASPTASPTA"    # second base C
    "YHNDYHND QKE QKE"    # second base A (UAA and UAG are stops)
    "CRSGCRSG RRGWRRG",   # second base G (UGA is a stop)
))


    

In [2]:
def read_fasta(file_path):
    """Read a FASTA file into a dict of ``{header: sequence}``.

    A line starting with ">" opens a new record; the text after the ">"
    (whitespace-stripped) becomes the key. Every other line is stripped and
    appended to the current record, so multi-line sequences are joined into
    one string. Lines that appear before the first header are discarded, and
    a repeated header overwrites the earlier record.

    Parameters
    ----------
    file_path : path of the FASTA file to open in text mode.

    Returns
    -------
    dict mapping each header to its concatenated sequence, in file order.
    """
    records = {}
    header = None
    pieces = []
    with open(file_path, "r") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text.startswith(">"):
                pieces.append(text)
                continue
            # A new header closes the record collected so far (if any).
            if header is not None:
                records[header] = "".join(pieces)
            header, pieces = text[1:], []
    # The last record has no following header to close it.
    if header is not None:
        records[header] = "".join(pieces)
    return records

In [3]:
# Input DNA string, written as adjacent literals split at the intron
# boundaries so the two introns are easy to locate inside the sequence.
Rosalind = (
    "ATGGTCTACATAGCTGACAAACAGCACGTAGCA"  # exon
    "ATCGGTCGAA"                         # intron 1
    "TCTCGAGAGGCATATGGTCACATG"           # exon
    "ATCGGTCGAGCGTGT"                    # intron 2
    "TTCAAAGTTTGCGCCTAG"                 # exon
)
# Substrings of the DNA string to be removed before transcription.
introns = [
    "ATCGGTCGAA",
    "ATCGGTCGAGCGTGT",
]

In [4]:
# Splice: remove every occurrence of each intron from the DNA string so that
# only the concatenated exons remain.
exons = Rosalind
for intron in introns:
    exons = exons.replace(intron, "")
print(exons)
print(len(Rosalind))
print(len(exons))

# Transcribe the spliced DNA into mRNA (thymine -> uracil).
mrna = exons.replace('T', 'U')
print(mrna)

# Translation starts at the first start codon (AUG).
start_index = mrna.find("AUG")
if start_index == -1:
    # str.find returns -1 when nothing matches; slicing with -1 would keep
    # only the final base and pass a meaningless sequence to the next cells,
    # so stop here instead of continuing.
    raise ValueError('no start codon found')

# Drop anything upstream of the start codon so the reading frame begins at AUG.
mrna = mrna[start_index:]

ATGGTCTACATAGCTGACAAACAGCACGTAGCATCTCGAGAGGCATATGGTCACATGTTCAAAGTTTGCGCCTAG
100
75
AUGGUCUACAUAGCUGACAAACAGCACGUAGCAUCUCGAGAGGCAUAUGGUCACAUGUUCAAAGUUUGCGCCUAG


In [5]:
# Cut the mRNA into consecutive three-base codons, stopping after the first
# stop codon (which is kept as the last entry). A trailing fragment shorter
# than three bases is ignored: the range ends at len(mrna) - 2 so that every
# slice taken is a full codon.
reading_frame = []
for i in range(0, len(mrna) - 2, 3):
    codon = mrna[i:i+3]
    reading_frame.append(codon)
    if codon in ('UAA', 'UAG', 'UGA'):
        break

# Joining once at the end never produces a leading separator, even when the
# very first codon is a stop codon, so every space-separated field of
# `triple` is a real codon.
triple = " ".join(reading_frame)
print(triple)

AUG GUC UAC AUA GCU GAC AAA CAG CAC GUA GCA UCU CGA GAG GCA UAU GGU CAC AUG UUC AAA GUU UGC GCC UAG


In [6]:
# Translate the space-separated codons in `triple` into a protein string.
residues = []
# split() with no argument yields no fields for an empty string and ignores
# stray leading/trailing spaces, so an empty `triple` gives an empty protein
# instead of a failed lookup of the key ''.
for key in triple.split():
    residue = codons[key]
    if residue == " ":
        # The codon table marks stop codons with a space; translation ends
        # here and the marker is not part of the protein.
        break
    residues.append(residue)
protein = "".join(residues)
print(protein)


MVYIADKQHVASREAYGHMFKVCA 


'MVYIADKQHVASREAYGHMFKVCA '