# Project: DNA to protein conversion

In [24]:
def separate_sections(filename):
    """Read a file and split its DNA content into one string per section.

    Every line is stripped of surrounding whitespace and the lines belonging
    to one section are concatenated without separators. A line consisting
    solely of ``//`` closes the current section.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of section strings in file order. Non-empty text that follows
        the last ``//`` marker (or the whole file when it has no marker) is
        returned as a final section instead of being discarded.
    """
    sections = []
    current_lines = []
    with open(filename, "r") as file:
        for line in file:
            line = line.strip()
            if line == "//":
                sections.append("".join(current_lines))
                current_lines = []
            else:
                current_lines.append(line)
    # A file whose last section is not closed by "//" would otherwise lose
    # that section silently.
    trailing_section = "".join(current_lines)
    if trailing_section:
        sections.append(trailing_section)
    return sections

# Example usage: load the DNA sections stored in "output.txt" into the
# module-level list `sections`, which the rest of the notebook iterates over.
sections = separate_sections("output.txt")


First, we extract the genetic code from a file and we create a dictionary that we can use later.

In [25]:
def read_genetic_code(filename):
    """Load a codon table from a whitespace-separated text file.

    Each non-blank line must hold exactly two fields: a codon followed by the
    symbol of the amino acid it encodes.

    Args:
        filename: Path of the codon table file.

    Returns:
        A dict mapping each codon string to its amino-acid string. When a
        codon appears more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
    """
    genetic_code = {}
    with open(filename, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            fields = line.split()
            if not fields:
                # Blank lines (e.g. an empty line at the end of the file)
                # carry no mapping and must not abort the whole load.
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{filename}: line {line_number}: expected "
                    f"'<codon> <amino_acid>', got {line.strip()!r}"
                )
            codon, amino_acid = fields
            genetic_code[codon] = amino_acid
    return genetic_code

# Example usage: build the module-level codon table `genetic_code` from
# "genetic_code.txt"; translate_rna_to_protein() looks codons up in it.
genetic_code = read_genetic_code("genetic_code.txt")

Next, we define functions that transcribe the input DNA into the corresponding RNA. Then, we find the relevant parts of that sequence by locating the start codons in the RNA string.

In [26]:
def transcribe_dna_to_rna(dna_sequence):
    """Return the RNA form of *dna_sequence*: every 'T' becomes 'U'.

    All other characters are copied through unchanged.
    """
    thymine_to_uracil = str.maketrans('T', 'U')
    return dna_sequence.translate(thymine_to_uracil)
    

def find_start_codons_rna(rna_sequence):
    """Locate every occurrence of the start codon "AUG" in *rna_sequence*.

    Occurrences are searched at every position, so overlapping reading
    frames are all reported.

    Returns:
        The list of 0-based indexes at which "AUG" begins, in ascending
        order, or the integer -1 when the sequence contains no "AUG".
    """
    positions = [
        offset
        for offset in range(len(rna_sequence))
        if rna_sequence.startswith("AUG", offset)
    ]
    # Callers test for the -1 sentinel, so an empty result is never returned.
    return positions if positions else -1


Now that the indexes of the start codons have been obtained, we can start the translation of the protein from this value. In this part, the codons are identified, and the function searches for the amino acid linked to a respective codon in the dictionary of the genetic code previously created.

In [27]:
def translate_rna_to_protein(rna_sequence):
    """Translate the reading frame opened by the first "AUG" into a protein.

    Codons are looked up in the module-level ``genetic_code`` dictionary.
    Translation starts at the first start codon, proceeds three bases at a
    time, and stops at the first codon whose table entry is "*". Codons that
    are missing from the table are rendered as "X"; one or two leftover bases
    at the end of the sequence are ignored.

    Returns:
        The protein as a string of amino-acid symbols, or the message
        "Start codon not found" when the sequence contains no "AUG".
    """
    start_positions = find_start_codons_rna(rna_sequence)
    if start_positions == -1:  # sentinel: the sequence has no "AUG"
        return "Start codon not found"

    reading_frame = rna_sequence[start_positions[0]:]
    residues = []
    for offset in range(0, len(reading_frame) - 2, 3):
        residue = genetic_code.get(reading_frame[offset:offset + 3], "X")
        if residue == "*":
            break
        residues.append(residue)
    return "".join(residues)


To read all the proteins that can be translated from the other strand of DNA, we must first find the complementary strand. We also need the flip function below: because a sequence is only read in the 5' to 3' direction, the complementary RNA strand must be reversed before it can be read correctly.

In [28]:

def complementary_sequences(dna_sequence):
    """Return the base-by-base complement of *dna_sequence*.

    A pairs with T and C pairs with G; the order of the bases is kept (the
    result is not reversed).

    Raises:
        KeyError: If the sequence contains anything other than the
            upper-case letters A, T, C and G.
    """
    base_pairs = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    # Direct item lookup keeps the KeyError for unexpected characters.
    return ''.join(map(base_pairs.__getitem__, dna_sequence))


def flip_rna_sequence(rna_sequence):
    """Return *rna_sequence* with its characters in reverse order."""
    return "".join(reversed(rna_sequence))


The next function translates a strand once for every start codon it contains, so that every possible reading frame is covered.

In [29]:
def translate_rna_to_proteins_all_frames(rna_sequence):
    """Translate *rna_sequence* once for every "AUG" it contains.

    For each start-codon index reported by ``find_start_codons_rna`` the
    suffix of the sequence beginning at that index is passed to
    ``translate_rna_to_protein``.

    Returns:
        A list with one protein string per start codon, ordered by start
        position. The list is empty when the sequence has no start codon.
    """
    start_indexes = find_start_codons_rna(rna_sequence)
    # find_start_codons_rna reports "no start codon" as the integer -1;
    # iterating over that value would raise TypeError.
    if start_indexes == -1:
        return []
    return [
        translate_rna_to_protein(rna_sequence[start_index:])
        for start_index in start_indexes
    ]


The following function is used to transform the one letter representation of an amino acid into a three letter representation.

In [30]:

def translate_one_letter_to_three_letter_list(one_letter_sequences):
    """Convert one-letter protein strings to hyphen-joined three-letter codes.

    Args:
        one_letter_sequences: Iterable of protein strings written with
            one-letter amino-acid symbols.

    Returns:
        A list with one string per input sequence, in which every symbol is
        replaced by its three-letter code ("*" becomes "Stop", any symbol not
        in the table becomes "Unknown") and the codes are joined with "-".
    """
    names_by_symbol = {
        "A": "Ala",
        "C": "Cys",
        "D": "Asp",
        "E": "Glu",
        "F": "Phe",
        "G": "Gly",
        "H": "His",
        "I": "Ile",
        "K": "Lys",
        "L": "Leu",
        "M": "Met",
        "N": "Asn",
        "P": "Pro",
        "Q": "Gln",
        "R": "Arg",
        "S": "Ser",
        "T": "Thr",
        "V": "Val",
        "W": "Trp",
        "Y": "Tyr",
        "*": "Stop",
    }
    return [
        "-".join(names_by_symbol.get(symbol, "Unknown") for symbol in sequence)
        for sequence in one_letter_sequences
    ]


These functions format the final output printed by the code.

In [31]:

def print_proteins_in_frames1L53(protein_sequence_list):
    """Print each protein on its own line, labelled C5'-C3' / 1 letters.

    Frames are numbered from 1 in the order of *protein_sequence_list*.
    """
    frame_number = 0
    for sequence in protein_sequence_list:
        frame_number += 1
        print(f"Frame {frame_number} ,C5'-C3', 1 letters : {sequence}")

def print_proteins_in_frames3L53(protein_sequence_list):
    """Print each protein on its own line, labelled C5'-C3' / 3 letters.

    Frames are numbered from 1 in the order of *protein_sequence_list*.
    """
    frame_number = 0
    for sequence in protein_sequence_list:
        frame_number += 1
        print(f"Frame {frame_number} ,C5'-C3', 3 letters : {sequence}")

def print_proteins_in_frames1L35(protein_sequence_list):
    """Print each protein on its own line, labelled C3'-C5' / 1 letters.

    Frames are numbered from 1 in the order of *protein_sequence_list*.
    """
    frame_number = 0
    for sequence in protein_sequence_list:
        frame_number += 1
        print(f"Frame {frame_number} ,C3'-C5', 1 letters : {sequence}")

def print_proteins_in_frames3L35(protein_sequence_list):
    """Print each protein on its own line, labelled C3'-C5' / 3 letters.

    Frames are numbered from 1 in the order of *protein_sequence_list*.
    """
    frame_number = 0
    for sequence in protein_sequence_list:
        frame_number += 1
        print(f"Frame {frame_number} ,C3'-C5', 3 letters : {sequence}")
        


This calls the functions above using the input data:

In [32]:
import os
import pandas as pd

def main(dna_sequence, output_folder, section_number):
    """Translate both strands of a DNA section and export them to Excel.

    The section is transcribed to RNA and translated from every start codon
    (5'->3'). The complementary strand is then built, transcribed, reversed
    and translated the same way (3'->5'). The one-letter and three-letter
    protein lists of both strands are written to four sheets of the workbook
    ``section_large.<section_number>.xlsx``.

    Args:
        dna_sequence: DNA string of the section to translate.
        output_folder: Currently ignored; the workbook is always written to
            the current working directory (see the review note below).
        section_number: Number inserted into the workbook file name.
    """
    # Derive the RNA from the argument instead of relying on a module-level
    # `rna_sequence` variable that the caller must remember to set.
    rna_sequence = transcribe_dna_to_rna(dna_sequence)

    # Translate the given strand from every "AUG" (5'->3')
    protein_sequence_rna53 = translate_rna_to_proteins_all_frames(rna_sequence)

    # Build the complementary strand, transcribe it and reverse it so that
    # it is read in the same direction as the first strand
    dna_sequence35 = complementary_sequences(dna_sequence)
    rna_sequence35 = transcribe_dna_to_rna(dna_sequence35)
    rna_sequence35_inv = flip_rna_sequence(rna_sequence35)

    # Translate the complementary strand from every "AUG" (3'->5')
    protein_sequence_rna35 = translate_rna_to_proteins_all_frames(rna_sequence35_inv)

    # Convert the one-letter amino-acid symbols into three-letter codes
    protein_sequence_3letters53 = translate_one_letter_to_three_letter_list(protein_sequence_rna53)
    protein_sequence_3letters35 = translate_one_letter_to_three_letter_list(protein_sequence_rna35)

    # Each entry becomes one sheet; the label is used both as the sheet name
    # and as the single column header.
    sheets = {
        "Frame 1L (5'->3')": protein_sequence_rna53,
        "Frame 3L (5'->3')": protein_sequence_3letters53,
        "Frame 1L (3'->5')": protein_sequence_rna35,
        "Frame 3L (3'->5')": protein_sequence_3letters35,
    }

    # NOTE(review): the output_folder argument is overridden here, so the
    # caller's value has no effect. Kept as-is to avoid changing where
    # existing runs write their files -- confirm whether this is intended.
    output_folder = os.getcwd()
    workbook_path = os.path.join(output_folder, f"section_large.{section_number}.xlsx")

    # Save all four tables to a single Excel workbook
    with pd.ExcelWriter(workbook_path) as writer:
        for label, proteins in sheets.items():
            pd.DataFrame(proteins, columns=[label]).to_excel(writer, sheet_name=label)

# Translate every section and write one workbook per section; workbooks are
# numbered from 1 in the order of `sections`.
output_folder = "output_folder"
for i, dna_sequence in enumerate(sections):
    # The transcript is stored in the module-level name `rna_sequence`.
    rna_sequence = transcribe_dna_to_rna(dna_sequence)
    main(dna_sequence, output_folder, i + 1)
