# Project: DNA to protein conversion

In [None]:
def find_shine_dalgarno(sequence, shine_dalgarno="AGGAGG"):
    """Locate the first Shine-Dalgarno motif in a sequence.

    Returns the 0-based index of the first occurrence of ``shine_dalgarno``
    in ``sequence``, or ``None`` when the motif does not occur.
    """
    position = sequence.find(shine_dalgarno)
    # str.find signals "not found" with -1; callers of this helper expect None.
    return None if position == -1 else position

def cut_sequence(sequence, shine_dalgarno="AGGAGG"):
    """Split a DNA sequence into the sections that follow each Shine-Dalgarno motif.

    Everything before the first motif is discarded; each returned section is
    the text between one motif and the next (or the end of the sequence).
    Motif occurrences are matched left to right without overlapping.

    Args:
        sequence: The DNA sequence to cut.
        shine_dalgarno: The motif marking the start of each section.

    Returns:
        A list of sections (possibly empty strings). The list is empty when
        the motif does not occur in ``sequence``.

    Raises:
        ValueError: If ``shine_dalgarno`` is empty (an empty motif matches
            everywhere and can never delimit a section).
    """
    if not shine_dalgarno:
        raise ValueError("shine_dalgarno motif must be a non-empty string")
    # str.split performs the same left-to-right, non-overlapping scan in a
    # single pass; the first piece is whatever precedes the first motif, which
    # is not part of any section.
    return sequence.split(shine_dalgarno)[1:]
    
def translate_to_uppercase(sequence):
    """Return an upper-cased copy of the given sequence."""
    uppercased = sequence.upper()
    return uppercased

def filter_dna_sequence(sequence):
    """Upper-case the sequence and keep only the bases 'A', 'T', 'C' and 'G'."""
    kept_bases = [base for base in sequence.upper() if base in 'ATCG']
    return ''.join(kept_bases)

def read_dna_sequence(filename):
    """Collect raw sequence text from a file.

    Reading is switched on by the first line that starts with a space (that
    line itself is not collected) and switched off again by a line that is
    just "//"; a later line starting with a space switches it back on.

    While reading is on, each line contributes the text from its first space
    up to the next "//" (or the end of the line, newline included), and then
    again from the next space after that "//", and so on. Lines without any
    space contribute nothing.

    The returned text is unfiltered: it still contains spaces, digits and
    newlines picked up along the way.
    """
    chunks = []
    reading = False
    with open(filename, 'r') as handle:
        for raw_line in handle:
            if not reading:
                # A leading space turns reading on, starting with the NEXT line.
                reading = raw_line.startswith(" ")
                continue
            if raw_line.strip() == "//":
                # End-of-record marker: wait for the next space-led line.
                reading = False
                continue

            begin = raw_line.find(' ')
            while begin != -1:
                end = raw_line.find('//', begin)
                if end == -1:
                    end = len(raw_line)  # no marker left: take the rest of the line
                chunks.append(raw_line[begin:end])
                # Resume at the first space after the marker, if there is one.
                begin = raw_line.find(' ', end)

    return "".join(chunks)

def main():
    """Read the input file, cut it into sections and write them to output.txt.

    Each section is written on its own line and followed by a "//" line; when
    there are no sections the file contains a single "//" line.
    """
    source_path = "large_input.txt"  # Specify your document filename

    # Read, clean and cut the DNA sequence
    cleaned_sequence = filter_dna_sequence(read_dna_sequence(source_path))
    sections = cut_sequence(cleaned_sequence)

    # Write the sections, separated and terminated by "//"
    with open("output.txt", "w") as out:
        out.write("//\n".join(section + "\n" for section in sections))
        out.write("//\n")

    print("Sections written to output.txt")


if __name__ == "__main__":
    main()


In [24]:
def separate_sections(filename):
    """Read a "//"-delimited file and return its sections as strings.

    Lines are stripped and concatenated until a line equal to "//" closes the
    current section. Text after the last "//" is not returned.
    """
    sections = []
    pieces = []
    with open(filename, "r") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text != "//":
                pieces.append(text)
                continue
            # Delimiter reached: close the section gathered so far.
            sections.append("".join(pieces))
            pieces = []
    return sections

# Example usage: load the "//"-delimited sections of output.txt into the
# module-level `sections` list that the analysis loop further down iterates.
sections = separate_sections("output.txt")


First, we extract the genetic code from a file and we create a dictionary that we can use later.

In [25]:
def read_genetic_code(filename):
    """Load a codon table from a whitespace-separated text file.

    Each non-blank line must hold exactly two fields: a codon and the amino
    acid it encodes. Blank lines are ignored. If a codon appears more than
    once, the last occurrence wins.

    Args:
        filename: Path of the codon table file.

    Returns:
        A dict mapping each codon to its amino acid.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
    """
    genetic_code = {}
    with open(filename, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{filename}, line {line_number}: expected 'codon amino_acid', "
                    f"got {line.strip()!r}"
                )
            codon, amino_acid = fields
            genetic_code[codon] = amino_acid
    return genetic_code

# Example usage: build the module-level `genetic_code` table that
# translate_rna_to_protein looks codons up in.
genetic_code = read_genetic_code("genetic_code.txt")

Next, we write functions that transcribe the input DNA into the corresponding RNA. We can then find the important parts of that sequence by locating the start codons in the RNA string.

In [26]:
def transcribe_dna_to_rna(dna_sequence):
    """Return the sequence with every upper-case 'T' replaced by 'U'."""
    thymine_to_uracil = {ord('T'): 'U'}
    return dna_sequence.translate(thymine_to_uracil)
    

def find_start_codons_rna(rna_sequence):
    """Return the indices of every "AUG" in the RNA sequence.

    Overlapping positions are all checked. When no start codon exists the
    function returns the sentinel -1 instead of an empty list.
    """
    positions = [
        position
        for position in range(len(rna_sequence))
        if rna_sequence[position:position + 3] == "AUG"
    ]
    return positions if positions else -1


Now that the indexes of the start codons have been obtained, we can start the translation of the protein from this value. In this part, the codons are identified, and the function searches for the amino acid linked to a respective codon in the dictionary of the genetic code previously created.

In [27]:
def translate_rna_to_protein(rna_sequence):
    """Translate an RNA sequence into a one-letter protein string.

    Translation starts at the first "AUG" and proceeds codon by codon using
    the module-level ``genetic_code`` table. It stops at the first codon
    mapped to "*" or when fewer than three bases remain. Codons missing from
    the table are rendered as "X".

    Returns the text "Start codon not found" when the sequence has no "AUG".
    """
    start_positions = find_start_codons_rna(rna_sequence)
    if start_positions == -1:  # No start codon found
        return "Start codon not found"

    reading_frame = rna_sequence[start_positions[0]:]  # Use the first start index
    residues = []
    for offset in range(0, len(reading_frame) - 2, 3):
        residue = genetic_code.get(reading_frame[offset:offset + 3], "X")
        if residue == "*":
            break
        residues.append(residue)
    return "".join(residues)


To read all the proteins that can be translated from the other strand of DNA, we must first find the complementary strand. We also need the flip function below: because a sequence is only read in the 5' to 3' direction, the complementary strand of RNA must be reversed before it can be read correctly.

In [28]:

def complementary_sequences(dna_sequence):
    """Return the base-by-base complement of a DNA sequence (order unchanged).

    Only upper-case 'A', 'T', 'C' and 'G' are accepted; any other character
    raises KeyError.
    """
    pairing = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    paired_bases = []
    for nucleotide in dna_sequence:
        paired_bases.append(pairing[nucleotide])

    return ''.join(paired_bases)


def flip_rna_sequence(rna_sequence):
    """Return the RNA string with its characters in reverse order."""
    return ''.join(reversed(rna_sequence))


The next function serves to read the different frames in a strand, starting at all the different possibilities for start codons.

In [29]:
def translate_rna_to_proteins_all_frames(rna_sequence):
    """Translate the RNA once from every "AUG" it contains.

    Args:
        rna_sequence: The RNA string to scan for start codons.

    Returns:
        A list with one protein string per start codon, in order of
        position. The list is empty when the sequence has no start codon.
    """
    start_indices = find_start_codons_rna(rna_sequence)
    # find_start_codons_rna reports "none found" with the sentinel -1, which
    # cannot be iterated; treat it as "no proteins".
    if start_indices == -1:
        return []
    return [
        translate_rna_to_protein(rna_sequence[start_index:])
        for start_index in start_indices
    ]


The following function is used to transform the one letter representation of an amino acid into a three letter representation.

In [30]:

def translate_one_letter_to_three_letter_list(one_letter_sequences):
    """Convert one-letter protein strings to dash-joined three-letter codes.

    Each input string yields one output string such as "Met-Ala". Symbols
    without a known three-letter code are rendered as "Unknown"; "*" is
    rendered as "Stop".
    """
    three_letter_code = {
        "A": "Ala", "C": "Cys", "D": "Asp", "E": "Glu", "F": "Phe",
        "G": "Gly", "H": "His", "I": "Ile", "K": "Lys", "L": "Leu",
        "M": "Met", "N": "Asn", "P": "Pro", "Q": "Gln", "R": "Arg",
        "S": "Ser", "T": "Thr", "V": "Val", "W": "Trp", "Y": "Tyr",
        "*": "Stop",
    }

    def expand(sequence):
        return "-".join(three_letter_code.get(symbol, "Unknown") for symbol in sequence)

    return [expand(sequence) for sequence in one_letter_sequences]


These functions contribute to the layout of the final output given by the code 

In [31]:

def print_proteins_in_frames1L53(protein_sequence_list):
    """Print each one-letter protein on its own line with a 1-based frame number (C5'-C3' label)."""
    for position, protein in enumerate(protein_sequence_list):
        i = position + 1
        print(f"Frame {i} ,C5'-C3', 1 letters : {protein}")

def print_proteins_in_frames3L53(protein_sequence_list):
    """Print each three-letter protein on its own line with a 1-based frame number (C5'-C3' label)."""
    i = 0
    for protein in protein_sequence_list:
        i += 1
        print(f"Frame {i} ,C5'-C3', 3 letters : {protein}")

def print_proteins_in_frames1L35(protein_sequence_list):
    """Print each one-letter protein on its own line with a 1-based frame number (C3'-C5' label)."""
    for position, protein in enumerate(protein_sequence_list):
        i = position + 1
        print(f"Frame {i} ,C3'-C5', 1 letters : {protein}")

def print_proteins_in_frames3L35(protein_sequence_list):
    """Print each three-letter protein on its own line with a 1-based frame number (C3'-C5' label)."""
    i = 0
    for protein in protein_sequence_list:
        i += 1
        print(f"Frame {i} ,C3'-C5', 3 letters : {protein}")
        


Using the Kyte & Doolittle hydropathy scale, we can determine the hydrophobic or hydrophilic character of a protein from its amino acids. The more positive the hydropathy score, the more hydrophobic the protein.


In [2]:
def calculate_hydrophobicity(protein: str) -> float:
    """Sum the per-residue hydropathy values of a one-letter protein string.

    Returns 0.0 for an empty string. Raises ValueError on the first symbol
    that is not one of the 20 amino acids in the table.
    """
    hydropathy = {
        'A': 1.8,  'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,   # Ala Arg Asn Asp Cys
        'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,   # Gln Glu Gly His Ile
        'L': 3.8,  'K': -3.9, 'M': 1.9,  'F': 2.8,  'P': -1.6,  # Leu Lys Met Phe Pro
        'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2,   # Ser Thr Trp Tyr Val
    }

    running_total = 0.0
    for residue in protein:
        contribution = hydropathy.get(residue)
        if contribution is None:
            raise ValueError(f"Invalid amino acid: {residue}")
        running_total += contribution

    return running_total

# Example usage: score a peptide that contains each of the 20 one-letter
# codes of the scale exactly once.
protein_sequence = "ACDEFGHIKLMNPQRSTVWY"
print(f"Hydrophobicity score: {calculate_hydrophobicity(protein_sequence)}")

Hydrophobicity score: -9.8


This function calculates the molecular weight of a protein given its structure

In [None]:
def calculate_molecular_weight(protein: str) -> float:
    """Estimate the molecular weight of a protein from its one-letter sequence.

    The table holds the (rounded) weights of the free amino acids. Joining
    n amino acids into a chain forms n - 1 peptide bonds, each releasing one
    water molecule, so the chain weighs the sum of its amino acids minus
    (n - 1) water molecules.

    Args:
        protein: Protein sequence in one-letter amino-acid codes.

    Returns:
        The estimated weight; 0.0 for an empty sequence.

    Raises:
        ValueError: If the sequence contains a symbol that is not one of the
            20 amino acids in the table.
    """
    molecular_weights = {
        'A': 89.000,  # Alanine
        'R': 174.000, # Arginine
        'N': 132.000, # Asparagine
        'D': 133.000, # Aspartic Acid
        'C': 121.000, # Cysteine
        'Q': 146.000, # Glutamine
        'E': 147.000, # Glutamic Acid
        'G': 75.000,  # Glycine
        'H': 155.000, # Histidine
        'I': 131.000, # Isoleucine
        'L': 131.000, # Leucine
        'K': 146.000, # Lysine
        'M': 149.000, # Methionine
        'F': 165.000, # Phenylalanine
        'P': 115.000, # Proline
        'S': 105.000, # Serine
        'T': 119.000, # Threonine
        'W': 204.000, # Tryptophan
        'Y': 181.000, # Tyrosine
        'V': 117.000  # Valine
    }
    # Weight of one water molecule, rounded to match the precision of the table.
    water_weight = 18.000

    total_weight = 0.0
    residue_count = 0
    for amino_acid in protein:
        if amino_acid not in molecular_weights:
            raise ValueError(f"Invalid amino acid: {amino_acid}")
        total_weight += molecular_weights[amino_acid]
        residue_count += 1

    # One water molecule is lost for every peptide bond in the chain.
    if residue_count > 1:
        total_weight -= (residue_count - 1) * water_weight

    return total_weight

# Example usage: report the weight computed for a peptide that contains each
# of the 20 supported one-letter codes exactly once.
protein_sequence = "ACDEFGHIKLMNPQRSTVWY"
print(f"Total molecular weight: {calculate_molecular_weight(protein_sequence)}")

Next is a function that calculates beta-sheet, alpha-helix and beta-turn scores by summing, for each amino acid in a protein sequence, the values given below. These values were obtained from the Chou and Fasman scales. The function then finds the highest score and reports which configuration is most likely.

In [1]:
def calculate_configuration_likelihoods(protein: str):
    """Score a protein for beta-sheet, alpha-helix and beta-turn propensity.

    For each configuration the per-residue table values are summed over the
    sequence. Returns a tuple ``(best_label, best_score, scores)`` where
    ``scores`` maps 'beta-sheet', 'alpha-helix' and 'beta-turn' (in that
    order) to their totals; on a tie the earlier label wins.

    Raises ValueError on the first symbol missing from the tables.
    """
    propensity_tables = {
        'beta-sheet': {
            'A': 0.830, 'R': 0.930, 'N': 0.890, 'D': 0.540, 'C': 1.190,
            'Q': 1.100, 'E': 0.370, 'G': 0.750, 'H': 0.870, 'I': 1.600,
            'L': 1.300, 'K': 0.740, 'M': 1.050, 'F': 1.380, 'P': 0.550,
            'S': 0.750, 'T': 1.190, 'W': 1.370, 'Y': 1.470, 'V': 1.700,
        },
        'alpha-helix': {
            'A': 1.420, 'R': 0.980, 'N': 0.670, 'D': 1.010, 'C': 0.700,
            'Q': 1.110, 'E': 1.510, 'G': 0.570, 'H': 1.000, 'I': 1.080,
            'L': 1.210, 'K': 1.160, 'M': 1.450, 'F': 1.130, 'P': 0.570,
            'S': 0.770, 'T': 0.830, 'W': 1.080, 'Y': 0.690, 'V': 1.060,
        },
        'beta-turn': {
            'A': 0.660, 'R': 0.950, 'N': 1.560, 'D': 1.460, 'C': 1.190,
            'Q': 0.980, 'E': 0.740, 'G': 1.560, 'H': 0.950, 'I': 0.470,
            'L': 0.590, 'K': 1.010, 'M': 0.600, 'F': 0.600, 'P': 1.520,
            'S': 1.430, 'T': 0.960, 'W': 0.960, 'Y': 1.140, 'V': 0.500,
        },
    }

    scores = {}
    for label, table in propensity_tables.items():
        running_total = 0.0
        for residue in protein:
            if residue not in table:
                raise ValueError(f"Invalid amino acid: {residue}")
            running_total += table[residue]
        scores[label] = running_total

    best_label = max(scores, key=scores.get)
    return best_label, scores[best_label], scores

# Example usage: evaluate a peptide that contains each of the 20 supported
# one-letter codes exactly once.
protein_sequence = "ACDEFGHIKLMNPQRSTVWY"
# result is (best label, best score, dict with all three scores).
result = calculate_configuration_likelihoods(protein_sequence)
print(f"Most likely configuration: {result[0]} with a score of {result[1]}")
print(f"Scores: {result[2]}")


Most likely configuration: beta-sheet with a score of 20.570000000000004
Scores: {'beta-sheet': 20.570000000000004, 'alpha-helix': 19.999999999999996, 'beta-turn': 19.830000000000002}


This function uses the same method to calculate the retention coefficients for an HPLC in TFA for proteins.

In [2]:
def calculate_retention_coefficient(protein: str) -> float:
    """Sum the per-residue retention coefficients of a one-letter protein string.

    Returns 0.0 for an empty string. Raises ValueError on the first symbol
    that is not one of the 20 amino acids in the table.
    """
    coefficients = {
        'A': 7.3,  'R': -3.6, 'N': -5.7, 'D': -2.9, 'C': -9.2,  # Ala Arg Asn Asp Cys
        'Q': -0.3, 'E': -7.1, 'G': -1.2, 'H': -2.1, 'I': 6.6,   # Gln Glu Gly His Ile
        'L': 20.0, 'K': -3.7, 'M': 5.6,  'F': 19.2, 'P': 5.1,   # Leu Lys Met Phe Pro
        'S': -4.1, 'T': 0.8,  'W': 16.3, 'Y': 5.9,  'V': 3.5,   # Ser Thr Trp Tyr Val
    }

    accumulated = 0.0
    for residue in protein:
        if residue not in coefficients:
            raise ValueError(f"Invalid amino acid: {residue}")
        accumulated += coefficients[residue]

    return accumulated

# Example usage: compute the total for a peptide that contains each of the
# 20 supported one-letter codes exactly once.
protein_sequence = "ACDEFGHIKLMNPQRSTVWY"
retention_coefficient = calculate_retention_coefficient(protein_sequence)
print(f"Retention coefficient: {retention_coefficient}")


Retention coefficient: 50.4


This function calculates a polarity score based on the Zimmerman scale

In [None]:
def calculate_polarity_score(protein: str) -> float:
    """Sum the per-residue polarity values of a one-letter protein string.

    Returns 0.0 for an empty string. Raises ValueError on the first symbol
    that is not one of the 20 amino acids in the table.
    """
    polarity = {
        'A': 0.0,  'R': 52.0, 'N': 3.38, 'D': 49.7, 'C': 1.48,  # Ala Arg Asn Asp Cys
        'Q': 3.53, 'E': 49.9, 'G': 0.0,  'H': 51.6, 'I': 0.13,  # Gln Glu Gly His Ile
        'L': 0.13, 'K': 49.5, 'M': 1.43, 'F': 0.35, 'P': 1.58,  # Leu Lys Met Phe Pro
        'S': 1.67, 'T': 1.66, 'W': 2.1,  'Y': 1.61, 'V': 0.13,  # Ser Thr Trp Tyr Val
    }

    running_total = 0.0
    for residue in protein:
        value = polarity.get(residue)
        if value is None:
            raise ValueError(f"Invalid amino acid: {residue}")
        running_total += value

    return running_total

This calls the functions above using the input data:

In [32]:
import os
import pandas as pd

def _property_columns(proteins):
    """Return the property columns (one value per protein) shared by every sheet."""
    return {
        "Hydrophobicity": [calculate_hydrophobicity(protein) for protein in proteins],
        "Molecular Weight": [calculate_molecular_weight(protein) for protein in proteins],
        "Retention Coefficient": [calculate_retention_coefficient(protein) for protein in proteins],
        # calculate_configuration_likelihoods returns (label, score, all scores);
        # only the winning label is reported.
        "Configuration Likelihood": [calculate_configuration_likelihoods(protein)[0] for protein in proteins],
        "Polarity": [calculate_polarity_score(protein) for protein in proteins],
    }


def DNAtoPROT_analysis(dna_sequence, output_folder, section_number):
    """Translate one DNA section on both strands and save the results to Excel.

    The section is transcribed to RNA and translated from every start codon
    (5'->3'); the same is done for the reversed complementary strand (3'->5').
    For each strand the proteins are written in one-letter and three-letter
    form, together with their hydrophobicity, molecular weight, retention
    coefficient, most likely configuration and polarity.

    Args:
        dna_sequence: The DNA section to analyse.
        output_folder: Folder that receives the workbook; created if missing.
        section_number: Number used in the file name ``section.<number>.xlsx``.

    Side effects:
        Writes ``<output_folder>/section.<section_number>.xlsx`` with four
        sheets, one per strand and notation.
    """
    # Derive the RNA from the argument rather than relying on a global variable.
    rna_sequence = transcribe_dna_to_rna(dna_sequence)

    # Translate RNA to proteins from every start codon "AUG" (5'->3')
    proteins_53 = translate_rna_to_proteins_all_frames(rna_sequence)

    # Complementary strand, transcribed and reversed so it is read 5'->3'
    complementary_rna = transcribe_dna_to_rna(complementary_sequences(dna_sequence))
    proteins_35 = translate_rna_to_proteins_all_frames(flip_rna_sequence(complementary_rna))

    # Three-letter notation of the same proteins
    three_letter_53 = translate_one_letter_to_three_letter_list(proteins_53)
    three_letter_35 = translate_one_letter_to_three_letter_list(proteins_35)

    # Properties depend only on the protein, so both notations share them
    properties_53 = _property_columns(proteins_53)
    properties_35 = _property_columns(proteins_35)

    # Sheet name (also used as the sequence column header) -> (sequences, properties)
    sheets = {
        "Frame 1L (5'->3')": (proteins_53, properties_53),
        "Frame 3L (5'->3')": (three_letter_53, properties_53),
        "Frame 1L (3'->5')": (proteins_35, properties_35),
        "Frame 3L (3'->5')": (three_letter_35, properties_35),
    }

    # Save all four tables to a single Excel workbook in the requested folder
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    workbook_path = os.path.join(output_folder, f"section.{section_number}.xlsx")
    with pd.ExcelWriter(workbook_path) as writer:
        for sheet_name, (sequences, properties) in sheets.items():
            table = pd.DataFrame({sheet_name: sequences, **properties})
            table.to_excel(writer, sheet_name=sheet_name)

# Analyse every section in turn; workbooks are numbered from 1.
output_folder = "output_folder"
for i, dna_sequence in enumerate(sections):
    rna_sequence = transcribe_dna_to_rna(dna_sequence)
    DNAtoPROT_analysis(dna_sequence, output_folder, i + 1)