In [15]:
# Define the codon usage table for E. coli (actual data):
codon_usage = {
        'UUU':14.0, 'UCU':17.8, 'UAU':13.0, 'UGU':6.8,
        'UUC':27.4, 'UCC':14.3, 'UAC':16.1, 'UGC':9.3,
        'UUA': 4.4, 'UCA':9.8, 'UAA':1.1, 'UGA':1.0,
        'UUG':15.2, 'UCG':10.5, 'UAG':0.8, 'UGG':14.8,


        'CUU':19.8, 'CCU': 19.3, 'CAU':12.1, 'CGU':11.2,
        'CUC':26.6, 'CCC':16.5, 'CAC':11.1, 'CGC':9.2,
        'CUA':5.8, 'CCA':9.0, 'CAA':20.5, 'CGA':6.3,
        'CUG':9.5, 'CCG':8.0, 'CAG':14.5, 'CGG':4.0,


        'AUU':22.9, 'ACU':21.0, 'AAU':21.3, 'AGU':10.3,
        'AUC':28.4, 'ACC':21.7, 'AAC':28.1, 'AGC':11.2,
        'AUA':5.6, 'ACA':11.5, 'AAA':21.0, 'AGA':5.1,
        'AUG':18.4, 'ACG':9.2, 'AAG':26.9, 'AGG':6.9,


        'GUU':23.6, 'GCU':32.2, 'GAU':30.5, 'GGU':32.4,
        'GUC':31.0, 'GCC':25.7, 'GAC':25.0, 'GGC':26.4,
        'GUA':7.3, 'GCA':16.8, 'GAA':26.5, 'GGA':21.5,
        'GUG':9.7, 'GCG':11.6, 'GAG':22.7, 'GGG':8.1
}

# Define the input sequence to be optimized:
input_sequence = "ATGCAACGTTACTGCAGT"

# Function to perform frequency-based codon optimization:
def optimize_codons(input_sequence, codon_usage):
    optimized_sequence = ""

    for i in range(0, len(input_sequence), 3):
        codon = input_sequence[i:i+3]
        print(codon)

        # Check if the codon exists in the codon usage table
        if codon in codon_usage:
            optimized_sequence += codon
        else:
            # Find the most frequent codon for the amino acid
            amino_acid = "X"  # Placeholder for unknown codons
            if i < len(input_sequence) - 3:
                next_amino_acid = input_sequence[i+3:i+3+3]
                if next_amino_acid in codon_usage:
                    amino_acid = next_amino_acid
            #print(amino_acid)

            # Find the most frequent codon for the amino acid
            best_codon = max(codon_usage, key=lambda x: codon_usage[x] if x.endswith(amino_acid) else 0)
            #will return the codon from the codon_usage dictionary that has the maximum value among the codons ending with the specified amino_acid
            optimized_sequence += best_codon

    return optimized_sequence

# Perform codon optimization
optimized_sequence = optimize_codons(input_sequence, codon_usage)

# Print the original and optimized sequences
print("Original Sequence: ", input_sequence)
print("Optimized Sequence: ", optimized_sequence)


ATG
CAA
CGT
TAC
TGC
AGT
Original Sequence:  ATGCAACGTTACTGCAGT
Optimized Sequence:  CAACAAUUUUUUUUUUUU


In [14]:
def optimize_codon_sequence(sequence, codon_frequencies):
    optimized_sequence = ""
    
    codon_to_amino_acid = {
        'TTT': 'Phe', 'TTC': 'Phe',
        'TTA': 'Leu', 'TTG': 'Leu',
        'CTT': 'Leu', 'CTC': 'Leu', 'CTA': 'Leu', 'CTG': 'Leu',
        'ATT': 'Ile', 'ATC': 'Ile', 'ATA': 'Ile',
        'ATG': 'Met',
        'GTT': 'Val', 'GTC': 'Val', 'GTA': 'Val', 'GTG': 'Val',
        'TCT': 'Ser', 'TCC': 'Ser', 'TCA': 'Ser', 'TCG': 'Ser',
        'CCT': 'Pro', 'CCC': 'Pro', 'CCA': 'Pro', 'CCG': 'Pro',
        'ACT': 'Thr', 'ACC': 'Thr', 'ACA': 'Thr', 'ACG': 'Thr',
        'GCT': 'Ala', 'GCC': 'Ala', 'GCA': 'Ala', 'GCG': 'Ala',
        'TAT': 'Tyr', 'TAC': 'Tyr',
        'TAA': 'Stop', 'TAG': 'Stop', 'TGA': 'Stop',
        'CAT': 'His', 'CAC': 'His',
        'CAA': 'Gln', 'CAG': 'Gln',
        'AAT': 'Asn', 'AAC': 'Asn',
        'AAA': 'Lys', 'AAG': 'Lys',
        'GAT': 'Asp', 'GAC': 'Asp',
        'GAA': 'Glu', 'GAG': 'Glu',
        'TGT': 'Cys', 'TGC': 'Cys',
        'TGG': 'Trp',
        'CGT': 'Arg', 'CGC': 'Arg', 'CGA': 'Arg', 'CGG': 'Arg',
        'AGT': 'Ser', 'AGC': 'Ser',
        'AGA': 'Arg', 'AGG': 'Arg',
        'GGT': 'Gly', 'GGC': 'Gly', 'GGA': 'Gly', 'GGG': 'Gly',
    }
    
    amino_acid_to_codons = {
        'Phe': ['TTT', 'TTC'],
        'Leu': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
        'Ile': ['ATT', 'ATC', 'ATA'],
        'Met': ['ATG'],
        'Val': ['GTT', 'GTC', 'GTA', 'GTG'],
        'Ser': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
        'Pro': ['CCT', 'CCC', 'CCA', 'CCG'],
        'Thr': ['ACT', 'ACC', 'ACA', 'ACG'],
        'Ala': ['GCT', 'GCC', 'GCA', 'GCG'],
        'Tyr': ['TAT', 'TAC'],
        'Stop': ['TAA', 'TAG', 'TGA'],
        'His': ['CAT', 'CAC'],
        'Gln': ['CAA', 'CAG'],
        'Asn': ['AAT', 'AAC'],
        'Lys': ['AAA', 'AAG'],
        'Asp': ['GAT', 'GAC'],
        'Glu': ['GAA', 'GAG'],
        'Cys': ['TGT', 'TGC'],
        'Trp': ['TGG'],
        'Arg': ['CGT', 'CGC', 'CGA', 'CGG', 'AGT', 'AGC'],
        'Gly': ['GGT', 'GGC', 'GGA', 'GGG'],
    }
    
    for i in range(0, len(sequence), 3):
        codon = sequence[i:i + 3]
        
        # Check if the codon is in the codon_frequencies dictionary
        if codon in codon_frequencies:
            optimized_sequence += codon
        else:
            # If the codon is not found, choose the most frequent synonymous codon
            amino_acid = codon_to_amino_acid[codon]
            synonyms = amino_acid_to_codons[amino_acid]
            most_frequent_synonym = max(synonyms, key=lambda syn: codon_frequencies.get(syn, 0))
            optimized_sequence += most_frequent_synonym
    
    return optimized_sequence

# Replace with your actual codon frequencies data
codon_frequencies = {
        'UUU':14.0, 'UCU':17.8, 'UAU':13.0, 'UGU':6.8,
        'UUC':27.4, 'UCC':14.3, 'UAC':16.1, 'UGC':9.3,
        'UUA': 4.4, 'UCA':9.8, 'UAA':1.1, 'UGA':1.0,
        'UUG':15.2, 'UCG':10.5, 'UAG':0.8, 'UGG':14.8,


        'CUU':19.8, 'CCU': 19.3, 'CAU':12.1, 'CGU':11.2,
        'CUC':26.6, 'CCC':16.5, 'CAC':11.1, 'CGC':9.2,
        'CUA':5.8, 'CCA':9.0, 'CAA':20.5, 'CGA':6.3,
        'CUG':9.5, 'CCG':8.0, 'CAG':14.5, 'CGG':4.0,


        'AUU':22.9, 'ACU':21.0, 'AAU':21.3, 'AGU':10.3,
        'AUC':28.4, 'ACC':21.7, 'AAC':28.1, 'AGC':11.2,
        'AUA':5.6, 'ACA':11.5, 'AAA':21.0, 'AGA':5.1,
        'AUG':18.4, 'ACG':9.2, 'AAG':26.9, 'AGG':6.9,


        'GUU':23.6, 'GCU':32.2, 'GAU':30.5, 'GGU':32.4,
        'GUC':31.0, 'GCC':25.7, 'GAC':25.0, 'GGC':26.4,
        'GUA':7.3, 'GCA':16.8, 'GAA':26.5, 'GGA':21.5,
        'GUG':9.7, 'GCG':11.6, 'GAG':22.7, 'GGG':8.1
}


# Replace with your initial DNA sequence
input_sequence = "ATGCAACGTTACTGCAGT"

optimized_sequence = optimize_codon_sequence(input_sequence, codon_frequencies)
print(f"Optimized Sequence: {optimized_sequence}")


Optimized Sequence: ATGCAAAGCTATTGTAGC
