In [10]:
# Define the codon usage table for E. coli (actual data):
codon_usage_ecoli = {
    'TTT': 0.0124, 'TTC': 0.0222,  # Phenylalanine (Phe)
    'TTA': 0.0041, 'TTG': 0.0031,  # Leucine (Leu)
    'CTT': 0.0108, 'CTC': 0.0153, 'CTA': 0.0069, 'CTG': 0.0154,  # Leucine (Leu)
    'ATT': 0.0244, 'ATC': 0.0296, 'ATA': 0.0056,  # Isoleucine (Ile)
    'ATG': 0.0186,  # Methionine (Met)
    'GTT': 0.0196, 'GTC': 0.0205, 'GTA': 0.0132, 'GTG': 0.0155,  # Valine (Val)
    'TCT': 0.0197, 'TCC': 0.0182, 'TCA': 0.0244, 'TCG': 0.0109,  # Serine (Ser)
    'CCT': 0.0142, 'CCC': 0.0152, 'CCA': 0.0146, 'CCG': 0.0134,  # Proline (Pro)
    'ACT': 0.0167, 'ACC': 0.0176, 'ACA': 0.0213, 'ACG': 0.0124,  # Threonine (Thr)
    'GCT': 0.0235, 'GCC': 0.0311, 'GCA': 0.0211, 'GCG': 0.0134,  # Alanine (Ala)
    'TAT': 0.0111, 'TAC': 0.0127,  # Tyrosine (Tyr)
    'TAA': 0.0004, 'TAG': 0.0004, 'TGA': 0.0007,  # Stop codons
    'CAT': 0.0144, 'CAC': 0.0183,  # Histidine (His)
    'CAA': 0.0196, 'CAG': 0.0197,  # Glutamine (Gln)
    'AAT': 0.0160, 'AAC': 0.0299,  # Asparagine (Asn)
    'AAA': 0.0177, 'AAG': 0.0203,  # Lysine (Lys)
    'GAT': 0.0140, 'GAC': 0.0225,  # Aspartic Acid (Asp)
    'GAA': 0.0217, 'GAG': 0.0299,  # Glutamic Acid (Glu)
    'TGT': 0.0075, 'TGC': 0.0145,  # Cysteine (Cys)
    'TGG': 0.0132,  # Tryptophan (Trp)
    'CGT': 0.0036, 'CGC': 0.0046, 'CGA': 0.0053, 'CGG': 0.0090,  # Arginine (Arg)
    'AGT': 0.0122, 'AGC': 0.0144,  # Serine (Ser)
    'AGA': 0.0047, 'AGG': 0.0062,  # Arginine (Arg)
    'GGT': 0.0127, 'GGC': 0.0221, 'GGA': 0.0150, 'GGG': 0.0074,  # Glycine (Gly)
}

# Define the input sequence to be optimized:
input_sequence = "CGTAGTTACCATGGAGCAGT"

# Function to perform frequency-based codon optimization:
def optimize_codons(input_sequence, codon_usage):
    optimized_sequence = ""

    for i in range(0, len(input_sequence), 3):
        codon = input_sequence[i:i+3]

        # Check if the codon exists in the codon usage table
        if codon in codon_usage:
            optimized_sequence += codon
        else:
            # Find the most frequent codon for the amino acid
            amino_acid = "X"  # Placeholder for unknown codons
            if i < len(input_sequence) - 3:
                next_amino_acid = input_sequence[i+3:i+3+3]
                if next_amino_acid in codon_usage:
                    amino_acid = next_amino_acid

            # Find the most frequent codon for the amino acid
            best_codon = max(codon_usage, key=lambda x: codon_usage[x] if x.endswith(amino_acid) else 0)
            optimized_sequence += best_codon

    return optimized_sequence

# Perform codon optimization
optimized_sequence = optimize_codons(input_sequence, codon_usage_ecoli)

# Print the original and optimized sequences
print("Original Sequence: ", input_sequence)
print("Optimized Sequence: ", optimized_sequence)


Original Sequence:  CGTAGTTACCATGGAGCAGT
Optimized Sequence:  CGTAGTTACCATGGAGCATTT
