# In [1]:
import random

# Define the target protein sequence.
# Each character is used as a key into host_codon_table (see
# generate_random_sequence), i.e. it is read as a one-letter amino-acid code.
# NOTE(review): the placeholder below contains only A/T/G/C and looks like a
# DNA string; as written it is treated as the peptide Ala-Thr-Gly-Cys-... —
# confirm that this is what is intended before relying on the output.
target_protein = "ATGCAACGTTACTGCAGT"  # Replace with your actual protein sequence

# Codon table for the host organism: maps each one-letter amino-acid code
# (plus "*" for stop) to the list of DNA codons that encode it.
# The keys "A", "C", "G" and "T" are amino-acid codes (alanine, cysteine,
# glycine, threonine), not nucleotides.
# The table only lists synonymous codons; it stores no usage frequencies, so
# every codon of a residue is equally likely to be picked by random.choice.
host_codon_table = {
    "A": ["GCT", "GCC", "GCA", "GCG"],
    "C": ["TGT", "TGC"],
    "D": ["GAT", "GAC"],
    "E": ["GAA", "GAG"],
    "F": ["TTT", "TTC"],
    "G": ["GGT", "GGC", "GGA", "GGG"],
    "H": ["CAT", "CAC"],
    "I": ["ATT", "ATC", "ATA"],
    "K": ["AAA", "AAG"],
    "L": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"],
    "M": ["ATG"],
    "N": ["AAT", "AAC"],
    "P": ["CCT", "CCC", "CCA", "CCG"],
    "Q": ["CAA", "CAG"],
    "R": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"],
    "S": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"],
    "T": ["ACT", "ACC", "ACA", "ACG"],
    "V": ["GTT", "GTC", "GTA", "GTG"],
    "W": ["TGG"],
    "Y": ["TAT", "TAC"],
    "*": ["TAA", "TAG", "TGA"],  # Stop codons
}

# Define the genetic algorithm parameters
population_size = 50  # Number of candidate codon sequences held in each generation
mutation_rate = 0.01  # Probability that a newly created child gets one codon re-drawn
generations = 10  # Number of refinement iterations

# Initialize the population with random codon sequences
def generate_random_sequence(target_protein, codon_table=None):
    """Return one random codon encoding of ``target_protein``.

    Args:
        target_protein: Iterable of one-letter residue codes; each item is
            looked up as a key in the codon table.
        codon_table: Optional mapping from residue code to a list of codons.
            When omitted, the module-level ``host_codon_table`` is used.

    Returns:
        A list with one codon string per residue, in the same order as
        ``target_protein``. An empty input yields an empty list.

    Raises:
        KeyError: If a residue has no entry in the codon table.
        IndexError: If a residue maps to an empty codon list.
    """
    # Resolve the default at call time so the current module-level table is
    # used even if it is rebound after this function is defined.
    table = host_codon_table if codon_table is None else codon_table
    return [random.choice(table[aa]) for aa in target_protein]

# Starting population: population_size independently drawn random codon
# sequences, each a list with one codon per character of target_protein.
population = [generate_random_sequence(target_protein) for _ in range(population_size)]

# Define the fitness function
def calculate_fitness(sequence, target_protein, codon_table=None):
    """Count the positions at which ``sequence`` encodes ``target_protein``.

    A position scores 1 when its codon is one of the codons listed for the
    corresponding residue in the codon table, and 0 otherwise. (Comparing the
    three-letter codon directly with the one-letter residue code can never be
    equal, which would make every sequence score 0.)

    Args:
        sequence: Iterable of codon strings, one per residue.
        target_protein: Iterable of one-letter residue codes.
        codon_table: Optional mapping from residue code to a list of codons.
            When omitted, the module-level ``host_codon_table`` is used.

    Returns:
        The number of matching positions as an int. If the two inputs differ
        in length, only the overlapping prefix is scored (``zip`` semantics).
        Residues missing from the table score 0 rather than raising.
    """
    table = host_codon_table if codon_table is None else codon_table
    return sum(
        codon in table.get(residue, ())
        for codon, residue in zip(sequence, target_protein)
    )

# Iterative refinement loop.
# Each generation keeps the best-scoring sequences as parents and refills the
# population with children built by single-point crossover of two randomly
# chosen parents; each child may additionally get one codon re-drawn.
elite_count = 10  # Number of top-scoring sequences kept as parents each generation
sequence_length = len(target_protein)

for _ in range(generations):
    # Calculate fitness scores for the entire population
    fitness_scores = [calculate_fitness(seq, target_protein) for seq in population]

    # Rank sequences by score, best first. sorted() is stable, so sequences
    # with equal scores keep their current population order.
    ranked = sorted(
        zip(fitness_scores, population),
        key=lambda scored: scored[0],
        reverse=True,
    )
    parents = [seq for _score, seq in ranked[:elite_count]]

    # Create the next generation; the parents are carried over unchanged
    new_population = parents[:]

    while len(new_population) < population_size:
        parent1 = random.choice(parents)
        parent2 = random.choice(parents)

        if sequence_length > 1:
            # Cut strictly inside the sequence so both parents contribute
            crossover_point = random.randint(1, sequence_length - 1)
            child = parent1[:crossover_point] + parent2[crossover_point:]
        else:
            # A sequence of length 0 or 1 has no interior cut point
            # (random.randint(1, 0) would raise ValueError), so copy parent1.
            child = parent1[:]

        # Apply mutation: with probability mutation_rate, re-draw the codon at
        # one random position from the codons listed for that target residue.
        if sequence_length > 0 and random.random() < mutation_rate:
            mutation_position = random.randint(0, sequence_length - 1)
            new_codon = random.choice(host_codon_table[target_protein[mutation_position]])
            # child is always a freshly built list here, so in-place
            # assignment cannot modify a parent.
            child[mutation_position] = new_codon

        new_population.append(child)

    population = new_population

# Select the best sequence from the final generation
# (max() returns the first sequence encountered among equally scored ones)
best_sequence = max(population, key=lambda seq: calculate_fitness(seq, target_protein))

# Convert the list of codons to a single string
optimized_sequence_str = "".join(best_sequence)

print("Best codon-optimized sequence:", optimized_sequence_str)



# Sample output from one run (codons are chosen randomly, so it varies between runs):
# Best codon-optimized sequence: GCAACCGGATGTGCTGCTTGCGGAACTACTGCATGTACAGGTTGCGCGGGTACC
