In [4]:
import random
# Define the target protein sequence as a string of one-letter amino-acid codes.
# Each character is used as a key into host_codon_table, so every letter that
# appears here must have an entry in that table.
# NOTE(review): this placeholder contains only the letters A, C, G and T, so it
# looks like a nucleotide sequence; the code below treats each letter as an
# amino-acid code -- confirm that this is intended.
target_protein = "CGTAGTTACCATGGAGCAGT"  # Replace with your actual protein sequence

# Codon table for the host organism: maps each one-letter amino-acid code to
# the list of DNA codons that encode it under the standard genetic code.
# All 20 amino acids are covered so that any protein sequence can be looked
# up; the three stop codons (TAA, TAG, TGA) are deliberately left out.
# The table lists synonymous codons only -- it carries no usage frequencies.
host_codon_table = {
    "M": ["ATG"],
    "S": ["TCA", "TCT", "TCC", "TCG", "AGT", "AGC"],
    "T": ["ACA", "ACT", "ACC", "ACG"],
    "H": ["CAT", "CAC"],
    "D": ["GAT", "GAC"],
    "K": ["AAA", "AAG"],
    "E": ["GAA", "GAG"],
    "L": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"],
    "A": ["GCA", "GCC", "GCG", "GCT"],
    "C": ["TGC", "TGT"],
    "G": ["GGG", "GGC", "GGA", "GGT"],
    # Amino acids that were missing from the original example table.
    "F": ["TTT", "TTC"],
    "I": ["ATT", "ATC", "ATA"],
    "V": ["GTT", "GTC", "GTA", "GTG"],
    "P": ["CCT", "CCC", "CCA", "CCG"],
    "Y": ["TAT", "TAC"],
    "Q": ["CAA", "CAG"],
    "N": ["AAT", "AAC"],
    "R": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"],
    "W": ["TGG"],
}

# Define the genetic algorithm parameters
population_size = 50  # number of candidate sequences held in each generation
mutation_rate = 0.01  # probability that a newly bred child has one codon redrawn
generations = 100  # number of selection/breeding rounds run by the main loop

# Initialize the population with random codon sequences
def generate_random_sequence(target_protein, codon_table=None):
    """Build a random codon sequence that encodes ``target_protein``.

    Every residue is replaced by one codon picked at random from the codons
    listed for it in the codon table.

    Args:
        target_protein: Iterable of one-letter amino-acid codes.
        codon_table: Mapping from amino-acid code to a list of codons.
            When omitted, the module-level ``host_codon_table`` is used.

    Returns:
        A list holding one codon string per residue, in residue order.

    Raises:
        KeyError: If a residue has no entry in the codon table.
    """
    if codon_table is None:
        codon_table = host_codon_table
    return [random.choice(codon_table[aa]) for aa in target_protein]

# Generation zero: one independently drawn codon list per individual.
population = [
    generate_random_sequence(target_protein)
    for _individual in range(population_size)
]

# Define the fitness function
def calculate_fitness(sequence, target_protein, codon_table=None):
    """Count the positions whose codon encodes the corresponding target residue.

    The previous implementation compared each three-letter codon directly with
    a one-letter amino-acid code, which can never be equal, so every sequence
    scored 0. A position now scores a point when its codon is one of the
    codons the table lists for the residue at that position.

    Args:
        sequence: Iterable of codon strings, one per residue.
        target_protein: Iterable of one-letter amino-acid codes.
        codon_table: Mapping from amino-acid code to a list of codons.
            When omitted, the module-level ``host_codon_table`` is used.

    Returns:
        The number of matching positions as an int. Positions beyond the
        shorter of the two inputs are ignored, and residues that are absent
        from the table never match.

    NOTE(review): sequences whose codons were all drawn from the same table
    reach the maximum score; ranking synonymous codons against each other
    presumably needs a usage-frequency term -- confirm the intended objective.
    """
    if codon_table is None:
        codon_table = host_codon_table
    return sum(
        codon in codon_table.get(residue, ())
        for codon, residue in zip(sequence, target_protein)
    )

# Main evolutionary loop: score, select, breed, replace.
for generation in range(generations):
    # Score every individual of the current generation.
    fitness_scores = [calculate_fitness(candidate, target_protein) for candidate in population]

    # Rank individuals by score, best first. The sort is stable, so tied
    # individuals keep their population order. The first ten become parents.
    ranked = sorted(
        zip(fitness_scores, population),
        key=lambda scored: scored[0],
        reverse=True,
    )
    parents = [candidate for _score, candidate in ranked[:10]]

    # Parents survive unchanged; the remaining slots are filled with children.
    new_population = list(parents)
    protein_length = len(target_protein)

    while len(new_population) < population_size:
        # Single-point crossover between two parents drawn with replacement.
        parent1 = random.choice(parents)
        parent2 = random.choice(parents)
        crossover_point = random.randint(1, protein_length - 1)
        child = parent1[:crossover_point]
        child.extend(parent2[crossover_point:])

        # With probability mutation_rate, redraw the codon at one random
        # position from the codons listed for that residue. The child is a
        # fresh list, so editing it in place leaves the parents untouched.
        if random.random() < mutation_rate:
            mutation_position = random.randint(0, protein_length - 1)
            new_codon = random.choice(host_codon_table[target_protein[mutation_position]])
            child[mutation_position:mutation_position + 1] = [new_codon]

        new_population.append(child)

    population = new_population

# Pick the highest-scoring individual of the final generation (the earliest
# one wins a tie) and flatten its codon list into a single string.
final_scores = [calculate_fitness(seq, target_protein) for seq in population]
best_sequence = population[final_scores.index(max(final_scores))]
optimized_sequence_str = "".join(best_sequence)

print("Best codon-optimized sequence:", optimized_sequence_str)


Best codon-optimized sequence: TGCGGGACTGCAGGGACTACCGCGTGTTGTGCTACGGGGGGTGCGGGTTGCGCAGGCACA
