In [5]:
import random
import numpy as np
import matplotlib.pyplot as plt
import csv

In [6]:
import random

# Nucleotide alphabet that genomes are built from.
bases = list("ACTG")
# Start codon, stored as a plain string.
start_codon = "ATG"
# Stop codons; each of them maps to "_" in ``table`` below.
stop_codons = "TGA TAG TAA".split()
# Number of bases in a freshly generated genome (51 bases = 17 codons).
gen_len = 51

# Codon table, written as (four codons, their four one-letter residues) per
# row.  "_" marks a stop codon.  Row order defines the dict's key order.
_CODON_ROWS = (
    ("ATA ATC ATT ATG", "IIIM"),
    ("ACA ACC ACG ACT", "TTTT"),
    ("AAC AAT AAA AAG", "NNKK"),
    ("AGC AGT AGA AGG", "SSRR"),
    ("CTA CTC CTG CTT", "LLLL"),
    ("CCA CCC CCG CCT", "PPPP"),
    ("CAC CAT CAA CAG", "HHQQ"),
    ("CGA CGC CGG CGT", "RRRR"),
    ("GTA GTC GTG GTT", "VVVV"),
    ("GCA GCC GCG GCT", "AAAA"),
    ("GAC GAT GAA GAG", "DDEE"),
    ("GGA GGC GGG GGT", "GGGG"),
    ("TCA TCC TCG TCT", "SSSS"),
    ("TTC TTT TTA TTG", "FFLL"),
    ("TAC TAT TAA TAG", "YY__"),
    ("TGC TGT TGA TGG", "CC_W"),
)

# Maps every three-letter codon to its one-letter residue code.
table = {
    codon: residue
    for codons, residues in _CODON_ROWS
    for codon, residue in zip(codons.split(), residues)
}

# Residue classes, one-letter codes.
hydrophilic = list("RNDQEHKST")

hydrophobic = list("AILMFWYV")

neutral = list("GCP")

# Translation symbol for a stop codon.
stop = ["_"]

class indiv:
    """A single candidate of the genetic algorithm.

    Attributes:
        genome: list of one-letter bases taken from ``bases``.
        amino_acid: list of one-letter residues obtained by translating
            ``genome`` codon by codon through ``table``.
        fitness: integer score last computed by ``calcFitness``.
    """

    def __init__(self):
        """Create an individual with a random genome of ``gen_len`` bases."""
        self.fitness = 0
        self.genome = [random.choice(bases) for _ in range(gen_len)]
        self.amino_acid = []
        self.to_amino_acid()

    def to_amino_acid(self):
        """Translate ``genome`` into residues, store and return the list.

        The genome is read in consecutive three-base steps starting at
        index 0.  A codon that is not in ``table`` (in practice a trailing
        fragment shorter than three bases) becomes ``"X"``, the same
        placeholder the constructor uses, so the result only ever contains
        strings and can be joined safely.
        """
        self.amino_acid = [
            table.get(''.join(self.genome[pos:pos + 3]), "X")
            for pos in range(0, len(self.genome), 3)
        ]
        return self.amino_acid

    def print(self):
        """Print the genome as one string, followed by the fitness."""
        print(''.join(self.genome))
        print("fitness = ", self.fitness)

    def __str__(self):
        """Return the genome and the fitness on two lines."""
        return f"{''.join(self.genome)}\nfitness = {self.fitness}"

    def copy(self, source):
        """Copy ``genome`` and ``amino_acid`` from ``source`` into self.

        ``fitness`` is deliberately left untouched here; call
        ``calcFitness`` to bring it up to date after copying.
        """
        self.genome = source.genome.copy()
        self.amino_acid = source.amino_acid.copy()

    def crossover(self, other):
        """One-point crossover with ``other``, modifying both individuals.

        A cut point is drawn within the shorter genome and the bases from
        that point up to the shorter genome's length are swapped.  Both
        genomes are then cut back to a multiple of three bases and both
        translations are refreshed.
        """
        min_len = min(len(self.genome), len(other.genome))

        # Nothing to swap if either genome is empty; randint(0, -1) would
        # raise ValueError.
        if min_len > 0:
            cross_point = random.randint(0, min_len - 1)
            (self.genome[cross_point:min_len],
             other.genome[cross_point:min_len]) = (
                other.genome[cross_point:min_len],
                self.genome[cross_point:min_len],
            )

        # Drop trailing bases that cannot form a whole codon -- on BOTH
        # participants, so neither ends up with a partial codon.
        for participant in (self, other):
            leftover = len(participant.genome) % 3
            if leftover:
                participant.genome = participant.genome[:-leftover]
            participant.to_amino_acid()

    def insertion(self):
        """Insert three random bases at a random position of the genome."""
        # An empty genome has no valid index to draw; insert at the front.
        start = random.randint(0, len(self.genome) - 1) if self.genome else 0
        new_gene = [random.choice(bases) for _ in range(3)]
        self.genome = self.genome[:start] + new_gene + self.genome[start:]
        self.to_amino_acid()

    def deletion(self):
        """Delete three consecutive bases starting at a random position.

        The start position is limited so that exactly three bases are
        always removed; a genome shorter than three bases is left as is.
        """
        length = 3  # only whole codon-sized stretches are deleted
        if len(self.genome) >= length:
            start = random.randint(0, len(self.genome) - length)
            self.genome = self.genome[:start] + self.genome[start + length:]
        self.to_amino_acid()

    def _mutate_region(self, start, end):
        """Scan codons in ``genome[start:end]`` and mutate the region.

        For every codon that translates to a hydrophilic residue or to a
        stop, three randomly chosen positions inside ``[start, end)`` are
        overwritten with random bases.
        """
        for pos in range(start, end, 3):
            residue = table.get(''.join(self.genome[pos:pos + 3]))
            if residue in hydrophilic or residue == "_":
                for _ in range(3):
                    self.genome[random.randint(start, end - 1)] = random.choice(bases)

    def mutate(self):
        """Apply region-wise mutation, then refresh the translation.

        The genome is split into three regions whose boundaries lie at
        round(25%) and round(75%) of the codon count -- the same split
        ``calcFitness`` applies to the residues -- so every codon examined
        here is one that translation actually reads.

        NOTE(review): hydrophilic codons trigger mutation in all three
        regions, although ``calcFitness`` rewards hydrophilic residues in
        the first and last region.  Kept as in the original; confirm the
        intent.
        """
        total = len(self.genome)
        n_codons = (total + 2) // 3  # same count to_amino_acid produces
        first_end = min(round(n_codons * .25) * 3, total)
        middle_end = min(round(n_codons * .75) * 3, total)

        self._mutate_region(0, first_end)
        self._mutate_region(first_end, middle_end)
        self._mutate_region(middle_end, total)

        self.to_amino_acid()

    def calcFitness(self):
        """Score ``amino_acid``, store the result in ``fitness``, return it.

        Residues are split by index at round(25%) and round(75%) of the
        sequence length:

        * first and last region: hydrophilic +1, hydrophobic -1
        * middle region: hydrophilic -1, hydrophobic +1
        * any region: stop ("_") -1, neutral and anything else 0

        Every residue is scored, including the last one.
        """
        residues = self.amino_acid
        first_end = round(len(residues) * .25)
        middle_end = round(len(residues) * .75)

        score = 0
        for idx, residue in enumerate(residues):
            in_middle = first_end <= idx < middle_end
            if residue in hydrophilic:
                score += -1 if in_middle else 1
            elif residue in hydrophobic:
                score += 1 if in_middle else -1
            elif residue in stop:
                score -= 1
            # neutral or unknown residues contribute nothing

        self.fitness = score
        return self.fitness



In [7]:
# Number of individuals a freshly constructed population holds.
popSize = 200


class pop:
    """A population of ``indiv`` objects plus summary statistics.

    Attributes:
        population: list of individuals.
        bestFit: highest fitness found by the last ``calcStats`` call.
        best: index into ``population`` of the individual with ``bestFit``.
        avgFit: mean fitness found by the last ``calcStats`` call.
    """

    def __init__(self):
        """Create ``popSize`` random individuals and compute statistics."""
        self.population = [indiv() for _ in range(popSize)]
        self.bestFit = 0  # best fitness
        self.best = 0  # index of best individual
        self.avgFit = 0
        self.calcStats()

    def generation(self):
        """Replace the population with offspring of tournament winners.

        Offspring are produced in pairs: two tournament winners are copied,
        both receive the same kind of indel (insertion or deletion, chosen
        by a coin flip), the pair is crossed over, and each is mutated.
        If the population size is odd, the last offspring is produced the
        same way but without crossover.  The results are copied back into
        the existing individuals.

        This method does not call ``calcStats``; call it afterwards to
        refresh the fitness values and statistics.
        """
        size = len(self.population)
        # Scratch individuals that receive the offspring; selection keeps
        # reading the unmodified current population while these are built.
        offspring = [indiv() for _ in range(size)]

        for i in range(0, size - 1, 2):
            first, second = offspring[i], offspring[i + 1]
            first.copy(self.population[self.tourn()])  # tournament selection
            second.copy(self.population[self.tourn()])

            # Coin flip: both siblings get an insertion or both a deletion.
            if random.randint(0, 1) == 0:
                first.insertion()
                second.insertion()
            else:
                first.deletion()
                second.deletion()

            first.crossover(second)

            first.mutate()
            second.mutate()

        if size % 2:
            # Unpaired last slot: there is no partner to cross over with.
            loner = offspring[-1]
            loner.copy(self.population[self.tourn()])
            if random.randint(0, 1) == 0:
                loner.insertion()
            else:
                loner.deletion()
            loner.mutate()

        for current, child in zip(self.population, offspring):
            current.copy(child)

    def tourn(self):
        """Run a tournament of six random entrants; return the winner's index.

        Entrants are drawn with replacement from the current population and
        compared by their stored ``fitness``; ties keep the earlier entrant.

        Raises:
            ValueError: if the population is empty.
        """
        size = len(self.population)
        best = random.randint(0, size - 1)  # the winner so far
        bestfit = self.population[best].fitness
        for _ in range(5):  # five challengers + the initial pick = 6 entrants
            challenger = random.randint(0, size - 1)
            if self.population[challenger].fitness > bestfit:
                bestfit = self.population[challenger].fitness
                best = challenger
        return best

    def calcStats(self):
        """Recompute every fitness and update ``bestFit``/``best``/``avgFit``.

        For an empty population all three statistics are reset to 0.
        """
        self.avgFit = 0
        self.bestFit = 0
        self.best = 0
        if not self.population:
            return

        total = 0
        for i, member in enumerate(self.population):
            member.calcFitness()  # update fitness
            # Strict '>' keeps the earliest individual among equals.
            if i == 0 or member.fitness > self.bestFit:
                self.bestFit = member.fitness
                self.best = i
            total += member.fitness
        self.avgFit = total / len(self.population)

    def randGenome(self):
        """Return a uniformly random individual from the population.

        Raises:
            ValueError: if the population is empty.
        """
        return self.population[random.randint(0, len(self.population) - 1)]

    def clear(self):
        """Remove all individuals from the population."""
        self.population = []
        


In [13]:
# Load the saved sequences: every CSV row becomes one string made by
# concatenating that row's cells.
gen_array = []
# newline="" lets the csv module do its own newline handling, as its
# documentation requires for file objects passed to csv.reader.
with open("gen_array.csv", "r", newline="") as genfile:
    gen_array.extend("".join(row) for row in csv.reader(genfile))

# Bare expression so the notebook displays the loaded list.
gen_array

['YGLPCRPDKVYNTDTESLSAALAALCRWLG_VNSV',
 'PQYPAPAGSNILGSVGLVSVIANFSYASTSQTPV_QQ',
 '_ANICHELSLPF_LAMPPQLVWYYD_G_RRYDEQKMI',
 'GHNRRNRTQLRSITTDADSARYCLHNQMFSHTR',
 'KYPRHVFGLGVVPGGLMGV_LFGAC_VVRQHSL',
 'GESASVGPRIFSILGLVRLPAPLRYVSASTQ',
 'RARLAPQYEELVRLAIMCMSSTAFRSLDNPW',
 'HVD_ESKAQMAFIAHIWCFVGVCKAWEGL_INRLK',
 'LHATKH__LLVYLTTPLIIVLLLSAGNLSSTQPIA',
 'AKRGLYSKFVFLALAFCSGIYIRPPQSTSIQ']

In [15]:
# Notebook inspection: show the type of the first loaded entry (each entry is
# built with ''.join(row) when the CSV is read, so it is a str).
type(gen_array[0])

str