In [3]:
import numpy as np
import random

In [4]:
# Read from fastq file
reads = [
    'AACCTTTCACGGTCACCCGCGG',
    'TTTCACGGTCACCCAGTCAACC',
    'GGTTAAACCCGGTAACCGTCAT',
    'AACCTTGTGCTCCCAACGTAAA',
    'GGTTCCAAACACTTGGTCAATC',
    'TTGGAACCTTTCACGGTCACCC']

In [5]:
overlap_minimum = 12
max_error = 3
n_pieces = max_error + 1
piece_size = overlap_minimum // n_pieces
num_of_reads = len(reads)
read_len = len(reads[0])
read_len

22

In [6]:
def divide(text):
    pieces = [text[i: i + piece_size] for i in range(0, overlap_minimum, piece_size)]
    
    return pieces

In [7]:
def build_index(reads):
    index = [None] * n_pieces
    read_n = 1
    
    for read in reads:
        for i in range(n_pieces):
            
            start = i * piece_size
            
            piece = read[start:start + piece_size]
            
            if index[i] == None:
                index[i] = {}
            if piece not in index[i]:
                index[i][piece] = []
                
            index[i][piece].append(read_n)
        read_n += 1
    return index

In [8]:
def get_suffixes(read):
    N = len(read)
    suffixes = [read[i: ] for i in range(N - overlap_minimum + 1)]
    
    return suffixes

In [9]:
def overlap_scores(reads):
    
    num_of_reads = len(reads)
    
    matrix = np.zeros(shape = [num_of_reads, num_of_reads])
    
    index = build_index(reads)
    
    for read_index in range(num_of_reads):
        for S in get_suffixes(reads[read_index]):
            
            pieces = divide(S[:overlap_minimum])
            
            for i in range(n_pieces):
                
                if pieces[i] in index[i]:
                    
                    Li = index[i][pieces[i]]
                    
                    for read_no in Li:
                        
                        temp, end = 0, (i * piece_size)                        
                        s1, s2 = S[:end], reads[read_no - 1][:end]
                        
                        for char_index in range(end):
                            if(s1[char_index] != s2[char_index]):
                                temp += 1
                                
                        if temp < max_error:
                            
                            temp1, reached_end, start = 0, True, i * piece_size + piece_size
                            s1, s2 = S[start:], reads[read_no - 1][start:]
                            
                            for char_index in range(len(s1)):
                                if temp1 == max_error:
                                    reached_end = False
                                    break
                                if s1[char_index] != s2[char_index]:
                                    temp1 += 1
                                    
                            if (reached_end) and (temp1 < max_error):
                                if read_index + 1 != read_no:
                                    score = len(S)
                                    matrix[read_index, read_no - 1] = score - temp
#                                   print(read_index + 1," -> ", read_no, "Score :", score, "Error : ", temp)
                                    
                        
                    
                    break
    return matrix

In [10]:
overlap_matrix = overlap_scores(reads)
overlap_matrix

array([[ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [18., 14.,  0.,  0.,  0.,  0.]])

In [11]:
# appends s2 to s1 after removing overlap
# For example - if s1 is 'datarea' and if s2 is 'eats' then this function will return 'datareats'
def add(s1, s2):
    offset = int(overlap_matrix[s1][s2])
    return reads[s2][offset:]

In [12]:
len(add(0, 1))

22

In [13]:
def generate_genome(index_list):
    genome = reads[index_list[0]]
    for i in range(1, len(index_list)):
        genome += add(index_list[i - 1], index_list[i])
    return genome

In [14]:
len(generate_genome([5, 0, 3, 2, 1]))

92

In [15]:
def initialize_population(size):
    i = 0
    population = {} # key is genome and value is index of the reads used to create that genome
    while i < size:
        index_list = [] # To remember the order of the reads used for the generation of genome -
        # - helps in calculation of fitness scores
        temp = list(range(num_of_reads))
        while temp:
            index = random.choice(temp)
            temp.remove(index)
            index_list.append(index)
        genome = generate_genome(index_list)
        if genome not in population:
            population[genome] = index_list
            i += 1
    return population

In [16]:
population = initialize_population(3)
print(population)

{'GGTTCCAAACACTTGGTCAATCAACCTTGTGCTCCCAACGTAAAAACCTTTCACGGTCACCCGCGGTTGGAACCTTTCACGGTCACCCGGTTAAACCCGGTAACCGTCATTTTCACGGTCACCCAGTCAACC': [4, 3, 0, 5, 2, 1], 'GGTTCCAAACACTTGGTCAATCAACCTTGTGCTCCCAACGTAAAGGTTAAACCCGGTAACCGTCATAACCTTTCACGGTCACCCGCGGTTTCACGGTCACCCAGTCAACCTTGGAACCTTTCACGGTCACCC': [4, 3, 2, 0, 1, 5], 'TTTCACGGTCACCCAGTCAACCGGTTAAACCCGGTAACCGTCATGGTTCCAAACACTTGGTCAATCAACCTTTCACGGTCACCCGCGGAACCTTGTGCTCCCAACGTAAATTGGAACCTTTCACGGTCACCC': [1, 2, 4, 0, 3, 5]}


In [17]:
# Calculates overlap score for adjacent fragments
def fitness_score1(index_list):
    score = 0
    for i in range(len(index_list) - 1):
        score += overlap_matrix[index_list[i]][index_list[i + 1]]
    return score

In [18]:
for i in population:
    print(fitness_score1(population[i]))

0.0
0.0
0.0


In [19]:
# Calculates overlap score for all pairs of fragments - see paper
def fitness_score2(index_list):
    score = 0
    for i in range(len(index_list) - 1):
        for j in range(len(index_list) - 1):
            score = score + (abs(i - j) * overlap_matrix[index_list[i]][index_list[j]])
    return score

In [20]:
for i in population:
    print(fitness_score2(population[i]))

18.0
0.0
0.0


In [21]:
# Selection based on ranking - so sorting required
# fn - which fitness function to use
def selection(population, n, fn):
    # Sort the genomes based on fitness_scores in descending order
    # return the first n genomes
    new_population = {}
    fitness = {}
    for i in population:
        fitness[i] = fn(population[i])
    ordered = sorted(fitness.items(), key = lambda x : x[1], reverse = True)
    for i in range(n):
        genome = ordered[i][0]
        new_population[genome] = population[genome]
    return new_population, fitness

In [22]:
population, fitness = selection(population, 2, fitness_score1)
print(population, fitness)

{'GGTTCCAAACACTTGGTCAATCAACCTTGTGCTCCCAACGTAAAAACCTTTCACGGTCACCCGCGGTTGGAACCTTTCACGGTCACCCGGTTAAACCCGGTAACCGTCATTTTCACGGTCACCCAGTCAACC': [4, 3, 0, 5, 2, 1], 'GGTTCCAAACACTTGGTCAATCAACCTTGTGCTCCCAACGTAAAGGTTAAACCCGGTAACCGTCATAACCTTTCACGGTCACCCGCGGTTTCACGGTCACCCAGTCAACCTTGGAACCTTTCACGGTCACCC': [4, 3, 2, 0, 1, 5]} {'GGTTCCAAACACTTGGTCAATCAACCTTGTGCTCCCAACGTAAAAACCTTTCACGGTCACCCGCGGTTGGAACCTTTCACGGTCACCCGGTTAAACCCGGTAACCGTCATTTTCACGGTCACCCAGTCAACC': 0.0, 'GGTTCCAAACACTTGGTCAATCAACCTTGTGCTCCCAACGTAAAGGTTAAACCCGGTAACCGTCATAACCTTTCACGGTCACCCGCGGTTTCACGGTCACCCAGTCAACCTTGGAACCTTTCACGGTCACCC': 0.0, 'TTTCACGGTCACCCAGTCAACCGGTTAAACCCGGTAACCGTCATGGTTCCAAACACTTGGTCAATCAACCTTTCACGGTCACCCGCGGAACCTTGTGCTCCCAACGTAAATTGGAACCTTTCACGGTCACCC': 0.0}


In [23]:
# Order 1 crossover - swath(strip of area) is from start to end both inclusive
# p1, p2 are the index_list of the population
# child is also an index_list
def crossover1(p1, p2, start, end):
    temp = p1[start : end + 1]
    count = 0
    for i in p2:
        if i not in temp:
            if count < start:
                temp = [i] + temp
            else:
                temp = temp + [i]
            count += 1
    genome = generate_genome(temp)
    return genome, temp

In [24]:
print(population)
keys = list(population.keys())
crossover1(population[keys[0]], population[keys[1]], 1, 3)

{'GGTTCCAAACACTTGGTCAATCAACCTTGTGCTCCCAACGTAAAAACCTTTCACGGTCACCCGCGGTTGGAACCTTTCACGGTCACCCGGTTAAACCCGGTAACCGTCATTTTCACGGTCACCCAGTCAACC': [4, 3, 0, 5, 2, 1], 'GGTTCCAAACACTTGGTCAATCAACCTTGTGCTCCCAACGTAAAGGTTAAACCCGGTAACCGTCATAACCTTTCACGGTCACCCGCGGTTTCACGGTCACCCAGTCAACCTTGGAACCTTTCACGGTCACCC': [4, 3, 2, 0, 1, 5]}


('GGTTCCAAACACTTGGTCAATCAACCTTGTGCTCCCAACGTAAAAACCTTTCACGGTCACCCGCGGTTGGAACCTTTCACGGTCACCCGGTTAAACCCGGTAACCGTCATTTTCACGGTCACCCAGTCAACC',
 [4, 3, 0, 5, 2, 1])

In [25]:
# Edge Recombination
def crossover2():
    pass

In [26]:
def mutation(index_list):
    a = random.randint(0, num_of_reads - 1)
    b = random.randint(0, num_of_reads - 1)
    index_list[a], index_list[b] = index_list[b], index_list[a]
    return index_list

In [27]:
temp = mutation([0, 1, 2, 3, 4, 5])
print(temp, generate_genome(temp))

[0, 1, 2, 5, 4, 3] AACCTTTCACGGTCACCCGCGGTTTCACGGTCACCCAGTCAACCGGTTAAACCCGGTAACCGTCATTTGGAACCTTTCACGGTCACCCGGTTCCAAACACTTGGTCAATCAACCTTGTGCTCCCAACGTAAA


In [28]:
def genetic_algorithm(size = 5, generations = 100, select_n = 3, threshold = 20, start = 2, end = 4):
    population = initialize_population(size)
    count = 1
    while count < generations:
        print("Iteration:", count)

        population, fitness = selection(population, select_n, fitness_score1)
        
        #if any fitness core is greater than 100 break
        for i in fitness:
            if fitness[i] >= threshold:
                return i
        
        #crossover
        while len(population) < size:
            #print(len(population), size)
            temp = list(population.values())
            a = random.choice(temp)
            b = random.choice(temp)
            if a != b:
                #print("CHeck", a, b)
                genome, index_list = crossover1(a, b, start, end)
                if genome not in population:
                    population[genome] = index_list
        
        #mutation
        for i in list(population.keys()):
            temp = mutation(population[i])
            population.pop(i)
            temp_gen = generate_genome(temp)
            population[temp_gen] = temp
    
        count += 1

    return None

In [29]:
genetic_algorithm()

Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Iteration: 13
Iteration: 14
Iteration: 15
Iteration: 16
Iteration: 17
Iteration: 18
Iteration: 19
Iteration: 20
Iteration: 21
Iteration: 22
Iteration: 23
Iteration: 24
Iteration: 25
Iteration: 26
Iteration: 27
Iteration: 28
Iteration: 29
Iteration: 30
Iteration: 31
Iteration: 32
Iteration: 33
Iteration: 34
Iteration: 35
Iteration: 36
Iteration: 37
Iteration: 38
Iteration: 39
Iteration: 40
Iteration: 41
Iteration: 42
Iteration: 43
Iteration: 44
Iteration: 45
Iteration: 46
Iteration: 47
Iteration: 48
Iteration: 49


KeyboardInterrupt: 

In [None]:
p = {1:2, 3:4}
p.update({1:5})
p

In [None]:
a = [1, 2, 3]
b = [3, 2, 1]
a == b