In [25]:
import numpy as np
import random

In [26]:
# Reads to assemble. They are hard-coded here for now.
# TODO: load the reads from a FASTQ file instead.
reads = ['area', 'read', 'adds', 'kind', 'eats']

# Sizes are derived from `reads` so they cannot drift out of sync with the data.
num_of_reads = len(reads)
read_len = len(reads[0]) if reads else 0  # every read in the list above has the same length

# Minimum overlap length accepted when two reads are compared (0 = accept any overlap).
min_overlap = 0

# Square matrix with one row and one column per read; filled in by build_overlap_matrix().
overlap_matrix = np.zeros([num_of_reads, num_of_reads])

In [27]:
def overlap(a, b, min_len = 3):
    """Return the length of the longest suffix of `a` that is also a prefix of `b`.

    Only overlaps of at least `min_len` characters are reported; if there is
    none, 0 is returned. The suffix may be all of `a`.

    Parameters
    ----------
    a : str
        String whose end is compared.
    b : str
        String whose beginning is compared.
    min_len : int
        Minimum overlap length that counts. With 0, any overlap is accepted.

    Returns
    -------
    int
        Number of overlapping characters, or 0 when no qualifying overlap exists.
    """
    # An overlap can never be longer than either string. Without this guard a
    # `b` shorter than `min_len` makes b[:min_len] silently shrink to `b`, and an
    # overlap shorter than the requested minimum would be reported.
    if min_len > len(a) or min_len > len(b):
        return 0

    seed = b[:min_len]  # every qualifying overlap has to start with this prefix of b
    pos = a.find(seed)
    while pos != -1:
        # a[pos:] is a candidate suffix of a; it is an overlap iff b begins with it.
        # Candidates are visited left to right, so the first hit is the longest.
        if b.startswith(a[pos:]):
            return len(a) - pos
        pos = a.find(seed, pos + 1)
    return 0

In [28]:
# Sanity check with the notebook-wide minimum overlap: 'area' ends with 'ea' and 'eats' starts with 'ea'.
overlap('area', 'eats', min_overlap)

2

In [29]:
def build_overlap_matrix(min_len = 3):
    """Fill the global `overlap_matrix` with the pairwise overlaps of the reads.

    For every pair of indices (i, j) below `num_of_reads`, entry [i, j] is set
    to overlap(reads[i], reads[j], min_len). The matrix is updated in place and
    nothing is returned.
    """
    global overlap_matrix
    read_ids = range(num_of_reads)
    for row in read_ids:
        suffix_read = reads[row]  # the read whose end is compared
        for col in read_ids:
            overlap_matrix[row, col] = overlap(suffix_read, reads[col], min_len)

In [30]:
# Fill the global matrix using the notebook-wide minimum overlap, then display it.
# Entry [i, j] is overlap(reads[i], reads[j]); the diagonal compares each read with itself.
build_overlap_matrix(min_overlap)
overlap_matrix

array([[4., 3., 1., 0., 2.],
       [0., 4., 2., 0., 0.],
       [0., 0., 4., 0., 0.],
       [0., 0., 0., 4., 0.],
       [0., 0., 0., 0., 4.]])

In [31]:
def add(s1, s2):
    """Append `s2` to `s1`, writing their overlapping characters only once.

    The overlap is measured with overlap() using the global `min_overlap`.
    For example, if s1 is 'area' and s2 is 'eats', the result is 'areats'.
    """
    shared = overlap(s1, s2, min_overlap)
    novel_tail = s2[shared:]  # part of s2 that s1 does not already end with
    return s1 + novel_tail

In [32]:
# Example: the shared 'ea' appears only once in the merged string.
add('area', 'eats')

'areats'

In [33]:
def generate_genome(index_list):
    """Assemble a genome string by merging reads in the given order.

    `index_list` holds positions into the global `reads` list. Starting from
    an empty string, each selected read is merged onto the growing genome
    with add(). An empty `index_list` yields ''.
    """
    assembled = ''
    for fragment in (reads[k] for k in index_list):
        assembled = add(assembled, fragment)
    return assembled

In [34]:
# Assemble the reads in the order 'kind', 'adds', 'eats', 'read', 'area'.
generate_genome([3, 2, 4, 1, 0])

'kindaddseatsreadarea'

In [35]:
def initialize_population(size):
    """Create a population of `size` distinct random genomes.

    Each genome is built with generate_genome() from a random ordering of all
    read indices 0 .. num_of_reads - 1.

    Parameters
    ----------
    size : int
        Number of distinct genomes to generate.

    Returns
    -------
    dict
        Maps each genome string to the index list (read order) it was built
        from; the index list is what the fitness functions operate on.

    Raises
    ------
    ValueError
        If `size` distinct genomes cannot be produced from the reads.
    """
    import math  # local import: only needed for the feasibility checks below

    # There are only num_of_reads! orderings. Asking for more distinct genomes
    # than that used to make the sampling loop spin forever.
    max_orderings = math.factorial(num_of_reads)
    if size > max_orderings:
        raise ValueError(
            "cannot build %s distinct genomes from %s reads (only %s orderings exist)"
            % (size, num_of_reads, max_orderings)
        )

    population = {}  # key is genome, value is the read order used to create it
    tried = set()    # orderings sampled so far, to detect exhaustion
    while len(population) < size:
        # Different orderings can collapse into the same genome string, so the
        # orderings may run out before `size` distinct genomes are found.
        if len(tried) == max_orderings:
            raise ValueError(
                "only %s distinct genomes can be built from the reads, %s requested"
                % (len(population), size)
            )
        # A random permutation of all read indices.
        index_list = random.sample(range(num_of_reads), num_of_reads)
        tried.add(tuple(index_list))
        genome = generate_genome(index_list)
        if genome not in population:
            population[genome] = index_list
    return population

In [36]:
# Draw three distinct random genomes. No random seed is set in the visible cells,
# so the printed population differs from run to run.
population = initialize_population(3)
print(population)

{'areaddseatsreadkind': [0, 2, 4, 1, 3], 'eatsreadareakindadds': [4, 1, 0, 3, 2], 'kindreadareaddseats': [3, 1, 0, 2, 4]}


In [39]:
def fitness_score1(index_list):
    """Sum the overlaps between adjacent fragments of an ordering.

    For every pair of consecutive entries (left, right) in `index_list`, the
    value overlap_matrix[left][right] is added up. Orderings with fewer than
    two entries score 0.
    """
    neighbours = zip(index_list, index_list[1:])
    return sum(overlap_matrix[left][right] for left, right in neighbours)

In [40]:
# Score the first genome in the population (dicts keep insertion order) with the adjacent-overlap fitness.
fitness_score1(population[list(population.keys())[0]])

1.0

In [41]:
def fitness_score2(index_list):
    """Distance-weighted overlap score over all ordered pairs of fragments.

    Computes the sum, over every pair of positions (i, j) in `index_list`, of
    abs(i - j) * overlap_matrix[index_list[i]][index_list[j]]. Pairs with
    i == j contribute nothing because their distance is 0. The original
    comment refers to a paper for this formula.

    NOTE(review): with this weighting, strong overlaps between fragments that
    are far apart increase the score, so it looks like a penalty (lower is
    better), while `selection` keeps the highest scores - confirm the intended
    direction before combining the two.
    """
    # Every position takes part, including the last one; the previous
    # range(len(index_list) - 1) silently ignored the final fragment.
    positions = range(len(index_list))
    score = 0
    for i in positions:
        for j in positions:
            score = score + (abs(i - j) * overlap_matrix[index_list[i]][index_list[j]])
    return score

In [42]:
# Score the first genome in the population (dicts keep insertion order) with the all-pairs fitness.
fitness_score2(population[list(population.keys())[0]])

18.0

In [43]:
def selection(population, n, fn):
    """Rank-based selection: keep the `n` genomes with the highest fitness.

    Parameters
    ----------
    population : dict
        Maps genome -> index list.
    n : int
        Number of genomes to keep.
    fn : callable
        Fitness function to use; it is called with a genome's index list and
        higher values rank first.

    Returns
    -------
    dict
        New genome -> index list mapping holding the `n` best genomes, best
        first. Genomes with equal fitness keep their order from `population`.
    """
    # Sorting is needed because selection is based on rank, not on raw score.
    ranked = sorted(population, key = lambda genome: fn(population[genome]), reverse = True)
    survivors = {}
    for rank in range(n):
        best = ranked[rank]
        survivors[best] = population[best]
    return survivors

In [44]:
# Keep the two best genomes by adjacent-overlap score. This rebinds `population`,
# so re-running the cell selects from the already-reduced population.
population = selection(population, 2, fitness_score1)
population

{'areaddseatsreadkind': [0, 2, 4, 1, 3],
 'kindreadareaddseats': [3, 1, 0, 2, 4]}

In [49]:
def crossover1(p1, p2, start, end):
    """Order 1 crossover of two parent orderings.

    The swath p1[start .. end] (both ends inclusive) is copied from the first
    parent. The entries of `p2` that are not in the swath are then placed
    around it in the order they occur in `p2`: the first `start` of them go
    before the swath, the rest after it.

    Parameters
    ----------
    p1, p2 : list
        Index lists (read orderings) of the two parents.
    start, end : int
        First and last position of the swath taken from `p1`.

    Returns
    -------
    dict
        Single-entry mapping of the child genome to the child index list.
    """
    swath = list(p1[start : end + 1])

    # Entries of p2 that still have to be placed, in p2 order.
    placed = set(swath)
    rest = []
    for idx in p2:
        if idx not in placed:
            placed.add(idx)
            rest.append(idx)

    # Split `rest` around the swath. The earlier version prepended the leading
    # entries one at a time, which reversed their order whenever start >= 2.
    lead = max(start, 0)  # a negative start puts everything after the swath
    child_order = rest[:lead] + swath + rest[lead:]

    genome = generate_genome(child_order)
    return {genome: child_order}

In [50]:
# Cross the two surviving orderings, taking positions 1..3 (inclusive) from the first parent.
print(population)
keys = list(population.keys())
crossover1(population[keys[0]], population[keys[1]], 1, 3)

{'areaddseatsreadkind': [0, 2, 4, 1, 3], 'kindreadareaddseats': [3, 1, 0, 2, 4]}


{'kindaddseatsreadarea': [3, 2, 4, 1, 0]}