In [46]:
import numpy as np
import random

In [47]:
# Placeholder reads; the original note says these are meant to be read from a FASTQ file.
reads = ['area', 'read', 'adds', 'kind', 'eats']
# Derive the sizes from the data so they cannot drift out of sync with `reads`.
num_of_reads = len(reads)
read_len = len(reads[0])  # assumes every read has the same length, as in the sample above
# Minimum overlap length passed to overlap(); 0 accepts overlaps of any length.
min_overlap = 0
# overlap_matrix[i, j] is filled in later with overlap(reads[i], reads[j]).
overlap_matrix = np.zeros([num_of_reads, num_of_reads])

In [48]:
def overlap(a, b, min_len = 3):
    """Return the length of the longest suffix of `a` that is a prefix of `b`.

    Candidate positions are the occurrences of `b[:min_len]` inside `a`,
    scanned from left to right, so the first hit is the longest overlap.

    Args:
        a: string whose suffix is matched.
        b: string whose prefix is matched.
        min_len: length of the prefix of `b` used to seed the search.

    Returns:
        The overlap length, or 0 when no candidate position yields a match.
    """
    seed = b[:min_len]
    pos = a.find(seed)
    while pos != -1:
        # Everything from `pos` to the end of `a` must be a prefix of `b`.
        if b.startswith(a[pos:]):
            return len(a) - pos
        pos = a.find(seed, pos + 1)
    return 0

In [49]:
# Sanity check: the suffix 'ea' of 'area' is a prefix of 'eats', so the result is 2.
overlap('area', 'eats', min_overlap)

2

In [50]:
def build_overlap_matrix(min_len = 3):
    """Fill the global `overlap_matrix` with pairwise overlaps of `reads`.

    After the call, entry [row, col] holds overlap(reads[row], reads[col], min_len)
    for every ordered pair of indices below `num_of_reads`.

    Args:
        min_len: forwarded to overlap() as its seed length.
    """
    global overlap_matrix
    indices = range(num_of_reads)
    for row in indices:
        suffix_read = reads[row]
        for col in indices:
            overlap_matrix[row, col] = overlap(suffix_read, reads[col], min_len)

In [51]:
# Fill the global overlap_matrix for every ordered pair of reads, then display it.
# Entry [i, j] is overlap(reads[i], reads[j]); the diagonal equals the read length
# because each read overlaps itself completely.
build_overlap_matrix(min_overlap)
overlap_matrix

array([[4., 3., 1., 0., 2.],
       [0., 4., 2., 0., 0.],
       [0., 0., 4., 0., 0.],
       [0., 0., 0., 4., 0.],
       [0., 0., 0., 0., 4.]])

In [52]:
# Appends s2 to s1 after removing the part of s2 that overlaps the end of s1.
# For example, if s1 is 'area' and s2 is 'eats', this function returns 'areats'.
def add(s1, s2):
    """Concatenate `s2` onto `s1`, dropping the prefix of `s2` that overlaps the end of `s1`.

    The overlap length comes from overlap(s1, s2, min_overlap), using the
    global `min_overlap` setting.
    """
    shared = overlap(s1, s2, min_overlap)
    tail = s2[shared:]
    return s1 + tail

In [53]:
# Example: the shared 'ea' is kept only once, giving 'areats'.
add('area', 'eats')

'areats'

In [54]:
def initialize_population(size):
    """Build `size` distinct genomes, each from a random ordering of all reads.

    Args:
        size: number of distinct genomes to generate.

    Returns:
        A dict mapping each genome string to the list of read indices, in the
        order they were merged; that order is what the fitness functions score.
    """
    population = {}
    # A genome is only stored when it is new, so the dict size counts the successes.
    while len(population) < size:
        remaining = list(range(num_of_reads))
        order = []
        genome = ''
        while remaining:
            pick = random.choice(remaining)
            remaining.remove(pick)
            order.append(pick)
            genome = add(genome, reads[pick])
        if genome not in population:
            population[genome] = order
    return population

In [55]:
# Draw three distinct random genomes. No random seed is set in the cells shown,
# so the genomes (and therefore the dict keys) differ from run to run.
population = initialize_population(3)
print(population)

{'readareaddseatskind': [1, 0, 2, 4, 3], 'addseatskindreadarea': [2, 4, 3, 1, 0], 'areadeatskindadds': [0, 1, 4, 3, 2]}


In [60]:
# Calculates overlap score for adjacent fragments
def fitness_score1(index_list):
    """Sum the overlaps between each read and the read that follows it.

    Args:
        index_list: read indices in the order they appear in the genome.

    Returns:
        The sum of overlap_matrix[x][y] over consecutive pairs (x, y);
        0 when there are fewer than two reads.
    """
    neighbours = zip(index_list, index_list[1:])
    return sum(overlap_matrix[left][right] for left, right in neighbours)

In [68]:
# Score an explicit read order instead of looking a genome up by key: the keys of
# `population` come from random draws, so a hard-coded key raises KeyError on a fresh run.
# [0, 1, 4, 3, 2] is the order that produced 'areadeatskindadds' in the recorded run.
fitness_score1([0, 1, 4, 3, 2])

3.0

In [62]:
# Calculates overlap score for all pairs of fragments - see paper
def fitness_score2(index_list):
    """Sum the overlaps of all ordered pairs of reads, weighted by their distance.

    Each pair of positions (i, j) contributes
    abs(i - j) * overlap_matrix[index_list[i]][index_list[j]].

    Args:
        index_list: read indices in the order they appear in the genome.

    Returns:
        The weighted sum over every pair of positions; 0 for an empty list.

    NOTE(review): in the paper this score appears to be one that should be
    minimised, whereas selection() ranks in descending order; confirm before
    combining the two.
    """
    score = 0
    n = len(index_list)
    # Both loops cover every position; stopping at n - 1 would silently leave
    # the last read out of the "all pairs" sum.
    for i in range(n):
        for j in range(n):
            score = score + (abs(i - j) * overlap_matrix[index_list[i]][index_list[j]])
    return score

In [63]:
# Score an explicit read order instead of looking a genome up by key: the keys of
# `population` come from random draws, so a hard-coded key raises KeyError on a fresh run.
# [1, 0, 2, 4, 3] is the order that produced 'readareaddseatskind' in the recorded run.
fitness_score2([1, 0, 2, 4, 3])

12.0

In [66]:
# Selection based on ranking - so sorting required
# fn - which fitness function to use
def selection(population, n, fn):
    """Keep the `n` genomes with the highest fitness.

    Args:
        population: dict mapping genome string to its list of read indices.
        n: number of genomes to keep.
        fn: fitness function applied to each list of read indices.

    Returns:
        A new dict with the `n` best genomes, ordered from highest to lowest
        score; genomes with equal scores keep their original relative order.

    Raises:
        IndexError: if `n` exceeds the number of genomes in `population`.
    """
    scored = [(genome, fn(order)) for genome, order in population.items()]
    scored.sort(key = lambda pair : pair[1], reverse = True)
    survivors = {}
    for rank in range(n):
        genome = scored[rank][0]
        survivors[genome] = population[genome]
    return survivors

In [69]:
# Keep the two genomes with the highest fitness_score1 and display them.
# Note: this rebinds `population`, so re-running the cell selects from the
# already reduced population.
population = selection(population, 2, fitness_score1)
population

{'areadeatskindadds': [0, 1, 4, 3, 2], 'readareaddseatskind': [1, 0, 2, 4, 3]}

In [None]:
def crossover1():
    