In [1]:
from scipy.sparse import csr_matrix, coo_matrix, vstack, hstack, save_npz, load_npz #! pip install scipy
from jellyfish import damerau_levenshtein_distance #! pip install jellyfish
import numpy as np #! pip install numpy
from itertools import product
from math import ceil, fmod
import random

In [2]:
def generate_reads(sequence,min_subseq_len,max_subseq_len,min_overlap,max_overlap,min_coverage=None,circularise=False,seed=None,shuffle=True,return_unshuffle=False):
    '''
    DESCRIPTION
        Utility function that chops a sequence into several reads with bounded random lengths that
        have a bounded random overlap
    INPUT
        sequence         | a sequence of characters that will be divided into overlapping subsequences
        min_subseq_len   | the shortest length a subsequence can have
        max_subseq_len   | the longest length a subsequence can have
        min_overlap      | the shortest overlap two subsequences can share
        max_overlap      | the longest overlap two subsequences can share
        min_coverage     | if not None, further passes over the sequence are appended to the reads
                         | until (total length of the distinct reads) / len(sequence) reaches it
        circularise      | boolean indicating whether to add a random amount of the end of the sequence
                         | to the beginning and vice versa
        seed             | random seed for the random function for reproducibility
        shuffle          | boolean indicating whether the reads are returned in shuffled order
        return_unshuffle | currently ignored; kept for backward compatibility
    OUTPUT
        shuffle=False | the list of reads in the order they were cut from the sequence
        shuffle=True  | a tuple (shuffled_reads, order) where order[k] is the position in
                      | shuffled_reads of the k-th read that was cut from the sequence
    '''
    import random

    random.seed(seed)
    if circularise:
        # the tail length is drawn first and the head length second, each exactly once
        tail = sequence[-random.randint(min_overlap,max_overlap):]
        head = sequence[:random.randint(min_overlap,max_overlap)]
        sequence = tail + sequence + head
    reads = []
    while True:
        # one full left-to-right pass over the sequence; reads from earlier passes are kept
        start = 0
        end = random.randint(min_subseq_len,max_subseq_len)
        reads.append(sequence[start:end])
        while end < len(sequence):
            start = random.randint(end-max_overlap,end-min_overlap)
            remaining = len(sequence) - start
            if remaining/max_subseq_len < 1:
                # what is left fits in a single read: take all of it
                end = len(sequence)
            elif remaining/max_subseq_len < 2:
                # close to the end: raise the lower length bound so that the read after
                # this one is not left with a stub
                a = 0
                while remaining/(min_subseq_len+a) > 2: a += 1
                end = random.randint(start+min_subseq_len+a,start+max_subseq_len)
            else:
                end = random.randint(start+min_subseq_len,start+max_subseq_len)
            reads.append(sequence[start:end])
        if min_coverage is not None:
            distinct = set(reads)
            if sum(len(read) for read in distinct)/len(sequence) < min_coverage: continue
        if not shuffle: return reads
        # Shuffle positions rather than the reads themselves: random.shuffle performs the
        # same swaps whatever the list holds, so the shuffled reads are unchanged, while
        # the original order is recovered in O(n) and stays a true permutation even when
        # two reads are identical (list.index would return the first match for both).
        permutation = list(range(len(reads)))
        random.shuffle(permutation)
        reads_ = [reads[k] for k in permutation]
        order = [0]*len(reads)
        for position, k in enumerate(permutation): order[k] = position
        return reads_, order

def generate_genome_sequence(n,palindrome=False,seed=None):
    '''
    DESCRIPTION
        Utility function that creates a random sequence containing only the letters A, T, G, and C
    INPUT
        n          | the length of the sequence
        palindrome | a boolean indicating whether the sequence must be a palindrome or not
        seed       | random seed for the random function for reproducibility
    OUTPUT
        A random sequence of length n
    '''
    import random

    random.seed(seed)
    nucleotides = {1:'A',2:'C',3:'G',4:'T'}
    # a palindrome only needs its first ceil(n/2) letters drawn at random
    drawn = (n + 1)//2 if palindrome else n
    seq = ''.join(nucleotides[random.randint(1,4)] for _ in range(drawn))
    # Mirror the first floor(n/2) letters so the total length is exactly n; for odd n
    # the middle letter is not repeated. The parity must come from the requested
    # length n, not from the half length.
    if palindrome: seq += seq[:n//2][::-1]
    return seq

# Sequitur

In [3]:
def move_col(B: coo_matrix, cols: dict) -> None:
    '''
    DESCRIPTION
        Relabels, in place, the column index of every stored entry of B
    INPUT
        B    | a COO sparse matrix whose col array is rewritten
        cols | a dictionary mapping every current column index to its new column index
    OUTPUT
        None; B.col is modified in place
    '''
    for position, current in enumerate(B.col):
        B.col[position] = cols[current]
            
def move_row(B: coo_matrix,rows: dict) -> None:
    '''
    DESCRIPTION
        Relabels, in place, the row index of every stored entry of B
    INPUT
        B    | a COO sparse matrix whose row array is rewritten
        rows | a dictionary mapping every current row index to its new row index
    OUTPUT
        None; B.row is modified in place
    '''
    for position, current in enumerate(B.row):
        B.row[position] = rows[current]

def normalised_damerau_levenshtein_distance(read: str,overlap: str) -> float:
    '''
    DESCRIPTION
        Damerau-Levenshtein distance between the leading characters of read and overlap,
        divided by the number of characters compared
    INPUT
        read    | first sequence; converted with str() before comparison
        overlap | second sequence; converted with str() before comparison
    OUTPUT
        The distance between the first min(len(read),len(overlap)) characters of both
        inputs divided by that length; raises ZeroDivisionError if either input is empty
    '''
    span = min(len(overlap),len(read))
    distance = damerau_levenshtein_distance(str(read)[:span],str(overlap)[:span])
    return distance/span

def build_suffix_array(reads: list, min_suf_len: int = 3) -> tuple:
    '''
    DESCRIPTION
        Builds a sorted array of the suffixes of all reads together with the index of the
        read each suffix was taken from
    INPUT
        reads       | list of reads; reads are assumed not to contain the character '$'
        min_suf_len | the shortest suffix (in characters of the read) that is kept; at
                    | least one character is always required
    OUTPUT
        A tuple (suf_arr, suf_arr_ind) where suf_arr holds every kept suffix followed by
        the terminator '$' and suf_arr_ind[s] is the index in reads of the read that
        suf_arr[s] comes from
    '''
    tagged = []
    for idx, read in enumerate(reads):
        # enumerate gives every read its own index, even when two reads are identical
        tag = '$' + str(idx)
        # The length filter is applied to the read itself, before the tag is attached,
        # so the cut-off no longer depends on how many digits the read index has.
        n_kept = len(read) - max(min_suf_len, 1) + 1
        for i in range(n_kept):
            tagged.append(read[i:] + tag)
    # sorting with the tag attached keeps the ordering of equal suffixes as before
    tagged.sort()
    suf_arr, suf_arr_ind = [], []
    for entry in tagged:
        suf_arr_ind.append(int(str(entry.split('$')[-1])))
        suf_arr.append(entry[:entry.find('$')+1])
    return suf_arr,suf_arr_ind

def create_bipartite_adjacency_matrix(reads: list, suf_arr: list = None, suf_arr_ind: list = None, max_diff: float = 0.25, min_suf_len: int = 3) -> dict:
    '''
    DESCRIPTION
        Finds, for every read, the other reads that have a suffix equal to a prefix of
        that read and records the length of the longest such overlap
    INPUT
        reads       | list of reads
        suf_arr     | suffix array as returned by build_suffix_array; built from reads
                    | when either suf_arr or suf_arr_ind is None
        suf_arr_ind | read index of every entry of suf_arr
        max_diff    | a candidate is only kept if its normalised Damerau-Levenshtein
                    | distance to the read is strictly below this value
        min_suf_len | shortest suffix length, forwarded to build_suffix_array; the first
                    | min_suf_len + 1 suffixes of every read are used as starting points
    OUTPUT
        A dictionary keyed by (index of the read owning the suffix, index of the read
        whose prefix matches) holding the longest overlap length found for that pair.
        Raises ValueError if one of the starting suffixes of a read is not in suf_arr.
    '''
    if suf_arr is None or suf_arr_ind is None:
        # forward min_suf_len so the array agrees with the starting points used below
        suf_arr,suf_arr_ind = build_suffix_array(reads,min_suf_len)
    reads_map = dict(zip(reads,range(len(reads))))
    B = {}
    for read in reads:
        for j in range(min_suf_len + 1):
            i = suf_arr.index(read[j:]+'$') - 1
            # Walk towards the start of the array while the entries still resemble the
            # read; stop at index 0 rather than wrapping around to the end of the list.
            while i >= 0:
                overlap = suf_arr[i][:-1]
                diff = normalised_damerau_levenshtein_distance(read,overlap)
                if diff > 0.5: break
                other = reads[suf_arr_ind[i]]
                if not other == read and diff < max_diff and read.startswith(overlap):
                    key = (reads_map[other],reads_map[read])
                    # keep the longest overlap seen for this pair of reads
                    B[key] = max(len(overlap),B.get(key,0))
                i -= 1
    return B

def find_lower_diagonal_path(B: coo_matrix,reads_map: dict,cols: list,rows: list) -> tuple:
    '''
    DESCRIPTION
        Repeatedly reorders the ids in cols and rows, relabelling the stored indices of B
        in place to match (move_col / move_row), and then concatenates the reads in the
        final row order, dropping from the end of each read the number of characters
        given by the first sub-diagonal of B
    INPUT
        B         | COO sparse matrix; its row and col arrays are modified in place
                  | NOTE(review): presumably B[r, c] is the overlap length between a suffix
                  | of read c and a prefix of read r, as built by the callers from
                  | create_bipartite_adjacency_matrix and transposed - confirm
        reads_map | mapping from id to read
        cols      | list of ids, one per column position; every integer in
                  | range(len(cols)) is looked up in it, so it must contain all of them
        rows      | list of ids, one per row position, with the same requirement
    OUTPUT
        A tuple (seq, rows, cols): the assembled sequence and the final row and column orders
    '''
    # index of the second-largest element of l
    argpen = lambda l: np.argpartition(l,-2)[-2]

    # Initial ordering: the id at the position of the smallest row sum goes first, the id
    # at the position of the smallest column sum goes last, all others keep their order.
    new_cols = [rows[B.sum(axis=1).argmin()]] + list(c for c in cols if c not in [rows[B.sum(axis=1).argmin()],cols[B.sum(axis=0).argmin()]]) + [cols[B.sum(axis=0).argmin()]]
    # old position -> new position of every id, then relabel the stored column indices
    cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
    move_col(B,cols_map)
    cols = new_cols

    # the rows adopt exactly the same ordering as the columns
    new_rows = cols[:]
    rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
    move_row(B,rows_map)
    rows = new_rows

    # i, j and k are positions into rows / cols: j starts at the last column and k at the
    # row that has the smallest sum after the reordering above
    i,j,k = len(rows), len(cols) - 1, B.sum(axis=1).argmin()

    # every pass ends by decrementing j and i; the loop stops once j is no longer above k
    while j > k:
        # case 1: the row that holds id cols[j] has its largest entry in column k
        if B.getrow(rows.index(cols[j])).argmax() == k: 
            cols_,c_ = [], 0

            # Scan rows j+1, j+2, ... for one with more than one stored entry whose
            # second-largest entry lies in a column before j and is that column's maximum.
            # NOTE(review): no upper bound on j+c_ is checked in this loop - confirm it
            # always terminates for the inputs used.
            while True:
                c_ += 1
                if len(B.getrow(j+c_).nonzero()[1]) > 1:
                    # column positions of the largest and second-largest entries of the row
                    cols_ = np.argpartition(B.getrow(j+c_).toarray().flatten(),-2)[::-1][:2]
                    if cols[cols_[1]] in cols[:j] and B.getcol(cols_[1]).argmax() == j+c_: break

            # move columns j..j+c_-1 directly after column k and place column min(cols_)
            # just before the untouched tail cols[j+c_:]
            new_cols = cols[:k+1] + cols[j:j+c_] + list(c for c in cols[k+1:j] if c not in [cols[min(cols_)]]) + [cols[min(cols_)]] + cols[j+c_:]
            cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
            move_col(B,cols_map)
            cols = new_cols

            # rows copy the head and tail of the new column order; the remaining ids keep
            # their previous relative order in between
            new_rows = cols[:k+c_+1] + list(r for r in rows[k:j+c_] if r not in cols[:k+c_+1] + cols[j+c_:]) + cols[j+c_:]
            rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
            move_row(B,rows_map)
            rows = new_rows

            i,j,k = j + c_ + 1, j + c_, k + c_
        # case 2: the largest entry of that row lies in a column after position j
        elif B.getrow(rows.index(cols[j])).argmax() > j:
            # column positions of the largest and second-largest entries of that row
            cmax = B.getrow(rows.index(cols[j])).argmax()
            cpen = argpen(B.getrow(rows.index(cols[j])).toarray().flatten()) 

            # choose the column that is placed before cmax: the row's own second-largest
            # column, or the second-largest column of row cmax
            if len(B.getrow(rows.index(cols[j])).nonzero()[1]) > 1 and \
                (len(B.getrow(cmax+1).nonzero()[1]) == 1 or \
                B.getrow(cmax+1).getcol(argpen(B.getrow(cmax+1).toarray().flatten())).data[0] <=  B.getrow(rows.index(cols[j])).getcol(cpen).data[0]):
                crange = [cpen,cmax]
            else: crange = [argpen(B.getrow(cmax).toarray().flatten()),cmax]
            # extend the chain to the left, following from row crange[0] its second-largest
            # entry (or its largest entry when the row has at most one stored entry), until
            # the chain starts at a position that is not greater than j
            while crange[0] > j:
                if len(B.getrow(crange[0]).nonzero()[1]) > 1:
                    crange = [argpen(B.getrow(crange[0]).toarray().flatten())] + crange
                else:
                    crange = [B.getrow(crange[0]).argmax()] + crange

            # insert the chained columns, in chain order, at position j
            new_cols = list(c for c in cols[:j] if c not in list(cols[cr] for cr in crange)) + list(cols[cr] for cr in crange) + list(c for c in cols[j:] if c not in list(cols[cr] for cr in crange))
            cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
            move_col(B,cols_map)
            cols = new_cols

            # rows follow the new column order from position j onwards
            new_rows = list(r for r in rows[:i] if r not in cols[j:]) + cols[j:]
            rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
            move_row(B,rows_map)
            rows = new_rows
        # case 3: the largest entry lies at or before position j, but not in column k
        else:
            # move the column holding that largest entry to sit right before cols[j:]
            new_cols = list(c for c in cols[:j] if c not in [cols[B.getrow(rows.index(cols[j])).argmax()]]) + [cols[B.getrow(rows.index(cols[j])).argmax()]] + cols[j:]
            cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
            move_col(B,cols_map)
            cols = new_cols

            # move id cols[j] to the end of the first i row ids
            new_rows = list(r for r in rows[:i] if r not in [cols[j]]) + [cols[j]] + rows[i:]
            rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
            move_row(B,rows_map)
            rows = new_rows
        j -= 1
        i -= 1

    # Assemble the sequence: every read but the last contributes everything except its
    # last d characters, d being the entry of B directly below the diagonal.
    # NOTE(review): when d is 0, s[:-d] is s[:0], i.e. the empty string - confirm that a
    # zero on the sub-diagonal cannot occur here.
    seq = ''
    for s,d in zip(list(reads_map[k] for k in rows)[:-1],B.diagonal(-1)):
        seq += s[:-d]
    seq += list(reads_map[k] for k in rows)[-1]
    return seq, rows, cols

In [None]:
# Reconstruct a random 10,000 letter genome from shuffled reads and compare the result
# with the original sequence.
seed = 1
seq = generate_genome_sequence(10000,seed=seed)
# with shuffle left at its default generate_reads returns (reads, order); keep the reads
reads = generate_reads(seq,250,500,50,100,seed=seed)[0]
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
B = create_bipartite_adjacency_matrix(reads)
# An explicit shape keeps the matrix square (one row and one column per read) however
# many of the highest read indices have no overlap recorded.
B = coo_matrix((list(B.values()),list(zip(*B.keys()))),shape=(len(cols),len(rows))).T
# find_lower_diagonal_path returns (sequence, rows, cols); compare the sequence only
seq_ = find_lower_diagonal_path(B,reads_map,cols,rows)[0]
seq_ == seq

In [None]:
# Toy example: reassemble a sentence from hand-written overlapping reads.
seed = 0
seq = 'betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better'
reads = ['betty_bought_butter_th',
                        'tter_the_butter_was_',
                              'he_butter_was_bitter_',
                                         'as_bitter_betty_bought',
                                                     'tty_bought_better_butter_t',
                                                           'ught_better_butter_to',
                                                                     'r_butter_to_make_the_',
                                                                                   'ke_the_bitter_butter_better']
random.seed(seed)
random.shuffle(reads)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
B = create_bipartite_adjacency_matrix(reads)
# An explicit shape keeps the matrix square (one row and one column per read) however
# many of the highest read indices have no overlap recorded.
B = coo_matrix((list(B.values()),list(zip(*B.keys()))),shape=(len(cols),len(rows))).T
find_lower_diagonal_path(B,reads_map,cols,rows)

In [None]:
# Toy example: reassemble a sentence from hand-written overlapping reads.
# `seed` is taken from an earlier cell.
seq = 'you say hello world, i bellow go to hell'
reads = ['you say hel',
            ' say hello wo',
                    'lo world, i be',
                          'ld, i bellow go t',
                                    'ow go to hell']
random.seed(seed)
random.shuffle(reads)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
B = create_bipartite_adjacency_matrix(reads)
# An explicit shape keeps the matrix square (one row and one column per read) however
# many of the highest read indices have no overlap recorded.
B = coo_matrix((list(B.values()),list(zip(*B.keys()))),shape=(len(cols),len(rows))).T
find_lower_diagonal_path(B,reads_map,cols,rows)

In [None]:
# Toy example: reassemble a sentence from hand-written overlapping reads.
# `seed` is taken from an earlier cell.
seq = 'she_sells_sea_shells_on_the_sea_shore'
reads = ['she_sells_s',
               'lls_sea_shel',
                    'ea_shells_o',
                       'shells_on_the_s',
                                  'he_sea_s',
                                      'ea_shore']
random.seed(seed)
random.shuffle(reads)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
B = create_bipartite_adjacency_matrix(reads)
# An explicit shape keeps the matrix square (one row and one column per read) however
# many of the highest read indices have no overlap recorded.
B = coo_matrix((list(B.values()),list(zip(*B.keys()))),shape=(len(cols),len(rows))).T
find_lower_diagonal_path(B,reads_map,cols,rows)

In [None]:
# Reconstruct n random genomes, one per seed, and report the share that is rebuilt
# exactly. The loop stops at the first failure so that it can be inspected; the
# accuracy is still reported over all n seeds.
successes = 0
n = 50
for seed in range(n):  
    seq = generate_genome_sequence(10000,seed=seed)
    # with shuffle left at its default generate_reads returns (reads, order); keep the reads
    reads = generate_reads(seq,250,500,50,100,seed=seed)[0]
    reads_map = dict(zip(list(range(len(reads))),reads))
    rows = list(range(len(reads)))
    cols = list(range(len(reads)))
    B = create_bipartite_adjacency_matrix(reads)
    # An explicit shape keeps the matrix square (one row and one column per read)
    # however many of the highest read indices have no overlap recorded.
    B = coo_matrix((list(B.values()),list(zip(*B.keys()))),shape=(len(cols),len(rows))).T
    # find_lower_diagonal_path returns (sequence, rows, cols); only the sequence is needed
    seq_ = find_lower_diagonal_path(B,reads_map,cols,rows)[0]
    s = '| Seed: ' + str(seed) + ' | '
    if seq_ == seq:
        s+='SUC | ' + seq_ + ' == ' + seq
        successes+=1
    else: 
        s+='FAI | ' + seq_ + ' != ' + seq
        print(s)
        break
    print(s)
    print('-----------------------------------------')
print('ACCURACY: '+str((successes/n)*100)+'%')

    SUC: returns the target sequence fully reconstructed
    PAR: returns contigs all of which exist in the target sequence (consider coverage?)
    FAI: returns a full sequence that is incorrectly reconstructed or a set of contigs where at least one is not found in the target sequence

In [4]:
from Bio import SeqIO #! pip install Bio
# import pandas as pd #! pip install pandas

##### Seed = 0

In [5]:
# Cut the sequence stored in the FASTA file into reads, load the adjacency matrix cached
# for this seed (or build and store it), then reconstruct the sequence.
seed = 0
# if the file holds several records, the sequence of the last one is used
for record in SeqIO.parse("data/input/Raphanus sativus_NC_018551.1.fasta",'fasta'): seq = record.seq
reads,reads_order = generate_reads(seq,250,250,50,50,seed=seed,min_coverage=None)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
try:
    print("Retrieving stored sparse bipartite adjacency matrix...")
    B = load_npz('data/input/matrices/seed_'+str(seed)+'.npz')
# only a missing/unreadable file or a file that does not hold a sparse matrix triggers a
# rebuild; anything else (including a keyboard interrupt) is allowed to propagate
except (OSError, ValueError):
    print("No stored sparse bipartite adjacency matrix found.")
    print("Building sparse bipartite adjacency matrix...")
    B = create_bipartite_adjacency_matrix(reads)
    # An explicit shape keeps the matrix square (one row and one column per read)
    # however many of the highest read indices have no overlap recorded.
    B = coo_matrix((list(B.values()),list(zip(*B.keys()))),shape=(len(cols),len(rows))).T
    save_npz('data/input/matrices/seed_'+str(seed)+'.npz', B)
print("Commencing sequence construction...")
seq_, rows_, cols_ = find_lower_diagonal_path(B,reads_map,cols,rows)

Retrieving stored sparse bipartite adjacency matrix...
Commencing sequence construction...


In [11]:
# Print, position by position, the index each original read received when the reads were
# shuffled and whether the reconstructed row order holds the same index there.
for r1,r2 in zip(reads_order,rows_):
    print(str(r1).ljust(4) + ": " + str(r1==r2))

1269: True
1131: True
1052: True
1267: True
155 : True
1149: True
66  : True
899 : True
980 : True
1250: True
38  : True
95  : True
282 : True
486 : True
1056: True
820 : True
94  : True
1206: True
522 : True
366 : True
1094: True
981 : True
752 : True
765 : True
1168: True
729 : True
746 : True
686 : True
471 : True
114 : True
1252: True
1287: True
921 : True
1222: True
916 : True
1019: True
81  : True
753 : True
144 : True
383 : True
264 : True
1044: True
189 : True
1184: True
17  : True
704 : True
1153: True
1189: True
363 : True
1105: True
93  : True
680 : True
589 : True
76  : True
208 : True
1225: True
472 : True
548 : True
103 : True
581 : True
46  : True
47  : True
374 : True
312 : True
799 : True
1097: True
1074: True
1174: True
723 : True
2   : True
475 : True
482 : True
294 : True
636 : True
491 : True
876 : True
1115: True
1085: True
435 : True
1141: True
265 : True
903 : True
1060: True
574 : True
237 : True
140 : True
328 : True
1059: True
1050: True
614 : True
458 : True

In [22]:
# Damerau-Levenshtein distance between the original and the reconstructed sequence, both
# converted to plain strings first
damerau_levenshtein_distance(str(seq),str(seq_))

: 

##### Seed = 1

In [None]:
# Cut the sequence stored in the FASTA file into reads, reuse the adjacency matrix cached
# in memory for this seed (or build and cache it), then reconstruct the sequence.
seed = 1
# if the file holds several records, the sequence of the last one is used
for record in SeqIO.parse("data/input/Raphanus sativus_NC_018551.1.fasta",'fasta'): seq = record.seq
# with shuffle left at its default generate_reads returns (reads, order); keep the reads
reads = generate_reads(seq,250,250,50,50,seed=seed,min_coverage=None)[0]
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
# B_ caches one matrix per seed across cells; create it if no earlier cell has done so
try:
    B_
except NameError:
    B_ = {}
if seed in B_:
    print("Retrieving sparse bipartite adjacency matrix...")
    B = B_[seed].copy() 
else:
    print("Building sparse bipartite adjacency matrix...")
    B = create_bipartite_adjacency_matrix(reads)
    # An explicit shape keeps the matrix square (one row and one column per read)
    # however many of the highest read indices have no overlap recorded.
    B = coo_matrix((list(B.values()),list(zip(*B.keys()))),shape=(len(cols),len(rows))).T
    B_[seed] = B.copy()
print("Commencing sequence construction...")
# find_lower_diagonal_path returns (sequence, rows, cols); only the sequence is compared
seq_ = find_lower_diagonal_path(B,reads_map,cols,rows)[0]
if seq_ == seq: print("Sequence reconstruction success!")
else: print("Sequence reconstruction failure.")

##### Seed = 2

In [None]:
# Cut the sequence stored in the FASTA file into reads, reuse the adjacency matrix cached
# in memory for this seed (or build and cache it), then reconstruct the sequence.
seed = 2
# if the file holds several records, the sequence of the last one is used
for record in SeqIO.parse("data/input/Raphanus sativus_NC_018551.1.fasta",'fasta'): seq = record.seq
# with shuffle left at its default generate_reads returns (reads, order); keep the reads
reads = generate_reads(seq,250,250,50,50,seed=seed,min_coverage=None)[0]
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
# B_ caches one matrix per seed across cells; create it if no earlier cell has done so
try:
    B_
except NameError:
    B_ = {}
if seed in B_:
    print("Retrieving sparse bipartite adjacency matrix...")
    B = B_[seed].copy() 
else:
    print("Building sparse bipartite adjacency matrix...")
    B = create_bipartite_adjacency_matrix(reads)
    # An explicit shape keeps the matrix square (one row and one column per read)
    # however many of the highest read indices have no overlap recorded.
    B = coo_matrix((list(B.values()),list(zip(*B.keys()))),shape=(len(cols),len(rows))).T
    B_[seed] = B.copy()
print("Commencing sequence construction...")
# find_lower_diagonal_path returns (sequence, rows, cols); only the sequence is compared
seq_ = find_lower_diagonal_path(B,reads_map,cols,rows)[0]
if seq_ == seq: print("Sequence reconstruction success!")
else: print("Sequence reconstruction failure.")

In [None]:
import networkx as nx

In [None]:
# empty undirected networkx graph; nothing is added to it in this part of the notebook
G = nx.Graph()