In [1]:
from scipy.sparse import csr_matrix, coo_matrix, vstack, hstack, save_npz, load_npz #! pip install scipy
from jellyfish import damerau_levenshtein_distance #! pip install jellyfish
import numpy as np #! pip install numpy
from itertools import product
from math import ceil, fmod
import random

In [2]:
def generate_reads(sequence,min_subseq_len,max_subseq_len,min_overlap,max_overlap,min_coverage=None,circularise=False,seed=None,shuffle=True):
    '''
    DESCRIPTION 
        Utility function that chops a sequence into several reads with bounded random lengths that 
        have a bounded random overlap
    INPUT
        sequence       | a sequence of characters that will be divided into overlapping subsequences
        min_subseq_len | the shortest length a subsequence can have
        max_subseq_len | the longest length a subsequence can have
        min_overlap    | the shortest overlap two subsequences can share
        max_overlap    | the longest overlap two subsequences can share
        min_coverage   | if given, the sequence is tiled again (accumulating reads) until the total
                       | length of the distinct reads divided by the sequence length reaches this value
        circularise    | boolean indicating whether to add a random amount of the end of the sequence
                       | to the beginning and vice versa
        seed           | random seed for the random function for reproducibility
        shuffle        | boolean indicating whether the reads are shuffled before being returned
    OUTPUT
        A list of overlapping reads of random bounded size which share a bounded random amount of
        overlap
    NOTES
        The global `random` generator is re-seeded with `seed` on every call.
        When min_coverage is set and the length/overlap bounds leave no randomness (min == max for
        both), every pass produces the same reads, so an unreachable min_coverage never terminates.
    '''
    import random

    random.seed(seed)
    if circularise:
        # Both lengths are drawn in the original order (prefix first) so seeded output is unchanged.
        head_len = random.randint(min_overlap,max_overlap)
        tail_len = random.randint(min_overlap,max_overlap)
        # sequence[-0:] is the WHOLE sequence, so a zero-length draw must yield an empty prefix.
        head = sequence[-head_len:] if head_len else sequence[:0]
        sequence = head + sequence + sequence[:tail_len]
    reads = []
    while 1:
        # One pass tiles the sequence from left to right.
        start = 0
        end = random.randint(min_subseq_len,max_subseq_len)
        reads += [sequence[start:end]]
        while end < len(sequence):
            start = random.randint(end-max_overlap,end-min_overlap)
            remaining = len(sequence) - start
            if remaining/max_subseq_len < 1:
                # Less than one maximal read is left: the final read runs to the end.
                end = len(sequence)
            elif remaining/max_subseq_len < 2:
                # Fewer than two maximal reads are left: raise the lower bound so that the
                # read after this one is not forced below min_subseq_len.
                a = 0
                while remaining/(min_subseq_len+a) > 2: a+=1
                end = random.randint(start+min_subseq_len+a,start+max_subseq_len)
            else: end = random.randint(start+min_subseq_len,start+max_subseq_len)
            reads += [sequence[start:end]]
        if min_coverage is None: break
        # Coverage only counts distinct reads; build the set once per pass.
        unique_reads = set(reads)
        if len(unique_reads)*(sum(len(read) for read in unique_reads)/len(unique_reads))/len(sequence) >= min_coverage: break
    if shuffle: random.shuffle(reads)
    return reads
        # if min_coverage is None or len(set(reads))*(sum(len(read) for read in set(reads))/len(set(reads)))/len(sequence) >= min_coverage: return list(set(reads))

def generate_genome_sequence(n,palindrome=False,seed=None):
    '''
    DESCRIPTION 
        Utility function that creates a random sequence containing only the letters A, T, G, and C
    INPUT
        n          | the length of the sequence (a non-negative integer)
        palindrome | a boolean indicating whether the sequence must be a palindrome or not
        seed       | random seed for the random function for reproducibility
    OUTPUT
        A random sequence of length n
    NOTES
        The global `random` generator is re-seeded with `seed` on every call.
    '''
    import random

    random.seed(seed)
    nucleotides = {1:'A',2:'C',3:'G',4:'T'}
    # A palindrome only needs its first half (rounded up) drawn at random.
    drawn = ceil(n/2) if palindrome else n
    seq = ''.join(nucleotides[random.randint(1,4)] for _ in range(drawn))
    if palindrome:
        # Mirror exactly as many characters as are still missing. For odd n the middle
        # character is not repeated, so the result always has length n.
        seq += seq[:n-drawn][::-1]
    return seq

# Sequitur

In [6]:
def move_col(B: coo_matrix, cols: dict) -> None:
    """Relabel, in place, every stored column index of B through the mapping `cols` (old index -> new index)."""
    for position, old_index in enumerate(B.col):
        B.col[position] = cols[old_index]
            
def move_row(B: coo_matrix,rows: dict) -> None:
    """Relabel, in place, every stored row index of B through the mapping `rows` (old index -> new index)."""
    for position, old_index in enumerate(B.row):
        B.row[position] = rows[old_index]

def normalised_damerau_levenshtein_distance(read: str,overlap: str) -> float:
    """
    Damerau-Levenshtein distance between the leading characters of `read` and `overlap`,
    divided by the number of characters compared.

    Both inputs are cut to the length of the shorter one before comparison. Raises
    ZeroDivisionError when either input is empty.
    """
    window = min(len(overlap),len(read))
    read_prefix = read.__str__()[:window]
    overlap_prefix = overlap.__str__()[:window]
    return damerau_levenshtein_distance(read_prefix,overlap_prefix)/window

def build_suffix_array(reads: list, min_suf_len: int = 3) -> tuple:
    """
    Build a sorted array of all read suffixes of at least `min_suf_len` characters.

    INPUT
        reads       | list of reads (strings, or string-like objects supporting slicing,
                    | concatenation with str and ordering)
        min_suf_len | the shortest suffix (in characters of the read itself) that is indexed
    OUTPUT
        (suf_arr, suf_arr_ind) where suf_arr[k] is a suffix followed by the sentinel '$' and
        suf_arr_ind[k] is the position in `reads` of the read it was taken from. Entries are
        ordered by the text "<suffix>$<read index>".
    """
    shortest = max(min_suf_len, 0)
    entries = []
    # enumerate gives every read its own index, including duplicates, without the
    # quadratic reads.index() lookup.
    for read_index, read in enumerate(reads):
        tagged = read + '$' + str(read_index)
        # The length bound is applied to the read itself, so it no longer depends on how
        # many digits the appended index has.
        for start in range(len(read) - shortest + 1):
            entries.append((tagged[start:], read[start:] + '$', read_index))
    # Sort on the tagged text only; the sentinel and index ride along, so nothing has to
    # be parsed back out of the string afterwards (safe even if a read contains '$').
    entries.sort(key=lambda entry: entry[0])
    suf_arr = [suffix for _, suffix, _ in entries]
    suf_arr_ind = [read_index for _, _, read_index in entries]
    return suf_arr,suf_arr_ind

def create_bipartite_adjacency_matrix(reads: list, suf_arr: list = None, suf_arr_ind: list = None, max_diff: float = 0.25, min_suf_len: int = 3) -> dict:
    """
    Find suffix/prefix overlaps between reads.

    INPUT
        reads       | list of reads
        suf_arr     | optional pre-built suffix array (see build_suffix_array)
        suf_arr_ind | optional read indices matching suf_arr
        max_diff    | an overlap is recorded only if its normalised distance to the read is below this
        min_suf_len | shortest suffix indexed; also bounds how many leading offsets of each read
                    | are used as starting points for the search
    OUTPUT
        A dict mapping (index of the read supplying the suffix, index of the read it is a
        prefix of) to the length of the longest such overlap found.
    """
    # Forward min_suf_len so the array is built with the same bound the search below uses.
    if suf_arr is None or suf_arr_ind is None: suf_arr,suf_arr_ind = build_suffix_array(reads,min_suf_len)
    reads_map = dict(zip(reads,list(range(len(reads)))))
    # First occurrence of every suffix, replacing a linear suf_arr.index() scan per lookup.
    # Keys are normalised with str() so string-like read objects hash consistently.
    first_position = {}
    for position, suffix in enumerate(suf_arr):
        first_position.setdefault(str(suffix), position)
    B = {}
    for read in reads:
        target = reads_map[read]
        for j in range(min_suf_len + 1):
            anchor = first_position.get(str(read[j:]) + '$')
            # A read too short to own this suffix has nothing to anchor the search on.
            if anchor is None: continue
            i = anchor - 1
            # Walk towards the start of the array while entries still resemble the read.
            # The explicit lower bound stops i from going negative and wrapping round to
            # the end of the list.
            while i >= 0:
                candidate = suf_arr[i][:-1]
                distance = normalised_damerau_levenshtein_distance(read,candidate)
                if distance > 0.5: break
                source_read = reads[suf_arr_ind[i]]
                if not source_read == read and distance < max_diff and read.startswith(candidate):
                    key = (reads_map[source_read],target)
                    B[key] = max(len(candidate),B.get(key,0))
                i -= 1
    return B

def find_lower_diagonal_path(B: coo_matrix,reads_map: dict,cols: list,rows: list) -> tuple:
    """
    Permute the rows and columns of the overlap matrix B and read an assembled sequence
    off the resulting row order.

    B is modified in place: every permutation is applied by rewriting B.row / B.col through
    move_row / move_col. `cols` and `rows` hold the current label order of the columns and
    rows and are rebound (not mutated) as the permutation proceeds.

    INPUT
        B         | sparse overlap matrix in COO format (B.row / B.col are rewritten)
        reads_map | mapping from row label to the read itself
        cols      | list of column labels, in their current order
        rows      | list of row labels, in their current order
    OUTPUT
        On normal completion: the concatenation of the reads in final row order, each read
        except the last truncated by the corresponding entry of B.diagonal(-1).
        If building a permutation map fails in the `else` branch of the main loop, a
        debugging 8-tuple (B, cols, new_cols, rows, new_rows, i, j, None) is returned
        instead after printing 1 or 2.
    NOTE(review): the `-> tuple` annotation only matches the debugging return; the normal
    return value is the assembled sequence.
    """
    # Index of the second-largest element of l.
    argpenultimate = lambda l: np.argpartition(l,-2)[-2]
    # First stored value of a sparse slice, or 0 when nothing is stored.
    getval = lambda x: x.data[0] if len(x.data) else 0

    # Initial ordering: the column with the smallest column sum goes last; if some row sums
    # to zero, that row's label is additionally placed first among the columns.
    if B.sum(axis=1).min() == 0:
        new_cols = [rows[B.sum(axis=1).argmin()]] + list(c for c in cols if c not in [rows[B.sum(axis=1).argmin()],cols[B.sum(axis=0).argmin()]]) + [cols[B.sum(axis=0).argmin()]]
    else: new_cols = list(c for c in cols if c != cols[B.sum(axis=0).argmin()]) + [cols[B.sum(axis=0).argmin()]]
    # Map old position -> new position for every label 0..len(cols)-1, then apply it to B.
    cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
    move_col(B,cols_map)
    cols = new_cols
    # The row with the smallest row sum is moved to the front.
    new_rows = [rows[B.sum(axis=1).argmin()]] + list(r for r in rows if r != rows[B.sum(axis=1).argmin()])
    rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
    move_row(B,rows_map)
    rows = new_rows
    # Work from the last column towards the first; i and j are decremented together.
    i,j = len(rows), len(cols) - 1
    while j > 0:
        # Branch 1: the best column for the row labelled cols[j] already lies at position >= j.
        if cols[B.getrow(rows.index(cols[j])).argmax()] in cols[j:]:
            # Grow sets of candidate rows/columns (top two entries per column and per row)
            # until one of the candidate columns lies before position j.
            ts_rows = {rows.index(cols[j])}
            ts_rows_ = set()
            ts_cols = {B.getrow(rows.index(cols[j])).argmax()}
            ts_cols_ = set()
            while not any(cols[c] in cols[:j] for c in ts_cols):
                for c_ in ts_cols.difference(ts_cols_):
                    ts_rows.update(np.argpartition(B.getcol(c_).toarray().flatten(),-2)[::-1][:min(2,len(B.getcol(c_).nonzero()[0]))])
                ts_cols_.update(ts_cols)
                for r_ in ts_rows.difference(ts_rows_):
                    ts_cols.update(np.argpartition(B.getrow(r_).toarray().flatten(),-2)[::-1][:min(2,len(B.getrow(r_).nonzero()[1]))])
                ts_rows_.update(ts_rows)
            ts_cols = list(ts_cols)
            ts_cols.sort(reverse=True)
            for c in ts_cols:
                # r0: row of the largest entry in column c; c0: second-largest column on r0;
                # r1: row of the largest entry in column c0; c1: second-largest column on r1.
                r0 = B.getcol(c).argmax()
                c0 = argpenultimate(B.getrow(r0).toarray().flatten())
                r1 = B.getcol(c0).argmax()
                c1 = argpenultimate(B.getrow(r1).toarray().flatten())
                if getval(B.getrow(r0).getcol(c0)) >= getval(B.getrow(r1).getcol(c1)):
                    # The runner-up entry on row r0 (at column c0) is at least as large as the
                    # runner-up entry on row r1 (at column c1), so the column order is rebuilt
                    # with the columns at positions c0 and c1 exchanged.
                    # NOTE(review): the filter below compares column LABELS against the
                    # POSITIONS c0/c1 — confirm this is intended.
                    
                    new_cols = list(c for c in cols[:max(j,min(c0,c1))] if c not in [c0,c1]) + [cols[max(c0,c1)]] + cols[max(j,min(c0,c1))+1:max(c0,c1)] + [cols[min(c0,c1)]] + cols[max(c0,c1)+1:]
                    cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
                    move_col(B,cols_map)
                    cols = new_cols

                if c0 < j:
                    # if the column is prior to j, we need to swap the rows
                    
                    new_rows = [cols[0]] + list(r for r in rows[:i] if r not in [cols[0],cols[c0]]) + [cols[c0]] + rows[i:]
                    rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
                    move_row(B,rows_map)
                    rows = new_rows
                else:
                    # Unhandled case: only reported, no permutation is applied.
                    print("Use second best value for c1")

            # # print(2)
            # # return B, cols, new_cols, rows, new_rows, i, j, None
            # # Extract the coordinates
            # ts_rows = {rows.index(cols[j])}
            # ts_cols = {B.getrow(rows.index(cols[j])).argmax()}
            # while not any(c in cols[:j] for c in ts_cols):

            # ts = list((r, c) for r, c in product(
            #                                             np.argpartition(B.getcol(B.getrow(rows.index(cols[j])).argmax()).toarray().flatten(), -2)[::-1][:min(2,len(B.getcol(B.getrow(rows.index(cols[j])).argmax()).nonzero()[0]))],
            #                                             np.argpartition(B.getrow(rows.index(cols[j])).toarray().flatten(), -2)[::-1][:min(2,len(B.getrow(rows.index(cols[j])).nonzero()[1]))]
            #                                             ))
            # if len(ts) == 2:
            #     # TODO: need to find a column index in the front of the matrix
            #     # this code runs the risk of putting a 0 on the diagonal
            #     # if that happens, i should put the second highest value of
            #     # the row into the ts and use that instead
            #     return B, cols, new_cols, rows, new_rows, i, j, None
            #     ts_ = []
            #     for r,_ in ts:
            #         if r == rows.index(cols[j]): continue
            #         for c in B.getrow(r).nonzero()[1]:
            #             if c == ts[0][1]: continue
            #             for r_,_ in ts:
            #                 ts_ += [(r_,c)]
            #     ts = [ts[0]] + [ts_[0]] + [ts[1]] + [ts_[1]]
            # # if len(B.getrow(rows.index(cols[j])).nonzero()[1]) > 1:
            # try:
            #     if getval(B.getrow(ts[1][0]).getcol(ts[1][1])) >= getval(B.getrow(ts[3][0]).getcol(ts[3][1])) :
            #         # because the second biggest value on the row with the biggest value is
            #         # larger than the second biggest value on the row of the column with the 
            #         # second biggest value, we swap the row indices
            #         new_cols = list(c for c in cols[:j] if c not in [cols[ts[3][1]]]) + [cols[ts[3][1]]] + cols[j:]
            #         cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
            #         move_col(B,cols_map)
            #         cols = new_cols

            #         new_rows = [cols[0]] + list(r for r in rows[:i] if r not in [cols[0],cols[ts[3][1]]]) + [cols[ts[0][1]]] + rows[i:ts[0][1]] + [cols[ts[3][1]]] + rows[ts[0][1]+1:]
            #         rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
            #         move_row(B,rows_map)
            #         rows = new_rows
            #     else:
            #         # because the second biggest value on the row with the biggest value is
            #         # smaller than the second biggest value on the row of the column with the
            #         # second biggest value, we simply treat the second largest value as the max
            #         new_cols = list(c for c in cols if c not in [cols[ts[1][0]]] + cols[j:]) + [cols[ts[1][0]]] + cols[j:]
            #         cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
            #         move_col(B,cols_map)
            #         cols = new_cols
            #         # no need for row swap
            #         new_rows = [cols[0]] + list(r for r in rows if r not in [cols[0],cols[j]] + rows[i:]) + [cols[j]] + rows[i:]
            #         rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
            #         move_row(B,rows_map)
            #         rows = new_rows
            # except:
            #     print(0)
            #     return B, cols, new_cols, rows, new_rows, i, j, ts
        else:
            # Branch 2: the best column for the row labelled cols[j] lies before position j.
            # That column label is re-inserted after the first j column labels (minus itself).
            new_cols = list(c for c in cols[:j] if c not in [cols[B.getrow(rows.index(cols[j])).argmax()]]) + [cols[B.getrow(rows.index(cols[j])).argmax()]] + list(c for c in cols[j:] if c not in [cols[B.getrow(rows.index(cols[j])).argmax()]])
            try:
                cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
            except:
                # Debugging exit: hand the current state back to the caller for inspection.
                print(1)
                return B, cols, new_cols, rows, new_rows, i, j, None
            move_col(B,cols_map)
            cols = new_cols

            # Rebuild the row order: cols[0] first, cols[j] inserted after the first i rows.
            new_rows = [cols[0]] + list(r for r in rows[:i] if r not in [cols[0],cols[j]]) + [cols[j]] + list(r for r in rows[i:] if r not in [cols[0],cols[j]])
            try:
                rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
            except:
                # Debugging exit: hand the current state back to the caller for inspection.
                print(2)
                return B, cols, new_cols, rows, new_rows, i, j, None
            move_row(B,rows_map)
            rows = new_rows
        j -= 1
        i -= 1

    # Assemble: each read, in final row order, is cut by the matching entry of the first
    # sub-diagonal of B; the last read is appended in full.
    # NOTE(review): when d is 0, s[:-d] is the empty slice, so that read contributes
    # nothing — confirm a zero on the sub-diagonal cannot occur here.
    seq = ''
    for s,d in zip(list(reads_map[k] for k in rows)[:-1],B.diagonal(-1)):
        seq += s[:-d]
    seq += list(reads_map[k] for k in rows)[-1]
    return seq

In [315]:
seed = 1
seq = generate_genome_sequence(10000,seed=seed)
reads = generate_reads(seq,250,500,50,100,seed=seed)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
edges = create_bipartite_adjacency_matrix(reads)
# Build the matrix with its full len(reads) x len(reads) shape up front. Letting the shape
# be inferred and padding one row/column afterwards leaves it too small when more than one
# trailing index has no edge, and fails outright when no edges were found.
B = coo_matrix((list(edges.values()),list(zip(*edges.keys())) or [(),()]),shape=(len(reads),len(reads))).T
# seq_ = find_lower_diagonal_path(B,reads_map,cols,rows)

In [97]:
seed = 0
seq = 'betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better'
# The reads are indented so that each one sits under the part of seq it covers.
reads = ['betty_bought_butter_th',
                        'tter_the_butter_was_',
                              'he_butter_was_bitter_',
                                         'as_bitter_betty_bought',
                                                     'tty_bought_better_butter_t',
                                                           'ught_better_butter_to',
                                                                     'r_butter_to_make_the_',
                                                                                   'ke_the_bitter_butter_better']
random.seed(seed)
random.shuffle(reads)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
edges = create_bipartite_adjacency_matrix(reads)
# Build the matrix with its full len(reads) x len(reads) shape up front. Letting the shape
# be inferred and padding one row/column afterwards leaves it too small when more than one
# trailing index has no edge, and fails outright when no edges were found.
B = coo_matrix((list(edges.values()),list(zip(*edges.keys())) or [(),()]),shape=(len(reads),len(reads))).T
find_lower_diagonal_path(B,reads_map,cols,rows)

'betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better'

In [98]:
# Set the seed here so the cell does not depend on a value left over from an earlier cell.
seed = 0
seq = 'you say hello world, i bellow go to hell'
# The reads are indented so that each one sits under the part of seq it covers.
reads = ['you say hel',
            ' say hello wo',
                    'lo world, i be',
                          'ld, i bellow go t',
                                    'ow go to hell']
random.seed(seed)
random.shuffle(reads)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
edges = create_bipartite_adjacency_matrix(reads)
# Build the matrix with its full len(reads) x len(reads) shape up front. Letting the shape
# be inferred and padding one row/column afterwards leaves it too small when more than one
# trailing index has no edge, and fails outright when no edges were found.
B = coo_matrix((list(edges.values()),list(zip(*edges.keys())) or [(),()]),shape=(len(reads),len(reads))).T
find_lower_diagonal_path(B,reads_map,cols,rows)

'you say hello world, i bellow go to hell'

In [99]:
# Set the seed here so the cell does not depend on a value left over from an earlier cell.
seed = 0
seq = 'she_sells_sea_shells_on_the_sea_shore'
# The reads are indented so that each one sits under the part of seq it covers.
reads = ['she_sells_s',
               'lls_sea_shel',
                    'ea_shells_o',
                       'shells_on_the_s',
                                  'he_sea_s',
                                      'ea_shore']
random.seed(seed)
random.shuffle(reads)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
edges = create_bipartite_adjacency_matrix(reads)
# Build the matrix with its full len(reads) x len(reads) shape up front. Letting the shape
# be inferred and padding one row/column afterwards leaves it too small when more than one
# trailing index has no edge, and fails outright when no edges were found.
B = coo_matrix((list(edges.values()),list(zip(*edges.keys())) or [(),()]),shape=(len(reads),len(reads))).T
find_lower_diagonal_path(B,reads_map,cols,rows)

'she_sells_sea_shells_on_the_sea_shore'

In [343]:
successes = 0
attempted = 0
n = 50
for seed in range(n):  
    seq = generate_genome_sequence(10000,seed=seed)
    reads = generate_reads(seq,250,500,50,100,seed=seed)
    reads_map = dict(zip(list(range(len(reads))),reads))
    rows = list(range(len(reads)))
    cols = list(range(len(reads)))
    edges = create_bipartite_adjacency_matrix(reads)
    # Build the matrix with its full len(reads) x len(reads) shape up front. Letting the
    # shape be inferred and padding one row/column afterwards leaves it too small when more
    # than one trailing index has no edge, and fails outright when no edges were found.
    B = coo_matrix((list(edges.values()),list(zip(*edges.keys())) or [(),()]),shape=(len(reads),len(reads))).T
    seq_ = find_lower_diagonal_path(B,reads_map,cols,rows)
    attempted += 1
    s = '| Seed: ' + str(seed) + ' | '
    if seq_ == seq:
        s+='SUC | ' + seq_ + ' == ' + seq
        successes+=1
    else: 
        # str() keeps the report printable when the debugging tuple is returned instead of a sequence.
        s+='FAI | ' + str(seq_) + ' != ' + seq
        print(s)
        break
    print(s)
    print('-----------------------------------------')
# The loop stops at the first failure, so accuracy is measured over the seeds actually
# run rather than over all n.
print('ACCURACY: '+str((successes/max(attempted,1))*100)+'%')

| Seed: 0 | SUC | TTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGACCCCTAAGTAGGAGCGTATGCGCCCAGTAACCAATGCCTGTTGAGATGCCAGACGCGTAACCAAAACATAGAAACCATCAATAGACAGGTCATAATCGGTCCACCGGATCATTGGTGCATAGAGCCTGGGCGTTAACGCCCTTTATTACTAGCTTAATGGTATCACATTGACAAACACGGCATTAAGTAGCGACGAAACGGGATTTGCCTGACCGGGGAGAAGCCGGTCGATCAGCAGTGGTAATTGGATATTAGGCCTAAACCATAATGTTCTAGCGCTCGAAATCATTGCACCACTTGCATCTTTGTTCCAGGGACGCTGTAAAACCAGATGCCTGTAAATCGTTTCAACGGGATGGTTTACCCGGAATTCTACGTATTTAATCAACGAGCTTAATGAGCTGACATTGCTGAAATGACCATGACTTAATAATCATTTATGGAGAAGAGGCACGACCACAAGGACCCTATGGCACGGTGGGCAAGCTCCCGCCCGGTACATAACTGTCTGGACTGATTATGTCGGTACAGACTTCTTCCTGCGTATCGATTACGAGCTTATCTGAAGAAGTTTAGGGCAAAGGGACCATGGCCATTGGTGCCAATTTCGGTTCTTGTATGCTACAGTTAAATAGAAAGGCCGCATTGTCGTTCTCGCCCTGTTTTCCTCATACACGACCGAGGTTATTTGTCGGAAACGAGACATCTCTCGAAGGTGGAACGACGCCGGGTGTGCAGAATTTATTTTAAACACTCTATTACCTCCGGGTAGCGTTGGCAAACTCCGATAATGAGCGCCAGGCGTGCCAGGACTCCACCTCCCCTGCTAAGTTGACCTTGAGCTCGGTACAGCGTCGGCGAGACGATAACAACGAAGTCCTTCGGCGTTATGTAATTCACCAGCCCACCATATCAGGTAATAGGCTCGCTGGTTAGGTAGATT

    SUC: returns the target sequence fully reconstructed
    PAR: returns contigs all of which exist in the target sequence (consider coverage?)
    FAI: returns a full sequence that is incorrectly reconstructed or a set of contigs where at least one is not found in the target sequence

In [4]:
from Bio import SeqIO #! pip install Bio
# import pandas as pd #! pip install pandas

##### Seed = 0

In [7]:
seed = 0
# Only the last record of the FASTA file is kept.
for record in SeqIO.parse("data/input/Raphanus sativus_NC_018551.1.fasta",'fasta'): seq = record.seq
reads = generate_reads(seq,250,250,50,50,seed=seed,min_coverage=None) 
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
try:
    print("Retrieving stored sparse bipartite adjacency matrix...")
    B = load_npz('data/input/matrices/seed_'+str(seed)+'.npz')
# Any failure to load falls back to a rebuild, but KeyboardInterrupt / SystemExit are no
# longer swallowed (a bare except would turn Ctrl-C into an expensive rebuild).
except Exception:
    print("No stored sparse bipartite adjacency matrix found.")
    print("Building sparse bipartite adjacency matrix...")
    edges = create_bipartite_adjacency_matrix(reads)
    # Build the matrix with its full len(reads) x len(reads) shape up front. Letting the
    # shape be inferred and padding one row/column afterwards leaves it too small when more
    # than one trailing index has no edge, and fails outright when no edges were found.
    B = coo_matrix((list(edges.values()),list(zip(*edges.keys())) or [(),()]),shape=(len(reads),len(reads))).T
    save_npz('data/input/matrices/seed_'+str(seed)+'.npz', B)
print("Commencing sequence construction...")
# seq_ = find_lower_diagonal_path(B,reads_map,cols,rows)
# if seq_ == seq: print("Sequence reconstruction success!")
# else: print("Sequence reconstruction failure.")
# find_lower_diagonal_path returns an 8-tuple of debugging state only when it bails out;
# on normal completion it returns the assembled sequence, which cannot be unpacked 8 ways.
result = find_lower_diagonal_path(B,reads_map,cols,rows)
if isinstance(result, tuple): b, cols, new_cols, rows, new_rows, i, j, ts = result
else: seq_ = result

Retrieving stored sparse bipartite adjacency matrix...
Commencing sequence construction...


In [77]:
argpenultimate = lambda l: np.argpartition(l,-2)[-2]
getval = lambda x: x.data[0] if len(x.data) else 0

In [72]:
ts_rows = {rows.index(cols[j])}
ts_rows_ = set()
ts_cols = {B.getrow(rows.index(cols[j])).argmax()}
ts_cols_ = set()
while not any(cols[c] in cols[:j] for c in ts_cols):
    for c_ in ts_cols.difference(ts_cols_):
        ts_rows.update(np.argpartition(B.getcol(c_).toarray().flatten(),-2)[::-1][:min(2,len(B.getcol(c_).nonzero()[0]))])
    ts_cols_.update(ts_cols)
    for r_ in ts_rows.difference(ts_rows_):
        ts_cols.update(np.argpartition(B.getrow(r_).toarray().flatten(),-2)[::-1][:min(2,len(B.getrow(r_).nonzero()[1]))])
    ts_rows_.update(ts_rows)
for c in ts_cols:
    r0 = B.getcol(c).argmax()
    c0 = argpenultimate(B.getrow(r0).toarray().flatten())
    r1 = B.getcol(c0).argmax()
    c1 = argpenultimate(B.getrow(r1).toarray().flatten())
    if getval(B.getrow(r0).getcol(c0)) >= getval(B.getrow(r1).getcol(c1)):
        print("Use second best value for c0")
    else:
        print("Use second best value for c1")

True
True
True


In [69]:
argpenultimate([56,89,78,21,3,569])

1

In [65]:
B.getrow(550).getcol(549)

<1x1 sparse matrix of type '<class 'numpy.intc'>'
	with 1 stored elements in Compressed Sparse Row format>

In [62]:
ts_rows,ts_cols

({16, 549, 550}, {141, 548, 549})

In [10]:
b.getrow(rows.index(cols[j])).nonzero()

(array([0]), array([549]))

In [11]:
b.getcol(549).nonzero()

(array([ 16, 550]), array([0, 0]))

In [30]:
b.getrow(549).getcol(141).data

array([50], dtype=int32)

In [31]:
b.getcol(549).getrow(696).data

array([], dtype=int32)

In [32]:
cols[141] in cols[:j]

True

In [13]:
b.getcol(549).getrow(550).data

array([151], dtype=int32)

In [14]:
b.getrow(550).nonzero()

(array([0, 0]), array([549, 548]))

In [27]:
b.getrow(697).getcol(548).data[0]

6

In [18]:
b.getcol(549).nonzero()

(array([549, 550, 697]), array([0, 0, 0]))

In [28]:
for c in b.getrow(697).nonzero()[1]:
    print("(697,{}): {}".format(c,b.getrow(697).getcol(c).data[0]))

(697,1262): 3
(697,1217): 3
(697,1212): 3
(697,1207): 3
(697,1165): 3
(697,1126): 3
(697,1104): 3
(697,1092): 3
(697,1003): 5
(697,982): 3
(697,971): 3
(697,961): 5
(697,950): 3
(697,939): 4
(697,927): 4
(697,918): 4
(697,890): 4
(697,871): 6
(697,846): 4
(697,741): 3
(697,731): 3
(697,727): 6
(697,696): 50
(697,694): 3
(697,691): 4
(697,686): 5
(697,662): 3
(697,615): 3
(697,594): 3
(697,592): 3
(697,559): 4
(697,551): 4
(697,548): 6
(697,526): 3
(697,499): 3
(697,462): 6
(697,426): 3
(697,415): 3
(697,388): 4
(697,348): 3
(697,318): 3
(697,309): 3
(697,270): 4
(697,265): 5
(697,213): 3
(697,111): 5
(697,110): 3
(697,103): 4
(697,91): 3
(697,81): 5
(697,74): 3
(697,67): 4
(697,15): 4


In [13]:
print("j: {}\ni: {}".format(j,i))

j: 214
i: 215


In [14]:
try:
    print("({},{}): {} | ({},{}): {}\n({},{}): {} | ({},{}): {}".format(
    ts[0][0],ts[0][1],B.getcol(ts[0][0]).getrow(ts[0][1]).data[0],
    ts[1][0],ts[1][1],B.getcol(ts[1][0]).getrow(ts[1][1]).data[0],
    ts[2][0],ts[2][1],B.getcol(ts[2][0]).getrow(ts[2][1]).data[0],
    ts[3][0],ts[3][1],B.getcol(ts[3][0]).getrow(ts[3][1]).data[0]
    ))
except:
    print(ts)

[(549, 550), (549, 16), (548, 550), (548, 16)]


In [25]:
ts[2][0]

548

In [26]:
B.getrow(ts[2][0]).nonzero()#.getrow(ts[1][1])

(array([0]), array([548]))

In [22]:
# B.getcol(ts[2][0]).getrow(ts[2][1]).data[0] if len(B.getcol(ts[2][0]).getrow(ts[2][1]).data) else 0 #>= \
B.getcol(ts[3][0]).getrow(ts[3][1]).data[0] #if len(B.getcol(ts[3][0]).getrow(ts[3][1]).data) else 0

IndexError: index 0 is out of bounds for axis 0 with size 0

In [67]:
B.getrow(rows.index(cols[j])).nonzero()[1]

array([549])

In [70]:
B.getrow(550).nonzero()[1]

array([549, 548])

In [9]:
ts = ts[:2]

In [10]:
ts_ = []
for _,r in ts:
    if r == rows.index(cols[j]): continue
    for c in B.getrow(r).nonzero()[1]:
        if c == ts[0][0]: continue
        for _,r_ in ts:
            print(c,r)
            # ts_ += [(c,r)]

550 550
550 550
214 550
214 550
550 16
550 16


In [10]:
b.getcol(549).getrow(16).data

array([50], dtype=int32)

In [14]:
b.getrow(550).getcol(548).data

array([50], dtype=int32)

In [65]:
rows.index(cols[j])

16

[549, 0]

In [60]:
np.argpartition(B.getrow(rows.index(cols[j])).toarray().flatten(), -2)

array([ 647,  645, 1290, ..., 1291,  430,  549], dtype=int64)

In [48]:
list((c,r) for c,r in zip(cols,rows))

[(4, 4),
 (8, 8),
 (11, 11),
 (20, 20),
 (25, 25),
 (26, 26),
 (32, 32),
 (42, 42),
 (50, 50),
 (55, 55),
 (69, 69),
 (73, 73),
 (77, 77),
 (79, 79),
 (86, 86),
 (92, 92),
 (110, 106),
 (119, 110),
 (120, 119),
 (121, 120),
 (131, 121),
 (137, 131),
 (138, 137),
 (141, 138),
 (159, 141),
 (164, 159),
 (168, 164),
 (180, 168),
 (183, 180),
 (186, 183),
 (194, 186),
 (201, 194),
 (203, 201),
 (205, 203),
 (211, 205),
 (215, 211),
 (221, 215),
 (223, 221),
 (235, 223),
 (241, 235),
 (242, 241),
 (246, 242),
 (250, 246),
 (251, 250),
 (256, 251),
 (257, 256),
 (261, 257),
 (266, 261),
 (267, 266),
 (273, 267),
 (274, 273),
 (286, 274),
 (307, 286),
 (315, 307),
 (321, 315),
 (331, 321),
 (332, 331),
 (335, 332),
 (336, 335),
 (338, 336),
 (342, 338),
 (343, 342),
 (348, 343),
 (357, 348),
 (358, 357),
 (359, 358),
 (370, 359),
 (373, 370),
 (390, 373),
 (392, 390),
 (393, 392),
 (395, 393),
 (402, 395),
 (404, 400),
 (411, 402),
 (427, 404),
 (438, 411),
 (443, 427),
 (446, 438),
 (447, 44

In [89]:
b.getrow(16).nonzero()[1]

array([549])

In [90]:
B.getcol(549).nonzero()[0]

array([ 16, 550])

In [80]:
b.getrow(16).getcol(549).data

array([50], dtype=int32)

In [88]:
b.getrow(289).getcol(288).data

array([50], dtype=int32)

In [73]:
B.getrow(rows.index(cols[j])).argmax()

549

In [47]:
cols[449],rows[449]

(583, 583)

In [9]:
list((c,r) for c,r in zip(new_cols[j-4:j+3],new_rows[j-4:j+3]))

[(1275, 1274),
 (1276, 1275),
 (1278, 1276),
 (1283, 1278),
 (400, 1283),
 (225, 225),
 (35, 35)]

In [18]:
C1 = list(c for c in cols if c not in [cols[B.getrow(rows.index(cols[j])).argmax()]] + cols[j:]) + [cols[B.getrow(rows.index(cols[j])).argmax()]] + cols[j:]

In [21]:
C2 = list(c for c in cols[:j] if c not in [cols[ts[3][0]]]) + [cols[ts[3][0]]] + cols[j:]

In [23]:
len(C2)

1292

In [36]:
list((c,r) for c,r in zip(C2[j-4:j+3],R2[j-4:j+3]))

[(1276, 1275),
 (1278, 1276),
 (1283, 1278),
 (987, 1283),
 (400, 590),
 (225, 225),
 (35, 35)]

In [16]:
b.getrow(rows.index(cols[j])).argmax()

704

In [20]:
cols[ts[3][0]] in cols[j:]

False

In [37]:
[C2[ts[1][0]]]

[590]

In [34]:
# The last single-label term is wrapped in a list (the opening bracket was missing, which made the cell a syntax error).
R2 = [C2[0]] + list(r for r in rows[:i] if r not in [C2[0],C2[ts[3][0]]]) + [C2[ts[1][0]]] + rows[i:ts[1][0]] + [C2[ts[3][0]]] + rows[ts[1][0]+1:]

In [35]:
len(R2)

1292

Ok, so I want to change the 

In [43]:
new_cols = list(c for c in cols[:j] if c not in [cols[ts[3][0]]]) + [cols[ts[3][0]]] + cols[j:]
# cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
# move_col(b,cols_map)
# cols_ = new_cols

In [67]:
new_rows = [cols[0]] + list(r for r in rows[:i] if r not in [cols[0],cols[ts[3][0]]]) + [cols[ts[1][0]]] + rows[i:ts[1][0]] + [cols[ts[3][0]]] + rows[ts[1][0]+1:]

In [None]:
len(new_rows)

In [42]:
cols[ts[1][0]]

590

In [11]:
list((c,r) for c,r in zip(cols,rows))

[(4, 4),
 (7, 7),
 (8, 8),
 (11, 11),
 (20, 20),
 (21, 21),
 (25, 25),
 (26, 26),
 (28, 28),
 (32, 32),
 (37, 37),
 (39, 39),
 (42, 42),
 (50, 50),
 (55, 55),
 (58, 58),
 (62, 62),
 (63, 63),
 (69, 69),
 (72, 72),
 (73, 73),
 (77, 77),
 (78, 78),
 (79, 79),
 (82, 82),
 (86, 86),
 (88, 88),
 (92, 92),
 (102, 102),
 (104, 104),
 (106, 106),
 (107, 107),
 (110, 110),
 (119, 119),
 (120, 120),
 (121, 121),
 (123, 123),
 (124, 124),
 (127, 127),
 (131, 131),
 (136, 136),
 (137, 137),
 (138, 138),
 (141, 141),
 (152, 152),
 (156, 156),
 (159, 159),
 (164, 164),
 (168, 168),
 (170, 170),
 (176, 176),
 (180, 180),
 (183, 183),
 (186, 186),
 (188, 188),
 (194, 194),
 (201, 201),
 (203, 203),
 (205, 205),
 (211, 211),
 (215, 215),
 (217, 217),
 (221, 221),
 (222, 222),
 (223, 223),
 (229, 229),
 (230, 230),
 (235, 235),
 (241, 241),
 (242, 242),
 (244, 244),
 (246, 246),
 (248, 248),
 (250, 250),
 (251, 251),
 (256, 256),
 (257, 257),
 (258, 258),
 (261, 261),
 (263, 263),
 (266, 266),
 (267, 26

In [66]:
B.getrow(rows.index(400)).nonzero()[1]

array([704, 324])

In [68]:
list((c,r) for c,r in zip(new_cols,new_rows))

[(4, 4),
 (7, 7),
 (8, 8),
 (11, 11),
 (20, 20),
 (21, 21),
 (25, 25),
 (26, 26),
 (28, 28),
 (32, 32),
 (37, 37),
 (39, 39),
 (42, 42),
 (50, 50),
 (55, 55),
 (58, 58),
 (62, 62),
 (63, 63),
 (69, 69),
 (72, 72),
 (73, 73),
 (77, 77),
 (78, 78),
 (79, 79),
 (82, 82),
 (86, 86),
 (88, 88),
 (92, 92),
 (102, 102),
 (104, 104),
 (106, 106),
 (107, 107),
 (110, 110),
 (119, 119),
 (120, 120),
 (121, 121),
 (123, 123),
 (124, 124),
 (127, 127),
 (131, 131),
 (136, 136),
 (137, 137),
 (138, 138),
 (141, 141),
 (152, 152),
 (156, 156),
 (159, 159),
 (164, 164),
 (168, 168),
 (170, 170),
 (176, 176),
 (180, 180),
 (183, 183),
 (186, 186),
 (188, 188),
 (194, 194),
 (201, 201),
 (203, 203),
 (205, 205),
 (211, 211),
 (215, 215),
 (217, 217),
 (221, 221),
 (222, 222),
 (223, 223),
 (229, 229),
 (230, 230),
 (235, 235),
 (241, 241),
 (242, 242),
 (244, 244),
 (246, 246),
 (248, 248),
 (250, 250),
 (251, 251),
 (256, 256),
 (257, 257),
 (258, 258),
 (261, 261),
 (263, 263),
 (266, 266),
 (267, 26

In [69]:
list((new_cols.index(c),new_rows.index(r)) for c,r in zip(new_cols[j:],new_rows[i-1:]) if c!=r)

[(414, 414)]

In [11]:
new_cols[414]

400

In [333]:
# no need for row swap
new_rows = [cols_[0]] + list(r for r in rows[:i] if r not in [cols_[0],cols_[j],cols[ts[1][0]]]) + [cols_[j]] + \
           list(r for r in rows[i:ts[1][0]] if r not in [cols_[0],cols_[j],cols[ts[1][0]]]) + [cols[ts[1][0]]] + \
           list(r for r in rows[ts[1][0]:] if r not in [cols_[0],cols_[j],cols[ts[1][0]]])
# rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
# move_row(b,rows_map)
# rows_ = new_rows

In [334]:
cols_[j],new_rows[i]

(400, 225)

In [335]:
cols[ts[1][0]]

590

In [336]:
cols_[j]

400

In [337]:
cols[j-1]

1283

In [338]:
new_rows[i-1]

400

In [339]:
list((c,r) for c,r in zip(cols_[j:],new_rows[i-1:]) if c!=r)

[(987, 590)]

In [23]:
j -= 1
i -= 1

In [33]:
B.getcol(cols[B.getrow(rows.index(cols[j])).argmax()]) #in cols[j:]

<1292x1 sparse matrix of type '<class 'numpy.intc'>'
	with 1 stored elements in Compressed Sparse Row format>

In [168]:
len(new_cols_)

1292

In [147]:
new_cols_ = list(c for c in cols if c not in [cols[ts[1][0]],cols[ts[3][0]]] + cols[j:]) +\
                [cols[ts[1][0]]] + cols[j:cols.index(cols[ts[1][0]])] + [cols[ts[3][0]]] + cols[cols.index(cols[ts[1][0]])+1:]

In [148]:
len(new_cols_)

1292

##### Seed = 1

In [113]:
seed = 1
# Only the last record of the FASTA file is kept.
for record in SeqIO.parse("data/input/Raphanus sativus_NC_018551.1.fasta",'fasta'): seq = record.seq
reads = generate_reads(seq,250,250,50,50,seed=seed,min_coverage=None) 
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
# B_ caches one matrix per seed for the lifetime of the kernel. No cell defines it, so
# create it here when it does not exist yet instead of relying on leftover kernel state.
try: B_
except NameError: B_ = {}
if seed in B_:
    print("Retrieving sparse bipartite adjacency matrix...")
    B = B_[seed].copy() 
else:
    print("Building sparse bipartite adjacency matrix...")
    edges = create_bipartite_adjacency_matrix(reads)
    # Build the matrix with its full len(reads) x len(reads) shape up front. Letting the
    # shape be inferred and padding one row/column afterwards leaves it too small when more
    # than one trailing index has no edge, and fails outright when no edges were found.
    B = coo_matrix((list(edges.values()),list(zip(*edges.keys())) or [(),()]),shape=(len(reads),len(reads))).T
    B_[seed] = B.copy()
print("Commencing sequence construction...")
seq_ = find_lower_diagonal_path(B,reads_map,cols,rows)
if seq_ == seq: print("Sequence reconstruction success!")
else: print("Sequence reconstruction failure.")

Retrieving bipartite graph...
Commencing sequence construction...
Sequence reconstruction failure.


##### Seed = 2

In [191]:
seed = 2
# Only the last record of the FASTA file is kept.
for record in SeqIO.parse("data/input/Raphanus sativus_NC_018551.1.fasta",'fasta'): seq = record.seq
reads = generate_reads(seq,250,250,50,50,seed=seed,min_coverage=None) 
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
# B_ caches one matrix per seed for the lifetime of the kernel. No cell defines it, so
# create it here when it does not exist yet instead of relying on leftover kernel state.
try: B_
except NameError: B_ = {}
if seed in B_:
    print("Retrieving sparse bipartite adjacency matrix...")
    B = B_[seed].copy() 
else:
    print("Building sparse bipartite adjacency matrix...")
    edges = create_bipartite_adjacency_matrix(reads)
    # Build the matrix with its full len(reads) x len(reads) shape up front. Letting the
    # shape be inferred and padding one row/column afterwards leaves it too small when more
    # than one trailing index has no edge, and fails outright when no edges were found.
    B = coo_matrix((list(edges.values()),list(zip(*edges.keys())) or [(),()]),shape=(len(reads),len(reads))).T
    B_[seed] = B.copy()
print("Commencing sequence construction...")
seq_ = find_lower_diagonal_path(B,reads_map,cols,rows)
if seq_ == seq: print("Sequence reconstruction success!")
else: print("Sequence reconstruction failure.")

Retrieving bipartite graph...


In [1]:
import networkx as nx

In [None]:
G = nx.Graph()