In [1]:
from scipy.sparse import csr_matrix, coo_matrix, vstack, hstack, save_npz, load_npz #! pip install scipy
from jellyfish import damerau_levenshtein_distance #! pip install jellyfish
import numpy as np #! pip install numpy
from itertools import product
from math import ceil, fmod
import random

In [2]:
'''
DESCRIPTION 
    Utility function that chops a sequence into several reads with bounded random lengths that 
    have a bounded random overlap
INPUT
    sequence       | a sequence of characters that will be divided into overlapping subsequences
    min_subseq_len | the shortest length a subsequence can have
    max_subseq_len | the longest length a subsequence can have
    min_overlap    | the shortest overlap two subsequences can share
    max_overlap    | the longest overlap two subsequences can share
    circularize    | boolean indicating whether to add a random amount of the end of the sequence
                   | to the beginning and vice versa
    seed           | random seed for the random function for reproducibility
OUTPUT
    A list of overlapping reads of random bounded size which share a bounded random amount of
    overlap
'''
def generate_reads(sequence,min_subseq_len,max_subseq_len,min_overlap,max_overlap,min_coverage=None,circularise=False,seed=None,shuffle=True):
    import random

    random.seed(seed)
    if circularise: sequence = sequence[-random.randint(min_overlap,max_overlap):] + sequence + sequence[:random.randint(min_overlap,max_overlap)]
    reads = []
    while 1: 
        start = 0
        end = random.randint(min_subseq_len,max_subseq_len)
        reads += [sequence[start:end]]
        while end < len(sequence):
            start = random.randint(end-max_overlap,end-min_overlap)
            if (len(sequence) - start)/max_subseq_len < 2:
                if (len(sequence) - start)/max_subseq_len < 1:
                    end = len(sequence)
                else:
                    a = 0
                    while (len(sequence) - start)/(min_subseq_len+a) > 2: a+=1
                    end = random.randint(start+min_subseq_len+a,start+max_subseq_len) 
            else: end = random.randint(start+min_subseq_len,start+max_subseq_len) 
            reads += [sequence[start:end]]
        if min_coverage is None or len(set(reads))*(sum(len(read) for read in set(reads))/len(set(reads)))/len(sequence) >= min_coverage:
            if not shuffle: return reads
            random.shuffle(reads)
            return reads
        # if min_coverage is None or len(set(reads))*(sum(len(read) for read in set(reads))/len(set(reads)))/len(sequence) >= min_coverage: return list(set(reads))

'''
DESCRIPTION 
    Utility function that creates a random sequence containing only the letters A, T, G, and C
INPUT
    n          | the length of the sequence
    palindrome | a boolean indicating whether the sequence must be a palidrome or not
    seed       | random seed for the random function for reproducibility
OUTPUT
    A random sequence of length n
'''
def generate_genome_sequence(n,palindrome=False,seed=None):
    import random
    
    random.seed(seed)
    nucleotides = {1:'A',2:'C',3:'G',4:'T'}
    seq = ''
    if palindrome: n = ceil(n/2)
    for _ in range(n):
        seq += nucleotides[random.randint(1,4)]
    if palindrome: seq += ''.join(reversed(seq[:int(n-fmod(n,2))]))
    return seq

# Sequitur

In [35]:
def move_col(B: coo_matrix, cols: dict) -> None:
    for c in range(len(B.col)):
        B.col[c] = cols[B.col[c]]
            
def move_row(B: coo_matrix,rows: dict) -> None:
    for r in range(len(B.row)):
        B.row[r] = rows[B.row[r]]

def normalised_damerau_levenshtein_distance(read: str,overlap: str) -> float:
    return damerau_levenshtein_distance(read.__str__()[:min(len(overlap),len(read))],overlap.__str__()[:min(len(overlap),len(read))])/min(len(overlap),len(read))

def build_suffix_array(reads: list, min_suf_len: int = 3) -> tuple:
    suf_arr = []
    for read in reads:
        read += '$' + str(reads.index(read))
        for i in range(len(read)):
            if len(read[i:]) < min_suf_len + 2: continue 
            suf_arr += [read[i:]]
    suf_arr.sort()
    suf_arr_ind = []
    for s in range(len(suf_arr)):
        suf_arr_ind += [int(suf_arr[s].split('$')[-1].__str__())]
        suf_arr[s] = suf_arr[s][:suf_arr[s].find('$')+1]
    return suf_arr,suf_arr_ind

def create_bipartite_adjacency_matrix(reads: list, suf_arr: list = None, suf_arr_ind: list = None, max_diff: float = 0.25, min_suf_len: int = 3) -> dict:
    if suf_arr is None or suf_arr_ind is None: suf_arr,suf_arr_ind = build_suffix_array(reads)
    reads_map = dict(zip(reads,list(range(len(reads)))))
    B = {}
    for read in reads:
        for j in range(min_suf_len + 1):
            i = suf_arr.index(read[j:]+'$') - 1
            while normalised_damerau_levenshtein_distance(read,suf_arr[i][:-1]) <= 0.5:
                if not reads[suf_arr_ind[i]] == read and \
                   normalised_damerau_levenshtein_distance(read,suf_arr[i][:-1]) < max_diff and \
                   read.startswith(suf_arr[i][:-1]):
                    if (reads_map[reads[suf_arr_ind[i]]],reads_map[read]) not in B: B[(reads_map[reads[suf_arr_ind[i]]],reads_map[read])] = len(suf_arr[i][:-1])
                    else: B[(reads_map[reads[suf_arr_ind[i]]],reads_map[read])] = max(len(suf_arr[i][:-1]),B[(reads_map[reads[suf_arr_ind[i]]],reads_map[read])])
                i -= 1
    return B

def find_lower_diagonal_path(B: coo_matrix,reads_map: dict,cols: list,rows: list) -> tuple:
    argpen = lambda l: np.argpartition(l,-2)[-2]

    if B.sum(axis=1).min() == 0:
        new_cols = [rows[B.sum(axis=1).argmin()]] + list(c for c in cols if c not in [rows[B.sum(axis=1).argmin()],cols[B.sum(axis=0).argmin()]]) + [cols[B.sum(axis=0).argmin()]]
    else: new_cols = list(c for c in cols if c != cols[B.sum(axis=0).argmin()]) + [cols[B.sum(axis=0).argmin()]]
    cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
    move_col(B,cols_map)
    cols = new_cols
    new_rows = [rows[B.sum(axis=1).argmin()]] + list(r for r in rows if r != rows[B.sum(axis=1).argmin()])
    rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
    move_row(B,rows_map)
    rows = new_rows
    i,j = len(rows), len(cols) - 1
    while j > 0:
        if cols[B.getrow(rows.index(cols[j])).argmax()] in cols[j:]:
            # when there is only one nonzero value on the row and that value is in the latter side of the
            # matrix, i need to select that column, find the row it has in the former side of the matrix
            # and then do the swap
            # return B, cols, new_cols, rows, new_rows, i, j, None
            ts_rows = {rows.index(cols[j])}
            ts_rows_ = set()
            ts_cols = {B.getrow(rows.index(cols[j])).argmax()}
            ts_cols_ = set()

            # until there is an element on the off-diagonal
            while not any(cols[c] in cols[:j] for c in ts_cols) or len(ts_rows) < 2:# or (not any(rows[r] in rows[:i] for r in ts_rows) and len(ts_cols) < 2):
                for c_ in ts_cols.difference(ts_cols_):
                    ts_rows.update(np.argpartition(B.getcol(c_).toarray().flatten(),-2)[::-1][:min(2,len(B.getcol(c_).nonzero()[0]))])
                ts_cols_.update(ts_cols)
                
                for r_ in ts_rows.difference(ts_rows_):
                    ts_cols.update(np.argpartition(B.getrow(r_).toarray().flatten(),-2)[::-1][:min(2,len(B.getrow(r_).nonzero()[1]))])
                ts_rows_.update(ts_rows)

            ts_cols = list(ts_cols)
            ts_cols.sort(reverse=True)

            # if len(ts_cols) > 10: return B, cols, new_cols, rows, new_rows, i, j, None
            new_cols = list(c for c in cols[:j] if c not in [cols[ts_cols[-1]]]) + [cols[ts_cols[-1]]] + cols[j:]
            cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
            move_col(B,cols_map)
            cols = new_cols

            new_rows = list(r for r in rows[:i] if r not in [cols[j]]) + [cols[j]] + rows[i:]
            rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
            move_row(B,rows_map)
            rows = new_rows
            for c in ts_cols[:-1]:
                rmax = B.getcol(c).argmax()
                rpen = argpen(B.getcol(c).toarray().flatten())
                cmax, cpen = B.getrow(rmax).argmax(), argpen(B.getrow(rmax).toarray().flatten())
                if rpen < j: 
                    new_rows = list(r for r in rows[:i] if r not in [cols[j]]) + [cols[rmax]] + rows[i:rmax] + [cols[j]] + rows[rmax+1:]
                    cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
                    move_col(B,cols_map)
                    cols = new_cols

                    new_cols = cols[:j] + [cols[rmax]] + cols[i:rmax] + [cols[j]] + cols[rmax+1:]
                    rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
                    rows = new_rows
                    move_row(B,rows_map)
                else:
                    new_cols = cols[:rpen] + [cols[rmax]] + cols[rpen+1:rmax] + [cols[rpen]] + cols[rmax+1:]
                    cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
                    move_col(B,cols_map)
                    cols = new_cols

                    # new_rows = rows[:rpen] + [cols[rpen]] + rows[rpen+1:rmax] + [cols[rmax]] + rows[rmax+1:]
                    # new_rows = rows[:rpen] + cols[rpen:]
                    new_rows = list(r for r in rows[:rpen] if r not in [cols[rpen],cols[rmax]]) + [cols[rpen]] + list(r for r in rows[rpen:rmax] if r not in [cols[rpen],cols[rmax]]) + [cols[rmax]] + list(r for r in rows[rmax:] if r not in [cols[rpen],cols[rmax]]) 
                    rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
                    rows = new_rows
                    move_row(B,rows_map)
                # elif rpen == j: new_rows = rows[:rpen] + cols[rpen:]
                # else: new_rows = rows[:rpen] + [cols[rmax]] + rows[rpen+1:rmax] + [cols[rpen]] + rows[rmax+1:]
                # else: new_rows = rows[:rpen] + [cols[rpen]] + rows[rpen:rmax] + [cols[rmax]] + list(r for r in rows[rmax:] if r not in [cols[rpen],cols[rmax]])
                    
                # cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
                # rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
                # move_col(B,cols_map)
                # move_row(B,rows_map)
                # cols = new_cols
                # rows = new_rows
        else:
            new_cols = list(c for c in cols[:j] if c not in [cols[B.getrow(rows.index(cols[j])).argmax()]]) + [cols[B.getrow(rows.index(cols[j])).argmax()]] + cols[j:]
            try:
                cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
            except:
                print(1)
                return B, cols, new_cols, rows, new_rows, i, j, None
            move_col(B,cols_map)
            cols = new_cols

            new_rows = list(r for r in rows[:i] if r not in [cols[j]]) + [cols[j]] + rows[i:]
            try:
                rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
            except:
                print(2)
                return B, cols, new_cols, rows, new_rows, i, j, None
            move_row(B,rows_map)
            rows = new_rows
        j -= 1
        i -= 1

    seq = ''
    for s,d in zip(list(reads_map[k] for k in rows)[:-1],B.diagonal(-1)):
        seq += s[:-d]
    seq += list(reads_map[k] for k in rows)[-1]
    return seq

In [14]:
([0,1,2,3,4,5][:3] + [6,7,8,9])[3]

6

In [15]:
seed = 1
seq = generate_genome_sequence(10000,seed=seed)
reads = generate_reads(seq,250,500,50,100,seed=seed)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
B = create_bipartite_adjacency_matrix(reads)
B = coo_matrix((list(B.values()),list(zip(*B.keys())))).T
if B.shape[0] < len(rows): B = vstack([B,coo_matrix((1,B.shape[1]),dtype=B.dtype)])
if B.shape[1] < len(cols): B = hstack([B,coo_matrix((B.shape[0], 1),dtype=B.dtype)])
# seq_ = find_lower_diagonal_path(B,reads_map,cols,rows)

In [5]:
seed = 0
seq = 'betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better'
reads = ['betty_bought_butter_th',
                        'tter_the_butter_was_',
                              'he_butter_was_bitter_',
                                         'as_bitter_betty_bought',
                                                     'tty_bought_better_butter_t',
                                                           'ught_better_butter_to',
                                                                     'r_butter_to_make_the_',
                                                                                   'ke_the_bitter_butter_better']
random.seed(seed)
random.shuffle(reads)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
B = create_bipartite_adjacency_matrix(reads)
B = coo_matrix((list(B.values()),list(zip(*B.keys())))).T
if B.shape[0] < len(rows): B = vstack([B,coo_matrix((1,B.shape[1]),dtype=B.dtype)])
if B.shape[1] < len(cols): B = hstack([B,coo_matrix((B.shape[0], 1),dtype=B.dtype)])
find_lower_diagonal_path(B,reads_map,cols,rows)

'betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better'

In [98]:
seq = 'you say hello world, i bellow go to hell'
reads = ['you say hel',
            ' say hello wo',
                    'lo world, i be',
                          'ld, i bellow go t',
                                    'ow go to hell']
random.seed(seed)
random.shuffle(reads)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
B = create_bipartite_adjacency_matrix(reads)
B = coo_matrix((list(B.values()),list(zip(*B.keys())))).T
if B.shape[0] < len(rows): B = vstack([B,coo_matrix((1,B.shape[1]),dtype=B.dtype)])
if B.shape[1] < len(cols): B = hstack([B,coo_matrix((B.shape[0], 1),dtype=B.dtype)])
find_lower_diagonal_path(B,reads_map,cols,rows)

'you say hello world, i bellow go to hell'

In [99]:
seq = 'she_sells_sea_shells_on_the_sea_shore'
reads = ['she_sells_s',
               'lls_sea_shel',
                    'ea_shells_o',
                       'shells_on_the_s',
                                  'he_sea_s',
                                      'ea_shore']
random.seed(seed)
random.shuffle(reads)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
B = create_bipartite_adjacency_matrix(reads)
B = coo_matrix((list(B.values()),list(zip(*B.keys())))).T
if B.shape[0] < len(rows): B = vstack([B,coo_matrix((1,B.shape[1]),dtype=B.dtype)])
if B.shape[1] < len(cols): B = hstack([B,coo_matrix((B.shape[0], 1),dtype=B.dtype)])
find_lower_diagonal_path(B,reads_map,cols,rows)

'she_sells_sea_shells_on_the_sea_shore'

In [343]:
successes = 0
n = 50
for seed in range(n):  
    seq = generate_genome_sequence(10000,seed=seed)
    reads = generate_reads(seq,250,500,50,100,seed=seed)
    reads_map = dict(zip(list(range(len(reads))),reads))
    rows = list(range(len(reads)))
    cols = list(range(len(reads)))
    B = create_bipartite_adjacency_matrix(reads)
    B = coo_matrix((list(B.values()),list(zip(*B.keys())))).T
    if B.shape[0] < len(rows): B = vstack([B,coo_matrix((1,B.shape[1]),dtype=B.dtype)])
    if B.shape[1] < len(cols): B = hstack([B,coo_matrix((B.shape[0], 1),dtype=B.dtype)])
    seq_ = find_lower_diagonal_path(B,reads_map,cols,rows)
    s = '| Seed: ' + str(seed) + ' | '
    if seq_ == seq:
        s+='SUC | ' + seq_ + ' == ' + seq
        successes+=1
    else: 
        s+='FAI | ' + seq_ + ' != ' + seq
        print(s)
        break
    print(s)
    print('-----------------------------------------')
print('ACCURACY: '+str((successes/n)*100)+'%')

| Seed: 0 | SUC | TTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGACCCCTAAGTAGGAGCGTATGCGCCCAGTAACCAATGCCTGTTGAGATGCCAGACGCGTAACCAAAACATAGAAACCATCAATAGACAGGTCATAATCGGTCCACCGGATCATTGGTGCATAGAGCCTGGGCGTTAACGCCCTTTATTACTAGCTTAATGGTATCACATTGACAAACACGGCATTAAGTAGCGACGAAACGGGATTTGCCTGACCGGGGAGAAGCCGGTCGATCAGCAGTGGTAATTGGATATTAGGCCTAAACCATAATGTTCTAGCGCTCGAAATCATTGCACCACTTGCATCTTTGTTCCAGGGACGCTGTAAAACCAGATGCCTGTAAATCGTTTCAACGGGATGGTTTACCCGGAATTCTACGTATTTAATCAACGAGCTTAATGAGCTGACATTGCTGAAATGACCATGACTTAATAATCATTTATGGAGAAGAGGCACGACCACAAGGACCCTATGGCACGGTGGGCAAGCTCCCGCCCGGTACATAACTGTCTGGACTGATTATGTCGGTACAGACTTCTTCCTGCGTATCGATTACGAGCTTATCTGAAGAAGTTTAGGGCAAAGGGACCATGGCCATTGGTGCCAATTTCGGTTCTTGTATGCTACAGTTAAATAGAAAGGCCGCATTGTCGTTCTCGCCCTGTTTTCCTCATACACGACCGAGGTTATTTGTCGGAAACGAGACATCTCTCGAAGGTGGAACGACGCCGGGTGTGCAGAATTTATTTTAAACACTCTATTACCTCCGGGTAGCGTTGGCAAACTCCGATAATGAGCGCCAGGCGTGCCAGGACTCCACCTCCCCTGCTAAGTTGACCTTGAGCTCGGTACAGCGTCGGCGAGACGATAACAACGAAGTCCTTCGGCGTTATGTAATTCACCAGCCCACCATATCAGGTAATAGGCTCGCTGGTTAGGTAGATT

    SUC: returns the target sequence fully reconstructed
    PAR: returns contigs all of which exist in the target sequence (consider coverage?)
    FAI: returns a full sequence that is incorrectly reconstructed or a set of contigs where at least one is not found in the target sequence

In [4]:
from Bio import SeqIO #! pip install Bio
# import pandas as pd #! pip install pandas

##### Seed = 0

In [36]:
seed = 0
for record in SeqIO.parse("data/input/Raphanus sativus_NC_018551.1.fasta",'fasta'): seq = record.seq
reads = generate_reads(seq,250,250,50,50,seed=seed,min_coverage=None) 
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
try:
    print("Retrieving stored sparse bipartite adjacency matrix...")
    B = load_npz('data/input/matrices/seed_'+str(seed)+'.npz')
except:
    print("No stored sparse bipartite adjacency matrix found.")
    print("Building sparse bipartite adjacency matrix...")
    B = create_bipartite_adjacency_matrix(reads)
    B = coo_matrix((list(B.values()),list(zip(*B.keys())))).T
    if B.shape[0] < len(rows): B = vstack([B,coo_matrix((1,B.shape[1]),dtype=B.dtype)])
    if B.shape[1] < len(cols): B = hstack([B,coo_matrix((B.shape[0], 1),dtype=B.dtype)])
    save_npz('data/input/matrices/seed_'+str(seed)+'.npz', B)
print("Commencing sequence construction...")
# seq_ = find_lower_diagonal_path(B,reads_map,cols,rows)
# if seq_ == seq: print("Sequence reconstruction success!")
# else: print("Sequence reconstruction failure.")
b, cols, new_cols, rows, new_rows, i, j, ts = find_lower_diagonal_path(B,reads_map,cols,rows)

Retrieving stored sparse bipartite adjacency matrix...
Commencing sequence construction...


ValueError: too many values to unpack (expected 8)

In [20]:
i,j

(414, 413)

In [19]:
len(rows),len(cols)

(1292, 1292)

In [39]:
cols[j]

400

In [10]:
argpen = lambda l: np.argpartition(l,-2)[-2]
getval = lambda x: x.data[0] if len(x.data) else 0

ts_rows = {rows.index(cols[j])}
ts_rows_ = set()
ts_cols = {B.getrow(rows.index(cols[j])).argmax()}
ts_cols_ = set()

while not any(cols[c] in cols[:j] for c in ts_cols) or len(ts_rows) < 2:
    for c_ in ts_cols.difference(ts_cols_):
        ts_rows.update(np.argpartition(B.getcol(c_).toarray().flatten(),-2)[::-1][:min(2,len(B.getcol(c_).nonzero()[0]))])
    ts_cols_.update(ts_cols)
    
    for r_ in ts_rows.difference(ts_rows_):
        ts_cols.update(np.argpartition(B.getrow(r_).toarray().flatten(),-2)[::-1][:min(2,len(B.getrow(r_).nonzero()[1]))])
    ts_rows_.update(ts_rows)

ts_cols = list(ts_cols)
ts_cols.sort(reverse=True)
ts_cols

[704, 324]

In [11]:
B.getrow(i-1).getcol(j-1).data#[0]

array([], dtype=int32)

In [12]:
c = ts_cols[0]
r0 = B.getcol(c).argmax()
c0_max = B.getrow(r0).argmax() 
c0_pen = argpen(B.getrow(r0).toarray().flatten()) 

r1 = B.getcol(c0_pen).argmax() if B.getcol(c0_pen).argmax() != r0 else argpen(B.getcol(c0_pen).toarray().flatten()) 
c1_max = B.getrow(r1).argmax()
c1_pen = argpen(B.getrow(r1).toarray().flatten()) 

In [13]:
mn = min(c0_max,c1_max if c1_max != c0_max else c1_pen)
mx = max(c0_max,c1_max if c1_max != c0_max else c1_pen)
mn,mx

(324, 704)

In [14]:
if mn < j: new_cols = list(c for c in cols[:j] if c not in [cols[mn]]) + [cols[mx]] + cols[j:mx] + [cols[mn]] + cols[mx+1:]
else: new_cols = cols[:mn] + [cols[mx]] + cols[mn+1:mx] + [cols[mn]] + cols[mx+1:]
len(new_cols),len(list(c for c in new_cols if new_cols.count(c) > 1))

(1292, 0)

In [15]:
mn < j

True

In [20]:
new_cols[j],new_cols[mn],new_cols[mx]

(400, 992, 987)

In [18]:
new_cols[mx] in rows[:i]

True

In [22]:
rows.index(new_cols[mn])

326

In [23]:
j

414

In [38]:
# if mn < j: new_rows = list(r for r in rows[:j] if r not in [new_cols[j]]) + [new_cols[j]] + rows[i:] 
if mn < j: new_rows = list(r for r in rows[:i] if r not in [new_cols[mn],new_cols[mx],new_cols[j]]) + [new_cols[mx],new_cols[j]] + rows[i:mx] + [new_cols[mn]] + rows[mx:]
else: new_rows = rows[:mn] + [new_cols[mx]] + rows[mn+1:mx] + [new_cols[mn]] + rows[mx+1:]
len(new_rows),len(list(r for r in new_rows if new_rows.count(r) > 1))

(1292, 0)

In [20]:
if mn < j: new_cols = list(c for c in cols[:j] if c not in [cols[mn]]) + [cols[mn]] + cols[j:]
else: new_cols = cols[:mn] + [cols[mx]] + cols[mn+1:mx] + [cols[mn]] + cols[mx+1:]
cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
move_col(B,cols_map)
cols = new_cols

if mn < j: new_rows = [cols[0]] + list(r for r in rows[:i] if r not in [cols[0],cols[j]]) + [cols[j]] + rows[i:] 
else: new_rows = rows[:mn] + [cols[mx]] + rows[mn+1:mx] + [cols[mn]] + rows[mx+1:]
rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
move_row(B,rows_map)
rows = new_rows
# if c1_max == c0_max:
#     new_rows = [new_cols[0]] + list(r for r in rows[:max(i,mn+1)] if r not in [new_cols[0],new_cols[mx if max(j,mn) == mn else j]]) + [new_cols[max(j,mn)]] + rows[max(i,mn+1):mx+1] + ([new_cols[mn]] if max(j,mn) == mn else []) + rows[mx+1:]
#     rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
#     move_row(B,rows_map)
#     rows = new_rows
# else:
#     # new_cols = list(c for c in cols[:max(j,mn)] if c not in [cols[mn]]) + [cols[mx]] + cols[max(j,mn):mx] + [cols[mn]] + cols[mx+1:]
#     # cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
#     # move_col(B,cols_map)
#     # cols = new_cols

#     new_rows = [cols[0]] + list(r for r in rows[:max(i,min(r0,r1))] if r not in [cols[0],cols[mn],cols[mx]]) + [cols[mx]] + list(r for r in rows[max(i,min(r0,r1)):max(r0,r1)] if r not in [cols[mn],cols[mx]]) + [cols[mn]] + list(r for r in rows[max(r0,r1):] if r not in [cols[mn],cols[mx]])
#     rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
#     move_row(B,rows_map)
#     rows = new_rows

In [21]:
B.getrow(i-1).getcol(j-1).data#[0]

array([50], dtype=int32)

In [22]:
ts_cols,ts_rows

([704, 324], {129, 705})

In [23]:
B.getrow(705).getcol(704).data#[0]

array([50], dtype=int32)

In [24]:
B.getrow(705).nonzero()#getcol(2).data#[0]

(array([0, 0]), array([704, 413]))

##### Seed = 1

In [113]:
seed = 1
for record in SeqIO.parse("data/input/Raphanus sativus_NC_018551.1.fasta",'fasta'): seq = record.seq
reads = generate_reads(seq,250,250,50,50,seed=seed,min_coverage=None) 
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
if seed in B_:
    print("Retrieving sparse bipartite adjacency matrix...")
    B = B_[seed].copy() 
else:
    print("Building sparse bipartite adjacency matrix...")
    B = create_bipartite_adjacency_matrix(reads)
    B = coo_matrix((list(B.values()),list(zip(*B.keys())))).T
    if B.shape[0] < len(rows): B = vstack([B,coo_matrix((1,B.shape[1]),dtype=B.dtype)])
    if B.shape[1] < len(cols): B = hstack([B,coo_matrix((B.shape[0], 1),dtype=B.dtype)])
    B_[seed] = B.copy()
print("Commencing sequence construction...")
seq_ = find_lower_diagonal_path(B,reads_map,cols,rows)
if seq_ == seq: print("Sequence reconstruction success!")
else: print("Sequence reconstruction failure.")

Retrieving bipartite graph...
Commencing sequence construction...
Sequence reconstruction failure.


##### Seed = 2

In [191]:
seed = 2
for record in SeqIO.parse("data/input/Raphanus sativus_NC_018551.1.fasta",'fasta'): seq = record.seq
reads = generate_reads(seq,250,250,50,50,seed=seed,min_coverage=None) 
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
if seed in B_:
    print("Retrieving sparse bipartite adjacency matrix...")
    B = B_[seed].copy() 
else:
    print("Building sparse bipartite adjacency matrix...")
    B = create_bipartite_adjacency_matrix(reads)
    B = coo_matrix((list(B.values()),list(zip(*B.keys())))).T
    if B.shape[0] < len(rows): B = vstack([B,coo_matrix((1,B.shape[1]),dtype=B.dtype)])
    if B.shape[1] < len(cols): B = hstack([B,coo_matrix((B.shape[0], 1),dtype=B.dtype)])
    B_[seed] = B.copy()
print("Commencing sequence construction...")
seq_ = find_lower_diagonal_path(B,reads_map,cols,rows)
if seq_ == seq: print("Sequence reconstruction success!")
else: print("Sequence reconstruction failure.")

Retrieving bipartite graph...


In [1]:
import networkx as nx

In [None]:
G = nx.Graph()