In [1]:
from jellyfish import damerau_levenshtein_distance
from math import ceil, fmod
import pandas as pd
import numpy as np
import scipy as sp
import random

In [22]:
'''
DESCRIPTION 
    Utility function that chops a sequence into several reads with bounded random lengths that 
    have a bounded random overlap
INPUT
    sequence       | a sequence of characters that will be divided into overlapping subsequences
    min_subseq_len | the shortest length a subsequence can have
    max_subseq_len | the longest length a subsequence can have
    min_overlap    | the shortest overlap two subsequences can share
    max_overlap    | the longest overlap two subsequences can share
    circularize    | boolean indicating whether to add a random amount of the end of the sequence
                   | to the beginning and vice versa
    seed           | random seed for the random function for reproducibility
OUTPUT
    A list of overlapping reads of random bounded size which share a bounded random amount of
    overlap
'''
def generate_reads(sequence,min_subseq_len,max_subseq_len,min_overlap,max_overlap,min_coverage=None,circularise=False,seed=None,shuffle=True):
    import random

    random.seed(seed)
    if circularise: sequence = sequence[-random.randint(min_overlap,max_overlap):] + sequence + sequence[:random.randint(min_overlap,max_overlap)]
    reads = []
    while 1: 
        start = 0
        end = random.randint(min_subseq_len,max_subseq_len)
        reads += [sequence[start:end]]
        while end < len(sequence):
            start = random.randint(end-max_overlap,end-min_overlap)
            if (len(sequence) - start)/max_subseq_len < 2:
                if (len(sequence) - start)/max_subseq_len < 1:
                    end = len(sequence)
                else:
                    a = 0
                    while (len(sequence) - start)/(min_subseq_len+a) > 2: a+=1
                    end = random.randint(start+min_subseq_len+a,start+max_subseq_len) 
            else: end = random.randint(start+min_subseq_len,start+max_subseq_len) 
            reads += [sequence[start:end]]
        if min_coverage is None or len(set(reads))*(sum(len(read) for read in set(reads))/len(set(reads)))/len(sequence) >= min_coverage:
            if not shuffle: return reads
            random.shuffle(reads)
            return reads
        # if min_coverage is None or len(set(reads))*(sum(len(read) for read in set(reads))/len(set(reads)))/len(sequence) >= min_coverage: return list(set(reads))

'''
DESCRIPTION 
    Utility function that creates a random sequence containing only the letters A, T, G, and C
INPUT
    n          | the length of the sequence
    palindrome | a boolean indicating whether the sequence must be a palidrome or not
    seed       | random seed for the random function for reproducibility
OUTPUT
    A random sequence of length n
'''
def generate_genome_sequence(n,palindrome=False,seed=None):
    import random
    
    random.seed(seed)
    nucleotides = {1:'A',2:'C',3:'G',4:'T'}
    seq = ''
    if palindrome: n = ceil(n/2)
    for _ in range(n):
        seq += nucleotides[random.randint(1,4)]
    if palindrome: seq += ''.join(reversed(seq[:int(n-fmod(n,2))]))
    return seq

# Sequitur

In [36]:
def move_col(B,cols):
    for c in range(len(B.col)):
        B.col[c] = cols[B.col[c]]
    # for c in range(len(B.col)):
    #     if B.col[c] > origin and B.col[c] <= destination:
    #         B.col[c] -= 1
    #     elif B.col[c] == origin:
    #         B.col[c] = destination
            
def move_row(B,rows):
    for r in range(len(B.row)):
        B.row[r] = rows[B.row[r]]
    # for r in range(len(B.row)):
    #     if B.row[r] > origin and B.row[r] <= destination:
    #         B.row[r] -= 1 
    #     elif B.row[r] == origin :
    #         B.row[r] = destination

def normalised_damerau_levenshtein_distance(read,overlap):
    return damerau_levenshtein_distance(read.__str__()[:min(len(overlap),len(read))],overlap.__str__()[:min(len(overlap),len(read))])/min(len(overlap),len(read))

def build_suffix_array(reads,min_suf_len=3):
    suf_arr = []
    for read in reads:
        read += '$' + str(reads.index(read))
        for i in range(len(read)):
            if len(read[i:]) < min_suf_len + 2: continue 
            suf_arr += [read[i:]]
    suf_arr.sort()
    suf_arr_ind = []
    for s in range(len(suf_arr)):
        suf_arr_ind += [int(suf_arr[s].split('$')[-1].__str__())]
        suf_arr[s] = suf_arr[s][:suf_arr[s].find('$')+1]
    return suf_arr,suf_arr_ind

def create_bipartite_adjacency_matrix(reads,suf_arr=None,suf_arr_ind=None,max_diff=0.25,min_suf_len=3):
    if suf_arr is None or suf_arr_ind is None: suf_arr,suf_arr_ind = build_suffix_array(reads)
    reads_map = dict(zip(reads,list(range(len(reads)))))
    B = {}
    for read in reads:
        B[reads_map[read]] = dict(zip(list(range(len(reads))),[0]*len(reads)))
        for j in range(min_suf_len + 1):
            i = suf_arr.index(read[j:]+'$') - 1
            while normalised_damerau_levenshtein_distance(read,suf_arr[i][:-1]) <= 0.5:
                if not reads[suf_arr_ind[i]] == read and \
                   normalised_damerau_levenshtein_distance(read,suf_arr[i][:-1]) < max_diff and \
                   read.startswith(suf_arr[i][:-1]):
                    B[reads_map[read]][reads_map[reads[suf_arr_ind[i]]]] = max(len(suf_arr[i][:-1]),B[reads_map[read]][reads_map[reads[suf_arr_ind[i]]]])
                i -= 1
    return B

def find_lower_diagonal_path(B,reads_map):
    '''
    Description: Finds a lower diagonal matrix from a matrix generated by a bipartite
                 graph by maintaining two conditions:
                    1. The column of all zeros must be the very last column in the matrix
                    2. The row-name-index of the ordinally first row must be the same as
                            the column-name-index of the ordinally first column
                 Simultaneously it seeks a third condition:
                    3. The (j+1)-th ordinal column must have the same column-name-index 
                            as the j-th ordinal row's row-name-index
                 Once this last condition is met, the algorithm returns.
    '''
    # initialisation 
    cols = list(B.columns) 
    rows = list(B.index)
    # send column with minimum sum of elements to the end
    cols = list(c for c in cols if c != cols[B.sum().argmin()]) + [cols[B.sum().argmin()]]
    
    print("cols:",cols)
    print("rows:",rows)
    print()
    B = B[cols]
    i,j = B.shape
    j -= 1
    while j > 0:
        # move column
        cols = list(c for c in cols if c not in [cols[B.loc[cols[j],:].argmax()]] + cols[j:]) + [cols[B.loc[cols[j],:].argmax()]] + cols[j:]
        B = B[cols]
        # move row
        rows = [cols[0]] + list(r for r in rows if r not in [cols[0],cols[j]] + rows[i:]) + [cols[j]] + rows[i:]
        print("cols:",cols)
        print("rows:",rows)
        print()
        B = B.loc[rows,:]
        j -= 1
        i -= 1
    # return B
    seq = ''
    for s,d in zip(list(reads_map[k] for k in B.index)[:-1],np.diagonal(B,-1)):
        seq += s[:-d]
    seq += list(reads_map[k] for k in B.index)[-1]
    return seq

# def create_bipartite_adjacency_matrix(reads,suf_arr=None,suf_arr_ind=None,max_diff=0.25,min_suf_len=3):
#     if suf_arr is None or suf_arr_ind is None: suf_arr,suf_arr_ind = build_suffix_array(reads)
#     reads_map = dict(zip(reads,list(range(len(reads)))))
#     B = {}
#     for read in reads:
#         for j in range(min_suf_len + 1):
#             i = suf_arr.index(read[j:]+'$') - 1
#             while normalised_damerau_levenshtein_distance(read,suf_arr[i][:-1]) <= 0.5:
#                 if not reads[suf_arr_ind[i]] == read and \
#                    normalised_damerau_levenshtein_distance(read,suf_arr[i][:-1]) < max_diff and \
#                    read.startswith(suf_arr[i][:-1]):
#                     if (reads_map[reads[suf_arr_ind[i]]],reads_map[read]) not in B: B[(reads_map[reads[suf_arr_ind[i]]],reads_map[read])] = len(suf_arr[i][:-1])
#                     else: B[(reads_map[reads[suf_arr_ind[i]]],reads_map[read])] = max(len(suf_arr[i][:-1]),B[(reads_map[reads[suf_arr_ind[i]]],reads_map[read])])
#                 i -= 1
#     return B

# def find_lower_diagonal_path(B,reads_map,cols,rows):
#     # initialisation
#     new_cols = list(c for c in cols if c != cols[B.sum(axis=0).argmin()]) + [cols[B.sum(axis=0).argmin()]]
#     cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
#     move_col(B,cols_map)
#     cols = new_cols
#     rows_map = dict((r,r) for r in range(len(rows)))

#     i,j = len(rows), len(cols) - 1
#     while j > 0:
#         # move col
#         new_cols = list(c for c in cols if c not in [cols[B.getrow(rows.index(cols[j])).argmax()]] + cols[j:]) + [cols[B.getrow(rows.index(cols[j])).argmax()]] + cols[j:]
#         cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
#         move_col(B,cols_map)
#         cols = new_cols
#         # move row
#         new_rows = [cols[0]] + list(r for r in rows if r not in [cols[0],cols[j]] + rows[i:]) + [cols[j]] + rows[i:]
#         rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
#         move_row(B,rows_map)
#         rows = new_rows
#         j -= 1
#         i -= 1
#     # return B
#     seq = ''
#     for s,d in zip(list(reads_map[k] for k in rows),B.diagonal(-1)):
#         seq += s[:-d] 
#     seq += list(reads_map[k] for k in rows)[-1]
#     return seq

In [17]:
seed = 0
seq = 'betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better'
reads = ['betty_bought_butter_th',
                        'tter_the_butter_was_',
                              'he_butter_was_bitter_',
                                         'as_bitter_betty_bought',
                                                     'tty_bought_better_butter_t',
                                                           'ught_better_butter_to',
                                                                     'r_butter_to_make_the_',
                                                                                   'ke_the_bitter_butter_better']
random.seed(seed)
random.shuffle(reads)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
B = create_bipartite_adjacency_matrix(reads)
B = sp.sparse.coo_matrix((list(B.values()),list(zip(*B.keys())))).T
find_lower_diagonal_path(B,reads_map,cols,rows)

'betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better'

In [20]:
seq = 'you say hello world, i bellow go to hell'
reads = ['you say hel',
            ' say hello wo',
                    'lo world, i be',
                          'ld, i bellow go t',
                                    'ow go to hell']
random.seed(seed)
random.shuffle(reads)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
B = create_bipartite_adjacency_matrix(reads)
B = sp.sparse.coo_matrix((list(B.values()),list(zip(*B.keys())))).T
find_lower_diagonal_path(B,reads_map,cols,rows)

'you say hello world, i bellow go to hell'

In [21]:
seq = 'she_sells_sea_shells_on_the_sea_shore'
reads = ['she_sells_s',
               'lls_sea_shel',
                    'ea_shells_o',
                       'shells_on_the_s',
                                  'he_sea_s',
                                      'ea_shore']
random.seed(seed)
random.shuffle(reads)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
B = create_bipartite_adjacency_matrix(reads)
B = sp.sparse.coo_matrix((list(B.values()),list(zip(*B.keys())))).T
find_lower_diagonal_path(B,reads_map,cols,rows)

'she_sells_sea_shells_on_the_sea_shore'

In [37]:
seed = 0

seq = generate_genome_sequence(10000,seed=seed)
reads = generate_reads(seq,250,500,50,100,seed=seed)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
B = create_bipartite_graph(reads)
B = sp.sparse.coo_matrix((list(B.values()),list(zip(*B.keys())))).T
find_lower_diagonal_path(B,reads_map,cols,rows)

KeyboardInterrupt: 

In [23]:
successes = 0
n = 50
for seed in range(n):  
    seq = generate_genome_sequence(10000,seed=seed)
    reads = generate_reads(seq,250,500,50,100,seed=seed)
    reads_map = dict(zip(list(range(len(reads))),reads))
    rows = list(range(len(reads)))
    cols = list(range(len(reads)))
    B = create_bipartite_adjacency_matrix(reads)
    B = sp.sparse.coo_matrix((list(B.values()),list(zip(*B.keys())))).T
    seq_ = find_lower_diagonal_path(B,reads_map,cols,rows)
    s = '| Seed: ' + str(seed) + ' | '
    if seq_ == seq:
        s+='SUC | ' + seq_ + ' == ' + seq
        successes+=1
    else: 
        s+='FAI | ' + seq_ + ' != ' + seq
        print(s)
        break
    print(s)
    print('-----------------------------------------')
print('ACCURACY: '+str((successes/n)*100)+'%')

ValueError: 30 is not in list

    SUC: returns the target sequence fully reconstructed
    PAR: returns contigs all of which exist in the target sequence (consider coverage?)
    FAI: returns a full sequence that is incorrectly reconstructed or a set of contigs where at least one is not found in the target sequence

In [154]:
#! pip install Bio
from Bio import SeqIO

try:
    B_
except:    
    B_ = {}

##### Seed = 0

In [292]:
seed = 0
for record in SeqIO.parse("data/input/Raphanus sativus_NC_018551.1.fasta",'fasta'): seq = record.seq
reads = generate_reads(seq,250,250,50,50,seed=seed,min_coverage=None) 
random.shuffle(reads)
reads_map = dict(zip(reads,list(range(len(reads)))))
reads_map_key = dict(zip(list(range(len(reads))),reads))
if seed in B_:
    print("Retrieving bipartite adjacency matrix...")
    B = B_[seed].copy() 
else:
    print("Building bipartite adjacency matrix...")
    B = pd.DataFrame(create_bipartite_adjacency_matrix(reads)).T
    B_[seed] = B.copy()
print("Commencing sequence construction...")
seq_ = find_lower_diagonal_path(B,reads_map_key)
if seq_ == seq: print("Sequence reconstruction success!")
else: print("Sequence reconstruction failure.")

Retrieving bipartite graph...
Commencing sequence construction...


In [296]:
set(n[1:] if n.startswith('^') else n[:-1] for n in B).intersection(G)

{Seq('AACTATAGAAAGTTGTGGTTGGTTGTTGACCAACAAAAGCATGGGAGAAAACCA...AAA'),
 Seq('AGGAGTAGTGAAGAACTATAGAAAGTTGTGGTTGGTTGTTGACCAACAAAAGCA...GTT'),
 Seq('GAAAGATCGTTTTTCGAAACTATCAATTTCATAAGAGAAGAAAGATCGTTTTTC...AAA'),
 Seq('GGAGAGCCAATGGCGAAACCAAAGAGAGAGGCGGCCTGGTGGGGACACCACGAT...GTG'),
 Seq('GGGGACACCACGATACGATAGGGGATAAGGGCGAAGTTAAGTAAGGGCTCCTCG...CTT'),
 Seq('TATGGATCACTGGAACTTGATTTCAAACTATAATCGTTAATAGTCGTCGTGGAT...AAA'),
 Seq('TATTGCATTCCTCTCACAAACTATCAATTTCATAAGAGAAGAAAGATCGTTTTT...AGT'),
 Seq('TTTTCATTTTAAACAGCAAGGTAGTTGTAGGAATCTCTTTTTAAAATCAAAAAG...AAA')}

In [295]:
len(G)

1200

In [222]:
list(p[-B.edges[('^'+p,s+'$')]['weight']:] in seq for p,s in G.edges if B.edges[('^'+p,s+'$')]['weight'] != 50)
# all(p[:-B.edges[('^'+p,s+'$')]['weight']]+s in seq for p,s in G.edges)
# list(n for n,d in G.in_degree if d==0)

[]

In [164]:
is_connected(G.to_undirected())

False

In [91]:
len(reads)

1292

In [112]:
list((kmer,count) for kmer,count in coverage.items() if count>1)

[(Seq('AACAGT'), 2),
 (Seq('ACAGT'), 2),
 (Seq('CAGT'), 3),
 (Seq('AGT'), 17),
 (Seq('TTTCTAT'), 2),
 (Seq('TTCTAT'), 3),
 (Seq('TCTAT'), 4),
 (Seq('CTAT'), 8),
 (Seq('TAT'), 28),
 (Seq('AGCCCT'), 3),
 (Seq('GCCCT'), 5),
 (Seq('CCCT'), 10),
 (Seq('CCT'), 22),
 (Seq('CAAGG'), 3),
 (Seq('AAGG'), 9),
 (Seq('AGG'), 23),
 (Seq('CTTTC'), 6),
 (Seq('TTTC'), 11),
 (Seq('TTC'), 21),
 (Seq('TTTTCA'), 2),
 (Seq('TTTCA'), 5),
 (Seq('TTCA'), 11),
 (Seq('TCA'), 34),
 (Seq('ATTT'), 4),
 (Seq('TTT'), 36),
 (Seq('TACTA'), 3),
 (Seq('ACTA'), 4),
 (Seq('CTA'), 22),
 (Seq('AAATGGA'), 2),
 (Seq('AATGGA'), 2),
 (Seq('ATGGA'), 2),
 (Seq('TGGA'), 8),
 (Seq('GGA'), 26),
 (Seq('GTCC'), 4),
 (Seq('TCC'), 29),
 (Seq('CGTGG'), 2),
 (Seq('GTGG'), 4),
 (Seq('TGG'), 24),
 (Seq('GG'), 71),
 (Seq('TTTTG'), 3),
 (Seq('TTTG'), 8),
 (Seq('TTG'), 19),
 (Seq('TG'), 66),
 (Seq('AGGTGT'), 2),
 (Seq('GGTGT'), 2),
 (Seq('GTGT'), 3),
 (Seq('TGT'), 13),
 (Seq('GT'), 59),
 (Seq('ACAG'), 6),
 (Seq('CAG'), 16),
 (Seq('AG'), 85),
 (S

In [298]:
outgoing = list(n[1:] for n in b if not B.nodes[n]['bipartite'])
incoming = list(n[:-1] for n in b if B.nodes[n]['bipartite'])

In [299]:
key = dict(zip(outgoing+incoming,list(range(len(outgoing+incoming)))))
unkey = dict((n,r) for r,n in key.items())

In [300]:
df = pd.DataFrame(np.zeros((len(outgoing),len(incoming))),columns=list(key[node] for node in outgoing),index=list(key[node] for node in incoming))
for row in df.index:
    for col in df.columns:
        df.at[row,col] = b.edges[('^'+unkey[col],unkey[row]+'$')]['weight'] if ('^'+unkey[col],unkey[row]+'$') in b.edges else 0
df

Unnamed: 0,174,163,157,105,179,137,121,7,8,9,...,186,138,187,139,188,140,189,141,190,142
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,37.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,50.0,10.0,0.0,210.0,0.0,0.0,0.0,0.0,0.0,0.0
188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,50.0,10.0,0.0,210.0,0.0,0.0,0.0,0.0
189,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,50.0,10.0,0.0,210.0,0.0,0.0
190,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,50.0,10.0,0.0,210.0


In [301]:
p,_,_ = sp.linalg.lu(df,p_indices=True)
df = df.iloc[:,p]
df

##### Seed = 1

In [113]:
seed = 1
for record in SeqIO.parse("data/input/Raphanus sativus_NC_018551.1.fasta",'fasta'): seq = record.seq
reads = generate_reads(seq,250,250,50,50,seed=seed,min_coverage=None) 
if seed in B_:
    print("Retrieving bipartite graph...")
    B = B_[seed].copy() 
else:
    print("Building bipartite graph...")
    B = build_bipartite_graph(reads)
    B_[seed] = B.copy()
G = nx.DiGraph()
print("Commencing sequence construction...")
try:
    seq_, G_ = sequitur(G,B,reads,random_initial_edge(G,B,seed=seed),seed=seed,true_sequence=seq)
    if seq_ == seq: print("Sequence reconstruction success!")
    else: print("Sequence reconstruction failure.")
except: print("Sequence reconstruction failure.")

Retrieving bipartite graph...
Commencing sequence construction...
Sequence reconstruction failure.


##### Seed = 2

In [191]:
seed = 2
for record in SeqIO.parse("data/input/Raphanus sativus_NC_018551.1.fasta",'fasta'): seq = record.seq
reads = generate_reads(seq,250,250,50,50,seed=seed,min_coverage=None) 
if seed in B_:
    print("Retrieving bipartite graph...")
    B = B_[seed].copy() 
else:
    print("Building bipartite graph...")
    B = build_bipartite_graph(reads)
    B_[seed] = B.copy()
# G = nx.DiGraph()
# print("Commencing sequence construction...")
# # try:
# seq_, G_ = sequitur(G,B,reads,random_initial_edge(G,B,seed=seed),seed=seed,true_sequence=seq)
# if seq_ == seq: print("Sequence reconstruction success!")
# else: print("Sequence reconstruction failure.")
# # except: print("Sequence reconstruction failure.")

Retrieving bipartite graph...


In [609]:
# def sequitur_2(B):
g = DiGraph()
g.add_edges_from(list((e[0][1:],e[1][:-1]) for e in B.edges if B.out_degree(e[0])==1 and B.in_degree(e[1])==1))
b = subgraph_view(B,filter_node=show_nodes(set(n for n,d in B.in_degree if d==1).union(n for n,d in B.out_degree if d==1)))
zero_out = list(n for n in g.nodes if '^'+n in b and b.out_degree['^'+n]==1)
while len(zero_out):
    node = '^'+zero_out.pop()
    g.add_edge(node[1:],list(b.successors(node))[0][:-1])
    b = subgraph_view(B,filter_node=hide_nodes(set('^'+n for n,d in g.out_degree if d>0).union(n+'$' for n,d in g.in_degree if d>0)))
    zero_out = list(n for n in g.nodes if '^'+n in b and b.out_degree['^'+n]==1)
zero_in = list(n for n in g.nodes if n+'$' in b and b.in_degree[n+'$']==1)
while len(zero_in):
    node = zero_in.pop()+'$'
    g.add_edge(list(b.predecessors(node))[0][1:],node[:-1])
    b = subgraph_view(B,filter_node=hide_nodes(set('^'+n for n,d in g.out_degree if d>0).union(n+'$' for n,d in g.in_degree if d>0)))
    zero_in = list(n for n in g.nodes if n+'$' in b and b.in_degree[n+'$']==1)
    # for c in connected_components(b.to_undirected()):
    #     for l,r in nx.bipartite.maximum_matching(b.subgraph(c)).items():
    #         g.add_edge(l[1:] if l.startswith('^') else r[1:],r[:-1] if l.startswith('^') else l[:-1])
    # return g,b

In [587]:
seq = generate_genome_sequence(10000,seed=seed)
reads = generate_reads(seq,250,500,50,100,seed=seed)
B = build_bipartite_graph(reads)

In [588]:
nodes = reads.copy()
rows = []
random.shuffle(nodes)
for c in nodes:
    rows += [list(int(('^'+c,a+'$') in B.edges) for a in nodes)]


In [35]:
np.eye(4)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])