In [1]:
from scipy.sparse import coo_matrix, vstack, hstack, save_npz, load_npz #! pip install scipy
from jellyfish import damerau_levenshtein_distance #! pip install jellyfish
import numpy as np #! pip install numpy
from math import ceil, fmod
import random
import networkx as nx

These are utility functions for generating random sequences and for generating reads of sequences.

In [2]:
'''
DESCRIPTION 
    Utility function that chops a sequence into several reads with bounded random lengths that 
    have a bounded random overlap
INPUT
    sequence       | a sequence of characters that will be divided into overlapping subsequences
    min_subseq_len | the shortest length a subsequence can have
    max_subseq_len | the longest length a subsequence can have
    min_overlap    | the shortest overlap two subsequences can share
    max_overlap    | the longest overlap two subsequences can share
    circularize    | boolean indicating whether to add a random amount of the end of the sequence
                   | to the beginning and vice versa
    seed           | random seed for the random function for reproducibility
OUTPUT
    A list of overlapping reads of random bounded size which share a bounded random amount of
    overlap
'''
def generate_reads(sequence,min_subseq_len,max_subseq_len,min_overlap,max_overlap,min_coverage=None,circularise=False,seed=None,shuffle=True,return_unshuffle=False):
    import random

    random.seed(seed)
    if circularise: sequence = sequence[-random.randint(min_overlap,max_overlap):] + sequence + sequence[:random.randint(min_overlap,max_overlap)]
    reads = []
    while 1: 
        start = 0
        end = random.randint(min_subseq_len,max_subseq_len)
        reads += [sequence[start:end]]
        while end < len(sequence):
            start = random.randint(end-max_overlap,end-min_overlap)
            if (len(sequence) - start)/max_subseq_len < 2:
                if (len(sequence) - start)/max_subseq_len < 1:
                    end = len(sequence)
                else:
                    a = 0
                    while (len(sequence) - start)/(min_subseq_len+a) > 2: a+=1
                    end = random.randint(start+min_subseq_len+a,start+max_subseq_len) 
            else: end = random.randint(start+min_subseq_len,start+max_subseq_len) 
            reads += [sequence[start:end]]
        if min_coverage is None or len(set(reads))*(sum(len(read) for read in set(reads))/len(set(reads)))/len(sequence) >= min_coverage:
            if not shuffle: return reads
            reads_ = reads[:]
            random.shuffle(reads_)
            return reads_, list(reads_.index(read) for read in reads)

'''
DESCRIPTION 
    Utility function that creates a random sequence containing only the letters A, T, G, and C
INPUT
    n          | the length of the sequence
    palindrome | a boolean indicating whether the sequence must be a palidrome or not
    seed       | random seed for the random function for reproducibility
OUTPUT
    A random sequence of length n
'''
def generate_genome_sequence(n,palindrome=False,seed=None):
    import random
    
    random.seed(seed)
    nucleotides = {1:'A',2:'C',3:'G',4:'T'}
    seq = ''
    if palindrome: n = ceil(n/2)
    for _ in range(n):
        seq += nucleotides[random.randint(1,4)]
    if palindrome: seq += ''.join(reversed(seq[:int(n-fmod(n,2))]))
    return seq

# De Bruijn Graph

A simple implementation for a De Bruijn Graph assembler. In reality, many statistical, optimisation, and computation techniques are implemented to improve the efficiency and quality of input. Here we use a basic technique to compare to our basic technique. The idea being that with the addition of similar developments, the novel technique presented here might be able to at least match or perhaps surpass this technique.

In [None]:
import networkx as nx

def create_de_bruijn_graph(k: int, sequences: list) -> nx.Graph:
    """
    Create a de Bruijn graph from a set of DNA sequences.
    
    Parameters:
    - k (int): k-mer size
    - sequences (list): List of DNA sequences
    
    Returns:
    - nx.Graph: De Bruijn graph
    """
    graph = nx.DiGraph()

    for sequence in sequences:
        for i in range(len(sequence) - k + 1):
            kmer = sequence[i:i+k]
            prefix = kmer[:-1]
            suffix = kmer[1:]
            
            if not graph.has_edge(prefix, suffix):
                graph.add_edge(prefix, suffix, weight=1)
            else:
                graph[prefix][suffix]['weight'] += 1

    return graph

def eulerian_path(graph: nx.DiGraph) -> list:
    """
    Find an Eulerian path in the given graph.
    
    Parameters:
    - graph (nx.Graph): De Bruijn graph
    
    Returns:
    - list: Eulerian path
    """
    path = []
    
    for node in nx.eulerian_path(graph):
        path.append(node[0])
    
    path.append(node[1])
    
    return ''.join(path)

# Sequitur

These are the methods necessary for implementing the Sequitur assembly technique.

In [3]:
def normalised_damerau_levenshtein_distance(read: str,overlap: str) -> float:
    """
    Find the Damerau-Levenshtein edit distance of two strings normalised to the length
    of the shorter string. This normalisation is because we want to path prefixes to
    suffixes and this means that in general we will be comparing a full string to a
    portion of another string.
    
    Parameters:
    - read (str): string for comparison, usually the longer string 
    - overlap (str): string for comparison, usually the shorter string
    
    Returns:
    - float: the normalised Demarau-Levenshtein edit distance of the input strings
    """
    return damerau_levenshtein_distance(read.__str__()[:min(len(overlap),len(read))],overlap.__str__()[:min(len(overlap),len(read))])/min(len(overlap),len(read))

def build_suffix_array(reads: list, min_suf_len: int = 3) -> tuple:
    suf_arr = []
    for read in reads:
        read += '$' + str(reads.index(read))
        for i in range(len(read)):
            if len(read[i:]) < min_suf_len + 2: continue 
            suf_arr += [read[i:]]
    suf_arr.sort()
    suf_arr_ind = []
    for s in range(len(suf_arr)):
        suf_arr_ind += [int(suf_arr[s].split('$')[-1].__str__())]
        suf_arr[s] = suf_arr[s][:suf_arr[s].find('$')+1]
    return suf_arr,suf_arr_ind

def create_bipartite_adjacency_matrix(reads: list, suf_arr: list = None, suf_arr_ind: list = None, max_diff: float = 0.25, min_suf_len: int = 3) -> dict:
    if suf_arr is None or suf_arr_ind is None: suf_arr,suf_arr_ind = build_suffix_array(reads)
    reads_map = dict(zip(reads,list(range(len(reads)))))
    B = {}
    for read in reads:
        for j in range(min_suf_len + 1):
            i = suf_arr.index(read[j:]+'$') - 1
            while normalised_damerau_levenshtein_distance(read,suf_arr[i][:-1]) <= 0.5:
                if not reads[suf_arr_ind[i]] == read and \
                   normalised_damerau_levenshtein_distance(read,suf_arr[i][:-1]) < max_diff and \
                   read.startswith(suf_arr[i][:-1]):
                    if (reads_map[reads[suf_arr_ind[i]]],reads_map[read]) not in B: B[(reads_map[reads[suf_arr_ind[i]]],reads_map[read])] = len(suf_arr[i][:-1])
                    else: B[(reads_map[reads[suf_arr_ind[i]]],reads_map[read])] = max(len(suf_arr[i][:-1]),B[(reads_map[reads[suf_arr_ind[i]]],reads_map[read])])
                i -= 1
    return B

def move_col(B: coo_matrix, cols: dict) -> None:
    for c in range(len(B.col)):
        B.col[c] = cols[B.col[c]]
            
def move_row(B: coo_matrix,rows: dict) -> None:
    for r in range(len(B.row)):
        B.row[r] = rows[B.row[r]]

def find_lower_diagonal_path(B: coo_matrix,reads_map: dict,cols: list,rows: list) -> tuple:
    argpen = lambda l: np.argpartition(l,-2)[-2]

    new_cols = cols[:]
    if B.sum(axis=0).min() == 0: new_cols = list(c for c in new_cols if c not in [new_cols[B.sum(axis=0).argmin()]]) + [new_cols[B.sum(axis=0).argmin()]]
    if B.sum(axis=1).min() == 0: new_cols = [rows[B.sum(axis=1).argmin()]] + list(c for c in new_cols if c not in [rows[B.sum(axis=1).argmin()]])
    cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
    move_col(B,cols_map)
    cols = new_cols

    new_rows = cols[:]
    rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
    move_row(B,rows_map)
    rows = new_rows

    i,j,k = len(rows), len(cols) - 1, B.sum(axis=1).argmin() if B.sum(axis=1).min() == 0 else None

    while j > (k if B.sum(axis=1).min() == 0 else 0):
        if k is not None and B.getrow(rows.index(cols[j])).argmax() == k: 
            cols_,c_ = [], 0

            while j + c_ + 1 < len(rows):
                c_ += 1
                if len(B.getrow(j+c_).nonzero()[1]) > 1:
                    cols_ = np.argpartition(B.getrow(j+c_).toarray().flatten(),-2)[::-1][:2]
                    if cols[cols_[1]] in cols[:j] and B.getcol(cols_[1]).argmax() == j+c_: break
            
            if j + c_ + 1 == len(cols): new_cols = cols[:k+1] + cols[j:] + cols[k+1:j]
            else: new_cols = cols[:k+1] + cols[j:j+c_] + list(c for c in cols[k+1:j] if c not in [cols[min(cols_)]]) + [cols[min(cols_)]] + cols[j+c_:]
            cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
            move_col(B,cols_map)
            cols = new_cols

            if j + c_ + 1 == len(rows): new_rows = cols[:]
            else: new_rows = cols[:k+c_+1] + list(r for r in rows[k:j+c_] if r not in cols[:k+c_+1] + cols[j+c_:]) + cols[j+c_:]
            rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
            move_row(B,rows_map)
            rows = new_rows

            i,j,k = j + c_ + 1, j + c_, k + c_
        else:
            cmax = B.getrow(rows.index(cols[j])).argmax()
            if len(B.getrow(rows.index(cols[j])).nonzero()[1]) > 1:
                cpen = argpen(B.getrow(rows.index(cols[j])).toarray().flatten()) 
                if cmax > j: 
                    if len(B.getrow(cmax+1).nonzero()[1]) > 1 and \
                    B.getrow(cmax+1).getcol(argpen(B.getrow(cmax+1).toarray().flatten())).data[0] >=  B.getrow(rows.index(cols[j])).getcol(cpen).data[0]: 
                        crange = [argpen(B.getrow(cmax).toarray().flatten()),cmax]
                    else: crange = [cpen]
                else: crange = [cmax]
            else: crange = [cmax]
            while crange[0] > j:
                if len(B.getrow(crange[0]).nonzero()[1]) > 1:
                    crange = [argpen(B.getrow(crange[0]).toarray().flatten())] + crange
                else:
                    crange = [B.getrow(crange[0]).argmax()] + crange

            new_cols = list(c for c in cols[:j] if c not in list(cols[cr] for cr in crange)) + list(cols[cr] for cr in crange) + list(c for c in cols[j:] if c not in list(cols[cr] for cr in crange))
            cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
            move_col(B,cols_map)
            cols = new_cols

            new_rows = list(r for r in rows[:i] if r not in cols[j:]) + cols[j:]
            rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
            move_row(B,rows_map)
            rows = new_rows
        j -= 1
        i -= 1

    seq = ''
    for s,d in zip(list(reads_map[k] for k in rows)[:-1],B.diagonal(-1)):
        seq += s[:-d]
    seq += list(reads_map[k] for k in rows)[-1]
    return seq

In [4]:
seed = 0
seq = 'betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better'
reads = ['betty_bought_butter_th',
                        'tter_the_butter_was_',
                              'he_butter_was_bitter_',
                                         'as_bitter_betty_bought',
                                                     'tty_bought_better_butter_t',
                                                           'ught_better_butter_to',
                                                                     'r_butter_to_make_the_',
                                                                                   'ke_the_bitter_butter_better']
random.seed(seed)
random.shuffle(reads)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
B = create_bipartite_adjacency_matrix(reads)
B = coo_matrix((list(B.values()),list(zip(*B.keys())))).T
if B.shape[0] < len(rows): B = vstack([B,coo_matrix((1,B.shape[1]),dtype=B.dtype)])
if B.shape[1] < len(cols): B = hstack([B,coo_matrix((B.shape[0], 1),dtype=B.dtype)])
find_lower_diagonal_path(B,reads_map,cols,rows)

'betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better'

In [5]:
seq = 'you say hello world, i bellow go to hell'
reads = ['you say hel',
            ' say hello wo',
                    'lo world, i be',
                          'ld, i bellow go t',
                                    'ow go to hell']
random.seed(seed)
random.shuffle(reads)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
B = create_bipartite_adjacency_matrix(reads)
B = coo_matrix((list(B.values()),list(zip(*B.keys())))).T
if B.shape[0] < len(rows): B = vstack([B,coo_matrix((1,B.shape[1]),dtype=B.dtype)])
if B.shape[1] < len(cols): B = hstack([B,coo_matrix((B.shape[0], 1),dtype=B.dtype)])
find_lower_diagonal_path(B,reads_map,cols,rows)

'you say hello world, i bellow go to hell'

In [6]:
seq = 'she_sells_sea_shells_on_the_sea_shore'
reads = ['she_sells_s',
               'lls_sea_shel',
                    'ea_shells_o',
                       'shells_on_the_s',
                                  'he_sea_s',
                                      'ea_shore']
random.seed(seed)
random.shuffle(reads)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
B = create_bipartite_adjacency_matrix(reads)
B = coo_matrix((list(B.values()),list(zip(*B.keys())))).T
if B.shape[0] < len(rows): B = vstack([B,coo_matrix((1,B.shape[1]),dtype=B.dtype)])
if B.shape[1] < len(cols): B = hstack([B,coo_matrix((B.shape[0], 1),dtype=B.dtype)])
find_lower_diagonal_path(B,reads_map,cols,rows)

'she_sells_sea_shells_on_the_sea_shore'

In [7]:
successes = 0
n = 50
for seed in range(n):  
    seq = generate_genome_sequence(10000,seed=seed)
    reads, reads_order = generate_reads(seq,250,500,50,100,seed=seed)
    reads_map = dict(zip(list(range(len(reads))),reads))
    rows = list(range(len(reads)))
    cols = list(range(len(reads)))
    B = create_bipartite_adjacency_matrix(reads)
    B = coo_matrix((list(B.values()),list(zip(*B.keys())))).T
    if B.shape[0] < len(rows): B = vstack([B,coo_matrix((1,B.shape[1]),dtype=B.dtype)])
    if B.shape[1] < len(cols): B = hstack([B,coo_matrix((B.shape[0], 1),dtype=B.dtype)])
    seq_ = find_lower_diagonal_path(B,reads_map,cols,rows)
    s = '| Seed: ' + str(seed) + ' | '
    if seq_ == seq:
        s+='SUC | ' + seq_ + ' == ' + seq
        successes+=1
    else: 
        s+='FAI | ' + seq_ + ' != ' + seq
        print(s)
        break
    print(s)
    print('-----------------------------------------')
print('ACCURACY: '+str((successes/n)*100)+'%')

| Seed: 0 | SUC | TTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGACCCCTAAGTAGGAGCGTATGCGCCCAGTAACCAATGCCTGTTGAGATGCCAGACGCGTAACCAAAACATAGAAACCATCAATAGACAGGTCATAATCGGTCCACCGGATCATTGGTGCATAGAGCCTGGGCGTTAACGCCCTTTATTACTAGCTTAATGGTATCACATTGACAAACACGGCATTAAGTAGCGACGAAACGGGATTTGCCTGACCGGGGAGAAGCCGGTCGATCAGCAGTGGTAATTGGATATTAGGCCTAAACCATAATGTTCTAGCGCTCGAAATCATTGCACCACTTGCATCTTTGTTCCAGGGACGCTGTAAAACCAGATGCCTGTAAATCGTTTCAACGGGATGGTTTACCCGGAATTCTACGTATTTAATCAACGAGCTTAATGAGCTGACATTGCTGAAATGACCATGACTTAATAATCATTTATGGAGAAGAGGCACGACCACAAGGACCCTATGGCACGGTGGGCAAGCTCCCGCCCGGTACATAACTGTCTGGACTGATTATGTCGGTACAGACTTCTTCCTGCGTATCGATTACGAGCTTATCTGAAGAAGTTTAGGGCAAAGGGACCATGGCCATTGGTGCCAATTTCGGTTCTTGTATGCTACAGTTAAATAGAAAGGCCGCATTGTCGTTCTCGCCCTGTTTTCCTCATACACGACCGAGGTTATTTGTCGGAAACGAGACATCTCTCGAAGGTGGAACGACGCCGGGTGTGCAGAATTTATTTTAAACACTCTATTACCTCCGGGTAGCGTTGGCAAACTCCGATAATGAGCGCCAGGCGTGCCAGGACTCCACCTCCCCTGCTAAGTTGACCTTGAGCTCGGTACAGCGTCGGCGAGACGATAACAACGAAGTCCTTCGGCGTTATGTAATTCACCAGCCCACCATATCAGGTAATAGGCTCGCTGGTTAGGTAGATT

    SUC: returns the target sequence fully reconstructed
    PAR: returns contigs all of which exist in the target sequence (consider coverage?)
    FAI: returns a full sequence that is incorrectly reconstructed or a set of contigs where at least one is not found in the target sequence

In [8]:
from Bio import SeqIO #! pip install Bio
# import pandas as pd #! pip install pandas

##### Seed = 0

In [9]:
seed = 0
for record in SeqIO.parse("data/input/Raphanus sativus_NC_018551.1.fasta",'fasta'): seq = record.seq
reads,reads_order = generate_reads(seq,250,250,50,50,seed=seed,min_coverage=None)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
try:
    print("Retrieving stored sparse bipartite adjacency matrix...")
    B = load_npz('data/input/matrices/seed_'+str(seed)+'.npz')
except:
    print("No stored sparse bipartite adjacency matrix found.")
    print("Building sparse bipartite adjacency matrix...")
    B = create_bipartite_adjacency_matrix(reads)
    B = coo_matrix((list(B.values()),list(zip(*B.keys())))).T
    if B.shape[0] < len(rows): B = vstack([B,coo_matrix((1,B.shape[1]),dtype=B.dtype)])
    if B.shape[1] < len(cols): B = hstack([B,coo_matrix((B.shape[0], 1),dtype=B.dtype)])
    save_npz('data/input/matrices/seed_'+str(seed)+'.npz', B)
print("Commencing sequence construction...")
if seq == find_lower_diagonal_path(B,reads_map,cols,rows): print("Sequence reconstruction successful.")
else: print("Sequence reconstruction unsuccessful.")

Retrieving stored sparse bipartite adjacency matrix...
Commencing sequence construction...
Sequence reconstruction successful.


##### Seed = 1

In [10]:
seed = 1
for record in SeqIO.parse("data/input/Raphanus sativus_NC_018551.1.fasta",'fasta'): seq = record.seq
reads,reads_order = generate_reads(seq,250,250,50,50,seed=seed,min_coverage=None)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
try:
    print("Retrieving stored sparse bipartite adjacency matrix...")
    B = load_npz('data/input/matrices/seed_'+str(seed)+'.npz')
except:
    print("No stored sparse bipartite adjacency matrix found.")
    print("Building sparse bipartite adjacency matrix...")
    B = create_bipartite_adjacency_matrix(reads)
    B = coo_matrix((list(B.values()),list(zip(*B.keys())))).T
    if B.shape[0] < len(rows): B = vstack([B,coo_matrix((1,B.shape[1]),dtype=B.dtype)])
    if B.shape[1] < len(cols): B = hstack([B,coo_matrix((B.shape[0], 1),dtype=B.dtype)])
    save_npz('data/input/matrices/seed_'+str(seed)+'.npz', B)
print("Commencing sequence construction...")
if seq == find_lower_diagonal_path(B,reads_map,cols,rows): print("Sequence reconstruction successful.")
else: print("Sequence reconstruction unsuccessful.")

Retrieving stored sparse bipartite adjacency matrix...
No stored sparse bipartite adjacency matrix found.
Building sparse bipartite adjacency matrix...
Commencing sequence construction...
Sequence reconstruction successful.


##### Seed = 2

In [11]:
seed = 2
for record in SeqIO.parse("data/input/Raphanus sativus_NC_018551.1.fasta",'fasta'): seq = record.seq
reads,reads_order = generate_reads(seq,250,250,50,50,seed=seed,min_coverage=None)
reads_map = dict(zip(list(range(len(reads))),reads))
rows = list(range(len(reads)))
cols = list(range(len(reads)))
try:
    print("Retrieving stored sparse bipartite adjacency matrix...")
    B = load_npz('data/input/matrices/seed_'+str(seed)+'.npz')
except:
    print("No stored sparse bipartite adjacency matrix found.")
    print("Building sparse bipartite adjacency matrix...")
    B = create_bipartite_adjacency_matrix(reads)
    B = coo_matrix((list(B.values()),list(zip(*B.keys())))).T
    if B.shape[0] < len(rows): B = vstack([B,coo_matrix((1,B.shape[1]),dtype=B.dtype)])
    if B.shape[1] < len(cols): B = hstack([B,coo_matrix((B.shape[0], 1),dtype=B.dtype)])
    save_npz('data/input/matrices/seed_'+str(seed)+'.npz', B)
print("Commencing sequence construction...")
if seq == find_lower_diagonal_path(B,reads_map,cols,rows): print("Sequence reconstruction successful.")
else: print("Sequence reconstruction unsuccessful.")

Retrieving stored sparse bipartite adjacency matrix...
No stored sparse bipartite adjacency matrix found.
Building sparse bipartite adjacency matrix...
Commencing sequence construction...
Sequence reconstruction successful.


# De Bruijn Graph

In [12]:


# Example usage:
k = 3
sequences = ['ATGCG', 'GCATG', 'CATGC', 'AGGCA', 'GGCAT']

# Create de Bruijn graph
graph = create_de_bruijn_graph(k, sequences)

# Find Eulerian path
assembly = eulerian_path(graph)

print("De Bruijn Graph:")
print(graph.edges())
print("\nAssembled Sequence:")
print(assembly)


De Bruijn Graph:
[('AT', 'TG'), ('TG', 'GC'), ('GC', 'CG'), ('GC', 'CA'), ('CA', 'AT'), ('AG', 'GG'), ('GG', 'GC')]

Assembled Sequence:
AGGGGCCAATTGGCCG
