In [1]:
import time, os, random
from typing import Union, Tuple
from scipy.sparse import coo_matrix, vstack, hstack, save_npz, load_npz # pip install scipy
import numpy as np # pip install numpy
from math import ceil, fmod
import networkx as nx # pip install networkx
import pylcs # pip install pylcs
from Bio import SeqIO # pip install Bio
from Bio.Seq import Seq
from fastDamerauLevenshtein import damerauLevenshtein as damerau_levenshtein_distance # pip install fastDamerauLevenshtein

In [12]:
# Run wgsim without arguments so that it prints its usage and option summary.
! wgsim



Program: wgsim (short read simulator)
Version: 1.20
Contact: Heng Li <lh3@sanger.ac.uk>

Usage:   wgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>

Options: -e FLOAT      base error rate [0.020]
         -d INT        outer distance between the two ends [500]
         -s INT        standard deviation [50]
         -N INT        number of read pairs [1000000]
         -1 INT        length of the first read [70]
         -2 INT        length of the second read [70]
         -r FLOAT      rate of mutations [0.0010]
         -R FLOAT      fraction of indels [0.15]
         -X FLOAT      probability an indel is extended [0.30]
         -S INT        seed for random generator [0, use the current time]
         -A FLOAT      discard if the fraction of ambiguous bases higher than FLOAT [0.05]
         -h            haplotype mode



In [13]:
# Simulate 300-base read pairs (-1 300 -2 300) from the reference FASTA with every
# source of variation switched off: mutation rate (-r), indel fraction (-R), indel
# extension probability (-X) and base error rate (-e) are all 0, the standard
# deviation (-s) is 0, and the random generator seed (-S) is fixed at 420.
! wgsim -r 0 -R 0 -X 0 -e 0 -s 0 -S 420 -1 300 -2 300 '/workspaces/Sequitur/data/input/Raphanus sativus_NC_018551.1/Raphanus sativus_NC_018551.1.fasta' '/workspaces/Sequitur/data/input/Raphanus sativus_NC_018551.1/300.0.1.fastq' '/workspaces/Sequitur/data/input/Raphanus sativus_NC_018551.1/300.0.2.fastq'

[wgsim] seed = 420
[wgsim_core] calculating the total length of the reference sequence...
[wgsim_core] 1 sequences, total length: 258426


In [20]:
# Sanity check: does any simulated second-mate read (or its reverse complement)
# coincide with the very start of the reference sequence?
reference_fasta = "/workspaces/Sequitur/data/input/Raphanus sativus_NC_018551.1/Raphanus sativus_NC_018551.1.fasta"
mate2_fastq = "/workspaces/Sequitur/data/input/Raphanus sativus_NC_018551.1/300.0.2.fastq"

# Keep the sequence of the last record in the FASTA file.
for record in SeqIO.parse(reference_fasta, 'fasta'):
	seq = record.seq

any(
	seq.startswith(candidate)
	for read in SeqIO.parse(mate2_fastq, 'fastq')
	for candidate in (read.seq.upper(), read.seq.upper().reverse_complement())
)

True

# De Bruijn Graph

A simple implementation of a De Bruijn graph assembler. In practice, production assemblers layer many statistical, optimisation, and computational techniques on top of this idea to improve the efficiency and quality of the output. Here we deliberately use a basic version so that it can be compared fairly with our own, equally basic, technique. The idea is that, given similar refinements, the novel technique presented here might at least match, or perhaps surpass, the De Bruijn approach.

In [None]:
def find_longest_overlap(reads: list) -> Tuple[int, int]:
	"""
	Find the smallest and the largest pairwise score between distinct reads.

	Every distinct read is scored against all the other distinct reads with
	`pylcs.lcs2_of_list` (presumably the longest-common-substring length --
	confirm against the pylcs documentation). The callers in this notebook use
	the returned pair as the range of k-mer sizes to try.

	Parameters:
	- reads (list): list of read strings; duplicates are only scored once,
	                which does not change the minimum or the maximum

	Returns:
	- Tuple[int, int]: (smallest score, largest score)

	Raises:
	- ValueError: if fewer than two distinct reads are supplied, because there
	              is then no pair to score
	"""
	# Build the set once instead of once per read.
	distinct_reads = set(reads)
	if len(distinct_reads) < 2:
		raise ValueError('find_longest_overlap needs at least two distinct reads')
	overlaps = []
	for read in distinct_reads:
		overlaps += pylcs.lcs2_of_list(read, list(distinct_reads - {read}))
	return min(overlaps), max(overlaps)

def create_de_bruijn_graph(
		k: int,
		sequences: list,
		do_time: bool = False
	) -> Union[nx.Graph,Tuple[nx.Graph,float]]:
	"""
	Build a weighted, directed de Bruijn graph from a collection of sequences.

	Each k-mer of each sequence contributes one edge from its first k-1
	characters to its last k-1 characters; the edge attribute 'weight' counts
	how many times that edge was seen.

	Parameters:
	- k (int): k-mer size
	- sequences (list): list of DNA sequences
	- do_time (bool): whether the construction should be timed (default False)

	Returns:
	- Union[nx.Graph,Tuple[nx.Graph,float]]: the de Bruijn graph, or a tuple of
	                                        the graph and the seconds taken to
	                                        build it when do_time is True.
	"""
	started_at = time.time() if do_time else None
	de_bruijn = nx.DiGraph()

	for sequence in sequences:
		window_count = len(sequence) - k + 1
		for offset in range(window_count):
			window = sequence[offset:offset + k]
			source, target = window[:-1], window[1:]

			if graph_has_edge := de_bruijn.has_edge(source, target):
				de_bruijn[source][target]['weight'] += 1
			else:
				de_bruijn.add_edge(source, target, weight=1)

	if do_time:
		return de_bruijn, time.time() - started_at
	return de_bruijn

def _spell_walk(nodes) -> str:
    """Spell a node walk: the first node in full, then the last character of every following node."""
    remaining = iter(nodes)
    first = next(remaining, None)
    if first is None:
        return ''
    # node[-1:] (rather than node[-1]) also copes with empty-string nodes (k = 1).
    return first + ''.join(node[-1:] for node in remaining)


def eulerian_path(
        graph: nx.DiGraph,
        do_time: bool = False
    ) -> Union[str, Tuple[str, float]]:
    """
    Spell the sequence of an Eulerian path in the given graph, or of a greedy
    walk if no Eulerian path exists.

    In both cases consecutive nodes are merged on their overlap: the first
    node is written in full and each following node contributes only its last
    character. The greedy walk starts at the first node that has an outgoing
    edge and repeatedly follows the first not-yet-used outgoing edge until it
    reaches a node with none left. The input graph is not modified.

    Parameters:
    - graph (nx.DiGraph): De Bruijn graph whose nodes are strings
    - do_time (bool): should the function be timed or not (default False)

    Returns:
    - Union[str, Tuple[str, float]]: the spelled sequence ('' for a graph
                                without edges), or a tuple of this string and
                                the execution time in seconds.
    """
    if do_time: start_time = time.time()

    if graph.number_of_edges() == 0:
        # Nothing to traverse; also avoids asking networkx about an empty graph.
        sequence = ''
    elif nx.has_eulerian_path(graph):
        edges = list(nx.eulerian_path(graph))
        sequence = _spell_walk([edges[0][0]] + [target for _, target in edges])
    else:
        # At least one edge exists, so a node with an outgoing edge exists too.
        current_node = next(node for node in graph.nodes if graph.out_degree(node) > 0)
        walk = [current_node]
        # Track consumed edges here instead of deleting them from the caller's graph.
        used_edges = set()
        while True:
            next_node = next(
                (node for node in graph.successors(current_node) if (current_node, node) not in used_edges),
                None,
            )
            if next_node is None:
                break
            used_edges.add((current_node, next_node))
            walk.append(next_node)
            current_node = next_node
        sequence = _spell_walk(walk)

    if do_time: return sequence, time.time() - start_time
    return sequence

# Sequitur

These are the methods necessary for implementing the Sequitur assembly technique.

In [None]:
def normalised_damerau_levenshtein_distance(read: str,overlap: str) -> float:
	"""
	Find the Damerau-Levenshtein edit distance of two strings normalised to the length
	of the shorter string. This normalisation is because we want to match prefixes to
	suffixes, which means that in general we will be comparing a full string to a
	portion of another string. Both strings are truncated to the length of the
	shorter one before they are compared.

	Parameters:
	- read (str): string for comparison, usually the longer string
	- overlap (str): string for comparison, usually the shorter string

	Returns:
	- float: the normalised Damerau-Levenshtein edit distance of the input strings
	         (0.0 means the compared prefixes are identical)

	Raises:
	- ZeroDivisionError: if either input is empty
	"""
	shorter = min(len(overlap),len(read))
	# similarity=False is required: by default the library returns a normalised
	# similarity (1.0 = identical) rather than an edit distance.
	edits = damerau_levenshtein_distance(str(read)[:shorter],str(overlap)[:shorter],similarity=False)
	return edits/shorter

def build_suffix_array(reads: list, min_suf_len: int = 3,do_time: bool = False) -> tuple:
	"""
	Build a sorted array of read suffixes together with the read each one came from.

	Every read is tagged with '$' followed by the position at which the read
	first occurs in `reads`. For each read the suffixes starting at offsets
	0 .. len(read) - min_suf_len - 2 are collected, the tagged suffixes are
	sorted, and the tag is then split off again.

	Parameters:
	- reads (list): list of reads
	- min_suf_len (int): controls how short the collected suffixes may get (default 3)
	- do_time (bool): should the function be timed or not (default False)

	Returns:
	- tuple: (suffixes, read_indices) where each suffix ends in '$' and
	         read_indices[n] is the index of the read that suffixes[n] was cut
	         from; when do_time is True the elapsed seconds are appended as a
	         third element.
	"""
	if do_time: start = time.time()

	tagged_suffixes = []
	for read in reads:
		tagged = read + '$' + str(reads.index(read))
		offset_limit = tagged.index('$') - min_suf_len - 1
		tagged_suffixes.extend(tagged[offset:] for offset in range(offset_limit))
	tagged_suffixes.sort()

	suffixes, owners = [], []
	for tagged in tagged_suffixes:
		owners.append(int(str(tagged.split('$')[-1])))
		suffixes.append(tagged[:tagged.find('$') + 1])

	if do_time:
		return suffixes, owners, time.time() - start
	return suffixes, owners

def create_bipartite_adjacency_matrix(reads: list, suf_arr: list = None, suf_arr_ind: list = None, do_time: bool = False,max_diff: float = 0.25, min_suf_len: int = 3) -> Union[dict, Tuple[dict, float]]:
	"""
	Collect suffix-to-prefix overlaps between reads as a sparse dictionary.

	For every read, and for each of its first min_suf_len + 1 suffixes, the
	suffix is located in the suffix array and the array is scanned backwards
	from the entry just before it. The scan stops at the start of the array or
	at the first entry whose normalised distance to the read exceeds 0.5. An
	entry is recorded as an overlap when it comes from a different read, its
	normalised distance is below max_diff and the read starts with it.

	Parameters:
	- reads (list): list of reads (used as dictionary keys, so a duplicated
	                read maps to the index of its last occurrence)
	- suf_arr (list): suffixes as returned by build_suffix_array; built here
	                  when suf_arr or suf_arr_ind is None
	- suf_arr_ind (list): read index of every entry of suf_arr
	- do_time (bool): should the function be timed or not (default False)
	- max_diff (float): exclusive upper bound on the normalised distance of an
	                    accepted overlap (default 0.25)
	- min_suf_len (int): passed to build_suffix_array and used as the number
	                     of leading offsets (plus one) tried per read

	Returns:
	- Union[dict, Tuple[dict, float]]: dictionary mapping (index of the read
	      owning the suffix, index of the read owning the prefix) to the
	      longest overlap length found, or a tuple of that dictionary and the
	      elapsed seconds when do_time is True.

	Raises:
	- ValueError: if one of the looked-up suffixes of a read is not present in suf_arr
	"""
	if do_time: start = time.time()
	if suf_arr is None or suf_arr_ind is None: suf_arr,suf_arr_ind = build_suffix_array(reads,min_suf_len=min_suf_len)
	reads_map = dict(zip(reads,list(range(len(reads)))))
	B = {}
	for read in reads:
		prefix_owner = reads_map[read]
		for j in range(min_suf_len + 1):
			i = suf_arr.index(read[j:]+'$') - 1
			# Stop at the start of the array: a negative index would silently
			# wrap around to the lexicographically largest suffixes.
			while i >= 0:
				suffix = suf_arr[i][:-1]
				distance = normalised_damerau_levenshtein_distance(read,suffix)
				if distance > 0.5: break
				suffix_read = reads[suf_arr_ind[i]]
				if not suffix_read == read and distance < max_diff and read.startswith(suffix):
					key = (reads_map[suffix_read],prefix_owner)
					# keep the longest overlap seen for this pair of reads
					B[key] = max(len(suffix),B.get(key,0))
				i -= 1
	if do_time: return B, time.time() - start
	return B

def move_col(B: coo_matrix, cols: dict) -> None:
	"""
	Relabel the column index of every stored entry of B in place.

	Parameters:
	- B (coo_matrix): matrix whose `col` array is rewritten
	- cols (dict): mapping from current column index to new column index
	"""
	for position, old_col in enumerate(B.col):
		B.col[position] = cols[old_col]
			
def move_row(B: coo_matrix,rows: dict) -> None:
	"""
	Relabel the row index of every stored entry of B in place.

	Parameters:
	- B (coo_matrix): matrix whose `row` array is rewritten
	- rows (dict): mapping from current row index to new row index
	"""
	for position, old_row in enumerate(B.row):
		B.row[position] = rows[old_row]

def find_lower_diagonal_path(B: coo_matrix,reads_map: dict,cols: list,rows: list,do_time: bool = False) -> Union[str, Tuple[str, float]]:
	"""
	Reorder the overlap matrix and spell the assembled sequence from its first sub-diagonal.

	The stored row and column indices of B are relabelled in place (through
	move_col and move_row) while local copies of `cols` and `rows` record which
	read label sits at each position. Once the reordering loop has finished,
	entry p of the first sub-diagonal of B is used as the overlap length between
	the read at position p and the read at position p + 1: every read but the
	last contributes all but its final `overlap` characters and the last read
	is appended in full.

	NOTE(review): the reordering heuristic appears to aim at placing the
	strongest overlap of each read on the first sub-diagonal -- confirm with
	the author before relying on this description.

	Parameters:
	- B (coo_matrix): overlap matrix; its `row` and `col` arrays are modified in place
	- reads_map (dict): mapping from read label to read string
	- cols (list): initial order of the column labels (the caller's list is not modified)
	- rows (list): initial order of the row labels (the caller's list is not modified)
	- do_time (bool): should the function be timed or not (default False)

	Returns:
	- Union[str, Tuple[str, float]]: the assembled sequence, or a tuple of the
	      sequence and the elapsed seconds when do_time is True.
	"""
	if do_time: start = time.time()
	# index of the second-largest element of l
	argpen = lambda l: np.argpartition(l,-2)[-2]

	new_cols = cols[:]
	# a column that sums to zero is moved to the end of the column order
	if B.sum(axis=0).min() == 0: new_cols = list(c for c in new_cols if c not in [new_cols[B.sum(axis=0).argmin()]]) + [new_cols[B.sum(axis=0).argmin()]]
	# a row that sums to zero has its label moved to the front of the order
	if B.sum(axis=1).min() == 0: 
		if B.sum(axis=1).argmin() == B.sum(axis=0).argmin():
			new_cols = [new_cols[-1]] + list(c for c in new_cols[:-1] if c not in [cols[B.getrow(rows.index(new_cols[-1])).argmax()]]) + [cols[B.getrow(rows.index(new_cols[-1])).argmax()]]
		else: new_cols = [rows[B.sum(axis=1).argmin()]] + list(c for c in new_cols if c not in [rows[B.sum(axis=1).argmin()]])

	# apply the new column order to the stored indices, then give the rows the same order
	cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
	move_col(B,cols_map)
	cols = new_cols

	new_rows = cols[:]
	rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
	move_row(B,rows_map)
	rows = new_rows

	# k is the position of an all-zero row, if there is one
	i,j,k = len(rows), len(cols) - 1, B.sum(axis=1).argmin() if B.sum(axis=1).min() == 0 else None

	# walk the column positions from the last one towards the front
	while j > (k if B.sum(axis=1).min() == 0 else 0):
		if k is not None and B.getrow(rows.index(cols[j])).argmax() == k: 
			cols_,c_ = [], 0

			while j + c_ + 1 < len(rows):
				c_ += 1
				if len(B.getrow(j+c_).nonzero()[1]) > 1:
					cols_ = np.argpartition(B.getrow(j+c_).toarray().flatten(),-2)[::-1][:2]
					if cols[cols_[1]] in cols[:j] and B.getcol(cols_[1]).argmax() == j+c_: break
			
			if j + c_ + 1 == len(cols): new_cols = cols[:k+1] + cols[j:] + cols[k+1:j]
			else: new_cols = cols[:k+1] + cols[j:j+c_] + list(c for c in cols[k+1:j] if c not in [cols[min(cols_)]]) + [cols[min(cols_)]] + cols[j+c_:]
			cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
			move_col(B,cols_map)
			cols = new_cols

			if j + c_ + 1 == len(rows): new_rows = cols[:]
			else: new_rows = cols[:k+c_+1] + list(r for r in rows[k:j+c_] if r not in cols[:k+c_+1] + cols[j+c_:]) + cols[j+c_:]
			rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
			move_row(B,rows_map)
			rows = new_rows

			i,j,k = j + c_ + 1, j + c_, k + c_
		else:
			# cmax: column holding the largest value in the row labelled cols[j]
			cmax = B.getrow(rows.index(cols[j])).argmax()
			if len(B.getrow(rows.index(cols[j])).nonzero()[1]) > 1:
				# cpen: column holding the second-largest value of that row
				cpen = argpen(B.getrow(rows.index(cols[j])).toarray().flatten()) 
				if cmax > j: 
					if len(B.getrow(cmax+1).nonzero()[1]) > 1 and \
					B.getrow(cmax+1).getcol(argpen(B.getrow(cmax+1).toarray().flatten())).data[0] >=  B.getrow(rows.index(cols[j])).getcol(cpen).data[0]: 
						crange = [argpen(B.getrow(cmax).toarray().flatten()),cmax]
					else: crange = [cpen]
				else: crange = [cmax]
			else: crange = [cmax]
			while crange[0] > j:
				if len(B.getrow(crange[0]).nonzero()[1]) > 1:
					crange = [argpen(B.getrow(crange[0]).toarray().flatten())] + crange
				else:
					crange = [B.getrow(crange[0]).argmax()] + crange
				if crange[0] == j: crange = [B.getrow(crange[1]).argmax()] + crange[1:]

			# move the selected columns so that they sit immediately before position j
			new_cols = list(c for c in cols[:j] if c not in list(cols[cr] for cr in crange)) + list(cols[cr] for cr in crange) + list(c for c in cols[j:] if c not in list(cols[cr] for cr in crange))
			cols_map = dict((cols.index(c),new_cols.index(c)) for c in range(len(cols)))
			move_col(B,cols_map)
			cols = new_cols

			new_rows = list(r for r in rows[:i] if r not in cols[j:]) + cols[j:]
			rows_map = dict((rows.index(r),new_rows.index(r)) for r in range(len(rows)))
			move_row(B,rows_map)
			rows = new_rows
		j -= 1
		i -= 1

	seq = ''
	for s,d in zip(list(reads_map[k] for k in rows)[:-1],B.diagonal(-1)):
		# Drop the d overlapping characters from the end of the read. An explicit
		# end index is needed because s[:-0] would be '' and lose a read that
		# has no overlap with its successor.
		seq += s[:max(len(s)-d,0)]
	seq += list(reads_map[k] for k in rows)[-1]
	if do_time: return seq, time.time() - start
	return seq

In [None]:
# natural language sequences
# Each entry is a pair (target sequence, reads): the reads are overlapping
# substrings of the target, listed in the order in which they occur in it and
# indented so that the overlaps are visible.
nat_lang_seq = [
		('betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better',
		['betty_bought_butter_th',
							'tter_the_butter_was_',
								'he_butter_was_bitter_',
										'as_bitter_betty_bought',
													'tty_bought_better_butter_t',
														'ught_better_butter_to',
																	'r_butter_to_make_the_',
																				'ke_the_bitter_butter_better']),
		('you say hello world, i bellow go to hell',
		['you say hel',
					' say hello wo',
							'lo world, i be',
								'ld, i bellow go t',
											'ow go to hell']),
		('she_sells_sea_shells_on_the_sea_shore',
		['she_sells_s',
					'lls_sea_shel',
							'ea_shells_o',
							'shells_on_the_s',
										'he_sea_s',
											'ea_shore'])
]
# Benchmark both assemblers on the natural language sequences: 10 shuffles of
# the reads (seeded 0-9), each repeated 100 times, one CSV row per run.
for i, (seq, reads) in enumerate(nat_lang_seq):
	# write the CSV header only when the file is new or empty
	if not os.path.exists('data/output/natural_language_sequences.sequitur.csv') or os.path.getsize('data/output/natural_language_sequences.sequitur.csv') == 0:
		with open('data/output/natural_language_sequences.sequitur.csv', 'a') as f:
			f.write('natural_language_sequence,edit_distance,target_sequence_length,output_sequence_length,suffix_array_construction_time,adjacency_matrix_construction_time,sequence_reconstruction_time\n')
	with open('data/output/natural_language_sequences.sequitur.csv','a') as f:
		for seed in range(10):
			random.seed(seed)
			# shuffles the list stored in nat_lang_seq in place, so each shuffle starts from the previous order
			random.shuffle(reads)
			reads_map = dict(zip(list(range(len(reads))),reads))
			rows = list(range(len(reads)))
			cols = list(range(len(reads)))
			for _ in range(100):
				suf_arr,suf_arr_ind,t1 = build_suffix_array(reads,do_time=True)
				B,t2 = create_bipartite_adjacency_matrix(reads,suf_arr=suf_arr,suf_arr_ind=suf_arr_ind,do_time=True)
				start = time.time()
				# Give the matrix its full reads x reads shape up front, so that any number
				# of reads without a recorded overlap still get their (empty) row and column.
				B = coo_matrix((list(B.values()),list(zip(*B.keys()))),shape=(len(reads),len(reads))).T
				t2 += time.time() - start
				seq_,t3 = find_lower_diagonal_path(B,reads_map,cols,rows,do_time=True)
				# similarity=False: the library returns a normalised similarity by default, not an edit distance
				f.write('{},{},{},{},{},{},{}\n'.format(i,damerau_levenshtein_distance(seq_,seq,similarity=False),len(seq),len(seq_),t1,t2,t3))
	# NOTE(review): the first column below is spelled 'natural language_sequence' (with a space),
	# unlike 'natural_language_sequence' in the sequitur CSV -- left unchanged in case downstream code relies on it.
	if not os.path.exists('data/output/natural_language_sequences.euler.csv') or os.path.getsize('data/output/natural_language_sequences.euler.csv') == 0:
		with open('data/output/natural_language_sequences.euler.csv', 'a') as f:
			f.write('natural language_sequence,k,edit_distance,target_sequence_length,output_sequence_length,de_bruijn_graph_construction_time,euler_path_reconstruction_time\n')
	with open('data/output/natural_language_sequences.euler.csv','a') as f:
		for seed in range(10):
			random.seed(seed)
			random.shuffle(reads)
			for _ in range(100):
				outputs_seq = {}
				outputs_dbg_time = {}
				outputs_euler_time = {}
				# try every k from the smallest pairwise score up to (but excluding) the largest
				a,b = find_longest_overlap(reads)
				for k in range(a,b):
					G,outputs_dbg_time[k] = create_de_bruijn_graph(k,reads,do_time=True)
					outputs_seq[k],outputs_euler_time[k] = eulerian_path(G,do_time=True)
				for k in outputs_seq:
					f.write('{},{},{},{},{},{},{}\n'.format(i,k,damerau_levenshtein_distance(outputs_seq[k],seq,similarity=False),len(seq),len(outputs_seq[k]),outputs_dbg_time[k],outputs_euler_time[k]))

In [None]:
# real genomic sequence, generated reads
# NOTE(review): generate_reads and damerau_levenshtein_distance2 are not defined in the
# visible part of this notebook -- make sure they are defined before this cell is run.
# NOTE(review): the FASTA path used here differs from the one used in the wgsim cells
# (which keep the file in a 'Raphanus sativus_NC_018551.1/' sub-directory) -- confirm.
# sequitur
n = 1
m = 1
seed = 37
# for seed in range(n):
sequitur_csv = 'data/output/local/real sequence/sequitur/Raphanus sativus_NC_018551.1_seed_'+str(seed)+'_sequitur.csv'
# only a new or empty file gets a header, so re-running the cell does not repeat it
sequitur_needs_header = not os.path.exists(sequitur_csv) or os.path.getsize(sequitur_csv) == 0
with open(sequitur_csv,'a') as f:
	if sequitur_needs_header: f.write('edit_distance,target_sequence_length,output_sequence_length,suffix_array_construction_time,adjacency_matrix_construction_time,sequence_reconstruction_time\n')
	# keep the sequence of the last record in the FASTA file
	for record in SeqIO.parse("data/input/Raphanus sativus_NC_018551.1.fasta",'fasta'): seq = record.seq
	reads,_ = generate_reads(seq,250,250,50,50,seed=seed,min_coverage=None)
	reads_map = dict(zip(list(range(len(reads))),reads))
	rows = list(range(len(reads)))
	cols = list(range(len(reads)))
	for _ in range(m):
		suf_arr,suf_arr_ind,t1 = build_suffix_array(reads,do_time=True)
		B,t2 = create_bipartite_adjacency_matrix(reads,suf_arr=suf_arr,suf_arr_ind=suf_arr_ind,do_time=True)
		start = time.time()
		# Give the matrix its full reads x reads shape up front, so that any number
		# of reads without a recorded overlap still get their (empty) row and column.
		B = coo_matrix((list(B.values()),list(zip(*B.keys()))),shape=(len(reads),len(reads))).T
		t2 += time.time() - start
		# cache the overlap matrix the first time it is built for this seed
		if not os.path.exists('data/input/matrices/seed_'+str(seed)+'.npz'):
			save_npz('data/input/matrices/seed_'+str(seed)+'.npz', B)
		seq_,t3 = find_lower_diagonal_path(B,reads_map,cols,rows,do_time=True)
		f.write('{},{},{},{},{},{}\n'.format(damerau_levenshtein_distance2(str(seq_),str(seq),similarity=False),len(seq),len(seq_),t1,t2,t3))
# euler
# for seed in range(n):
euler_csv = 'data/output/local/real sequence/euler/Raphanus sativus_NC_018551.1_seed_'+str(seed)+'_euler.csv'
euler_needs_header = not os.path.exists(euler_csv) or os.path.getsize(euler_csv) == 0
with open(euler_csv,'a') as f:
	if euler_needs_header: f.write('k,edit_distance,target_sequence_length,output_sequence_length,de_bruijn_graph_construction_time,euler_path_reconstruction_time\n')
	for record in SeqIO.parse("data/input/Raphanus sativus_NC_018551.1.fasta",'fasta'): seq = record.seq
	reads,_ = generate_reads(seq,250,250,50,50,seed=seed,min_coverage=None)
	outputs_seq = {}
	outputs_dbg_time = {}
	outputs_euler_time = {}
	# try every k from the smallest pairwise score up to (but excluding) the largest
	a,b = find_longest_overlap(list(str(read) for read in reads))
	for _ in range(m):
		for k in range(a,b):
			G,outputs_dbg_time[k] = create_de_bruijn_graph(k,list(str(read) for read in reads),do_time=True)
			outputs_seq[k],outputs_euler_time[k] = eulerian_path(G,do_time=True)
		for k in outputs_seq:
			# compare plain strings, as the sequitur half of this cell does
			f.write('{},{},{},{},{},{}\n'.format(k,damerau_levenshtein_distance2(outputs_seq[k],str(seq),similarity=False),len(seq),len(outputs_seq[k]),outputs_dbg_time[k],outputs_euler_time[k]))

In [27]:
# Open the two FASTQ files of the 50.0 read set as record iterators
# (presumably the first and second mates of each pair -- confirm against how the files were generated).
records1 = SeqIO.parse("data/input/Raphanus sativus_NC_018551.1/50.0.1.fastq",'fastq')
records2 = SeqIO.parse("data/input/Raphanus sativus_NC_018551.1/50.0.2.fastq",'fastq')

In [31]:
# Take the next record from each iterator; re-running this cell advances both iterators.
record = (next(records1),next(records2))

In [32]:
# Per-base Phred quality scores of the record taken from the second file.
record[1].letter_annotations["phred_quality"]

[40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 40]

In [None]:
# Upper-cased sequence of the record taken from the first file.
print(record[0].seq.upper())

In [None]:
# The record from the second file in four forms: as read, complemented,
# reverse-complemented, and reversed without complementing.
print(record[1].seq.upper())
print(record[1].seq.upper().complement())
print(record[1].seq.upper().reverse_complement())
print(record[1].seq.upper()[::-1])


In [34]:
# Scratch check of the position of '$' minus 3 on a sample string
# (presumably exploring the loop bound used in build_suffix_array -- confirm).
s = '0123456$'
s.index('$')-3

4