## Load and setup simple test bench

In [10]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../evaluate/")
from aligners.smith_waterman import calculate_smith_waterman_distance
from aligners.bwamem2 import bwa_mem2_align
from aligners.minimap2 import minimap2_align
from aligners.bowtie2 import bowtie2_align

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
string2 = "ATGCTCGATCGATCGATCGAAATCGC"
string1 = "ATCGCCCTATGCTCGATCGATCGATCGAAATCGCAGCTCCTCTGACTCAAGAAGACTCGAATGCTCGATCGATCGATCGAAATCGCAGCCTCGAAGCCTCTTGAAA"
reference_path = "/home/pholur/dna2vec/evaluate/data/chromosome_2/NC_000002.fasta"
with open(reference_path, "r") as f:
    control = f.read().replace("\n","")
sample_read = "AAATATAAGTCAGATATATCAAAATATAAGTCAGAATTTTACAAATATTGAAGTGTCATATCACATCAGAGTAACAACACCACCTAAGTACCAAATGATGATAATGAAACTTAGACCTACTGGAATTGAGTAGAGGTGAACATCATGTGA"

## Naive Smith-Waterman Distance

In [5]:
import time
start = time.time()
# EXPENSIVE!
print(calculate_smith_waterman_distance(control, 
                                    sample_read, 
                                    match_score=2,
                                    mismatch_penalty=-1,
                                    open_gap_penalty=-0.5,
                                    continue_gap_penalty=-0.1,
                                    debug = True))
print(time.time() - start)

Best match:
target      6409941 AAATATAAGTCAGATATATCAAAATATAAGTCAGAATTTTACAAATATTGAAGTGTCATA
                  0 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query             0 AAATATAAGTCAGATATATCAAAATATAAGTCAGAATTTTACAAATATTGAAGTGTCATA

target      6410001 TCACATCAGAGTAACAACACCACCTAAGTACCAAATGATGATAATGAAACTTAGACCTAC
                 60 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query            60 TCACATCAGAGTAACAACACCACCTAAGTACCAAATGATGATAATGAAACTTAGACCTAC

target      6410061 TGGAATTGAGTAGAGGTGAACATCATGTGA 6410091
                120 ||||||||||||||||||||||||||||||     150
query           120 TGGAATTGAGTAGAGGTGAACATCATGTGA     150

{'elapsed time': 2424.680049419403, 'distance': -300.0, 'begins': [6409941]}
2486.0885140895844


In [4]:
calculate_smith_waterman_distance(string1, 
                                    string2, 
                                    match_score=2,
                                    mismatch_penalty=-1,
                                    open_gap_penalty=-0.5,
                                    continue_gap_penalty=-0.1,
                                    debug = True)

Best match:
target            8 ATGCTCGATCGATCGATCGAAATCGC 34
                  0 |||||||||||||||||||||||||| 26
query             0 ATGCTCGATCGATCGATCGAAATCGC 26



{'elapsed time': 0.0021584033966064453, 'distance': -52.0, 'begins': [8]}

## BWA-Mem 2

In [17]:
import time
# They might be doing optimizations for repeats. Do a more thorough test
start = time.time()
bwa_mem2_align(reference_path, [sample_read]*10000, "/home/pholur/dna2vec/evaluate/aligners", "./test.sam");
print("Time per read: ", (time.time() - start) / 10000 )  

Looking to launch executable "/home/pholur/dna2vec/evaluate/aligners/bwa-mem2-2.2.1_x64-linux/bwa-mem2.avx2", simd = .avx2
Launching executable "/home/pholur/dna2vec/evaluate/aligners/bwa-mem2-2.2.1_x64-linux/bwa-mem2.avx2"
-----------------------------
Executing in AVX2 mode!!
-----------------------------
* SA compression enabled with xfactor: 8
* Ref file: /home/pholur/dna2vec/evaluate/data/chromosome_2/NC_000002.fasta
* Entering FMI_search
* Index file found. Loading index from /home/pholur/dna2vec/evaluate/data/chromosome_2/NC_000002.fasta.bwt.2bit.64
* Reference seq len for bi-index = 484387059
* sentinel-index: 206661405
* Count:
0,	1
1,	144601590
2,	242193530
3,	339785470
4,	484387059

* Reading other elements of the index from files /home/pholur/dna2vec/evaluate/data/chromosome_2/NC_000002.fasta
* Index prefix: /home/pholur/dna2vec/evaluate/data/chromosome_2/NC_000002.fasta
* Read 0 ALT contigs
* Done reading Index!!
* Reading reference genome..
* Binary seq file = /home/pholu

BWA-MEM2 alignment completed successfully.
Time per read:  0.00023491265773773194


No. of OMP threads: 1
Processor is running @2000.135280 MHz
Runtime profile:

	Time taken for main_mem function: 1.29 sec

	IO times (sec) :
	Reading IO time (reads) avg: 0.02, (0.02, 0.02)
	Writing IO time (SAM) avg: 0.02, (0.02, 0.02)
	Reading IO time (Reference Genome) avg: 0.26, (0.26, 0.26)
	Index read time avg: 0.55, (0.55, 0.55)

	Overall time (sec) (Excluding Index reading time):
	PROCESS() (Total compute time + (read + SAM) IO time) : 0.40
	MEM_PROCESS_SEQ() (Total compute time (Kernel + SAM)), avg: 0.36, (0.36, 0.36)

	 SAM Processing time (sec):
	--WORKER_SAM avg: 0.02, (0.02, 0.02)

	Kernels' compute time (sec):
	Total kernel (smem+sal+bsw) time avg: 0.33, (0.33, 0.33)
		SMEM compute avg: 0.18, (0.18, 0.18)
		SAL compute avg: 0.02, (0.02, 0.02)
				MEM_SA avg: 0.01, (0.01, 0.01)

		BSW time, avg: 0.13, (0.13, 0.13)

Important parameter settings: 
	BATCH_SIZE: 512
	MAX_SEQ_LEN_REF: 256
	MAX_SEQ_LEN_QER: 128
	MAX_SEQ_LEN8: 128
	SEEDS_PER_READ: 500
	SIMD_WIDTH8 X: 32
	SIMD_WID

## Minimap2

In [17]:
minimap2_align("/home/pholur/dna2vec/evaluate/data/chromosome_2/", [sample_read]*10000, 
               "/home/pholur/dna2vec/evaluate/aligners", "./test.sam");

minimap2 alignment completed successfully.


[M::mm_idx_gen::0.010*0.23] collected minimizers
[M::mm_idx_gen::0.012*0.30] sorted minimizers
[M::main::0.012*0.30] loaded/built the index for 1 target sequence(s)
[M::mm_mapopt_update::0.012*0.30] mid_occ = 1000
[M::mm_idx_stat] kmer size: 21; skip: 11; is_hpc: 0; #seq: 1
[M::mm_idx_stat::0.012*0.31] distinct minimizers: 0 (-nan% are singletons); average occurrences: -nan; average spacing: inf; total length: 48
[M::worker_pipeline::0.059*1.44] mapped 10000 sequences
[M::main] Version: 2.26-r1175
[M::main] CMD: /home/pholur/dna2vec/evaluate/aligners/minimap2/minimap2 -ax sr temp_reference.fasta temp_reads.fastq
[M::main] Real time: 0.060 sec; CPU: 0.086 sec; Peak RSS: 0.006 GB


## Bowtie2

In [16]:
bowtie2_align(reference_path, [sample_read]*10000, "/home/pholur/dna2vec/evaluate/aligners/bowtie2-2.5.1-linux-x86_64", "./test.sam");

Error during Bowtie2 alignment: Command '/home/pholur/dna2vec/evaluate/aligners/bowtie2-2.5.1-linux-x86_64/bowtie2 -x /home/pholur/dna2vec/evaluate/data/chromosome_2/NC_000002.fasta -U temp_interleaved.fastq -S ./test.sam --interleaved' returned non-zero exit status 255.


(ERR): "/home/pholur/dna2vec/evaluate/data/chromosome_2/NC_000002.fasta" does not exist or is not a Bowtie 2 index
Exiting now ...
