# Implementation of the EBfilter by genomon

* EBrun (originally EBFilter) is an argparse wrapper that passes command-line arguments to run.py (not needed for internal use)
* passed arguments:
    * targetMutationFile: the .vcf or .anno containing the mutations – needed --> mut_file
    * targetBamPath: path to the tumor bam file (+.bai) – needed --> tumor_bam
    * controlBamPathList: text list of path to PoN bam files (+ .bai) – needed --> pon_list
    * outputPath: clear  – needed --> output_path
    * -f option for anno or vcf – not needed --> will be inferred from .ext
    * thread_num – not needed --> taken from config
    * -q option for quality threshold – not needed --> default _q config
    * -Q option for base quality threshold - not needed --> default _Q from config
    * --ff option for filter flags – not needed because of preprocessing??
    * --loption option for samtools mpileup -l – must elaborate..
    * --region option for restriction of regions on mpileup -l – must elaborate..
    * --debug – not needed

## Initiation

### imports

In [5]:
import os
from code import run

### snakemake config

In [6]:
# Parameters of the EB step; the same dict object is referenced from
# config['EB']['params'] below.
params = {
    'map_quality': 20,
    'base_quality': 15,
    'filter_flags': 'UNMAP,SECONDARY,QCFAIL,DUP',
    'loption': True,
}

# Stand-in for the snakemake config: only the 'EB' section is populated.
config = {
    'EB': {
        'run': True,
        'threads': 1,
        'params': params,
    },
}

### function args
will be passed during the function call

In [7]:
# Per-call arguments: input mutation file, tumor BAM, list of panel-of-normal
# BAMs, output file, and an (empty) region restriction.
args = {
    'mut_file': 'testdata/input.anno',
    'tumor_bam': 'testdata/tumor.bam',
    'pon_list': 'testdata/list_normal_sample.txt',
    'output_path': 'output/output.anno',
    'region': '',
}

### load the config and GLOBAL STATE into variable _

In [8]:
# Unpack the EB section of the config into plain variables.
debug_mode = True
params = config['EB']['params']
threads = config['EB']['threads']

# Quality thresholds: _q is stored as a string, _Q as the raw config value.
_q = str(params['map_quality'])   # '20' with the config above
_Q = params['base_quality']       # 15 with the config above

# One string holding the characters with ASCII codes 33 .. 33 + _Q - 1
# (per the original note: the quality ASCIIs to filter out).
filter_quals = ''.join(chr(ascii_code) for ascii_code in range(33, 33 + _Q))

_ff = params['filter_flags']      # 'UNMAP,SECONDARY,QCFAIL,DUP'
is_loption = params['loption']    # True
# NOTE(review): is_loption is read here but is not part of the `_` dict below.

# Log file: <log_path>/<output file name without its extension>.log
log_path = 'output/logs'
log_file = os.path.join(
    log_path,
    os.path.splitext(os.path.basename(args['output_path']))[0] + '.log',
)

# Bundle the settings into the dict `_` that is passed to run.main.
_ = {
    'q': _q,
    'Q': _Q,
    'filter_quals': filter_quals,
    'ff': _ff,
    'log': log_file,
    'threads': threads,
    'debug_mode': debug_mode,
}

## Single threading

In [19]:
# Single-threaded run with debug mode switched off.
# NOTE(review): the output recorded for this cell is
# "TypeError: unsupported operand type(s) for -=: 'str' and 'int'";
# the cause is inside run.main and is not visible from this notebook.
_.update({'threads': 1, 'debug_mode': False})
run.main(args, _)
!ls output

TypeError: unsupported operand type(s) for -=: 'str' and 'int'

## Multithreading

In [17]:
# Run with three threads and debug mode switched on; the directory listing
# recorded below this cell shows the files left in output/ afterwards.
_.update({'threads': 3, 'debug_mode': True})
run.main(args, _)
!ls output/

[34mlogs[m[m                             output.anno.sub.0.control.pileup
output.anno                      output.anno.sub.0.target.pileup
output.anno.0                    output.anno.sub.1
output.anno.0.region_list.bed    output.anno.sub.1.control.pileup
output.anno.1                    output.anno.sub.1.target.pileup
output.anno.1.region_list.bed    output.anno.sub.2
output.anno.2                    output.anno.sub.2.control.pileup
output.anno.2.region_list.bed    output.anno.sub.2.target.pileup
output.anno.sub.0


In [15]:
ls output/

[34mlogs[m[m/                             output.anno.sub.0.control.pileup
output.anno                       output.anno.sub.0.target.pileup
output.anno.0                     output.anno.sub.1
output.anno.0.region_list.bed     output.anno.sub.1.control.pileup
output.anno.1                     output.anno.sub.1.target.pileup
output.anno.1.region_list.bed     output.anno.sub.2
output.anno.2                     output.anno.sub.2.control.pileup
output.anno.2.region_list.bed     output.anno.sub.2.target.pileup
output.anno.sub.0


## using advanced dataframes for getting the pileup files 

### get anno and pileup as dataframes (too big?)

In [None]:
# Load the mutation list and the tumor pileup as DataFrames and join them on
# chromosome and start position.
import pandas as pd  # imported here so the cell does not rely on a later cell

# Tab-separated, headerless mutation list with five named columns.
anno = pd.read_csv(
    'testdata/input.anno',
    sep='\t',
    header=None,
    names=['Chr', 'Start', 'End', 'ref', 'var'],
    dtype={'Chr': str, 'Start': int, 'End': int},
)

# Tab-separated, headerless pileup with six named columns.
# 'Chr' is read as str on this side too: without it, purely numeric chromosome
# names would be parsed as integers here but as strings in `anno`, and the
# merge keys would no longer have matching dtypes.
tumorpileup = pd.read_csv(
    'output/tumor.pileup',
    sep='\t',
    header=None,
    names=['Chr', 'Start', 'ref', 'depth', 'reads', 'mapQ'],
    dtype={'Chr': str, 'Start': int, 'reads': str, 'mapQ': str},
)

# Join on (Chr, Start) with pandas' default merge settings; both frames have
# a 'ref' column, which pandas disambiguates with its default suffixes.
pd.merge(anno, tumorpileup, on=['Chr', 'Start'])


In [None]:
# Split the tumor pileup into one chunk per thread.
import numpy as np  # `np` was used here without being imported anywhere

threads = 4
# Approximate number of rows per chunk (informational; not used below).
split = round(len(tumorpileup.index) / threads)
# Use `threads` instead of a second hard-coded 4 so the chunk count always
# follows the thread count set above.
splits = np.array_split(tumorpileup, threads)

In [33]:
import pandas as pd

# Read the tab-separated mutation list without a header row, so the columns
# get the integer labels 0..4, then display it.
anno = pd.read_table('testdata/input.anno', header=None)
anno

Unnamed: 0,0,1,2,3,4
0,chr11,397665,397665,G,C
1,chr11,562012,562012,C,T
2,chr11,824202,824202,C,A
3,chr11,1013896,1013896,C,-
4,chr11,1081746,1081746,G,C
5,chr11,1277322,1277322,G,T
6,chr11,2418116,2418116,-,A
7,chr11,3680752,3680752,G,A
8,chr11,5012707,5012707,C,G
9,chr11,5221726,5221726,A,G


In [34]:
# Rows whose reference allele (column 3) is '-'.
ref_is_dash = anno[3] == '-'

# Column 1 (start) is lowered by 1 for every row, column 2 (end) is left
# unchanged; rows with a '-' reference are lowered by one more in both columns.
# The boolean mask counts as 0/1 in the subtraction.
# NOTE(review): this modifies `anno` in place, so re-running the cell shifts
# the coordinates again.
anno[1] = anno[1] - (1 + ref_is_dash)
anno[2] = anno[2] - ref_is_dash
anno

Unnamed: 0,0,1,2,3,4
0,chr11,397664,397665,G,C
1,chr11,562011,562012,C,T
2,chr11,824201,824202,C,A
3,chr11,1013895,1013896,C,-
4,chr11,1081745,1081746,G,C
5,chr11,1277321,1277322,G,T
6,chr11,2418114,2418115,-,A
7,chr11,3680751,3680752,G,A
8,chr11,5012706,5012707,C,G
9,chr11,5221725,5221726,A,G


In [30]:
anno

Unnamed: 0,0,1,2,3,4
0,chr11,397665,397665,G,C
1,chr11,562012,562012,C,T
2,chr11,824202,824202,C,A
3,chr11,1013896,1013896,C,-
4,chr11,1081746,1081746,G,C
5,chr11,1277322,1277322,G,T
6,chr11,2418115,2418116,-,A
7,chr11,3680752,3680752,G,A
8,chr11,5012707,5012707,C,G
9,chr11,5221726,5221726,A,G
