# Implementation of the EBfilter by genomon

* EBrun (originally EBFilter) is an argparse wrapper that passes the command-line arguments on to run.py (not needed for internal use)
* passed arguments:
    * targetMutationFile: the .vcf or .anno containing the mutations – needed --> mut_file
    * targetBamPath: path to the tumor bam file (+.bai) – needed --> tumor_bam
    * controlBamPathList: text list of path to PoN bam files (+ .bai) – needed --> pon_list
    * outputPath: clear  – needed --> output_path
    * -f option for anno or vcf – not needed --> will be inferred from .ext
    * thread_num – not needed --> taken from config
    * -q option for quality threshold – not needed --> default _q config
    * -Q option for base quality threshold - not needed --> default _Q from config
    * --ff option for filter flags – not needed because of preprocessing??
    * --loption option for samtools mpileup -l – must elaborate..
    * --region option for restriction of regions on mpileup -l – must elaborate..
    * --debug – not needed

## Initiation

### imports

In [1]:
import os
from code import run

### snakemake config

In [2]:
# Parameters handed to the EB filter step.
params = {
    'map_quality': 20,
    'base_quality': 15,
    'filter_flags': 'UNMAP,SECONDARY,QCFAIL,DUP',
    'loption': True,
}
# Hand-built stand-in for the snakemake config; 'params' is shared, not copied.
config = {
    'EB': {'run': True, 'threads': 1, 'params': params},
    'annovar': {'sep': '\t'},
}

### load the config and GLOBAL STATE into variable _

In [3]:
debug_mode = True
args = {}                        # per-call arguments, filled in further down
# Flatten the nested config into plain notebook variables.
params = config['EB']['params']
threads = config['EB']['threads']
sep = config['annovar']['sep']
_q = str(params['map_quality'])  # mapping quality=20 (kept as a string)
_Q = params['base_quality']      # base quality=15
# Characters chr(33) .. chr(33 + _Q - 1): the quality ASCIIs to filter out
# (presumably Phred+33 encoded base qualities below _Q - TODO confirm).
filter_quals = ''.join(map(chr, range(33, 33 + _Q)))
_ff = params['filter_flags']     # 'UNMAP,SECONDARY,QCFAIL,DUP'
is_loption = params['loption']   # True
log_path = 'output/logs'         # log_file: 'output/logs/'



### function args
will be passed during the function call

In [4]:
# Inputs and output location for this particular run.
args.update({
    'mut_file': '/Users/mahtin/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/anno/test_rel.csv',
    'tumor_bam': '/Users/mahtin/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/bam/test_Rel1.bam',
    'pon_list': '/Users/mahtin/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_pon.list',
    'output_path': 'output/test_rel_eb.csv',
    'region': '',
})
# The log file is named after the output file: <log_path>/<output file stem>.log
log_file = os.path.join(
    log_path,
    os.path.splitext(os.path.basename(args['output_path']))[0] + '.log',
)
# Bundle the shared settings into the single state dict `_`.
_ = dict(
    q=_q,
    Q=_Q,
    filter_quals=filter_quals,
    ff=_ff,
    log=log_file,
    threads=threads,
    sep=sep,
    debug_mode=debug_mode,
)

# Using dataframes

In [5]:
##### RUNNIT
_['threads'] = 1
_['debug_mode'] = True
run.main(args,_)
!ls output

     Chr      Start        End  Ref  Alt  depth  \
0      1    3649562    3649562    G    A      7   
1      1   53099279   53099279    C    T     14   
2      2   29416366   29416366    G    C      6   
3      2   85097574   85097574    G    A      8   
4      2  210685100  210685100    G    A      4   
5      4  140651584  140651587  CTG    -      9   
6      5  176930171  176930174  GAG    -      5   
7      6   30996643   30996643    T    C     25   
8      6   31084942   31084945  CTT    -      8   
9      7    1040174    1040174    G    A      5   
10     7  100550508  100550508    A    C     14   
11     7  100550515  100550515    T    A     16   
12     7  100550549  100550549    C    T     14   
13     7  100550793  100550793    C    T     59   
14     7  100550804  100550804    C    T     61   
15     7  100550813  100550813    A    G     66   
16     7  100550819  100550819    C    A     62   
17     7  100550829  100550829    C    G     61   
18     7  100550841  100550841 

     Chr      Start        End  Ref  Alt  depth  \
0      1    3649562    3649562    G    A      7   
1      1   53099279   53099279    C    T     14   
2      2   29416366   29416366    G    C      6   
3      2   85097574   85097574    G    A      8   
4      2  210685100  210685100    G    A      4   
5      4  140651584  140651587  CTG    -      9   
6      5  176930171  176930174  GAG    -      5   
7      6   30996643   30996643    T    C     25   
8      6   31084942   31084945  CTT    -      8   
9      7    1040174    1040174    G    A      5   
10     7  100550508  100550508    A    C     14   
11     7  100550515  100550515    T    A     16   
12     7  100550549  100550549    C    T     14   
13     7  100550793  100550793    C    T     59   
14     7  100550804  100550804    C    T     61   
15     7  100550813  100550813    A    G     66   
16     7  100550819  100550819    C    A     62   
17     7  100550829  100550829    C    G     61   
18     7  100550841  100550841 

## importing pileup directly into dataframe

In [1]:
import pandas as pd
from io import StringIO
import subprocess
import csv

bam = 'testdata/tumor.bam'
cmd = ["samtools", "mpileup", "-B", "-d", "10000000", bam]
# check=True makes a failing samtools call raise CalledProcessError instead of
# silently handing an empty/partial pileup to pandas.
pileup = subprocess.run(cmd, stdout=subprocess.PIPE, check=True)
b = StringIO(pileup.stdout.decode('utf-8'))
# QUOTE_NONE: the read and quality columns are free-form ASCII and may contain
# '"', which the default quoting would interpret as the start of a quoted field.
# NOTE(review): samtools documents the third mpileup column as the reference
# base; the label 'Alt' is kept because later cells look it up by that name.
pile_df = pd.read_csv(
    b,
    sep='\t',
    header=None,
    names=['Chr', 'Start', 'Alt', 'depth', 'read', 'quality'],
    quoting=csv.QUOTE_NONE,
)
pile_df[:14]

Unnamed: 0,Chr,Start,Alt,depth,read,quality
0,chr2,91842923,N,1,^!C,I
1,chr2,91842924,N,1,A,I
2,chr2,91842925,N,1,C,I
3,chr2,91842926,N,1,C,I
4,chr2,91842927,N,1,A,I
5,chr2,91842928,N,1,A,I
6,chr2,91842929,N,1,C,I
7,chr2,91842930,N,1,C,I
8,chr2,91842931,N,1,A,I
9,chr2,91842932,N,1,C,I


### using apply on the whole dataframe

In [57]:
def useit(row):
    """Return "<Alt> and <read>" built from the row's 'Alt' and 'read' entries."""
    alt, read = row['Alt'], row['read']
    return "{} and {}".format(alt, read)

    
# Apply useit to every row (axis=1) and show the first 13 results.
pile_df.apply(useit, axis=1).head(13)

0     N and ^!C
1       N and A
2       N and C
3       N and C
4       N and A
5       N and A
6       N and C
7       N and C
8       N and A
9       N and C
10      N and A
11      N and T
12      N and T
dtype: object

In [11]:
import pandas as pd
# One unnamed column (label 0) of strings; the second list supplies the row labels.
test = pd.DataFrame(data=['abc', 'aaaccd'], index=['aaccaa', 'aaatccca'])

In [17]:
# NOTE: test[0, 0] asks for a column labelled (0, 0), not for the cell at row 0 /
# column 0, which is why this raises the KeyError recorded below.
# Positional access to the first cell would be test.iloc[0, 0].
test[0,0].value_counts()

KeyError: (0, 0)

## joining pileup and mut_df into one

In [42]:
import pandas as pd
import csv

mut_df = pd.read_csv('/Users/mahtin/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/anno/test_rel.csv', sep='\t').sort_values(['Chr', 'Start'])
# Rows with Alt == '-' get Start decremented by one before joining on
# ['Chr', 'Start'] (presumably because the pileup reports a deletion on the
# base preceding it - TODO confirm).
mut_df['Start'] -= (mut_df['Alt'] == '-')

# target
# QUOTE_NONE: read/quality strings are free-form ASCII and may contain '"'.
pileup_df = pd.read_csv(
    'output/test_rel_eb.target.pileup',
    header=None,
    sep='\t',
    names=['Chr', 'Start', 'Ref', 'depth', 'read', 'Q'],
    dtype={'Chr': int, 'Start': int, 'Ref': str, 'depth': int, 'read': str, 'Q': str},
    quoting=csv.QUOTE_NONE,
).drop(columns='Ref')
# Join on the key columns only; pandas does not accept `on` together with
# `left_index=True`.
mut_df = pd.merge(left=mut_df, right=pileup_df, on=['Chr', 'Start'], how='outer')

# control
# One depth/read/Q triple per control sample; 10 is hard-coded here and
# presumably has to match the number of BAMs in the PoN list - TODO confirm.
names = ['Chr', 'Start', 'Ref']
for i in range(10):
    names += [f"depth{i}", f"read{i}", f"Q{i}"]
control_df = pd.read_csv(
    'output/test_rel_eb.control.pileup',
    header=None,
    sep='\t',
    names=names,
    quoting=csv.QUOTE_NONE,
).drop(columns='Ref')
mut_df = pd.merge(left=mut_df, right=control_df, on=['Chr', 'Start'], how='outer')
mut_df
# Inverse of the Start shift above, currently disabled:
# mut_df['Start'] += (mut_df['Alt'] == '-')

Unnamed: 0,Chr,Start,End,Ref,Alt,mut_type,somatic_status,depth,read,Q,...,Q6,depth7,read7,Q7,depth8,read8,Q8,depth9,read9,Q9
0,1,3649562,3649562,G,A,snp,Germline,7,aAAgGGA,BAcAb=A,...,aD,1,g,B,5,gaGaG,DCb@?,4,gGAg,DcAE
1,1,53099279,53099279,C,T,snp,Germline,14,ccTCCCCTtCCTTt,DD@CgCe=B@C`@B,...,C,3,CCC,CCC,4,CCCC,Chhg,6,TCCTtC,cCe@bB
2,2,29416366,29416366,G,C,snp,Germline,6,CCcCcc,EWDBEA,...,h,2,Gc,D6,0,*,*,2,cG,Eh
3,2,85097574,85097574,G,A,snp,Germline,8,AaaGAa^]G^]A,A@@?A@6@,...,*,1,G,?,0,*,*,2,Gg,cD
4,2,210685100,210685100,G,A,snp,Germline,4,aaA^]A,??A@,...,@>A,0,*,*,1,G,?,0,*,*
5,4,140651584,140651587,CTG,-,indel,Germline,9,tT-3NNNT-3NNNT-3NNNt-3nnnT-3NNNT-3NNNT-3NNNt,BI@aA@@@>,...,aA,1,T-3NNN,b,2,T-3NNNT-3NNN,BA,1,t,B
6,5,176930171,176930174,GAG,-,indel,Germline,5,A-3NNNA-3NNNa-3nnna-3nnnA,=ABB@,...,?,2,aA,B@,0,*,*,1,a-3nnn,C
7,6,30996643,30996643,T,C,snp,Germline,25,tTTtTtttTCTTTcTctcCCcCCT^]T,A=[?`>??BS@2BEB<;=Cg=e@A_,...,BB,1,T,B,7,tTTTTTt,?2dDa@?,3,TtT,a?B
8,6,31084942,31084945,CTT,-,indel,Germline,8,GG-3NNNg-3nnnGg-3nnnGG-3NNNG,iEDiEECE,...,FE,0,*,*,6,GGGGgG,hjijEh,1,g,D
9,7,1040174,1040174,G,A,snp,Germline,5,AAAAa,^????,...,*,1,g,D,0,*,*,0,*,*


In [39]:
# Column labels for the control pileup: three key columns followed by a
# depth/read/Q triple for each of the 10 numbered samples.
names = ['Chr', 'Start', 'Ref']
for i in range(10):
    names.extend(f"{field}{i}" for field in ('depth', 'read', 'Q'))
names

['Chr',
 'Start',
 'Ref',
 'depth0',
 'read0',
 'Q0',
 'depth1',
 'read1',
 'Q1',
 'depth2',
 'read2',
 'Q2',
 'depth3',
 'read3',
 'Q3',
 'depth4',
 'read4',
 'Q4',
 'depth5',
 'read5',
 'Q5',
 'depth6',
 'read6',
 'Q6',
 'depth7',
 'read7',
 'Q7',
 'depth8',
 'read8',
 'Q8',
 'depth9',
 'read9',
 'Q9']

In [17]:
# Display the current contents of mut_df.
mut_df

Unnamed: 0,Chr,Start,End,Ref,Alt,mut_type,somatic_status,depth,read,Q
0,1,3649562,3649562,G,A,snp,Germline,7,aAAgGGA,BAcAb=A
1,1,53099279,53099279,C,T,snp,Germline,14,ccTCCCCTtCCTTt,DD@CgCe=B@C`@B
2,2,29416366,29416366,G,C,snp,Germline,6,CCcCcc,EWDBEA
3,2,85097574,85097574,G,A,snp,Germline,8,AaaGAa^]G^]A,A@@?A@6@
4,2,210685100,210685100,G,A,snp,Germline,4,aaA^]A,??A@
5,4,140651585,140651587,CTG,-,indel,Germline,9,tT-3NNNT-3NNNT-3NNNt-3nnnT-3NNNT-3NNNT-3NNNt,BI@aA@@@>
6,5,176930172,176930174,GAG,-,indel,Germline,5,A-3NNNA-3NNNa-3nnna-3nnnA,=ABB@
7,6,30996643,30996643,T,C,snp,Germline,25,tTTtTtttTCTTTcTctcCCcCCT^]T,A=[?`>??BS@2BEB<;=Cg=e@A_
8,6,31084943,31084945,CTT,-,indel,Germline,8,GG-3NNNg-3nnnGg-3nnnGG-3NNNG,iEDiEECE
9,7,1040174,1040174,G,A,snp,Germline,5,AAAAa,^????


## Single threading

In [6]:
##### RUNNIT
_['threads'] = 1
_['debug_mode'] = False
main(args,_)
!ls output

AML033_rel_eb.csv [34mlogs[m[m              test_rel_eb.csv


## Multithreading

In [9]:
_['threads'] = 3
_['debug_mode'] = False
run.main(args,_)
!ls output/

AML033_rel_eb.csv                [34mlogs[m[m
AML033_rel_eb.csv.control.pileup test_rel_eb.csv
AML033_rel_eb.csv.target.pileup


### trying with multiprocessing Pool

In [12]:
from functools import partial
from multiprocessing import Pool
def show(a, b, c, d):
    """Combine the four arguments with `+`, left to right, and return the result."""
    combined = a + b
    combined = combined + c
    return combined + d
a = 'a'
b = 'b'
c = 'c'
itera = ['d', 'e', 'f']
# Freeze the first three arguments; the pool only has to supply the fourth.
show_part = partial(show, a, b, c)
show_part('t')  # quick serial call; the result is discarded
# The context manager releases the three workers even if map() raises.
with Pool(3) as pool:
    result = pool.map(show_part, itera)
result


['abcd', 'abce', 'abcf']