# Implementation of the EBfilter by genomon

* EBrun (originally EBFilter) is an argparse wrapper that passes the command-line arguments to run.py (it is not needed for internal use)
* passed arguments:
    * targetMutationFile: the .vcf or .anno containing the mutations
    * targetBamPath: path to the tumor bam file (+.bai)
    * controlBamPathList: text file listing the paths to the PoN bam files (+ .bai)
    * outputPath
    * -q option for mapping quality threshold – not needed --> default _q from config
    * -Q option for base quality threshold – not needed --> default _Q from config
    * --region option for restriction of regions on mpileup -l

## Initiation

### imports

In [1]:
import os
from code import run

### snakemake config

In [2]:
# EBFilter parameters, kept under their own name because later cells read `params` directly
params = {
    'map_quality': 20,
    'base_quality': 15,
    'filter_flags': 'UNMAP,SECONDARY,QCFAIL,DUP',
    'loption': True,
    'fitting_penalty': 0.5,
}
# stand-in for the snakemake config: an 'EB' section plus the annovar separator
config = {
    'EB': {'run': True, 'threads': 1, 'params': params},
    'annovar': {'sep': '\t'},
}

### load the config and GLOBAL STATE into variable _

In [3]:
# notebook-wide switches and the (still empty) per-call argument dict
debug_mode = True
args = {}

# unpack the pieces of the config that the run needs
params = config['EB']['params']
threads = config['EB']['threads']
sep = config['annovar']['sep']

_q = str(params['map_quality'])        # mapping quality, kept as a string
_Q = params['base_quality']            # base quality threshold
fit_pen = params['fitting_penalty']
_ff = params['filter_flags']           # 'UNMAP,SECONDARY,QCFAIL,DUP'
is_loption = params['loption']         # True

# the _Q ASCII characters starting at code point 33: the quality symbols to filter out
filter_quals = ''.join(map(chr, range(33, 33 + _Q)))

log_path = 'output/logs'               # directory that receives the log file



### function args
will be passed during the function call

In [4]:
import os

# expanduser('~') honours $HOME when it is set and still resolves a home
# directory when it is not, where os.environ['HOME'] would raise KeyError
HOME = os.path.expanduser('~')
# common root of the test data, spelled once instead of three times
test_data = os.path.join(HOME, 'Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata')
args['mut_file'] = os.path.join(test_data, 'anno/test_rel.csv')
args['tumor_bam'] = os.path.join(test_data, 'bam/test_Rel1.bam')
args['pon_list'] = os.path.join(test_data, 'aml_pon.list')
args['output_path'] = 'output/test_rel_eb.csv'
args['region'] = ''
# log file is named after the output file: output/test_rel_eb.csv -> <log_path>/test_rel_eb.log
log_file = os.path.join(log_path, f"{os.path.splitext(os.path.basename(args['output_path']))[0]}.log")
# filter_quals is deliberately NOT re-assigned here: the value derived from
# base_quality in the global-state cell is used, so changing the threshold in
# the config actually changes the filter (for base_quality=15 it is the same
# string that used to be hard-coded in this cell)
# dump into _
_ = {
    'q': _q,
    'Q': _Q,
    'filter_quals': filter_quals,
    'fitting_penalty': fit_pen,
    'ff': _ff,
    'log': log_file,
    'threads': threads,
    'sep': sep,
    'debug_mode': debug_mode,
}
                        
                        

# Using dataframes

In [6]:
##### RUNNIT
_['threads'] = 1
_['debug_mode'] = False
run.main(args,_)
!ls output

NameError: name 'subprocess' is not defined

## final merge

In [18]:
import pandas as pd
# read the annotation from args['mut_file'] (the same test_rel.csv) instead of a
# hard-coded /Users/<name>/... path that only exists on one machine
anno_df = pd.read_csv(args['mut_file'], sep=sep).sort_values(['Chr', 'Start'])
# EB-score table to inspect
out_df = pd.read_csv('output/test_rel_eb_only.csv', sep='\t')
out_df

Unnamed: 0,Chr,Start,End_x,Ref_x,Alt_x,mut_type,somatic_status,End_y,Ref_y,Alt_y,EB_score
0,1,3649562,3649562,G,A,snp,Germline,3649562,G,A,13.080
1,1,53099279,53099279,C,T,snp,Germline,53099279,C,T,60.000
2,2,29416366,29416366,G,C,snp,Germline,29416366,G,C,13.207
3,2,85097574,85097574,G,A,snp,Germline,85097574,G,A,9.094
4,2,210685100,210685100,G,A,snp,Germline,210685100,G,A,3.886
5,4,140651585,140651587,CTG,-,indel,Germline,140651587,CTG,-,60.000
6,5,176930172,176930174,GAG,-,indel,Germline,176930174,GAG,-,60.000
7,6,30996643,30996643,T,C,snp,Germline,30996643,T,C,60.000
8,6,31084943,31084945,CTT,-,indel,Germline,31084945,CTT,-,60.000
9,7,1040174,1040174,G,A,snp,Germline,1040174,G,A,5.224


## transforming for EB_score

## getting the count matrix from a df row

In [11]:
import pandas as pd
import numpy as np
from functools import partial

mut_df = pd.read_csv('output/test_rel_clean.csv', sep='\t')
# for snps
row = pd.Series(mut_df.iloc[1])
var = row['Alt']
# every third field starting at position 6 (the read0, read1, ... columns in the output below)
reads = row.iloc[6::3]
# one line per read column: upper-/lower-case base counts and upper-/lower-case matches of the Alt allele
matrix = pd.DataFrame({
    'depth_p': reads.str.count(r'[ACTG]'),
    'mm_p': reads.str.count(var),
    'depth_n': reads.str.count(r'[actg]'),
    'mm_n': reads.str.count(var.lower()),
})
matrix

Unnamed: 0,depth_p,mm_p,depth_n,mm_n
read0,10,4,4,2
read1,3,3,0,0
read2,2,0,1,0
read3,1,0,0,0
read4,4,0,1,0
read5,1,0,3,0
read6,3,0,0,0
read7,1,0,0,0
read8,3,0,0,0
read9,4,0,0,0


### performing EBscore

In [13]:
import numpy as np
from scipy.special import gammaln
# everything from 'read1' onwards is treated as control; 'read0' is left out
control_df = matrix.loc['read1':]
# per-suffix (depth, mismatch) pairs of the control rows
control_p = control_df[['depth_p', 'mm_p']]
control_n = control_df[['depth_n', 'mm_n']]

# Maps a (n, k) count row onto the nine log-gamma arguments (before the
# constant/a/b offsets are added): n, k, n-k, n, k, n-k, 0, 0, 0
KS_matrix = np.array([
    [1, 0, 1, 1, 0, 1, 0, 0, 0],
    [0, 1, -1, 0, 1, -1, 0, 0, 0],
])
# sign with which each of the nine log-gamma terms enters the likelihood
gamma_reduce = np.array([1, -1, -1, -1, 1, 1, 1, -1, -1])


def bb_loglikelihood(params, count_df):
    """Return the summed beta-binomial log-likelihood of the rows of *count_df*.

    Args:
        params: pair ``(a, b)`` of shape parameters.
        count_df: DataFrame with two columns; the first is used as n and the
            second as k (e.g. the ``depth_p`` / ``mm_p`` columns).

    Returns:
        Sum over all rows of
        ``lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1) - lgamma(n+a+b)
        + lgamma(k+a) + lgamma(n-k+b) + lgamma(a+b) - lgamma(a) - lgamma(b)``.
    """
    a, b = params
    # offsets that turn the nine count columns into the log-gamma arguments
    offsets = np.array([1, 1, 1, a + b, a, b, a + b, a, b])
    # (rows x 2) @ (2 x 9) -> one line of nine arguments per count row
    gamma_args = count_df.values @ KS_matrix + offsets
    # add up each term over all rows first, then combine the nine terms with their signs
    per_term = gammaln(gamma_args).sum(axis=0)
    return (per_term * gamma_reduce).sum()
# NOTE: this rebinds `params` (until now the config parameter dict) to the
# [a, b] pair that bb_loglikelihood unpacks
params = [.4,.5]
# display the control counts that would be scored
control_p
# bb_loglikelihood(params, control_p)

Unnamed: 0,depth_p,mm_p
read1,10,0
read2,8,0
read3,10,0
read4,8,0
read5,13,0
read6,14,0
read7,9,0
read8,11,0
read9,17,1
read10,17,1


### getting the p-values for target pairs

In [226]:
# depth_p / mm_p entries of the 'read0' row of the count matrix
test = matrix.loc['read0',['depth_p', 'mm_p']]
test

depth_p    44
mm_p        1
Name: read0, dtype: int64

In [219]:
# (depth, mismatch) pair of the 'read0' row
target_p = matrix.loc['read0', ['depth_p', 'mm_p']]
# NOTE(review): this slice starts at 'read0', so it includes the same row that
# is used as target above; the EBscore cell builds its control from 'read1' on.
# Confirm which one is intended.
control_n = matrix.loc['read0':, ['depth_n', 'mm_n']]
# label-based access: integer keys on a label-indexed Series (target_p[0]) are
# deprecated positional look-ups in current pandas
n_minus_k = target_p['depth_p'] - target_p['mm_p']
# one (depth, mismatch) pair per mismatch count from the observed k up to the full depth
df_list = [target_p + [0, i] for i in range(n_minus_k + 1)]
df_list

[depth_p    44
 mm_p        1
 Name: read0, dtype: int64, depth_p    44
 mm_p        2
 Name: read0, dtype: int64, depth_p    44
 mm_p        3
 Name: read0, dtype: int64, depth_p    44
 mm_p        4
 Name: read0, dtype: int64, depth_p    44
 mm_p        5
 Name: read0, dtype: int64, depth_p    44
 mm_p        6
 Name: read0, dtype: int64, depth_p    44
 mm_p        7
 Name: read0, dtype: int64, depth_p    44
 mm_p        8
 Name: read0, dtype: int64, depth_p    44
 mm_p        9
 Name: read0, dtype: int64, depth_p    44
 mm_p       10
 Name: read0, dtype: int64, depth_p    44
 mm_p       11
 Name: read0, dtype: int64, depth_p    44
 mm_p       12
 Name: read0, dtype: int64, depth_p    44
 mm_p       13
 Name: read0, dtype: int64, depth_p    44
 mm_p       14
 Name: read0, dtype: int64, depth_p    44
 mm_p       15
 Name: read0, dtype: int64, depth_p    44
 mm_p       16
 Name: read0, dtype: int64, depth_p    44
 mm_p       17
 Name: read0, dtype: int64, depth_p    44
 mm_p       18
 

In [194]:
# ab_matrix is otherwise only defined as a local inside bb_loglikelihood, so
# it is rebuilt here to keep the cell runnable on a fresh kernel.
# Expects `params` to be the [a, b] pair set in the EBscore cell.
a, b = params
ab_matrix = np.array([1, 1, 1, a + b, a, b, a + b, a, b])
# log-gamma arguments: one line of nine values per control row
control_pm = np.matmul(control_p.values, KS_matrix) + ab_matrix
control_pm.shape

(10, 9)

In [195]:

# math is not imported anywhere else in the notebook
import math

# per-term log-gamma sums over all control rows
x = np.sum(gammaln(control_pm), axis=0)
print(x.shape)
# same signs as gamma_reduce in the EBscore cell
reductor = np.array([1,-1,-1,-1,1,1,1,-1,-1])
y = x * reductor
# back from log space to the likelihood itself
math.exp(np.sum(y))

(9,)


4.235580538201608e-09

In [197]:
# scratch check of the built-in sum
sum([1,2,3])

6


# math is not imported anywhere else in the notebook
import math
# scratch check: lgamma of a negative non-integer argument
math.lgamma(-0.3)

### adjusting for bad quality calls

In [None]:
import re
# NOTE(review): unlike the filter_quals string used for the run, this set also
# contains '?' -- confirm that this is intended.
filters = '!"#$%?&\'()*+,-./'
# escape the characters so that none of them acts as class syntax; unescaped,
# ",-." is parsed as a range and only works because those three are adjacent
filter_string = r"([" + re.escape(filters) + "])"
filter_re = re.compile(filter_string)
read = 'AAAA*BBB'
Q = 'AAAA*BBB'
# drop every position whose quality symbol is in `filters` from both strings,
# one match at a time, searching only once per iteration
m = filter_re.search(Q)
while m:
    print(Q,read, m.end(), m.group(0), m.group(1))
    Q = filter_re.sub('', Q, count=1)
    pos = m.start()
    read = read[:pos] + read[pos+1:]
    m = filter_re.search(Q)
print(read)


In [None]:
import pandas as pd
import re
from functools import partial

# NOTE(review): unlike the filter_quals string used for the run, this set also
# contains '?' -- confirm that this is intended.
filters = '!"#$%?&\'()*+,-./'
# escape the characters so that none of them acts as character-class syntax
filter_string = r"([" + re.escape(filters) + "])"
filter_re = re.compile(filter_string)


def remove_badQ(i, row):
    """Return *row* with the filtered-quality positions of sample *i* removed.

    Every position of ``row[f"Q{i}"]`` whose symbol is in ``filters`` is
    dropped from both ``Q{i}`` and ``read{i}``. The previous version computed
    the cleaned strings but returned the row unchanged, so applying it had no
    effect.

    Args:
        i: sample index selecting the ``read{i}`` / ``Q{i}`` fields.
        row: mapping (DataFrame row or dict) holding those two string fields.

    Returns:
        A copy of *row* with both fields cleaned, or *row* itself when there
        is nothing to remove. The input is never mutated.
    """
    q_col, read_col = f"Q{i}", f"read{i}"
    Q = row[q_col]
    read = row[read_col]
    # positions (in the original coordinates) that carry a filtered quality symbol
    bad_positions = {m.start() for m in filter_re.finditer(Q)}
    if not bad_positions:
        return row
    cleaned = row.copy()
    cleaned[q_col] = filter_re.sub('', Q)
    cleaned[read_col] = ''.join(
        base for pos, base in enumerate(read) if pos not in bad_positions
    )
    return cleaned
mut_df = pd.read_csv('output/test_rel_clean.csv', sep='\t')
pon_count = 10

# SNP rows: neither Ref nor Alt is the '-' placeholder
is_snp = ~((mut_df['Ref'] == '-') | (mut_df['Alt'] == '-'))
# example for row 7
# rows whose Q0 string contains at least one filtered quality symbol
has_badQ = mut_df['Q0'].str.contains(filter_re)
bad_df = mut_df[is_snp & has_badQ]
# run the row-wise cleaner for sample 0 on exactly those rows and write them back
clean_target = partial(remove_badQ, 0)
mut_df[is_snp & has_badQ] = bad_df.apply(clean_target, axis=1)


In [None]:
def remove_badQ(df, filters):
    """Print, per quality column, the SNP rows that contain a filtered symbol.

    Work-in-progress diagnostic: nothing is modified or returned.

    Args:
        df: DataFrame with ``Ref``, ``Alt`` and ``Q0`` .. ``Q{pon_count}`` columns.
        filters: string of quality symbols to look for.

    Reads the module-level ``pon_count`` for the number of control columns.
    """
    # escape so that characters such as '-' or ']' are taken literally inside the class
    filter_re = re.compile(r"[" + re.escape(filters) + "]")
    is_snp = (df['Ref'] != '-') & (df['Alt'] != '-')
    for i in range(pon_count + 1):
        Q = f"Q{i}"
        print(Q)
        # the original referenced an undefined `snp` and the non-existent
        # DataFrame `.str` accessor; na=False keeps the mask boolean when a
        # quality cell is missing
        has_bad = df[Q].str.contains(filter_re, na=False)
        bad_rows = df[is_snp & has_bad]
        print(bad_rows)


In [None]:
import re

# a filtered symbol at the very start followed by any character, or any
# character followed by a filtered symbol
filter_string = r"(^[{0}].)|(.[{0}])".format(filters)
filter_re = re.compile(filter_string)
test = "s*d"
m = filter_re.search(test)
# full text of the match
m.group(0)

In [6]:
import pandas as pd
mut_df = pd.read_csv('output/test_rel_clean.csv', sep='\t')
# show only the rows whose Alt column is the '-' placeholder
alt_is_dash = mut_df['Alt'].eq('-')
mut_df[alt_is_dash]

Unnamed: 0,Chr,Start,End,Ref,Alt,depth0,read0,Q0,depth1,read1,...,Q7,depth8,read8,Q8,depth9,read9,Q9,depth10,read10,Q10
5,4,140651585,140651587,CTG,-,9,t---_---t,BI@aA@@@>,1,_,...,aA,1,-,b,2,--,BA,1,t,B
6,5,176930172,176930174,GAG,-,5,--__A,=ABB@,1,a,...,?,2,aA,B@,0,*,*,1,_,C
8,6,31084943,31084945,CTT,-,8,G-_G_G-G,iEDiEECE,2,_G,...,FE,0,*,*,6,GGGGgG,hjijEh,1,g,D
100,11,76751604,76751604,T,-,8,-__-_---,iEEDEahD,1,-,...,ED,2,-_,hF,2,_-,FD,4,----,kjEC
110,12,76424938,76424940,TGC,-,13,Tt-____---_--,:5?BBBB??:Ba>,1,_,...,CC,2,-_,_C,3,-__,?CD,2,_t,C=
119,17,74309086,74309088,AAT,-,5,_-__C,EDEhd,2,Cc,...,i,3,cCC,AhD,0,*,*,0,*,*
124,19,30500119,30500121,TGA,-,6,G_--_-,FCEEAE,0,*,...,*,1,G,h,0,*,*,0,*,*


In [15]:
# scratch check of what `and` evaluates to when its left operand is falsy
test, test2, test3 = False, None, 'hello'
# spelled out: a falsy left operand is returned as-is, otherwise the right one
n = test3 if test else test
n

False

### letter count

In [7]:
import pandas as pd
mut_df = pd.read_csv('output/test_rel_clean.csv', sep='\t')
mut_df
# check length:
# for every sample, count the rows where the read string and its quality string differ in length
for i in range(11):
    read, Q = f"read{i}", f"Q{i}"
    same_length = mut_df[read].str.len() == mut_df[Q].str.len()
    test = mut_df[~same_length]
    print(len(test))
# look at the offending rows of one sample in detail
i = 7
read, Q = f"read{i}", f"Q{i}"
test = mut_df[~(mut_df[read].str.len() == mut_df[Q].str.len())]
test

0
0
0
0
0
0
0
0
0
0
0


Unnamed: 0,Chr,Start,End,Ref,Alt,depth0,read0,Q0,depth1,read1,...,Q7,depth8,read8,Q8,depth9,read9,Q9,depth10,read10,Q10


### cleaning the data

In [6]:
import pandas as pd
import re
from functools import partial

mut_df = pd.read_csv('output/test_rel_eb.csv', sep='\t')

# remove start_end signs in reads
# '^' plus the character that follows it, or '$'
sign_re = re.compile(r'\^.|\$')
# '+' or '-' followed by digits; the digits are captured as group 1
indel_simple = re.compile(r'[\+\-]([0-9]+)')

# regex=True is required: from pandas 2.0 on str.replace defaults to literal
# matching and rejects a compiled pattern
mut_df['read'] = mut_df['read'].str.replace(sign_re, '', regex=True)
for i in range(10):
    read = f"read{i}"
    mut_df[read] = mut_df[read].str.replace(sign_re, '', regex=True)
# rows where either allele is the '-' placeholder
is_indel = (mut_df['Ref'] == '-') | (mut_df['Alt'] == '-')
def clean_indels(i, row):
    """Collapse the ``<base><+|-><N><N bases>`` indel notation in the read fields.

    The length N is taken from the first ``+N`` / ``-N`` marker found in
    ``row['read']``; every occurrence of a base followed by a sign, digits and
    exactly N bases is then reduced to the leading base, in ``row['read']``
    and in ``row['read0']`` .. ``row[f'read{i-1}']``.

    Args:
        i: number of ``read{n}`` fields to clean in addition to ``read``.
        row: DataFrame row (modified in place and returned).

    Returns:
        The row; unchanged when ``row['read']`` carries no indel marker.
    """
    target_read = row['read']
    # a missing read cell or a read without any +N/-N marker gives nothing to
    # clean; the original raised AttributeError on the failed search here
    length_match = (
        indel_simple.search(target_read) if isinstance(target_read, str) else None
    )
    if length_match is None:
        return row
    indel_length = length_match.group(1)
    # NOTE(review): the length found in the target read is reused for all other
    # reads; markers of a different length in those reads are not collapsed.
    indel_re = re.compile(
        r"([ACGTNacgtn])([\+\-])([0-9]+)([ACGTNacgtn]{" + indel_length + "})"
    )
    # separate loop variable: the original re-used the parameter name `i`
    columns = ['read'] + [f"read{idx}" for idx in range(i)]
    for column in columns:
        # skip cells that are not strings (e.g. missing values)
        if isinstance(row[column], str):
            row[column] = indel_re.sub(r'\1', row[column])
    return row
# row-wise cleaner with the number of extra read columns fixed to 10
clean_indels_i = partial(clean_indels, 10)

# clean the indel rows only, write them back, then display them
mut_df[is_indel] = mut_df.loc[is_indel].apply(clean_indels_i, axis=1)
mut_df.loc[is_indel]

# mut_df['read'].replace(sign_re, '')




FileNotFoundError: [Errno 2] File b'output/test_rel_eb.csv' does not exist: b'output/test_rel_eb.csv'

## importing pileup directly into dataframe

In [None]:
import pandas as pd
from io import StringIO
import subprocess
import csv

# the original opened 'output/testpile' for writing without ever writing to
# it, which only created or truncated that file; the pileup is read straight
# from the pipe
bam = 'testdata/tumor.bam'
cmd = ["samtools", "mpileup", "-B", "-d", "10000000", bam]
# check=True raises CalledProcessError when samtools exits non-zero instead of
# handing empty output to the parser
pileup = subprocess.run(cmd, stdout=subprocess.PIPE, check=True)
b = StringIO(pileup.stdout.decode('utf-8'))
# QUOTE_NONE: '"' is one of the quality symbols and must not be read as a CSV quote
# NOTE(review): the third column is called 'Alt' here but 'Ref' when the pileup
# files are read in the joining cell -- confirm the intended name.
pile_df = pd.read_csv(
    b,
    sep='\t',
    header=None,
    names=['Chr', 'Start', 'Alt', 'depth', 'read', 'quality'],
    quoting=csv.QUOTE_NONE,
)
pile_df[:14]

### using apply on the whole dataframe

In [None]:
def useit(row):
    """Return the row's 'Alt' and 'read' fields joined by ' and '."""
    return "{} and {}".format(row['Alt'], row['read'])

    
# call useit once per row and show the first 13 results
pile_df.apply(useit, axis=1).iloc[:13]

In [None]:
import pandas as pd
# the second positional argument of DataFrame is the index, so this is a
# single-column frame whose row labels are the two longer strings
test = pd.DataFrame(data=['abc', 'aaaccd'], index=['aaccaa', 'aaatccca'])

In [None]:
# test[0, 0] is a column look-up for the key (0, 0) and raises KeyError; take
# the first cell by position and count its characters instead
# NOTE(review): assumes the goal was a per-letter count of that string -- confirm.
pd.Series(list(test.iloc[0, 0])).value_counts()

## joining pileup and mut_df into one

In [None]:
import pandas as pd
import csv

# args['mut_file'] is the same test_rel.csv, without the machine-specific /Users/<name> prefix
mut_df = pd.read_csv(args['mut_file'], sep='\t').sort_values(['Chr', 'Start'])
# Start is decreased by one on rows whose Alt is '-' (see the commented-out reversal at the end)
mut_df['Start'] -= (mut_df['Alt'] == '-')
# QUOTE_NONE: '"' is one of the quality symbols and must not be read as a CSV quote
pileup_df = pd.read_csv(
    'output/test_rel_eb.target.pileup',
    header=None,
    sep='\t',
    names=['Chr', 'Start', 'Ref', 'depth', 'read', 'Q'],
    dtype={'Chr': int, 'Start': int, 'Ref': str, 'depth': int, 'read': str, 'Q': str},
    quoting=csv.QUOTE_NONE,
).drop(columns='Ref')
pileup_df
# join on the key columns only: combining `on` with `left_index=True` is
# rejected by current pandas with a MergeError
mut_df = pd.merge(left=mut_df, right=pileup_df, on=['Chr', 'Start'], how='outer')
# control
names = ['Chr', 'Start','Ref']
for i in range(10):
    names += [f"depth{i}", f"read{i}", f"Q{i}"]
control_df = pd.read_csv(
    'output/test_rel_eb.control.pileup',
    header=None,
    sep='\t',
    names=names,
    quoting=csv.QUOTE_NONE,
).drop(columns='Ref')
mut_df = pd.merge(left=mut_df, right=control_df, on=['Chr', 'Start'], how='outer')
mut_df
# mut_df['Start'] += (mut_df['Alt'] == '-')

In [None]:
# three key columns followed by a depth/read/Q triple for each of the 10 samples
names = ['Chr', 'Start', 'Ref'] + [
    f"{field}{i}" for i in range(10) for field in ('depth', 'read', 'Q')
]
names

In [None]:
# display the current state of mut_df
mut_df

## Single threading

In [None]:
##### RUNNIT
_['threads'] = 1
_['debug_mode'] = False
main(args,_)
!ls output

## Multithreading

In [None]:
_['threads'] = 3
_['debug_mode'] = False
run.main(args,_)
!ls output/

### trying with multiprocessing Pool

In [None]:
from functools import partial
from multiprocessing import Pool
def show(a,b,c,d):
    """Return a + b + c + d, added left to right."""
    total = a
    for part in (b, c, d):
        total = total + part
    return total
a = 'a'
b = 'b'
c = 'c'
itera = ['d', 'e', 'f']
# fix the first three arguments; the pool supplies the fourth
show_part = partial(show, a, b, c)
show_part('t')
# the context manager shuts the workers down even when map() raises; the
# original only called close() on success and never joined the pool
with Pool(3) as pool:
    result = pool.map(show_part, itera)
result
