# Idea: ABcache
## getting a precomputed ABcache file containing the parameters for beta-binomial distribution from the PoN-list

# Running EBfilter createCache on testdata

## setting the config

In [1]:
cd ..

/Users/martinscience/Sites/Python/EBFilter


In [2]:
from codes import run

# snakemake config
config = {'EB':{'run': True}}
params = {}
params['map_quality'] = 20
params['base_quality'] = 15
params['filter_flags'] = 'UNMAP,SECONDARY,QCFAIL,DUP'
params['fitting_penalty'] = .5
params['caching'] = True
# to simulate snakemake behavior
config['EB']['threads'] = 1
config['EB']['params'] = params
config['EB']
config['annovar'] = {'sep': '\t'}
config['EB']['log'] = 'output/logfile'


params = config['EB']['params']
threads = config['EB']['threads']
log = config['EB']['log']
sep = config['annovar']['sep']
_q = str(params['map_quality'])  # mapping quality=20
_Q = params['base_quality']      # base quality=15
fit_pen = params['fitting_penalty']
filter_quals = ''
for qual in range( 33, 33 + _Q ): 
    filter_quals += chr( qual )  # qual asciis for filtering out
_ff = params['filter_flags']     # 'UNMAP,SECONDARY,QCFAIL,DUP'
config = {'q':_q, 'Q':_Q, 'filter_quals': filter_quals, 'log':log, 'fitting_penalty': fit_pen, 'ff':_ff, 'threads':threads, 'sep': sep}


## running makeEBcache on testdata

In [3]:
args = {}
args['pon_list'] = 'testdata/PoN_list.txt'
args['cache_folder'] = 'output/testdata_cache' # provide a folder for storing the data (snakemake config)
args['force_caching'] = True                  # force cache generation although no bed_file is provided
args['bed_file'] = 'testdata/input.bed'
args['generate_cache'] = True

In [None]:
config['threads'] = 3
config['debug_mode'] = False
run.main(args, config)
!ls output

Generating Cache...
Generating  pileup for chromosome chr1..
Generating  pileup for chromosome chr4..
Generating  pileup for chromosome chr7..
Splitting bam files for chromosome chr4..
Splitting bam files for chromosome chr7..
Splitting bam files for chromosome chr1..
Pileup for chromosome chr7 is empty and will be dropped..
Pileup for chromosome chr1 is empty and will be dropped..
Pileup for chromosome chr4 is empty and will be dropped..
Generating  pileup for chromosome chr5..
Generating  pileup for chromosome chr2..
Generating  pileup for chromosome chr8..
Splitting bam files for chromosome chr5..
Splitting bam files for chromosome chr2..
Splitting bam files for chromosome chr8..
Pileup for chromosome chr5 is empty and will be dropped..
Pileup for chromosome chr8 is empty and will be dropped..
Pileup for chromosome chr2 is empty and will be dropped..
Generating  pileup for chromosome chr3..
Generating  pileup for chromosome chr9..
Generating  pileup for chromosome chr6..
Splitting b

### importing anno_df

In [39]:
import pandas as pd
def to_int(Chr_name):
    '''
    converts all number chromosomes to int
    '''
    try:
        return int(Chr_name)
    except ValueError:
        return Chr_name
anno_df = pd.read_csv('testdata/input.anno', sep='\t', converters={0:to_int, 1:to_int, 2:to_int})
rest_columns = [f'other{i+1}' for i in range(len(anno_df.columns) - 5)]
anno_df.columns = ['Chr','Start','End','Ref', 'Alt'] + rest_columns
test = "chr11"
anno_df_chr = anno_df.query(f'Chr == "{test}"')
anno_df_chr


Unnamed: 0,Chr,Start,End,Ref,Alt
0,chr11,562012,562012,C,T
1,chr11,824202,824202,C,A
2,chr11,1013896,1013896,C,T
3,chr11,1081746,1081746,G,C
4,chr11,1277322,1277322,G,T
5,chr11,2418116,2418116,C,A
6,chr11,3680752,3680752,G,A
7,chr11,5012707,5012707,C,G
8,chr11,5221726,5221726,A,G


### importing and inspecting cache file

In [196]:
AB_chr11 = pd.read_csv('output/testdata_cache/chr11.cache', sep=',').set_index(['Chr', 'Start'])
AB_columns = pd.MultiIndex.from_product([['A','C','T','G'],['+', '-'],['a','b']], names=['var', 'strand', 'param'])
AB_chr11.columns = AB_columns
AB_chr11 = AB_chr11.stack('var')
AB_chr11
# AB_chr11.loc[('chr11', 192951),:]

Unnamed: 0_level_0,Unnamed: 1_level_0,strand,+,+,-,-
Unnamed: 0_level_1,Unnamed: 1_level_1,param,a,b,a,b
Chr,Start,var,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
chr11,192951,A,0.10000,1.00000,0.100000,1.000000
chr11,192951,C,0.10000,1.00000,0.100000,1.000000
chr11,192951,G,0.10000,1.00000,0.100000,1.000000
chr11,192951,T,0.10000,1.00000,0.100000,1.000000
chr11,192952,A,0.10000,1.00000,0.100000,1.000000
chr11,192952,C,0.10000,1.00000,0.100000,1.000000
chr11,192952,G,0.10000,1.00000,0.100000,1.000000
chr11,192952,T,0.10000,1.00000,0.100000,1.000000
chr11,192953,A,0.10000,1.00000,0.100000,1.000000
chr11,192953,C,0.10000,1.00000,0.100000,1.000000


### merge the anno_df and AB_df

In [193]:
merged = anno_df_chr.merge(AB_chr11, left_on=['Chr', 'Start', 'Alt'], right_index=True)

merged

Unnamed: 0,Chr,Start,End,Ref,Alt,"(+, a)","(+, b)","(-, a)","(-, b)"
0,chr11,562012,562012,C,T,0.1,7.21454,0.1,7.214537
1,chr11,824202,824202,C,A,0.7332,26.11528,0.7332,26.115282
2,chr11,1013896,1013896,C,T,0.1,4.0768,0.1,4.076797
3,chr11,1081746,1081746,G,C,0.31085,22.43849,0.310846,22.438488
4,chr11,1277322,1277322,G,T,0.1,1.0,0.1,1.0
5,chr11,2418116,2418116,C,A,1.2278,44.73086,1.227801,44.730861
6,chr11,3680752,3680752,G,A,0.1,1.0,0.1,1.0
7,chr11,5012707,5012707,C,G,0.22517,14.29558,0.22517,14.295578
8,chr11,5221726,5221726,A,G,0.75321,43.39363,0.753206,43.393626


In [148]:
row = merged.iloc[0]
var = row.name[4]
row = row[var]
row

strand  param
+       a        0.100000
        b        7.214540
-       a        0.100000
        b        7.214537
Name: (chr11, 562012, 562012, C, T), dtype: float64

* Variante 2:

In [179]:
merged = anno_df_chr.merge(AB_chr11, on=['Chr', 'Start']).set_index(['Chr', 'Start', 'End', 'Ref', 'Alt'])
merged.columns = AB_columns
merged = merged.reset_index()
row = merged.iloc[0]
var = row[('Alt','','')]
row[var]


strand  param
+       a            0.1
        b        7.21454
-       a            0.1
        b        7.21454
Name: 0, dtype: object

In [124]:
def get_AB(row):
    s = row.loc[row['Alt']]
    return s
right 
merged[merged.apply(get_AB, axis=1)
       

SyntaxError: unexpected EOF while parsing (<ipython-input-124-c7f8d163f647>, line 6)

### running EBscore in cache_mode on testdata

In [None]:
args = {}
args['mut_file'] = 'testdata/input.anno'
args['tumor_bam'] = 'testdata/tumor.bam'
args['pon_list'] = 'testdata/PoN_list.txt'
args['output_path'] = 'output/testdata_eb.csv'
args['cache_folder'] = 'output/testdata_cache'
args['generate_cache'] = False

In [None]:
config['threads'] = 3
config['debug_mode'] = False
run.main(args, config)
!ls output

# Running EBfilter createCache on my data

### Setting my testdata as arguments

In [None]:
import os
HOME = os.environ['HOME'] # set HOME to run on different Macs
args = {}
args['pon_list'] = f'{HOME}/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_pon.list'
args['cache_folder'] = f'{HOME}/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_pon.ABcache'
args['force_caching'] = True
args['bed_file'] = None
args['generate_cache'] = True

In [None]:
config['threads'] = 3
config['debug_mode'] = True
run.main(args, config)
!ls output

# Running EBfilter in Cache mode on my data

In [None]:
HOME = os.environ['HOME']
args['pon_list'] = f'{HOME}/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_pon.list'
args['output_path'] = 'output/test_rel_eb.csv'
args['region'] = ''
log_file = f"{os.path.splitext(args['output_path'])[0]}.log"
state['log'] = log_file                   

### apply a column apply on several columns

In [6]:
import re
import pandas as pd
sign_re = re.compile(r'\^.|\$')
def clean_reads(run_column):

    return run_column.str.replace(sign_re, '')


pileup_small[['read0', 'read1']] = pileup_small[['read0', 'read1']].apply(lambda column: column.str.replace(sign_re, ''))


NameError: name 'pileup_small' is not defined

In [11]:
pon_df = pd.read_csv('testdata/PoN_list.txt', header=None)
def test(row):
    print(f"{row} hello")
    return f"{row} hello"
pon_df[0].apply(test)

testdata/normalreference1.bam hello
testdata/normalreference2.bam hello
testdata/normalreference3.bam hello
testdata/normalreference4.bam hello
testdata/normalreference5.bam hello
testdata/normalreference6.bam hello
testdata/normalreference7.bam hello
testdata/normalreference8.bam hello
testdata/normalreference9.bam hello
testdata/normalreference10.bam hello


0     testdata/normalreference1.bam hello
1     testdata/normalreference2.bam hello
2     testdata/normalreference3.bam hello
3     testdata/normalreference4.bam hello
4     testdata/normalreference5.bam hello
5     testdata/normalreference6.bam hello
6     testdata/normalreference7.bam hello
7     testdata/normalreference8.bam hello
8     testdata/normalreference9.bam hello
9    testdata/normalreference10.bam hello
Name: 0, dtype: object

TypeError: test() got an unexpected keyword argument 'axis'

In [251]:
pon_df

0    None
1    None
2    None
3    None
4    None
5    None
6    None
7    None
8    None
9    None
dtype: object