# Idea: ABcache
## Getting a precomputed ABcache file containing the beta-binomial distribution parameters derived from the PoN list

# Running EBfilter createCache on testdata

## setting the config

In [3]:
cd ..

/Users/martinscience/Sites/Python/EBFilter


In [4]:
from codes import run

# snakemake config
# to simulate snakemake behavior, build the nested dict a snakemake rule would
# see; it gets its own name so that `config` only ever means the flat run config.
snakemake_config = {
    'EB': {
        'run': True,
        'threads': 1,
        'log': 'output/logfile',
        'params': {
            'map_quality': 20,
            'base_quality': 15,
            'filter_flags': 'UNMAP,SECONDARY,QCFAIL,DUP',
            'fitting_penalty': .5,
            'caching': True,
        },
    },
    'annovar': {'sep': '\t'},
}

# pull the individual settings out of the nested config
params = snakemake_config['EB']['params']
threads = snakemake_config['EB']['threads']
log = snakemake_config['EB']['log']
sep = snakemake_config['annovar']['sep']

_q = str(params['map_quality'])  # mapping quality=20 (kept as a string)
_Q = params['base_quality']      # base quality=15
fit_pen = params['fitting_penalty']
_ff = params['filter_flags']     # 'UNMAP,SECONDARY,QCFAIL,DUP'

# qual asciis for filtering out: the characters with codes 33 .. 33+_Q-1
filter_quals = ''.join(chr(code) for code in range(33, 33 + _Q))

# flat config that is handed to run.main
config = {
    'q': _q,
    'Q': _Q,
    'filter_quals': filter_quals,
    'log': log,
    'fitting_penalty': fit_pen,
    'ff': _ff,
    'threads': threads,
    'sep': sep,
}


## running makeEBcache on testdata

In [None]:
# Arguments for the cache-generation run on the bundled test data.
args = dict(
    pon_list='testdata/PoN_list.txt',
    cache_folder='output/testdata_cache',  # provide a folder for storing the data (snakemake config)
    force_caching=True,                    # force cache generation although no bed_file is provided
    bed_file='testdata/input.bed',
    generate_cache=True,
)

In [None]:
config['threads'] = 3
config['debug_mode'] = False
run.main(args, config)
!ls output

### importing anno_df

In [8]:
import pandas as pd
def to_int(Chr_name):
    '''
    converts a value to int where possible, otherwise returns it unchanged

    Used as a read_csv converter so that purely numeric fields become ints
    while names such as "chrX" are kept as they are.

    Chr_name: the raw value (a string when called as a read_csv converter)
    returns:  int(Chr_name) if the conversion succeeds, else Chr_name itself
    '''
    try:
        return int(Chr_name)
    except (TypeError, ValueError):
        # ValueError: non-numeric string; TypeError: non-convertible type such as None
        return Chr_name
# Read the annotation file; the first three columns are passed through to_int.
anno_df = pd.read_csv(
    'testdata/input.anno',
    sep='\t',
    converters=dict.fromkeys(range(3), to_int),
)
# Name the five leading columns; any further columns become other1, other2, ...
n_extra = len(anno_df.columns) - 5
rest_columns = [f'other{k}' for k in range(1, n_extra + 1)]
anno_df.columns = ['Chr', 'Start', 'End', 'Ref', 'Alt'] + rest_columns
# Keep one chromosome, then display its rows whose Ref is not "C".
test = "chr11"
anno_df_chr = anno_df.query(f'Chr == "{test}"')
anno_df_chr.query('not (Ref == "C")')


Unnamed: 0,Chr,Start,End,Ref,Alt
3,chr11,1081746,1081746,G,C
4,chr11,1277322,1277322,G,T
6,chr11,3680752,3680752,G,A
8,chr11,5221726,5221726,A,G


### importing and inspecting cache file

In [39]:
# Read the chr11 cache file (presumably written by the makeEBcache run above)
# and index it by position.
AB_chr11 = pd.read_csv('output/testdata_cache/chr11.cache', sep=',').set_index(['Chr', 'Start'])
# 3-level column labels: 4 variants x 2 strands x 2 parameters = 16 labels.
# NOTE(review): assumes the cache file holds exactly 16 value columns in this
# order -- confirm against the code that writes the cache.
AB_columns = pd.MultiIndex.from_product([['A','C','T','G'],['+', '-'],['a','b']], names=['var', 'strand', 'param'])
AB_chr11.columns = AB_columns
# Move the 'var' column level into the rows, then turn the index back into columns.
AB_chr11 = AB_chr11.stack('var').reset_index()
# Drop the first column level, then overwrite the column names with flat ones.
AB_chr11.columns = AB_chr11.columns.droplevel(0)
AB_chr11.columns = ['Chr', 'Start', 'Alt'] + ['a+', 'b+', 'a-', 'b-']
# Display the first rows.
AB_chr11[:13]


Unnamed: 0,Chr,Start,Alt,a+,b+,a-,b-
0,chr11,192951,A,0.1,1.0,0.1,1.0
1,chr11,192951,C,0.1,1.0,0.1,1.0
2,chr11,192951,G,0.1,1.0,0.1,1.0
3,chr11,192951,T,0.1,1.0,0.1,1.0
4,chr11,192952,A,0.1,1.0,0.1,1.0
5,chr11,192952,C,0.1,1.0,0.1,1.0
6,chr11,192952,G,0.1,1.0,0.1,1.0
7,chr11,192952,T,0.1,1.0,0.1,1.0
8,chr11,192953,A,0.1,1.0,0.1,1.0
9,chr11,192953,C,0.1,1.0,0.1,1.0


### merge the anno_df and AB_df

In [45]:
# Join the chr11 annotation rows with the cache rows on position and Alt
# (merge defaults to an inner join), then display the first merged row.
merged = pd.merge(anno_df_chr, AB_chr11, on=['Chr', 'Start', 'Alt'])
row = merged.iloc[0]
row

Chr        chr11
Start     562012
End       562012
Ref            C
Alt            T
a+           0.1
b+       7.21454
a-           0.1
b-       7.21454
Name: 0, dtype: object

In [10]:
# First attempt at looking up the entry that belongs to the row's variant.
# NOTE(review): the recorded output of this cell is
# "IndexError: invalid index to scalar variable." -- per that message, row.name
# is a scalar here, so row.name[4] cannot be indexed. The execution count (10)
# is lower than that of the merge cell above (45), so this cell was presumably
# run against an earlier version of `merged` -- confirm before reusing.
row = merged.iloc[0]
var = row.name[4]
row = row[var]
row

IndexError: invalid index to scalar variable.

* Variant 2:

In [11]:
# Second attempt: merge on position only, move the annotation columns into the
# index, relabel the remaining columns with the 3-level AB_columns and look up
# the entry named by the row's Alt value.
# NOTE(review): the recorded output of this cell is "ValueError: Length
# mismatch: Expected axis has 4 elements, new values have 16 elements", i.e.
# the assignment of the 16 AB_columns labels failed because only 4 columns
# were left after set_index. The lines after that assignment never ran.
merged = anno_df_chr.merge(AB_chr11, on=['Chr', 'Start']).set_index(['Chr', 'Start', 'End', 'Ref', 'Alt'])
merged.columns = AB_columns
merged = merged.reset_index()
row = merged.iloc[0]
var = row[('Alt','','')]
row[var]


ValueError: Length mismatch: Expected axis has 4 elements, new values have 16 elements

In [None]:
def get_AB(row):
    '''
    returns the entry of row whose label equals the value stored under 'Alt'
    '''
    alt_label = row['Alt']
    return row.loc[alt_label]
# Apply get_AB to every row of merged (axis=1 hands each row to the function).
# NOTE(review): the original cell could not be parsed -- it held a stray `right`
# token and an unterminated `merged[merged.apply(...)` expression. Whether the
# result was meant to be used for indexing `merged` cannot be recovered from
# the notebook; only the row-wise apply is kept. Confirm the intent.
merged.apply(get_AB, axis=1)
       

### running EBscore in cache_mode on testdata

In [None]:
# Arguments for scoring the test mutations against the previously built cache.
args = {
    'mut_file': 'testdata/input.anno',
    'tumor_bam': 'testdata/tumor.bam',
    'pon_list': 'testdata/PoN_list.txt',
    'output_path': 'output/testdata_eb.csv',
    'cache_folder': 'output/testdata_cache',
    'generate_cache': False,
}

In [None]:
config['threads'] = 3
config['debug_mode'] = False
run.main(args, config)
!ls output

# Running EBfilter createCache on my data

### Setting my testdata as arguments

In [None]:
import os
# set HOME to run on different Macs
# expanduser('~') returns $HOME when it is set and falls back to the user
# database otherwise, where os.environ['HOME'] would raise KeyError.
HOME = os.path.expanduser('~')
# Folder holding my own test data; built once instead of repeating the path.
MYTESTDATA = f'{HOME}/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata'
args = {
    'pon_list': f'{MYTESTDATA}/aml_pon.list',
    'cache_folder': f'{MYTESTDATA}/aml_pon.ABcache',
    'force_caching': True,
    'bed_file': None,
    'generate_cache': True,
}

In [None]:
config['threads'] = 3
config['debug_mode'] = True
run.main(args, config)
!ls output

# Running EBfilter in Cache mode on my data

In [None]:
# expanduser('~') returns $HOME when it is set and does not raise when it is not.
HOME = os.path.expanduser('~')
# Update the args dict left over from the previous cell.
# NOTE(review): args still carries generate_cache=True from that cell, while
# the cache-mode cell for the testdata sets it to False -- confirm.
args['pon_list'] = f'{HOME}/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_pon.list'
args['output_path'] = 'output/test_rel_eb.csv'
args['region'] = ''
# Log file: the output path with its extension replaced by .log
log_file = f"{os.path.splitext(args['output_path'])[0]}.log"
# The original wrote to `state`, which is never defined in this notebook
# (NameError). `config` is the dict that carries the 'log' key.
# NOTE(review): confirm that config is the intended target.
config['log'] = log_file

### Apply a function column-wise to several columns

In [None]:
import re
import pandas as pd
# Matches a caret plus the one character after it, or a dollar sign
# (presumably the read-start and read-end markers of a pileup -- confirm).
sign_re = re.compile(r'\^.|\$')


def clean_reads(run_column):
    '''
    removes every match of sign_re from the strings of a column

    run_column: pandas Series of strings
    returns:    new Series with all matches deleted
    '''
    # regex=True must be explicit: pandas >= 2.0 defaults to regex=False and
    # then refuses a compiled pattern.
    return run_column.str.replace(sign_re, '', regex=True)


# Strip the sign_re matches from both read columns, reusing clean_reads
# instead of repeating its body in a lambda.
# NOTE(review): pileup_small is not defined anywhere in this notebook; it has
# to exist before this cell can run.
read_columns = ['read0', 'read1']
pileup_small[read_columns] = pileup_small[read_columns].apply(clean_reads)


In [37]:
# Scratch cell: prints every index of a one-element range (just 0).
indices = range(1)
for i in indices:
    print(i)

0
