# Idea: ABcache
## Getting a precomputed ABcache file containing the beta-binomial distribution parameters derived from the PoN list

# Running EBfilter createCache on testdata

## setting the config

In [1]:
cd ..

/Users/martinscience/Sites/Python/EBFilter


In [2]:
from codes import run

# snakemake config
config = {'EB':{'run': True}}
params = {}
params['map_quality'] = 20
params['base_quality'] = 15
params['filter_flags'] = 'UNMAP,SECONDARY,QCFAIL,DUP'
params['fitting_penalty'] = .5
params['caching'] = True
# to simulate snakemake behavior
config['EB']['threads'] = 1
config['EB']['params'] = params
config['EB']
config['annovar'] = {'sep': '\t'}
config['EB']['log'] = 'output/logfile'


params = config['EB']['params']
threads = config['EB']['threads']
log = config['EB']['log']
sep = config['annovar']['sep']
_q = str(params['map_quality'])  # mapping quality=20
_Q = params['base_quality']      # base quality=15
fit_pen = params['fitting_penalty']
filter_quals = ''
for qual in range( 33, 33 + _Q ): 
    filter_quals += chr( qual )  # qual asciis for filtering out
_ff = params['filter_flags']     # 'UNMAP,SECONDARY,QCFAIL,DUP'
config = {'q':_q, 'Q':_Q, 'filter_quals': filter_quals, 'log':log, 'fitting_penalty': fit_pen, 'ff':_ff, 'threads':threads, 'sep': sep}


## running makeEBcache on testdata

In [3]:
# Arguments for the cache-generation run on the bundled test data.
args = dict(
    pon_list='testdata/PoN_list.txt',
    cache_folder='testdata/testdata_cache',  # provide a folder for storing the data (snakemake config)
    force_caching=False,                     # force cache generation although no bed_file is provided
    bed_file='testdata/input.bed',
    generate_cache=True,
)

In [4]:
config['threads'] = 3
config['debug_mode'] = False
run.main(args, config)
!ls output

Generating Cache...
Generating  pileup for chromosome chr11..
Splitting bam files for chromosome chr11..
Splitting the 10 lines of chr11.pileup into 3 chunks for multithreaded computation..
Process 32272: Computing ABs for Chr chr11 	(lines 0	 to 	10)
Process 32272: Computing ABs for chromosome chr11 	(lines 0	 to 	10) finished.
Writing ABcache for Chr chr11 to file testdata/testdata_cache/chr11.cache.
Writing final ABcache for covered regions to file testdata/testdata_cache/all.cache.
Generation of AB file finished.
testdata_clean.csv                   testdata_eb_32209.control.merged.csv
testdata_eb.csv                      testdata_eb_32209.control.pileup
testdata_eb_32185.control.merged.csv testdata_eb_32209.region_list.bed
testdata_eb_32185.control.pileup     testdata_eb_32209.target.pileup
testdata_eb_32185.region_list.bed    testdata_eb_32210.control.merged.csv
testdata_eb_32185.target.pileup      testdata_eb_32210.control.pileup
testdata_eb_32186.control.merged.csv testdata_eb_

### running EBscore in cache_mode on testdata

In [11]:
# Arguments for the cache-mode run on the bundled test data:
# 'use_cache' points at a cache folder and 'generate_cache' is switched off.
args = {
    'mut_file': 'testdata/input.anno',
    'tumor_bam': 'testdata/tumor.bam',
    'pon_list': 'testdata/PoN_list.txt',
    'output_path': 'output/testdata_eb.csv',
    'use_cache': 'testdata/testdata_cache',
    'chromosome': 'chr11',
    'generate_cache': False,
}

In [12]:
config['threads'] = 3
config['debug_mode'] = False
run.main(args, config)
!ls output

Running EBscore in Cache mode...
Loading annotation file testdata/input.anno into dataframe
Adding header: Chr	Start	End	Ref	Alt	other1...
Loading cache testdata/testdata_cache/chr11.cache..
testdata_eb.csv


# Running EBfilter createCache on my data

### Setting my testdata as arguments

In [13]:
import os

HOME = os.environ['HOME']  # set HOME to run on different Macs
# Folder below HOME that holds both the PoN list and the cache folder.
mytestdata = f'{HOME}/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata'

# Arguments for a cache-generation run without a bed file;
# 'force_caching' is switched on because 'bed_file' is None.
args = {
    'pon_list': f'{mytestdata}/aml_pon.list',
    'cache_folder': f'{mytestdata}/aml_cache',
    'force_caching': True,
    'bed_file': None,
    'generate_cache': True,
}

In [None]:
config['threads'] = 3
config['debug_mode'] = True
run.main(args, config)
!ls output

Generating Cache...
Generating  pileup for chromosome 8..
Generating  pileup for chromosome 3..
Generating  pileup for chromosome 7..
Splitting bam files for chromosome 7..
Splitting bam files for chromosome 3..
Splitting bam files for chromosome 8..


# Running EBfilter in Cache mode on my data

In [None]:
# Update the existing `args` dict in place for a run on my own data.
# Keys set by earlier cells and not overwritten here stay in `args`.
HOME = os.environ['HOME']
args['pon_list'] = f'{HOME}/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_pon.list'
args['output_path'] = 'output/test_rel_eb.csv'
args['region'] = ''

# Log file sits next to the output file: same path with the extension replaced by .log
log_file = f"{os.path.splitext(args['output_path'])[0]}.log"
# The original wrote to `state['log']`, but no `state` object is defined in this
# notebook, so the cell failed with a NameError. The log path is stored in
# `config`, which already carries a 'log' key and is the dict passed to run.main().
# NOTE(review): confirm that `config` is the intended target.
config['log'] = log_file

### Applying a column-wise operation to several columns

In [None]:
import pandas as pd

cols = ['Chr', 'Start', 'Alt']

# Load the chr11 cache file and index it by the first two key columns (Chr, Start).
AB_df = pd.read_csv('testdata/testdata_cache/chr11.cache', sep=',').set_index(cols[:2])

# Label the remaining columns with a 3-level index (var, strand, param).
# This yields 4 * 2 * 2 = 16 labels; it assumes the cache file stores its
# columns in exactly this order -- TODO confirm against the cache writer.
AB_columns = pd.MultiIndex.from_product([['A','C','T','G'],['+', '-'],['a','b']], names=['var', 'strand', 'param'])
AB_df.columns = AB_columns

# Move the 'var' level from the columns into the row index:
# one row per (Chr, Start, var), columns left as (strand, param) pairs.
AB_df = AB_df.stack('var')

# Flatten the (strand, param) pairs into unique names such as 'a+' or 'b-'.
# Dropping the strand level instead would leave the duplicate labels
# a, b, a, b and make the two strands indistinguishable.
AB_df.columns = [f'{param}{strand}' for strand, param in AB_df.columns]
AB_df = AB_df.reset_index()

AB_df


In [None]:
# Key columns followed by the a/b parameter column names for the '+' and '-' suffixes.
[*cols, 'a+', 'b+', 'a-', 'b-']