# Idea: ABcache
## Getting a precomputed ABcache file containing the beta-binomial distribution parameters derived from the PoN list

# Running EBfilter createCache on testdata

## setting the config

In [None]:
cd ..

In [None]:
from codes import run

# Simulated snakemake config: built in the nested shape the workflow would hand over.
params = {
    'map_quality': 20,
    'base_quality': 15,
    'filter_flags': 'UNMAP,SECONDARY,QCFAIL,DUP',
    'fitting_penalty': .5,
    'caching': True,
}
config = {
    'EB': {'run': True, 'threads': 1, 'params': params, 'log': 'output/logfile'},
    'annovar': {'sep': '\t'},
}

# Unpack the nested config into the individual values used for the flat config.
params = config['EB']['params']
threads, log = config['EB']['threads'], config['EB']['log']
sep = config['annovar']['sep']
_q = str(params['map_quality'])      # mapping quality (20), as a string
_Q = params['base_quality']          # base quality (15)
fit_pen = params['fitting_penalty']
_ff = params['filter_flags']         # 'UNMAP,SECONDARY,QCFAIL,DUP'
# Characters with ASCII codes 33 .. 33+_Q-1 (quality ASCIIs to be filtered out).
filter_quals = ''.join(map(chr, range(33, 33 + _Q)))

# Flat config that is passed to run.main; it replaces the nested one from here on.
config = dict(
    q=_q,
    Q=_Q,
    filter_quals=filter_quals,
    log=log,
    fitting_penalty=fit_pen,
    ff=_ff,
    threads=threads,
    sep=sep,
)


## running makeEBcache on testdata (takes ~2h on 3 cores)
* this step is not necessary for testing the cache mode
* a precomputed EBcache is stored in testdata/precom_testcache

In [None]:
# Arguments for cache generation on the bundled test data.
args = {
    'pon_list': 'testdata/PoN_list.txt',
    'cache_folder': 'testdata/testdata_cache',  # folder for storing the cache data (snakemake config)
    'force_caching': True,                      # generate the cache although no bed_file is provided
    'bed_file': None,                           # alternative: 'testdata/input.bed'
    'generate_cache': True,
}


In [None]:
# Use 4 threads with debug mode off, then start the run with the args defined above.
config.update(threads=4, debug_mode=False)
run.main(args, config)
!ls output

### using CLI:

In [None]:
!./makeEBcache -t 3 -force_caching testdata/PoN_list.txt testdata/testdata_cache

## running EBscore in cache_mode on testdata
* you can use the cache file generated in the last step or the precomputed one
* to use the precomputed file, just change the value of args['use_cache']

In [None]:
# Arguments for EBscore in cache mode on the bundled test data.
args = {
    'mut_file': 'testdata/input.anno',
    'tumor_bam': 'testdata/tumor.bam',
    'pon_list': 'testdata/PoN_list.txt',
    'output_path': 'output/testdata_eb.csv',
    # To use the precomputed cache instead, replace the value below with
    # 'testdata/precom_testcache'.
    'use_cache': 'testdata/testdata_cache',
    'chromosome': 'chr11',
    'generate_cache': False,
}

In [None]:
# Use 3 threads with debug mode off, then start the run with the args defined above.
config.update(threads=3, debug_mode=False)
run.main(args, config)
!ls output

### using CLI:

In [None]:
!EBscore -t 3 -use_cache testdata/testdata_cache testdata/input.anno testdata/tumor.bam testdata/PoN_list.txt output/testdata_EB.csv

## Running EBfilter createCache on my data (takes ~5h on 3 cores)

In [None]:
import os

HOME = os.environ['HOME']  # taken from the environment so the cell runs on different Macs
mytestdata_dir = f'{HOME}/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata'

# Arguments for cache generation on my own data.
# NOTE(review): this cell uses the key 'chrom', other cells use 'chromosome' —
# confirm which key run.main actually reads.
args = {
    'pon_list': f'{mytestdata_dir}/aml_pon.list',
    'cache_folder': f'{mytestdata_dir}/aml_cache',
    'force_caching': False,
    'bed_file': f'{mytestdata_dir}/HAEv7.bed',
    'chrom': '22',
    'generate_cache': True,
}

In [None]:
# Use 3 threads with debug mode off, then start the run with the args defined above.
config.update(threads=3, debug_mode=False)
run.main(args, config)
!ls output

### using CLI:

In [None]:
!makeEBcache -t3 -force_caching ~/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_pon.list ~/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_cache

## Running EBscore in cache_mode on my data

In [None]:
import os

HOME = os.environ['HOME']  # taken from the environment so the cell runs on different Macs
mytestdata_dir = f'{HOME}/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata'

# Arguments for EBscore in cache mode on my own data.
args = {
    'mut_file': f'{mytestdata_dir}/anno/test_rel.csv',
    'tumor_bam': f'{mytestdata_dir}/bam/test_Rel1.bam',
    'pon_list': f'{mytestdata_dir}/aml_pon.list',
    'use_cache': f'{mytestdata_dir}/aml_cache',
    'output_path': f'{mytestdata_dir}/output/test_eb.csv',
    'chromosome': 'chr11',
    'generate_cache': False,
}

In [None]:
# Use 40 threads with debug mode on, then start the run with the args defined above.
config.update(threads=40, debug_mode=True)
run.main(args, config)

### using CLI:

In [None]:
!EBscore -t 3 -use_cache ~/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_cache ~/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/anno/test_rel.csv ~/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/bam/test_Rel1.bam ~/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_pon.list ~/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/output/test_eb.csv


## Running EBfilter createCache in BIHCluster on real data

In [None]:
import os

HOME = os.environ['HOME']  # taken from the environment so the cell runs on different machines
ref_dir = f'{HOME}/work/static/ref'

# Arguments for cache generation on the cluster data.
# NOTE(review): this cell uses the key 'chromosome', the other cache-generation
# cells use 'chrom' — confirm which key run.main actually reads.
args = {
    'pon_list': f'{ref_dir}/PoN/AML_Pon.txt',
    'cache_folder': f'{ref_dir}/PoN/AML_cache',
    'force_caching': False,
    'chromosome': 'Y',
    'bed_file': f'{ref_dir}/bed_files/SureSelect/hg19/SS_HAEv6r2_hg19_Padded_nochr.bed',
    'generate_cache': True,
}


In [None]:
# Use 40 threads with debug mode on, then start the run with the args defined above.
config.update(threads=40, debug_mode=True)
run.main(args, config)

In [5]:
from codes import run

# Simulated snakemake config: built in the nested shape the workflow would hand over.
params = {
    'map_quality': 20,
    'base_quality': 15,
    'filter_flags': 'UNMAP,SECONDARY,QCFAIL,DUP',
    'fitting_penalty': .5,
    'caching': True,
}
config = {
    'EB': {'run': True, 'threads': 1, 'params': params, 'log': 'output/logfile'},
    'annovar': {'sep': '\t'},
}

# Unpack the nested config into the individual values used for the flat config.
params = config['EB']['params']
threads, log = config['EB']['threads'], config['EB']['log']
sep = config['annovar']['sep']
_q = str(params['map_quality'])      # mapping quality (20), as a string
_Q = params['base_quality']          # base quality (15)
fit_pen = params['fitting_penalty']
_ff = params['filter_flags']         # 'UNMAP,SECONDARY,QCFAIL,DUP'
# Characters with ASCII codes 33 .. 33+_Q-1 (quality ASCIIs to be filtered out).
filter_quals = ''.join(map(chr, range(33, 33 + _Q)))

# Flat config that is passed to run.main; it replaces the nested one from here on.
config = dict(
    q=_q,
    Q=_Q,
    filter_quals=filter_quals,
    log=log,
    fitting_penalty=fit_pen,
    ff=_ff,
    threads=threads,
    sep=sep,
)

import os

HOME = os.environ['HOME']  # taken from the environment so the cell runs on different Macs
# HOME = f"{HOME}/mount"   # presumably for a mounted home directory — confirm before enabling
ref_dir = f'{HOME}/work/static/ref'

# Arguments for cache generation on the cluster data.
# NOTE(review): this cell uses the key 'chrom', other cells use 'chromosome' —
# confirm which key run.main actually reads.
args = {
    'pon_list': f'{ref_dir}/PoN/AML_Pon.txt',
    'cache_folder': f'{ref_dir}/PoN/AML_cache',
    'force_caching': False,
    'chrom': '1',
    'bed_file': f'{ref_dir}/bed_files/SureSelect/hg19/SS_HAEv6r2_hg19_Padded_nochr.bed',
    'generate_cache': True,
}

# Use 20 threads with debug mode on, then start the run with the args defined above.
config.update(threads=20, debug_mode=True)
run.main(args, config)

Validating PoN list /Users/martinscience/mount/work/static/ref/PoN/AML_PonHome.txt..
Cache file /Users/martinscience/mount/work/static/ref/PoN/AML_cache/Y.cache found. Does not need to be generated.
Everything is there. No need for computation.


### using CLI:

In [None]:
!makeEBcache -t3 -bed_file ~/work/static/ref/bed_files/SureSelect/hg19/SS_HAEv6r2_hg19_Padded_nochr.bed ~/work/static/ref/PoN/AML_Pon.txt ~/work/static/ref/PoN/AML_cache


## Running EBscore in cache_mode in BIHCluster on real data

In [2]:
cd ..

/Users/martinscience/mount/work/utils/EBFilter


In [3]:
from codes import run

# Simulated snakemake config: built in the nested shape the workflow would hand over.
params = {
    'map_quality': 20,
    'base_quality': 15,
    'filter_flags': 'UNMAP,SECONDARY,QCFAIL,DUP',
    'fitting_penalty': .5,
    'caching': True,
}
config = {
    'EB': {'run': True, 'threads': 1, 'params': params, 'log': 'output/logfile'},
    'annovar': {'sep': ','},
}

# Unpack the nested config into the individual values used for the flat config.
params = config['EB']['params']
threads, log = config['EB']['threads'], config['EB']['log']
sep = config['annovar']['sep']
_q = str(params['map_quality'])      # mapping quality (20), as a string
_Q = params['base_quality']          # base quality (15)
fit_pen = params['fitting_penalty']
_ff = params['filter_flags']         # 'UNMAP,SECONDARY,QCFAIL,DUP'
# Characters with ASCII codes 33 .. 33+_Q-1 (quality ASCIIs to be filtered out).
filter_quals = ''.join(map(chr, range(33, 33 + _Q)))

# Flat config that is passed to run.main; it replaces the nested one from here on.
config = dict(
    q=_q,
    Q=_Q,
    filter_quals=filter_quals,
    log=log,
    fitting_penalty=fit_pen,
    ff=_ff,
    threads=threads,
    sep=sep,
)

import os

HOME = os.environ['HOME']  # taken from the environment so the cell runs on different Macs
# HOME = f"{HOME}/mount"   # presumably for a mounted home directory — confirm before enabling

# Arguments for EBscore in cache mode on the cluster data.
# NOTE(review): config['sep'] is ',' while args['sep'] is 'tab' — confirm which
# of the two run.main uses for reading the mutation file.
args = {
    'mut_file': f'{HOME}/work/utils/EBFilter/mut_file3.csv',
    'sep': 'tab',
    'output_path': f'{HOME}/work/utils/EBFilter/eb_file3.csv',
    'tumor_bam': f'{HOME}/scratch/projects/somVar/recalib/test_Rel1.bam',
    'pon_list': f'{HOME}/work/static/ref/PoN/AML_Pon.txt',
    'use_cache': f'{HOME}/work/static/ref/PoN/AML_cache',
    'generate_cache': False,
}

# Use 32 threads with debug mode on, then start the run with the args defined above.
config.update(threads=32, debug_mode=True)
run.main(args, config)

Validating PoN list /Users/martinscience/work/static/ref/PoN/AML_Pon.txt..


No PanelOfNormals list file: /Users/martinscience/work/static/ref/PoN/AML_Pon.txt

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [4]:
# Repeat the run with the same settings: 32 threads, debug mode on.
config.update(threads=32, debug_mode=True)
run.main(args, config)

Validating PoN list /Users/martinscience/work/static/ref/PoN/AML_Pon.txt..


No PanelOfNormals list file: /Users/martinscience/work/static/ref/PoN/AML_Pon.txt

SystemExit: 1

## get the files of chrom 3 for testing

In [9]:
import os

# pandas is used throughout this cell and the following ones (read_csv, MultiIndex),
# but no cell in the notebook imports it — import it here so the cell runs on a fresh kernel.
import pandas as pd

HOME = os.environ['HOME']  # taken from the environment so the cell runs on different Macs
HOME = f"{HOME}/mount"     # all paths below are resolved under <home>/mount

def get_AB_df(chrom, config, cache_dir=None):
    '''
    load the AB-cache file of one chromosome and reshape it to one row per (Chr, Start, Alt)

    chrom:     chromosome name as used in the cache file name (<chrom>.cache)
    config:    not used by this function; kept so that existing calls keep working
    cache_dir: folder holding the cache files; when omitted, the folder
               {HOME}/work/static/ref/PoN/AML_cache is used (HOME is the notebook-level variable)

    returns a DataFrame with the columns Chr, Start, Alt, a+, b+, a-, b-

    assumes the 16 data columns of the cache file are ordered
    var (A, C, T, G) x strand (+, -) x param (a, b) — TODO confirm against the code writing the cache
    '''

    if cache_dir is None:
        cache_dir = f'{HOME}/work/static/ref/PoN/AML_cache'
    cache_file = f'{cache_dir}/{chrom}.cache'
    # the merger columns
    cols = ['Chr', 'Start', 'Alt']
    # load the gzipped csv and use Chr and Start as index so that only the data columns remain
    AB_df = pd.read_csv(cache_file, sep=',', compression='gzip', dtype={'Chr': str, 'Start': int}).set_index(cols[:2])
    # label the data columns with a (var, strand, param) multi-index
    AB_df.columns = pd.MultiIndex.from_product([['A', 'C', 'T', 'G'], ['+', '-'], ['a', 'b']], names=['var', 'strand', 'param'])
    # move the var level from the columns into the rows: one row per (Chr, Start, var)
    AB_df = AB_df.stack('var')
    # name each remaining column from its own (strand, param) label instead of by position,
    # so the names stay correct regardless of the column order stack() returns
    AB_df.columns = [f'{param}{strand}' for strand, param in AB_df.columns]
    # turn the index levels back into columns and rename var to the merge column name
    AB_df = AB_df.reset_index().rename(columns={'var': cols[2]})
    # fixed output column order
    return AB_df[cols + ['a+', 'b+', 'a-', 'b-']]

# AB-cache parameters for chromosome 3
AB_3 = get_AB_df('3', config)
# tab-separated annotation file; Chr is read as string, Start as integer
anno_3 = pd.read_csv('eb_file3.csv', sep='\t', dtype={'Chr': str, 'Start': int})
# keep only rows where neither Ref nor Alt is "-"
snp_3 = anno_3.query('not (Ref == "-" or Alt == "-")')
# bed file without header; the first 10 lines are skipped, column 0 is read as string, column 1 as integer
bed = pd.read_csv(f'{HOME}/work/static/ref/bed_files/SureSelect/hg19/SS_HAEv6r2_hg19_Padded_nochr.bed', skiprows=10, sep='\t', dtype={0: str, 1: int}, header=None)
# column 0 holds strings (see dtype above), so it must be compared with '3';
# comparing with the integer 3 never matches and leaves bed_3 empty
bed_3 = bed[bed[0] == '3'][[0,1]]

In [20]:
# Right join: every row of snp_3 is kept, AB parameters are attached by position.
# NOTE(review): the join key is (Chr, Start) only; 'Alt' exists in AB_3 and is used
# in snp_3's query, so it is presumably duplicated per position — confirm this is intended.
merge_snpAB = AB_3.merge(
    snp_3,
    how='right',
    on=['Chr', 'Start'],
)
# Outer join of the AB positions with the bed columns 0 and 1.
merge_ABbed = AB_3.merge(
    bed_3,
    how='outer',
    left_on=['Chr', 'Start'],
    right_on=[0,1],
)

In [16]:
# Print, for chromosomes 1-22, X and Y, the number of bed rows and that number scaled by 11/23.
# NOTE(review): the meaning of the factor 11/23 is not explained in this notebook.
for chrom in [*map(str, range(1, 23)), 'X', 'Y']:
    length = len(bed.loc[bed[0] == chrom])
    print(chrom, ' : ', length, ' : ',round(length * 11 / 23))

1  :  24403  :  11671
2  :  18370  :  8786
3  :  13880  :  6638
4  :  9623  :  4602
5  :  10978  :  5250
6  :  12023  :  5750
7  :  12266  :  5866
8  :  8408  :  4021
9  :  9918  :  4743
10  :  10250  :  4902
11  :  13371  :  6395
12  :  13551  :  6481
13  :  4307  :  2060
14  :  7920  :  3788
15  :  9221  :  4410
16  :  10351  :  4950
17  :  13488  :  6451
18  :  3876  :  1854
19  :  13820  :  6610
20  :  5795  :  2772
21  :  2604  :  1245
22  :  5479  :  2620
X  :  8404  :  4019
Y  :  876  :  419


In [33]:
# Index labels of the rows whose EB_score differs from itself (true for NaN) and whose Ref is "A".
na_index = anno_3.loc[(anno_3['EB_score'] != anno_3['EB_score']) & (anno_3['Ref'] == "A")].index

In [35]:
# na_index holds index labels (it was taken from .index), so select by label with .loc;
# .iloc would select by position and only agrees while the index is the default 0..n-1 range
anno_3.loc[na_index]

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,CADD_phred,GERP++_RS,phyloP46way_placental,phyloP100way_vertebrate,SiPhy_29way_logOdds,dpsi_max_tissue,dpsi_zscore,ICGC_Id,ICGC_Occurrence,EB_score
1,3,403345,403345,A,G,intronic,CHL1,,,,...,,,,,,-0.1596,-0.603,,,
17,3,4404087,4404087,A,G,intronic,SUMF1,,,,...,,,,,,,,MU579704,SKCA-BR|1|66|0.01515,
58,3,9896351,9896351,A,G,intergenic,RPUSD3;CIDEC,dist=10608;dist=12043,,,...,,,,,,,,,,
83,3,10400266,10400266,A,G,intronic,ATP2B2,,,,...,,,,,,0.1382,0.589,,,
91,3,11926423,11926423,A,G,intergenic,TAMM41;SYN2,dist=38030;dist=119411,,,...,,,,,,,,,,
92,3,11926467,11926467,A,G,intergenic,TAMM41;SYN2,dist=38074;dist=119367,,,...,,,,,,,,,,
110,3,13438830,13438830,A,G,intronic,NUP210,,,,...,,,,,,-0.0808,-0.387,,,
167,3,18308436,18308436,A,G,intronic,LOC339862,,,,...,,,,,,,,,,
196,3,32031962,32031962,A,G,exonic,ZNF860,,nonsynonymous SNV,ZNF860:NM_001137674:exon2:c.A1391G:p.H464R,...,0.169,-0.599,-3.961,-1.083,4.085,,,MU776286,"PRAD-US|1|256|0.00391,LGG-US|1|283|0.00353,BLC...",
266,3,39149218,39149218,A,T,UTR5,GORASP1;TTC21A,NM_031899:c.-186T>A;NM_001278789:c.-186T>A;NM_...,,,...,,,,,,,,,,


In [37]:
# Scratch check: join a list of strings with single spaces.
test = ['sadf', 'ds']
str.join(' ', test)

'sadf ds'