# Idea: ABcache
## getting a precomputed ABcache file containing the parameters for beta-binomial distribution from the PoN-list

## Setup arguments and state variable

### imports

In [None]:
import os
from code import run
import pandas as pd

### snakemake config

In [None]:
config = {'EB':{'run': True}}
params = {}
params['map_quality'] = 20
params['base_quality'] = 15
params['filter_flags'] = 'UNMAP,SECONDARY,QCFAIL,DUP'
params['fitting_penalty'] = .5
params['caching'] = True
# to simulate snakemake behavior
config['EB']['threads'] = 1
config['EB']['params'] = params
config['EB']
config['annovar'] = {'sep': '\t'}

### load the config and global state

In [None]:
args = {}
params = config['EB']['params']
threads = config['EB']['threads']
sep = config['annovar']['sep']
_q = str(params['map_quality'])  # mapping quality=20
_Q = params['base_quality']      # base quality=15
fit_pen = params['fitting_penalty']
filter_quals = ''
for qual in range( 33, 33 + _Q ): 
    filter_quals += chr( qual )  # qual asciis for filtering out
_ff = params['filter_flags']     # 'UNMAP,SECONDARY,QCFAIL,DUP'
state = {'q':_q, 'Q':_Q, 'filter_quals': filter_quals, 'fitting_penalty': fit_pen, 'ff':_ff, 'threads':threads, 'sep': sep}

### Setting my testdata as arguments

# Running EBfilter createCache on my data

In [None]:
HOME = os.environ['HOME']
args['pon_list'] = f'{HOME}/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_pon.list'
args['cache_path'] = f'{HOME}/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_pon.ABcache'
args['generate_cache'] = True                 

In [None]:
state['threads'] = 1
state['debug_mode'] = True
run.main(args, state)
!ls output

## Running EBfilter in Cache mode on my data

In [None]:
HOME = os.environ['HOME']
args['pon_list'] = f'{HOME}/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_pon.list'
args['output_path'] = 'output/test_rel_eb.csv'
args['region'] = ''
log_file = f"{os.path.splitext(args['output_path'])[0]}.log"
state['log'] = log_file                   

In [None]:
from functools import partial
import os
import subprocess
import pandas as pd
from io import StringIO
from subprocess import Popen, DEVNULL, PIPE
state['log'] = 'output/pileup.log'
pon_df = pd.read_csv('testdata/list_normal_sample.txt', header=None)
pon_sub_folder = 'output/pon'
if not os.path.isdir(pon_sub_folder):
    os.mkdir(pon_sub_folder)
chromosome = 'chr11'

### get the chromosome list from one of the pon bam files

In [None]:
def bam_to_chr_list(bam_file):
    bam_stats_cmd = ['samtools', 'idxstats', bam_file]
    bam_stats = Popen(bam_stats_cmd, stdout=PIPE, stderr=DEVNULL)
    bam_stats_string = StringIO(bam_stats.communicate()[0].decode('utf-8'))
    bam_stats_df = pd.read_csv(bam_stats_string, sep='\t', header=None)
    return list(bam_stats_df[0].T)
bam_list = bam_to_chr_list(pon_df.iloc[0,0])
bam_list[1]

### split bams for multithreading

In [None]:
def split_bam(chromosome, pon_row):
    bam_file = pon_row[0]
    bam_out = os.path.join(pon_sub_folder, f"{os.path.splitext(os.path.basename(bam_file))[0]}_{str(chromosome)}.bam")
    split_bam_cmd = ["samtools", "view", "-b", "-o", bam_out, bam_file, str(chromosome)]
    bam_index_cmd = ["samtools", "index", bam_out]
    subprocess.check_call(split_bam_cmd)
    subprocess.check_call(bam_index_cmd)
    return bam_out
pon_sub_df = pd.DataFrame()
pon_sub_df['bam'] = pon_df.apply(partial(split_bam, chromosome), axis=1)
pon_file_sub = os.path.join(pon_sub_folder, f"pon_list_{chromosome}.txt")
pon_sub_df.to_csv(pon_file_sub, header=None, index=False)
pon_count = 10

### get the pileup from the pon_list

In [46]:
def get_pileup_df(pon_file_sub):
    with open(state['log'], 'w+') as log:
        mpileup_cmd = ["samtools", "mpileup", "-B", "-d", "10000000", "-q",str(state['q']), "-Q",str(state['Q']), "--ff",state['ff']]
        mpileup_cmd += ["-b", pon_file_sub]
        pileup_stream = Popen(mpileup_cmd, stdout=PIPE, stderr=log)
        pileup_string = StringIO(pileup_stream.communicate()[0].decode('utf-8'))
        pileup_stream = Popen(mpileup_cmd, stdout=PIPE, stderr=log)
        pileup_string = StringIO(pileup_stream.communicate()[0].decode('utf-8'))
        names = ['Chr', 'Start', 'Ref']
        for i in range(pon_count):
            names += [f"depth{i}", f"read{i}", f"Q{i}"]
    return pd.read_csv(pileup_string, sep='\t', header=None, names=names)
pileup_df = get_pileup_df(pon_file_sub)

In [48]:
pileup_df[30:60]

Unnamed: 0,Chr,Start,Ref,depth0,read0,Q0,depth1,read1,Q1,depth2,...,Q6,depth7,read7,Q7,depth8,read8,Q8,depth9,read9,Q9
30,chr11,192981,N,14,AAAAAAAAAAAAAA,IIIIIIIIIIIIII,10,AAAAAAAAA^]A,IIIIIIIIII,9,...,IIIIIIIIIIII,15,AAAAAAAAAAAAAAA,IIIIIIIIIIIIIII,7,AAAAAA^]A,IIIIIII,13,AAAAAAAAAAAAA,IIIIIIIIIIIII
31,chr11,192982,N,14,TTTTTTTTTTTTTT,IIIIIIIIIIIIII,11,TTTTTTTTTT^]T,IIIIIIIIIII,10,...,IIIIIIIIIIIII,16,TTTTTTTTTTTTTTT^]T,IIIIIIIIIIIIIIII,8,TTTTTTT^]T,IIIIIIII,14,TTTTTTTTTTTTT^]T,IIIIIIIIIIIIII
32,chr11,192983,N,14,CCCCCCCCCCCCCC,IIIIIIIIIIIIII,11,CCCCCCCCCCC,IIIIIIIIIII,10,...,IIIIIIIIIIIII,16,CCCCCCCCCCCCCCCC,IIIIIIIIIIIIIIII,8,CCCCCCCC,IIIIIIII,15,CCCCCCCCCCCCCC^]C,IIIIIIIIIIIIIII
33,chr11,192984,N,14,TTTTTTTTTTTTTT,IIIIIIIIIIIIII,11,TTTTTTTTTTT,IIIIIIIIIII,10,...,IIIIIIIIIIIIII,17,TTTTTTTTTTTTTTTT^]T,IIIIIIIIIIIIIIIII,8,TTTTTTTT,IIIIIIII,15,TTTTTTTTTTTTTTT,IIIIIIIIIIIIIII
34,chr11,192985,N,14,AAAAAAAAAAAAAA,IIIIIIIIIIIIII,11,AAAAAAAAAAA,IIIIIIIIIII,10,...,IIIIIIIIIIIIIII,17,AAAAAAAAAAAAAAAAA,IIIIIIIIIIIIIIIII,8,AAAAAAAA,IIIIIIII,15,AAAAAAAAAAAAAAA,IIIIIIIIIIIIIII
35,chr11,192986,N,14,AAAAAAAAAAAAAA,IIIIIIIIIIIIII,11,AAAAAAAAAAA,IIIIIIIIIII,10,...,IIIIIIIIIIIIIIII,17,AAAAAAAAAAAAAAAAA,IIIIIIIIIIIIIIIII,9,AAAAAAAA^]A,IIIIIIIII,16,AAAAAAAAAAAAAAA^]A,IIIIIIIIIIIIIIII
36,chr11,192987,N,14,GGGGGGGGGGGGGG,IIIIIIIIIIIIII,11,GGGGGGGGGGG,IIIIIIIIIII,10,...,IIIIIIIIIIIIIIIII,17,GGGGGGGGGGGGGGGGG,IIIIIIIIIIIIIIIII,10,GGGGGGGGG^]G,IIIIIIIIII,17,GGGGGGGGGGGGGGGG^]G,IIIIIIIIIIIIIIIII
37,chr11,192988,N,14,CCCCCCCCCCCCCC,IIIIIIIIIIIIII,11,CCCCCCCCCCC,IIIIIIIIIII,11,...,IIIIIIIIIIIIIIIII,17,CCCCCCCCCCCCCCCCC,IIIIIIIIIIIIIIIII,11,CCCCCCCCCC^]C,IIIIIIIIIII,18,CCCCCCCCCCCCCCCCC^]C,IIIIIIIIIIIIIIIIII
38,chr11,192989,N,14,TTTTTTTTTTTTTT,IIIIIIIIIIIIII,11,TTTTTTTTTTT,IIIIIIIIIII,12,...,IIIIIIIIIIIIIIIII,17,TTTTTTTTTTTTTTTTT,IIIIIIIIIIIIIIIII,12,TTTTTTTTTTT^]T,IIIIIIIIIIII,19,TTTTTTTTTTTTTTTTTT^]T,IIIIIIIIIIIIIIIIIII
39,chr11,192990,N,14,GGGGGGGGGGGGGG,IIIIIIIIIIIIII,11,GGGGGGGGGGG,IIIIIIIIIII,12,...,IIIIIIIIIIIIIIIII,17,GGGGGGGGGGGGGGGGG,IIIIIIIIIIIIIIIII,12,GGGGGGGGGGGG,IIIIIIIIIIII,20,GGGGGGGGGGGGGGGGGGG^]G,IIIIIIIIIIIIIIIIIIII


In [50]:
pileup_df = pileup_df.iloc[:,[0,1]].copy()[:20]

In [51]:
pileup_df

Unnamed: 0,Chr,Start
0,chr11,192951
1,chr11,192952
2,chr11,192953
3,chr11,192954
4,chr11,192955
5,chr11,192956
6,chr11,192957
7,chr11,192958
8,chr11,192959
9,chr11,192960


In [54]:
def getit(row):
    s = pd.Series()
    s[('A','a')] = f"{row['Start']}BC"
    s[('A','b')] = f"{row['Start']}AB"
    return s
AB_df = pd.DataFrame()
AB_df = pileup_df.apply(getit, axis=1)
AB_df[:13]

KeyError: ("None of [Index(['A', 'a'], dtype='object')] are in the [index]", 'occurred at index 0')

In [36]:
AB_df

Unnamed: 0,Chr,Start
0,chr11,192951.0
1,chr11,192952.0
2,chr11,192953.0
3,chr11,192954.0
4,chr11,192955.0
5,chr11,192956.0
6,chr11,192957.0
7,chr11,192958.0
8,chr11,192959.0
9,chr11,192960.0
