# Idea: ABcache
## Generating a precomputed ABcache file that contains the beta-binomial distribution parameters derived from the PoN list

## Setup arguments and state variable

### imports

In [1]:
import os
from code import run
import pandas as pd

IndentationError: expected an indented block (cache.py, line 38)

### snakemake config

In [22]:
# Hand-built stand-in for the config that snakemake would normally provide.
params = {
    'map_quality': 20,
    'base_quality': 15,
    'filter_flags': 'UNMAP,SECONDARY,QCFAIL,DUP',
    'fitting_penalty': .5,
    'caching': True,
}
config = {
    'EB': {
        'run': True,
        # to simulate snakemake behavior
        'threads': 1,
        'params': params,
    },
    'annovar': {'sep': '\t'},
}

### load the config and global state

In [23]:
# Flatten the snakemake-style config into the `state` dict handed to the run.
args = {}
params = config['EB']['params']
threads = config['EB']['threads']
sep = config['annovar']['sep']
fit_pen = params['fitting_penalty']
_q = str(params['map_quality'])  # mapping quality, kept as a string
_Q = params['base_quality']      # base quality, kept as a number
_ff = params['filter_flags']     # comma-separated flag names
# Characters with ASCII codes 33 .. 33+_Q-1 (quality characters to filter out).
filter_quals = ''.join(chr(33 + offset) for offset in range(_Q))
state = {
    'q': _q,
    'Q': _Q,
    'filter_quals': filter_quals,
    'fitting_penalty': fit_pen,
    'ff': _ff,
    'threads': threads,
    'sep': sep,
}

### Setting my testdata as arguments

## Running EBfilter createCache on my data

In [24]:
# Point the cache-generation run at the local test data below $HOME.
HOME = os.environ['HOME']
args.update({
    'pon_list': f'{HOME}/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_pon.list',
    'cache_path': f'{HOME}/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_pon.ABcache',
    'generate_cache': True,
})

In [25]:
# Pin the run to one thread and set the debug_mode flag in the shared state.
state['threads'] = 1
state['debug_mode'] = True
# NOTE(review): `run` is expected from `from code import run` in the imports cell;
# that import failed (IndentationError in cache.py), so this call raises the
# NameError recorded in the cell output.
run.main(args, state)
# IPython shell escape: list the contents of the output folder.
!ls output

NameError: name 'run' is not defined

## Running EBfilter in Cache mode on my data

In [6]:
# Arguments for the cache-mode run; the log file sits next to the output file.
HOME = os.environ['HOME']
args.update({
    'pon_list': f'{HOME}/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_pon.list',
    'output_path': 'output/test_rel_eb.csv',
    'region': '',
})
# Same path as the output file, with the extension swapped for ".log".
log_file = os.path.splitext(args['output_path'])[0] + ".log"
state['log'] = log_file

In [28]:
from functools import partial
import os
import subprocess
import pandas as pd
from io import StringIO
from subprocess import Popen, DEVNULL, PIPE
# Log file that later receives the stderr of the samtools mpileup call.
state['log'] = 'output/pileup.log'
# Header-less list file; column 0 is used as the BAM path further down.
pon_df = pd.read_csv('testdata/list_normal_sample.txt', header=None)
pon_sub_folder = 'output/pon'
# makedirs(exist_ok=True) also creates the parent 'output' folder when it is
# missing (a plain mkdir would fail there) and is safe to re-run.
os.makedirs(pon_sub_folder, exist_ok=True)
chromosome = 'chr11'

### get the chromosome list from one of the pon bam files

In [29]:
def bam_to_chr_list(bam_file):
    """Return column 0 of `samtools idxstats <bam_file>` as a list.

    The command's stdout is decoded as UTF-8 and parsed as a header-less,
    tab-separated table; stderr is discarded.
    """
    idxstats_proc = Popen(['samtools', 'idxstats', bam_file], stdout=PIPE, stderr=DEVNULL)
    stdout_bytes, _ = idxstats_proc.communicate()
    idxstats_table = pd.read_csv(StringIO(stdout_bytes.decode('utf-8')), sep='\t', header=None)
    # Transposing a Series returns the Series itself, so plain column access suffices.
    first_column = idxstats_table[0]
    return list(first_column)
# Query the first BAM of the PoN list. Despite its name, `bam_list` holds the
# values of idxstats column 0 (the output below shows 'chr1'), not BAM paths.
bam_list = bam_to_chr_list(pon_df.iloc[0,0])
bam_list[0]

'chr1'

### split bams for multithreading

In [33]:
def split_bam(chromosome, pon_row):
    """Extract one region of a BAM into `pon_sub_folder` and index the result.

    Args:
        chromosome: region passed to `samtools view` (converted with str()).
        pon_row: row whose element 0 is the path of the source BAM.

    Returns:
        Path of the written BAM: <pon_sub_folder>/<source stem>_<chromosome>.bam

    Raises:
        subprocess.CalledProcessError: if either samtools call exits non-zero.
    """
    source_bam = pon_row[0]
    region = str(chromosome)
    stem = os.path.splitext(os.path.basename(source_bam))[0]
    bam_out = os.path.join(pon_sub_folder, f"{stem}_{region}.bam")
    # First write the regional BAM, then build its index.
    for samtools_cmd in (
        ["samtools", "view", "-b", "-o", bam_out, source_bam, region],
        ["samtools", "index", bam_out],
    ):
        subprocess.check_call(samtools_cmd)
    return bam_out
# Split every BAM of the PoN list down to `chromosome` and collect the new paths.
pon_sub_df = pd.DataFrame()
pon_sub_df['bam'] = pon_df.apply(partial(split_bam, chromosome), axis=1)
# Write the per-chromosome BAM list (one path per line, no header, no index).
pon_file_sub = os.path.join(pon_sub_folder, f"pon_list_{chromosome}.txt")
pon_sub_df.to_csv(pon_file_sub, header=False, index=False)
# Number of BAMs in the list; derived from the data instead of hard-coded so the
# pileup column names always match the number of samples.
pon_count = len(pon_sub_df)

### get the pileup from the pon_list

In [34]:
def get_pileup_df(pon_file_sub):
    """Run `samtools mpileup` on a BAM list file and return the result as a DataFrame.

    Reads the module-level `state` dict (keys 'q', 'Q', 'ff', 'log') for the
    mpileup options and the log path, and the module-level `pon_count` for the
    number of (depth, read, Q) column triples to name.

    Args:
        pon_file_sub: path of the BAM list file passed to mpileup via `-b`.

    Returns:
        DataFrame with columns Chr, Start, Ref followed by depth{i}, read{i},
        Q{i} for i in range(pon_count).
    """
    import csv  # local import: only needed for the quoting constant below

    mpileup_cmd = [
        "samtools", "mpileup", "-B", "-d", "10000000",
        "-q", str(state['q']), "-Q", str(state['Q']), "--ff", state['ff'],
        "-b", pon_file_sub,
    ]
    # Run mpileup exactly once; its stderr goes to the log file (truncated on each call).
    with open(state['log'], 'w') as log:
        pileup_stream = Popen(mpileup_cmd, stdout=PIPE, stderr=log)
        pileup_bytes = pileup_stream.communicate()[0]

    names = ['Chr', 'Start', 'Ref']
    for i in range(pon_count):
        names += [f"depth{i}", f"read{i}", f"Q{i}"]

    # The pileup text is not CSV-quoted, so a '"' in a read or quality column
    # must be read literally rather than as a quote character.
    return pd.read_csv(
        StringIO(pileup_bytes.decode('utf-8')),
        sep='\t',
        header=None,
        names=names,
        quoting=csv.QUOTE_NONE,
    )
# Pile up all BAMs named in the per-chromosome list file written in the split step.
pileup_df = get_pileup_df(pon_file_sub)

In [36]:
# NOTE(review): [:-20] selects every row except the last 20 — confirm that a
# short preview such as [:20] / .head(20) was not intended.
pileup_df[:-20]

Unnamed: 0,Chr,Start,Ref,depth0,read0,Q0,depth1,read1,Q1,depth2,...,Q6,depth7,read7,Q7,depth8,read8,Q8,depth9,read9,Q9
0,chr11,192951,N,1,^]C,I,1,^]C,I,0,...,*,1,^]C,I,0,*,*,1,^]C,I
1,chr11,192952,N,2,A^]A,II,2,A^]A,II,0,...,*,1,A,I,0,*,*,2,A^]A,II
2,chr11,192953,N,2,CC,II,2,CC,II,1,...,*,2,C^]C,II,0,*,*,2,CC,II
3,chr11,192954,N,2,TT,II,3,TT^]T,III,2,...,*,3,TT^]T,III,0,*,*,2,TT,II
4,chr11,192955,N,2,GG,II,3,GGG,III,2,...,*,3,GGG,III,0,*,*,2,GG,II
5,chr11,192956,N,2,GG,II,3,GGG,III,2,...,I,3,GGG,III,0,*,*,2,GG,II
6,chr11,192957,N,2,GG,II,3,GGG,III,2,...,I,4,GGG^]G,IIII,0,*,*,2,GG,II
7,chr11,192958,N,2,GG,II,3,GGG,III,2,...,III,4,GGGG,IIII,0,*,*,3,GG^]G,III
8,chr11,192959,N,2,GG,II,3,GGG,III,3,...,III,4,GGGG,IIII,1,^]G,I,3,GGG,III
9,chr11,192960,N,2,CC,II,3,CCC,III,4,...,IIII,5,CCCC^]C,IIIII,1,C,I,3,CCC,III
