# Idea: ABcache
## Getting a precomputed ABcache file that contains the beta-binomial distribution parameters derived from the PoN list

## Setup arguments and state variable

### imports

In [None]:
import os
from code import run
import pandas as pd

### snakemake config

In [None]:
# EBfilter parameters, kept under their own name so that config['EB']['params']
# refers to this very dict.
params = {
    'map_quality': 20,
    'base_quality': 15,
    'filter_flags': 'UNMAP,SECONDARY,QCFAIL,DUP',
    'fitting_penalty': .5,
    'caching': True,
}
# to simulate snakemake behavior
config = {
    'EB': {'run': True, 'threads': 1, 'params': params},
    'annovar': {'sep': '\t'},
}

### load the config and global state

In [None]:
import os
from code import run
import pandas as pd

# EBfilter parameters, kept under their own name so that config['EB']['params']
# refers to this very dict.
params = {
    'map_quality': 20,
    'base_quality': 15,
    'filter_flags': 'UNMAP,SECONDARY,QCFAIL,DUP',
    'fitting_penalty': .5,
    'caching': True,
}
# to simulate snakemake behavior
config = {
    'EB': {'run': True, 'threads': 1, 'params': params},
    'annovar': {'sep': '\t'},
}


# Flatten the config into the argument dict and the global state dict.
args = {}
params = config['EB']['params']
threads = config['EB']['threads']
sep = config['annovar']['sep']
_q = str(params['map_quality'])  # mapping quality=20
_Q = params['base_quality']      # base quality=15
fit_pen = params['fitting_penalty']
# qual asciis for filtering out: chr(33) up to, but excluding, chr(33 + _Q);
# joined in one pass instead of growing the string character by character
filter_quals = ''.join(map(chr, range(33, 33 + _Q)))
_ff = params['filter_flags']     # 'UNMAP,SECONDARY,QCFAIL,DUP'
state = {
    'q': _q,
    'Q': _Q,
    'filter_quals': filter_quals,
    'fitting_penalty': fit_pen,
    'ff': _ff,
    'threads': threads,
    'sep': sep,
}

### Setting my testdata as arguments

# Running EBfilter createCache on testdata

In [None]:
import os
from code import run
import pandas as pd

# EBfilter parameters, kept under their own name so that config['EB']['params']
# refers to this very dict.
params = {
    'map_quality': 20,
    'base_quality': 15,
    'filter_flags': 'UNMAP,SECONDARY,QCFAIL,DUP',
    'fitting_penalty': .5,
    'caching': True,
}
# to simulate snakemake behavior
config = {
    'EB': {
        'run': True,
        'threads': 1,
        'params': params,
        'log': 'output/logfile',
    },
    'annovar': {'sep': '\t'},
}


# Flatten the config into the argument dict and the global state dict.
args = {}
params = config['EB']['params']
threads = config['EB']['threads']
log = config['EB']['log']
sep = config['annovar']['sep']
_q = str(params['map_quality'])  # mapping quality=20
_Q = params['base_quality']      # base quality=15
fit_pen = params['fitting_penalty']
# qual asciis for filtering out: chr(33) up to, but excluding, chr(33 + _Q);
# joined in one pass instead of growing the string character by character
filter_quals = ''.join(map(chr, range(33, 33 + _Q)))
_ff = params['filter_flags']     # 'UNMAP,SECONDARY,QCFAIL,DUP'
state = {
    'q': _q,
    'Q': _Q,
    'filter_quals': filter_quals,
    'log': log,
    'fitting_penalty': fit_pen,
    'ff': _ff,
    'threads': threads,
    'sep': sep,
}


# Arguments for generating a cache from the test PoN list.
args['pon_list'] = 'testdata/list_normal_sample.txt'
args['cache_path'] = 'output/test.cache'
args['generate_cache'] = True

In [None]:
# Override the thread count and set the debug_mode flag for this run.
state['threads'] = 3
state['debug_mode'] = True
# `run` is imported from the project's `code` package; what main() does with
# args and state is not visible in this notebook.
run.main(args, state)
# IPython shell escape: list the contents of the output folder.
!ls output

# Running EBfilter createCache on my data

In [None]:
# Switch the arguments to the aml_pon list below $HOME and request a cache
# file next to it.
HOME = os.environ['HOME']
args.update(
    pon_list=f'{HOME}/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_pon.list',
    cache_path=f'{HOME}/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_pon.ABcache',
    generate_cache=True,
)

## Running EBfilter in Cache mode on my data

In [None]:
# Arguments for a run against the aml_pon list; the log path is the output
# path with its extension swapped for ".log".
HOME = os.environ['HOME']
args.update(
    pon_list=f'{HOME}/Dropbox/Icke/Work/somVar/tools/EBFilter/mytestdata/aml_pon.list',
    output_path='output/test_rel_eb.csv',
    region='',
)
log_file = f"{os.path.splitext(args['output_path'])[0]}.log"
state['log'] = log_file

In [None]:
from functools import partial
import os
import subprocess
import pandas as pd
from io import StringIO
from subprocess import Popen, DEVNULL, PIPE
# Log file path stored in the shared state dict.
state['log'] = 'output/pileup.log'
# The PoN list is read without a header row; later cells use column 0 as the BAM path.
pon_df = pd.read_csv('testdata/list_normal_sample.txt', header=None)
pon_sub_folder = 'output/pon'
# exist_ok avoids the check-then-create race, and makedirs also creates a
# missing parent folder instead of failing.
os.makedirs(pon_sub_folder, exist_ok=True)
chromosome = 'chr11'

### get the chromosome list from one of the pon bam files

In [None]:
def bam_to_chr_list(bam_file):
    """Return the first column of `samtools idxstats <bam_file>` as a list.

    The stdout of samtools is decoded as UTF-8 and parsed as a headerless,
    tab-separated table; its stderr is discarded.
    """
    idxstats = Popen(['samtools', 'idxstats', bam_file], stdout=PIPE, stderr=DEVNULL)
    raw_stdout, _ = idxstats.communicate()
    table = pd.read_csv(StringIO(raw_stdout.decode('utf-8')), sep='\t', header=None)
    first_column = table[0]
    return list(first_column)
# Query the BAM file in the first row / first column of the PoN list.
bam_list = bam_to_chr_list(pon_df.iloc[0,0])
# Bare expression: shows the second list entry as the cell output.
bam_list[1]

### split bams for multithreading

In [None]:
def split_bam(chromosome, pon_row):
    """Extract one region of a BAM file into its own indexed BAM file.

    Args:
        chromosome: region passed to `samtools view` (converted with str()).
        pon_row: row of the PoN list; entry 0 is used as the source BAM path.

    Returns:
        Path of the new BAM file inside the module-level `pon_sub_folder`,
        named `<source stem>_<chromosome>.bam`.

    Raises:
        subprocess.CalledProcessError: if `samtools view` or `samtools index`
            exits with a non-zero status.
    """
    source_bam = pon_row[0]
    stem, _ = os.path.splitext(os.path.basename(source_bam))
    region = str(chromosome)
    target_bam = os.path.join(pon_sub_folder, f"{stem}_{region}.bam")
    subprocess.check_call(["samtools", "view", "-b", "-o", target_bam, source_bam, region])
    subprocess.check_call(["samtools", "index", target_bam])
    return target_bam
# Split every BAM of the PoN list down to `chromosome` and collect the new paths.
pon_sub_df = pd.DataFrame()
pon_sub_df['bam'] = pon_df.apply(partial(split_bam, chromosome), axis=1)
# Write the paths as a plain list (no header, no index) for `samtools mpileup -b`.
pon_file_sub = os.path.join(pon_sub_folder, f"pon_list_{chromosome}.txt")
pon_sub_df.to_csv(pon_file_sub, header=False, index=False)
# The pileup has one depth/read/quality column triple per listed BAM, so the
# count is taken from the list itself rather than hard-coded.
pon_count = len(pon_sub_df)

### get the pileup from one of the sub pon_lists (eg. pon_list_chr11.txt)

In [None]:
def get_pileup_df(pon_file_sub):
    """Run `samtools mpileup` over a list of BAM files and return the pileup.

    Reads the module-level `state` dict (keys 'log', 'q', 'Q', 'ff') and the
    module-level `pon_count`.

    Args:
        pon_file_sub: path of a text file listing the BAM files; handed to
            mpileup through its `-b` option.

    Returns:
        DataFrame parsed from the tab-separated mpileup output, with columns
        Chr, Start, Ref followed by depth<i>, read<i>, Q<i> for every i in
        range(pon_count).

    Side effects:
        state['log'] is truncated and receives the stderr of samtools.
    """
    mpileup_cmd = [
        "samtools", "mpileup", "-B", "-d", "10000000",
        "-q", str(state['q']),
        "-Q", str(state['Q']),
        "--ff", state['ff'],
        "-b", pon_file_sub,
    ]
    # mpileup is run exactly once; the earlier version executed the identical
    # command twice and threw the first result away.
    with open(state['log'], 'w') as log:
        pileup_stream = Popen(mpileup_cmd, stdout=PIPE, stderr=log)
        pileup_bytes = pileup_stream.communicate()[0]
    names = ['Chr', 'Start', 'Ref']
    for i in range(pon_count):
        names += [f"depth{i}", f"read{i}", f"Q{i}"]
    pileup_string = StringIO(pileup_bytes.decode('utf-8'))
    return pd.read_csv(pileup_string, sep='\t', header=None, names=names)
# Build the pileup table for the BAM files listed in pon_file_sub (runs samtools mpileup).
pileup_df = get_pileup_df(pon_file_sub)

In [None]:
# np.array_split needs numpy, which none of the cells shown here imports.
import numpy as np

# Work on an independent copy of the first 100 pileup rows.
pileup_small = pileup_df.iloc[:100].copy()
# Cut the copy into three chunks.
pileup_dfs = np.array_split(pileup_small, 3)
# Bare expression: shows the index label of the first row of the second chunk.
pileup_dfs[1].iloc[0].name

### apply a column-wise function to several columns

In [None]:
import re
# Matches "^" together with the single character after it, or a "$".
sign_re = re.compile(r'\^.|\$')


def clean_reads(run_column):
    """Return run_column with every match of sign_re removed from each string.

    Args:
        run_column: pandas Series of strings.

    Returns:
        A new Series with the matches replaced by the empty string.
    """
    # regex=True is required: with the pandas >= 2.0 default (regex=False) a
    # compiled pattern is rejected with a ValueError.
    return run_column.str.replace(sign_re, '', regex=True)


# Remove every sign_re match from both read columns. regex=True is required:
# with the pandas >= 2.0 default (regex=False) a compiled pattern raises a ValueError.
pileup_small[['read0', 'read1']] = pileup_small[['read0', 'read1']].apply(lambda column: column.str.replace(sign_re, '', regex=True))


### create a new dataframe AB_df indexed by Chr and Start

In [None]:
from functools import partial
# Copy the first two columns so that the columns added to AB_df afterwards do
# not write into a slice of pileup_small (SettingWithCopy hazard).
AB_df = pileup_small.iloc[:, :2].copy()

def getit(var, row):
    """Return the depth1..depth4 entries of one pileup row.

    Args:
        var: accepted for use with functools.partial; the returned values do
            not depend on it.
        row: pandas Series carrying the labels depth1, depth2, depth3, depth4.

    Returns:
        Series with those four entries, labelled as in `row`.

    Side effects:
        Prints row.name.
    """
    # The placeholder Series that used to be allocated here was overwritten
    # right away, so the row selection is returned directly.
    depths = row[['depth1', 'depth2', 'depth3', 'depth4']]
    print(row.name)
    return depths
# For every base, add its four columns, filled from the per-row result of getit.
for var in 'ACTG':
    target_columns = [f'{var}+a', f'{var}+b', f'{var}-a', f'{var}-b']
    row_to_depths = partial(getit, var)
    AB_df[target_columns] = pileup_small.apply(row_to_depths, axis=1)
AB_df = AB_df.set_index(['Chr', 'Start'])
# NOTE(review): `columns` is not defined in any cell shown here — confirm where it comes from.
AB_df.columns = columns
# Bare expression: shows the resulting frame as the cell output.
AB_df