# Implementation of the EBfilter by genomon

* EBrun (originally EBFilter) is an argparse wrapper passing command line arguments to run.py (is not needed for internal use)
* passed arguments:
    * targetMutationFile: the .vcf or .anno containing the mutations – needed --> mut_file
    * targetBamPath: path to the tumor bam file (+.bai) – needed --> tumor_bam
    * controlBamPathList: text list of path to PoN bam files (+ .bai) – needed --> pon_list
    * outputPath: clear  – needed --> output_path
    * -f option for anno or vcf – not needed --> will be inferred from .ext
    * thread_num: –not needed --> taken from config
    * -q option for quality threshold – not needed --> default _q config
    * -Q option for base quality threshold - not needed --> default _Q from config
    * --ff option for filter flags – not needed because of preprocessing??
    * --loption for samtools mpileup -l option – must elaborate..
    * --region option for restriction of regions on mpileup -l – must elaborate..
    * --debug – not needed

## Initiation

### imports

In [87]:
import pandas as pd
import numpy as np
import vcf
import pysam
import sys
import os
import subprocess
import math
import scipy.stats as ss
import scipy.optimize as so
import re
import multiprocessing

### snakemake config

In [88]:
config = {'EB':{'run': True}}
params = {}
params['map_quality'] = 20
params['base_quality'] = 15
params['filter_flags'] = 'UNMAP,SECONDARY,QCFAIL,DUP'
params['loption'] = True
config['EB']['threads'] = 1
config['EB']['params'] = params

### function args

In [89]:
args = {}
args['mut_file'] = 'testdata/input.anno'
args['tumor_bam'] = 'testdata/tumor.bam'
args['pon_list'] = 'testdata/list_normal_sample.txt'
args['output_path'] = 'output/output.anno'
args['region'] = ''

In [90]:
def main(args):
    '''
    validates files and refers to respective functions
    '''

    # should add validity check for arguments
    mut_file = args['mut_file']
    tumor_bam = args['tumor_bam']
    pon_list = args['pon_list']
    output_path = args['output_path']
    is_anno = not(os.path.splitext(test)[-1] == '.vcf')
    region = args['region']

    # file existence check
    validate(mut_file, tumor_bam, pon_list) 
    if threads == 1:
        # non multi-threading mode
        if is_anno:
            EBFilter_worker_anno(mut_file, tumor_bam, pon_list, output_path, map_quality, base_qual_thres, filter_flags, is_loption, region, debug_mode)
        else: 
            EBFilter_worker_vcf(mut_file, tumor_bam, pon_list, output_path, map_quality, base_qual_thres, filter_flags, is_loption, region, debug_mode)

### load the config state

In [91]:
debug_mode = True
params = config['EB']['params']
threads = config['EB']['threads']
_q = params['map_quality']
_Q = params['base_quality']
_ff = params['filter_flags']
is_loption = params['loption']
log_file = 'output/logs'

### worker_anno

In [113]:
def EBFilter_worker_anno(mut_file, tumor_bam, pon_list, output_path, region):

    pon_count = sum(1 for line in open(pon_list, 'r'))

    # --> process_anno
    if is_loption:
        make_region_list(mut_file) # in utils

    # generate pileup files
    anno2pileup(mut_file, output_path, tumor_bam, region)
    anno2pileup(mut_file, output_path, pon_list, region)
    ##########

    # delete region_list.bed
    if is_loption and not debug_mode:
        subprocess.check_call(["rm", "-f", f"{mut_file}.region_list.bed"])

    ##########
    # load pileup files
    pos2pileup_target = {}
    pos2pileup_control = {}

    with open(f"{output_path}.target.pileup", 'r') as file_in:
        for line in file_in:
            field = line.rstrip('\n').split('\t')
            pos2pileup_target[field[0] + '\t' + field[1]] = '\t'.join(field[3:])

    with open(f"{output_path}.control.pileup", 'r') as file_in:
        for line in file_in:
            field = line.rstrip('\n').split('\t')
            pos2pileup_control[field[0] + '\t' + field[1]] = '\t'.join(field[3:])
    ##########

     ##########
    # get restricted region if not None
    if is_loption and region:
        region_match = region_exp.match(region)
        reg_chr = region_match.group(1)
        reg_start = int(region_match.group(2))
        reg_end = int(region_match.group(3))

SyntaxError: invalid syntax (<ipython-input-113-e5fdb45c334e>, line 24)

### utils

In [93]:
def validate_region(region):
    '''
    returns True if region 
    '''
    region_exp = re.compile('^[^ \t\n\r\f\v,]+:\d+\-\d+')
    # region format check
    if region:
        region_match = region_exp.match(region)
        if region_match:
            return True

def validate(mut_file, tumor_bam, pon_list):
    # file existence check
    if not os.path.exists(mut_file):
        sys.stderr.write(f"No target mutation file: {mut_file}")
        sys.exit(1)
    if not os.path.exists(tumor_bam):
        sys.stderr.write(f"No target bam file: {tumor_bam}")
        sys.exit(1)
    if not os.path.exists(tumor_bam + ".bai") and not os.path.exists(re.sub(r'bam$', "bai", tumor_bam)):
        sys.stderr.write(f"No index for target bam file: {tumor_bam}")
        sys.exit(1)

    if not os.path.exists(pon_list):
        sys.stderr.write(f"No control list file: {pon_list}")
        sys.exit(1)
        
    with open(pon_list) as hIN:
        for file in hIN:
            file = file.rstrip()
            if not os.path.exists(file):
                sys.stderr.write(f"No control bam file: {file}")
                sys.exit(1)
            if not os.path.exists(file + ".bai") and not os.path.exists(re.sub(r'bam$', "bai", file)):
                sys.stderr.write(f"No index for control bam file: {file}")
                
def make_region_list(anno_path):
    # make bed file for mpileup
    out_path = f"{anno_path}.region_list.bed"
    with open(anno_path) as file_in:
        with open(out_path, 'w') as file_out:
            for line in file_in:
                field = line.rstrip('\n').split('\t')
                loc = int(field[1]) - (field[4] == "-")  # -1 if field 4 == '-' eg. deletion 
                print(field[0], (loc - 1), loc, file=file_out, sep='\t')

### main

In [104]:
def main(args):
    '''
    validates files and refers to respective functions
    '''

    # should add validity check for arguments
    mut_file = args['mut_file']
    tumor_bam = args['tumor_bam']
    pon_list = args['pon_list']
    output_path = args['output_path']
    is_anno = not(os.path.splitext(mut_file)[-1] == '.vcf')
    region = args['region']

    # file existence check
    validate(mut_file, tumor_bam, pon_list) 
    if threads == 1:
        # non multi-threading mode
        if is_anno:
            EBFilter_worker_anno(mut_file, tumor_bam, pon_list, output_path, region)
        else: 
            EBFilter_worker_vcf(mut_file, tumor_bam, pon_list, output_path, region)
    else:
        # multi-threading mode
        ##########

        if is_anno:
            # partition anno files
            partition_anno(mut_file, output_path, threads)

            jobs = []

            for i in range(threads):
                worker_args = (f"{output_path}.{i}", tumor_bam, pon_list, f"{output_path}.{i}", region)
                process = multiprocessing.Process(target=EBFilter_worker_anno, args=worker_args)
                    
                jobs.append(process)
                process.start()
        
            # wait all the jobs to be done
            for i in range(threads):
                jobs[i].join()
        
            # merge the individual results
            merge_anno(output_path, threads)
        
            # delete intermediate files
            if debug_mode == False:
                for i in range(threads):
                    subprocess.check_call(["rm", f"{output_path}.{i}", f"{output_path}.{i}.control.pileup", f"{output_path}.{i}.target.pileup"])

### anno2pileup

In [105]:
def anno2pileup(anno_path, out_path, bam_or_pon, region):
    '''
    creates a pileup from all the entries in the anno file
    '''
    with open(log_file, 'w') as log:
        with open(anno_path, 'r') as file_in:
            with open(out_path, 'w') as file_out:
                mpileup_cmd = ["samtools", "mpileup", "-B", "-d", "10000000", "-q", _q, "-Q", _Q, "--ff", _ff]

                # add tumor_bam or pon_list of bam files depending on file extension of bam_or_pon
                if (os.path.splitext(bam_or_pon)[-1] == '.bam'):
                    mpileup_cmd += [bam_or_pon]
                else:
                    mpileup_cmd += ["-b", bam_or_pon]

                if is_loption:
                    # region_list.bed is generated by worker_anno
                    mpileup_cmd += ["-l", f"{anno_path}.region_list.bed"]

                    if region:
                        mpileup_cmd = mpileup_cmd + ["-r", region]
                    subprocess.check_call([str(command) for command in mpileup_cmd], stdout=file_out, stderr=log) # maybe logging
                # no loption 
                else: 
                    # get lines of anno file
                    for line in file_in:
                        print('anno2pileup', line, bam_or_pon)
                        field = line.rstrip('\n').split('\t')
                        loc = int(field[1]) - (field[4] == "-") # -1 if field 4 == '-' eg. deletion
                        mutReg = f"{field[0]}:{loc}-{loc}"
                
                        # set region for mpileup
                        mpileup_cmd += ["-r", mutReg]
                        subprocess.check_call([str(command) for command in mpileup_cmd], stdout=file_out, stderr=log)

In [106]:
threads = 1
main(args)

In [107]:
ls output

logs                          output.anno.2.control.pileup
output.anno                   output.anno.2.target.pileup
output.anno.0.control.pileup  output.anno.control.pileup
output.anno.0.target.pileup   output.anno.sub.anno.1
output.anno.1.control.pileup  output.anno.sub.anno.2
output.anno.1.target.pileup   output.anno.target.pileup


### Multithreading

In [108]:
def partition_anno(anno_path, out_path, threads):

    
    with open(anno_path, 'r') as file_in:
        # get line number
        record_num = sum(1 for line in file_in)
        file_in.seek(0,0)
        threads = min(record_num, threads)
        # get lines per subprocess
        frac_lines = record_num / threads

        current_sub = current_line = 0
        file_out = open(f"{out_path}.{current_sub}", 'w')
        for line in file_in:
            print(line.rstrip("\n"), file=file_out) 
            current_line += 1
            if (current_line >= frac_lines) and (current_sub < threads - 1):
                current_sub += 1
                current_line = 0
                file_out.close()
                file_out = open(f"{out_path}.{current_sub}", 'w')
        file_out.close()

    return threads


def merge_anno(out_path, threads):

    file_out = open(out_path, 'w')
    for i in range(threads):
        file_in = open(f"{out_path}.{i}", 'r')
        for line in file_in:
            print(line.rstrip('\n'), file=file_out)

In [111]:
threads = 3
debug_mode = False
main(args)

In [112]:
ls output/

logs         output.anno


In [105]:
mpileup = pd.read_csv('output/tumor.mpileup', sep='\t', header=None, names=['Chr', 'Start', 'ref', 'depth', 'reads', 'mapQ'], dtype={'Start':int, 'reads':str, 'mapQ': str})

In [29]:
mpileup[mpileup['Start'] == 193069]

Unnamed: 0,Chr,Start,ref,depth,reads,mapQ
290,chr11,193069,N,40,CCCCCCCCCCCCCCCCCCCCCCccCCCccCccCCCCCcCC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII


In [43]:
sys.exit(1)

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [42]:
list(range(3))

[0, 1, 2]