In [4]:
# get the code
import sys

# make the project's code folder importable; the membership check keeps
# sys.path free of duplicate entries when this cell is run more than once
code_path = '../code'
if code_path not in sys.path:
    sys.path.append(code_path)

## get the coverage on a given chromosome
+ CLI:
    * `samtools view $bam chr7 |`
        + extracts the reads for that chromosome
    * `bamCoverage [minCoverage=0] |`
        + would be better to have chromCoverage (where is it?)
        + would make it more performant
							
    * `rollingCoverage [rollingWindow=100] |`
        + a mean coverage is written out every windowSize/2 positions

    * `filterBed $BED chr7 [writeout exomCoords=1]`
        + filters the output for positions covered by the bedfile
        + filters the output to only exon-spanning rows

+ make the command run in memory using `StringIO`

### prepare the files

In [5]:
# set the paths
import os

# use the home folder of the current user instead of a hard-coded per-machine
# path; the former '/Users/mahtin' assignment was dead code because it was
# overwritten on the very next line
home = os.path.expanduser("~")
# test and tool data of the somVar project
testdata = f"{home}/Dropbox/Icke/Work/somVar/testdata"
tooldata = f"{home}/Dropbox/Icke/Work/somVar/tooldata"
# folder holding the mawk scripts (relative to the notebook)
shell_path = "../shell"
# static reference data and the SureSelect hg38 bed files inside it
static_path = f"{home}/Dropbox/Icke/Work/static"
bed_path = f"{static_path}/bed_files/SureSelect/hg38"

### make the config

In [6]:
# every mawk helper lives as <name>.mawk inside the shell folder
mawk_tools = ['bamCoverage', 'filterBed', 'rollingCoverage']
mawk_tool_dict = dict(
    (name, f"{shell_path}/{name}.mawk") for name in mawk_tools
)
# analysis settings come first, the tool paths are appended behind them
config = {
    'rollingWindowSize': 10,
    'bedfile': f"{bed_path}/SS_HAEv7_hg38_Covered.bed",
    **mawk_tool_dict,
}
config

{'rollingWindowSize': 10,
 'bedfile': '/Users/martinscience/Dropbox/Icke/Work/static/bed_files/SureSelect/hg38/SS_HAEv7_hg38_Covered.bed',
 'bamCoverage': '../shell/bamCoverage.mawk',
 'filterBed': '../shell/filterBed.mawk',
 'rollingCoverage': '../shell/rollingCoverage.mawk'}

In [7]:
import os
import shlex
from io import StringIO
from subprocess import PIPE, run

import pandas as pd

from script_utils import show_output, show_command

def get_coverage(bam_file, chrom='', config=None):
    '''
    creates a coverage_df for a bam file on a given chromosome

    Builds and runs the shell pipeline

        samtools view <bam_file> [chrom] | <bamCoverage> |
        <rollingCoverage> <rollingWindowSize> | <filterBed> <bedfile> [chrom] 1

    and parses its tab-separated stdout into a DataFrame.

    Args:
        bam_file: path of the bam file handed to `samtools view`
        chrom: region passed to `samtools view` and to the filterBed tool;
            an empty value leaves the argument out of both commands
        config: dict providing the keys 'bamCoverage', 'rollingCoverage' and
            'filterBed' (each treated as the path of one executable),
            'rollingWindowSize' and 'bedfile'

    Returns:
        the DataFrame read from the pipeline output

    Raises:
        KeyError: if config lacks one of the required keys
        subprocess.CalledProcessError: if the shell returns a non-zero status
    '''
    config = config or {}
    required = ('bamCoverage', 'rollingCoverage', 'filterBed',
                'rollingWindowSize', 'bedfile')
    missing = [key for key in required if key not in config]
    if missing:
        raise KeyError(f"config is missing the keys: {', '.join(missing)}")

    # an empty chrom must not turn into an (empty) shell argument
    chrom_args = [chrom] if chrom else []
    stages = [
        ['samtools', 'view', bam_file, *chrom_args],
        [config['bamCoverage']],
        [config['rollingCoverage'], config['rollingWindowSize']],
        # the 1 at the end is the option for the filterbed tool to output exonic coords
        [config['filterBed'], config['bedfile'], *chrom_args, 1],
    ]
    # quote every argument so that paths containing spaces or shell
    # metacharacters reach the tools unchanged
    cmd = " | ".join(
        " ".join(shlex.quote(str(arg)) for arg in stage) for stage in stages
    )
    show_command(cmd, multi=False)

    # NOTE(review): without pipefail the shell reports only the status of the
    # last stage, so check=True presumably does not catch a failing samtools call
    result = run(cmd, stdout=PIPE, check=True, shell=True)
    cov_df = pd.read_csv(StringIO(result.stdout.decode('utf-8')), sep='\t')
    return cov_df

In [8]:
# compute the coverage of the chr7 test bam
chrom = "chr7"
bam_file = "/".join([testdata, "bam", "002_A.chr7.bam"])
cov_df = get_coverage(bam_file, chrom=chrom, config=config)

[92mProcess 12421[0m : [1m$ samtools view /Users/martinscience/Dropbox/Icke/Work/somVar/testdata/bam/002_A.chr7.bam chr7 | ../shell/bamCoverage.mawk | ../shell/rollingCoverage.mawk 10 | ../shell/filterBed.mawk /Users/martinscience/Dropbox/Icke/Work/static/bed_files/SureSelect/hg38/SS_HAEv7_hg38_Covered.bed chr7 1[0m


In [9]:
# show the coverage table returned by get_coverage
cov_df

Unnamed: 0,Chr,Pos,Coverage,XPos
0,chr7,19800,67.6,1
1,chr7,19805,70.9,6
2,chr7,19810,76.2,11
3,chr7,19815,83.0,16
4,chr7,19820,86.1,21
...,...,...,...,...
463355,chr7,159144845,22.6,2423529
463356,chr7,159144850,21.2,2423534
463357,chr7,159144855,20.3,2423539
463358,chr7,159144860,18.0,2423544
