# Getting SNP and coverage data from BAM files or pileup files
## Python wrapper around samtools / mawk tools
+ bam --> mpileup --> clean --> filter --> pile2CNV
+ mpileup --> clean --> filter --> pile2CNV
+ cleanpileup --> filter --> pile2CNV

In [None]:
# get the code
import sys
import os
sys.path.append('../code')


# HOME
# the home directory differs per machine; only the last assignment was ever
# effective, so the alternative is kept as a comment instead of a dead assignment
# home = '/Users/mahtin'
home = '/Users/martinscience'

# standard paths
static = os.path.join(home, "Dropbox/Icke/Work/static")
cluster_path = os.path.join(home, "mount")
tooldata = os.path.join(home, "Dropbox/Icke/Work/somVar/tooldata")
testdata = os.path.join(home, "Dropbox/Icke/Work/somVar/testdata")


cnvdata = os.path.join(tooldata, "myCNVdata")
output_path = os.path.join(cnvdata, "output")

# the path to the input data
cnv_path = os.path.join(cnvdata, "cnv")
# the chained assignment `PONCOV_path = cnv_path = ...` used to clobber cnv_path
# with the chromCov folder; the two paths are now independent
PONCOV_path = os.path.join(cnvdata, "chromCov")

### Get the config
+ use the `get_CNVconfig` utility function to update the general config with the appropriate local paths
+ this makes the config more pluggable

In [None]:
from script_utils_CNV import get_CNVconfig
# load the general YAML config and overlay the machine-specific paths
CNVconfig = get_CNVconfig(
    "../config/config_CNV.yaml",
    local_config=dict(
        mawk_path="../shell",
        bed_file=os.path.join(static, "bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed"),
        genome_split_path=os.path.join(static, "genome/gatk/hg38/split"),
        # NOTE(review): identical to genome_split_path -- confirm this is intended
        gc_split_path=os.path.join(static, "genome/gatk/hg38/split"),
        # anchored at `static` like the other reference paths; the previous
        # single-argument os.path.join left this relative to the working directory
        genmap_split_path=os.path.join(static, "annotation/genmap/hg38/split"),
        pon_cov_path=os.path.join(cnvdata, "chromCov")
    ))
# last expression of the cell: echo the resulting config
CNVconfig

## run the code

In [None]:
from CNV_raw import get_rawCNV

# sample 02, chr1: start from an already cleaned pileup and keep the result
cov_df = get_rawCNV(
    chrom="chr1",
    config=CNVconfig,
    clean_TN_pileup_file=os.path.join(cnvdata, "cleanpileup/02_A-B.chr1.gz"),
    SNP_output=os.path.join(output_path, "pile2CNV/02_A-B.chr1.snp"),
)
# store the returned table as gzipped, tab-separated text
cov_df.to_csv(
    os.path.join(output_path, "pile2CNV/02_A-B.chr1.rawcov.gz"),
    sep="\t",
    index=False,
    compression="gzip",
)
cov_df

# sample 03, chr1: same call, result is only echoed as the cell output
get_rawCNV(
    chrom="chr1",
    config=CNVconfig,
    clean_TN_pileup_file=os.path.join(cnvdata, "cleanpileup/03_A-B.chr1.gz"),
    SNP_output=os.path.join(output_path, "pile2CNV/03_A-B.chr1.snp"),
)

In [None]:
# sample 03, chr7: call get_rawCNV with a normal/tumor bam pair instead of a pileup file
get_rawCNV(
    chrom="chr7",
    config=CNVconfig,
    tumor_bam=os.path.join(testdata, "bam/03_A.chr7.bam"),
    normal_bam=os.path.join(testdata, "bam/03_B.chr7.bam"),
    SNP_output=os.path.join(output_path, "pile2CNV/03_A-B.chr7.snp"),
)

In [None]:
# echo cov_df (assigned from a get_rawCNV call in an earlier cell) as the cell output
cov_df

##  the function

In [None]:
from script_utils_CNV import show_output, cmd2df


def get_rawCNV(
    normal_bam="",
    tumor_bam="",
    tumor_bams=None,
    TN_pileup_file="",  # mpileup from normalbam, tumorbam(s)
    clean_TN_pileup_file="",  # mpileup from normalbam, tumorbam(s) with cleanpileup.mawk already done
    pileup_is_clean=True,
    chrom="",
    config=None,
    SNP_output="",
):
    """
    Wrapper around the CLI chain around the core tool pile2CNV.mawk.

    The shell pipeline always ends with ``filterBed | pile2CNV``; what feeds it
    depends on which input is given (first match wins):

    1. ``clean_TN_pileup_file``: the file is streamed directly into the chain
    2. ``TN_pileup_file``: the file is streamed through ``cleanpileup.mawk -d`` first
    3. bam files: ``samtools mpileup`` is run on ``normal_bam`` followed by
       ``tumor_bams`` (if given) or ``tumor_bam``, then cleaned as in 2.

    Files ending in ``.gz`` are read with ``gunzip <``, all others with ``cat``.

    Args:
        normal_bam: path to the normal bam (only used for input mode 3).
        tumor_bam: path to a single tumor bam (mode 3, ignored if tumor_bams is set).
        tumor_bams: list of tumor bam paths (mode 3); a single string is accepted too.
        TN_pileup_file: path to a raw pileup file (mode 2).
        clean_TN_pileup_file: path to an already cleaned pileup file (mode 1).
        pileup_is_clean: currently unused; kept for interface compatibility.
        chrom: chromosome name passed to filterBed (-c) and, in mode 3, to samtools (-r).
        config: config dict; reads "mawk_path", "bed_file", "hetSNP" (normalVAF, minDepth),
            "coverage" (minCov) and, in mode 3, "pileup" (MAPQ, Q) and "genome_split_path".
        SNP_output: output path handed to pile2CNV via -o (required).

    Returns:
        The result of ``cmd2df`` for the assembled command (callers treat it as a
        DataFrame), ``None`` if ``SNP_output`` is missing, or the command string
        itself if running the command raised an exception.
    """

    # None instead of mutable defaults; an empty config still fails on the first lookup
    config = {} if config is None else config
    if tumor_bams is None:
        tumor_bams = []
    elif isinstance(tumor_bams, str):
        # a bare string would otherwise be unpacked character by character
        tumor_bams = [tumor_bams]

    # PARAMS
    # unwrap mawk tools
    def mawk(tool):
        return os.path.join(config["mawk_path"], f"{tool}.mawk")

    def stream_file(path):
        # gzipped files are decompressed on the fly, everything else is passed through as is
        if os.path.splitext(path)[1] == ".gz":
            return f"gunzip < {path}"
        return f"cat {path}"

    sc = config["hetSNP"]
    cc = config["coverage"]

    # create the basic command and unpack required params
    # ##### FILTERBED
    filter_cmd = f"{mawk('filterBed')} {config['bed_file']} -x -c {chrom}"

    if not SNP_output:
        show_output("Output file for heteroSNP is missing!", color="warning")
        return
    # ##### PILE2CNV
    cnv_cmd = f"{mawk('pile2CNV')} -x -o {SNP_output} -v {sc['normalVAF'][0]} -V {sc['normalVAF'][1]} -d {sc['minDepth']} -c {cc['minCov']}"

    # the pipeline is collected back to front: the tail is fixed, the head depends on the input
    stages = [filter_cmd, cnv_cmd]

    if clean_TN_pileup_file:
        stages.insert(0, stream_file(clean_TN_pileup_file))
    else:
        # everything else still needs the cleanup step
        stages.insert(0, f"{mawk('cleanpileup')} -d")
        if TN_pileup_file:
            stages.insert(0, stream_file(TN_pileup_file))
        else:
            # pileup has to be done
            # get the bam files: normal first, then all tumors
            if tumor_bams:
                # str.join takes ONE iterable; the former two-argument call raised TypeError
                bams = " ".join([normal_bam, *tumor_bams])
            else:
                bams = f"{normal_bam} {tumor_bam}"
            # get params from config
            pc = config["pileup"]
            split_genome = os.path.join(config["genome_split_path"], f"{chrom}.fa")
            pileup_cmd = f"samtools mpileup -f {split_genome} -l {config['bed_file']} -r {chrom} -q {pc['MAPQ']} -Q {pc['Q']} {bams}"
            stages.insert(0, pileup_cmd)

    cmd = " | ".join(stages)

    try:
        cov_df = cmd2df(cmd, show=True, multi=False)
    except Exception:
        # best effort: report and hand back the command for manual inspection
        # (Exception instead of a bare except, so Ctrl-C / SystemExit still propagate)
        show_output("There was an error using shell command", color="warning")
        return cmd
    return cov_df


In [None]:
# sample 03, chr7: run get_rawCNV starting from the normal/tumor bam pair
get_rawCNV(
    SNP_output=os.path.join(output_path, "pile2CNV/03_A-B.chr7.snp"),
    chrom="chr7",
    normal_bam=os.path.join(testdata, "bam/03_B.chr7.bam"),
    tumor_bam=os.path.join(testdata, "bam/03_A.chr7.bam"),
    config=CNVconfig,
)

### run all chroms

In [None]:
# autosomes chr1..chr22 plus chrX
chrom_list = [f"chr{c + 1}" for c in range(22)] + ['chrX']
for chrom in chrom_list:
    # cleaned pileups live below cnvdata, as in the single-chromosome cells
    pileup_file = os.path.join(cnvdata, f"cleanpileup/03_A-B.{chrom}.gz")
    show_output(f"pile2CNV for {os.path.basename(pileup_file)}")
    cov_df = get_rawCNV(
        # must be passed by keyword: the first positional parameter is normal_bam
        clean_TN_pileup_file=pileup_file,
        chrom=chrom,
        SNP_output=os.path.join(output_path, f"pile2CNV/03_A-B.{chrom}.snp"),
        config=CNVconfig,
    )
    # get_rawCNV returns None (missing SNP output) or the command string (shell error)
    # instead of a table when it fails; skip those instead of crashing on .to_csv
    if cov_df is None or isinstance(cov_df, str):
        show_output(f"No coverage data for {chrom}, skipping", color="warning")
        continue
    cov_df.to_csv(
        os.path.join(output_path, f"pile2CNV/03_A-B.{chrom}.rawcov.gz"),
        sep="\t",
        index=False,
        compression="gzip",
    )
show_output("FINISHED", color="success")