# getting SNP and coverage data from PONmatrix
## python wrapper around samtools / mawktools
+ pipeline: gunzip PON matrix --> filterBed --> PON2CNV

In [4]:
# standard library and third-party imports
import os
import sys

import pandas as pd

# get the code
sys.path.append('../code')

# import package functions
from script_utils_CNV import get_CNVconfig, show_output, cmd2df
from CNV_raw import addGCratio, addGenmap, PON2CNV

# HOME
# Resolve the current user's home directory instead of hard-coding one
# machine-specific path (the former double assignment left the first value dead).
home = os.path.expanduser("~")

# standard paths (all located below the user's Dropbox work folder)
work_root = os.path.join(home, "Dropbox/Icke/Work")
static = os.path.join(work_root, "static")
tooldata = os.path.join(work_root, "somVar/tooldata")
testdata = os.path.join(work_root, "somVar/testdata")

# CNV-specific data folder and its output subfolder
cnvdata = os.path.join(tooldata, "myCNVdata")
output_path = os.path.join(cnvdata, "output")

# the path to the input data
cnv_path = os.path.join(cnvdata, "cnv")

### get the config
+ use the get_CNVconfig util function to update the general configs with the appropriate paths
+ makes the config more pluggable

In [5]:
# Load the general CNV config from YAML and pass the machine-specific
# paths (tool folder, bed file, GC / PON / genmap data) as local overrides.
CNVconfig = get_CNVconfig(
    "../config/config_CNV.yaml",
    local_config={
        "mawk_path": "../shell",
        "bed_file": os.path.join(static, "bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed"),
        "gc_split_path": os.path.join(static, "genome/gatk/hg38/split"),
        "PON_path": os.path.join(static, "PON/HAEv7_hg38_NovaSeq"),
        "genmap_split_path": os.path.join(static, "annotation/genmap/hg38/split"),
    },
)
# display the resulting config as the cell output
CNVconfig

{'pileup': {'MAPQ': 20, 'Q': 25},
 'hetSNP': {'normalVAF': [0.2, 0.8], 'minDepth': 15},
 'coverage': {'minCov': 0, 'minPONcov': 50, 'maxPONstd': 50},
 'PONcoverage': {'minCov': 0,
  'minVAF': 0.25,
  'minDepth': 50,
  'normCov': 100,
  'stdFactor': 2,
  'verbose_output': False},
 'mawk_path': '../shell',
 'bed_file': '/Users/mahtin/Dropbox/Icke/Work/static/bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed',
 'gc_split_path': '/Users/mahtin/Dropbox/Icke/Work/static/genome/gatk/hg38/split',
 'PON_path': '/Users/mahtin/Dropbox/Icke/Work/static/PON/HAEv7_hg38_NovaSeq',
 'genmap_split_path': '/Users/mahtin/Dropbox/Icke/Work/static/annotation/genmap/hg38/split'}

## run the code

In [None]:
# single-chromosome run; the first returned frame is shown as the cell output
chrom = "chr21"
pon_df, snp_df = PON2CNV(chrom=chrom, config=CNVconfig)
pon_df

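## the function
+ local development copy of `PON2CNV`; running this cell shadows the version imported from `CNV_raw`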

In [None]:
def PON2CNV(
    chrom="",
    config={}
):
    """
    Wrapper around the shell pipeline built around the core tool PON2CNV.mawk.

    The pipeline that is run is
        gunzip < <PON_path>/matrix/<chrom>.pon.gz | filterBed.mawk | PON2CNV.mawk
    PON2CNV.mawk is told (via -o) to write its SNP output to
    <PON_path>/snp/<chrom>.snp, and the pipeline's stdout is read into a
    dataframe via cmd2df.

    Args:
        chrom: chromosome name used to locate the matrix file and passed on
            to the filter / annotation steps (e.g. "chr21").
        config: config dict; the keys read here are "mawk_path", "PON_path",
            "bed_file", "PONcoverage" (with "minVAF", "minDepth", "minCov")
            and optionally "gc_split_path" and "genmap_split_path".
            The default dict is never mutated by this function.

    Returns:
        (cov_df, snp_df) on success.
        None if the PON matrix file for chrom does not exist.
        The shell command string if running the pipeline raised an exception.

    Side effects:
        If the genmap folder is available, the SNP data is re-saved
        as <SNP_file>.gz and the uncompressed SNP file is removed.
    """

    # PARAMS
    # unwrap mawk tools
    def mawk(tool):
        return os.path.join(config["mawk_path"], f"{tool}.mawk")

    pon_path = config['PON_path']
    c = config["PONcoverage"]

    # ####BUILD COMMAND #########
    # ### READ PONMATRIX
    matrix_file = os.path.join(pon_path, f"matrix/{chrom}.pon.gz")
    if not os.path.isfile(matrix_file):
        show_output(f"PON matrix file {matrix_file} not found! Exiting.", color="warning")
        return
    read_cmd = f"gunzip < {matrix_file}"

    # ### FILTERBED
    filter_cmd = f"{mawk('filterBed')} {config['bed_file']} -x -c {chrom}"

    # ##### PON2CNV
    SNP_file = os.path.join(pon_path, f"snp/{chrom}.snp")

    cnv_cmd = f"{mawk('PON2CNV')} -x -o {SNP_file} -v {c['minVAF']} -d {c['minDepth']} -c {c['minCov']}"

    # combine
    cmd = f"{read_cmd} | {filter_cmd} | {cnv_cmd}"

    try:
        cov_df = cmd2df(cmd, show=True, multi=False)
    except Exception as e:
        show_output(f"There was an error using shell command <<{e}>>", color="warning")
        # hand the failing command back so it can be inspected / rerun manually
        return cmd

    # add GC
    # .get() keeps gc_path bound (None) when the key is missing, so the
    # warning below can never raise a NameError
    gc_path = config.get("gc_split_path")
    if gc_path and os.path.isdir(gc_path):
        cov_df = addGCratio(cov_df, chrom=chrom, gc_path=gc_path)
    else:
        show_output(f"Could not find GC path {gc_path}", color="warning")

    # reload snp_df from the file written by PON2CNV.mawk
    # (done unconditionally so that snp_df is always defined for the return)
    show_output(f"Reloading PONSNP data from {SNP_file}")
    snp_df = pd.read_csv(SNP_file, sep="\t")

    # add genmap data to both cov and snp data
    genmap_path = config.get("genmap_split_path")
    if genmap_path and os.path.isdir(genmap_path):
        # annotate for the requested chromosome (was hard-coded to "chr7")
        cov_df, snp_df = addGenmap(cov_df, snp_df, chrom=chrom, genmap_path=genmap_path)
        # resave snp_df compressed and drop the uncompressed intermediate
        show_output(f"Resaving annotated heteroSNP data to {SNP_file}.gz")
        snp_df.to_csv(f"{SNP_file}.gz", index=False, sep="\t", compression="gzip")
        os.remove(SNP_file)
    else:
        show_output(f"Could not find genmap path {genmap_path}", color="warning")
    return cov_df, snp_df

In [None]:
# call PON2CNV for chr21 and show the coverage frame as the cell output
cov_df, snp_df = PON2CNV(chrom="chr21", config=CNVconfig)
cov_df

In [None]:
# show the SNP frame returned by the PON2CNV call in the previous cell
snp_df

### run all chroms

In [6]:
# chr1 .. chr22 plus chrX
chrom_list = [f"chr{c + 1}" for c in range(22)] + ['chrX']
failed_chroms = []
for chrom in chrom_list:
    show_output(f"PON2CNV for {chrom}")
    result = PON2CNV(chrom=chrom, config=CNVconfig)
    # Only a (cov_df, snp_df) tuple signals success; anything else
    # (e.g. None or a command string) cannot be unpacked and is skipped
    # instead of aborting the whole run.
    if not isinstance(result, tuple):
        show_output(f"PON2CNV did not return data for {chrom} - skipping", color="warning")
        failed_chroms.append(chrom)
        continue
    cov_df, _snp_df = result
    PONcov_file = os.path.join(CNVconfig['PON_path'], f"cov/{chrom}.cov.gz")
    show_output(f"Writing PON coverage to {PONcov_file}")
    cov_df.to_csv(PONcov_file, sep="\t", index=False, compression="gzip")
if failed_chroms:
    show_output(f"No PON coverage written for: {', '.join(failed_chroms)}", color="warning")
show_output("FINISHED", color="success")

[1;90;1mPON2CNV for chr1[0m
[1m$ gunzip < /Users/mahtin/Dropbox/Icke/Work/static/PON/HAEv7_hg38_NovaSeq/matrix/chr1.pon.gz | ../shell/filterBed.mawk /Users/mahtin/Dropbox/Icke/Work/static/bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed -x -c chr1 | ../shell/PON2CNV.mawk -x -o /Users/mahtin/Dropbox/Icke/Work/static/PON/HAEv7_hg38_NovaSeq/snp/chr1.snp -v 0.25 -d 50 -c 0[0m
[1;90;1mLoading GC data for chr1 from /Users/mahtin/Dropbox/Icke/Work/static/genome/gatk/hg38/split/chr1.gc100-10.gz[0m
[1;90;1mReloading PONSNP data from /Users/mahtin/Dropbox/Icke/Work/static/PON/HAEv7_hg38_NovaSeq/snp/chr1.snp[0m
[1;90;1mLoading mappability data for chr1 from /Users/mahtin/Dropbox/Icke/Work/static/annotation/genmap/hg38/split/hg38_genmap.HAEv7.chr1.txt.gz[0m
[1;90;1mResaving annotated heteroSNP data to /Users/mahtin/Dropbox/Icke/Work/static/PON/HAEv7_hg38_NovaSeq/snp/chr1.snp.gz[0m
[1;90;1mWriting PON coverage to /Users/mahtin/Dropbox/Icke/Work/static/PON/HAEv7_hg38_NovaSeq/cov/chr1.

[1;90;1mLoading GC data for chr9 from /Users/mahtin/Dropbox/Icke/Work/static/genome/gatk/hg38/split/chr9.gc100-10.gz[0m
[1;90;1mReloading PONSNP data from /Users/mahtin/Dropbox/Icke/Work/static/PON/HAEv7_hg38_NovaSeq/snp/chr9.snp[0m
[1;90;1mLoading mappability data for chr9 from /Users/mahtin/Dropbox/Icke/Work/static/annotation/genmap/hg38/split/hg38_genmap.HAEv7.chr9.txt.gz[0m
[1;90;1mResaving annotated heteroSNP data to /Users/mahtin/Dropbox/Icke/Work/static/PON/HAEv7_hg38_NovaSeq/snp/chr9.snp.gz[0m
[1;90;1mWriting PON coverage to /Users/mahtin/Dropbox/Icke/Work/static/PON/HAEv7_hg38_NovaSeq/cov/chr9.cov.gz[0m
[1;90;1mPON2CNV for chr10[0m
[1m$ gunzip < /Users/mahtin/Dropbox/Icke/Work/static/PON/HAEv7_hg38_NovaSeq/matrix/chr10.pon.gz | ../shell/filterBed.mawk /Users/mahtin/Dropbox/Icke/Work/static/bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed -x -c chr10 | ../shell/PON2CNV.mawk -x -o /Users/mahtin/Dropbox/Icke/Work/static/PON/HAEv7_hg38_NovaSeq/snp/chr10.snp -v 0.25 

[1;90;1mReloading PONSNP data from /Users/mahtin/Dropbox/Icke/Work/static/PON/HAEv7_hg38_NovaSeq/snp/chr17.snp[0m
[1;90;1mLoading mappability data for chr17 from /Users/mahtin/Dropbox/Icke/Work/static/annotation/genmap/hg38/split/hg38_genmap.HAEv7.chr17.txt.gz[0m
[1;90;1mResaving annotated heteroSNP data to /Users/mahtin/Dropbox/Icke/Work/static/PON/HAEv7_hg38_NovaSeq/snp/chr17.snp.gz[0m
[1;90;1mWriting PON coverage to /Users/mahtin/Dropbox/Icke/Work/static/PON/HAEv7_hg38_NovaSeq/cov/chr17.cov.gz[0m
[1;90;1mPON2CNV for chr18[0m
[1m$ gunzip < /Users/mahtin/Dropbox/Icke/Work/static/PON/HAEv7_hg38_NovaSeq/matrix/chr18.pon.gz | ../shell/filterBed.mawk /Users/mahtin/Dropbox/Icke/Work/static/bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed -x -c chr18 | ../shell/PON2CNV.mawk -x -o /Users/mahtin/Dropbox/Icke/Work/static/PON/HAEv7_hg38_NovaSeq/snp/chr18.snp -v 0.25 -d 50 -c 0[0m
[1;90;1mLoading GC data for chr18 from /Users/mahtin/Dropbox/Icke/Work/static/genome/gatk/hg38/split/