# Getting SNP and coverage data from BAM files or pileup files
## Python wrapper around samtools / mawk tools
+ bam --> mpileup --> clean --> filter --> pile2CNV
+ mpileup --> clean --> filter --> pile2CNV
+ cleanpileup --> filter --> pile2CNV

In [1]:
# get the code
import sys
import os
sys.path.append('../code')


# HOME
# machine-specific home directory (the other machine uses '/Users/mahtin')
home = '/Users/martinscience'

# standard paths
static = os.path.join(home, "Dropbox/Icke/Work/static")
cluster_path = os.path.join(home, "mount")
tooldata = os.path.join(home, "Dropbox/Icke/Work/somVar/tooldata")
testdata = os.path.join(home, "Dropbox/Icke/Work/somVar/testdata")

# data folder of the CNV tool and its output folder
cnvdata = os.path.join(tooldata, "myCNVdata")
output_path = os.path.join(cnvdata, "output")

# the paths to the input data
# NOTE: these used to be set by a chained assignment
# (`PONCOV_path = cnv_path = ...`) that silently overwrote cnv_path with the
# chromCov folder; the two paths are kept separate here.
cnv_path = os.path.join(cnvdata, "cnv")
PONCOV_path = os.path.join(cnvdata, "chromCov")

### load config and basic functions

In [53]:
from script_utils_CNV import show_output, cmd2df, get_CNVconfig

def get_rawCNV(
    normal_bam="",
    tumor_bam="",
    tumor_bams=None,
    TN_pileup_file="",    # mpileup from normalbam, tumorbam(s)
    clean_TN_pileup_file="",  # mpileup from normalbam, tumorbam(s) with cleanpileup.mawk already done
    pileup_is_clean=True,
    chrom="",
    config=None,
    SNP_output=""
):
    """
    Build and run the shell pipeline  [pileup -->] [clean -->] filter --> pile2CNV.

    The entry point into the pipeline is chosen from the given inputs,
    in this order of precedence:
      1. clean_TN_pileup_file: already cleaned pileup, only filter + pile2CNV are run
      2. TN_pileup_file:       raw pileup, cleanpileup.mawk is prepended
      3. normal_bam + tumor_bam / tumor_bams: samtools mpileup is prepended as well
    Pileup files ending in ".gz" are read via gunzip, all others via cat.

    Args:
        normal_bam: path to the normal bam (only used for entry point 3)
        tumor_bam: path to a single tumor bam (used if tumor_bams is empty)
        tumor_bams: list of tumor bam paths; takes precedence over tumor_bam
        TN_pileup_file: raw pileup file of normal and tumor(s)
        clean_TN_pileup_file: cleaned pileup file of normal and tumor(s)
        pileup_is_clean: currently unused, kept for backward compatibility
        chrom: chromosome name passed to filterBed (-c) and samtools (-r);
            also selects the split genome file <chrom>.fa
        config: config dict providing 'mawk_path', 'bed_file', 'hetSNP',
            'coverage' and (for entry point 3) 'pileup' and 'genome_split_path';
            if None, the notebook-level CNVconfig is used
        SNP_output: file path handed to pile2CNV.mawk via -o (required)

    Returns:
        whatever cmd2df returns for the assembled command (the notebook output
        suggests a coverage DataFrame -- TODO confirm), or None if a required
        input is missing or the command failed.
    """

    # resolve the default config at call time: binding CNVconfig in the
    # signature would require it to exist before this function is defined
    if config is None:
        config = CNVconfig

    # unwrap mawk tools
    def mawk(tool):
        return os.path.join(config["mawk_path"], f"{tool}.mawk")

    # shell snippet streaming a (possibly gzipped) pileup file to stdout
    def read_cmd(pileup_file):
        if os.path.splitext(pileup_file)[1] == ".gz":
            return f"gunzip < {pileup_file}"
        return f"cat {pileup_file}"

    if not SNP_output:
        show_output("Output file for heteroSNP is missing!", color="warning")
        return None

    sc = config['hetSNP']
    cc = config['coverage']

    ###### FILTERBED
    filter_cmd = f"{mawk('filterBed')} {config['bed_file']} -x -c {chrom}"
    ###### PILE2CNV
    cnv_cmd = f"{mawk('pile2CNV')} -x -o {SNP_output} -v {sc['normalVAF'][0]} -V {sc['normalVAF'][1]} -d {sc['minDepth']} -c {cc['minCov']}"

    # pipeline stages in execution order; upstream stages are added as needed
    if clean_TN_pileup_file:
        stages = [read_cmd(clean_TN_pileup_file), filter_cmd, cnv_cmd]
    else:
        # pileup is not clean yet --> add the cleanup cmd
        clean_cmd = f"{mawk('cleanpileup')} -d"
        if TN_pileup_file:
            source_cmd = read_cmd(TN_pileup_file)
        else:
            # pileup has to be done from the bam files
            if not (normal_bam or tumor_bam or tumor_bams):
                show_output("Neither pileup file nor bam files were provided!", color="warning")
                return None
            if tumor_bams:
                bams = " ".join([normal_bam, *tumor_bams])
            else:
                bams = f"{normal_bam} {tumor_bam}"
            # get params from config
            pc = config['pileup']
            split_genome = os.path.join(config['genome_split_path'], f"{chrom}.fa")
            source_cmd = f"samtools mpileup -f {split_genome} -l {config['bed_file']} -r {chrom} -q {pc['MAPQ']} -Q {pc['Q']} {bams}"
        stages = [source_cmd, clean_cmd, filter_cmd, cnv_cmd]

    cmd = " | ".join(stages)

    try:
        return cmd2df(cmd, show=True, multi=False)
    except Exception as err:
        # best effort: report the failure (with its cause) instead of raising
        show_output(f"There was an error using shell command: {err}", color="warning")
        return None

## A) get raw data

In [48]:
 CNVconfig = get_CNVconfig(
    "../config/config_CNV.yaml", 
    local_config=dict(
        mawk_path="../shell",
        bed_file=os.path.join(static, "bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed"),
        genome_split_path=os.path.join(static, "genome/gatk/hg38/split")
    ))

# chr1 of sample pair 02_A-B, starting from the cleaned (gzipped) pileup;
# the SNP output is named after the same sample pair and chromosome as the input
get_rawCNV(
    clean_TN_pileup_file=os.path.join(cnvdata, "cleanpileup/02_A-B.chr1.gz"),
    chrom="chr1",
    SNP_output=os.path.join(output_path, "pile2CNV/02_A-B.chr1.snp"),
    config=CNVconfig
)

[1m$ gunzip < /Users/martinscience/Dropbox/Icke/Work/somVar/tooldata/myCNVdata/cleanpileup/02_A-B.chr1.gz | ../shell/filterBed.mawk /Users/martinscience/Dropbox/Icke/Work/static/bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed -x -c chr1 | ../shell/pile2CNV.mawk -x -o /Users/martinscience/Dropbox/Icke/Work/somVar/tooldata/myCNVdata/output/pile2CNV/02_A-B.chr7.snp -v 0.2 -V 0.8 -d 15 -c 30[0m


Unnamed: 0,Chr,Start,ExonPos,Cov1,Cov2
0,chr1,16760,4630,26.08,35.48
1,chr1,16770,4640,30.39,42.44
2,chr1,16780,4650,34.31,48.84
3,chr1,16790,4660,37.49,54.29
4,chr1,16800,4670,39.99,58.71
...,...,...,...,...,...
578119,chr1,248918350,248906220,24037.60,28822.50
578120,chr1,248918360,248906230,24035.90,28821.00
578121,chr1,248918370,248906240,24034.30,28819.40
578122,chr1,248918380,248906250,24032.80,28817.60


In [49]:
# chr1 of sample pair 03_A-B, starting from the cleaned (gzipped) pileup
get_rawCNV(
    config=CNVconfig,
    chrom="chr1",
    clean_TN_pileup_file=os.path.join(cnvdata, "cleanpileup/03_A-B.chr1.gz"),
    SNP_output=os.path.join(output_path, "pile2CNV/03_A-B.chr1.snp"),
)

[1m$ gunzip < /Users/martinscience/Dropbox/Icke/Work/somVar/tooldata/myCNVdata/cleanpileup/03_A-B.chr1.gz | ../shell/filterBed.mawk /Users/martinscience/Dropbox/Icke/Work/static/bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed -x -c chr1 | ../shell/pile2CNV.mawk -x -o /Users/martinscience/Dropbox/Icke/Work/somVar/tooldata/myCNVdata/output/pile2CNV/03_A-B.chr1.snp -v 0.2 -V 0.8 -d 15 -c 30[0m


Unnamed: 0,Chr,Start,ExonPos,Cov1,Cov2
0,chr1,13410,1647,20.13,39.14
1,chr1,13420,1657,27.65,54.46
2,chr1,13430,1667,36.32,70.81
3,chr1,13440,1677,45.66,88.10
4,chr1,13450,1687,55.68,106.04
...,...,...,...,...,...
581995,chr1,248918350,248906587,239876.00,287575.00
581996,chr1,248918360,248906597,239862.00,287557.00
581997,chr1,248918370,248906607,239847.00,287537.00
581998,chr1,248918380,248906617,239830.00,287514.00


In [None]:
# chr7 of sample pair 03_A-B, starting from the bam files
# (no pileup file is given, so the pileup is generated first)
get_rawCNV(
    config=CNVconfig,
    chrom="chr7",
    normal_bam=os.path.join(testdata, "bam/03_B.chr7.bam"),
    tumor_bam=os.path.join(testdata, "bam/03_A.chr7.bam"),
    SNP_output=os.path.join(output_path, "pile2CNV/03_A-B.chr7.snp"),
)

[1m$ samtools mpileup -f /Users/martinscience/Dropbox/Icke/Work/static/genome/gatk/hg38/split/chr7.fa -l /Users/martinscience/Dropbox/Icke/Work/static/bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed -r chr7 -q 20 -Q 25 /Users/martinscience/Dropbox/Icke/Work/somVar/testdata/bam/03_B.chr7.bam /Users/martinscience/Dropbox/Icke/Work/somVar/testdata/bam/03_A.chr7.bam | ../shell/cleanpileup.mawk -d | ../shell/filterBed.mawk /Users/martinscience/Dropbox/Icke/Work/static/bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed -x -c chr7 | ../shell/pile2CNV.mawk -x -o /Users/martinscience/Dropbox/Icke/Work/somVar/tooldata/myCNVdata/output/pile2CNV/03_A-B.chr7.snp -v 0.2 -V 0.8 -d 15 -c 30[0m


## B) coverage: merge sample coverage with PON coverage

In [None]:
def combine_Covdata(sample, sample_cnv_path="", PON_cnv_path="", verbose=False, filtered=True):
    """
    Merge the per-chromosome coverage of one sample with the PON coverage.

    For chr1..chr22 and chrX, two gzipped tab-separated files are read:
        <sample_cnv_path>/<sample>.<chrom>.cov
        <PON_cnv_path>/<chrom>.<filtered|full>.csv.gz
    A chromosome is reported and skipped if either file is missing.
    Sample and PON data are outer-merged on Chr, Pos and ExonPos; the PON
    columns meanCov, medianCov and std are prefixed with "PON".

    Args:
        sample: sample name used as prefix of the coverage files
        sample_cnv_path: folder containing the sample coverage files
        PON_cnv_path: folder containing the PON coverage files
        verbose: print the files as they are read
        filtered: read the "filtered" PON files (True) or the "full" ones (False)

    Returns:
        DataFrame with columns Chr, Pos, FullExonPos, ExonPos, Coverage,
        PONmeanCov, PONmedianCov, PONstd and log2ratio. Coverage is normalized
        to a mean of 100 over all loaded chromosomes (missing coverage counts
        as 0); log2ratio is log2(Coverage / PONmeanCov) and NaN wherever
        either value is zero or missing.

    Raises:
        ValueError: if no chromosome could be loaded for the sample.
    """

    merge_cols = ['Chr', 'Pos', 'ExonPos']
    pon_cols = ['Chr', 'Pos', 'FullExonPos', 'ExonPos', 'meanCov', 'medianCov', 'std']
    out_cols = ['Chr', 'Pos', 'FullExonPos', 'ExonPos', 'Coverage', 'PONmeanCov', 'PONmedianCov', 'PONstd']
    full_or_filtered = "filtered" if filtered else "full"
    chrom_list = [f"chr{i}" for i in range(1, 23)] + ['chrX']

    cover_dfs = []
    for chrom in chrom_list:
        # reading sampleCoverage
        sample_cov_file = os.path.join(sample_cnv_path, f"{sample}.{chrom}.cov")
        if not os.path.isfile(sample_cov_file):
            print("No file", sample_cov_file)
            continue
        if verbose:
            print(f"Reading coverage from {chrom} of sample {sample} from {sample_cov_file}.")
        cov_df = pd.read_csv(sample_cov_file, sep='\t', compression="gzip")

        # reading PONcoverage
        pon_cov_file = os.path.join(PON_cnv_path, f"{chrom}.{full_or_filtered}.csv.gz")
        if not os.path.isfile(pon_cov_file):
            print("No file", pon_cov_file)
            continue
        if verbose:
            print(f"Reading PON coverage of {chrom} from file {pon_cov_file}.")
        pon_df = pd.read_csv(pon_cov_file, sep='\t', compression="gzip").loc[:, pon_cols]
        # column rename: prefix the PON statistics (everything after the position columns)
        pon_df = pon_df.rename(columns={col: f"PON{col}" for col in pon_df.columns[4:]})
        # merge sample with PON coverage
        sample_df = cov_df.merge(pon_df, on=merge_cols, how="outer").loc[:, out_cols]

        ##### recover missing FullExonPos (positions without PON data)
        # the offset FullExonPos - ExonPos is taken from the first row that
        # actually has a FullExonPos (row 0 may be one of the missing ones)
        has_full = sample_df['FullExonPos'].notna()
        if has_full.any():
            anchor = has_full.idxmax()
            offset = sample_df.at[anchor, 'FullExonPos'] - sample_df.at[anchor, 'ExonPos']
            sample_df.loc[~has_full, 'FullExonPos'] = sample_df.loc[~has_full, 'ExonPos'] + offset
        # plain column assignment: `.loc[:, col] = ...` would write in place
        # and keep the float dtype
        sample_df['FullExonPos'] = sample_df['FullExonPos'].astype(int)
        cover_dfs.append(sample_df)

    if not cover_dfs:
        raise ValueError(
            f"No coverage data found for sample {sample} in {sample_cnv_path} / {PON_cnv_path}"
        )

    # combine chrom data
    cover_df = pd.concat(cover_dfs)

    # normalize the coverage to a mean of 100 over ALL chromosomes
    cover_df['Coverage'] = cover_df['Coverage'].fillna(0)
    mean_cov = cover_df['Coverage'].mean()
    cover_df['Coverage'] = cover_df['Coverage'] / mean_cov * 100

    # loggable are the coverages, where log2ratio can be computed;
    # plain arrays are used because the index repeats across chromosomes
    coverage = cover_df['Coverage'].to_numpy(dtype=float)
    pon_mean = cover_df['PONmeanCov'].to_numpy(dtype=float)
    loggable = (coverage > 0) & (pon_mean > 0)
    # regions without sample or PON coverage stay NaN
    log2ratio = np.full(len(cover_df), np.nan)
    log2ratio[loggable] = np.log2(coverage[loggable] / pon_mean[loggable])
    cover_df['log2ratio'] = log2ratio
    return cover_df