In [None]:
# get the code
import os
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# use seaborn plotting defaults
import seaborn as sns; sns.set()
sys.path.append('../code')

# import package functions
from script_utils_CNV import get_CNVconfig, show_output
from rollingCNV import interpolate, one_col_rolling, llh, rolling_data
from rollingCov import rolling_coverage
from combineCNV import filter_snp
from plot import plot_cov, plot_snp

# TODO: find the chained assignment that makes pandas emit its chained-assignment
# warning; until it is located the warning is switched off for the whole notebook
pd.set_option('mode.chained_assignment', None)

# HOME
# Resolve the current user's home directory instead of hard-coding one
# machine-specific absolute path (previously two assignments, the first of which
# was dead because it was overwritten on the next line).
home = os.path.expanduser("~")


# standard paths (all located below the user's Dropbox folder)
static = os.path.join(home, "Dropbox/Icke/Work/static")
tooldata = os.path.join(home, "Dropbox/Icke/Work/somVar/tooldata")
testdata = os.path.join(home, "Dropbox/Icke/Work/somVar/testdata")
PON_path = os.path.join(static, "PON/HAEv7_hg38_NovaSeq")


# CNV-specific input/output folders
cnvdata = os.path.join(tooldata, "myCNVdata")
output_path = os.path.join(cnvdata, "output")
plot_path = os.path.join(cnvdata, "plot")
fig_path = os.path.join(cnvdata, "figures")

### get the config
+ use the get_CNVconfig util function to update the general configs with the appropriate paths

In [None]:
# local path settings handed to get_CNVconfig as local_config
path_config = {
    "mawk_path": "../shell",
    # path containing rawcov.gz files for this sample
    "cov_path": os.path.join(output_path, "pile2CNV"),
    # path containing snp files for this sample
    "snp_path": os.path.join(output_path, "pile2CNV"),
    "bed_file": os.path.join(static, "bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed"),
    "genome_split_path": os.path.join(static, "genome/gatk/hg38/split"),
    "gc_split_path": os.path.join(static, "genome/gatk/hg38/split"),
    "genmap_split_path": os.path.join(static, "annotation/genmap/hg38/split"),
    "PON_path": PON_path,
}
CNVconfig = get_CNVconfig("../config/config_CNV.yaml", local_config=path_config)

### load cnv data

In [None]:
# sample identifier; also used to build the input file names below
sample = "03_A-B"

# SNP-level CNV table
cnv_df = pd.read_csv(
    os.path.join(output_path, f"CNV/{sample}.cnv.snp.gz"),
    sep="\t",
    compression="gzip",
)
# full CNV table
cnv_full_df = pd.read_csv(
    os.path.join(output_path, f"CNV/{sample}.cnv.full.gz"),
    sep="\t",
    compression="gzip",
)

In [None]:
# inspect the table that was read from CNV/<sample>.cnv.snp.gz
cnv_df

In [None]:
def make_SNP_plot(sample, cnv_df, min_vaf=0.1, max_diff=0.25):
    '''
    Scatter VAF1 against VAF2 and compute the offRate of the sample pair.

    Args:
        sample:   label placed in front of the figure title
        cnv_df:   DataFrame containing the numeric columns VAF1 and VAF2
                  (every column whose name starts with "VAF" enters the filter)
        min_vaf:  a SNP is only counted if at least one VAF* column exceeds this value
        max_diff: a counted SNP is "off" if |VAF1 - VAF2| exceeds this value

    Returns:
        (fig, off_ratio)
        fig is the matplotlib figure, off_ratio the percentage of counted SNPs
        that are "off". off_ratio is NaN if no SNP passes the min_vaf filter.
    '''

    df = cnv_df.loc[:, [c for c in cnv_df.columns if c.startswith("VAF")]]
    fig, ax = plt.subplots(figsize=(10, 10))
    _ = ax.scatter(df['VAF1'], df['VAF2'], s=0.25, alpha=0.4)
    _ = ax.set_xlabel("NVAF", fontsize=20)
    _ = ax.set_ylabel("TVAF", fontsize=20)

    # calculate offRate
    # only SNPs with a detectable VAF in at least one column are counted
    df0 = df[(df > min_vaf).any(axis=1)]
    n = len(df0.index)
    # pandas' own .abs() keeps the block independent of a numpy import
    df1 = df0[(df0["VAF1"] - df0["VAF2"]).abs() > max_diff]
    m = len(df1.index)
    # an empty selection has no defined rate; report NaN instead of dividing by zero
    off_ratio = m / n * 100 if n else float("nan")
    _ = ax.set_title(
        f"{sample} |  Tumor vs Normal - offRate {round(off_ratio, 1)}", fontsize=30
    )
    return fig, off_ratio

In [None]:
# draw the VAF1-vs-VAF2 scatter for the loaded sample; the returned
# (fig, off_ratio) tuple is not used any further
_ = make_SNP_plot(sample, cnv_df)

In [None]:
# display cnv_df once more before it is exported
cnv_df

In [None]:
def write_ASCAT(cnv_df, sample="", outpath=""):
    '''
    Write the four ASCAT input tables of one sample pair.

    for a sample XX_A-B it writes into outpath/XX/:
    XX_baf_normal.tsv    (first column starting with "VAF")
    XX_baf_tumor.tsv     (second column starting with "VAF")
    XX_logr_normal.tsv   (first column matching log2ratio<N>_mean)
    XX_logr_tumor.tsv    (second column matching log2ratio<N>_mean)

    Every file is tab-separated and holds the DataFrame index followed by the
    columns chrs, pos and XX. The "chr" prefix is removed from the chromosome names.
    outpath/XX/ is created if it does not exist yet.

    Args:
        cnv_df:  DataFrame with the columns Chr, Pos, at least two VAF* columns and
                 at least two log2ratio<N>_mean columns; it is NOT modified
        sample:  sample comes in shape sample_tumor-normal
        outpath: base folder of the ASCAT output

    Returns:
        a copy of cnv_df with the "chr" prefix stripped from the Chr column

    Raises:
        IndexError: if sample does not have the shape XX_A-B or fewer than two
                    VAF / log2ratio columns are found
    '''
    s = sample.split("_")
    sample_name = s[0]
    # tumor/normal names do not enter the file names; they are still parsed so that
    # a malformed sample string fails here and not after files have been written
    pair = s[1].split("-")
    tumor_name, normal_name = pair[0], pair[1]

    base_file = os.path.join(outpath, f"{sample_name}/{sample_name}")
    # to_csv does not create missing folders
    out_dir = os.path.dirname(base_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # work on a new frame so that the caller's DataFrame keeps its "chr" prefix
    cnv_df = cnv_df.assign(Chr=cnv_df['Chr'].str.replace("chr", "", regex=False))

    # VAF
    vaf_cols = [col for col in cnv_df.columns if col.startswith("VAF")]

    # log2ratio
    log2_pat = re.compile(r"log2ratio[0-9]+_mean$")
    log_cols = [col for col in cnv_df.columns if re.match(log2_pat, col)]

    # (data column, file suffix); the first matching column is treated as normal,
    # the second one as tumor
    outputs = [
        (vaf_cols[0], "baf_normal"),
        (vaf_cols[1], "baf_tumor"),
        (log_cols[0], "logr_normal"),
        (log_cols[1], "logr_tumor"),
    ]
    for data_col, suffix in outputs:
        # the data column is named after the sample in all four files
        col_names = {'Chr': "chrs", 'Pos': "pos", data_col: sample_name}
        (
            cnv_df.loc[:, list(col_names)]
            .rename(col_names, axis=1)
            .to_csv(f"{base_file}_{suffix}.tsv", sep="\t")
        )

    show_output(f"ASCAT output written to {base_file}_[baf/logr]_[tumor|normal].tsv", color="success")
    return cnv_df

In [None]:
# export the ASCAT input tables; pass the `sample` variable instead of repeating the
# literal so the file names cannot drift from the data that was loaded into cnv_df
_ = write_ASCAT(cnv_df, sample=sample, outpath=os.path.join(output_path, "CNV/ASCAT"))

In [None]:

snp_df = pd.read_csv(
    os.path.join(output_path, f"snp/{sample}.snp.gz"),
    sep="\t",
    compression="gzip",
)

# figure settings shared by all plot_snp calls
fig_params = {
    "figsize": (32, 3),
    "colormap": 'coolwarm_r',
    "color_chroms": True,
    "ylim": (0, 1),
    "label_size": 13,
}

# single scatter track showing the VAF2 column
vaf = {
    "title": 'VAF',
    "plot_type": 'scatter',  # ['line', 'scatter']
    "data": 'VAF2',
    "plot_args": {
        "s": 2,
        "color": 'black',
        "cmap": 'viridis',
        "alpha": 1,
    },
}

chroms = ['chr3', 'chr4', 'chr5', 'chr6','chr7', 'chr9', 'chr12', 'chr17']
r1 = 'chr17:3Mb-9Mb'

# regions of interest
r20 = 'chr20:20.7Mb-34.5Mb'
r7 = 'chr7:95.8Mb-111Mb'
r17 = 'chr17:18.2Mb-25Mb'

# plot_snp returns (fig, ax, df, chrom_df)
# whole data first, then one plot per region; the per-region df is kept
fig, _, _, _ = plot_snp(
    snp_df, plots=[vaf], chroms="all", region='', **fig_params
)
fig, _, df20, _ = plot_snp(
    snp_df, plots=[vaf], chroms="all", region=r20, **fig_params
)
fig, _, df7, _ = plot_snp(
    snp_df, plots=[vaf], chroms="all", region=r7, **fig_params
)
fig, _, df17, _ = plot_snp(
    snp_df, plots=[vaf], chroms="all", region=r17, **fig_params
)

# run the code

In [None]:
#### load coverage data
from rollingSNP import rolling_snp, remergeCNV

roll_cov_df = pd.read_csv(
    os.path.join(output_path, f"cov/{sample}.roll.cov.gz"),
    sep="\t",
    compression="gzip",
)
# combine the SNP table with the rolling coverage table
rsnp_df = rolling_snp(snp_df, roll_cov_df, config=CNVconfig)

# visualize
# whole data restricted to rows with SNPdensity below 0.01 ...
fig, _, _, _ = plot_snp(
    rsnp_df.query('SNPdensity<0.01'), plots=[vaf], chroms="all", region='', **fig_params
)
# ... followed by the three regions of interest on the unfiltered table
fig, _, _, _ = plot_snp(
    rsnp_df, plots=[vaf], chroms="all", region=r20, **fig_params
)
fig, _, _, _ = plot_snp(
    rsnp_df, plots=[vaf], chroms="all", region=r7, **fig_params
)
fig, _, _, _ = plot_snp(
    rsnp_df, plots=[vaf], chroms="all", region=r17, **fig_params
)

# NOTE: this replaces the cnv_df that was loaded from file further up
cnv_df = remergeCNV(rsnp_df, roll_cov_df)