# Combining per-chromosome PON coverage and SNP data into combined data
+ mappability and GCratio are included for GC normalization and possible filtering
+ *mappability and GC will not be saved in the output files*

In [None]:
# get the code
import os
import sys

import numpy as np
import pandas as pd

sys.path.append('../code')

# import package functions
from script_utils_CNV import get_CNVconfig, show_output, cmd2df
from CNV_raw import addGCratio, addGenmap, PON2CNV

# HOME (the other workstation uses '/Users/mahtin')
home = '/Users/martinscience'

# standard paths, all located below the home folder
static, tooldata, testdata = (
    os.path.join(home, sub_path)
    for sub_path in (
        "Dropbox/Icke/Work/static",
        "Dropbox/Icke/Work/somVar/tooldata",
        "Dropbox/Icke/Work/somVar/testdata",
    )
)
PON_path = os.path.join(static, "PON/HAEv7_hg38_NovaSeq")

# CNV working folder with the input data (cnv) and the output folder
cnvdata = os.path.join(tooldata, "myCNVdata")
cnv_path, output_path = (
    os.path.join(cnvdata, leaf) for leaf in ("cnv", "output")
)

### get the config

In [None]:
# machine-specific settings that are handed to get_CNVconfig as local_config
_local_config = {
    "mawk_path": "../shell",
    "bed_file": os.path.join(static, "bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed"),
    "gc_split_path": os.path.join(static, "genome/gatk/hg38/split"),
    "PON_path": PON_path,
    "genmap_split_path": os.path.join(static, "annotation/genmap/hg38/split"),
}
CNVconfig = get_CNVconfig("../config/config_CNV.yaml", local_config=_local_config)
# display the resulting config
CNVconfig

## Run the code

In [None]:
# use the implementations shipped in combineCNV
# (the step-by-step cells below define functions with the same names)
from combineCNV import make_PON_coverage, make_PON_snp
# save=True is passed explicitly to both calls
cov_df, filter_df = make_PON_coverage(config=CNVconfig, save=True)
snp_df = make_PON_snp(config=CNVconfig, save=True)

## Step by step

### PON coverage
+ combine chroms
+ normalize GCratio
+ adjust for male chrX coverage
+ filter outliers

In [None]:
def combine_PON_coverage(config=None):
    '''
    combine the per-chromosome PON coverage files into one dataframe

    For chr1..chr22 and chrX the file <PON_path>/cov/<chrom>.cov.gz
    (gzipped, tab-separated) is loaded; missing files are reported and skipped.

    config: mapping that provides the key 'PON_path' (path to the PON folder)
    returns: the concatenated dataframe with a fresh index; the column 'Chr'
             is categorical with the categories in chromosomal order
    raises: KeyError if 'PON_path' is not configured
            ValueError if not a single coverage file could be loaded
    '''

    config = {} if config is None else config
    # paths
    pon_path = config['PON_path']

    chrom_list = [f"chr{c}" for c in range(1, 23)] + ['chrX']
    dfs = []
    for chrom in chrom_list:
        PON_cov_file = os.path.join(pon_path, f"cov/{chrom}.cov.gz")
        if not os.path.isfile(PON_cov_file):
            show_output(f"Could not find PON coverage file {PON_cov_file}", color="warning")
            continue
        show_output(f"Loading coverage file {PON_cov_file}")
        dfs.append(pd.read_csv(PON_cov_file, sep="\t", compression="gzip"))

    # pd.concat would fail with a generic message on an empty list
    if not dfs:
        raise ValueError(f"No PON coverage files found in {os.path.join(pon_path, 'cov')}")
    cov_df = pd.concat(dfs, ignore_index=True)
    # make chrom categorical
    # (plain column assignment: df.loc[:, col] = ... writes into the existing
    #  column and keeps its old dtype in pandas >= 2.0)
    cov_df["Chr"] = pd.Categorical(cov_df['Chr'], chrom_list)
    return cov_df

In [None]:
cov_df = combine_PON_coverage(config=CNVconfig)
# look at general stats per chromosome:
# mean of Cov1 and Cov2 plus count, min and max of Pos
cov_df.groupby("Chr").agg({"Cov1": "mean", "Cov2": "mean", "Pos":['count', 'min', 'max']}).sort_values("Chr")

### B) normalize GC ratio

###  Visualize GC bias
+ distribution of mean

In [None]:
def make_GC_plot(cov_df, sample="", agg="mean", max_plots=99):
    '''
    create GC plot for the coverages

    Every coverage column (name starting with "Cov") is aggregated per GCratio
    and drawn as one line. Only rows with map50_0, map30_0 and map75_1 all
    above 0.5 are used.

    cov_df: dataframe with the columns GCratio, map30_0, map50_0, map75_1
            and the coverage columns
    sample: optional sample name; if given it is shown in the title
    agg: aggregation that is applied per GCratio (handed to DataFrame.agg)
    max_plots: maximum number of coverage columns that are plotted
    returns: the tuple (fig, ax)
    '''
    cov_cols = [col for col in cov_df.columns if col.startswith("Cov")][:max_plots]
    # create the agg dictionary
    cov_agg = {col: agg for col in cov_cols}
    # one combined mask on the frame itself instead of three chained .loc calls
    # whose masks had to be re-aligned to the already filtered frame
    mappable = (
        (cov_df["map50_0"] > 0.5)
        & (cov_df["map30_0"] > 0.5)
        & (cov_df["map75_1"] > 0.5)
    )
    # make the agg
    df = cov_df.loc[mappable, :].groupby("GCratio").agg(cov_agg)
    fig, ax = plt.subplots(figsize=(10, 10))
    for col in cov_cols:
        ax.plot(df.index, df[col], alpha=0.4)
    ax.set_xlabel("GCratio", fontsize=14)
    ax.set_ylabel("Coverage", fontsize=14)
    if sample:
        ax.set_title(f"Sample {sample} | GCratio vs coverage", fontsize=20)
    return fig, ax

In [None]:
# NOTE: this import replaces the make_GC_plot defined in the cell above
# with the implementation from the plot module
from plot import make_GC_plot
fig, _ = make_GC_plot(cov_df)
# NOTE(review): the folder <cnvdata>/figures is not created in this notebook - confirm it exists
fig.savefig(os.path.join(cnvdata, "figures/GC_PON.jpeg"))

### visualize GC ratio distribution

In [None]:
# NOTE(review): normGC_cov_df is only assigned in the "visualize normalization"
# cell further down - that cell has to be run before this one
# agg="count" aggregates with count instead of the default mean
fig, _ = make_GC_plot(normGC_cov_df, agg="count")
fig.savefig(os.path.join(cnvdata, "figures/GC_PON_count.jpeg"))

### normalize the coverage to 100

In [None]:
def normalize_GC_col(cov_df, col):
    '''
    scale one coverage column so that its mean per GCratio becomes 100

    The per-GCratio means are computed without the chrX rows (male genomes
    would otherwise shift the mean); the resulting factor is applied to all rows.
    Rows whose GCratio does not occur outside of chrX get no factor and are
    dropped by the (inner) merge.

    cov_df: dataframe with the columns Chr, GCratio and <col>
    col: name of the coverage column to normalize
    returns: a new dataframe, the input is left untouched
    '''
    # mean coverage of the column for every GCratio, chrX excluded
    autosomal = cov_df.query('Chr != "chrX"')
    gc_means = autosomal.groupby("GCratio").agg({col: 'mean'})

    # factor that lifts the respective mean to the arbitrary norm coverage 100
    norm_df = (100 / gc_means).reset_index().rename(columns={col: "factor"})

    # attach the factor to every position, apply it and remove it again
    scaled = cov_df.merge(norm_df)
    scaled[col] = scaled[col] * scaled['factor']
    return scaled.drop("factor", axis=1)


def normalize_GC(cov_df):
    '''
    GC-normalize all coverage columns of a dataframe

    normalize_GC_col is applied to every column whose name starts with "Cov".

    cov_df: dataframe with the columns Chr, Pos, GCratio and the coverage columns
    returns: a dataframe sorted by Chr and Pos with a consecutive index
    '''
    cov_cols = [col for col in cov_df.columns if col.startswith("Cov")]
    for col in cov_cols:
        show_output(f"Normalizing GC ratio for {col}.")
        cov_df = normalize_GC_col(cov_df, col)
    # sort first and renumber afterwards, otherwise the index stays shuffled
    cov_df = cov_df.sort_values(['Chr', 'Pos']).reset_index(drop=True)
    return cov_df


def normalize(cov_df):
    '''
    normalize general coverage to 100 (no GC normalization)

    Every column whose name starts with "Cov" is divided by its own mean
    and multiplied by 100.

    cov_df: dataframe with the coverage columns
    returns: the same dataframe object - the columns are replaced in place
    '''
    cov_cols = [col for col in cov_df.columns if col.startswith("Cov")]
    for col in cov_cols:
        # this function does not touch the GC ratio, the message says so now
        show_output(f"Normalizing coverage for {col}.")
        # replace the column instead of writing into it with .loc[:, col],
        # so that integer coverage columns can become float
        cov_df[col] = cov_df[col] / cov_df[col].mean() * 100
    return cov_df

### visualize normalization

In [None]:
# normalize_GC builds a new dataframe (it merges), so it has to run first:
# normalize changes the columns of cov_df itself and returns that same object
normGC_cov_df = normalize_GC(cov_df)
norm_cov_df = normalize(cov_df)
# only the name cov_df is removed, norm_cov_df still refers to the data
del cov_df
fig, _ = make_GC_plot(normGC_cov_df)
fig.savefig(os.path.join(cnvdata, "figures/GC_PON_GCnorm.jpeg"))
fig, _ = make_GC_plot(norm_cov_df)
fig.savefig(os.path.join(cnvdata, "figures/GC_PON_norm.jpeg"))

### raise X-coverage for male samples
+ a consensus X coverage is required

In [None]:
# cov_agg only exists inside the functions, so it is built here:
# one "mean" aggregation per coverage column
cov_agg = {col: "mean" for col in normGC_cov_df.columns if col.startswith("Cov")}
# mean coverage of every sample on chrX
normGC_cov_df.groupby("Chr").agg(cov_agg).loc["chrX"]

In [None]:
def amazonize(*cov_dfs, threshold=75):
    '''
    detect male samples via below-threshold X-chrom coverage
    coverage on chrX is doubled in these samples

    A sample (coverage column starting with "Cov") counts as male if its mean
    coverage over the chrX rows is below threshold.

    cov_dfs: one or more dataframes with the column Chr and the coverage columns
    threshold: mean chrX coverage below which a sample is treated as male
    returns: list with one dataframe per input, in the same order
             (the input dataframes themselves are modified)
    '''

    amazon_dfs = []
    for cov_df in cov_dfs:
        # get coverage cols
        cov_cols = [col for col in cov_df.columns if col.startswith("Cov")]
        on_chrX = cov_df['Chr'] == "chrX"
        # mean chrX coverage of every sample
        X_coverage = cov_df.loc[on_chrX, cov_cols].mean()
        # filter out the male samples
        male_cols = [col for col in cov_cols if X_coverage[col] < threshold]
        # adjust the coverage for male samples (nothing to assign without males)
        if male_cols:
            cov_df.loc[on_chrX, male_cols] = cov_df.loc[on_chrX, male_cols] * 2
        amazon_dfs.append(cov_df)
    return amazon_dfs

In [None]:
# amazonize returns a list with one dataframe per input -> unpack the single result
(normX_df,) = amazonize(norm_cov_df)
del norm_cov_df
# cov_agg only exists inside the functions, so it is built here
cov_agg = {col: "mean" for col in normX_df.columns if col.startswith("Cov")}
# mean coverage of every sample on chrX after the adjustment
normX_df.groupby("Chr").agg(cov_agg).loc["chrX"]

In [None]:
# display the result of the amazonize cell above
normX_df

### compute the mean of all the coverages

In [None]:
def compute_stats(*dfs):
    '''
    add the row-wise PON statistics to every dataframe

    Existing columns starting with "PONcov" are discarded, then PONcov_mean,
    PONcov_median and PONcov_std are computed over all columns starting with
    "Cov" and appended in that order.

    dfs: one or more dataframes containing coverage columns
    returns: list with one new dataframe per input (inputs are not changed)
    '''

    def with_stats(df):
        # get rid of stats from an earlier run
        stale_cols = [name for name in df.columns if name.startswith("PONcov")]
        result = df.drop(stale_cols, axis=1)
        # the statistics are taken over the coverage columns only
        coverage = result[[name for name in result.columns if name.startswith("Cov")]]
        result['PONcov_mean'] = coverage.mean(axis=1)
        result['PONcov_median'] = coverage.median(axis=1)
        result['PONcov_std'] = coverage.std(axis=1)
        return result

    return [with_stats(df) for df in dfs]

In [None]:
# compute_stats returns a list with one dataframe per input -> unpack the single result
(mean_df,) = compute_stats(normX_df)
del normX_df
mean_df

### filter the coverage
+ for filtering, all outliers that stray from the local mean by more than max_mean_std are removed
+ other per-position filtering steps should be done afterwards, during the rolling-window step:
+ `filter_df = df.query('meanCov > @mincov and std < @max_mean_std')`


In [None]:
# a cell only displays its last expression, so max and mean are shown together
mean_df['PONcov_std'].agg(['max', 'mean'])

### remove outliers in order to reduce noise

In [None]:
def remove_outliers(*dfs, std_factor=2.5):
    '''
    blank out coverage values that lie too far away from the PON mean

    In every column starting with "Cov" a value is set to NaN if its distance
    to PONcov_mean, measured in units of PONcov_std, exceeds std_factor.
    PONcov_mean and PONcov_std themselves are not updated.

    dfs: one or more dataframes with coverage columns, PONcov_mean and PONcov_std
    std_factor: allowed distance from the mean in standard deviations
    returns: list of the very same dataframe objects, which are changed in place
    '''
    for df in dfs:
        sample_cols = [name for name in df.columns if name.startswith("Cov")]
        for name in sample_cols:
            # distance to the PON mean in units of the PON standard deviation
            distance = (df['PONcov_mean'] - df[name]).abs() / df['PONcov_std']
            df.loc[distance > std_factor, name] = np.nan
    return list(dfs)

In [None]:
# remove_outliers returns a list: it is unpacked into compute_stats,
# which returns a list as well -> unpack the single result
(filter_df,) = compute_stats(*remove_outliers(mean_df, std_factor=2))
del mean_df
filter_df

In [None]:
# a cell only displays its last expression, so max and mean are shown together
filter_df['PONcov_std'].agg(['max', 'mean'])

### the master function

In [None]:
def make_PON_coverage(
    config={
        "PONcoverage": {
            "stdFactor": 2.5  # only exonPositions straying within std_factor * std around meanCoverage are kept
        },
        "PON_path": ".",  # path to the PON folder
    },
    save=True,
):
    """
    Build the combined PON coverage tables and optionally save them.

    Steps: combine the per-chromosome coverage files, normalize the coverage
    (once with and once without GC normalization), double the chrX coverage
    of male samples, compute the PON statistics, then remove outliers and
    recompute the statistics for the filtered tables.

    config: mapping with the key "PON_path" and optionally
            config["PONcoverage"]["stdFactor"] (falls back to 2.5)
    save: if True, four gzipped tab-separated files are written to
          <PON_path>/CNV: pon.cov.full.gz, pon.cov.fullGC.gz,
          pon.cov.filter.gz and pon.cov.filterGC.gz
    returns: (cov_df, filter_df) - the unfiltered table with coverage and
             statistics columns, and the filtered table with the statistics
             only (both without GC normalization)
    """

    # paths
    pon_path = config["PON_path"]
    try:
        std_factor = config["PONcoverage"]["stdFactor"]
    except KeyError:
        # same value as the documented default
        std_factor = 2.5

    # load the sample coverages of all chromosomes
    cov_df = combine_PON_coverage(config=config)

    # make chrom categorical
    # (plain column assignment: df.loc[:, col] = ... keeps the old dtype in pandas >= 2.0)
    chrom_list = [f"chr{i}" for i in range(1, 23)] + ["chrX"]
    cov_df["Chr"] = pd.Categorical(cov_df["Chr"], chrom_list)

    # normalize_GC has to run first because normalize changes cov_df itself
    show_output("Normalizing coverage and removing GC dependencies for PON coverage.")
    covGC_df = normalize_GC(cov_df)
    cov_df = normalize(cov_df)

    show_output("Lifting X-coverages for male samples to XX coverage.")
    covGC_df, cov_df = amazonize(covGC_df, cov_df)

    show_output("Computing stats.")
    covGC_df, cov_df = compute_stats(covGC_df, cov_df)

    # adjust the output columns
    base_cols = ["Chr", "Pos", "ExonPos"]
    cov_cols = [col for col in cov_df.columns if col.startswith("Cov")]
    stat_cols = [col for col in cov_df.columns if col.startswith("PONcov")]

    # take the unfiltered tables now: remove_outliers changes its inputs,
    # selecting with a list of columns yields independent dataframes
    full_df = cov_df.loc[:, base_cols + cov_cols + stat_cols]
    fullGC_df = covGC_df.loc[:, base_cols + cov_cols + stat_cols]

    show_output("Remove outliers and recompute stats.")
    # the stats have to be computed again, otherwise the filtered tables
    # would carry the statistics of the unfiltered data
    filterGC_df, filter_df = compute_stats(
        *remove_outliers(covGC_df, cov_df, std_factor=std_factor)
    )
    filter_df = filter_df.loc[:, base_cols + stat_cols]
    filterGC_df = filterGC_df.loc[:, base_cols + stat_cols]

    # save dataframes
    if save:
        os.makedirs(os.path.join(pon_path, "CNV"), exist_ok=True)
        # the file names are built explicitly: str.replace on the whole path
        # would also change folder names containing "full" or "filter"
        outputs = (
            ("full", "combined", full_df),
            ("fullGC", "combined", fullGC_df),
            ("filter", "filtered", filter_df),
            ("filterGC", "filtered", filterGC_df),
        )
        for label, kind, out_df in outputs:
            PON_cov_file = os.path.join(pon_path, f"CNV/pon.cov.{label}.gz")
            show_output(f"Saving {kind} PON coverage file {PON_cov_file}.")
            out_df.to_csv(PON_cov_file, sep="\t", index=False, compression="gzip")

        show_output("Finished", color="success")

    return full_df, filter_df

In [None]:
# NOTE: save is not passed and defaults to True, so this call writes the PON coverage files
cov_df, filter_df = make_PON_coverage(config=CNVconfig)
# first ten rows of the filtered table
filter_df[:10]

### PON SNP
+ combine chroms

In [None]:
def make_PON_snp(config=None, save=True):
    """
    combine the PON SNP files for all chroms
    save to hardcoded place in PON_path

    For chr1..chr22 and chrX the file <PON_path>/snp/<chrom>.snp.gz
    (gzipped, tab-separated) is loaded; missing files are reported and skipped.
    The columns VAF and Depth are renamed to PONVAF and PONDepth.

    config: mapping that provides the key "PON_path" (path to the PON folder)
    save: if True, the result is written to <PON_path>/CNV/pon.snp.gz
    returns: the combined dataframe with a fresh index; the column "Chr" is
             categorical with the categories in chromosomal order
    raises: KeyError if "PON_path" is not configured
            ValueError if not a single SNP file could be loaded
    """

    config = {} if config is None else config
    # paths
    pon_path = config["PON_path"]

    chrom_list = [f"chr{c}" for c in range(1, 23)] + ["chrX"]
    dfs = []
    for chrom in chrom_list:
        PON_snp_file = os.path.join(pon_path, f"snp/{chrom}.snp.gz")
        if not os.path.isfile(PON_snp_file):
            # the message named a coverage file although a SNP file is missing
            show_output(f"Could not find PON SNP file {PON_snp_file}", color="warning")
            continue
        show_output(f"Loading PON SNP file {PON_snp_file}")
        dfs.append(pd.read_csv(PON_snp_file, sep="\t", compression="gzip"))

    # pd.concat would fail with a generic message on an empty list
    if not dfs:
        raise ValueError(f"No PON SNP files found in {os.path.join(pon_path, 'snp')}")
    snp_df = pd.concat(dfs, ignore_index=True).rename(
        columns={"VAF": "PONVAF", "Depth": "PONDepth"}
    )
    # make chrom categorical
    # (plain column assignment: df.loc[:, col] = ... keeps the old dtype in pandas >= 2.0)
    snp_df["Chr"] = pd.Categorical(snp_df["Chr"], chrom_list)

    # save file
    if save:
        os.makedirs(os.path.join(pon_path, "CNV"), exist_ok=True)
        PON_snp_file = os.path.join(pon_path, "CNV/pon.snp.gz")
        show_output(f"Saving combined PON SNP file {PON_snp_file}.")
        snp_df.to_csv(PON_snp_file, sep="\t", index=False, compression="gzip")
        show_output("Finished", color="success")
    return snp_df

In [None]:
# NOTE: save is not passed and defaults to True, so this call writes the combined SNP file
snp_df = make_PON_snp(config=CNVconfig)
snp_df