# Combining individual coverage data into a mean-normalized coverage
+ mappability and GC ratio are included for GC normalization and possible filtering
+ mappability and GC ratio will not be saved in the output files

In [None]:
# get the code
import sys
import os

import numpy as np
import pandas as pd

# make the project code importable before loading the package functions
sys.path.append('../code')

# import package functions
from script_utils_CNV import get_CNVconfig, show_output, cmd2df
from CNV_raw import addGCratio, addGenmap, PON2CNV

# HOME
# two machine-specific home folders; the second assignment overrides the first,
# so only '/Users/martinscience' is in effect
home = '/Users/mahtin'
home = '/Users/martinscience'

# standard paths
# all paths are built below the Dropbox work folder of the active home
static = os.path.join(home, "Dropbox/Icke/Work/static")
tooldata = os.path.join(home, "Dropbox/Icke/Work/somVar/tooldata")
testdata = os.path.join(home,"Dropbox/Icke/Work/somVar/testdata")
# folder of the panel of normals (PON); also receives the saved pon.*.gz files
PON_path = os.path.join(static, "PON/HAEv7_hg38_NovaSeq")
 
# CNV working data and its output folder
cnvdata = os.path.join(tooldata, "myCNVdata")
output_path = os.path.join(cnvdata, "output")

# the path to the input data
cnv_path = os.path.join(cnvdata, "cnv")

### get the config

In [None]:
# build the config from the yaml file and the machine-specific settings passed as local_config
# (how get_CNVconfig combines the two is defined in script_utils_CNV)
CNVconfig = get_CNVconfig(
    "../config/config_CNV.yaml",
    local_config={
        "mawk_path": "../shell",
        "bed_file": os.path.join(static, "bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed"),
        "gc_split_path": os.path.join(static, "genome/gatk/hg38/split"),
        "PON_path": PON_path,
        "genmap_split_path": os.path.join(static, "annotation/genmap/hg38/split"),
    },
)
# display the resulting config
CNVconfig

## Run the code

In [None]:
# use the packaged implementation from combineCNV
# (a function of the same name is defined again further down in this notebook)
from combineCNV import make_PON_coverage
# returns two dataframes that are saved in the next cell
cov_df, filter_df = make_PON_coverage(config=CNVconfig)

In [None]:
# save
# both frames go into the PON folder as gzip-compressed, tab-separated files without the index column
filter_df.to_csv(os.path.join(PON_path, "pon.filter.gz"), sep="\t", index=False, compression="gzip")
cov_df.to_csv(os.path.join(PON_path, "pon.full.gz"), sep="\t", index=False, compression="gzip")

## Step by step

### get all the normal samples from the PON list into df for normalization and averaging

In [None]:
def combine_PON_coverage(config=None):
    '''
    combine the PON coverages for all chroms

    Reads <PON_path>/cov/<chrom>.cov.gz (gzip-compressed, tab-separated) for
    chr1..chr22 and chrX and concatenates every file that exists.
    Missing files are reported via show_output and skipped.

    Args:
        config: dict providing the key 'PON_path' (folder containing the cov subfolder)

    Returns:
        dataframe holding the rows of all loaded files with a fresh 0..n-1 index;
        the Chr column is categorical, ordered chr1..chr22, chrX

    Raises:
        KeyError: if config has no 'PON_path' entry
        ValueError: if none of the coverage files exists
    '''
    # avoid a mutable default argument that would be shared between calls
    config = {} if config is None else config

    # paths
    pon_path = config['PON_path']

    chrom_list = [f"chr{c + 1}" for c in range(22)] + ['chrX']
    dfs = []
    for chrom in chrom_list:
        PON_cov_file = os.path.join(pon_path, f"cov/{chrom}.cov.gz")
        if os.path.isfile(PON_cov_file):
            show_output(f"Loading coverage file {PON_cov_file}")
            dfs.append(pd.read_csv(PON_cov_file, sep="\t", compression="gzip"))
        else:
            show_output(f"Could not find PON coverage file {PON_cov_file}")

    # pd.concat raises an unspecific ValueError on an empty list -> name the actual problem
    if not dfs:
        raise ValueError(f"No PON coverage files found in {os.path.join(pon_path, 'cov')}")

    cov_df = pd.concat(dfs).reset_index(drop=True)
    # make chrom categorical
    # plain column assignment replaces the column including its dtype, whereas
    # .loc[:, "Chr"] = ... writes into the existing column and can keep the old dtype
    cov_df["Chr"] = pd.Categorical(cov_df['Chr'], chrom_list)
    return cov_df

In [None]:
# load and concatenate the per-chromosome PON coverage files
cov_df = combine_PON_coverage(config=CNVconfig)
# display the combined frame
cov_df

## B) normalize GC ratio

###  Visualize GC bias
+ distribution of mean

In [None]:
def make_GC_plot(cov_df, agg="mean", max_plots=99):
    '''
    create GC plot for the coverages

    Aggregates each coverage column per GCratio value and draws one line per column.
    Only rows with map50_0, map30_0 and map75_1 all above 0.5 are used.

    Args:
        cov_df: dataframe with the columns GCratio, map50_0, map30_0, map75_1
            and coverage columns whose names start with "Cov"
        agg: aggregation applied to each coverage column per GCratio value
        max_plots: maximum number of coverage columns that are plotted

    Returns:
        tuple (fig, ax) of the created matplotlib figure and axes
    '''
    cov_cols = [col for col in cov_df.columns if col.startswith("Cov")][:max_plots]
    # create the agg dictionary
    cov_agg = {col: agg for col in cov_cols}
    # one combined mask instead of three chained .loc calls:
    # the chained form applies masks built from the full frame to an already filtered frame
    # and therefore depends on index alignment (which breaks for non-unique index labels)
    well_mapped = (cov_df["map50_0"] > 0.5) & (cov_df["map30_0"] > 0.5) & (cov_df["map75_1"] > 0.5)
    # make the agg
    df = cov_df.loc[well_mapped, :].groupby("GCratio").agg(cov_agg)
    fig, ax = plt.subplots(figsize=(10, 10))
    for col in cov_cols:
        ax.plot(df.index, df[col], alpha=0.4)
    return fig, ax

In [None]:
# NOTE: this import rebinds make_GC_plot to the version from the plot module,
# replacing the function defined in the previous cell
from plot import make_GC_plot
fig, _ = make_GC_plot(cov_df)
# store the figure in the figures folder below cnvdata
fig.savefig(os.path.join(cnvdata, "figures/GCPON.jpeg"))

### check size of the data
+ the data seems to be substantially smaller than the tumor matrix!

In [None]:
# per chromosome: mean of the Cov1 and Cov2 columns plus count, min and max of Pos
cov_df.groupby("Chr").agg({"Cov1": "mean", "Cov2": "mean", "Pos":['count', 'min', 'max']}).sort_values("Chr")

### normalize the coverage to 100

In [None]:
def normalize_GC_col(cov_df, col):
    '''
    GC-normalize a single coverage column

    For every GCratio value the mean of col over all non-chrX rows is scaled to 100
    and the resulting factor is applied to all rows with that GCratio.
    returns a new dataframe (the passed frame is left untouched)
    '''
    # chrX is excluded from the reference means or male genomes will have slightly greater mean
    autosomes = cov_df.query('Chr != "chrX"')
    gc_means = autosomes.groupby("GCratio").agg({col: 'mean'})

    # factor = 100 / mean coverage of the respective GCratio value
    factors = (100 / gc_means).reset_index().rename(columns={col: "factor"})

    # attach the factor to every row via the shared column(s);
    # NOTE: default inner merge -> rows whose GCratio has no factor are dropped
    merged = cov_df.merge(factors)
    merged[col] = merged[col] * merged['factor']

    # the helper column is not part of the result
    return merged.drop(columns="factor")


def normalize_GC(cov_df):
    '''
    GC-normalize every coverage column (name starting with "Cov") of cov_df
    returns the normalized dataframe sorted by Chr and Pos
    '''
    sample_cols = [name for name in cov_df.columns if name.startswith("Cov")]
    normalized = cov_df
    for name in sample_cols:
        show_output(f"Normalizing GC ratio for {name}.")
        normalized = normalize_GC_col(normalized, name)
    # the index is renumbered before sorting, so it stays unique
    # but does not follow the sorted row order afterwards
    normalized = normalized.reset_index(drop=True)
    return normalized.sort_values(['Chr', 'Pos'])

### visualize normalization

In [None]:
# GC-normalize all coverage columns
norm_cov_df = normalize_GC(cov_df)
# drop the reference to the un-normalized frame
del cov_df
# re-plot with the normalized coverages
fig, _ = make_GC_plot(norm_cov_df)
# fig.savefig(os.path.join(cnvdata, "figures/GCPON_normalized.jpeg"))
norm_cov_df

## raise X-coverage for male samples
+ a consensus X coverage is required

In [None]:
# mean chrX coverage per sample
# cov_agg is built here because no earlier cell defines it at notebook level
cov_agg = {col: "mean" for col in norm_cov_df.columns if col.startswith("Cov")}
norm_cov_df.groupby("Chr").agg(cov_agg).loc["chrX"]

In [None]:
def amazonize(cov_df, male_threshold=75):
    '''
    detect male samples via below-threshold X-chrom coverage
    coverage on chrX is doubled in these samples

    Args:
        cov_df: dataframe with a Chr column and coverage columns whose names start with "Cov"
        male_threshold: samples with a mean chrX coverage below this value are treated as male

    Returns:
        the same dataframe object; the chrX rows of the male columns are modified in place
    '''

    # get coverage cols
    cov_cols = [col for col in cov_df.columns if col.startswith("Cov")]
    is_chrX = cov_df['Chr'] == "chrX"
    # mean chrX coverage of every sample
    X_coverage = cov_df.loc[is_chrX, cov_cols].mean()
    # filter out the male samples
    male_cols = [col for col in cov_cols if X_coverage[col] < male_threshold]
    # adjust the coverage for male samples
    # skipped without male samples; only the chrX rows are doubled instead of the whole frame
    if male_cols:
        cov_df.loc[is_chrX, male_cols] = cov_df.loc[is_chrX, male_cols] * 2
    return cov_df

In [None]:
# lift the chrX coverage of male samples
normX_df = amazonize(norm_cov_df)
del norm_cov_df
# mean chrX coverage per sample after the adjustment
# cov_agg is built here because no earlier cell defines it at notebook level
cov_agg = {col: "mean" for col in normX_df.columns if col.startswith("Cov")}
normX_df.groupby("Chr").agg(cov_agg).loc["chrX"]

In [None]:
# display the frame returned by amazonize
normX_df

## compute the mean of all the coverages

In [None]:
def compute_stats(df):
    '''
    add row-wise statistics over all coverage columns (names starting with "Cov")

    existing PONcov* columns are discarded and PONcov_mean, PONcov_median and PONcov_std
    are appended to a new dataframe; the passed frame is not modified
    '''
    # start from a frame without any stats of a previous run
    stale_cols = [name for name in df.columns if name.startswith("PONcov")]
    result = df.drop(columns=stale_cols)
    # the statistics only look at the sample coverages
    samples = result[[name for name in result.columns if name.startswith("Cov")]]
    result['PONcov_mean'] = samples.mean(axis=1)
    result['PONcov_median'] = samples.median(axis=1)
    result['PONcov_std'] = samples.std(axis=1)
    return result

In [None]:
# add the PONcov_mean / PONcov_median / PONcov_std columns
mean_df = compute_stats(normX_df)
# drop the reference to the frame without stats
del normX_df
mean_df

### filter the coverage
+ for filtering, all outliers that stray from the local mean by more than max_mean_std are removed
+ other per-position filtering steps should be done afterwards, during the rolling-window computation:
+ `filter_df = df.query('meanCov > @mincov and std < @max_mean_std')`


In [None]:
# maximum and mean of the per-position std in one result
# (a cell only displays its last expression, so a separate .max() line would be lost)
mean_df['PONcov_std'].agg(['max', 'mean'])

### remove outliers in order to reduce noise

In [None]:
def remove_outliers(df, std_factor=2.5):
    '''
    blank out outlier coverages in every sample column (names starting with "Cov")

    a value becomes NaN if its distance to PONcov_mean, measured in units of PONcov_std,
    exceeds std_factor; df is modified in place and returned
    '''
    center = df['PONcov_mean']
    spread = df['PONcov_std']
    sample_cols = [name for name in df.columns if name.startswith("Cov")]
    for name in sample_cols:
        # distance to the PON mean in multiples of the PON std
        deviation = (center - df[name]).abs() / spread
        df.loc[deviation > std_factor, name] = np.nan
    return df

In [None]:
# blank out outliers (std_factor=2), then recompute the PONcov statistics on the remaining values
filter_df = compute_stats(remove_outliers(mean_df, std_factor=2))
# drop the reference to the unfiltered name
del mean_df
filter_df

In [None]:
# maximum and mean of the per-position std after outlier removal in one result
# (a cell only displays its last expression, so a separate .max() line would be lost)
filter_df['PONcov_std'].agg(['max', 'mean'])

### the master function

In [None]:
def make_PON_coverage(config=None):
    '''
    master function: build the PON coverage from the per-chromosome coverage files

    Steps: load and combine the coverage files, GC-normalize every sample,
    double the chrX coverage of male samples, compute the PONcov statistics,
    then remove outliers and recompute the statistics for the filtered output.

    Args:
        config: dict with the keys
            'PON_path': path to the PON folder (defaults to '.' if no config is passed)
            'PONcoverage': {'stdFactor': ...} outlier threshold in units of std (default 2.5)

    Returns:
        tuple (cov_df, filter_df)
        cov_df: Chr, Pos, ExonPos, all sample coverages and the PONcov stats (outliers included)
        filter_df: Chr, Pos, ExonPos and the PONcov stats recomputed after outlier removal
    '''
    # avoid a mutable default argument; these are the documented defaults
    if config is None:
        config = {
            'PONcoverage': {
                'stdFactor': 2.5  # only exonPositions straying within std_factor * std around meanCoverage are kept
            },
            'PON_path': '.',  # path to the PON folder
        }

    # load the sample coverages of all chromosomes
    cov_df = combine_PON_coverage(config=config)
    # make chrom categorical
    # plain column assignment replaces the column including its dtype, whereas
    # .loc[:, "Chr"] = ... writes into the existing column and can keep the old dtype
    chrom_list = [f"chr{i}" for i in range(1, 23)] + ["chrX"]
    cov_df["Chr"] = pd.Categorical(cov_df['Chr'], chrom_list)

    # normalize and add mean values and std
    show_output("Normalizing coverage and removing GC dependencies for PON coverage.")
    cov_df = normalize_GC(cov_df)

    show_output("Lifting X-coverages for male samples to XX coverage.")
    cov_df = amazonize(cov_df)

    show_output("Computing stats.")
    cov_df = compute_stats(cov_df)

    # fall back to the default threshold if the config does not provide one
    std_factor = config.get('PONcoverage', {}).get('stdFactor', 2.5)
    show_output("Remove outliers and recompute stats.")
    # remove_outliers writes NaN into the frame it receives -> work on a copy so cov_df keeps all values;
    # the stats have to be recomputed, otherwise filter_df would carry the same stats as cov_df
    filter_df = compute_stats(remove_outliers(cov_df.copy(), std_factor=std_factor))

    # save and adjust the output columns
    base_cols = ['Chr', 'Pos', 'ExonPos']
    # map_cols = [col for col in cov_df.columns if col.startswith("map")]
    cov_cols = [col for col in cov_df.columns if col.startswith("Cov")]
    stat_cols = [col for col in cov_df.columns if col.startswith("PONcov")]

    cov_df = cov_df.loc[:, base_cols + cov_cols + stat_cols]
    filter_df = filter_df.loc[:, base_cols + stat_cols]

    return cov_df, filter_df

In [None]:
# run the master function with the notebook config
cov_df, filter_df = make_PON_coverage(config=CNVconfig)
# show the first ten rows of the filtered output
filter_df[:10]