# adding GC data, combine the chroms and add the PON coverage for log2ratio

In [None]:
# get the code
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
sys.path.append('../code')

# import package functions
from script_utils_CNV import get_CNVconfig, show_output

# HOME
# resolve the home directory of the current user at runtime, so the notebook
# runs on every machine without editing a hard-coded path
home = os.path.expanduser("~")

# standard paths
static = os.path.join(home, "Dropbox/Icke/Work/static")
tooldata = os.path.join(home, "Dropbox/Icke/Work/somVar/tooldata")
testdata = os.path.join(home, "Dropbox/Icke/Work/somVar/testdata")
PON_path = os.path.join(static, "PON/HAEv7_hg38_NovaSeq")

cnvdata = os.path.join(tooldata, "myCNVdata")
output_path = os.path.join(cnvdata, "output")

# the path to the input data
cnv_path = os.path.join(cnvdata, "cnv")

### get the config
+ use the get_CNVconfig util function to update the general configs with the appropriate paths

In [None]:
# load the general CNV config and pass the local paths as local_config
CNVconfig = get_CNVconfig(
    "../config/config_CNV.yaml",
    local_config={
        "mawk_path": "../shell",
        "bed_file": os.path.join(static, "bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed"),
        "genome_split_path": os.path.join(static, "genome/gatk/hg38/split"),
        "gc_split_path": os.path.join(static, "genome/gatk/hg38/split"),
        "genmap_split_path": os.path.join(static, "annotation/genmap/hg38/split"),
        "pon_path": PON_path,
    },
)

## A) get coverage for all chroms and concat


In [None]:
def combine_coverage(sample,
                     coverage_path="",  # path containing rawcov.gz files for this sample
                     config=None):
    '''
    combine the raw coverage files of all chroms of a sample into one dataframe

    For each of chr1..chr22 and chrX the gzipped, tab-separated file
    <coverage_path>/<sample>.<chrom>.rawcov.gz is read. The frames are
    concatenated in that chromosome order and the index is reset.

    sample:         sample name used as prefix of the coverage file names
    coverage_path:  folder containing the rawcov.gz files
    config:         accepted for interface compatibility, not used here

    returns the combined dataframe; its "Chr" column is a categorical whose
    categories follow the chromosome order given above
    '''

    chrom_list = [f"chr{c + 1}" for c in range(22)] + ['chrX']
    dfs = []
    for chrom in chrom_list:
        cov_file = os.path.join(coverage_path, f"{sample}.{chrom}.rawcov.gz")
        show_output(f"Loading coverage file {cov_file}")
        dfs.append(pd.read_csv(cov_file, sep="\t", compression="gzip"))
    cov_df = pd.concat(dfs).reset_index(drop=True)
    # make chrom categorical
    # plain column assignment replaces the column; `.loc[:, "Chr"] = ...` writes into the
    # existing object column under pandas >= 2.0 and would leave the dtype unchanged
    cov_df["Chr"] = pd.Categorical(cov_df['Chr'], chrom_list)
    return cov_df

In [None]:
# combine the per-chromosome coverage files of one sample and display the result
sample = "03_A-B"
cov_df = combine_coverage(
    sample,
    config=CNVconfig,
    coverage_path=os.path.join(output_path, "pile2CNV"),
)
cov_df

### look at general stats

In [None]:
# per chromosome: mean of both coverage columns plus count and range of the positions
(
    cov_df
    .groupby("Chr")
    .agg(dict(Cov1="mean", Cov2="mean", Pos=['count', 'min', 'max']))
    .sort_values("Chr")
)

## B) normalize GC ratio

###  Visualize GC bias

In [None]:
# make_GC_plot is a project helper from the plot module;
# here it is applied to the combined coverage before the GC normalization
from plot import make_GC_plot
fig, _ = make_GC_plot(cov_df, sample=sample)
# fig.savefig(os.path.join(cnvdata, "figures/GCPON.jpeg"))

In [None]:
from combineCNV import normalize_GC  # see notebook "combinePONcoverage"
# normalize the combined coverage with the project helper
norm_cov_df = normalize_GC(cov_df)
# drop the reference to the raw coverage (presumably to free memory)
del cov_df
# same plot as before, now on the normalized data (no sample argument is passed here)
fig, _ = make_GC_plot(norm_cov_df)

## C) coverage: merge sample coverage with PON coverage

In [None]:
# read the gzipped, tab-separated PON table from the configured PON folder and display it
pon_df = pd.read_csv(
    os.path.join(CNVconfig['pon_path'], "pon.filter.gz"),
    compression="gzip",
    sep="\t",
)
pon_df

In [None]:
def get_full_exon_pos(df, pon_df):
    '''
    adds the accumulated exonic position (over all chroms from PON data) to coverage df

    df:      coverage data with at least the columns Chr, Pos, ExonPos and GCratio
    pon_df:  PON data with the columns Chr and ExonPos

    FullExonPos is ExonPos plus the summed-up maximum ExonPos of all chromosomes
    preceding the chromosome in pon_df. The chromosomes are taken in their order
    of first appearance in pon_df (sort=False); the default sorted groupby would
    put string labels into lexicographic order (chr1, chr10, ..., chr2).
    NOTE(review): assumes pon_df lists the chromosomes in the intended order - confirm.

    returns a new dataframe with the columns Chr, Pos, ExonPos, FullExonPos, GCratio
    followed by all map* and Cov* columns; rows whose Chr is missing in pon_df are dropped
    '''

    # adds the last ExonPos of chrom to start of next chromosome
    chrom_max = pon_df.groupby("Chr", sort=False)["ExonPos"].max()
    chrom_df = chrom_max.cumsum().shift(fill_value=0).rename("chrAdd").reset_index()

    # merge with chrom_df (creates a new frame, the caller's df stays untouched)
    df = df.merge(chrom_df, on="Chr")

    # get FullExonPos from ExonPos and chrAdd
    df['FullExonPos'] = df['ExonPos'] + df['chrAdd']

    # select the output columns
    base_cols = ['Chr', 'Pos', 'ExonPos', 'FullExonPos', 'GCratio']
    map_cols = [col for col in df.columns if col.startswith("map")]
    cov_cols = [col for col in df.columns if col.startswith("Cov")]
    return df.loc[:, base_cols + map_cols + cov_cols]

In [None]:
# add the accumulated exonic position to the GC-normalized coverage
norm_full_cov_df = get_full_exon_pos(norm_cov_df, pon_df)
# drop the reference to the intermediate frame
del norm_cov_df
norm_full_cov_df

In [None]:
# peek at the first 13 rows of the PON table
pon_df[:13]

In [None]:
# join sample and PON coverage; without `on`, merge uses all columns shared by both frames
# and keeps only the rows found in both (inner join)
merge_df = norm_full_cov_df.merge(pon_df)
# drop the reference to the intermediate frame
del norm_full_cov_df
merge_df

In [None]:
# display the merged frame again (nothing has changed since the previous cell)
merge_df

In [None]:
def log2ratio(df, cov_col, pon_cov_col='PONcov_mean'):
    '''
    add log2ratio (log2(COV/PONCOV)) to coverage data

    df:           dataframe containing cov_col and pon_cov_col
    cov_col:      name of the sample coverage column; has to contain "Cov", which is
                  replaced by "log2ratio" to name the new column (Cov1 -> log2ratio1)
    pon_cov_col:  name of the PON coverage column used as denominator

    returns a new dataframe with the log2ratio column inserted right after cov_col;
    rows where either coverage is not > 0 get NaN; the input df is not modified

    raises ValueError if no log2ratio column name can be derived from cov_col
    '''

    log_col = cov_col.replace("Cov", "log2ratio")
    if log_col == cov_col:
        raise ValueError(f"cannot derive a log2ratio column name from {cov_col!r}")

    # mask rows where logging does not compute
    loggable = (df[cov_col] > 0) & (df[pon_cov_col] > 0)
    # apply the log; masked rows are set to NaN before the log is taken
    log_values = np.log2((df[cov_col] / df[pon_cov_col]).where(loggable))

    # work on a copy without a stale log column from a previous run, so the caller's
    # frame stays untouched and the log column shows up exactly once
    out_df = df.drop(columns=log_col, errors="ignore")
    # insert log_col directly behind cov_col
    out_df.insert(out_df.columns.get_loc(cov_col) + 1, log_col, log_values)
    return out_df

In [None]:
# add a log2ratio column for every sample coverage column, then display the result
for col in [name for name in merge_df.columns if name.startswith("Cov")]:
    merge_df = log2ratio(merge_df, col)
merge_df

In [None]:
def include_PONcov(cov_df, PON_cov_path="", config=None):
    '''
    adds the PON coverage per chromosome to the sample coverage
    normalization is performed afterwards on combined df

    NOTE(review): the previous body could not be parsed (stray space inside an
    identifier) and used names that were never defined (chrom, verbose, cover_dfs,
    pon_df). It is reconstructed here as a loop over the chromosomes found in
    cov_df - confirm that this matches the intended behavior.

    For every chromosome in cov_df the file <PON_cov_path>/<chrom>.filtered.csv.gz
    is read and outer-merged with the sample coverage on Chr, Pos and ExonPos.
    Chromosomes without a PON file are reported and skipped. On the combined data,
    Coverage (missing values set to 0) is scaled to a mean of 100 and
    log2ratio = log2(Coverage / PONmeanCov) is added; it is NaN where either
    value is 0 or the PON coverage is missing.

    cov_df:        sample coverage with the columns Chr, Pos, ExonPos and Coverage
    PON_cov_path:  folder containing the per-chromosome PON coverage files
    config:        accepted for interface compatibility, not used here

    returns the combined dataframe or None if no PON coverage file was found
    '''

    merge_cols = ['Chr', 'Pos', 'ExonPos']
    pon_cols = ['Chr', 'Pos', 'FullExonPos', 'ExonPos', 'meanCov', 'medianCov', 'std']
    out_cols = ['Chr', 'Pos', 'FullExonPos', 'ExonPos', 'Coverage', 'PONmeanCov', 'PONmedianCov', 'PONstd']

    cover_dfs = []
    # pd.unique keeps the order in which the chromosomes appear in cov_df
    for chrom in pd.unique(cov_df['Chr']):
        PON_cov_file = os.path.join(PON_cov_path, f"{chrom}.filtered.csv.gz")
        # check file existence
        if not os.path.isfile(PON_cov_file):
            show_output(f"PON coverage file {PON_cov_file} not found", color="warning")
            continue
        show_output(f"Reading PON coverage of {chrom} from {PON_cov_file}.")
        PON_df = pd.read_csv(PON_cov_file, sep='\t', compression="gzip").loc[:, pon_cols]

        # column rename: prefix the statistics columns (everything behind the position columns)
        PON_df = PON_df.rename(columns={col: f"PON{col}" for col in pon_cols[4:]})
        # merge sample with PON coverage
        chrom_df = cov_df.loc[cov_df['Chr'] == chrom]
        sample_df = chrom_df.merge(PON_df, on=merge_cols, how="outer").loc[:, out_cols]

        # rows that only exist in the sample have no FullExonPos after the outer merge;
        # recover it from ExonPos, assuming a constant offset within one chromosome
        known = sample_df['FullExonPos'].notna()
        if known.any():
            first_known = sample_df.loc[known].iloc[0]
            offset = first_known['FullExonPos'] - first_known['ExonPos']
            sample_df.loc[~known, 'FullExonPos'] = sample_df.loc[~known, 'ExonPos'] + offset
            sample_df['FullExonPos'] = sample_df['FullExonPos'].astype(int)
        cover_dfs.append(sample_df)

    if not cover_dfs:
        return None

    # combine chrom data
    cover_df = pd.concat(cover_dfs, ignore_index=True)

    # normalize the coverage on the combined data (not on the last chromosome only)
    cover_df['Coverage'] = cover_df['Coverage'].fillna(0)
    mean_cov = cover_df['Coverage'].mean()
    cover_df['Coverage'] = cover_df['Coverage'] / mean_cov * 100
    # loggable are the coverages, where log2ratio can be computed
    loggable = (cover_df['PONmeanCov'] * cover_df['Coverage']) != 0
    # regions that are not loggable are set to NaN before the log is taken
    ratio = (cover_df['Coverage'] / cover_df['PONmeanCov']).where(loggable)
    cover_df['log2ratio'] = np.log2(ratio)
    return cover_df

### run the PON includer

In [None]:
# reload the general CNV config, this time passing the PON coverage folder
CNVconfig = get_CNVconfig(
    "../config/config_CNV.yaml",
    local_config={
        "mawk_path": "../shell",
        "bed_file": os.path.join(static, "bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed"),
        "genome_split_path": os.path.join(static, "genome/gatk/hg38/split"),
        "pon_cov_path": os.path.join(cnvdata, "chromCov"),
    },
)