# Adding GC data, combining the chromosomes and adding the PON coverage for the log2ratio

In [None]:
# get the code
import os
import sys

import numpy as np
import pandas as pd
sys.path.append('../code')


# HOME
# home folders of the machines this notebook is run on;
# the first one that exists is used (falls back to the first entry)
home_candidates = ('/Users/martinscience', '/Users/mahtin')
home = next((path for path in home_candidates if os.path.isdir(path)), home_candidates[0])

# standard paths
static = os.path.join(home, "Dropbox/Icke/Work/static")
cluster_path = os.path.join(home, "mount")
tooldata = os.path.join(home, "Dropbox/Icke/Work/somVar/tooldata")
testdata = os.path.join(home, "Dropbox/Icke/Work/somVar/testdata")


cnvdata = os.path.join(tooldata, "myCNVdata")
output_path = os.path.join(cnvdata, "output")

# the path to the input data
cnv_path = os.path.join(cnvdata, "cnv")
# the PON coverage has its own folder; assigned separately so cnv_path is not overwritten
PONCOV_path = os.path.join(cnvdata, "chromCov")

### get the config
+ use the `get_CNVconfig` util function to update the general config with the appropriate local settings

In [None]:
from script_utils_CNV import get_CNVconfig
# load the general CNV config and pass the machine-specific paths as local_config
# all reference data folders are rooted at the static folder
CNVconfig = get_CNVconfig(
    "../config/config_CNV.yaml", 
    local_config=dict(
        mawk_path="../shell",
        bed_file=os.path.join(static, "bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed"),
        genome_split_path=os.path.join(static, "genome/gatk/hg38/split"),
        gc_split_path=os.path.join(static, "genome/gatk/hg38/split"),
        # NOTE(review): was a bare relative path; rooted at static like its siblings - confirm folder location
        genmap_split_path=os.path.join(static, "annotation/genmap/hg38/split"),
        pon_cov_path=os.path.join(cnvdata, "chromCov")
    ))

## A) get coverage data

from CNV_raw import get_rawCNV

# input and output files for sample 03_A-B on chr7
pileup_file = os.path.join(cnvdata, "cleanpileup/03_A-B.chr7.gz")
snp_file = os.path.join(output_path, "pile2CNV/03_A-B.chr7.snp")
rawcov_file = os.path.join(output_path, "pile2CNV/03_A-B.chr7.rawcov.gz")

# compute the raw coverage from the clean tumor-normal pileup
cov_df = get_rawCNV(
    clean_TN_pileup_file=pileup_file,
    chrom="chr7",
    SNP_output=snp_file,
    config=CNVconfig,
)

# keep a gzipped, tab-separated copy of the raw coverage
cov_df.to_csv(rawcov_file, sep="\t", index=False, compression="gzip")
cov_df

In [None]:
# reload the stored raw coverage of sample 03_A-B, chr7
rawcov_file = os.path.join(output_path, "pile2CNV/03_A-B.chr7.rawcov.gz")
cov_df = pd.read_csv(rawcov_file, sep="\t", compression="gzip")
cov_df

In [None]:
from script_utils_CNV import show_output

def addGCratio(cov_df, chrom="", config=None):
    '''
    adds the GC data to the coverage data of one chromosome

    cov_df: coverage dataframe; needs a "Start" column (the merge key)
    chrom:  chromosome name, used to build the GC file name "<chrom>.gc100-10.gz"
    config: dict providing "gc_split_path" (folder containing the GC files)

    returns cov_df left-merged with the GC data on "Start"
    returns None (after showing a warning) if the GC file could not be loaded
    '''

    # a fresh dict per call instead of a mutable default shared between calls
    if config is None:
        config = {}

    # load GC data for that chromosome
    gc_file = os.path.join(config['gc_split_path'], f"{chrom}.gc100-10.gz")
    try:
        gc_df = pd.read_csv(gc_file, sep="\t", compression="gzip")
    except Exception as e:
        # deliberate best effort: any load problem is reported and signalled as None
        show_output(f"{e}: Could not load GC data for {chrom} from {gc_file}", color="warning")
        return None
    show_output(f"Loading GC data for {chrom} from {gc_file}")
    # left merge keeps every coverage row, even without GC info
    return cov_df.merge(gc_df, on="Start", how="left")
    

In [None]:
# add the GC data to the chr7 coverage
# addGCratio returns nothing if the GC file could not be loaded, so df may be None here
df = addGCratio(cov_df, chrom="chr7", config=CNVconfig)
df

### do all chroms


##  Visualize GC bias

In [None]:
import matplotlib.pyplot as plt

In [None]:
# scatter the coverage (Cov1) against the GC/AT column to eyeball the GC bias
gc_values = df['GC/AT']
cov_values = df['Cov1']
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(gc_values, cov_values, s=0.1, alpha=.01)

## C) coverage: merge sample coverage with Pon coverage

In [None]:
chrom_list = [f"chr{c + 1}" for c in range(22)] + ['chrX']


def include_PONcov(sample, sample_cnv_path="", PON_cnv_path="", verbose=False, filtered=True):
    '''
    merges the coverage of a sample with the PON coverage for all chromosomes in chrom_list

    sample:          sample name; coverage is read from "<sample>.<chrom>.cov" (gzipped tsv)
    sample_cnv_path: folder containing the sample coverage files
    PON_cnv_path:    folder containing the PON coverage files "<chrom>.filtered.csv.gz"
                     (or "<chrom>.full.csv.gz" if filtered is False)
    verbose:         print the files being read
    filtered:        select the filtered or the full PON coverage

    chromosomes with a missing sample or PON file are reported and skipped
    returns the combined dataframe (fresh 0..n-1 index) with Coverage scaled to a mean
    of 100 over all chromosomes and a log2ratio column (NaN where it cannot be computed)
    raises ValueError if no chromosome could be loaded
    '''

    pon_cols = ['Chr', 'Pos', 'FullExonPos', 'ExonPos', 'meanCov', 'medianCov', 'std']
    # only the PON statistics get the PON prefix
    pon_rename = {col: f"PON{col}" for col in ('meanCov', 'medianCov', 'std')}
    out_cols = ['Chr', 'Pos', 'FullExonPos', 'ExonPos', 'Coverage', 'PONmeanCov', 'PONmedianCov', 'PONstd']
    full_or_filtered = "filtered" if filtered else "full"

    cover_dfs = []

    for chrom in chrom_list:
        # reading sampleCoverage
        sample_cov_file = os.path.join(sample_cnv_path, f"{sample}.{chrom}.cov")
        if not os.path.isfile(sample_cov_file):
            print("No file", sample_cov_file)
            continue
        if verbose:
            print(f"Reading coverage from {chrom} of sample {sample} from {sample_cov_file}.")
        cov_df = pd.read_csv(sample_cov_file, sep='\t', compression="gzip")

        # reading PONcoverage
        pon_cov_file = os.path.join(PON_cnv_path, f"{chrom}.{full_or_filtered}.csv.gz")
        if not os.path.isfile(pon_cov_file):
            print("No file", pon_cov_file)
            continue
        if verbose:
            print(f"Reading PON coverage of {chrom} from file {pon_cov_file}.")
        pon_df = pd.read_csv(pon_cov_file, sep='\t', compression="gzip").loc[:, pon_cols]
        pon_df = pon_df.rename(columns=pon_rename)

        # merge sample with PON coverage
        sample_df = cov_df.merge(pon_df, on=['Chr', 'Pos', 'ExonPos'], how="outer").loc[:, out_cols]

        ##### here recover missing FullExonPos from margin
        # FullExonPos only comes from the PON file, so sample-only rows lack it;
        # the offset to ExonPos is taken from the first row that has both values
        has_full = sample_df['FullExonPos'].notna()
        if not has_full.any():
            print("No PON positions in", pon_cov_file)
            continue
        anchor = sample_df.loc[has_full, ['ExonPos', 'FullExonPos']].iloc[0]
        offset = anchor['FullExonPos'] - anchor['ExonPos']
        # plain column assignment so that the int dtype is really applied
        sample_df['FullExonPos'] = sample_df['FullExonPos'].fillna(sample_df['ExonPos'] + offset).astype(int)
        cover_dfs.append(sample_df)

    if not cover_dfs:
        raise ValueError(f"No coverage data found for sample {sample} in {sample_cnv_path}")

    # combine chrom data
    cover_df = pd.concat(cover_dfs, ignore_index=True)

    # normalize the coverage to a mean of 100 over ALL chromosomes
    # (positions without sample coverage count as 0)
    coverage = cover_df['Coverage'].fillna(0)
    cover_df['Coverage'] = coverage / coverage.mean() * 100
    # loggable are the coverages, where log2ratio can be computed
    loggable = (cover_df['PONmeanCov'] * cover_df['Coverage']) != 0
    ratio = cover_df['Coverage'] / cover_df['PONmeanCov']
    # regions with zero sample or PON coverage (or no PON coverage at all) get NaN
    cover_df['log2ratio'] = np.log2(ratio.where(loggable))
    return cover_df

In [None]:
def include_PONcov(cov_df, PON_cov_path="", config=None, chrom=None, verbose=False, normalize=False):
    '''
    adds the PON coverage per coverage_chrom_df
    normalization is performed afterwards on combined df (or here with normalize=True)

    cov_df:       coverage dataframe of ONE chromosome with the columns
                  Chr, Pos, ExonPos and Coverage
    PON_cov_path: folder containing the PON coverage files "<chrom>.filtered.csv.gz";
                  if empty, config["pon_cov_path"] is used when available
    config:       optional dict, only "pon_cov_path" is read
    chrom:        chromosome name; inferred from the Chr column of cov_df if not given
    verbose:      print the PON file being read
    normalize:    also scale Coverage to a mean of 100 and add log2ratio
                  (uses only the rows of this one chromosome)

    returns the merged dataframe with the columns
    Chr, Pos, FullExonPos, ExonPos, Coverage, PONmeanCov, PONmedianCov, PONstd
    returns None (after showing a warning) if the chromosome cannot be determined,
    the PON file is missing or it provides no FullExonPos
    '''

    # a fresh dict per call instead of a mutable default shared between calls
    if config is None:
        config = {}
    PON_cov_path = PON_cov_path or config.get('pon_cov_path', "")

    # the PON files are split per chromosome, so exactly one chromosome is needed
    if chrom is None:
        chroms = list(cov_df['Chr'].dropna().unique())
        if len(chroms) != 1:
            show_output(f"Cannot infer a single chromosome from cov_df (found {chroms})", color="warning")
            return None
        chrom = chroms[0]

    # reading PON coverage
    PON_cov_file = os.path.join(PON_cov_path, f"{chrom}.filtered.csv.gz")
    # check file existence
    if not os.path.isfile(PON_cov_file):
        show_output(f"PON coverage file {PON_cov_file} not found", color="warning")
        return None
    if verbose:
        print(f"Reading PON coverage of {chrom} from {PON_cov_file}.")
    pon_cols = ['Chr', 'Pos', 'FullExonPos', 'ExonPos', 'meanCov', 'medianCov', 'std']
    PON_df = pd.read_csv(PON_cov_file, sep='\t', compression="gzip").loc[:, pon_cols]

    # column rename: only the PON statistics get the PON prefix
    trans_dict = {col: f"PON{col}" for col in ('meanCov', 'medianCov', 'std')}
    PON_df = PON_df.rename(columns=trans_dict)

    # merge sample with PON coverage
    out_cols = ['Chr', 'Pos', 'FullExonPos', 'ExonPos', 'Coverage', 'PONmeanCov', 'PONmedianCov', 'PONstd']
    sample_df = cov_df.merge(PON_df, on=['Chr', 'Pos', 'ExonPos'], how="outer").loc[:, out_cols]

    ##### here recover missing FullExonPos from margin
    # FullExonPos only comes from the PON file, so sample-only rows lack it;
    # the offset to ExonPos is taken from the first row that has both values
    has_full = sample_df['FullExonPos'].notna()
    if not has_full.any():
        show_output(f"PON coverage file {PON_cov_file} provides no FullExonPos", color="warning")
        return None
    anchor = sample_df.loc[has_full, ['ExonPos', 'FullExonPos']].iloc[0]
    offset = anchor['FullExonPos'] - anchor['ExonPos']
    # plain column assignment so that the int dtype is really applied
    sample_df['FullExonPos'] = sample_df['FullExonPos'].fillna(sample_df['ExonPos'] + offset).astype(int)

    if normalize:
        return _normalize_PONcov(sample_df)
    return sample_df


def _normalize_PONcov(cover_df):
    '''
    helper: scales Coverage to a mean of 100 and adds the log2ratio against PONmeanCov
    returns a new dataframe, the input is left untouched
    '''

    cover_df = cover_df.copy()
    # positions without sample coverage count as 0
    coverage = cover_df['Coverage'].fillna(0)
    cover_df['Coverage'] = coverage / coverage.mean() * 100
    # loggable are the coverages, where log2ratio can be computed
    loggable = (cover_df['PONmeanCov'] * cover_df['Coverage']) != 0
    ratio = cover_df['Coverage'] / cover_df['PONmeanCov']
    # regions with zero sample or PON coverage (or no PON coverage at all) get NaN
    cover_df['log2ratio'] = np.log2(ratio.where(loggable))
    return cover_df

### run the PON includer

In [None]:
# machine-specific settings that are passed to get_CNVconfig as local_config
local_settings = {
    "mawk_path": "../shell",
    "bed_file": os.path.join(static, "bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed"),
    "genome_split_path": os.path.join(static, "genome/gatk/hg38/split"),
    "pon_cov_path": os.path.join(cnvdata, "chromCov"),
}
CNVconfig = get_CNVconfig("../config/config_CNV.yaml", local_config=local_settings)