# combining individual coverage data into a mean-normalized coverage

In [None]:
# set the paths
# machine-specific home folder; switch machines by (un)commenting the matching line
home = '/Users/mahtin'
# home = '/Users/martinscience'
testdata = f"{home}/Dropbox/Icke/Work/somVar/testdata"
tooldata = f"{home}/Dropbox/Icke/Work/somVar/tooldata"
static_path = f"{home}/Dropbox/Icke/Work/static"
bed_path = f"{static_path}/bed_files/SureSelect/hg38"

# NOTE: cnvdata ends with a slash, so the paths derived from it contain a double slash
cnvdata = f"{tooldata}/myCNVdata/"
bedCov_path = f"{cnvdata}/bedCov"
# this assignment overrides the bedCov_path set on the previous line
bedCov_path = f"{home}/mount/scratch/develop/PONcoverage/bedCov"
output_path = f"{cnvdata}/output/PONcoverage"

## get the code

In [None]:
import os
import sys

import numpy as np
import pandas as pd

sys.path.append('../codeCNV')
from pon_coverage import make_PON_coverage

### get the config

In [None]:
# load the config
# edit config directly in yaml file
import yaml
config_file = '../config/config_devel.yaml'
def get_config(config_file):
    '''
    open the yaml file config_file and return its parsed content
    '''
    with open(config_file) as stream:
        # the FullLoader converts the YAML values into native Python objects
        return yaml.load(stream, Loader=yaml.FullLoader)
config = get_config(config_file)
pon_config = config['CNV']['PONcoverage']
pon_config['sample_PON_path'] = bedCov_path

In [None]:
sample_list = [f"{str(s+1).zfill(3)}_B" for s in range(45)]
full_df, filter_df = make_PON_coverage(sample_list, config=config)

In [None]:
full_df[:10]

In [None]:
filter_df[:10]

In [None]:
chrom_list = [f"chr{chrom + 1}" for chrom in range(22)] + ['chrX']
# to_csv does not create missing folders, so make sure the target folder exists
chrom_path = os.path.join(output_path, "chroms")
os.makedirs(chrom_path, exist_ok=True)
# write one gzipped, tab-separated file per chromosome for the filtered and the full coverage
for chrom in chrom_list:
    for label, out_df in (("filtered", filter_df), ("full", full_df)):
        out_file = os.path.join(chrom_path, f"{chrom}.{label}.csv.gz")
        out_df.query('Chr == @chrom').to_csv(out_file, sep='\t', index=False, compression="gzip")

## Plan:

+ for the Panel of Normals (PON), create a coverage file for each PON bam and each chromosome
+ PON bams should ideally not contain CNVs; otherwise, there should be enough of them to keep the std low
+ combine all the files into one big matrix, normalize the coverages and compute an average coverage (+ std) over the exonic space
+ compare the tumor samples against that PON coverage to detect CNV differences

### have to look at the Kenichi-CNV-graphs to identify the samples from FDAH1 without CNVs in the normals
+ [1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

### alternative approach
+ get all samples and remove the outliers from std

In [None]:
bedCov_path

In [None]:
test = pd.read_csv(f'{bedCov_path}/009_B.chr5.bedCov', sep='\t', compression="gzip")
test[:3]

### get all the normal samples from the PON list into df for normalization and averaging

In [None]:
def gather_PONcoverage_chrom(chrom, sample_list, bedCov_path):
    '''
    collect the <sample>.<chrom>.bedCov files of all samples into one coverage table

    chrom:        chromosome name used in the file names and written to the Chr column
    sample_list:  sample names; samples without a bedCov file for chrom are skipped
    bedCov_path:  folder containing the gzip-compressed, tab-separated bedCov files

    returns a dataframe with the columns Chr, Pos, ExonPos followed by one
    coverage column per sample that was found, sorted by ExonPos;
    positions missing in a sample are filled with 0
    '''
    key_cols = ['ExonPos', 'Pos']
    cov_df = None
    for sample in sample_list:
        sample_file = os.path.join(bedCov_path, f"{sample}.{chrom}.bedCov")
        if not os.path.isfile(sample_file):
            continue
        print(f"Reading {sample} from {sample_file}.")
        df = pd.read_csv(sample_file, sep='\t', compression='gzip')
        df = df.loc[:, key_cols + ['Coverage']].rename(columns={'Coverage': sample})
        # start from the first sample instead of an empty (object-dtype) frame
        # so that the key columns keep the dtype they have in the files
        cov_df = df if cov_df is None else cov_df.merge(df, on=key_cols, how='outer')
    if cov_df is None:
        # no file was found for any sample
        cov_df = pd.DataFrame(columns=key_cols)
    cov_df = cov_df.fillna(0).sort_values('ExonPos').reset_index(drop=True)
    # pick the sample columns by name instead of by position
    sample_cols = [col for col in cov_df.columns if col not in key_cols]
    cov_df['Chr'] = chrom
    return cov_df.loc[:, ['Chr', 'Pos', 'ExonPos'] + sample_cols]

In [None]:
import os
sample_list = [f"{str(s+1).zfill(3)}_B" for s in range(45)]

cov_df = gather_PONcoverage_chrom("chrX", sample_list, bedCov_path)
# cov_df.to_csv(f"{output_path}/PON_coverage.chr7.csv", sep='\t', index=False)
cov_df

### for proper normalization, the full exon coverages have to be read in before normalization

In [None]:
chrom_list = [f"chr{chrom + 1}" for chrom in range(22)] + ['chrX']

def gather_PONcoverage(chrom_list=[], sample_list=[], bedCov_path='.'):
    '''
    combine the PONcoverage for all chromosomes

    chrom_list:   chromosome names to collect (the list defaults are only read, never modified)
    sample_list:  sample names passed on to gather_PONcoverage_chrom
    bedCov_path:  folder containing the bedCov files

    returns the per-chromosome coverage tables stacked in the order of chrom_list
    with a fresh index; an empty chrom_list gives an empty table with the
    columns Chr, Pos and ExonPos
    '''

    cov_dfs = []
    for chrom in chrom_list:
        print(f"Collecting PON coverages for {chrom}")
        cov_dfs.append(gather_PONcoverage_chrom(chrom, sample_list, bedCov_path))
    if not cov_dfs:
        # pd.concat raises a ValueError on an empty list
        return pd.DataFrame(columns=['Chr', 'Pos', 'ExonPos'])
    return pd.concat(cov_dfs).reset_index(drop=True)

In [None]:
cov_df = gather_PONcoverage(chrom_list, sample_list, bedCov_path=bedCov_path)
cov_df.to_csv(f"{output_path}/PON_coverage.csv", sep='\t', index=False)
cov_df

In [None]:
cov_df

### normalize the coverage of every sample to a mean coverage of 100

In [None]:
def normalize_coverage(cov_df, norm_cov=100):
    '''
    scale every coverage column so that its mean becomes norm_cov

    Chr, Pos and ExonPos are moved into the index during the computation
    so that only the coverage columns are scaled
    '''
    coverage = cov_df.set_index(['Chr', 'Pos', 'ExonPos'])
    col_means = coverage.mean()
    scaled = coverage.div(col_means, axis='columns').mul(norm_cov)
    return scaled.reset_index()

In [None]:
norm_df = normalize_coverage(cov_df, norm_cov=pon_config['normCov'])
norm_df

In [None]:
def equalize_X(norm_df, xy_threshold=75):
    '''
    detects samples with a normed chromX coverage below xy_threshold (XY)
    and doubles respective coverages for an overall diploidy for
    normalization of coverage

    norm_df:       normalized coverage with the columns Chr, Pos, ExonPos and one column per sample
    xy_threshold:  samples with a mean chrX coverage below this value get their chrX coverage doubled;
                   the default of 75 fits coverages normalized to 100

    returns the coverage of all chromosomes sorted by Chr and Pos
    '''

    # extract x chrom from normalized df and set index to hide non-coverage cols
    is_x = norm_df['Chr'] == 'chrX'
    x_df = norm_df.loc[is_x].set_index(['Chr', 'Pos', 'ExonPos'])
    no_x_df = norm_df.loc[~is_x]
    # per-sample factor: 2 for samples with a mean below the threshold, 1 for all others
    factor = (x_df.mean() < xy_threshold).astype(int) + 1
    x_df = x_df * factor
    # concat norm_df without x and harmonized X chrom df
    equalX_df = pd.concat([no_x_df, x_df.reset_index()]).sort_values(['Chr', 'Pos'])
    return equalX_df

In [None]:
equalX_df = equalize_X(norm_df)
equalX_df.mean()

In [None]:
normX_df = normalize_coverage(equalX_df, norm_cov=pon_config['normCov'])
normX_df.to_csv(f"{output_path}/PON_coverage_normalized.csv", sep='\t', index=False)
normX_df

### compute the mean of all the coverages

In [None]:
def add_mean(norm_df):
    '''
    add the per-position mean, median and std over all sample columns

    norm_df: coverage with the columns Chr, Pos, ExonPos and one column per sample

    returns a new dataframe with the columns meanCov, medianCov and std appended
    '''
    cov = norm_df.set_index(['Chr', 'Pos', 'ExonPos'])
    # compute all statistics before adding any of them as a column,
    # otherwise median and std would include the derived columns
    mean_cov = cov.mean(axis=1)
    median_cov = cov.median(axis=1)
    std_cov = cov.std(axis=1)
    cov['meanCov'] = mean_cov
    cov['medianCov'] = median_cov
    cov['std'] = std_cov
    return cov.reset_index()
# NOTE(review): the mean is computed from norm_df, not from the X-equalized normX_df
# that make_PON_coverage uses at this step - confirm that this is intended
mean_df = add_mean(norm_df)
mean_df.to_csv(f"{output_path}/PON_coverage_mean.csv", sep='\t', index=False)
mean_df

In [None]:
def get_full_exon_pos(df):
    '''
    inserts the column FullExonPos (exonic position accumulated over all chroms)
    as third column; at every change of chromosome the ExonPos of the preceding
    row is added to the running offset
    '''

    # remember the incoming columns for the output
    out_cols = list(df.columns)
    frame = df.reset_index(drop=True)
    # ExonPos of the preceding row (0 for the very first row)
    prev_exon_pos = frame['ExonPos'].shift(1).fillna(0).astype(int)
    # the step only counts where a new chromosome starts
    same_chrom = frame['Chr'] == frame['Chr'].shift(1)
    frame['chromStep'] = prev_exon_pos.where(~same_chrom, 0)
    frame['chromAccum'] = frame['chromStep'].cumsum()
    frame['FullExonPos'] = frame['ExonPos'] + frame['chromAccum']
    out_cols = out_cols[:2] + ['FullExonPos'] + out_cols[2:]
    return frame[out_cols]

In [None]:
full_df = get_full_exon_pos(mean_df)
full_df

### filter the coverage
+ for filtering, all outliers that stray from the local mean by more than max_mean_std are removed
+ other per-position filtering steps should be done afterwards, during the rolling-window analysis:
+ `filter_df = df.query('meanCov > @mincov and std < @max_mean_std')`


In [None]:
mean_df['std'].max()

### remove outliers in order to reduce noise

In [None]:
def remove_outliers(df, std_factor=2.5):
    '''
    blank out sample values that stray too far from the mean and recompute the statistics

    df:          coverage as returned by add_mean: three leading id columns, the sample
                 columns and the three trailing columns meanCov, medianCov and std
    std_factor:  a sample value is set to NaN if it is more than std_factor * std away from meanCov

    returns a new dataframe with the statistics recomputed by add_mean;
    the dataframe passed in is left unchanged
    '''
    samples = df.iloc[:, 3:-3]
    # distance of every sample value from the mean in units of std
    deviation = samples.sub(df['meanCov'], axis=0).abs().div(df['std'], axis=0)
    # a NaN deviation (e.g. std is NaN) never compares as larger, so those values are kept
    kept = samples.mask(deviation > std_factor)
    return add_mean(pd.concat([df.iloc[:, :3], kept], axis=1))

In [None]:
filter_df = remove_outliers(mean_df, std_factor=2)
filter_df.to_csv(f"{output_path}/PON_coverage_filtered.csv", sep='\t', index=False)
filter_df

In [None]:
filter_df['std'].max()

In [None]:
filter_df.loc[:,['Chr', 'Pos', 'ExonPos', 'meanCov', 'medianCov', 'std']]

In [None]:
def make_PON_coverage(sample_list, chrom_list=[f"chr{chrom + 1}" for chrom in range(22)] + ['chrX'], config={
    'normCov': 100,      # value to which the coverages are normalized
    'stdFactor': 3,      # sample values further than stdFactor * std away from meanCov are dropped
    'bed_cov_path': '.'  # the path to the bedCov files
}):
    '''
    build the PON coverage from the bedCov files of all samples

    sample_list:  names of the PON samples
    chrom_list:   provide a different chrom_list if you don't want the standard ['chr1', 'chr2'...]
    config:       dict with the keys normCov, stdFactor and the bedCov folder as
                  sample_PON_path (bed_cov_path is used as fallback); missing keys
                  fall back to the values of the default config

    returns (full_df, filter_df):
    full_df:    normalized coverage of all samples plus meanCov, medianCov, std and FullExonPos
    filter_df:  Chr, Pos, FullExonPos, ExonPos and the statistics recomputed after outlier removal
    '''
    # the default config only provides 'bed_cov_path', so a plain config['sample_PON_path'] would fail for it
    bedCov_path = config.get('sample_PON_path', config.get('bed_cov_path', '.'))
    norm_cov = config.get('normCov', 100)
    std_factor = config.get('stdFactor', 3)

    # load the coverages of all samples for all chromosomes
    cov_df = gather_PONcoverage(chrom_list=chrom_list, sample_list=sample_list, bedCov_path=bedCov_path)

    # normalize, equalize chrX and normalize again, then add mean values and std
    eqX_df = equalize_X(normalize_coverage(cov_df, norm_cov=norm_cov))
    normX_df = normalize_coverage(eqX_df, norm_cov=norm_cov)
    mean_df = add_mean(normX_df)
    # add full exon coords to normalized PON coverage
    full_df = get_full_exon_pos(mean_df)

    filter_df = remove_outliers(mean_df, std_factor=std_factor)
    # remove sample columns and add full exon coords to filtered PON coverage
    filter_df = get_full_exon_pos(filter_df.loc[:, ['Chr', 'Pos', 'ExonPos', 'meanCov', 'medianCov', 'std']])
    return full_df, filter_df

In [None]:
sample_list = [f"{str(s+1).zfill(3)}_B" for s in range(45)]

full_df, filter_df = make_PON_coverage(sample_list, config=pon_config)

In [None]:
full_df[:10]

In [None]:
filter_df[:10]
filter_df.to_csv(f"{output_path}/PON_coverage_filtered.csv", sep='\t', index=False)