## combining individual coverage data into a mean-normalized coverage

## Plan:

+ for the panel of normals (PON), create a coverage file for each PON bam and each chromosome
+ PON bams should ideally not contain CNVs; otherwise there should be many of them to reduce the std
+ put all the files into one big matrix, normalize the coverages and produce an average coverage (+ std) for the exonic space
+ compare the tumor samples against that PON coverage to detect differences caused by CNVs

### have to look at the Kenichi-CNV-graphs to identify the samples from FDAH1 without CNVs in the normals
+ [1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

### alternative approach
+ use all samples and remove the outliers based on the std

In [None]:
# set the paths
# NOTE: `home` is assigned twice; only the second assignment takes effect.
# The first line is presumably kept to switch between machines - confirm before removing.
home = '/Users/mahtin'
home = '/Users/martinscience'
testdata = f"{home}/Dropbox/Icke/Work/somVar/testdata"
tooldata = f"{home}/Dropbox/Icke/Work/somVar/tooldata"
shell_path = "../shell"
static_path = f"{home}/Dropbox/Icke/Work/static"
bed_path = f"{static_path}/bed_files/SureSelect/hg38"

# NOTE: cnvdata ends with "/", so the paths derived from it below contain a double slash
cnvdata = f"{tooldata}/myCNVdata/"
# NOTE: bedCov_path is also assigned twice; the folder under {home}/mount/scratch wins
bedCov_path = f"{cnvdata}/bedCov"
bedCov_path = f"{home}/mount/scratch/develop/PONcoverage/bedCov"
# folder that the coverage tables produced in this notebook are written to
output_path = f"{cnvdata}/output/PONcoverage"

In [None]:
# display the bedCov folder that is currently in effect
bedCov_path

In [None]:
# load one gzip-compressed, tab-separated bedCov file (sample 009_B, chr5) as a sanity check
test = pd.read_csv(f'{bedCov_path}/009_B.chr5.bedCov', sep='\t', compression="gzip")
# show the first 13 rows
test[:13]

### get all the normal samples from the PON list into df for normalization and averaging

In [None]:
# try with pd merge
import os
def combine_coverage(PONcov_path, chrom, sample_list):
    """Merge the per-sample coverage files of one chromosome into a wide table.

    For every sample the file ``<PONcov_path>/<sample>.<chrom>.bedCov`` is read
    as a gzip-compressed, tab-separated table; its ``Coverage`` column is
    renamed to the sample name and outer-merged on ``ExonPos``/``Pos`` into the
    running result. Samples without a file are skipped without a message.

    Args:
        PONcov_path: folder containing the bedCov files.
        chrom: chromosome name used in the file name and in the ``Chr`` column.
        sample_list: iterable of sample names.

    Returns:
        DataFrame sorted by ``ExonPos`` with the columns ``Chr``, ``Pos``,
        ``ExonPos`` followed by one coverage column per sample that was read;
        positions missing in a sample are filled with 0.
    """
    key_cols = ['ExonPos', 'Pos']
    merged = pd.DataFrame(columns=key_cols)

    for sample in sample_list:
        cov_file = os.path.join(PONcov_path, f"{sample}.{chrom}.bedCov")
        if os.path.isfile(cov_file):
            print(f"Reading {sample} from {cov_file}.")
            sample_df = pd.read_csv(cov_file, sep='\t', compression='gzip')
            sample_df = sample_df.loc[:, ['Pos', 'ExonPos', 'Coverage']]
            sample_df = sample_df.rename(columns={'Coverage': sample})
            merged = merged.merge(sample_df, on=key_cols, how='outer')

    merged = merged.fillna(0).sort_values('ExonPos')
    merged['Chr'] = chrom

    # everything between the two key columns and the trailing 'Chr' is a sample column
    sample_cols = list(merged.columns)[2:-1]
    return merged.loc[:, ['Chr', 'Pos', 'ExonPos'] + sample_cols]

In [None]:
# sample names 000_B ... 044_B (zero-padded to three digits)
sample_list = [f"{s:03d}_B" for s in range(45)]
# combine all chr7 coverage files that exist for these samples
cov_df = combine_coverage(bedCov_path, "chr7", sample_list)
cov_df

### make a stacked "tidy version" of the coverage df for visualization in tidyverse

In [None]:
def tidy_df(df):
    """Return a stacked (long-format) version of a wide coverage table.

    The ``Chr`` and ``Pos`` columns are dropped, ``ExonPos`` becomes the row
    key and every remaining column is stacked into rows.

    Returns:
        DataFrame with the columns ``ExonPos``, ``sample`` (the former column
        name) and ``Coverage`` (the former cell value).
    """
    wide = df.drop(columns=['Chr', 'Pos']).set_index('ExonPos')
    long_df = wide.stack().reset_index()
    return long_df.rename(columns={'level_1': 'sample', 0: 'Coverage'})

# save the wide per-sample coverage table as a tab-separated file
cov_df.to_csv(f"{output_path}/PON_coverage.csv", sep='\t', index=False)

# save the stacked (long-format) version of the same table
tidy_df(cov_df).to_csv(f"{output_path}/PON_coverage_tidy.csv", sep='\t', index=False)

# note: the data could also be tidied in R with gather()

### normalize each sample to a mean coverage of 100

In [None]:
def normalize_coverage(cov_df, norm_cov=100):
    """Rescale every coverage column so that its mean equals ``norm_cov``.

    Args:
        cov_df: wide coverage table with the key columns ``Chr``, ``Pos`` and
            ``ExonPos``; all other columns are rescaled.
        norm_cov: mean coverage each column has after rescaling.

    Returns:
        A new DataFrame with the same columns; the key columns are unchanged.
    """
    values = cov_df.set_index(['Chr', 'Pos', 'ExonPos'])
    column_means = values.mean()
    # divide first, then multiply: same operation order as `values / mean * norm_cov`
    scaled = values.div(column_means).mul(norm_cov)
    return scaled.reset_index()

In [None]:
# normalize every sample column of cov_df with the default norm_cov and display the result
norm_df = normalize_coverage(cov_df)
norm_df

In [None]:
# save the normalized coverage table in wide and in stacked form (tab-separated)
norm_df.to_csv(f"{output_path}/PON_coverage_norm.csv", sep='\t', index=False)
tidy_df(norm_df).to_csv(f"{output_path}/PON_coverage_normtidy.csv", sep='\t', index=False)

### compute the mean of all the coverages

In [None]:
def add_mean(norm_df):
    """Append per-position summary statistics of the sample coverages.

    Args:
        norm_df: wide coverage table with the key columns ``Chr``, ``Pos`` and
            ``ExonPos``; every other column is treated as a sample column.

    Returns:
        A new DataFrame with the original columns followed by ``meanCov``,
        ``medianCov`` and ``std`` (row-wise mean, median and standard
        deviation across the sample columns). The input is not modified.
    """
    samples = norm_df.set_index(['Chr', 'Pos', 'ExonPos'])
    # All three statistics are evaluated on the sample columns only, before any
    # summary column exists; adding them one by one would feed meanCov into the
    # median and meanCov/medianCov into the std.
    with_stats = samples.assign(
        meanCov=samples.mean(axis=1),
        medianCov=samples.median(axis=1),
        std=samples.std(axis=1),
    )
    return with_stats.reset_index()
# add the summary columns to the normalized table, save it in wide and stacked form, then display it
mean_df = add_mean(norm_df)
mean_df.to_csv(f"{output_path}/PON_coverage_mean.csv", sep='\t', index=False)
tidy_df(mean_df).to_csv(f"{output_path}/PON_coverage_mean.tidy.csv", sep='\t', index=False)
mean_df

### filter the coverage

In [None]:
# largest per-position std, useful for choosing the max_mean_std filter threshold below
mean_df['std'].max()

In [None]:
def filter_coverage(df, mincov=20, max_mean_std=20):
    """Keep only the rows with sufficient mean coverage and low spread.

    Args:
        df: table containing the columns ``meanCov`` and ``std``.
        mincov: rows are kept only if ``meanCov`` is strictly above this value.
        max_mean_std: rows are kept only if ``std`` is strictly below this value.

    Returns:
        The rows of ``df`` satisfying both conditions, with the original index.
    """
    enough_coverage = df['meanCov'] > mincov
    low_spread = df['std'] < max_mean_std
    return df[enough_coverage & low_spread]

In [None]:
# keep positions with meanCov > 50 and std < 40, then save the result in wide and stacked form
filter_df = filter_coverage(mean_df, mincov=50, max_mean_std=40)
filter_df.to_csv(f"{output_path}/PON_coverage_filter.csv", sep='\t', index=False)
tidy_df(filter_df).to_csv(f"{output_path}/PON_coverage_filter.tidy.csv", sep='\t', index=False)

### remove outliers in order to reduce noise

In [None]:
def remove_outliers(df, std_factor=2.5):
    """Blank out outlying sample values and recompute the summary columns.

    A sample value counts as an outlier when it lies more than
    ``std_factor * std`` away from ``meanCov`` of its row; such values are
    replaced by NaN.

    Args:
        df: table whose first three columns are the key columns and whose last
            three columns are the summary columns (``meanCov``, ``medianCov``,
            ``std``); the columns in between are treated as sample columns.
        std_factor: allowed distance from the row mean in units of the row std.

    Returns:
        The table returned by ``add_mean`` for the cleaned sample columns.
        Unlike the previous in-place version, ``df`` itself is left untouched.
    """
    sample_cols = list(df.columns)[3:-3]
    # work on a copy without the stale summary columns so the caller's frame
    # (often a filtered slice of a larger table) is never written to
    cleaned = df.iloc[:, :-3].copy()
    for col in sample_cols:
        deviation = (df['meanCov'] - df[col]).abs() / df['std']
        # mask() replaces flagged values with NaN and upcasts the column if needed
        cleaned[col] = df[col].mask(deviation > std_factor)
    return add_mean(cleaned)

In [None]:
# blank out values more than 2 std away from the row mean, save wide and stacked tables, then display
removed_df = remove_outliers(filter_df, std_factor=2)
removed_df.to_csv(f"{output_path}/PON_coverage_removed.csv", sep='\t', index=False)
tidy_df(removed_df).to_csv(f"{output_path}/PON_coverage_removed.tidy.csv", sep='\t', index=False)
removed_df

In [None]:
def make_PON_coverage(chrom, sample_list, config=None):
    """Build, filter and save the PON coverage table for one chromosome.

    Runs combine_coverage -> normalize_coverage -> add_mean -> filter_coverage
    -> remove_outliers and writes the result as a tab-separated file
    ``PON_coverage.<chrom>.removed.csv``.

    Args:
        chrom: chromosome name, e.g. ``"chr7"``.
        sample_list: sample names whose coverage files are combined.
        config: optional dict overriding any of these defaults:
            ``sample_PON_path`` ('.'): folder with the sample/chrom coverages
            ``outpath`` ('.'): folder the coverage file is saved to
                (``output_path`` is accepted as an alias and takes precedence)
            ``normCov`` (100): value the coverages are normalized to
            ``minCov`` (20): only exon positions with a mean coverage above
                minCov are kept
            ``max_mean_std`` (20): only exon positions with a coverage std
                below max_mean_std are kept
            ``std_factor`` (3): only values within std_factor * std around
                the mean coverage are kept

    Returns:
        The filtered DataFrame that was written to disk.
    """
    defaults = {
        'sample_PON_path': '.',
        'outpath': '.',
        'normCov': 100,
        'minCov': 20,
        'max_mean_std': 20,
        'std_factor': 3,
    }
    # merge into a fresh dict: no shared mutable default, and partial configs work
    settings = {**defaults, **(config or {})}
    # take the output folder from the config instead of relying on a global variable
    out_dir = settings.get('output_path', settings['outpath'])

    # load all sample coverages for one chromosome
    cov_df = combine_coverage(settings['sample_PON_path'], chrom, sample_list)

    # normalize and add mean values and std
    mean_df = add_mean(normalize_coverage(cov_df, norm_cov=settings['normCov']))

    # filter hard regions and outlying data points
    filter_df = filter_coverage(
        mean_df,
        mincov=settings['minCov'],
        max_mean_std=settings['max_mean_std'],
    )
    final_df = remove_outliers(filter_df, std_factor=settings['std_factor'])

    # output
    file_name = f"{out_dir}/PON_coverage.{chrom}.removed.csv"
    print(f"Saving filtered PON file {file_name} to {out_dir}")
    final_df.to_csv(file_name, sep='\t', index=False)
    return final_df

In [None]:
# settings for the PON coverage run on chr7
config = dict(
    sample_PON_path=bedCov_path,
    output_path=output_path,
    normCov=100,
    minCov=20,
    max_mean_std=20,
    std_factor=2,
)
# sample names 000_B ... 044_B (zero-padded to three digits)
sample_list = [f"{s:03d}_B" for s in range(45)]

df = make_PON_coverage("chr7", sample_list, config)

In [None]:
# display the filtered PON coverage table returned by make_PON_coverage
df