## combining the per-chromosome SNP data of each sample into WES-wide SNP data
### also add EB data (EBscore), if available, to filter out bad SNPs

In [None]:
# set the paths
# NOTE: `home` is assigned twice and the second assignment wins, so only
# '/Users/martinscience' is in effect (presumably a per-machine switch).
# NOTE(review): `os` must already be imported when this cell runs; the only
# visible `import os` sits in a later cell.
home = '/Users/mahtin'
home = '/Users/martinscience'
testdata = os.path.join(home,"Dropbox/Icke/Work/somVar/testdata")
tooldata = os.path.join(home, "Dropbox/Icke/Work/somVar/tooldata")
# root folder of the CNV data; combine_Covdata is called with this folder
cnvdata = os.path.join(tooldata, "myCNVdata")
shell_path = "../shell"
static_path = os.path.join(home, "Dropbox/Icke/Work/static")
# target folder of all csv exports written by this notebook
output_path = os.path.join(cnvdata, "output")

# folder holding the per-chromosome <sample>.<chrom>.snp / .snpEB files
cnv_path = os.path.join(cnvdata, "cnv")

In [None]:
# show the resolved cnv folder as the cell output
cnv_path

In [None]:
# peek at the two per-chromosome input files of sample 01_A (chr3)
# NOTE: `test` is overwritten by the second read, and a notebook cell only
# displays its last expression, so only the .snp preview is shown.
test = pd.read_csv(f'{cnv_path}/01_A.chr3.snpEB', sep='\t').loc[:,['Chr', 'Start', 'Ref','EBscore', 'PoN-Alt']]
test[:13]
test = pd.read_csv(f'{cnv_path}/01_A.chr3.snp', sep='\t')
test[:13]

### combine all SNPs of a sample into one DataFrame with the corresponding EBscores

In [None]:
# per-chromosome SNP tables are joined with their EBscores via pd merge
import os

# chromosomes looked up in the per-chromosome input files: chr1..chr22 and chrX
chrom_list = [f"chr{chrom + 1}" for chrom in range(22)] + ['chrX']


def combine_SNPdata(sample, cnv_path):
    """Combine the per-chromosome SNP and EBscore files of one sample.

    For every chromosome in ``chrom_list`` the tab-separated files
    ``<cnv_path>/<sample>.<chrom>.snp`` and
    ``<cnv_path>/<sample>.<chrom>.snpEB`` are read and inner-merged on
    Chr/Start/Ref/Alt. A chromosome for which either file is missing is
    skipped entirely.

    Args:
        sample: sample name used as the file prefix (e.g. "01_A").
        cnv_path: folder containing the .snp and .snpEB files.

    Returns:
        One DataFrame over all chromosomes found, restricted to the columns
        Chr, Start, ExonPos, Ref, Depth, Alt, VAF, EBscore and PoN-Alt.

    Raises:
        ValueError: if no chromosome provides both input files.
    """
    snp_dfs = []
    # The prefix is built from the cnv_path argument; the previous code used
    # an undefined name `snp_path` here and ignored the parameter.
    file_base = os.path.join(cnv_path, sample)
    for chrom in chrom_list:
        # reading SNP
        snp_file = f"{file_base}.{chrom}.snp"
        if not os.path.isfile(snp_file):
            continue
        print(f"Reading SNP VAF from {chrom} of sample {sample} from {snp_file}.")
        snp_df = pd.read_csv(snp_file, sep='\t')
        # The raw Alt field is a base letter followed by digits; split it into
        # the base ('Alt', needed as merge key) and the digits ('AltDepth').
        snp_df[['Alt', 'AltDepth']] = snp_df['Alt'].str.extract(r"([AGCT])([0-9]+)")

        # reading snpEB
        eb_file = f"{file_base}.{chrom}.snpEB"
        if not os.path.isfile(eb_file):
            # NOTE(review): SNPs of a chromosome without EB data are dropped,
            # not kept without scores -- confirm this is intended.
            continue
        print(f"Reading SNP EBscores from {chrom} of sample {sample} from {eb_file}.")
        eb_cols = ['Chr', 'Start', 'Ref', 'Alt', 'EBscore', 'PoN-Alt']
        snpEB_df = pd.read_csv(eb_file, sep='\t').loc[:, eb_cols]
        # default inner merge: only SNPs present in both files survive
        snp_dfs.append(snp_df.merge(snpEB_df, on=['Chr', 'Start', 'Ref', 'Alt']))

    if not snp_dfs:
        # pd.concat([]) would raise a ValueError as well, but without context
        raise ValueError(
            f"No .snp/.snpEB file pairs found for sample {sample} in {cnv_path}."
        )
    combined_df = pd.concat(snp_dfs)
    out_cols = ["Chr", "Start", "ExonPos", "Ref", "Depth", "Alt", "VAF", "EBscore", "PoN-Alt"]
    return combined_df.loc[:, out_cols]

In [None]:
# combine the SNP data of sample 01_A and show the result as the cell output
sample = "01_A"
snp_df = combine_SNPdata(sample, cnv_path)
snp_df

### merge sample coverage with PON coverage

In [None]:
def combine_Covdata(sample, cnv_data_path):
    """Merge the per-chromosome coverage of a sample with the PON coverage.

    For every chromosome in ``chrom_list`` the gzip-compressed, tab-separated
    files ``<cnv_data_path>/cnv/<sample>.<chrom>.cov`` and
    ``<cnv_data_path>/chromCov/<chrom>.filtered.csv.gz`` are read and
    inner-merged on Chr/Pos/ExonPos. A chromosome for which either file is
    missing is skipped. The sample coverage is scaled per chromosome to a
    mean of 100 before the chromosomes are concatenated.

    Args:
        sample: sample name used as the file prefix (e.g. "01_A").
        cnv_data_path: folder containing the ``cnv`` and ``chromCov`` folders.

    Returns:
        DataFrame with the columns Chr, Pos, ExonPos, Coverage, PONmeanCov,
        PONmedianCov, PONstd, log2ratio (log2 of Coverage / PONmeanCov) and
        the rolling means rr100, rr200 and rr500 of log2ratio.

    Raises:
        ValueError: if no chromosome provides both input files.
    """
    merge_keys = ['Chr', 'Pos', 'ExonPos']
    cover_dfs = []

    for chrom in chrom_list:
        # reading sampleCoverage
        sample_cov_file = os.path.join(cnv_data_path, f"cnv/{sample}.{chrom}.cov")
        if not os.path.isfile(sample_cov_file):
            continue
        print(f"Reading coverage from {chrom} of sample {sample} from {sample_cov_file}.")
        # the .cov file is gzip-compressed despite its extension
        cov_df = pd.read_csv(sample_cov_file, sep='\t', compression="gzip")

        # reading PONcoverage
        pon_cov_file = os.path.join(cnv_data_path, f"chromCov/{chrom}.filtered.csv.gz")
        if not os.path.isfile(pon_cov_file):
            continue
        print(f"Reading PON coverage of {chrom} from file {pon_cov_file}.")
        pon_df = pd.read_csv(pon_cov_file, sep='\t', compression="gzip").loc[
            :, merge_keys + ['meanCov', 'medianCov', 'std']
        ]
        # prefix the statistics columns (everything after the three keys)
        # with "PON" to tell them apart from the sample columns
        pon_df = pon_df.rename(columns={col: f"PON{col}" for col in pon_df.columns[3:]})

        # merge sample with PON coverage; only shared positions are kept
        sample_df = cov_df.merge(pon_df, on=merge_keys, how="inner").loc[
            :, merge_keys + ['Coverage', 'PONmeanCov', 'PONmedianCov', 'PONstd']
        ]

        # normalize the coverage of this chromosome to a mean of 100
        sample_df['Coverage'] = sample_df['Coverage'] / sample_df['Coverage'].mean() * 100
        cover_dfs.append(sample_df)

    if not cover_dfs:
        # pd.concat([]) would raise a ValueError as well, but without context
        raise ValueError(
            f"No coverage/PON file pairs found for sample {sample} in {cnv_data_path}."
        )
    cover_df = pd.concat(cover_dfs)
    # the previous code referenced a misspelled `covere_df` here (NameError)
    cover_df['log2ratio'] = np.log2(cover_df['Coverage'] / cover_df['PONmeanCov'])
    # NOTE(review): the rolling windows run over the concatenated frame, so
    # the first rows of a chromosome are averaged with the tail of the
    # previous one -- confirm this is acceptable.
    for window in (100, 200, 500):
        cover_df[f'rr{window}'] = cover_df['log2ratio'].rolling(window).mean()

    return cover_df

In [None]:
# combine the coverage of sample 01_A; the result is only shown, not stored.
# The data root `cnvdata` is passed (not `cnv_path`) because combine_Covdata
# appends the "cnv/" and "chromCov/" sub folders itself.
sample = "01_A"
combine_Covdata(sample, cnvdata)

In [None]:
# write the SNP table of sample 03_A as tab-separated file and display it
# NOTE(review): `combine_heteroSNP` is not defined in the visible cells --
# presumably defined elsewhere or a stale name; verify before running.
snp_df = combine_heteroSNP("03_A")
snp_df.to_csv(f"{output_path}/03_A.snp.csv", sep='\t', index=False)
snp_df

### make a stacked ("tidy") version of the coverage df for visualization in tidyverse

In [None]:
def tidy_df(df):
    """Reshape a wide coverage frame into long ("tidy") format.

    The columns 'Chr' and 'Pos' are dropped and every remaining column except
    'ExonPos' is stacked, giving one row per (ExonPos, column) pair.

    Args:
        df: DataFrame with the columns 'Chr', 'Pos', 'ExonPos' and one
            column per value series.

    Returns:
        DataFrame with the columns 'ExonPos', 'sample' (the former column
        name) and 'Coverage' (the value).
    """
    wide = df.drop(columns=['Chr', 'Pos']).set_index('ExonPos')
    # stacking moves the column labels into an unnamed second index level,
    # which reset_index exposes as 'level_1' next to the value column 0
    long = wide.stack().reset_index()
    new_names = {'level_1': 'sample', 0: 'Coverage'}
    return long.rename(columns=new_names)

# export the tidy coverage table as tab-separated file
# NOTE(review): `cov_df` is not assigned at notebook level in the visible
# cells -- presumably the raw PON coverage frame from elsewhere; verify.
tidy_df(cov_df).to_csv(f"{output_path}/PON_coverage_tidy.csv", sep='\t', index=False)

### normalize the coverage of each sample to a mean coverage of 100

In [None]:
def normalize_coverage(cov_df):
    """Scale every coverage column to a mean of 100.

    Args:
        cov_df: DataFrame with the key columns 'Chr', 'Pos', 'ExonPos' and
            one numeric column per coverage series.

    Returns:
        DataFrame of the same layout in which each non-key column has been
        divided by its own mean and multiplied by 100.
    """
    key_cols = ['Chr', 'Pos', 'ExonPos']
    # move the keys into the index so that only coverage columns are scaled
    coverage = cov_df.set_index(key_cols)
    # dividing by the Series of column means aligns on the column labels
    column_means = coverage.mean()
    scaled = coverage / column_means * 100
    return scaled.reset_index()

In [None]:
# normalize the coverage frame and show the result as the cell output
norm_df = normalize_coverage(cov_df)
norm_df

In [None]:
# export the normalized coverage in tidy format as tab-separated file
tidy_df(norm_df).to_csv(f"{output_path}/PON_coverage_normtidy.csv", sep='\t', index=False)

### compute the mean of all the coverages

In [None]:
def add_mean(norm_df):
    """Add per-position mean, median and standard deviation of the coverage.

    All three statistics are computed row-wise over the coverage columns
    only. The previous implementation added the columns one after another,
    so 'medianCov' included the new 'meanCov' column and 'std' included both
    'meanCov' and 'medianCov'.

    Args:
        norm_df: DataFrame with the key columns 'Chr', 'Pos', 'ExonPos' and
            one numeric column per coverage series.

    Returns:
        A new DataFrame with the input columns plus 'meanCov', 'medianCov'
        and 'std'; the input frame is left unchanged.
    """
    samples = norm_df.set_index(['Chr', 'Pos', 'ExonPos'])
    # every statistic is evaluated on `samples` before any column is added
    stats_df = samples.assign(
        meanCov=samples.mean(axis=1),
        medianCov=samples.median(axis=1),
        std=samples.std(axis=1),
    )
    return stats_df.reset_index()
# add the summary columns, export in tidy format and show the wide frame
# NOTE: tidy_df stacks every non-key column, so 'meanCov', 'medianCov' and
# 'std' appear as values of the 'sample' column in the exported file.
mean_df = add_mean(norm_df)
tidy_df(mean_df).to_csv(f"{output_path}/PON_coverage_mean.csv", sep='\t', index=False)
mean_df

### filter the coverage

In [None]:
# largest per-position std -- presumably inspected to pick a sensible maxstd
# for the coverage filter
mean_df['std'].max()

In [None]:
def filter_coverage(df, mincov, maxstd):
    """Keep the rows with sufficient mean coverage and limited spread.

    Args:
        df: DataFrame with the columns 'meanCov' and 'std'.
        mincov: exclusive lower bound for 'meanCov'.
        maxstd: exclusive upper bound for 'std'.

    Returns:
        The rows of ``df`` for which meanCov > mincov and std < maxstd.
    """
    enough_coverage = df['meanCov'] > mincov
    # bracket access is required: `df.std` is the DataFrame method
    low_spread = df['std'] < maxstd
    return df[enough_coverage & low_spread]

In [None]:
# keep positions with meanCov > 20 and std < 50, then export in tidy format
filter_df = filter_coverage(mean_df, mincov=20, maxstd=50)
tidy_df(filter_df).to_csv(f"{output_path}/PON_coverage_filter.csv", sep='\t', index=False)