## combining chromosome SNP data per sample into WES SNP data
### also add EB data if available to filter out bad SNPs

In [None]:
# set up the imports and the paths used by the following cells
# (os and pandas are needed from this cell onwards, so they are imported here
# instead of relying on a later cell having been run first)
import os

import pandas as pd

# home directory of the machine the notebook runs on
# (the alternative machine uses '/Users/mahtin')
home = '/Users/martinscience'

# data folders below the home directory
testdata = os.path.join(home, "Dropbox/Icke/Work/somVar/testdata")
tooldata = os.path.join(home, "Dropbox/Icke/Work/somVar/tooldata")
cnvdata = os.path.join(tooldata, "myCNVdata")
shell_path = "../shell"
static_path = os.path.join(home, "Dropbox/Icke/Work/static")
output_path = os.path.join(cnvdata, "output")

# input folders for the per-chromosome SNP files and the EB files
snp_path = os.path.join(cnvdata, "cnv")
eb_path = os.path.join(cnvdata, "EB")

In [None]:
# show both input folders; a notebook cell only displays its last bare
# expression, so the two paths are combined into one tuple
snp_path, eb_path

In [None]:
# read a single per-chromosome SNP file (tab-separated) and show its first 13 rows
test_file = f'{snp_path}/01_A.chr3.snp'
test = pd.read_csv(test_file, sep='\t')
test.head(13)

### combine all snp for a sample into one df

In [None]:
# combine the per-chromosome files with pd.concat
import os

# chr1 .. chr22 plus chrX, in the order in which the files are concatenated
chrom_list = [f"chr{chrom + 1}" for chrom in range(22)] + ['chrX']


def combine_heteroSNP(sample, snp_dir=None):
    """Concatenate all per-chromosome SNP files of one sample.

    Files are expected at ``<snp_dir>/<sample>.<chrom>.snp`` (tab-separated)
    for every chromosome in ``chrom_list``; chromosomes without a file are
    skipped.

    Args:
        sample: sample name used as the file name prefix (e.g. ``"01_A"``).
        snp_dir: folder containing the ``.snp`` files; defaults to the
            notebook-level ``snp_path``.

    Returns:
        One DataFrame with the rows of all found files in ``chrom_list``
        order and a fresh 0..n-1 index.

    Raises:
        ValueError: if no SNP file exists for ``sample`` in ``snp_dir``.
    """
    if snp_dir is None:
        snp_dir = snp_path
    file_base = os.path.join(snp_dir, sample)

    snp_dfs = []
    for chrom in chrom_list:
        file = f"{file_base}.{chrom}.snp"
        if not os.path.isfile(file):
            continue
        print(f"Reading {chrom} of sample {sample} from {file}.")
        snp_dfs.append(pd.read_csv(file, sep='\t'))

    # pd.concat([]) only fails with a generic message, so name the sample and folder
    if not snp_dfs:
        raise ValueError(
            f"No SNP files found for sample {sample} in {snp_dir}."
        )
    # every file starts its own index at 0; ignore_index avoids duplicated row labels
    return pd.concat(snp_dfs, ignore_index=True)

In [None]:
# choose the sample to process and combine its per-chromosome SNP files
sample = "01_A"
snp_df = combine_heteroSNP(sample)
# display the combined table
snp_df

### load the EB data for that sample and merge

In [None]:
# the EB file of the current sample is tab-separated despite its .csv extension
eb_file = os.path.join(eb_path, f"{sample}-B.EB.csv")
eb_df = pd.read_csv(eb_file, sep='\t')
eb_df

### merge has to be done on Chr, Start and Alt as EB contains duplicates for Chr+Start with different Alt

In [None]:
# flag every row whose (Chr, Start) pair occurs more than once and show all of them
is_dup_pos = eb_df.duplicated(subset=['Chr', 'Start'], keep=False)
eb_df[is_dup_pos]

### convert SNP Alt into standard Alt + AltDepth for joining

In [None]:
# split the SNP Alt value into the base letter (new Alt) and the digits that follow it (AltDepth)
# NOTE: the pattern needs a base followed by digits, so running this cell a second
# time on the already converted Alt column no longer matches
alt_parts = snp_df['Alt'].str.extract(r"([AGCT])([0-9]+)")
snp_df[['Alt', 'AltDepth']] = alt_parts
snp_df

In [None]:
# left join: keep every SNP row and attach the EB columns where Chr, Start, Ref and Alt agree
join_keys = ['Chr', 'Start', 'Ref', 'Alt']
merge_df = pd.merge(snp_df, eb_df, how='left', on=join_keys)
merge_df

In [None]:
# keep only rows that have an EBscore (i.e. a non-missing value after the
# left merge) and a VAF above 0.05; notna() states the missing-value test
# explicitly instead of relying on NaN != NaN
has_eb = merge_df['EBscore'].notna()
merge_df[has_eb].query('VAF > 0.05')

In [None]:
# write the SNP table of sample 01_A into the heteroSNP output folder
hetero_dir = f"{output_path}/heteroSNP"
# to_csv does not create missing folders, so make sure the target exists
os.makedirs(hetero_dir, exist_ok=True)
snp_df.to_csv(f"{hetero_dir}/01_A.snp.csv", sep='\t', index=False)
# display the table that was written (the original referenced the undefined name snp_dfaaa)
snp_df

In [None]:
# combine the per-chromosome SNP files of sample 03_A and export them
# NOTE(review): unlike the 01_A export this file goes directly into output_path
# (no heteroSNP subfolder) and is written without the Alt/AltDepth split - confirm intent
out_file = f"{output_path}/03_A.snp.csv"
snp_df = combine_heteroSNP("03_A")
snp_df.to_csv(out_file, sep='\t', index=False)
snp_df

### make a stacked "tidy version" of the coverage df for visualization in tidyverse

In [None]:
def tidy_df(df):
    """Reshape a wide coverage table into long format.

    The 'Chr' and 'Pos' columns are dropped, 'ExonPos' identifies the row and
    every remaining column is stacked into one row per (ExonPos, column).

    Returns:
        DataFrame with the columns 'ExonPos', 'sample' (the former column
        name) and 'Coverage' (the cell value).
    """
    per_exon = df.drop(columns=['Chr', 'Pos']).set_index('ExonPos')
    stacked = per_exon.stack()
    long_df = stacked.reset_index()
    return long_df.rename(columns={'level_1':'sample', 0:'Coverage'})

# export the long-format coverage table as a tab-separated file
# NOTE(review): cov_df is not defined in the visible part of this notebook - it has to be loaded before this cell runs
tidy_df(cov_df).to_csv(f"{output_path}/PON_coverage_tidy.csv", sep='\t', index=False)

### normalize the coverage so that every sample column has a mean coverage of 100

In [None]:
def normalize_coverage(cov_df):
    """Scale every value column so that its mean becomes 100.

    'Chr', 'Pos' and 'ExonPos' are treated as row identifiers and are left
    untouched; every other column is divided by its own mean and multiplied
    by 100.

    Returns:
        A new DataFrame with the identifier columns followed by the scaled
        value columns.
    """
    id_cols = ['Chr','Pos','ExonPos']
    values = cov_df.set_index(id_cols)
    column_means = values.mean()
    scaled = values.div(column_means, axis='columns').mul(100)
    return scaled.reset_index()

In [None]:
# normalize the coverage table and display the result
norm_df = normalize_coverage(cov_df)
norm_df

In [None]:
# export the normalized coverage in long format as a tab-separated file
tidy_df(norm_df).to_csv(f"{output_path}/PON_coverage_normtidy.csv", sep='\t', index=False)

### compute the mean of all the coverages

In [None]:
def add_mean(norm_df):
    """Add row-wise mean, median and standard deviation of the coverage columns.

    'Chr', 'Pos' and 'ExonPos' are treated as row identifiers; all remaining
    columns are the values the statistics are computed from.

    Args:
        norm_df: coverage table with the identifier columns plus one column
            per coverage series.

    Returns:
        A new DataFrame with the original columns followed by 'meanCov',
        'medianCov' and 'std'. The input is not modified.
    """
    values = norm_df.set_index(['Chr', 'Pos', 'ExonPos'])
    # all three statistics are evaluated on the original value columns before
    # any of them is attached, so 'meanCov' does not leak into the median and
    # neither derived column leaks into the standard deviation
    with_stats = values.assign(
        meanCov=values.mean(axis=1),
        medianCov=values.median(axis=1),
        std=values.std(axis=1),
    )
    return with_stats.reset_index()
# add the summary columns to the normalized coverage, export the long format and display the wide table
mean_df = add_mean(norm_df)
# tidy_df stacks every non-identifier column, so the summary columns also appear as 'sample' entries in this file
tidy_df(mean_df).to_csv(f"{output_path}/PON_coverage_mean.csv", sep='\t', index=False)
mean_df

### filter the coverage

In [None]:
# largest row-wise standard deviation, as orientation for choosing the maxstd threshold
mean_df['std'].max()

In [None]:
def filter_coverage(df, mincov, maxstd):
    """Keep the rows with sufficient mean coverage and limited spread.

    Args:
        df: table containing the columns 'meanCov' and 'std'.
        mincov: rows are kept only if 'meanCov' is strictly greater.
        maxstd: rows are kept only if 'std' is strictly smaller.

    Returns:
        The matching rows of ``df`` with their original index labels.
    """
    enough_coverage = df['meanCov'] > mincov
    low_spread = df['std'] < maxstd
    return df[enough_coverage & low_spread]

In [None]:
# keep rows with meanCov > 20 and std < 50, then export them in long format
filter_df = filter_coverage(mean_df, mincov=20, maxstd=50)
tidy_df(filter_df).to_csv(f"{output_path}/PON_coverage_filter.csv", sep='\t', index=False)