## combining chromosome SNP data per sample into WES SNP data
### also add EB data if available to filter out bad SNPs

In [None]:
# imports needed by this and all following cells
import os

import numpy as np
import pandas as pd

# HOME
# the last assignment wins; reorder the two lines to switch machines
home = '/Users/mahtin'
home = '/Users/martinscience'
# standard paths
testdata = os.path.join(home, "Dropbox/Icke/Work/somVar/testdata")
static_path = os.path.join(home, "Dropbox/Icke/Work/static")
# absolute mount point, stated directly:
# os.path.join(home, <absolute path>) discards home and returns the absolute path anyway
cluster_path = "/Users/martinscience/mount"
tooldata = os.path.join(home, "Dropbox/Icke/Work/somVar/tooldata")

# tool-specific paths
shell_path = "../shell"
cnvdata = os.path.join(tooldata, "myCNVdata")
output_path = os.path.join(cnvdata, "output")

# the path to the input data (read from the cluster mount)
# local copies would live in the "cnv" and "chromCov" subfolders of cnvdata
cnv_path = os.path.join(cluster_path, "scratch/develop/PONcoverage/cnv")
cnvPON_path = os.path.join(cluster_path, "scratch/develop/PONcoverage/chromCov")

In [None]:
# display the folder the PON coverage files are read from
cnvPON_path

# A) combine the data from individual chromosomes into one exome-wide df

## 1) coverage: merge sample coverage with PON coverage

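### make full exome coordinates (FullExonPos)
+ get the last ExonPos of each chromosome
+ make a running sum of these values over the chromosomes
+ add that running sum to the ExonPos of the following chromosomes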

In [None]:
def get_full_exon_pos(df):
    '''
    returns a re-indexed copy of df with the exome-wide position FullExonPos
    inserted as third column and the per-row offset chromAccum appended as last column
    expects the columns Chr and ExonPos, rows of one chromosome being contiguous
    '''

    original_cols = list(df.columns)
    # a clean 0..n-1 index so that shift() really compares direct neighbours
    df = df.reset_index(drop=True)

    # ExonPos of the preceding row (0 for the very first row)
    prev_exon_pos = df['ExonPos'].shift(1).fillna(0).astype(int)
    # True on the first row of every chromosome
    starts_new_chrom = df['Chr'] != df['Chr'].shift(1)
    # only the first row of a chromosome carries the last ExonPos of its predecessor
    df['chromStep'] = prev_exon_pos.where(starts_new_chrom, 0)

    # running sum of the steps = offset to add to every position of a chromosome
    df['chromAccum'] = df['chromStep'].cumsum()
    df['FullExonPos'] = df['ExonPos'] + df['chromAccum']

    out_cols = [*original_cols[:2], 'FullExonPos', *original_cols[2:], 'chromAccum']
    return df[out_cols]

In [None]:
def combine_Covdata(sample, sample_cnv_path="", PON_cnv_path="", verbose=False, filtered=True):
    '''
    merges the per-chromosome coverage of a sample with the matching PON coverage
    into one exome-wide dataframe

    for every chromosome in chrom_list the file <sample_cnv_path>/<sample>.<chrom>.cov
    is inner-merged on (Chr, Pos, ExonPos) with <PON_cnv_path>/<chrom>.<filtered|full>.csv.gz
    (both read as gzipped, tab-separated tables)
    chromosomes lacking one of the two files are reported and skipped
    the sample coverage is scaled per chromosome to a mean of 100 and
    log2ratio = log2(Coverage / PONmeanCov) is added to the combined data

    sample:           sample name used as file prefix
    sample_cnv_path:  folder containing the sample coverage files
    PON_cnv_path:     folder containing the PON coverage files
    verbose:          print every file that is read
    filtered:         read the "filtered" PON files if True, the "full" ones otherwise

    returns the combined dataframe as produced by get_full_exon_pos
    raises ValueError if not a single chromosome could be loaded
    '''

    # the PON file flavour is the same for all chromosomes
    full_or_filtered = "filtered" if filtered else "full"
    cover_dfs = []

    for chrom in chrom_list:
        # reading sampleCoverage
        sample_cov_file = os.path.join(sample_cnv_path, f"{sample}.{chrom}.cov")
        if not os.path.isfile(sample_cov_file):
            print("No file", sample_cov_file)
            continue
        if verbose:
            print(f"Reading coverage from {chrom} of sample {sample} from {sample_cov_file}.")
        cov_df = pd.read_csv(sample_cov_file, sep='\t', compression="gzip")

        # reading PONcoverage
        pon_cov_file = os.path.join(PON_cnv_path, f"{chrom}.{full_or_filtered}.csv.gz")
        if not os.path.isfile(pon_cov_file):
            print("No file", pon_cov_file)
            continue
        if verbose:
            print(f"Reading PON coverage of {chrom} from file {pon_cov_file}.")
        pon_df = pd.read_csv(pon_cov_file, sep='\t', compression="gzip").loc[:, ['Chr', 'Pos', 'ExonPos', 'meanCov', 'medianCov', 'std']]
        # prefix the PON statistics (everything after the three key columns) with "PON"
        trans_dict = {col: f"PON{col}" for col in pon_df.columns[3:]}
        pon_df = pon_df.rename(columns=trans_dict)
        # merge sample with PON coverage; only positions present in both are kept
        sample_df = cov_df.merge(pon_df, on=['Chr', 'Pos', 'ExonPos'], how="inner").loc[:, ['Chr', 'Pos', 'ExonPos', 'Coverage', 'PONmeanCov', 'PONmedianCov', 'PONstd']]

        # normalize the coverage of this chromosome to a mean of 100
        # NOTE(review): scaling per chromosome presumably relies on PONmeanCov being
        # scaled the same way -- confirm against the PON generation
        sample_df['Coverage'] = sample_df['Coverage'] / sample_df['Coverage'].mean() * 100
        cover_dfs.append(sample_df)

    # pd.concat raises an uninformative error on an empty list
    if not cover_dfs:
        raise ValueError(
            f"No coverage data found for sample {sample} "
            f"(sample folder: {sample_cnv_path}, PON folder: {PON_cnv_path})"
        )
    cover_df = pd.concat(cover_dfs)
    cover_df['log2ratio'] = np.log2(cover_df['Coverage'] / cover_df['PONmeanCov'])
    return get_full_exon_pos(cover_df)

In [None]:
# load the combined coverage of one example sample, using the "full" (unfiltered) PON files
sample = "01_A"
cov_df = combine_Covdata(sample, sample_cnv_path=cnv_path, PON_cnv_path=cnvPON_path, verbose=True, filtered=False)
cov_df

## 2) heteroSNP: combine all snp for a sample into one df with corresponding EBscore

In [None]:
import os
# chr1 .. chr22 plus chrX
chrom_list = [f"chr{chrom + 1}" for chrom in range(22)] + ['chrX']


def combine_SNPdata(sample, sample_cnv_path="", verbose=False):
    '''
    combines the per-chromosome SNP files of a sample and their EB scores into one dataframe

    for every chromosome in chrom_list the tab-separated files
    <sample_cnv_path>/<sample>.<chrom>.snp and <sample_cnv_path>/<sample>.<chrom>.snpEB
    are read and merged on (Chr, Start, Ref, Alt)
    chromosomes lacking one of the two files are reported and skipped

    sample:           sample name used as file prefix
    sample_cnv_path:  folder containing the .snp and .snpEB files
    verbose:          print every file that is read

    returns a dataframe with the columns
    Chr, Start, ExonPos, Ref, Depth, Alt, VAF, EBscore, PoN-Alt
    raises ValueError if not a single chromosome could be loaded
    '''

    snp_dfs = []
    file_base = os.path.join(sample_cnv_path, sample)
    for chrom in chrom_list:
        # reading SNP
        file = f"{file_base}.{chrom}.snp"
        if not os.path.isfile(file):
            print('No file', file)
            continue
        if verbose:
            print(f"Reading SNP VAF from {chrom} of sample {sample} from {file}.")
        snp_df = pd.read_csv(file, sep='\t')
        # the Alt field holds base and count in one string; split it into Alt and AltDepth
        snp_df[['Alt', 'AltDepth']] = snp_df['Alt'].str.extract(r"([AGCT])([0-9]+)")

        # reading snpEB
        file = f"{file_base}.{chrom}.snpEB"
        if not os.path.isfile(file):
            print('No file', file)
            continue
        if verbose:
            print(f"Reading SNP EBscores from {chrom} of sample {sample} from {file}.")
        snpEB_df = pd.read_csv(file, sep='\t').loc[:, ['Chr', 'Start', 'Ref', 'Alt', 'EBscore', 'PoN-Alt']]
        # inner merge: only SNPs that have an EB score are kept
        snp_df = snp_df.merge(snpEB_df, on=['Chr', 'Start', 'Ref', 'Alt'])

        snp_dfs.append(snp_df)

    # pd.concat raises an uninformative error on an empty list
    if not snp_dfs:
        raise ValueError(f"No SNP data found for sample {sample} in folder {sample_cnv_path}")
    snp_df = pd.concat(snp_dfs).reset_index(drop=True)
    return snp_df.loc[:, ["Chr", "Start", "ExonPos", "Ref", "Depth", "Alt", "VAF", "EBscore", "PoN-Alt"]]

In [None]:
# load the combined SNP data (with EB scores) of one example sample
sample = "01_A"
snp_df = combine_SNPdata(sample, sample_cnv_path=cnv_path)
snp_df

In [None]:
def centerVAF(snp_df):
    '''
    attempting to correct for off-center VAF means

    the mean VAF of all SNPs with 0.05 < VAF < 0.95 is mapped onto 0.5:
    values up to the mean are scaled linearly from [0, mean] to [0, 0.5],
    values above the mean from (mean, 1] to (0.5, 1]

    snp_df is modified in place: the uncorrected values are kept in the new column
    orgVAF and the column VAF is overwritten with the centered values

    returns (snp_df, meanVAF, newMeanVAF) with the mean before and after centering
    '''

    # get the VAF mean
    meanVAF = snp_df.query('0.05 < VAF < 0.95')['VAF'].mean()
    # store the original VAF in orgVAF
    orgVAF = snp_df['VAF'].copy()
    snp_df['orgVAF'] = orgVAF
    # both masks have to be taken from the ORIGINAL values:
    # a value rescaled by the first step may end up above meanVAF (if meanVAF < 0.5)
    # and would otherwise be transformed a second time
    below = orgVAF <= meanVAF
    above = orgVAF > meanVAF
    snp_df.loc[below, 'VAF'] = orgVAF[below] / meanVAF * 0.5
    snp_df.loc[above, 'VAF'] = 0.5 + 0.5 * (orgVAF[above] - meanVAF) / (1 - meanVAF)
    newMeanVAF = snp_df.query('0.05 < VAF < 0.95')['VAF'].mean()
    return snp_df, meanVAF, newMeanVAF

In [None]:
# centerVAF returns (dataframe, mean before, mean after); unpack so that snp_df stays a dataframe
snp_df, meanVAF, newMeanVAF = centerVAF(snp_df)
snp_df

In [None]:
# mean of the VAF column over all SNPs with 0.05 < VAF < 0.95
snp_df.query('0.05 < VAF < 0.95')['VAF'].mean()

In [None]:
# centerVAF keeps the uncorrected values in orgVAF (there is no centeredVAF column);
# mean of the uncorrected VAF over all SNPs with 0.05 < orgVAF < 0.95 for comparison
snp_df.query('0.05 < orgVAF < 0.95')['orgVAF'].mean()

### get the exome coordinates for snp_df from the cov_df
+ cov_df is more complete
+ needed to have a common coordinate system

In [None]:
def get_full_exon_pos_from_cov(snp_df, cov_df):
    '''
    transfers the exome-wide coordinate system of cov_df onto snp_df

    the offset chromAccum of every chromosome is looked up in cov_df and added
    to the ExonPos of the SNPs, giving FullExonPos as third column of snp_df
    returns (snp_df with FullExonPos, cov_df without chromAccum)
    '''
    original_cols = list(snp_df.columns)

    # one offset per chromosome: the first chromAccum found for each Chr
    chrom_offsets = cov_df[['Chr', 'chromAccum']].groupby('Chr').first().reset_index()

    # attach the offset to every SNP of the respective chromosome
    merged = snp_df.merge(chrom_offsets, on='Chr')
    merged['FullExonPos'] = merged['ExonPos'] + merged['chromAccum']

    out_cols = [*original_cols[:2], 'FullExonPos', *original_cols[2:]]
    stripped_cov = cov_df.drop(columns='chromAccum')
    return merged[out_cols], stripped_cov

# add FullExonPos to snp_df and remove chromAccum from cov_df
snp_df, cov_df = get_full_exon_pos_from_cov(snp_df, cov_df)
snp_df

In [None]:
# display the coverage data
cov_df

## combine SNP data and covData

In [None]:
def get_covNsnp(sample, sample_cnv_path='', PON_cnv_path='', verbose=False, filtered=True):
    '''
    load the coverage_data for a sample and the heteroSNP data and apply the same fullExonCoords

    sample:           sample name used as file prefix
    sample_cnv_path:  folder containing the coverage and SNP files of the sample
    PON_cnv_path:     folder containing the PON coverage files
    verbose:          passed on to the loading functions
    filtered:         passed on to combine_Covdata (selects the "filtered" or "full" PON files)

    returns (snp_df, cov_df):
    snp_df carries FullExonPos and has passed through centerVAF,
    cov_df has its chromAccum column removed
    '''
    print(f'Loading coverage data for sample {sample}')
    cov_df = combine_Covdata(
        sample,
        sample_cnv_path=sample_cnv_path,
        PON_cnv_path=PON_cnv_path,
        verbose=verbose,
        filtered=filtered,
    )
    print(f'Loading SNP data for sample {sample}')
    snp_df = combine_SNPdata(sample, sample_cnv_path=sample_cnv_path, verbose=verbose)
    # get full exonPos from cov_df and remove the accumPos from cov_df
    snp_df, cov_df = get_full_exon_pos_from_cov(snp_df, cov_df)
    # center the VAF; the two mean values returned alongside are not needed here
    snp_df, _, _ = centerVAF(snp_df)
    print(f"Finished loading sample {sample}")
    return snp_df, cov_df

In [None]:
# load SNP and coverage data of one example sample in a single call
sample = "01_A"
snp_df, cov_df = get_covNsnp(sample, sample_cnv_path=cnv_path, PON_cnv_path=cnvPON_path, verbose=False)

In [None]:
# first ten rows of the SNP data
snp_df[:10]

In [None]:
# first ten rows of the coverage data
cov_df[:10]