## combining chromosome SNP data per sample into WES SNP data
### also add EB data if available to filter out bad SNPs

In [None]:
# HOME
import os  # needed here: this cell runs before any other import cell

# home directory of the current machine (the other workstation uses '/Users/mahtin')
home = '/Users/martinscience'

# standard paths
testdata = os.path.join(home, "Dropbox/Icke/Work/somVar/testdata")
static_path = os.path.join(home, "Dropbox/Icke/Work/static")
# mount point of the cluster file system; given relative to home, because an
# absolute second argument makes os.path.join drop `home` altogether
cluster_path = os.path.join(home, "mount")
tooldata = os.path.join(home, "Dropbox/Icke/Work/somVar/tooldata")

# tool-specific paths
shell_path = "../shell"
cnvdata = os.path.join(tooldata, "myCNVdata")
output_path = os.path.join(cnvdata, "output")

# the path to the input data
# (alternative local locations: os.path.join(cnvdata, "cnv") for the sample
#  data and os.path.join(cnvdata, "chromCov") for the PON coverage)
cnv_path = os.path.join(cluster_path, "scratch/develop/PONcoverage/cnv")
cnvPON_path = os.path.join(cluster_path, "scratch/develop/PONcoverage/chromCov")

# get the code

In [None]:
# get the code
import os
import sys

import numpy as np
import pandas as pd

sys.path.append('../code')
from combineCNVdata import get_covNsnp

In [None]:
# Build the SNP and coverage tables for one sample and save them as tab-separated files.
sample = "01_A"
snp_df, cov_df = get_covNsnp(sample, sample_cnv_path=cnv_path, PON_cnv_path=cnvPON_path, verbose=False, centerSNP=True)

# derive the file names from `sample`, so that switching the sample above
# cannot overwrite the output of another sample
snp_file = os.path.join(output_path, 'heteroSNP', f'{sample}.snp')
cov_file = os.path.join(output_path, 'covDif', f'{sample}.cov')
for out_file in (snp_file, cov_file):
    # to_csv does not create missing folders
    os.makedirs(os.path.dirname(out_file), exist_ok=True)
snp_df.to_csv(snp_file, sep='\t', index=False)
cov_df.to_csv(cov_file, sep='\t', index=False)

In [None]:
# first ten rows of the SNP table
snp_df.head(10)

In [None]:
# coverage rows of chrX, ordered along the exome coordinate
cov_df[cov_df['Chr'] == "chrX"].sort_values(by=['Chr', 'FullExonPos'])

# A) combine the data from the individual chromosomes into one exome-wide df

## 1) coverage: merge sample coverage with PON coverage

### make full exome coordinates (FullExonPos)
+ get the last coordinate of each chromosome
+ make a running sum of these over the chromosomes
+ add that running sum to the per-chromosome ExonPos

In [None]:
# Inspect the filtered PON coverage of a single chromosome.
chrom, sample = 'chr2', "01_A"
pon_cov = pd.read_csv(
    os.path.join(cnvPON_path, f"{chrom}.filtered.csv.gz"),
    compression="gzip",
    sep='\t',
)
pon_cov

In [None]:
# Inspect the coverage of the selected sample on the same chromosome.
sample_cov = pd.read_csv(
    os.path.join(cnv_path, f"{sample}.{chrom}.cov"),
    compression="gzip",
    sep='\t',
)
sample_cov

In [None]:
def combine_Covdata(sample, sample_cnv_path="", PON_cnv_path="", verbose=False, filtered=True):
    '''
    combine the per-chromosome coverage of a sample with the PON coverage into one df

    for every chromosome (chr1..chr22, chrX) this reads
      + <sample_cnv_path>/<sample>.<chrom>.cov (tab-separated, gzip-compressed)
      + <PON_cnv_path>/<chrom>.filtered.csv.gz (or <chrom>.full.csv.gz if filtered=False)
    and outer-merges both on Chr, Pos and ExonPos
    chromosomes with a missing file are reported via print and skipped

    args:
        sample: sample name used as prefix of the coverage files
        sample_cnv_path: folder containing the sample coverage files
        PON_cnv_path: folder containing the PON coverage files
        verbose: print every file that is read
        filtered: use the filtered instead of the full PON coverage

    returns:
        df with the columns Chr, Pos, FullExonPos, ExonPos, Coverage, PONmeanCov,
        PONmedianCov, PONstd and log2ratio, carrying a fresh running index
        Coverage is scaled per chromosome to a mean of 100 (missing values become 0)
        log2ratio is log2(Coverage / PONmeanCov) and NaN wherever one of the two
        is zero or missing

    raises:
        ValueError: if no chromosome could be loaded
    '''

    pon_cols = ['Chr', 'Pos', 'FullExonPos', 'ExonPos', 'meanCov', 'medianCov', 'std']
    out_cols = ['Chr', 'Pos', 'FullExonPos', 'ExonPos', 'Coverage', 'PONmeanCov', 'PONmedianCov', 'PONstd']
    # the PON statistics get a PON prefix to tell them apart from sample columns
    trans_dict = {col: f"PON{col}" for col in pon_cols[4:]}
    full_or_filtered = "filtered" if filtered else "full"

    cover_dfs = []
    chrom_list = [f"chr{chrom + 1}" for chrom in range(22)] + ['chrX']
    for chrom in chrom_list:
        # reading sampleCoverage
        sample_cov_file = os.path.join(sample_cnv_path, f"{sample}.{chrom}.cov")
        if not os.path.isfile(sample_cov_file):
            print("No file", sample_cov_file)
            continue
        if verbose:
            print(f"Reading coverage from {chrom} of sample {sample} from {sample_cov_file}.")
        cov_df = pd.read_csv(sample_cov_file, sep='\t', compression="gzip")

        # reading PONcoverage
        pon_cov_file = os.path.join(PON_cnv_path, f"{chrom}.{full_or_filtered}.csv.gz")
        if not os.path.isfile(pon_cov_file):
            print("No file", pon_cov_file)
            continue
        if verbose:
            print(f"Reading PON coverage of {chrom} from file {pon_cov_file}.")
        pon_df = pd.read_csv(pon_cov_file, sep='\t', compression="gzip").loc[:, pon_cols]
        pon_df = pon_df.rename(columns=trans_dict)

        # merge sample with PON coverage
        sample_df = cov_df.merge(pon_df, on=['Chr', 'Pos', 'ExonPos'], how="outer").loc[:, out_cols]

        # recover FullExonPos for positions that are missing in the PON:
        # within a chromosome FullExonPos and ExonPos differ by a constant offset,
        # which is read from the first row that actually carries a FullExonPos
        # (the very first row may be one of the rows without PON data)
        has_full = sample_df['FullExonPos'].notna()
        if has_full.any() and not has_full.all():
            anchor = sample_df.loc[has_full].iloc[0]
            offset = anchor['FullExonPos'] - anchor['ExonPos']
            sample_df.loc[~has_full, 'FullExonPos'] = sample_df.loc[~has_full, 'ExonPos'] + offset

        # normalize the coverage to a chromosome mean of 100
        sample_df['Coverage'] = (sample_df['Coverage'] / sample_df['Coverage'].mean() * 100).fillna(0)
        cover_dfs.append(sample_df)

    if not cover_dfs:
        raise ValueError(
            f"No coverage data found for sample {sample} "
            f"(sample path: {sample_cnv_path!r}, PON path: {PON_cnv_path!r})"
        )
    # ignore_index avoids duplicate row labels across the chromosomes
    cover_df = pd.concat(cover_dfs, ignore_index=True)

    # loggable are the coverages, where log2ratio can be computed
    loggable = (cover_df['Coverage'] > 0) & (cover_df['PONmeanCov'] > 0)
    # everything else (no sample or no PON coverage) is marked as NaN
    cover_df['log2ratio'] = np.log2((cover_df['Coverage'] / cover_df['PONmeanCov']).where(loggable))
    return cover_df

In [None]:
# Combine the per-chromosome coverage of one sample with the filtered PON coverage.
sample = "01_A"
cov_df = combine_Covdata(
    sample,
    PON_cnv_path=cnvPON_path,
    sample_cnv_path=cnv_path,
    filtered=True,
)
cov_df

In [None]:
# genomic and exome-wide coordinate of the very first coverage row
a, b = cov_df.iloc[0].loc[['Pos', 'FullExonPos']]
a

In [None]:
# display the combined coverage table
cov_df

## 2) heteroSNP: combine all snp for a sample into one df with corresponding EBscore

In [None]:
import os
chrom_list = [f"chr{chrom + 1}" for chrom in range(22)] + ['chrX']


def combine_SNPdata(sample, sample_cnv_path="", verbose=False):
    '''
    combine the per-chromosome SNP data of a sample with their EBscores into one df

    for every chromosome in chrom_list this reads the tab-separated files
      + <sample_cnv_path>/<sample>.<chrom>.snp
      + <sample_cnv_path>/<sample>.<chrom>.snpEB
    and merges them on Chr, Start, Ref and Alt
    chromosomes lacking one of the two files are reported via print and skipped

    args:
        sample: sample name used as prefix of the SNP files
        sample_cnv_path: folder containing the SNP files
        verbose: print every file that is read

    returns:
        df with the columns Chr, Start, ExonPos, Ref, Depth, Alt, VAF, EBscore
        and PoN-Alt, carrying a fresh running index

    raises:
        ValueError: if no chromosome could be loaded
    '''

    out_cols = ["Chr", "Start", "ExonPos", "Ref", "Depth", "Alt", "VAF", "EBscore", "PoN-Alt"]
    snp_dfs = []
    file_base = os.path.join(sample_cnv_path, sample)
    for chrom in chrom_list:
        # reading SNP
        snp_file = f"{file_base}.{chrom}.snp"
        if not os.path.isfile(snp_file):
            print('No file', snp_file)
            continue
        if verbose:
            print(f"Reading SNP VAF from {chrom} of sample {sample} from {snp_file}.")
        snp_df = pd.read_csv(snp_file, sep='\t')
        # the Alt column holds base and depth in one string (base letter directly
        # followed by digits); split it so Alt matches the Alt column of the EB file
        snp_df[['Alt', 'AltDepth']] = snp_df['Alt'].str.extract(r"([AGCT])([0-9]+)")

        # reading snpEB
        eb_file = f"{file_base}.{chrom}.snpEB"
        if not os.path.isfile(eb_file):
            print('No file', eb_file)
            continue
        if verbose:
            print(f"Reading SNP EBscores from {chrom} of sample {sample} from {eb_file}.")
        snpEB_df = pd.read_csv(eb_file, sep='\t').loc[:, ['Chr', 'Start', 'Ref', 'Alt', 'EBscore', 'PoN-Alt']]
        # inner merge: only SNPs with an entry in the EB file are kept
        snp_dfs.append(snp_df.merge(snpEB_df, on=['Chr', 'Start', 'Ref', 'Alt']))

    if not snp_dfs:
        raise ValueError(f"No SNP data found for sample {sample} in {sample_cnv_path!r}")
    return pd.concat(snp_dfs, ignore_index=True).loc[:, out_cols]

In [None]:
# Load the SNP data of all chromosomes for one sample.
sample = "01_A"
snp_df = combine_SNPdata(
    sample,
    sample_cnv_path=cnv_path,
)
snp_df

In [None]:
def centerVAF(snp_df):
    '''
    attempting to correct for off-center VAF means

    the mean of all VAFs strictly between 0.05 and 0.95 is mapped onto 0.5:
    values up to the mean are scaled linearly from [0, mean] onto [0, 0.5],
    values above the mean from (mean, 1] onto (0.5, 1]
    snp_df is modified in place: the untouched values are stored in the new
    column orgVAF and VAF is overwritten with the centered values
    if no VAF lies within (0.05, 0.95), the mean is NaN and VAF stays as it is

    args:
        snp_df: df with a numeric VAF column

    returns:
        tuple of (snp_df, mean VAF before centering, mean VAF after centering),
        both means computed over the values within (0.05, 0.95)
    '''

    def mid_range_mean(vaf):
        # homozygous-looking SNPs near 0 or 1 are kept out of the mean
        return vaf[(vaf > 0.05) & (vaf < 0.95)].mean()

    # store the original VAF in orgVAF
    org_vaf = snp_df['VAF'].copy()
    snp_df['orgVAF'] = org_vaf
    # get the VAF mean
    meanVAF = mid_range_mean(org_vaf)

    # both masks are taken from the ORIGINAL values; deriving the upper mask
    # from the already rescaled column would shift rows just below the mean
    # a second time whenever the mean is below 0.5
    lower = org_vaf <= meanVAF
    upper = org_vaf > meanVAF
    snp_df.loc[lower, 'VAF'] = org_vaf[lower] / meanVAF * 0.5
    snp_df.loc[upper, 'VAF'] = 0.5 + 0.5 * (org_vaf[upper] - meanVAF) / (1 - meanVAF)

    newMeanVAF = mid_range_mean(snp_df['VAF'])
    return snp_df, meanVAF, newMeanVAF

In [None]:
# center the VAFs; centerVAF also returns the mean before and after centering, which are discarded here
snp_df, _, _ = centerVAF(snp_df)
snp_df

In [None]:
# mean of the centered VAF over all SNPs with a VAF strictly between 0.05 and 0.95
snp_df.loc[(snp_df['VAF'] > 0.05) & (snp_df['VAF'] < 0.95), 'VAF'].mean()

In [None]:
# mean of the original VAF over the same SNPs (selected by their centered VAF)
snp_df.loc[(snp_df['VAF'] > 0.05) & (snp_df['VAF'] < 0.95), 'orgVAF'].mean()

### get the exome coordinates for snp_df from the cov_df
+ cov_df is more complete
+ needed to have a common coordinate system

In [None]:
# first ten rows of the SNP table
snp_df.head(10)

In [None]:
# first ten rows of the coverage table
cov_df.head(10)

In [None]:
def get_full_exon_pos_from_cov(snp_df, cov_df):
    '''
    transfer the exome-wide coordinate FullExonPos from the coverage data to the SNPs

    per chromosome the SNPs are merged with the coverage coordinates on Chr and
    ExonPos; SNPs without a matching coverage row get the FullExonPos of the
    closest preceding coverage row plus their genomic distance to that row
    (Start - Pos); SNPs in front of the first coverage row take over the
    FullExonPos of the next known row

    args:
        snp_df: df with at least the columns Chr, Start, ExonPos, Depth and VAF
        cov_df: df with at least the columns Chr, Pos, FullExonPos and ExonPos

    returns:
        df with the columns of snp_df plus FullExonPos inserted as third column
        and Start renamed to Pos, sorted by FullExonPos
        Pos, FullExonPos and Depth are integers
        rows without a VAF are dropped
    '''

    snp_cols = list(snp_df.columns)
    out_cols = snp_cols[:2] + ['FullExonPos'] + snp_cols[2:]
    cov_coords = cov_df.loc[:, ['Chr', 'Pos', 'FullExonPos', 'ExonPos']]

    snp_dfs = []
    for chrom in snp_df['Chr'].unique():
        merge = snp_df[snp_df['Chr'] == chrom].merge(
            cov_coords[cov_coords['Chr'] == chrom],
            on=['Chr', 'ExonPos'],
            how='outer'
        ).sort_values('ExonPos')
        # coordinates of the closest preceding coverage row
        pos_left = merge['Pos'].ffill()
        full_left = merge['FullExonPos'].ffill()
        missing = merge['FullExonPos'].isna()
        merge.loc[missing, 'FullExonPos'] = (full_left + merge['Start'] - pos_left)[missing]
        # fill the margins
        merge['FullExonPos'] = merge['FullExonPos'].bfill().ffill()
        # reduce the columns and only snp_data rows
        snp_merge = merge.loc[merge['VAF'].notna(), out_cols]
        # the outer merge turned these columns into floats; astype on the whole
        # frame is used because assigning through .loc[:, col] keeps the float dtype
        snp_merge = snp_merge.astype({col: int for col in ('Start', 'FullExonPos', 'Depth')})
        snp_dfs.append(snp_merge)

    if not snp_dfs:
        # no SNPs at all: hand back an empty table with the regular layout
        return pd.DataFrame(columns=out_cols).rename(columns={'Start': 'Pos'})

    snp_df = pd.concat(snp_dfs).reset_index(drop=True).sort_values('FullExonPos').rename(columns={'Start': 'Pos'})

    return snp_df

# add the exome-wide coordinate from the coverage table to the SNP table
snp_df = get_full_exon_pos_from_cov(snp_df, cov_df)
snp_df

In [None]:
# display the combined coverage table
cov_df

## combine SNP data and covData

In [None]:
def get_covNsnp(sample, sample_cnv_path='', PON_cnv_path='', verbose=False, centerSNP=True):
    '''
    load the coverage_data for a sample and the heteroSNP data and apply the same fullExonCoords

    args:
        sample: sample name used as prefix of the input files
        sample_cnv_path: folder containing the coverage and SNP files of the sample
        PON_cnv_path: folder containing the PON coverage files
        verbose: passed on to the loaders, which then print every file they read
        centerSNP: center the VAFs with centerVAF (the default, which is what
            this function always did before the switch existed)

    returns:
        tuple (snp_df, cov_df) sharing the FullExonPos coordinate
    '''
    print(f'Loading coverage data for sample {sample}')
    cov_df = combine_Covdata(sample, sample_cnv_path=sample_cnv_path, PON_cnv_path=PON_cnv_path, verbose=verbose)
    print(f'Loading SNP data for sample {sample}')
    snp_df = combine_SNPdata(sample, sample_cnv_path=sample_cnv_path, verbose=verbose)
    # get full exonPos from cov_df
    snp_df = get_full_exon_pos_from_cov(snp_df, cov_df)
    # centering is optional, so that callers passing centerSNP=... keep working
    if centerSNP:
        snp_df, _, _ = centerVAF(snp_df)
    print(f"Finished loading sample {sample}")
    return snp_df, cov_df

In [None]:
# Load coverage and SNP data of one sample in a single step.
sample = "01_A"
snp_df, cov_df = get_covNsnp(
    sample,
    PON_cnv_path=cnvPON_path,
    sample_cnv_path=cnv_path,
    verbose=False,
)

In [None]:
# first ten rows of the SNP table
snp_df.head(10)

In [None]:
# first ten rows of the coverage table
cov_df.head(10)