## Combining the per-chromosome SNP data of a sample into whole-exome (WES) SNP data
### also add EB data, if available, to filter out bad SNPs

In [None]:
# all paths below are built with os.path, so os has to be available in this cell
import os

# HOME: one assignment per machine, the last one is the active one
home = '/Users/mahtin'
home = '/Users/martinscience'
# standard paths
testdata = os.path.join(home, "Dropbox/Icke/Work/somVar/testdata")
static_path = os.path.join(home, "Dropbox/Icke/Work/static")
# the "mount" folder below HOME
# (an absolute second component would make os.path.join discard home)
cluster_path = os.path.join(home, "mount")
tooldata = os.path.join(home, "Dropbox/Icke/Work/somVar/tooldata")

# tool-specific paths
shell_path = "../shell"
cnvdata = os.path.join(tooldata, "myCNVdata")
output_path = os.path.join(cnvdata, "output")

# the path to the input data
# local copies below tooldata (overridden by the cluster paths that follow)
cnv_path = os.path.join(cnvdata, "cnv")
cnvPON_path = os.path.join(cnvdata, "chromCov")
# data below cluster_path (these assignments are the active ones)
cnv_path = os.path.join(cluster_path, "scratch/develop/PONcoverage/cnv")
cnvPON_path = os.path.join(cluster_path, "scratch/develop/PONcoverage/chromCov")

In [None]:
# show the active PON coverage path as cell output
cnvPON_path

# A) combine the data

## 1) heteroSNP: combine all SNPs of a sample into one dataframe with the corresponding EBscore

In [None]:
# try with pd merge
import os

import numpy as np
import pandas as pd
# chromosomes that are looked up per sample: chr1 .. chr22 and chrX
chrom_list = [f"chr{chrom + 1}" for chrom in range(22)] + ['chrX']


def combine_SNPdata(sample, sample_cnv_path="", verbose=False):
    '''
    combines the per-chromosome SNP files of a sample into one dataframe

    for every chromosome in chrom_list the files
        <sample_cnv_path>/<sample>.<chrom>.snp    and
        <sample_cnv_path>/<sample>.<chrom>.snpEB
    are read as tab-separated tables and merged on Chr, Start, Ref and Alt
    a chromosome is skipped (with a "No file" message) if one of the two files is missing

    sample: sample name used as file prefix
    sample_cnv_path: folder containing the files
    verbose: if True, print every file that is read

    returns a dataframe with the columns
        Chr, Start, ExonPos, Ref, Depth, Alt, VAF, EBscore, PoN-Alt
    and a fresh 0..n-1 index
    raises ValueError if no chromosome provided both files
    '''

    out_cols = ["Chr", "Start", "ExonPos", "Ref", "Depth", "Alt", "VAF", "EBscore", "PoN-Alt"]
    eb_cols = ['Chr', 'Start', 'Ref', 'Alt', 'EBscore', 'PoN-Alt']
    merge_keys = ['Chr', 'Start', 'Ref', 'Alt']

    snp_dfs = []
    file_base = os.path.join(sample_cnv_path, sample)
    for chrom in chrom_list:
        # reading SNP
        snp_file = f"{file_base}.{chrom}.snp"
        if not os.path.isfile(snp_file):
            print('No file', snp_file)
            continue
        if verbose:
            print(f"Reading SNP VAF from {chrom} of sample {sample} from {snp_file}.")
        snp_df = pd.read_csv(snp_file, sep='\t')
        # the Alt column holds base and depth in one string (e.g. "G25"): split it up
        snp_df[['Alt', 'AltDepth']] = snp_df['Alt'].str.extract(r"([AGCT])([0-9]+)")

        # reading snpEB
        eb_file = f"{file_base}.{chrom}.snpEB"
        if not os.path.isfile(eb_file):
            print('No file', eb_file)
            continue
        if verbose:
            print(f"Reading SNP EBscores from {chrom} of sample {sample} from {eb_file}.")
        snpEB_df = pd.read_csv(eb_file, sep='\t').loc[:, eb_cols]
        # inner merge: only SNPs that have an EB entry are kept
        snp_dfs.append(snp_df.merge(snpEB_df, on=merge_keys))

    # fail with a message that names the sample instead of pandas' "No objects to concatenate"
    if not snp_dfs:
        raise ValueError(
            f"No SNP data found for sample {sample} in {sample_cnv_path}"
        )
    # ignore_index avoids duplicate index labels from the per-chromosome frames
    snp_df = pd.concat(snp_dfs, ignore_index=True)
    return snp_df.loc[:, out_cols]

In [None]:
# combine the SNP files of one example sample
sample = "01_A"
snp_df = combine_SNPdata(sample, sample_cnv_path=cnv_path)
# show the combined dataframe as cell output
snp_df

## 2) coverage: merge sample coverage with PON coverage

### make full exome coordinates
+ get the last ExonPos of each chromosome
+ make a running sum of these values over the chromosomes
+ add that running sum to the ExonPos

In [None]:
def get_full_exon_pos(df):
    '''
    computes a running exonic coordinate over all chromosomes

    expects the columns Chr and ExonPos
    wherever Chr differs from the previous row, the ExonPos of that previous row
    is added to a running offset (chromAccum)
    FullExonPos is ExonPos plus that offset

    returns a re-indexed copy with FullExonPos inserted as third column
    and chromAccum appended as last column
    '''

    # layout of the output columns
    in_cols = list(df.columns)
    out_cols = [*in_cols[:2], 'FullExonPos', *in_cols[2:], 'chromAccum']

    df = df.reset_index(drop=True)
    prev_row = df.shift(1)
    # candidate offset: ExonPos of the row before (0 for the very first row)
    offset = prev_row['ExonPos'].fillna(0).astype(int)
    # only the first row of a chromosome contributes to the running sum
    offset = offset.mask(df['Chr'] == prev_row['Chr'], 0)
    df['chromAccum'] = offset.cumsum()
    df['FullExonPos'] = df['ExonPos'] + df['chromAccum']
    return df[out_cols]


def combine_Covdata(sample, sample_cnv_path="", PON_cnv_path="", verbose=False):
    '''
    combines the per-chromosome coverage of a sample with the PON coverage

    for every chromosome in chrom_list the files
        <sample_cnv_path>/<sample>.<chrom>.cov       (gzip-compressed, tab-separated) and
        <PON_cnv_path>/<chrom>.filtered.csv.gz       (gzip-compressed, tab-separated)
    are read and inner-merged on Chr, Pos and ExonPos
    a chromosome is skipped (with a "No file" message) if one of the two files is missing

    sample: sample name used as file prefix
    sample_cnv_path: folder containing the sample coverage files
    PON_cnv_path: folder containing the PON coverage files
    verbose: if True, print every file that is read

    returns the output of get_full_exon_pos for the combined dataframe with the columns
        Chr, Pos, ExonPos, Coverage, PONmeanCov, PONmedianCov, PONstd, log2ratio
    raises ValueError if no chromosome provided both files
    '''

    merge_keys = ['Chr', 'Pos', 'ExonPos']
    pon_stats = ['meanCov', 'medianCov', 'std']
    # the PON statistics get a PON prefix to tell them apart from the sample columns
    pon_rename = {col: f"PON{col}" for col in pon_stats}
    out_cols = merge_keys + ['Coverage'] + [pon_rename[col] for col in pon_stats]

    cover_dfs = []

    for chrom in chrom_list:
        # reading sampleCoverage
        sample_cov_file = os.path.join(sample_cnv_path, f"{sample}.{chrom}.cov")
        if not os.path.isfile(sample_cov_file):
            print("No file", sample_cov_file)
            continue
        if verbose:
            print(f"Reading coverage from {chrom} of sample {sample} from {sample_cov_file}.")
        cov_df = pd.read_csv(sample_cov_file, sep='\t', compression="gzip")

        # reading PONcoverage
        pon_cov_file = os.path.join(PON_cnv_path, f"{chrom}.filtered.csv.gz")
        if not os.path.isfile(pon_cov_file):
            print("No file", pon_cov_file)
            continue
        if verbose:
            print(f"Reading PON coverage of {chrom} from file {pon_cov_file}.")
        pon_df = pd.read_csv(pon_cov_file, sep='\t', compression="gzip").loc[:, merge_keys + pon_stats]
        pon_df = pon_df.rename(columns=pon_rename)

        # merge sample with PON coverage
        sample_df = cov_df.merge(pon_df, on=merge_keys, how="inner").loc[:, out_cols].copy()

        # normalize the coverage to a mean of 100
        # NOTE(review): the mean is taken per chromosome, not over the whole sample;
        # confirm that this is intended
        sample_df['Coverage'] = sample_df['Coverage'] / sample_df['Coverage'].mean() * 100
        cover_dfs.append(sample_df)

    # fail with a message that names the sample instead of pandas' "No objects to concatenate"
    if not cover_dfs:
        raise ValueError(
            f"No coverage data found for sample {sample} in {sample_cnv_path} "
            f"with PON coverage in {PON_cnv_path}"
        )
    cover_df = pd.concat(cover_dfs, ignore_index=True)
    cover_df['log2ratio'] = np.log2(cover_df['Coverage'] / cover_df['PONmeanCov'])
    return get_full_exon_pos(cover_df)

In [None]:
# combine sample and PON coverage of one example sample
sample = "01_A"
cov_df = combine_Covdata(sample, sample_cnv_path=cnv_path, PON_cnv_path=cnvPON_path, verbose=True)
# show the combined dataframe as cell output
cov_df

### get the exome coordinates for snp_df from the cov_df
+ cov_df is more complete
+ needed to have a common coordinate system

In [None]:
def get_full_exon_pos_from_cov(snp_df, cov_df):
    '''
    transfers the chromosome offsets (chromAccum) of cov_df to snp_df

    the first chromAccum value of every chromosome in cov_df is merged to snp_df via Chr
    and FullExonPos is computed as ExonPos + chromAccum
    SNPs on chromosomes that are not contained in cov_df are lost in that (inner) merge

    returns a tuple of
        snp_df with FullExonPos inserted as third column
        cov_df without the chromAccum column
    '''

    # layout of the SNP output columns
    in_cols = list(snp_df.columns)
    out_cols = [*in_cols[:2], 'FullExonPos', *in_cols[2:]]

    # one offset per chromosome
    chrom_offsets = cov_df.loc[:, ['Chr', 'chromAccum']].groupby('Chr').first().reset_index()
    with_offset = snp_df.merge(chrom_offsets, on='Chr')
    with_offset['FullExonPos'] = with_offset['ExonPos'] + with_offset['chromAccum']

    stripped_cov = cov_df.drop(columns='chromAccum')
    return with_offset[out_cols], stripped_cov

# add FullExonPos to snp_df and take the chromAccum column off cov_df
snp_df, cov_df = get_full_exon_pos_from_cov(snp_df, cov_df)
snp_df

In [None]:
# show the coverage dataframe as cell output
cov_df

### get the log2ratio at a SNP position by merging snp_df and cov_df

In [None]:
def approx_log2ratio(snp_df, cov_df):
    '''
    takes the coverage data and approximates the log2ratio for that SNP from adjacent cov data

    snp_df and cov_df are outer-merged on Chr and FullExonPos
    per chromosome, every SNP gets
        the log2ratio of the coverage row at the very same position, if there is one
        else the linear interpolation (along ExonPos) between the closest coverage rows
        to the left and to the right
    SNPs without a coverage row on one side get the log2ratio of the next SNP (by FullExonPos)

    snp_df: needs the columns Chr, FullExonPos, ExonPos and EBscore
            (rows with missing EBscore are not returned)
    cov_df: needs the columns Chr, Pos, FullExonPos, ExonPos and log2ratio as last column
            everything between the fourth and the last column is ignored

    returns the SNP rows with the columns of snp_df plus log2ratio, sorted by FullExonPos
    '''

    snp_cols = list(snp_df.columns) + ['log2ratio']

    # merge snp_df and cov_df and keep only position and log2ratio of the coverage
    # the L suffix marks the values that become the left neighbour after the forward fill
    merge_df = (
        snp_df.merge(cov_df, on=['Chr', 'FullExonPos'], how='outer')
        .sort_values('FullExonPos')
        .reset_index(drop=True)
        .drop(columns=['Pos'] + list(cov_df.columns[4:-1]))
        .rename(columns={
            'ExonPos_x': 'ExonPos',
            'ExonPos_y': 'ExonPosL',
            'log2ratio': 'log2ratioL'
        })
    )

    merge_dfs = []

    for chrom in merge_df['Chr'].unique():
        # work on a copy so that the new columns do not hit a slice of merge_df
        merge = merge_df[merge_df['Chr'] == chrom].copy()
        # log2ratio of the coverage rows themselves (incl. SNPs sitting on a coverage position)
        exact = merge['log2ratioL'].copy()
        # closest coverage row to the right ..
        merge['ExonPosR'] = merge['ExonPosL'].bfill()
        merge['log2ratioR'] = merge['log2ratioL'].bfill()
        # .. and to the left
        merge['ExonPosL'] = merge['ExonPosL'].ffill()
        merge['log2ratioL'] = merge['log2ratioL'].ffill()
        slope = (merge['log2ratioR'] - merge['log2ratioL']) / (merge['ExonPosR'] - merge['ExonPosL'])
        interpolated = merge['log2ratioL'] + slope * (merge['ExonPos'] - merge['ExonPosL'])
        # SNPs on a coverage position keep the measured value
        # (the interpolation is 0/0 there) and are no longer dropped
        merge['log2ratio'] = exact.fillna(interpolated)
        merge_dfs.append(merge.loc[:, snp_cols])

    # nothing to combine if both inputs are empty
    if not merge_dfs:
        return pd.DataFrame(columns=snp_cols)

    # coverage-only rows have no EBscore: only the SNP rows are returned
    combined_df = pd.concat(merge_dfs).sort_values('FullExonPos')
    combined_df = combined_df[combined_df['EBscore'].notna()].copy()
    # SNPs outside the covered range take the value of the following SNP
    combined_df['log2ratio'] = combined_df['log2ratio'].bfill()
    return combined_df

In [None]:
# approximate the log2ratio at the SNP positions and show the result
df = approx_log2ratio(snp_df, cov_df)
df

In [None]:
def centerVAF(snp_df):
    '''
    rescales the VAF so that the mean of the VAFs between 0.05 and 0.95 maps to 0.5

    VAFs up to that mean are scaled linearly into [0, 0.5],
    VAFs above are scaled linearly into (0.5, 1]
    rows that fall in neither group keep the initial value of 0.5

    the column centeredVAF is added to the passed dataframe itself, which is also returned
    '''

    vaf = snp_df['VAF']
    # mean of the VAFs strictly between 0.05 and 0.95
    in_range = (vaf > 0.05) & (vaf < 0.95)
    vaf_mean = vaf[in_range].mean()

    # both branches of the piecewise linear mapping
    lower_half = vaf / vaf_mean * 0.5
    upper_half = 0.5 + 0.5 * (vaf - vaf_mean) / (1 - vaf_mean)

    snp_df['centeredVAF'] = 0.5
    snp_df.loc[vaf <= vaf_mean, 'centeredVAF'] = lower_half
    snp_df.loc[vaf > vaf_mean, 'centeredVAF'] = upper_half
    return snp_df

In [None]:
# add the centeredVAF column and show the result
snp_df = centerVAF(snp_df)
snp_df

In [None]:
# check the mean of the centered VAF
snp_df['centeredVAF'].mean()

## combine SNP data and covData

In [None]:
def get_covNsnp(sample, sample_cnv_path='', PON_cnv_path='', verbose=False):
    '''
    loads coverage and heteroSNP data of a sample on common full exon coordinates

    steps:
        combine_Covdata             coverage of the sample merged with the PON coverage
        combine_SNPdata             SNP data of the sample
        get_full_exon_pos_from_cov  FullExonPos for the SNPs, chromAccum removed from the coverage
        approx_log2ratio            log2ratio at the SNP positions
        centerVAF                   centeredVAF column for the SNPs

    returns the tuple (snp_df, cov_df)
    '''

    print(f'Loading coverage data for sample {sample}')
    coverage = combine_Covdata(
        sample,
        sample_cnv_path=sample_cnv_path,
        PON_cnv_path=PON_cnv_path,
        verbose=verbose
    )

    print(f'Loading SNP data for sample {sample}')
    snps = combine_SNPdata(sample, sample_cnv_path=sample_cnv_path, verbose=verbose)

    # common coordinate system for both dataframes
    snps, coverage = get_full_exon_pos_from_cov(snps, coverage)
    # log2ratio at the SNP positions, then the centered VAF
    snps = centerVAF(approx_log2ratio(snps, coverage))

    print(f"Finished loading sample {sample}")
    return snps, coverage

In [None]:
# run the whole loading pipeline for one example sample
sample = "01_A"
snp_df, cov_df = get_covNsnp(sample, sample_cnv_path=cnv_path, PON_cnv_path=cnvPON_path, verbose=False)

In [None]:
# first 10 rows of the SNP dataframe
snp_df[:10]

In [None]:
# first 10 rows of the coverage dataframe
cov_df[:10]