In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import glob

In [2]:
# Cohort codes and mutation classes; every (mutation, cancer) pair is visited
# when the per-cohort HGVSp files are merged further down.
cancers = [
    'BRCA',
    'CRC',
    'CCRCC',
    'LUAD',
    'OV',
    'UCEC',
]
mutations = [
    'truncating',
    'missense',
    'synonymous',
]

In [3]:
# Result tables that drive the comparisons; wild_vs_mut reads the 'Gene',
# 'mutation' and 'cancer' columns of each row.
discordant, concordant = (
    pd.read_excel(results_file)
    for results_file in (
        '../data/results/discordant.xlsx',
        '../data/results/concordant.xlsx',
    )
)

# Get expression data

In [4]:
def extract_expression(samples, exp, exp_pro, gene, mutation, cancer):
    """Append one gene's expression values for the given samples to ``exp``.

    For every label in ``samples.index`` that is also a column of ``exp_pro``,
    a four-entry record is stacked onto ``exp``: the expression value (labelled
    with ``gene``) followed by entries labelled ``'mutation'``, ``'cancer'``
    and ``'sample_id'``. Samples without a column in ``exp_pro`` are skipped.

    Parameters
    ----------
    samples : pandas.Series
        Only its index (the sample IDs) is used.
    exp : pandas.DataFrame
        Accumulator from a previous call, or an empty ``pd.DataFrame()``.
    exp_pro : pandas.DataFrame
        Expression table looked up as ``exp_pro.loc[gene][sample_id]``.
    gene, mutation, cancer
        Values stored alongside each expression value.

    Returns
    -------
    ``exp`` with the new records appended. When no sample matches, ``exp`` is
    returned unchanged and ``gene`` is never looked up.

    Raises
    ------
    KeyError
        If at least one sample matches but ``gene`` is not in ``exp_pro.index``.
    """
    # Set membership avoids rebuilding and scanning a column list per sample.
    available = set(exp_pro.columns)
    matched = [sampleid for sampleid in samples.index if sampleid in available]
    if not matched:
        return exp

    # The gene row is the same for every sample, so fetch it once.
    gene_row = exp_pro.loc[gene]

    records = []
    for sampleid in matched:
        record = pd.Series(gene_row[sampleid])
        record.index = [gene]
        record['mutation'] = mutation
        record['cancer'] = cancer
        record['sample_id'] = sampleid
        records.append(record)

    # One concat instead of one per sample: same stacked layout, linear cost.
    return pd.concat([exp, *records])

In [5]:
def process_exp(exp):
    """Reshape the stacked records built by ``extract_expression`` into a table.

    ``exp`` is expected to hold, in its index, repeating groups of labels:
    a gene name followed by ``'mutation'``, ``'cancer'`` and ``'sample_id'``.
    Entries are split by label and re-joined side by side, so the n-th gene
    value is paired with the n-th mutation / cancer / sample_id entry.

    Parameters
    ----------
    exp : pandas.DataFrame or pandas.Series
        Stacked records; may be empty.

    Returns
    -------
    pandas.DataFrame
        Columns ``expression``, ``mutation``, ``cancer``, ``sample_id``,
        ``gene``; rows containing any missing value are dropped. An empty
        ``exp`` yields an empty frame with those columns.
    """
    columns = ['expression', 'mutation', 'cancer', 'sample_id', 'gene']

    # Nothing was collected (no sample matched): return a well-formed empty
    # table instead of failing when the column names are assigned below.
    if len(exp) == 0:
        return pd.DataFrame(columns=columns)

    labels = exp.index
    is_mutation = labels == 'mutation'
    is_cancer = labels == 'cancer'
    is_sample = labels == 'sample_id'
    # Every remaining label is a gene name carrying the expression value.
    is_gene = ~(is_mutation | is_cancer | is_sample)

    # Drop the original labels so the four pieces line up by position.
    pieces = [
        exp[mask].reset_index(drop=True)
        for mask in (is_gene, is_mutation, is_cancer, is_sample)
    ]
    expression = pd.concat(pieces, axis=1)
    expression['gene'] = labels[is_gene]
    expression.columns = columns

    return expression.dropna()

In [6]:
def wild_vs_mut(cordance, path,
                data_dir='../../../Huang_lab_data/PanCancerProteomicsData_HuangLab/'):
    """Collect expression of mutant vs. wild-type samples for each listed gene.

    For every row of ``cordance`` the somatic-variant table for that row's
    cancer and mutation class is read together with the expression table
    ``<cancer><path>``. Samples whose entry for the gene is non-zero are
    labelled ``'mutant'``, samples whose entry is zero ``'wild'``.

    Parameters
    ----------
    cordance : pandas.DataFrame
        Must provide the columns ``'Gene'``, ``'mutation'`` and ``'cancer'``.
    path : str
        File-name suffix of the expression table. When it equals
        ``'.transcriptome.FPKM.formatted.tumor.txt.gz'`` the values are
        transformed with ``log2(x + 1)``.
    data_dir : str, optional
        Directory prefix holding the input tables. It is concatenated with the
        file name as-is, so it must end with a path separator.

    Returns
    -------
    pandas.DataFrame
        Wild-type rows followed by mutant rows, with the columns produced by
        ``process_exp`` plus ``isMut``.

    Raises
    ------
    KeyError
        If a gene is missing from the somatic-variant table (or from the
        expression table while at least one sample matches).
    """
    rna_suffix = '.transcriptome.FPKM.formatted.tumor.txt.gz'

    def _strip_tumor_suffix(columns):
        # Keep the part of each column name before the first ".Tumor".
        # `n` has to be passed by keyword: pandas 2.x rejects it positionally.
        # NOTE: pandas treats this multi-character pattern as a regular
        # expression, so the leading "." matches any character.
        return columns.to_series().str.split(".Tumor", n=1).str[0].tolist()

    # Several rows usually share a cancer / mutation class; read each file once.
    mutation_tables = {}
    expression_tables = {}

    mutant_exp = pd.DataFrame()
    wild_exp = pd.DataFrame()

    for _, row in cordance.iterrows():
        gene = row['Gene']
        mutation = row['mutation']
        cancer = row['cancer']

        table_key = (cancer, mutation)
        if table_key not in mutation_tables:
            mut = pd.read_csv(data_dir + cancer + '.WXS.SomaticVariant.' + mutation + '.txt.gz',
                              sep='\t')
            mut.columns = _strip_tumor_suffix(mut.columns)
            mutation_tables[table_key] = mut
        mut = mutation_tables[table_key]

        if cancer not in expression_tables:
            exp_pro = pd.read_csv(data_dir + cancer + path, sep='\t')
            if path == rna_suffix:
                exp_pro = np.log2(exp_pro + 1)  # rna
            exp_pro.columns = _strip_tumor_suffix(exp_pro.columns)
            expression_tables[cancer] = exp_pro
        exp_pro = expression_tables[cancer]

        gene_calls = mut.loc[gene]
        mutant_samples = gene_calls[gene_calls != 0]  # mutant
        wild_samples = gene_calls[gene_calls == 0]  # wild

        mutant_exp = extract_expression(mutant_samples, mutant_exp, exp_pro, gene, mutation, cancer)
        wild_exp = extract_expression(wild_samples, wild_exp, exp_pro, gene, mutation, cancer)

    mutant_expression = process_exp(mutant_exp)
    wild_expression = process_exp(wild_exp)

    mutant_expression['isMut'] = 'mutant'
    wild_expression['isMut'] = 'wild'

    return pd.concat([wild_expression, mutant_expression])

In [7]:
# File-name suffixes appended to a cancer code to locate its expression table.
pro_path, rna_path = (
    '.proteome.formatted.normalized.tumor.txt.gz',
    '.transcriptome.FPKM.formatted.tumor.txt.gz',  # wild_vs_mut log2(x+1)-transforms this one
)

In [8]:
# Proteome first, then transcriptome, for each of the two result tables.
dis_pro, dis_rna = (wild_vs_mut(discordant, suffix) for suffix in (pro_path, rna_path))
con_pro, con_rna = (wild_vs_mut(concordant, suffix) for suffix in (pro_path, rna_path))

In [9]:
# Persist the four comparison tables; the HGVSp merge step reads them back.
for _frame, _csv_path in (
    (dis_pro, '../data/wildVSmut/discordant/pro_exp_wildVsMut.csv'),
    (dis_rna, '../data/wildVSmut/discordant/rna_exp_wildVsMut.csv'),
    (con_pro, '../data/wildVSmut/concordant/pro_exp_wildVsMut.csv'),
    (con_rna, '../data/wildVSmut/concordant/rna_exp_wildVsMut.csv'),
):
    _frame.to_csv(_csv_path)

# Merge violin plot data with HGVSp data

In [12]:
cordance = ['discordant', 'concordant']
product = ['pro', 'rna']


def _load_hgvsp(cancer, mutation):
    """Read one HGVSp table: indexed by gene, one column per sample ID."""
    hgvsp = pd.read_csv('../data/HGVSp/' + cancer + mutation + 'HGVSp.csv')
    # `n` must be a keyword argument; pandas 2.x rejects it positionally.
    hgvsp.columns = hgvsp.columns.to_series().str.split(".Tumor", n=1).str[0].tolist()
    hgvsp = hgvsp.rename(columns={'Unnamed: 0': 'gene'})
    return hgvsp.set_index('gene')


def _annotate_with_hgvsp(violin_dat, hgvsp, mutation, cancer):
    """Fill ``violin_dat['hgvsp']`` in place for one (mutation, cancer) group.

    Each row of the group whose ``sample_id`` is a column of ``hgvsp`` receives
    the value found at (row's gene, sample column); genes absent from
    ``hgvsp`` get NaN (left merge).
    """
    # Positional boolean masks: the frame's index (restored from the CSV) is
    # not guaranteed to be unique, so label-based selection is avoided.
    in_group = ((violin_dat['mutation'] == mutation) &
                (violin_dat['cancer'] == cancer)).to_numpy()
    if not in_group.any():
        return

    sample_col = violin_dat['sample_id'].to_numpy()
    present = set(sample_col[in_group])

    for sample_id in hgvsp.columns:
        # Skip samples with no rows in this group without scanning the frame.
        if sample_id not in present:
            continue
        selected = in_group & (sample_col == sample_id)
        rows = violin_dat[selected]
        looked_up = rows[['gene']].merge(hgvsp[[sample_id]], how='left', on='gene')
        violin_dat.loc[selected, 'hgvsp'] = list(looked_up[sample_id])


# index_col=0 restores the index written by to_csv. Without it the saved index
# comes back as an extra 'Unnamed: 0' data column, and because this cell
# overwrites its own input, every re-run would add another one.
violin_tables = {}
for c in cordance:
    for p in product:
        violin_tables[(c, p)] = pd.read_csv(
            '../data/wildVSmut/'+c+'/'+p+'_exp_wildVsMut.csv', index_col=0)

# Read each HGVSp table once and apply it to all four violin tables, instead
# of re-reading all of them for every (cordance, product) pair.
for mutation in mutations:
    for cancer in cancers:
        hgvsp = _load_hgvsp(cancer, mutation)
        for violin_dat in violin_tables.values():
            _annotate_with_hgvsp(violin_dat, hgvsp, mutation, cancer)

for (c, p), violin_dat in violin_tables.items():
    violin_dat.to_csv('../data/wildVSmut/'+c+'/'+p+'_exp_wildVsMut.csv')