In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import glob

In [2]:
# Cancer cohort codes iterated over by the later cells.
cancers = [
    'BRCA',
    'CRC',
    'CCRCC',
    'LUAD',
    'OV',
    'UCEC',
]
# Mutation classes; each string also appears in the per-cohort input file names.
mutations = [
    'truncating',
    'missense',
    'synonymous',
]

In [3]:
# Load the per-gene protein/RNA result table (one row per gene / cancer / mutation).
sig_pro_rna = pd.read_csv(
    '../data/sig_pro_rna.csv',
    sep=',',
)

In [4]:
# Preview the first five rows of the loaded table.
sig_pro_rna.head(n=5)

Unnamed: 0.1,Unnamed: 0,Gene,PrologFC,ProAveExpr,ProFDR,RNAlogFC,RNAAveExpr,RNAFDR,diffLogFC,diffAveExpr,cancer,mutation,lrt,ispsQTL,overlap
0,332,CCDC59,0.617003,0.38627,0.986818,1.452143,2.956698,0.009917,-0.835139,-2.570427,BRCA,missense,False,False,False
1,343,CCSER2,0.452063,0.180476,0.986818,2.1793,2.992272,0.002613,-1.727238,-2.811796,BRCA,missense,False,False,False
2,713,EP300,-0.282211,-0.127549,0.986818,-2.016249,3.394311,0.036776,1.734038,-3.52186,BRCA,missense,False,False,False
3,879,GLOD4,1.700418,0.055922,0.986818,1.823098,3.517731,0.010391,-0.12268,-3.461809,BRCA,missense,False,False,False
4,1174,LGALS9,7.231326,-0.293448,0.005833,3.627117,3.189487,0.033354,3.604209,-3.482935,BRCA,missense,True,False,False


# Find all discordant genes

In [5]:
# Keep only the rows whose 'overlap' flag is True.
overlap = sig_pro_rna.loc[sig_pro_rna['overlap'].eq(True)]

In [6]:
# Preview the first five rows of the filtered table.
overlap.head(n=5)

Unnamed: 0.1,Unnamed: 0,Gene,PrologFC,ProAveExpr,ProFDR,RNAlogFC,RNAAveExpr,RNAFDR,diffLogFC,diffAveExpr,cancer,mutation,lrt,ispsQTL,overlap
5,2085,TAOK1,4.423613,-0.176246,0.025076,1.380925,3.241223,0.420269,3.042688,-3.417469,BRCA,missense,True,True,True
6,2155,TNIK,8.618198,-0.149147,0.014347,2.187661,1.98464,0.601864,6.430537,-2.133787,BRCA,missense,True,True,True
11,2368,ZFHX3,-4.240519,0.212571,0.026223,-0.108695,1.46941,0.998688,-4.131824,-1.256839,BRCA,missense,True,True,True
12,2532,BLOC1S2,-5.63459,0.363056,0.003934,-0.019437,3.940234,0.999698,-5.615153,-3.577178,BRCA,synonymous,True,True,True
16,3368,TP53,15.047397,-4.131038,0.003934,1.444938,3.437722,0.998948,13.60246,-7.568761,BRCA,synonymous,True,True,True


# Find all concordant genes

In [None]:
# Load the concordant gene table and preview its first five rows.
concordant = pd.read_csv(
    '../data/concordant.csv',
    sep=',',
)
concordant.head(n=5)

# Protein expression data: wild-type vs. mutant samples

In [8]:
HUANG_DATA_DIR = '../../../Huang_lab_data/PanCancerProteomicsData_HuangLab/'


def _strip_tumor_suffix(labels):
    """Truncate each column label at its first '.Tumor' so sample IDs match across tables."""
    # Plain str.split treats '.Tumor' literally and avoids the positional-`n`
    # form of Series.str.split, which newer pandas releases reject.
    return [label.split('.Tumor', 1)[0] for label in labels]


def collect_expression(gene_table, product='pro', mutant=True):
    """Gather per-sample expression values for every row of `gene_table`.

    Parameters
    ----------
    gene_table : pandas.DataFrame
        Must provide 'Gene', 'mutation' and 'cancer' columns (e.g. `overlap`).
    product : {'pro', 'rna'}
        'pro' reads the normalized proteome table. 'rna' reads the FPKM
        transcriptome table and applies log2(x + 1), mirroring the variant
        that used to be toggled by commenting lines in and out.
    mutant : bool
        True keeps samples whose entry for the gene in the somatic-variant
        table is non-zero; False keeps samples whose entry equals zero.

    Returns
    -------
    pandas.Series
        Flat object Series in which every kept sample contributes four
        consecutive entries labelled <gene>, 'mutation', 'cancer' and
        'sample_id'. Empty (dtype object) when no sample qualifies.

    Raises
    ------
    ValueError
        If `product` is neither 'pro' nor 'rna'.
    """
    if product not in ('pro', 'rna'):
        raise ValueError("product must be 'pro' or 'rna', got %r" % (product,))

    # Many rows share a cohort / mutation class, so each gzipped table is
    # read once and reused instead of being re-read per gene.
    mutation_tables = {}    # (cancer, mutation) -> somatic-variant table
    expression_tables = {}  # cancer -> expression table
    pieces = []             # one four-entry Series per kept sample

    for _, row in gene_table.iterrows():
        gene = row['Gene']
        mutation = row['mutation']
        cancer = row['cancer']

        table_key = (cancer, mutation)
        if table_key not in mutation_tables:
            mut_table = pd.read_csv(HUANG_DATA_DIR +
                                    cancer + '.WXS.SomaticVariant.' + mutation + '.txt.gz', sep='\t')
            mut_table.columns = _strip_tumor_suffix(mut_table.columns)
            mutation_tables[table_key] = mut_table
        mut_table = mutation_tables[table_key]

        if cancer not in expression_tables:
            if product == 'pro':
                exp_table = pd.read_csv(HUANG_DATA_DIR +
                                        cancer + '.proteome.formatted.normalized.tumor.txt.gz', sep='\t')
            else:
                exp_table = pd.read_csv(HUANG_DATA_DIR +
                                        cancer + '.transcriptome.FPKM.formatted.tumor.txt.gz', sep='\t')
                exp_table = np.log2(exp_table + 1)
            exp_table.columns = _strip_tumor_suffix(exp_table.columns)
            expression_tables[cancer] = exp_table
        exp_table = expression_tables[cancer]

        gene_calls = mut_table.loc[gene]
        if mutant:
            samples = gene_calls[gene_calls != 0]
        else:
            samples = gene_calls[gene_calls == 0]

        for sampleid in samples.index:
            if sampleid in exp_table.columns:
                # Build the per-sample record under its own name; reusing the
                # accumulator's name here previously discarded every earlier sample.
                pieces.append(pd.Series(
                    [exp_table.loc[gene][sampleid], mutation, cancer, sampleid],
                    index=[gene, 'mutation', 'cancer', 'sample_id'],
                ))

    if not pieces:
        return pd.Series(dtype=object)
    # Single concat at the end instead of growing the Series inside the loop.
    return pd.concat(pieces)


# Mutant-sample protein expression for the discordant (overlap) genes.
# Pass mutant=False for wild-type samples, product='rna' for transcript
# levels, or `concordant` instead of `overlap` for the concordant gene set.
exp = collect_expression(overlap, product='pro', mutant=True)

In [9]:
# `exp` is a flat Series whose labels are either a gene name (expression value)
# or one of 'mutation' / 'cancer' / 'sample_id'. Split it into four parallel
# Series and line them up side by side, one row per sample.
gen = exp[~exp.index.isin(['mutation', 'cancer', 'sample_id'])]
genid = gen.index  # gene labels, captured before the index is renumbered

gen = gen.reset_index(drop=True)
mut = exp[exp.index == 'mutation'].reset_index(drop=True)
cancer = exp[exp.index == 'cancer'].reset_index(drop=True)
sample_id = exp[exp.index == 'sample_id'].reset_index(drop=True)

expression = pd.concat([gen, mut, cancer, sample_id], axis=1)
expression['gene'] = genid
expression.columns = ['expression', 'mutation', 'cancer', 'sample_id', 'gene']
# Discard any row with a missing value in any column.
expression = expression.dropna()

In [None]:
# Snapshot the current `expression` table as the wild-type set. A copy is taken
# so that columns added to this snapshot later do not also appear on
# `expression` or on any other snapshot taken from the same frame.
wild_expression = expression.copy()

In [None]:
# Snapshot the current `expression` table as the mutant set. The cells below
# read `mutant_expression`, so that is the name that must be bound here. A copy
# is taken so later column additions do not leak back into `expression`.
mutant_expression = expression.copy()
mutation_expression = mutant_expression  # previous (misspelled) name, kept as an alias

In [None]:
# Label every row of each snapshot with its sample class by setting an 'isMut'
# column in place. Both `wild_expression` and `mutant_expression` must already
# have been bound by earlier cells.
wild_expression['isMut'] = 'wild'
mutant_expression['isMut'] = 'mutant'

In [None]:
# Stack the wild-type rows on top of the mutant rows and write the combined
# table to disk. The commented-out paths are the other output targets.
exp_wildVsMut = pd.concat(objs=[wild_expression, mutant_expression], axis=0)
exp_wildVsMut.to_csv(path_or_buf='../data/wildVSmut/discordant/pro_exp_wildVsMut.csv')
# exp_wildVsMut.to_csv('../data/wildVSmut/discordant/rna_exp_wildVsMut.csv')
# exp_wildVsMut.to_csv('../data/wildVSmut/concordant/pro_exp_wildVsMut.csv')
# exp_wildVsMut.to_csv('../data/wildVSmut/concordant/rna_exp_wildVsMut.csv')

# Merge violin plot data with HGVSp data

In [14]:
cordance = ['discordant', 'concordant']
product = ['pro', 'rna']

# For every wild-vs-mutant table, look up the HGVSp entry of each
# (gene, sample) pair and store it in an 'hgvsp' column, then write the
# table back to the file it was read from.
for c in cordance:
    for p in product:
        # Build the path from the loop variables; concatenating the lists
        # `cordance` / `product` themselves raises TypeError.
        violin_path = '../data/wildVSmut/' + c + '/' + p + '_exp_wildVsMut.csv'
        # The file is written with its index as the first column; reading that
        # column back as the index keeps repeated runs from piling up
        # 'Unnamed: 0' columns.
        violin_dat = pd.read_csv(violin_path, index_col=0)
        for mutation in mutations:
            for cancer in cancers:
                in_group = (violin_dat['mutation'] == mutation) & (violin_dat['cancer'] == cancer)
                if not in_group.any():
                    # Nothing to annotate for this cohort / mutation class.
                    continue
                hgvsp = pd.read_csv('../data/HGVSp/' + cancer + mutation + 'HGVSp.csv')
                # Literal split at '.Tumor'; avoids the positional-`n` form of
                # Series.str.split that newer pandas releases reject.
                hgvsp.columns = [label.split('.Tumor', 1)[0] for label in hgvsp.columns]
                hgvsp = hgvsp.rename(columns={'Unnamed: 0': 'gene'})
                hgvsp = hgvsp.set_index('gene')
                for sample_id in hgvsp.columns:
                    selected = in_group & (violin_dat['sample_id'] == sample_id)
                    rows = violin_dat[selected]
                    if rows.shape[0] == 0:
                        continue
                    # A left merge keeps the order of `rows`, so the looked-up
                    # values line up with the selected positions.
                    looked_up = rows.merge(hgvsp[[sample_id]], how='left', on='gene')[sample_id]
                    violin_dat.loc[selected, 'hgvsp'] = list(looked_up)
        violin_dat.to_csv(violin_path)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
