In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Cancer-type labels. Later cells iterate over this list and also use it,
# in this order, as the column labels of the exported heatmap tables.
cancers = [
    'BRCA',
    'CRC',
    'CCRCC',
    'LUAD',
    'OV',
    'UCEC',
]

# Mutation classes. Each one is processed separately and its name is
# embedded in the output file name written by the export cell.
mutations = [
    'truncating',
    'missense',
    'synonymous',
]

In [3]:
# Load the table of significant RNA/protein associations.
# NOTE(review): the preview shows 'Unnamed: 0' / 'Unnamed: 0.1' columns, which
# looks like an index that was written back to CSV -- presumably harmless,
# since only the named columns are used below; confirm before dropping them.
sig_pro_rna_path = '../data/sig_pro_rna.csv'
sig_pro_rna = pd.read_csv(sig_pro_rna_path)
sig_pro_rna.head()

Unnamed: 0.1,Unnamed: 0,Gene,RNAlogFC,RNAAveExpr,RNAFDR,cancer,mutation,PrologFC,ProAveExpr,ProFDR,lrt,ispsQTL,overlap
0,0,TP53,0.75243,3.437722,0.004074853,BRCA,missense,5.157955,-4.131038,3.859562e-09,True,False,False
1,137,TP53,-1.135258,3.437722,2.815644e-10,BRCA,truncating,-0.963731,-4.131038,0.3983235,True,False,False
2,138,CDH1,-2.66682,5.395649,2.775379e-06,BRCA,truncating,-5.00302,-0.848601,3.770341e-07,False,False,False
3,139,CBFB,-0.733797,3.584625,0.04010529,BRCA,truncating,-2.523522,-0.127755,0.0007140661,True,False,False
4,140,MAP2K4,-0.701044,3.176204,0.09523538,BRCA,truncating,-3.400984,-0.253837,0.0003419037,True,True,True


In [4]:
# Read the protein-vs-mutation and RNA-vs-mutation regression tables.
pro = pd.read_csv('../data/DNA_Pro_regression/Table.DNA.PRO.regression.linearLIMMA.ProVsMut.csv')
rna = pd.read_csv('../data/DNA_RNA_regression/Table.DNA.RNA.regression.linearLIMMA.RNAVsMut.csv')

# Inner-join on (Gene, cancer, mutation). Columns present in both tables get
# pandas' default suffixes: '_x' for the left (RNA) side, '_y' for the right
# (protein) side. The mapping below selects the columns to keep, in order,
# and gives them explicit RNA*/Pro* names.
merged_column_names = {
    'Gene': 'Gene',
    'logFC_x': 'RNAlogFC',
    'AveExpr_x': 'RNAAveExpr',
    'FDR_x': 'RNAFDR',
    'cancer': 'cancer',
    'mutation': 'mutation',
    'logFC_y': 'PrologFC',
    'AveExpr_y': 'ProAveExpr',
    'FDR_y': 'ProFDR',
}
pro_rna = (
    rna.merge(pro, on=['Gene', 'cancer', 'mutation'])
    [list(merged_column_names)]
    .rename(columns=merged_column_names)
)

In [5]:
# Keep the significant rows whose gene occurs in more than one row of
# sig_pro_rna.
# NOTE(review): repetition is judged on the 'Gene' column alone, so a gene
# listed twice for the same cancer (e.g. with two mutation classes) also
# passes this filter -- confirm whether "shared by at least two cancer
# types" was the intended criterion at this step.
is_repeated_gene = sig_pro_rna['Gene'].duplicated()
genes = sig_pro_rna.loc[is_repeated_gene, 'Gene'].unique()
common_gen = sig_pro_rna.loc[sig_pro_rna['Gene'].isin(genes)]
common_gen.head()

Unnamed: 0.1,Unnamed: 0,Gene,RNAlogFC,RNAAveExpr,RNAFDR,cancer,mutation,PrologFC,ProAveExpr,ProFDR,lrt,ispsQTL,overlap
0,0,TP53,0.75243,3.437722,0.004074853,BRCA,missense,5.157955,-4.131038,3.859562e-09,True,False,False
1,137,TP53,-1.135258,3.437722,2.815644e-10,BRCA,truncating,-0.963731,-4.131038,0.3983235,True,False,False
12,1612,MSH3,-1.043425,2.524684,1.005399e-05,CRC,truncating,-3.401087,-0.013364,0.0009070224,True,False,False
13,1613,TP53,-0.902561,3.36852,0.0004039609,CRC,truncating,-1.555762,-0.308561,0.5907807,False,False,False
25,1625,GLYR1,-0.663416,4.171711,0.01113622,CRC,truncating,-1.267972,0.213658,0.03648808,False,False,False


In [6]:
# For each mutation class, list the genes that (a) appear in common_gen for
# that class and (b) have rows in pro_rna -- significant or not -- for at
# least two distinct cancer types under the same class.
#
# The (Gene, mutation) pairs are gathered in a plain list and turned into a
# DataFrame once: DataFrame.append no longer exists in pandas >= 2.0, and
# growing a frame row by row is quadratic.
com_gene_records = []
for mutation in mutations:
    # Candidate genes, in order of first appearance in common_gen.
    candidate_genes = common_gen.loc[common_gen['mutation'] == mutation, 'Gene'].unique()

    # Number of distinct cancer types per gene for this mutation class.
    # dropna=False keeps a missing cancer label counted as its own value,
    # which is what len(Series.unique()) did in the previous implementation.
    mutation_rows = pro_rna[pro_rna['mutation'] == mutation]
    cancers_per_gene = mutation_rows.groupby('Gene')['cancer'].nunique(dropna=False)
    genes_in_two_or_more = set(cancers_per_gene.index[cancers_per_gene >= 2])

    com_gene_records.extend(
        {'Gene': gene, 'mutation': mutation}
        for gene in candidate_genes
        if gene in genes_in_two_or_more
    )

# Explicit columns keep the frame well-formed even when nothing qualifies.
com_genes_df = pd.DataFrame(com_gene_records, columns=['Gene', 'mutation'])

In [7]:
# Pull from pro_rna every row belonging to a (Gene, mutation) pair listed in
# com_genes_df, keeping the mutation-then-gene order of that list and the
# original pro_rna row labels.
#
# The slices are collected first and concatenated once; concatenating inside
# the loop re-copies the accumulated frame on every iteration.
selected_slices = []
for mutation in mutations:
    genes_for_mutation = com_genes_df.loc[com_genes_df['mutation'] == mutation, 'Gene']
    mu_pro_rna = pro_rna[pro_rna['mutation'] == mutation]
    for gene in genes_for_mutation:
        selected_slices.append(mu_pro_rna[mu_pro_rna['Gene'] == gene])

if selected_slices:
    com_gene = pd.concat(selected_slices)
else:
    # pd.concat([]) raises; fall back to an empty frame that still carries
    # pro_rna's columns so later column selections keep working.
    com_gene = pro_rna.iloc[0:0].copy()

# get heatmap data

In [8]:
# Narrow com_gene down to the gene label, the protein log fold change and
# the two grouping columns.
heatmap_source_columns = ['Gene', 'PrologFC', 'cancer', 'mutation']
cg = com_gene.loc[:, heatmap_source_columns]
cg.head()

Unnamed: 0,Gene,PrologFC,cancer,mutation
137,TP53,-0.963731,BRCA,truncating
1613,TP53,-1.555762,CRC,truncating
3051,TP53,0.962442,LUAD,truncating
3352,TP53,-5.496297,OV,truncating
6070,TP53,-0.374276,UCEC,truncating


In [9]:
# Build a long table holding one row per (mutation, cancer, gene): within a
# mutation class every gene is paired with every cancer type, and PrologFC is
# NaN where cg has no row for that combination.
#
# Frames are gathered in lists and concatenated once per level instead of
# being re-concatenated onto an empty seed frame inside the loops.
per_mutation_frames = []
for mutation in mutations:
    mut_cg = cg[cg['mutation'] == mutation]
    # One row per gene, in order of first appearance; the left merge below
    # preserves this order for every cancer type.
    gene_frame = pd.DataFrame({'Gene': mut_cg['Gene'].unique()})

    per_cancer_frames = []
    for cancer in cancers:
        cancer_rows = mut_cg[mut_cg['cancer'] == cancer]
        # 'Gene' is the only column the two frames share, so naming it
        # explicitly matches the previous implicit merge key.
        # NOTE(review): if cg ever holds more than one row for the same
        # (Gene, cancer, mutation), this merge emits several rows per gene.
        gene_by_cancer = gene_frame.merge(cancer_rows, on='Gene', how='left')
        # Genes without a match come back with a missing 'cancer'; fill it in.
        gene_by_cancer['cancer'] = cancer
        per_cancer_frames.append(gene_by_cancer)

    mut_can_cg = pd.concat(per_cancer_frames)
    # Same reason as above: unmatched rows have a missing 'mutation'.
    mut_can_cg['mutation'] = mutation
    per_mutation_frames.append(mut_can_cg)

heatmap_data = pd.concat(per_mutation_frames)

In [10]:
# For each mutation class, reshape the long heatmap table into a
# gene-by-cancer matrix of PrologFC values and write it to its own CSV file.
for mutation in mutations:
    heat_data = heatmap_data[heatmap_data['mutation'] == mutation]
    gene_labels = heat_data['Gene'].unique()

    # One column per cancer type, in the order given by `cancers`. Values are
    # taken positionally, so this relies on every cancer's rows listing the
    # genes in the same order as gene_labels; a length mismatch raises.
    prologfc_by_cancer = {}
    for cancer in cancers:
        is_cancer = heat_data['cancer'] == cancer
        prologfc_by_cancer[cancer] = heat_data.loc[is_cancer, 'PrologFC'].to_numpy()

    mut_heat_data = pd.DataFrame(prologfc_by_cancer, index=gene_labels)
    mut_heat_data.to_csv('../data/heatmapData' + mutation + '.csv')