In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
cancers = ['BRCA', 'CRC', 'CCRCC', 'LUAD', 'OV', 'UCEC']
mutations = ['truncating', 'missense', 'synonymous']

In [3]:
pro = pd.read_csv('../data/DNA_Pro_regression/Table.DNA.PRO.regression.linearLIMMA.ProVsMut.csv')
rna = pd.read_csv('../data/DNA_RNA_regression/Table.DNA.RNA.regression.linearLIMMA.RNAVsMut.csv')
pro_rna = rna.merge(pro, on=['Gene', 'cancer', 'mutation'])

In [4]:
pro_rna.head()

Unnamed: 0,Unnamed: 0_x,Gene,logFC_x,AveExpr_x,t_x,P.value_x,B_x,FDR_x,RNA,cancer,mutation,Unnamed: 0_y,logFC_y,AveExpr_y,t_y,P.value_y,B_y,FDR_y,Protein
0,1,TP53,0.75243,3.437722,4.524541,1.7e-05,2.679575,0.004075,TP53,BRCA,missense,1,5.157955,-4.131038,7.490992,2.776663e-11,15.315636,3.859562e-09,TP53
1,3,TTN,0.122115,0.341531,2.852456,0.005272,-2.31403,0.327261,TTN,BRCA,missense,65,-0.550319,0.354773,-0.70989,0.4794244,-4.619672,0.9551655,TTN
2,4,UBR1,0.80424,2.699487,2.846444,0.005365,-2.327922,0.327261,UBR1,BRCA,missense,3,1.606496,0.183436,2.767019,0.006740405,-2.508332,0.3123054,UBR1
3,5,MAP3K1,0.989807,3.760067,2.529414,0.012987,-3.01235,0.555211,MAP3K1,BRCA,missense,12,0.950754,0.390073,1.843658,0.06819419,-4.139917,0.789916,MAP3K1
4,7,TNRC6A,0.685377,3.154837,2.376556,0.019379,-3.307212,0.555211,TNRC6A,BRCA,missense,61,0.606315,0.008193,0.758711,0.4498098,-4.65017,0.9551655,TNRC6A


In [5]:
pro_rna = pro_rna[['Gene', 'logFC_x', 'AveExpr_x', 'FDR_x', 'cancer', 'mutation', 'logFC_y', 'AveExpr_y', 'FDR_y']]
pro_rna.columns = ['Gene', 'RNAlogFC', 'RNAAveExpr', 'RNAFDR', 'cancer', 'mutation', 'PrologFC', 'ProAveExpr', 'ProFDR']

In [6]:
# keep genes that are significant in at least on level
sig_pro_rna = pro_rna[(pro_rna['ProFDR'] < 0.05) | (pro_rna['RNAFDR'] < 0.05)]
sig_pro_rna.head()

Unnamed: 0,Gene,RNAlogFC,RNAAveExpr,RNAFDR,cancer,mutation,PrologFC,ProAveExpr,ProFDR
0,TP53,0.75243,3.437722,0.004074853,BRCA,missense,5.157955,-4.131038,3.859562e-09
137,TP53,-1.135258,3.437722,2.815644e-10,BRCA,truncating,-0.963731,-4.131038,0.3983235
138,CDH1,-2.66682,5.395649,2.775379e-06,BRCA,truncating,-5.00302,-0.848601,3.770341e-07
139,CBFB,-0.733797,3.584625,0.04010529,BRCA,truncating,-2.523522,-0.127755,0.0007140661
140,MAP2K4,-0.701044,3.176204,0.09523538,BRCA,truncating,-3.400984,-0.253837,0.0003419037


In [7]:
lrt = pd.read_csv('../data/lrt/ProVsRNALrt.csv')

In [8]:
sig_pro_rna['lrt'] = False
for index, row in lrt.iterrows():
    gene = row['Gene']
    cancer = row['cancer']
    mutation = row['mutation']
    sig_pro_rna.loc[(sig_pro_rna['Gene'] == gene) & (sig_pro_rna['cancer'] == cancer) & (sig_pro_rna['mutation'] == mutation), 'lrt'] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sig_pro_rna['lrt'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [9]:
# find psQTLs using the method in Mirauta et al., eLife, 2020
psQTLs = pro_rna[(pro_rna['ProFDR'] < 0.05) & (pro_rna['RNAFDR'] > 0.05)]

In [10]:
sig_pro_rna['ispsQTL'] = False
for index, row in psQTLs.iterrows():
    gene = row['Gene']
    cancer = row['cancer']
    mutation = row['mutation']
    sig_pro_rna.loc[(sig_pro_rna['Gene'] == gene) & (sig_pro_rna['cancer'] == cancer) & (sig_pro_rna['mutation'] == mutation), 'ispsQTL'] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sig_pro_rna['ispsQTL'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [11]:
overlap = psQTLs.merge(lrt, on=['Gene', 'cancer', 'mutation'])

In [12]:
sig_pro_rna['overlap'] = False
for index, row in overlap.iterrows():
    gene = row['Gene']
    cancer = row['cancer']
    mutation = row['mutation']
    sig_pro_rna.loc[(sig_pro_rna['Gene'] == gene) & (sig_pro_rna['cancer'] == cancer) & (sig_pro_rna['mutation'] == mutation), 'overlap'] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sig_pro_rna['overlap'] = False


In [13]:
sig_pro_rna.head()

Unnamed: 0,Gene,RNAlogFC,RNAAveExpr,RNAFDR,cancer,mutation,PrologFC,ProAveExpr,ProFDR,lrt,ispsQTL,overlap
0,TP53,0.75243,3.437722,0.004074853,BRCA,missense,5.157955,-4.131038,3.859562e-09,True,False,False
137,TP53,-1.135258,3.437722,2.815644e-10,BRCA,truncating,-0.963731,-4.131038,0.3983235,True,False,False
138,CDH1,-2.66682,5.395649,2.775379e-06,BRCA,truncating,-5.00302,-0.848601,3.770341e-07,False,False,False
139,CBFB,-0.733797,3.584625,0.04010529,BRCA,truncating,-2.523522,-0.127755,0.0007140661,True,False,False
140,MAP2K4,-0.701044,3.176204,0.09523538,BRCA,truncating,-3.400984,-0.253837,0.0003419037,True,True,True


In [15]:
sig_pro_rna.to_csv('../data/sig_pro_rna.csv')