In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats

# Mutation impacts at the mRNA and protein levels

In [2]:
# Regression result tables, one for RNA vs. mutation and one for protein vs. mutation.
# NOTE(review): the file names suggest limma linear-model output — confirm with the upstream pipeline.
rna_csv = '../data/DNA_RNA_regression/Table.DNA.RNA.regression.linearLIMMA.RNAVsMut.csv'
pro_csv = '../data/DNA_Pro_regression/Table.DNA.PRO.regression.linearLIMMA.ProVsMut.csv'
rna = pd.read_csv(rna_csv)
pro = pd.read_csv(pro_csv)

In [3]:
# Preview the first rows of the RNA regression table to check its columns.
rna.head()

Unnamed: 0.1,Unnamed: 0,Gene,logFC,AveExpr,t,P.value,B,FDR,RNA,cancer,mutation
0,1,TP53,0.75243,3.437722,4.524541,1.7e-05,2.679575,0.004075,TP53,BRCA,missense
1,2,ALPK2,0.994126,0.496255,3.715526,0.000334,0.001841,0.040779,ALPK2,BRCA,missense
2,3,TTN,0.122115,0.341531,2.852456,0.005272,-2.31403,0.327261,TTN,BRCA,missense
3,4,UBR1,0.80424,2.699487,2.846444,0.005365,-2.327922,0.327261,UBR1,BRCA,missense
4,5,MAP3K1,0.989807,3.760067,2.529414,0.012987,-3.01235,0.555211,MAP3K1,BRCA,missense


## seQTLs

In [4]:
# seQTLs: rows of the RNA regression table whose FDR is below 0.05.
rna_passes_fdr = rna['FDR'] < 0.05
sig_rna = rna[rna_passes_fdr]

In [5]:
# Optional export of the seQTL table (disabled). Uncomment both lines to drop the
# 'Unnamed: 0' column, renumber the rows and write the result to an Excel file.
# sig_rna = sig_rna.drop(['Unnamed: 0'], axis=1).reset_index(drop=True)
# sig_rna.to_excel('../data/results/eQTLs.xlsx')

In [6]:
# the total number of seQTLs: shape is (rows, columns), so the first entry is the count
sig_rna.shape

(104, 11)

In [7]:
# the number of seQTLs by cancer type
# (count() reports the non-null entries of every remaining column within each group)
seqtl_cancer_groups = sig_rna.groupby('cancer')
seqtl_cancer_groups.count()

Unnamed: 0_level_0,Unnamed: 0,Gene,logFC,AveExpr,t,P.value,B,FDR,RNA,mutation
cancer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BRCA,6,6,6,6,6,6,6,6,6,6
CCRCC,7,7,7,7,7,7,7,7,7,7
CRC,46,46,46,46,46,46,46,46,46,46
LUAD,27,27,27,27,27,27,27,27,27,27
OV,5,5,5,5,5,5,5,5,5,5
UCEC,13,13,13,13,13,13,13,13,13,13


In [8]:
# the number of seQTLs by mutation type
# (count() reports the non-null entries of every remaining column within each group)
seqtl_mutation_groups = sig_rna.groupby('mutation')
seqtl_mutation_groups.count()

Unnamed: 0_level_0,Unnamed: 0,Gene,logFC,AveExpr,t,P.value,B,FDR,RNA,cancer
mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
missense,54,54,54,54,54,54,54,54,54,54
synonymous,11,11,11,11,11,11,11,11,11,11
truncating,39,39,39,39,39,39,39,39,39,39


## spQTLs

In [9]:
# spQTLs: rows of the protein regression table whose FDR is below 0.05.
pro_passes_fdr = pro['FDR'] < 0.05
sig_pro = pro[pro_passes_fdr]

In [10]:
# Optional export of the spQTL table (disabled). Uncomment both lines to drop the
# 'Unnamed: 0' column, renumber the rows and write the result to an Excel file.
# sig_pro = sig_pro.drop(['Unnamed: 0'], axis=1).reset_index(drop=True)
# sig_pro.to_excel('../data/results/pQTLs.xlsx')

In [11]:
# the total number of spQTLs: shape is (rows, columns), so the first entry is the count
sig_pro.shape

(108, 11)

In [12]:
# the number of spQTLs by cancer type
# (count() reports the non-null entries of every remaining column within each group)
spqtl_cancer_groups = sig_pro.groupby('cancer')
spqtl_cancer_groups.count()

Unnamed: 0_level_0,Unnamed: 0,Gene,logFC,AveExpr,t,P.value,B,FDR,Protein,mutation
cancer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BRCA,4,4,4,4,4,4,4,4,4,4
CCRCC,6,6,6,6,6,6,6,6,6,6
CRC,31,31,31,31,31,31,31,31,31,31
LUAD,8,8,8,8,8,8,8,8,8,8
OV,4,4,4,4,4,4,4,4,4,4
UCEC,55,55,55,55,55,55,55,55,55,55


In [13]:
# the number of spQTLs by mutation type
# (count() reports the non-null entries of every remaining column within each group)
spqtl_mutation_groups = sig_pro.groupby('mutation')
spqtl_mutation_groups.count()

Unnamed: 0_level_0,Unnamed: 0,Gene,logFC,AveExpr,t,P.value,B,FDR,Protein,cancer
mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
missense,9,9,9,9,9,9,9,9,9,9
synonymous,1,1,1,1,1,1,1,1,1,1
truncating,98,98,98,98,98,98,98,98,98,98


# Mutation impacts that are concordant at the mRNA and protein levels

In [14]:
# Combined table holding RNA-level and protein-level statistics side by side;
# show the first rows to check its columns.
sig_pro_rna_csv = '../data/sig_pro_rna.csv'
sig_pro_rna = pd.read_csv(sig_pro_rna_csv)
sig_pro_rna.head()

Unnamed: 0.1,Unnamed: 0,Gene,RNAlogFC,RNAAveExpr,RNAFDR,cancer,mutation,PrologFC,ProAveExpr,ProFDR,lrt,ispsQTL,overlap
0,0,TP53,0.75243,3.437722,0.004074853,BRCA,missense,5.157955,-4.131038,3.859562e-09,True,False,False
1,137,TP53,-1.135258,3.437722,2.815644e-10,BRCA,truncating,-0.963731,-4.131038,0.3983235,True,False,False
2,138,CDH1,-2.66682,5.395649,2.775379e-06,BRCA,truncating,-5.00302,-0.848601,3.770341e-07,False,False,False
3,139,CBFB,-0.733797,3.584625,0.04010529,BRCA,truncating,-2.523522,-0.127755,0.0007140661,True,False,False
4,140,MAP2K4,-0.701044,3.176204,0.09523538,BRCA,truncating,-3.400984,-0.253837,0.0003419037,True,True,True


In [15]:
# the number of eQTLs (RNA FDR < 0.05) present in the combined RNA/protein table;
# the first entry of the shape tuple is that number
rna_hit = sig_pro_rna['RNAFDR'] < 0.05
sig_r = sig_pro_rna[rna_hit]
sig_r.shape

(48, 13)

In [16]:
# the number of eQTLs whose protein logFC has the same sign as their RNA logFC
pro_direction = np.sign(sig_r['PrologFC'])
rna_direction = np.sign(sig_r['RNAlogFC'])
same_dir = sum(pro_direction == rna_direction)
same_dir

43

In [17]:
# the fraction of seQTLs whose protein-level change has the same direction as the
# RNA-level change; derived from same_dir and sig_r instead of typed-in literals
# so the value cannot go stale when the input tables change
same_dir / len(sig_r)

0.8958333333333334

In [18]:
# QTLs that are significant (FDR < 0.05) at both the RNA and the protein level
rna_level_sig = sig_pro_rna['RNAFDR'] < 0.05
pro_level_sig = sig_pro_rna['ProFDR'] < 0.05
concordant = sig_pro_rna[rna_level_sig & pro_level_sig]
len(concordant)

19

In [19]:
# the fraction of seQTLs (RNA FDR < 0.05) that are also significant at the protein
# level; derived from concordant and sig_r instead of typed-in literals so the
# value cannot go stale when the input tables change
len(concordant) / len(sig_r)

0.3958333333333333

In [20]:
# Pearson correlation between the RNA logFC and the protein logFC of the concordant QTLs
rna_effect = concordant['RNAlogFC']
pro_effect = concordant['PrologFC']
stats.pearsonr(rna_effect, pro_effect)

(0.9203992136095441, 2.3929893707133057e-08)

In [21]:
# number of concordant QTLs by mutation type
# (the selection is recomputed here with the same two FDR < 0.05 conditions)
significant_at_both = (sig_pro_rna['RNAFDR'] < 0.05) & (sig_pro_rna['ProFDR'] < 0.05)
concordant = sig_pro_rna[significant_at_both]
# concordant.to_excel('../data/results/concordant.xlsx')
concordant.groupby('mutation').count()

Unnamed: 0_level_0,Unnamed: 0,Gene,RNAlogFC,RNAAveExpr,RNAFDR,cancer,PrologFC,ProAveExpr,ProFDR,lrt,ispsQTL,overlap
mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
missense,3,3,3,3,3,3,3,3,3,3,3,3
synonymous,1,1,1,1,1,1,1,1,1,1,1,1
truncating,15,15,15,15,15,15,15,15,15,15,15,15


# Protein-specific mutation impacts not observed at mRNA levels

In [22]:
# number of rows whose `lrt` flag is True (summing a boolean column counts the True entries)
# NOTE(review): `lrt` presumably marks a likelihood-ratio-test result — confirm where the table is built
sum(sig_pro_rna['lrt'])

99

In [23]:
# number of rows whose `ispsQTL` flag is True (summing a boolean column counts the True entries)
sum(sig_pro_rna['ispsQTL'])

89

In [24]:
# number of rows whose `overlap` flag is True (summing a boolean column counts the True entries)
# NOTE(review): `overlap` presumably combines the `lrt` and `ispsQTL` flags — confirm where the table is built
sum(sig_pro_rna['overlap'])

84

In [25]:
# number of truncating psQTLs: rows flagged `overlap` whose mutation type is 'truncating'
is_psqtl = sig_pro_rna['overlap'] == True
is_truncating = sig_pro_rna['mutation'] == 'truncating'
psQTLs_tru = sig_pro_rna[is_psqtl & is_truncating]
len(psQTLs_tru)

78

In [26]:
# fraction of psQTLs (rows flagged `overlap`) that come from truncating mutations;
# derived from psQTLs_tru and the `overlap` column instead of typed-in literals so
# the value cannot go stale when the input table changes
len(psQTLs_tru) / sum(sig_pro_rna['overlap'])

0.9285714285714286

In [27]:
# all rows flagged `overlap`, kept as the "discordant" set; the Excel export below is disabled
overlap_flagged = sig_pro_rna['overlap'] == True
discordant = sig_pro_rna[overlap_flagged]
# discordant.to_excel('../data/results/discordant.xlsx')