# Exome Aggregation Consortium pLI & eQTLs

### Background

pLI - Probability of loss-of-function intolerance

This notebook tests for enrichment of eQTLs among genes with high pLI scores (pLI > 0.9), using pLI values derived from ExAC exome sequencing data.

### Imports

In [1]:
import pandas as pd
import re
import os.path

### Functions

In [2]:
def removeGeneIDVersions(text):
    """Return the first Ensembl gene ID (``ENSG`` followed by digits) found in ``text``.

    Only the ``ENSG`` prefix and the digits that directly follow it are
    returned, so anything after the digits (for example a ``.4`` version
    suffix) is dropped.

    Parameters
    ----------
    text : str
        String to search.

    Returns
    -------
    str
        The first ``ENSG<digits>`` substring in ``text``.

    Raises
    ------
    IndexError
        If ``text`` contains no ``ENSG<digits>`` substring.
    """
    # Raw string: in a normal literal '\d' is an invalid escape sequence,
    # which newer Python versions warn about.
    match = re.search(r'(ENSG\d+)', text)
    if match is None:
        # Same exception type as indexing an empty findall() result,
        # but with a message that names the offending input.
        raise IndexError('no Ensembl gene ID (ENSG...) found in %r' % (text,))
    return match.group(1)

### Read in pLI scores

In [3]:
# Tab-separated table of Ensembl gene IDs with their ExAC pLI score.
ExAC = pd.read_csv(
    '../datasets/geneLists/ExAC/EnsemblGeneIDWithExACPLiScore.tsv',
    sep='\t',
)
ExAC.head(n=5)

Unnamed: 0,Ensembl Gene ID,pLI
0,ENSG00000215405,0.0868643
1,ENSG00000230031,0.5718928
2,ENSG00000138593,0.1017131
3,ENSG00000166157,5.114037e-31
4,ENSG00000166351,0.7366598


### eQTL affected tissues per gene

In [4]:
# Comma-separated table with an `expressedTissues` count per Ensembl gene ID
# (columns as shown in the preview below).
genesTestedForeQTLsWithNumberOfTissues = pd.read_csv(
    '../outputFiles/GTExV7/genesTestedWithNumberOfTissues.csv'
)
genesTestedForeQTLsWithNumberOfTissues.head(n=5)

Unnamed: 0,expressedTissues,Ensembl Gene ID
0,48,ENSG00000272186
1,48,ENSG00000117748
2,48,ENSG00000130856
3,48,ENSG00000169446
4,48,ENSG00000013573


In [5]:
# Same layout as the per-tissue table, read from the "merged tissues" file.
genesTestedForeQTLsWithNumberOfMergedTissues = pd.read_csv(
    '../outputFiles/GTExV7/genesTestedWithNumberOfMergedTissues.csv'
)
genesTestedForeQTLsWithNumberOfMergedTissues.head(n=5)

Unnamed: 0,expressedTissues,Ensembl Gene ID
0,26,ENSG00000170604
1,26,ENSG00000178458
2,26,ENSG00000141425
3,26,ENSG00000117115
4,26,ENSG00000105204


In [6]:
# Tab-separated gene annotation (ID, chromosome, start, end, strand) used
# below to restrict the tested genes and to filter by chromosome.
PCGenes = pd.read_csv(
    '../datasets/geneLists/Ensembl/EnsV75ProteinCodingGenes1-Y.txt',
    sep='\t',
)
PCGenes.head(n=5)

Unnamed: 0,Ensembl Gene ID,Chromosome Name,Gene Start (bp),Gene End (bp),Strand
0,ENSG00000215405,15,20737094,20747114,-1
1,ENSG00000268343,15,21004687,21005367,1
2,ENSG00000230031,15,21040701,21071643,-1
3,ENSG00000138593,15,49280673,49338760,-1
4,ENSG00000268531,15,22011370,22012050,1


In [7]:
# Inner join on the gene ID: only tested genes that also appear in PCGenes
# are kept, and the annotation columns are attached to them.
# Both names are rebound to the merged result, so re-running this cell
# without re-running the load cells merges the annotation in a second time.
genesTestedForeQTLsWithNumberOfTissues = genesTestedForeQTLsWithNumberOfTissues.merge(
    PCGenes, how='inner', on="Ensembl Gene ID"
)
genesTestedForeQTLsWithNumberOfMergedTissues = genesTestedForeQTLsWithNumberOfMergedTissues.merge(
    PCGenes, how='inner', on="Ensembl Gene ID"
)

In [8]:
# Row count before the chromosome X filter.
genesTestedForeQTLsWithNumberOfTissues.shape[0]

19067

In [9]:
# Remove every gene whose 'Chromosome Name' is 'X' from both tables.
# A boolean filter keeps the same rows (and index labels) as dropping the
# matching index entries.
genesTestedForeQTLsWithNumberOfTissues = genesTestedForeQTLsWithNumberOfTissues[
    genesTestedForeQTLsWithNumberOfTissues['Chromosome Name'] != 'X'
]
genesTestedForeQTLsWithNumberOfMergedTissues = genesTestedForeQTLsWithNumberOfMergedTissues[
    genesTestedForeQTLsWithNumberOfMergedTissues['Chromosome Name'] != 'X'
]

In [10]:
# Row count after the chromosome X filter.
genesTestedForeQTLsWithNumberOfTissues.shape[0]

18301

##### Bonferroni-corrected

In [11]:
# Per-gene `affectedTissues` counts for the Bonferroni-corrected eQTL calls;
# the first CSV column is used as the row index.
genesWitheQTLTissueCountBonferroni = pd.read_csv(
    '../outputFiles/GTExV7/genesWithNumberOfBonferroniAffectedTissues.csv',
    index_col=0,
)
genesWitheQTLTissueCountBonferroni.head(n=5)

Unnamed: 0,Ensembl Gene ID,affectedTissues
0,ENSG00000138593,4
1,ENSG00000166351,1
2,ENSG00000168675,1
3,ENSG00000188992,13
4,ENSG00000185272,8


In [12]:
# Right join: every row of the tested-genes table is kept; tested genes that
# are missing from the Bonferroni table get NaN in `affectedTissues`.
genesWitheQTLTissueCountBonferroni = genesWitheQTLTissueCountBonferroni.merge(
    genesTestedForeQTLsWithNumberOfTissues,
    how='right',
    on='Ensembl Gene ID',
)
genesWitheQTLTissueCountBonferroni

Unnamed: 0,Ensembl Gene ID,affectedTissues,expressedTissues,Chromosome Name,Gene Start (bp),Gene End (bp),Strand
0,ENSG00000138593,4.0,48,15,49280673,49338760,-1
1,ENSG00000166351,1.0,1,21,14982498,15013906,1
2,ENSG00000168675,1.0,47,18,13217497,13652754,1
3,ENSG00000188992,13.0,30,21,15481134,15583166,-1
4,ENSG00000185272,8.0,48,21,15588451,15600693,1
5,ENSG00000182974,1.0,1,15,22368478,22369561,1
6,ENSG00000155304,1.0,48,21,15743436,15755805,-1
7,ENSG00000153575,8.0,48,15,22833395,22873892,1
8,ENSG00000180530,2.0,48,21,16333556,16437321,-1
9,ENSG00000137824,9.0,48,15,41028082,41048049,-1


In [13]:
# Missing `affectedTissues` values (left as NaN by the right merge above for
# tested genes absent from the Bonferroni table) are counted as 0.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on `df[col]` is chained assignment, which warns on recent pandas and does
# not update the DataFrame once Copy-on-Write is enabled.
genesWitheQTLTissueCountBonferroni['affectedTissues'] = (
    genesWitheQTLTissueCountBonferroni['affectedTissues'].fillna(0)
)

##### Keep only genes that have a pLI (haploinsufficiency) score

In [14]:
# Inner join with the ExAC table: genes without an ExAC row are dropped and
# the `pLI` column is attached to the rest.
genesWitheQTLTissueCountBonferroni = genesWitheQTLTissueCountBonferroni.merge(
    ExAC, how='inner', on='Ensembl Gene ID'
)

In [15]:
# Display the table after the ExAC merge (now includes the `pLI` column).
genesWitheQTLTissueCountBonferroni

Unnamed: 0,Ensembl Gene ID,affectedTissues,expressedTissues,Chromosome Name,Gene Start (bp),Gene End (bp),Strand,pLI
0,ENSG00000138593,4.0,48,15,49280673,49338760,-1,1.017131e-01
1,ENSG00000166351,1.0,1,21,14982498,15013906,1,7.366598e-01
2,ENSG00000168675,1.0,47,18,13217497,13652754,1,3.773064e-01
3,ENSG00000188992,13.0,30,21,15481134,15583166,-1,8.825424e-09
4,ENSG00000185272,8.0,48,21,15588451,15600693,1,2.910155e-05
5,ENSG00000155304,1.0,48,21,15743436,15755805,-1,1.886370e-03
6,ENSG00000153575,8.0,48,15,22833395,22873892,1,1.261972e-04
7,ENSG00000180530,2.0,48,21,16333556,16437321,-1,9.911722e-01
8,ENSG00000137824,9.0,48,15,41028082,41048049,-1,2.377960e-02
9,ENSG00000137880,1.0,48,15,41056218,41059906,1,4.189688e-02


In [16]:
# Label each gene by its pLI: 'Y' when pLI is above 0.9, 'N' otherwise.
# Two separate masked assignments are used, so a row whose pLI is NaN
# matches neither mask and keeps a missing `haplo` value.
genesWitheQTLTissueCountBonferroni.loc[
    genesWitheQTLTissueCountBonferroni['pLI'].le(0.9), 'haplo'
] = 'N'
genesWitheQTLTissueCountBonferroni.loc[
    genesWitheQTLTissueCountBonferroni['pLI'].gt(0.9), 'haplo'
] = 'Y'

In [17]:
# Fraction of a gene's expressed tissues that are affected by an eQTL.
genesWitheQTLTissueCountBonferroni['propOfExpressedAffectedByeQTL'] = (
    genesWitheQTLTissueCountBonferroni['affectedTissues']
    .div(genesWitheQTLTissueCountBonferroni['expressedTissues'])
)

In [18]:
# Display the table with the added `haplo` and
# `propOfExpressedAffectedByeQTL` columns.
genesWitheQTLTissueCountBonferroni

Unnamed: 0,Ensembl Gene ID,affectedTissues,expressedTissues,Chromosome Name,Gene Start (bp),Gene End (bp),Strand,pLI,haplo,propOfExpressedAffectedByeQTL
0,ENSG00000138593,4.0,48,15,49280673,49338760,-1,1.017131e-01,N,0.083333
1,ENSG00000166351,1.0,1,21,14982498,15013906,1,7.366598e-01,N,1.000000
2,ENSG00000168675,1.0,47,18,13217497,13652754,1,3.773064e-01,N,0.021277
3,ENSG00000188992,13.0,30,21,15481134,15583166,-1,8.825424e-09,N,0.433333
4,ENSG00000185272,8.0,48,21,15588451,15600693,1,2.910155e-05,N,0.166667
5,ENSG00000155304,1.0,48,21,15743436,15755805,-1,1.886370e-03,N,0.020833
6,ENSG00000153575,8.0,48,15,22833395,22873892,1,1.261972e-04,N,0.166667
7,ENSG00000180530,2.0,48,21,16333556,16437321,-1,9.911722e-01,Y,0.041667
8,ENSG00000137824,9.0,48,15,41028082,41048049,-1,2.377960e-02,N,0.187500
9,ENSG00000137880,1.0,48,15,41056218,41059906,1,4.189688e-02,N,0.020833


In [19]:
# Summary statistics of every numeric column, split by `haplo` label.
(
    genesWitheQTLTissueCountBonferroni
    .groupby(by='haplo')
    .describe()
)

Unnamed: 0_level_0,Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene Start (bp),Gene Start (bp),...,pLI,pLI,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
haplo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
N,14053.0,74131510.0,56401940.0,31427.0,31236510.0,57901047.0,110617300.0,249214145.0,14053.0,74073240.0,...,0.184502,0.899854,14053.0,0.19868,0.230245,0.0,0.041667,0.107143,0.282609,1.0
Y,2992.0,76451410.0,55675010.0,214629.0,34577166.5,61440010.5,114248700.0,247495148.0,2992.0,76324500.0,...,0.999786,1.0,2992.0,0.115904,0.151499,0.0,0.020833,0.0625,0.145833,1.0


In [20]:
# Save the annotated table. The file is tab-separated despite its .csv
# extension, and the row index is written as the first column.
genesWitheQTLTissueCountBonferroni.to_csv(
    "../outputFiles/genesWitheQTLTissueCountBonferroniAndHaploStatus.csv",
    sep='\t',
)

##### Metasoft

In [21]:
# Same pipeline as the Bonferroni section, applied to the Metasoft
# affected-tissue counts.

# Per-gene `affectedTissues` counts; the first CSV column is the row index.
genesWitheQTLTissueCountMetasoft = pd.read_csv('../outputFiles/GTExV7/genesWithNumberOfMetasoftAffectedTissues.csv', index_col = 0)

# Right join keeps every tested gene; genes missing from the Metasoft table
# get NaN in `affectedTissues`.
genesWitheQTLTissueCountMetasoft = pd.merge(genesWitheQTLTissueCountMetasoft, genesTestedForeQTLsWithNumberOfTissues, on='Ensembl Gene ID', how='right')

# Count those missing genes as 0 affected tissues. The result is assigned
# back to the column: fillna(..., inplace=True) on `df[col]` is chained
# assignment, which warns on recent pandas and does not update the DataFrame
# once Copy-on-Write is enabled.
genesWitheQTLTissueCountMetasoft['affectedTissues'] = genesWitheQTLTissueCountMetasoft['affectedTissues'].fillna(0)

# Inner join with ExAC: keep only genes that have a pLI score.
genesWitheQTLTissueCountMetasoft = pd.merge(genesWitheQTLTissueCountMetasoft, ExAC, on='Ensembl Gene ID', how='inner')

# 'Y' when pLI is above 0.9, 'N' otherwise; a NaN pLI matches neither mask.
genesWitheQTLTissueCountMetasoft.loc[genesWitheQTLTissueCountMetasoft['pLI'] <= 0.9 , 'haplo'] = 'N'
genesWitheQTLTissueCountMetasoft.loc[genesWitheQTLTissueCountMetasoft['pLI'] > 0.9 , 'haplo'] = 'Y'

# Fraction of a gene's expressed tissues that are affected by an eQTL.
genesWitheQTLTissueCountMetasoft['propOfExpressedAffectedByeQTL'] = genesWitheQTLTissueCountMetasoft['affectedTissues'] / genesWitheQTLTissueCountMetasoft['expressedTissues']
genesWitheQTLTissueCountMetasoft

Unnamed: 0,Ensembl Gene ID,affectedTissues,expressedTissues,Chromosome Name,Gene Start (bp),Gene End (bp),Strand,pLI,haplo,propOfExpressedAffectedByeQTL
0,ENSG00000138593,24.0,48,15,49280673,49338760,-1,1.017131e-01,N,0.500000
1,ENSG00000166157,1.0,3,21,10906201,11029719,-1,5.114037e-31,N,0.333333
2,ENSG00000168675,10.0,47,18,13217497,13652754,1,3.773064e-01,N,0.212766
3,ENSG00000188992,30.0,30,21,15481134,15583166,-1,8.825424e-09,N,1.000000
4,ENSG00000185272,48.0,48,21,15588451,15600693,1,2.910155e-05,N,1.000000
5,ENSG00000155304,23.0,48,21,15743436,15755805,-1,1.886370e-03,N,0.479167
6,ENSG00000166200,44.0,48,15,49398268,49447858,-1,9.995728e-01,Y,0.916667
7,ENSG00000155307,4.0,48,21,15857549,15955723,-1,7.560236e-02,N,0.083333
8,ENSG00000153575,48.0,48,15,22833395,22873892,1,1.261972e-04,N,1.000000
9,ENSG00000180530,12.0,48,21,16333556,16437321,-1,9.911722e-01,Y,0.250000


In [22]:
# Summary statistics of every numeric column, split by `haplo` label.
(
    genesWitheQTLTissueCountMetasoft
    .groupby(by='haplo')
    .describe()
)

Unnamed: 0_level_0,Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene Start (bp),Gene Start (bp),...,pLI,pLI,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
haplo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
N,14053.0,74131510.0,56401940.0,31427.0,31236510.0,57901047.0,110617300.0,249214145.0,14053.0,74073240.0,...,0.184502,0.899854,14053.0,0.679042,0.354233,0.0,0.375,0.833333,1.0,1.0
Y,2992.0,76451410.0,55675010.0,214629.0,34577166.5,61440010.5,114248700.0,247495148.0,2992.0,76324500.0,...,0.999786,1.0,2992.0,0.602575,0.344713,0.0,0.270833,0.645833,0.958333,1.0


In [24]:
# Same summary, restricted to genes with at least one affected tissue.
(
    genesWitheQTLTissueCountMetasoft
    .loc[genesWitheQTLTissueCountMetasoft['affectedTissues'] > 0]
    .groupby(by='haplo')
    .describe()
)

Unnamed: 0_level_0,Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene Start (bp),Gene Start (bp),...,pLI,pLI,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
haplo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
N,13259.0,74212700.0,56251370.0,31427.0,31478694.0,58143994.0,110471884.0,249214145.0,13259.0,74152060.0,...,0.192325,0.899854,13259.0,0.719706,0.322068,0.020833,0.477767,0.875,1.0,1.0
Y,2955.0,76524400.0,55840750.0,214629.0,34448577.5,61505060.0,114250395.5,247495148.0,2955.0,76397060.0,...,0.999786,1.0,2955.0,0.61012,0.340162,0.020833,0.291667,0.666667,0.958333,1.0


In [25]:
# Save the annotated table. The file is tab-separated despite its .csv
# extension, and the row index is written as the first column.
genesWitheQTLTissueCountMetasoft.to_csv(
    "../outputFiles/genesWitheQTLTissueCountMetasoftAndHaploStatus.csv",
    sep='\t',
)