# Zarrei et al. CNV Map & eQTLs

### Background

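This notebook tests whether genes inside copy-number-variable regions (CNVRs) from the Zarrei et al. CNV map are enriched for eQTLs relative to genes outside them. For each protein-coding gene tested for eQTLs in GTEx V7, it computes the proportion of expressed tissues affected by an eQTL (using Bonferroni-corrected and Metasoft calls) and summarises that proportion by CNVR status.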

### Imports

In [1]:
import pandas as pd
import re
import os.path

### Functions

In [2]:
def removeGeneIDVersions(text):
    """Return the first Ensembl gene ID in ``text`` without its version suffix.

    The ID is matched as ``ENSG`` followed by one or more digits, so a
    versioned ID such as ``ENSG00000138593.4`` yields ``ENSG00000138593``.

    Parameters
    ----------
    text : str
        String containing at least one ``ENSG<digits>`` identifier.

    Returns
    -------
    str
        The first matching identifier.

    Raises
    ------
    IndexError
        If ``text`` contains no ``ENSG<digits>`` identifier. The exception
        type is unchanged from the previous ``findall(...)[0]``
        implementation so existing callers keep working.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    match = re.search(r'(ENSG\d+)', text)
    if match is None:
        raise IndexError('no Ensembl gene ID (ENSG<digits>) found in %r' % (text,))
    return match.group(1)

### Input/Output files

---

### eQTL affected tissues per gene
[Back to top](#Outline)

In [3]:
# Genes tested for eQTLs, with a per-gene tissue count (GTEx V7 output file).
testedGenesTissuesPath = '../outputFiles/GTExV7/genesTestedWithNumberOfTissues.csv'
genesTestedForeQTLsWithNumberOfTissues = pd.read_csv(testedGenesTissuesPath)
genesTestedForeQTLsWithNumberOfTissues.head()

Unnamed: 0,expressedTissues,Ensembl Gene ID
0,48,ENSG00000272186
1,48,ENSG00000117748
2,48,ENSG00000130856
3,48,ENSG00000169446
4,48,ENSG00000013573


In [4]:
# Same table as above, but counted over merged tissues (per the file name).
testedGenesMergedTissuesPath = '../outputFiles/GTExV7/genesTestedWithNumberOfMergedTissues.csv'
genesTestedForeQTLsWithNumberOfMergedTissues = pd.read_csv(testedGenesMergedTissuesPath)
genesTestedForeQTLsWithNumberOfMergedTissues.head()

Unnamed: 0,expressedTissues,Ensembl Gene ID
0,26,ENSG00000170604
1,26,ENSG00000178458
2,26,ENSG00000141425
3,26,ENSG00000117115
4,26,ENSG00000105204


In [5]:
# Tab-separated Ensembl gene list (protein-coding genes, per the file name)
# with chromosome, start/end coordinates and strand.
proteinCodingGenesPath = '../datasets/geneLists/Ensembl/EnsV75ProteinCodingGenes1-Y.txt'
PCGenes = pd.read_csv(proteinCodingGenesPath, sep='\t')
PCGenes.head()

Unnamed: 0,Ensembl Gene ID,Chromosome Name,Gene Start (bp),Gene End (bp),Strand
0,ENSG00000215405,15,20737094,20747114,-1
1,ENSG00000268343,15,21004687,21005367,1
2,ENSG00000230031,15,21040701,21071643,-1
3,ENSG00000138593,15,49280673,49338760,-1
4,ENSG00000268531,15,22011370,22012050,1


In [6]:
# Inner-join both tested-gene tables with PCGenes: keeps only genes present in
# the PCGenes list and attaches their coordinate columns.
# NOTE: re-running this cell merges PCGenes in a second time (suffixed columns);
# re-run from the loading cells instead.
genesTestedForeQTLsWithNumberOfTissues = genesTestedForeQTLsWithNumberOfTissues.merge(
    PCGenes, on="Ensembl Gene ID"
)
genesTestedForeQTLsWithNumberOfMergedTissues = genesTestedForeQTLsWithNumberOfMergedTissues.merge(
    PCGenes, on="Ensembl Gene ID"
)

##### Bonferroni-corrected

In [7]:
# Per-gene number of tissues with a Bonferroni-significant eQTL; the first CSV
# column is used as the row index.
bonferroniAffectedTissuesPath = '../outputFiles/GTExV7/genesWithNumberOfBonferroniAffectedTissues.csv'
genesWitheQTLTissueCountBonferroni = pd.read_csv(bonferroniAffectedTissuesPath, index_col=0)
genesWitheQTLTissueCountBonferroni.head()

Unnamed: 0,Ensembl Gene ID,affectedTissues
0,ENSG00000138593,4
1,ENSG00000166351,1
2,ENSG00000168675,1
3,ENSG00000188992,13
4,ENSG00000185272,8


In [8]:
# Right join: keep every tested gene, including those with no Bonferroni eQTL
# (their 'affectedTissues' is NaN after this step).
genesWitheQTLTissueCountBonferroni = genesWitheQTLTissueCountBonferroni.merge(
    genesTestedForeQTLsWithNumberOfTissues,
    how='right',
    on='Ensembl Gene ID',
)
genesWitheQTLTissueCountBonferroni

Unnamed: 0,Ensembl Gene ID,affectedTissues,expressedTissues,Chromosome Name,Gene Start (bp),Gene End (bp),Strand
0,ENSG00000138593,4.0,48,15,49280673,49338760,-1
1,ENSG00000166351,1.0,1,21,14982498,15013906,1
2,ENSG00000168675,1.0,47,18,13217497,13652754,1
3,ENSG00000188992,13.0,30,21,15481134,15583166,-1
4,ENSG00000185272,8.0,48,21,15588451,15600693,1
5,ENSG00000182974,1.0,1,15,22368478,22369561,1
6,ENSG00000155304,1.0,48,21,15743436,15755805,-1
7,ENSG00000153575,8.0,48,15,22833395,22873892,1
8,ENSG00000180530,2.0,48,21,16333556,16437321,-1
9,ENSG00000137824,9.0,48,15,41028082,41048049,-1


In [9]:
# Tested genes without any Bonferroni-significant eQTL have NaN after the right
# merge; count them as 0 affected tissues.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the in-place form acts on a temporary Series, which raises a
# FutureWarning on pandas 2.x and does nothing under Copy-on-Write.
genesWitheQTLTissueCountBonferroni['affectedTissues'] = genesWitheQTLTissueCountBonferroni['affectedTissues'].fillna(0)

##### CNVR genes

In [12]:
# Gene IDs from the Zarrei et al. CNVR map gene list. `names=` supplies the
# column label, so the first line of the file is read as data, not as a header.
cnvrGenesPath = '../datasets/geneLists/ZarreiEtAlCNVRMap/inclusiveRegionsChr1-YGenes.txt'
CNVGenes = pd.read_csv(cnvrGenesPath, names=['Ensembl Gene ID'])
CNVGenes.head()

Unnamed: 0,Ensembl Gene ID
0,ENSG00000215405
1,ENSG00000268343
2,ENSG00000230031
3,ENSG00000268531
4,ENSG00000233917


In [13]:
# Flag each gene as 'Y' if it appears in the CNVR gene list and 'N' otherwise.
isCNVGeneBonferroni = genesWitheQTLTissueCountBonferroni['Ensembl Gene ID'].isin(CNVGenes['Ensembl Gene ID'])
genesWitheQTLTissueCountBonferroni.loc[isCNVGeneBonferroni, 'CNV'] = 'Y'
genesWitheQTLTissueCountBonferroni.loc[~isCNVGeneBonferroni, 'CNV'] = 'N'

In [14]:
# Show the Bonferroni table, which now carries the 'CNV' (Y/N) column.
genesWitheQTLTissueCountBonferroni

Unnamed: 0,Ensembl Gene ID,affectedTissues,expressedTissues,Chromosome Name,Gene Start (bp),Gene End (bp),Strand,CNV
0,ENSG00000138593,4.0,48,15,49280673,49338760,-1,N
1,ENSG00000166351,1.0,1,21,14982498,15013906,1,Y
2,ENSG00000168675,1.0,47,18,13217497,13652754,1,Y
3,ENSG00000188992,13.0,30,21,15481134,15583166,-1,N
4,ENSG00000185272,8.0,48,21,15588451,15600693,1,N
5,ENSG00000182974,1.0,1,15,22368478,22369561,1,Y
6,ENSG00000155304,1.0,48,21,15743436,15755805,-1,N
7,ENSG00000153575,8.0,48,15,22833395,22873892,1,Y
8,ENSG00000180530,2.0,48,21,16333556,16437321,-1,Y
9,ENSG00000137824,9.0,48,15,41028082,41048049,-1,N


In [15]:
# Fraction of a gene's expressed tissues in which it has a Bonferroni eQTL.
genesWitheQTLTissueCountBonferroni['propOfExpressedAffectedByeQTL'] = (
    genesWitheQTLTissueCountBonferroni['affectedTissues']
    .div(genesWitheQTLTissueCountBonferroni['expressedTissues'])
)

In [16]:
# Summary statistics for all tested genes, split by CNV status (N / Y).
bonferroniByCNVStatus = genesWitheQTLTissueCountBonferroni.groupby('CNV')
bonferroniByCNVStatus.describe()

Unnamed: 0_level_0,Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene Start (bp),Gene Start (bp),...,expressedTissues,expressedTissues,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
CNV,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
N,11943.0,75518120.0,54872020.0,46870.0,33973641.5,59838601.0,111737519.5,249214145.0,11943.0,75482920.0,...,48.0,48.0,11943.0,0.169079,0.217149,0.0,0.020833,0.083333,0.229167,1.0
Y,7124.0,72795100.0,57588790.0,31427.0,27238655.5,56519174.0,110251788.5,248814185.0,7124.0,72672260.0,...,48.0,48.0,7124.0,0.211814,0.241067,0.0,0.041667,0.125,0.291667,1.0


In [17]:
# Same summary, restricted to genes with an eQTL in at least one tissue.
hasBonferronieQTL = genesWitheQTLTissueCountBonferroni['affectedTissues'] > 0
genesWitheQTLTissueCountBonferroni[hasBonferronieQTL].groupby('CNV').describe()

Unnamed: 0_level_0,Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene Start (bp),Gene Start (bp),...,expressedTissues,expressedTissues,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
CNV,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
N,9517.0,75578340.0,54819670.0,46870.0,33999944.0,60511685.0,111683311.0,249153343.0,9517.0,75540450.0,...,48.0,48.0,9517.0,0.212179,0.223672,0.020833,0.054054,0.125,0.291667,1.0
Y,6221.0,72859040.0,57769290.0,31427.0,26941215.0,56864763.0,110438915.0,248757100.0,6221.0,72730120.0,...,48.0,48.0,6221.0,0.242559,0.243087,0.020833,0.06383,0.145833,0.333333,1.0


In [18]:
# Persist the annotated Bonferroni table. The file is tab-delimited despite
# its .csv extension, and the row index is written as the first column.
bonferroniCNVStatusOutPath = "../outputFiles/genesWitheQTLTissueCountBonferroniAndCNVZarreiStatus.csv"
genesWitheQTLTissueCountBonferroni.to_csv(bonferroniCNVStatusOutPath, sep='\t')

##### Metasoft

In [20]:
# Metasoft eQTL calls: load per-gene affected-tissue counts, right-join onto the
# tested genes, compute the affected proportion and flag CNVR membership.
genesWitheQTLTissueCountMetasoft = pd.read_csv('../outputFiles/GTExV7/genesWithNumberOfMetasoftAffectedTissues.csv', index_col = 0)

# Right join keeps every tested gene; genes without a Metasoft eQTL get NaN.
genesWitheQTLTissueCountMetasoft = pd.merge(genesWitheQTLTissueCountMetasoft, genesTestedForeQTLsWithNumberOfTissues, on='Ensembl Gene ID', how='right')

# Count genes with no Metasoft eQTL as 0 affected tissues. Assign the result back
# instead of calling fillna(inplace=True) on the selected column: the in-place
# form acts on a temporary Series, raises a FutureWarning on pandas 2.x and does
# nothing under Copy-on-Write, which would leave NaN proportions below.
genesWitheQTLTissueCountMetasoft['affectedTissues'] = genesWitheQTLTissueCountMetasoft['affectedTissues'].fillna(0)

# Fraction of a gene's expressed tissues in which it has a Metasoft eQTL.
genesWitheQTLTissueCountMetasoft['propOfExpressedAffectedByeQTL'] = genesWitheQTLTissueCountMetasoft['affectedTissues'] / genesWitheQTLTissueCountMetasoft['expressedTissues']

# 'Y' if the gene appears in the CNVR gene list, 'N' otherwise (mask computed once).
isCNVGeneMetasoft = genesWitheQTLTissueCountMetasoft['Ensembl Gene ID'].isin(CNVGenes['Ensembl Gene ID'])
genesWitheQTLTissueCountMetasoft.loc[isCNVGeneMetasoft, 'CNV'] = 'Y'
genesWitheQTLTissueCountMetasoft.loc[~isCNVGeneMetasoft, 'CNV'] = 'N'
genesWitheQTLTissueCountMetasoft

Unnamed: 0,Ensembl Gene ID,affectedTissues,expressedTissues,Chromosome Name,Gene Start (bp),Gene End (bp),Strand,propOfExpressedAffectedByeQTL,CNV
0,ENSG00000138593,24.0,48,15,49280673,49338760,-1,0.500000,N
1,ENSG00000166157,1.0,3,21,10906201,11029719,-1,0.333333,Y
2,ENSG00000168675,10.0,47,18,13217497,13652754,1,0.212766,Y
3,ENSG00000188992,30.0,30,21,15481134,15583166,-1,1.000000,N
4,ENSG00000185272,48.0,48,21,15588451,15600693,1,1.000000,N
5,ENSG00000155304,23.0,48,21,15743436,15755805,-1,0.479167,N
6,ENSG00000197414,1.0,4,15,22736246,22746002,1,0.250000,Y
7,ENSG00000166200,44.0,48,15,49398268,49447858,-1,0.916667,N
8,ENSG00000155307,4.0,48,21,15857549,15955723,-1,0.083333,N
9,ENSG00000243440,2.0,7,21,15964251,16031142,-1,0.285714,N


In [21]:
# Summary statistics for all tested genes (Metasoft), split by CNV status.
metasoftByCNVStatus = genesWitheQTLTissueCountMetasoft.groupby('CNV')
metasoftByCNVStatus.describe()

Unnamed: 0_level_0,Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene Start (bp),Gene Start (bp),...,expressedTissues,expressedTissues,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
CNV,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
N,11943.0,75518120.0,54872020.0,46870.0,33973641.5,59838601.0,111737519.5,249214145.0,11943.0,75482920.0,...,48.0,48.0,11943.0,0.627126,0.370999,0.0,0.270833,0.75,1.0,1.0
Y,7124.0,72795100.0,57588790.0,31427.0,27238655.5,56519174.0,110251788.5,248814185.0,7124.0,72672260.0,...,48.0,48.0,7124.0,0.691565,0.343244,0.0,0.416667,0.847826,1.0,1.0


In [22]:
# Same summary, restricted to genes with a Metasoft eQTL in at least one tissue.
hasMetasofteQTL = genesWitheQTLTissueCountMetasoft['affectedTissues'] > 0
genesWitheQTLTissueCountMetasoft[hasMetasofteQTL].groupby('CNV').describe()

Unnamed: 0_level_0,Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene Start (bp),Gene Start (bp),...,expressedTissues,expressedTissues,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
CNV,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
N,11097.0,75525770.0,54725110.0,46870.0,34330234.0,60163424.0,111373400.0,249214145.0,11097.0,75488990.0,...,48.0,48.0,11097.0,0.674936,0.340387,0.020833,0.375,0.8125,1.0,1.0
Y,6770.0,72904190.0,57634350.0,31427.0,27200695.5,56872257.5,110304100.0,248437138.0,6770.0,72776920.0,...,48.0,48.0,6770.0,0.727727,0.312503,0.020833,0.5,0.875,1.0,1.0


In [23]:
# Persist the annotated Metasoft table. The file is tab-delimited despite
# its .csv extension, and the row index is written as the first column.
metasoftCNVStatusOutPath = "../outputFiles/genesWitheQTLTissueCountMetasoftAndCNVZarreiStatus.csv"
genesWitheQTLTissueCountMetasoft.to_csv(metasoftCNVStatusOutPath, sep='\t')