# Mammalian conserved copy number genes & eQTLs

### Background

Mammalian conserved copy number (CCN) genes are protein-coding genes with 1-to-1 orthologs across 13 mammalian genomes, as defined by Rice & McLysaght (2017).

Aim: assess whether CCN genes are enriched for eQTLs relative to the other genes that were tested for copy number conservation.

### Imports

In [1]:
import pandas as pd
import re
import os.path

### Functions

In [2]:
def removeGeneIDVersions(text):
    """Return the first Ensembl gene ID found in ``text``, without any version suffix.

    The pattern matches ``ENSG`` followed by one or more digits, so a versioned
    ID such as ``ENSG00000186092.4`` yields ``ENSG00000186092``.

    Parameters
    ----------
    text : str
        String containing an Ensembl gene ID.

    Returns
    -------
    str
        The first ``ENSG<digits>`` substring of ``text``.

    Raises
    ------
    IndexError
        If ``text`` contains no ``ENSG<digits>`` substring.
    """
    # Raw string literal: '\d' inside a plain string is an invalid escape
    # sequence, which recent Python versions warn about.
    match = re.search(r'(ENSG\d+)', text)
    if match is None:
        # Same exception type as indexing an empty findall() result, but with
        # a message that names the offending input.
        raise IndexError('no Ensembl gene ID (ENSG...) found in %r' % (text,))
    return match.group(1)

### Read in genes tested for conserved copy number from Rice & McLysaght (2017)

In [3]:
# Gene IDs tested for copy number conservation. The column name is supplied
# explicitly, so the first line of the file is read as data, not as a header.
genesTestedForCCN = pd.read_csv(
    '../datasets/geneLists/RiceAndMcLysaght/genesTestedForCopyNumberConservationAcrossMammals.txt',
    names=['Ensembl Gene ID'],
)
genesTestedForCCN.head()

Unnamed: 0,Ensembl Gene ID
0,ENSG00000186092
1,ENSG00000235249
2,ENSG00000185097
3,ENSG00000187634
4,ENSG00000268179


### Read in genes with conserved copy number from Rice & McLysaght (2017)

In [4]:
# Gene IDs classified as copy-number conserved. The column name is supplied
# explicitly, so the first line of the file is read as data, not as a header.
CCNGenes = pd.read_csv(
    '../datasets/geneLists/RiceAndMcLysaght/mammalianCopyNumberConservedGenes.txt',
    names=['Ensembl Gene ID'],
)
CCNGenes.head()

Unnamed: 0,Ensembl Gene ID
0,ENSG00000000005
1,ENSG00000000457
2,ENSG00000000460
3,ENSG00000000938
4,ENSG00000001084


### eQTL affected tissues per gene

In [5]:
# Per-gene 'expressedTissues' counts for the genes tested for eQTLs
# (GTExV7 output directory).
genesTestedForeQTLsWithNumberOfTissues = pd.read_csv(
    '../outputFiles/GTExV7/genesTestedWithNumberOfTissues.csv',
)
genesTestedForeQTLsWithNumberOfTissues.head()

Unnamed: 0,expressedTissues,Ensembl Gene ID
0,48,ENSG00000272186
1,48,ENSG00000117748
2,48,ENSG00000130856
3,48,ENSG00000169446
4,48,ENSG00000013573


In [6]:
# Same kind of table as above, read from the "merged tissues" output file.
genesTestedForeQTLsWithNumberOfMergedTissues = pd.read_csv(
    '../outputFiles/GTExV7/genesTestedWithNumberOfMergedTissues.csv',
)
genesTestedForeQTLsWithNumberOfMergedTissues.head()

Unnamed: 0,expressedTissues,Ensembl Gene ID
0,26,ENSG00000170604
1,26,ENSG00000178458
2,26,ENSG00000141425
3,26,ENSG00000117115
4,26,ENSG00000105204


In [7]:
# Protein-coding gene list (Ensembl v75, chromosomes 1-Y per the file name);
# the file is tab-separated.
PCGenes = pd.read_csv(
    '../datasets/geneLists/Ensembl/EnsV75ProteinCodingGenes1-Y.txt',
    sep='\t',
)
PCGenes.head()

Unnamed: 0,Ensembl Gene ID,Chromosome Name,Gene Start (bp),Gene End (bp),Strand
0,ENSG00000215405,15,20737094,20747114,-1
1,ENSG00000268343,15,21004687,21005367,1
2,ENSG00000230031,15,21040701,21071643,-1
3,ENSG00000138593,15,49280673,49338760,-1
4,ENSG00000268531,15,22011370,22012050,1


In [8]:
# Join both tested-gene tables with the protein-coding gene list on the gene ID
# (default inner join), which also attaches the coordinate columns of PCGenes.
# NOTE: both names are rebound to the merged result, so re-running this cell
# without re-running the loading cells merges PCGenes in a second time.
genesTestedForeQTLsWithNumberOfTissues = genesTestedForeQTLsWithNumberOfTissues.merge(
    PCGenes,
    on="Ensembl Gene ID",
)
genesTestedForeQTLsWithNumberOfMergedTissues = genesTestedForeQTLsWithNumberOfMergedTissues.merge(
    PCGenes,
    on="Ensembl Gene ID",
)

In [9]:
# Number of rows left after the merge with the protein-coding gene list.
genesTestedForeQTLsWithNumberOfTissues.shape[0]

19067

##### Bonferroni-corrected

In [10]:
# Per-gene 'affectedTissues' counts from the Bonferroni output file; the first
# column of the file is used as the row index.
genesWitheQTLTissueCountBonferroni = pd.read_csv(
    '../outputFiles/GTExV7/genesWithNumberOfBonferroniAffectedTissues.csv',
    index_col=0,
)
genesWitheQTLTissueCountBonferroni.head()

Unnamed: 0,Ensembl Gene ID,affectedTissues
0,ENSG00000138593,4
1,ENSG00000166351,1
2,ENSG00000168675,1
3,ENSG00000188992,13
4,ENSG00000185272,8


In [11]:
# Right join: keep every row of the tested-gene table. Genes without an entry
# in the Bonferroni table get NaN in 'affectedTissues'.
genesWitheQTLTissueCountBonferroni = genesWitheQTLTissueCountBonferroni.merge(
    genesTestedForeQTLsWithNumberOfTissues,
    on='Ensembl Gene ID',
    how='right',
)
genesWitheQTLTissueCountBonferroni

Unnamed: 0,Ensembl Gene ID,affectedTissues,expressedTissues,Chromosome Name,Gene Start (bp),Gene End (bp),Strand
0,ENSG00000138593,4.0,48,15,49280673,49338760,-1
1,ENSG00000166351,1.0,1,21,14982498,15013906,1
2,ENSG00000168675,1.0,47,18,13217497,13652754,1
3,ENSG00000188992,13.0,30,21,15481134,15583166,-1
4,ENSG00000185272,8.0,48,21,15588451,15600693,1
5,ENSG00000182974,1.0,1,15,22368478,22369561,1
6,ENSG00000155304,1.0,48,21,15743436,15755805,-1
7,ENSG00000153575,8.0,48,15,22833395,22873892,1
8,ENSG00000180530,2.0,48,21,16333556,16437321,-1
9,ENSG00000137824,9.0,48,15,41028082,41048049,-1


In [12]:
# Genes that were tested but have no entry in the Bonferroni table come out of
# the right merge with NaN in 'affectedTissues'; count them as 0 affected tissues.
# The filled column is assigned back rather than filled with inplace=True on the
# column selection: under pandas Copy-on-Write that selection is a temporary
# object, so an in-place fill would never reach the DataFrame.
genesWitheQTLTissueCountBonferroni['affectedTissues'] = genesWitheQTLTissueCountBonferroni['affectedTissues'].fillna(0)

##### Filter only for genes that were tested for copy number conservation

In [13]:
# Inner join with the list of genes tested for copy number conservation:
# rows whose gene ID is not in genesTestedForCCN are dropped.
genesWitheQTLTissueCountBonferroni = genesWitheQTLTissueCountBonferroni.merge(
    genesTestedForCCN,
    on='Ensembl Gene ID',
    how='inner',
)

In [14]:
# Display the table after restricting it to genes present in genesTestedForCCN.
genesWitheQTLTissueCountBonferroni

Unnamed: 0,Ensembl Gene ID,affectedTissues,expressedTissues,Chromosome Name,Gene Start (bp),Gene End (bp),Strand
0,ENSG00000138593,4.0,48,15,49280673,49338760,-1
1,ENSG00000166351,1.0,1,21,14982498,15013906,1
2,ENSG00000168675,1.0,47,18,13217497,13652754,1
3,ENSG00000188992,13.0,30,21,15481134,15583166,-1
4,ENSG00000185272,8.0,48,21,15588451,15600693,1
5,ENSG00000182974,1.0,1,15,22368478,22369561,1
6,ENSG00000155304,1.0,48,21,15743436,15755805,-1
7,ENSG00000153575,8.0,48,15,22833395,22873892,1
8,ENSG00000180530,2.0,48,21,16333556,16437321,-1
9,ENSG00000137824,9.0,48,15,41028082,41048049,-1


In [15]:
# Flag each gene: 'Y' when its ID appears in the CCN gene list, 'N' otherwise.
genesWitheQTLTissueCountBonferroni['CCN'] = (
    genesWitheQTLTissueCountBonferroni['Ensembl Gene ID']
    .isin(CCNGenes['Ensembl Gene ID'])
    .map({True: 'Y', False: 'N'})
)

In [16]:
# Per-gene ratio of 'affectedTissues' to 'expressedTissues'.
genesWitheQTLTissueCountBonferroni['propOfExpressedAffectedByeQTL'] = (
    genesWitheQTLTissueCountBonferroni['affectedTissues']
    .div(genesWitheQTLTissueCountBonferroni['expressedTissues'])
)

In [17]:
# Display the table with the added 'CCN' and 'propOfExpressedAffectedByeQTL' columns.
genesWitheQTLTissueCountBonferroni

Unnamed: 0,Ensembl Gene ID,affectedTissues,expressedTissues,Chromosome Name,Gene Start (bp),Gene End (bp),Strand,CCN,propOfExpressedAffectedByeQTL
0,ENSG00000138593,4.0,48,15,49280673,49338760,-1,Y,0.083333
1,ENSG00000166351,1.0,1,21,14982498,15013906,1,N,1.000000
2,ENSG00000168675,1.0,47,18,13217497,13652754,1,Y,0.021277
3,ENSG00000188992,13.0,30,21,15481134,15583166,-1,N,0.433333
4,ENSG00000185272,8.0,48,21,15588451,15600693,1,N,0.166667
5,ENSG00000182974,1.0,1,15,22368478,22369561,1,N,1.000000
6,ENSG00000155304,1.0,48,21,15743436,15755805,-1,Y,0.020833
7,ENSG00000153575,8.0,48,15,22833395,22873892,1,Y,0.166667
8,ENSG00000180530,2.0,48,21,16333556,16437321,-1,Y,0.041667
9,ENSG00000137824,9.0,48,15,41028082,41048049,-1,N,0.187500


In [18]:
# Summary statistics per CCN group ('N' / 'Y').
(
    genesWitheQTLTissueCountBonferroni
    .groupby('CCN')
    .describe()
)

Unnamed: 0_level_0,Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene Start (bp),Gene Start (bp),...,expressedTissues,expressedTissues,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
CCN,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
N,11470.0,72455350.0,56362080.0,31427.0,29707233.75,56222983.5,108537300.0,249214145.0,11470.0,72391340.0,...,48.0,48.0,11470.0,0.192231,0.236953,0.0,0.020833,0.104167,0.270833,1.0
Y,6932.0,77895750.0,54899910.0,135852.0,35400657.75,65324189.0,114418300.0,249153343.0,6932.0,77816830.0,...,48.0,48.0,6932.0,0.16946,0.198499,0.0,0.04,0.104167,0.229167,1.0


In [19]:
# Persist the annotated table. The file is written tab-separated (sep='\t')
# even though its name ends in .csv.
genesWitheQTLTissueCountBonferroni.to_csv(
    "../outputFiles/genesWitheQTLTissueCountBonferroniAndCCNStatus.csv",
    sep='\t',
)

##### Metasoft

In [20]:
# Same steps as for the Bonferroni table, applied to the Metasoft counts.
genesWitheQTLTissueCountMetasoft = pd.read_csv('../outputFiles/GTExV7/genesWithNumberOfMetasoftAffectedTissues.csv', index_col = 0)

# Right join keeps every row of the tested-gene table; genes without an entry
# in the Metasoft table get NaN in 'affectedTissues'.
genesWitheQTLTissueCountMetasoft = pd.merge(genesWitheQTLTissueCountMetasoft, genesTestedForeQTLsWithNumberOfTissues, on='Ensembl Gene ID', how='right')

# Count those genes as 0 affected tissues. The filled column is assigned back
# rather than filled with inplace=True on the column selection: under pandas
# Copy-on-Write that selection is a temporary object, so an in-place fill
# would never reach the DataFrame.
genesWitheQTLTissueCountMetasoft['affectedTissues'] = genesWitheQTLTissueCountMetasoft['affectedTissues'].fillna(0)

# Keep only genes that were tested for copy number conservation.
genesWitheQTLTissueCountMetasoft = pd.merge(genesWitheQTLTissueCountMetasoft, genesTestedForCCN, on='Ensembl Gene ID', how='inner')

# Flag each gene: 'Y' when its ID appears in the CCN gene list, 'N' otherwise.
genesWitheQTLTissueCountMetasoft['CCN'] = 'N'
genesWitheQTLTissueCountMetasoft.loc[genesWitheQTLTissueCountMetasoft['Ensembl Gene ID'].isin(CCNGenes['Ensembl Gene ID']) , 'CCN'] = 'Y'

# Per-gene ratio of 'affectedTissues' to 'expressedTissues'.
genesWitheQTLTissueCountMetasoft['propOfExpressedAffectedByeQTL'] = genesWitheQTLTissueCountMetasoft['affectedTissues'] / genesWitheQTLTissueCountMetasoft['expressedTissues']
genesWitheQTLTissueCountMetasoft

Unnamed: 0,Ensembl Gene ID,affectedTissues,expressedTissues,Chromosome Name,Gene Start (bp),Gene End (bp),Strand,CCN,propOfExpressedAffectedByeQTL
0,ENSG00000138593,24.0,48,15,49280673,49338760,-1,Y,0.500000
1,ENSG00000166157,1.0,3,21,10906201,11029719,-1,N,0.333333
2,ENSG00000168675,10.0,47,18,13217497,13652754,1,Y,0.212766
3,ENSG00000188992,30.0,30,21,15481134,15583166,-1,N,1.000000
4,ENSG00000185272,48.0,48,21,15588451,15600693,1,N,1.000000
5,ENSG00000155304,23.0,48,21,15743436,15755805,-1,Y,0.479167
6,ENSG00000197414,1.0,4,15,22736246,22746002,1,N,0.250000
7,ENSG00000166200,44.0,48,15,49398268,49447858,-1,N,0.916667
8,ENSG00000155307,4.0,48,21,15857549,15955723,-1,Y,0.083333
9,ENSG00000243440,2.0,7,21,15964251,16031142,-1,N,0.285714


In [21]:
# Summary statistics per CCN group ('N' / 'Y') for the Metasoft table.
(
    genesWitheQTLTissueCountMetasoft
    .groupby('CCN')
    .describe()
)

Unnamed: 0_level_0,Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene Start (bp),Gene Start (bp),...,expressedTissues,expressedTissues,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
CCN,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
N,11470.0,72455350.0,56362080.0,31427.0,29707233.75,56222983.5,108537300.0,249214145.0,11470.0,72391340.0,...,48.0,48.0,11470.0,0.652846,0.367199,0.0,0.3125,0.8125,1.0,1.0
Y,6932.0,77895750.0,54899910.0,135852.0,35400657.75,65324189.0,114418300.0,249153343.0,6932.0,77816830.0,...,48.0,48.0,6932.0,0.662746,0.342795,0.0,0.359744,0.789474,1.0,1.0


In [22]:
# Same summary, restricted to genes with at least one affected tissue.
(
    genesWitheQTLTissueCountMetasoft
    .loc[genesWitheQTLTissueCountMetasoft['affectedTissues'].gt(0)]
    .groupby('CCN')
    .describe()
)

Unnamed: 0_level_0,Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene End (bp),Gene Start (bp),Gene Start (bp),...,expressedTissues,expressedTissues,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL,propOfExpressedAffectedByeQTL
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
CCN,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
N,10655.0,72294470.0,56136440.0,31427.0,30013956.0,56314322.0,107348318.0,249214145.0,10655.0,72226850.0,...,48.0,48.0,10655.0,0.702782,0.331741,0.020833,0.425532,0.854167,1.0,1.0
Y,6740.0,78066350.0,55134800.0,135852.0,35276306.5,65383868.0,114668927.5,249153343.0,6740.0,77986440.0,...,48.0,48.0,6740.0,0.681625,0.328611,0.020833,0.4,0.811655,1.0,1.0


In [23]:
# Persist the annotated Metasoft table. The file is written tab-separated
# (sep='\t') even though its name ends in .csv.
genesWitheQTLTissueCountMetasoft.to_csv(
    "../outputFiles/genesWitheQTLTissueCountMetasoftAndCCNStatus.csv",
    sep='\t',
)