# Compare effect size of eQTLs


### Imports

In [1]:
import pandas as pd
import mysql.connector
from sqlalchemy import create_engine
import re
import urllib.request
import os

In [2]:
# Database connection used by every SQL query in this notebook.
# The URL (which embeds the user name and password) can be supplied through the
# GTEX_DB_URL environment variable so credentials do not have to be committed
# with the notebook; when the variable is unset the original local URL is used.
engine = create_engine(
    os.environ.get('GTEX_DB_URL', 'mysql+mysqlconnector://jupyter:password@localhost:3306/gtex'),
    echo=False,
)

### Functions

Remove version numbers from Ensembl Gene IDs, e.g. 'ENSG000000001.8' becomes 'ENSG000000001'. This enables comparison between IDs from different sources.

In [3]:
def removeGeneIDVersions(text):
    """Return the unversioned Ensembl gene ID found in ``text``.

    Example: ``'ENSG00000225972.8'`` -> ``'ENSG00000225972'``.

    Parameters
    ----------
    text : str
        String containing an Ensembl gene ID (``ENSG`` followed by digits).

    Returns
    -------
    str
        The first ``ENSG<digits>`` substring of ``text``.

    Raises
    ------
    IndexError
        If ``text`` contains no ``ENSG<digits>`` substring. The exception type
        matches the previous list-indexing implementation; it now carries a
        message naming the offending value.
    """
    # Raw string: '\d' inside a plain string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    match = re.search(r'ENSG\d+', text)
    if match is None:
        raise IndexError('no Ensembl gene ID (ENSG...) found in {!r}'.format(text))
    return match.group(0)

### Constants

#### List of GTEx tissues

In [4]:
# GTEx tissue names, one entry per tissue, in alphabetical order.
tissues = [
    'Adipose - Subcutaneous',
    'Adipose - Visceral (Omentum)',
    'Adrenal Gland',
    'Artery - Aorta',
    'Artery - Coronary',
    'Artery - Tibial',
    'Brain - Amygdala',
    'Brain - Anterior cingulate cortex (BA24)',
    'Brain - Caudate (basal ganglia)',
    'Brain - Cerebellar Hemisphere',
    'Brain - Cerebellum',
    'Brain - Cortex',
    'Brain - Frontal Cortex (BA9)',
    'Brain - Hippocampus',
    'Brain - Hypothalamus',
    'Brain - Nucleus accumbens (basal ganglia)',
    'Brain - Putamen (basal ganglia)',
    'Brain - Spinal cord (cervical c-1)',
    'Brain - Substantia nigra',
    'Breast - Mammary Tissue',
    'Cells - EBV-transformed lymphocytes',
    'Cells - Transformed fibroblasts',
    'Colon - Sigmoid',
    'Colon - Transverse',
    'Esophagus - Gastroesophageal Junction',
    'Esophagus - Mucosa',
    'Esophagus - Muscularis',
    'Heart - Atrial Appendage',
    'Heart - Left Ventricle',
    'Liver',
    'Lung',
    'Minor Salivary Gland',
    'Muscle - Skeletal',
    'Nerve - Tibial',
    'Ovary',
    'Pancreas',
    'Pituitary',
    'Prostate',
    'Skin - Not Sun Exposed (Suprapubic)',
    'Skin - Sun Exposed (Lower leg)',
    'Small Intestine - Terminal Ileum',
    'Spleen',
    'Stomach',
    'Testis',
    'Thyroid',
    'Uterus',
    'Vagina',
    'Whole Blood',
]

---

## Analysis

### Get eQTLs and effect sizes

#### All eQTLs

In [5]:
# Every gene-variant pair flagged as significant after Bonferroni correction,
# with its regression slope and the tissue it was found in.
allEqtlQuery = 'SELECT gene_id as "Ensembl Gene ID", variant_id, slope, tissue FROM `v7` WHERE sigAfterBonferroni = 1'
eQTLsWithSlope = pd.read_sql_query(allEqtlQuery, engine, coerce_float=True)

# Strip the version suffix so the IDs match the unversioned IDs used in the merges below.
geneIdColumn = "Ensembl Gene ID"
eQTLsWithSlope[geneIdColumn] = eQTLsWithSlope[geneIdColumn].apply(removeGeneIDVersions)

In [6]:
# Effect size is the magnitude of the slope, ignoring its sign.
eQTLsWithSlope['effectSize'] = abs(eQTLsWithSlope['slope'])

In [7]:
# Show the table of all significant eQTLs as this cell's output.
eQTLsWithSlope

Unnamed: 0,Ensembl Gene ID,variant_id,slope,tissue,effectSize
0,ENSG00000225972,1_1093469_G_A_b37,-1.006120,Adipose - Subcutaneous,1.006120
1,ENSG00000248527,1_1169637_T_C_b37,-0.852272,Adipose - Subcutaneous,0.852272
2,ENSG00000248527,1_1170587_C_T_b37,-0.852272,Adipose - Subcutaneous,0.852272
3,ENSG00000224956,1_592297_C_T_b37,-1.182070,Adipose - Subcutaneous,1.182070
4,ENSG00000224956,1_706388_T_C_b37,-1.032600,Adipose - Subcutaneous,1.032600
5,ENSG00000224956,1_710088_C_A_b37,-1.234130,Adipose - Subcutaneous,1.234130
6,ENSG00000224956,1_710195_A_G_b37,-1.213040,Adipose - Subcutaneous,1.213040
7,ENSG00000224956,1_712515_T_TC_b37,-0.906898,Adipose - Subcutaneous,0.906898
8,ENSG00000224956,1_713092_G_A_b37,-1.233040,Adipose - Subcutaneous,1.233040
9,ENSG00000224956,1_714059_C_CCGCCCTTGTGACGTCACGGAAGGCG_b37,-1.297860,Adipose - Subcutaneous,1.297860


#### Only most significant eQTLs

In [8]:
# Same selection as the "all eQTLs" query, restricted to rows whose nominal
# p-value equals the stored min_pval_nominal value.
topEqtlQuery = 'SELECT gene_id as "Ensembl Gene ID", variant_id, slope, tissue FROM `v7` WHERE sigAfterBonferroni = 1 AND pval_nominal = min_pval_nominal'
eQTLsWithSlopeMin = pd.read_sql_query(topEqtlQuery, engine, coerce_float=True)

# Strip the version suffix so the IDs match the unversioned IDs used in the merges below.
geneIdColumn = "Ensembl Gene ID"
eQTLsWithSlopeMin[geneIdColumn] = eQTLsWithSlopeMin[geneIdColumn].apply(removeGeneIDVersions)

In [9]:
# Effect size is the magnitude of the slope, ignoring its sign.
eQTLsWithSlopeMin['effectSize'] = abs(eQTLsWithSlopeMin['slope'])

In [10]:
# Keep a single row per (gene, tissue) pair: the first one encountered.
# NOTE(review): "first" follows the row order returned by the SQL query, which
# has no ORDER BY - confirm that this tie-breaking between equally significant
# variants is acceptable.
eQTLsWithSlopeMin = eQTLsWithSlopeMin.drop_duplicates(
    subset=['Ensembl Gene ID', 'tissue'],
    keep='first',
)

In [11]:
# Show the de-duplicated table (one row per gene and tissue) as this cell's output.
eQTLsWithSlopeMin

Unnamed: 0,Ensembl Gene ID,variant_id,slope,tissue,effectSize
0,ENSG00000225972,1_1093469_G_A_b37,-1.006120,Adipose - Subcutaneous,1.006120
1,ENSG00000248527,1_1169637_T_C_b37,-0.852272,Adipose - Subcutaneous,0.852272
3,ENSG00000224956,1_732801_A_G_b37,-1.064400,Adipose - Subcutaneous,1.064400
4,ENSG00000228327,1_737085_C_T_b37,0.936611,Adipose - Subcutaneous,0.936611
5,ENSG00000237491,1_752478_G_A_b37,1.119980,Adipose - Subcutaneous,1.119980
6,ENSG00000230092,1_752593_T_G_b37,1.006650,Adipose - Subcutaneous,1.006650
7,ENSG00000177757,1_762886_T_G_b37,0.862495,Adipose - Subcutaneous,0.862495
8,ENSG00000225880,1_754861_T_G_b37,1.001020,Adipose - Subcutaneous,1.001020
9,ENSG00000188976,1_884091_C_CACCCTGGTCCCCCTGGTCCCTTTGGCCCTGCACC...,-0.538912,Adipose - Subcutaneous,0.538912
10,ENSG00000187961,1_879676_G_A_b37,0.324954,Adipose - Subcutaneous,0.324954


### Ohnologs

#### All eQTLs

In [55]:
# Per-gene table carrying the duplication category in its 'type' column
# (the file is tab-separated despite its .csv extension).
genesWitheQTLTissueCountBonferroniAndOhnologStatus = pd.read_csv(
    '../../outputFiles/genesWitheQTLTissueCountBonferroniAndOhnologStatus.csv',
    sep="\t",
    index_col="Unnamed: 0",
)

# Label every significant eQTL with its gene's 'type'; the inner join keeps
# only genes present in both tables.
merge1 = (
    genesWitheQTLTissueCountBonferroniAndOhnologStatus[['Ensembl Gene ID', 'type']]
    .merge(eQTLsWithSlope, how="inner", on="Ensembl Gene ID")
)

In [56]:
# Show the merged table (gene 'type' joined onto all eQTLs) as this cell's output.
merge1

Unnamed: 0,Ensembl Gene ID,type,variant_id,slope,tissue,effectSize
0,ENSG00000138593,ohno,15_49209986_C_CT_b37,-0.791940,Esophagus - Gastroesophageal Junction,0.791940
1,ENSG00000138593,ohno,15_49253961_C_A_b37,-0.132532,Lung,0.132532
2,ENSG00000138593,ohno,15_49260601_A_G_b37,0.145181,Lung,0.145181
3,ENSG00000138593,ohno,15_49261516_A_C_b37,0.143796,Lung,0.143796
4,ENSG00000138593,ohno,15_49263951_G_A_b37,0.148980,Lung,0.148980
5,ENSG00000138593,ohno,15_49272084_C_T_b37,0.199498,Lung,0.199498
6,ENSG00000138593,ohno,15_49273571_C_G_b37,0.198058,Lung,0.198058
7,ENSG00000138593,ohno,15_49273619_C_A_b37,0.198058,Lung,0.198058
8,ENSG00000138593,ohno,15_49276416_G_T_b37,0.154227,Lung,0.154227
9,ENSG00000138593,ohno,15_49277795_C_A_b37,0.148026,Lung,0.148026


In [57]:
# Summary statistics (count, mean, std, min, quartiles, max) of effectSize per 'type' group.
(
    merge1
    .groupby(by='type')['effectSize']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SSD,5704389.0,0.511213,0.279523,0.058667,0.302631,0.446464,0.656283,7.31012
ohno,3372543.0,0.419058,0.245244,0.066592,0.250132,0.359044,0.519662,6.77595
singleton,4866258.0,0.449465,0.251558,0.063101,0.269129,0.389278,0.564749,12.2275


In [58]:
# Save the merged table as an xz-compressed CSV without the index column.
# The output folder is created first: to_csv does not create missing parent
# directories and would otherwise fail here on a fresh checkout.
os.makedirs('../../outputFiles/eQTLEffectSize', exist_ok=True)
merge1.to_csv('../../outputFiles/eQTLEffectSize/effectSizeAllOhnologs.csv.xz', index=False, compression="xz")

#### Only most significant eQTLs 

In [59]:
# Label the de-duplicated (one row per gene and tissue) eQTLs with the gene's
# 'type'; the inner join keeps only genes present in both tables.
merge2 = (
    genesWitheQTLTissueCountBonferroniAndOhnologStatus[['Ensembl Gene ID', 'type']]
    .merge(eQTLsWithSlopeMin, how="inner", on="Ensembl Gene ID")
)

In [60]:
# Show the merged table (gene 'type' joined onto eQTLsWithSlopeMin) as this cell's output.
merge2

Unnamed: 0,Ensembl Gene ID,type,variant_id,slope,tissue,effectSize
0,ENSG00000138593,ohno,15_49209986_C_CT_b37,-0.791940,Esophagus - Gastroesophageal Junction,0.791940
1,ENSG00000138593,ohno,15_49376617_GC_G_b37,0.268425,Lung,0.268425
2,ENSG00000138593,ohno,15_49624784_G_C_b37,-0.122000,Nerve - Tibial,0.122000
3,ENSG00000138593,ohno,15_49376617_GC_G_b37,0.352287,Thyroid,0.352287
4,ENSG00000166351,SSD,21_14907589_T_G_b37,0.483956,Testis,0.483956
5,ENSG00000168675,SSD,18_13564699_G_GC_b37,0.256999,Artery - Aorta,0.256999
6,ENSG00000188992,SSD,21_15385969_C_A_b37,-0.458562,Adipose - Visceral (Omentum),0.458562
7,ENSG00000188992,SSD,21_15442450_CGTG_C_b37,-0.536872,Artery - Tibial,0.536872
8,ENSG00000188992,SSD,21_15312788_C_T_b37,-1.342220,Brain - Cerebellar Hemisphere,1.342220
9,ENSG00000188992,SSD,21_15432737_A_G_b37,-1.597840,Brain - Cerebellum,1.597840


In [61]:
# Summary statistics (count, mean, std, min, quartiles, max) of effectSize per 'type' group.
(
    merge2
    .groupby(by='type')['effectSize']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SSD,53283.0,0.57017,0.383372,0.071583,0.312497,0.477057,0.721229,7.31012
ohno,43754.0,0.505874,0.394973,0.075759,0.270934,0.40322,0.610534,6.77595
singleton,49187.0,0.527007,0.363898,0.068411,0.293015,0.435874,0.653753,12.2275


In [62]:
# Save the merged table as an xz-compressed CSV without the index column.
# The output folder is created first: to_csv does not create missing parent
# directories and would otherwise fail here on a fresh checkout.
os.makedirs('../../outputFiles/eQTLEffectSize', exist_ok=True)
merge2.to_csv('../../outputFiles/eQTLEffectSize/effectSizeMostSignificantOhnologs.csv.xz', index=False, compression='xz')

### CNVs (Zarrei et al. CNV map)

#### All eQTLs

In [63]:
# Per-gene table carrying a Y/N flag in its 'CNV' column
# (the file is tab-separated despite its .csv extension).
genesWitheQTLTissueCountBonferroniAndCNVZarreiStatus = pd.read_csv(
    '../../outputFiles/genesWitheQTLTissueCountBonferroniAndCNVZarreiStatus.csv',
    sep="\t",
    index_col="Unnamed: 0",
)

# Label every significant eQTL with its gene's 'CNV' flag; the inner join
# keeps only genes present in both tables.
merge1 = (
    genesWitheQTLTissueCountBonferroniAndCNVZarreiStatus[['Ensembl Gene ID', 'CNV']]
    .merge(eQTLsWithSlope, how="inner", on="Ensembl Gene ID")
)

In [64]:
# Show the merged table (Zarrei 'CNV' flag joined onto all eQTLs) as this cell's output.
merge1

Unnamed: 0,Ensembl Gene ID,CNV,variant_id,slope,tissue,effectSize
0,ENSG00000138593,N,15_49209986_C_CT_b37,-0.791940,Esophagus - Gastroesophageal Junction,0.791940
1,ENSG00000138593,N,15_49253961_C_A_b37,-0.132532,Lung,0.132532
2,ENSG00000138593,N,15_49260601_A_G_b37,0.145181,Lung,0.145181
3,ENSG00000138593,N,15_49261516_A_C_b37,0.143796,Lung,0.143796
4,ENSG00000138593,N,15_49263951_G_A_b37,0.148980,Lung,0.148980
5,ENSG00000138593,N,15_49272084_C_T_b37,0.199498,Lung,0.199498
6,ENSG00000138593,N,15_49273571_C_G_b37,0.198058,Lung,0.198058
7,ENSG00000138593,N,15_49273619_C_A_b37,0.198058,Lung,0.198058
8,ENSG00000138593,N,15_49276416_G_T_b37,0.154227,Lung,0.154227
9,ENSG00000138593,N,15_49277795_C_A_b37,0.148026,Lung,0.148026


In [65]:
# Summary statistics (count, mean, std, min, quartiles, max) of effectSize per 'CNV' group.
(
    merge1
    .groupby(by='CNV')['effectSize']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
CNV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
N,7068287.0,0.424135,0.242887,0.058667,0.25315,0.36424,0.529408,7.31012
Y,6874903.0,0.511826,0.278464,0.063101,0.305171,0.448389,0.65452,12.2275


In [66]:
# Save the merged table as an xz-compressed CSV without the index column.
# The output folder is created first: to_csv does not create missing parent
# directories and would otherwise fail here on a fresh checkout.
os.makedirs('../../outputFiles/eQTLEffectSize', exist_ok=True)
merge1.to_csv('../../outputFiles/eQTLEffectSize/effectSizeAllCNVsZarrei.csv.xz', index=False, compression='xz')

#### Only most significant eQTLs 

In [67]:
# Label the de-duplicated (one row per gene and tissue) eQTLs with the gene's
# 'CNV' flag; the inner join keeps only genes present in both tables.
merge2 = (
    genesWitheQTLTissueCountBonferroniAndCNVZarreiStatus[['Ensembl Gene ID', 'CNV']]
    .merge(eQTLsWithSlopeMin, how="inner", on="Ensembl Gene ID")
)

In [68]:
# Show the merged table (Zarrei 'CNV' flag joined onto eQTLsWithSlopeMin) as this cell's output.
merge2

Unnamed: 0,Ensembl Gene ID,CNV,variant_id,slope,tissue,effectSize
0,ENSG00000138593,N,15_49209986_C_CT_b37,-0.791940,Esophagus - Gastroesophageal Junction,0.791940
1,ENSG00000138593,N,15_49376617_GC_G_b37,0.268425,Lung,0.268425
2,ENSG00000138593,N,15_49624784_G_C_b37,-0.122000,Nerve - Tibial,0.122000
3,ENSG00000138593,N,15_49376617_GC_G_b37,0.352287,Thyroid,0.352287
4,ENSG00000166351,Y,21_14907589_T_G_b37,0.483956,Testis,0.483956
5,ENSG00000168675,Y,18_13564699_G_GC_b37,0.256999,Artery - Aorta,0.256999
6,ENSG00000188992,N,21_15385969_C_A_b37,-0.458562,Adipose - Visceral (Omentum),0.458562
7,ENSG00000188992,N,21_15442450_CGTG_C_b37,-0.536872,Artery - Tibial,0.536872
8,ENSG00000188992,N,21_15312788_C_T_b37,-1.342220,Brain - Cerebellar Hemisphere,1.342220
9,ENSG00000188992,N,21_15432737_A_G_b37,-1.597840,Brain - Cerebellum,1.597840


In [69]:
# Summary statistics (count, mean, std, min, quartiles, max) of effectSize per 'CNV' group.
(
    merge2
    .groupby(by='CNV')['effectSize']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
CNV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
N,83908.0,0.5115,0.36954,0.071583,0.279712,0.417771,0.630277,7.31012
Y,62316.0,0.569955,0.394444,0.068411,0.310711,0.473653,0.716484,12.2275


In [70]:
# Save the merged table as an xz-compressed CSV without the index column.
# The output folder is created first: to_csv does not create missing parent
# directories and would otherwise fail here on a fresh checkout.
os.makedirs('../../outputFiles/eQTLEffectSize', exist_ok=True)
merge2.to_csv('../../outputFiles/eQTLEffectSize/effectSizeMostSignificantCNVsZarrei.csv.xz', index=False, compression='xz')

### CNVs (ExAC CNV data)

#### All eQTLs

In [71]:
# Per-gene table carrying a Y/N flag in its 'CNV' column
# (the file is tab-separated despite its .csv extension).
genesWitheQTLTissueCountBonferroniAndCNVExACStatus = pd.read_csv(
    '../../outputFiles/genesWitheQTLTissueCountBonferroniAndCNVExACStatus.csv',
    sep="\t",
    index_col="Unnamed: 0",
)

# Label every significant eQTL with its gene's 'CNV' flag; the inner join
# keeps only genes present in both tables.
merge1 = (
    genesWitheQTLTissueCountBonferroniAndCNVExACStatus[['Ensembl Gene ID', 'CNV']]
    .merge(eQTLsWithSlope, how="inner", on="Ensembl Gene ID")
)

In [72]:
# Show the merged table (ExAC 'CNV' flag joined onto all eQTLs) as this cell's output.
merge1

Unnamed: 0,Ensembl Gene ID,CNV,variant_id,slope,tissue,effectSize
0,ENSG00000138593,N,15_49209986_C_CT_b37,-0.791940,Esophagus - Gastroesophageal Junction,0.791940
1,ENSG00000138593,N,15_49253961_C_A_b37,-0.132532,Lung,0.132532
2,ENSG00000138593,N,15_49260601_A_G_b37,0.145181,Lung,0.145181
3,ENSG00000138593,N,15_49261516_A_C_b37,0.143796,Lung,0.143796
4,ENSG00000138593,N,15_49263951_G_A_b37,0.148980,Lung,0.148980
5,ENSG00000138593,N,15_49272084_C_T_b37,0.199498,Lung,0.199498
6,ENSG00000138593,N,15_49273571_C_G_b37,0.198058,Lung,0.198058
7,ENSG00000138593,N,15_49273619_C_A_b37,0.198058,Lung,0.198058
8,ENSG00000138593,N,15_49276416_G_T_b37,0.154227,Lung,0.154227
9,ENSG00000138593,N,15_49277795_C_A_b37,0.148026,Lung,0.148026


In [73]:
# Summary statistics (count, mean, std, min, quartiles, max) of effectSize per 'CNV' group.
(
    merge1
    .groupby(by='CNV')['effectSize']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
CNV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
N,754091.0,0.429795,0.26814,0.069058,0.244908,0.356103,0.531783,5.7383
Y,9560019.0,0.437981,0.251538,0.058667,0.260327,0.375801,0.547984,12.2275


In [74]:
# Save the merged table as an xz-compressed CSV without the index column.
# The output folder is created first: to_csv does not create missing parent
# directories and would otherwise fail here on a fresh checkout.
os.makedirs('../../outputFiles/eQTLEffectSize', exist_ok=True)
merge1.to_csv('../../outputFiles/eQTLEffectSize/effectSizeAllCNVsExAC.csv.xz', index=False, compression='xz')

#### Only most significant eQTLs 

In [75]:
# Label the de-duplicated (one row per gene and tissue) eQTLs with the gene's
# 'CNV' flag; the inner join keeps only genes present in both tables.
merge2 = (
    genesWitheQTLTissueCountBonferroniAndCNVExACStatus[['Ensembl Gene ID', 'CNV']]
    .merge(eQTLsWithSlopeMin, how="inner", on="Ensembl Gene ID")
)

In [76]:
# Show the merged table (ExAC 'CNV' flag joined onto eQTLsWithSlopeMin) as this cell's output.
merge2

Unnamed: 0,Ensembl Gene ID,CNV,variant_id,slope,tissue,effectSize
0,ENSG00000138593,N,15_49209986_C_CT_b37,-0.791940,Esophagus - Gastroesophageal Junction,0.791940
1,ENSG00000138593,N,15_49376617_GC_G_b37,0.268425,Lung,0.268425
2,ENSG00000138593,N,15_49624784_G_C_b37,-0.122000,Nerve - Tibial,0.122000
3,ENSG00000138593,N,15_49376617_GC_G_b37,0.352287,Thyroid,0.352287
4,ENSG00000168675,Y,18_13564699_G_GC_b37,0.256999,Artery - Aorta,0.256999
5,ENSG00000185272,Y,21_15550182_A_G_b37,0.405344,Colon - Transverse,0.405344
6,ENSG00000185272,Y,21_15310398_A_G_b37,-0.746203,Esophagus - Mucosa,0.746203
7,ENSG00000185272,Y,21_15613985_G_A_b37,0.430646,Lung,0.430646
8,ENSG00000185272,Y,21_15352877_T_C_b37,0.453413,Nerve - Tibial,0.453413
9,ENSG00000185272,Y,21_15597297_T_C_b37,0.371142,Pituitary,0.371142


In [77]:
# Summary statistics (count, mean, std, min, quartiles, max) of effectSize per 'CNV' group.
(
    merge2
    .groupby(by='CNV')['effectSize']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
CNV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
N,10602.0,0.502952,0.383949,0.078558,0.268255,0.399004,0.614947,5.7383
Y,106134.0,0.515809,0.374508,0.071583,0.283132,0.421932,0.633945,12.2275


In [78]:
# Save the merged table as an xz-compressed CSV without the index column.
# The output folder is created first: to_csv does not create missing parent
# directories and would otherwise fail here on a fresh checkout.
os.makedirs('../../outputFiles/eQTLEffectSize', exist_ok=True)
merge2.to_csv('../../outputFiles/eQTLEffectSize/effectSizeMostSignificantCNVsExAC.csv.xz', index=False, compression='xz')

### Haploinsufficient genes

#### All eQTLs

In [79]:
# Per-gene table carrying a Y/N flag in its 'haplo' column
# (the file is tab-separated despite its .csv extension).
genesWitheQTLTissueCountBonferroniAndHaploStatus = pd.read_csv(
    '../../outputFiles/genesWitheQTLTissueCountBonferroniAndHaploStatus.csv',
    sep="\t",
    index_col="Unnamed: 0",
)

# Label every significant eQTL with its gene's 'haplo' flag; the inner join
# keeps only genes present in both tables.
merge1 = (
    genesWitheQTLTissueCountBonferroniAndHaploStatus[['Ensembl Gene ID', 'haplo']]
    .merge(eQTLsWithSlope, how="inner", on="Ensembl Gene ID")
)

In [80]:
# Show the merged table ('haplo' flag joined onto all eQTLs) as this cell's output.
merge1

Unnamed: 0,Ensembl Gene ID,haplo,variant_id,slope,tissue,effectSize
0,ENSG00000138593,N,15_49209986_C_CT_b37,-0.791940,Esophagus - Gastroesophageal Junction,0.791940
1,ENSG00000138593,N,15_49253961_C_A_b37,-0.132532,Lung,0.132532
2,ENSG00000138593,N,15_49260601_A_G_b37,0.145181,Lung,0.145181
3,ENSG00000138593,N,15_49261516_A_C_b37,0.143796,Lung,0.143796
4,ENSG00000138593,N,15_49263951_G_A_b37,0.148980,Lung,0.148980
5,ENSG00000138593,N,15_49272084_C_T_b37,0.199498,Lung,0.199498
6,ENSG00000138593,N,15_49273571_C_G_b37,0.198058,Lung,0.198058
7,ENSG00000138593,N,15_49273619_C_A_b37,0.198058,Lung,0.198058
8,ENSG00000138593,N,15_49276416_G_T_b37,0.154227,Lung,0.154227
9,ENSG00000138593,N,15_49277795_C_A_b37,0.148026,Lung,0.148026


In [81]:
# Summary statistics (count, mean, std, min, quartiles, max) of effectSize per 'haplo' group.
(
    merge1
    .groupby(by='haplo')['effectSize']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
haplo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
N,11410765.0,0.469246,0.26098,0.058667,0.280738,0.407117,0.593894,12.2275
Y,1284536.0,0.358881,0.211558,0.066592,0.213115,0.302376,0.448202,6.50876


In [82]:
# Save the merged table as an xz-compressed CSV without the index column.
# The output folder is created first: to_csv does not create missing parent
# directories and would otherwise fail here on a fresh checkout.
os.makedirs('../../outputFiles/eQTLEffectSize', exist_ok=True)
merge1.to_csv('../../outputFiles/eQTLEffectSize/effectSizeAllHaplo.csv.xz', index=False, compression='xz')

#### Only most significant eQTLs 

In [83]:
# Label the de-duplicated (one row per gene and tissue) eQTLs with the gene's
# 'haplo' flag; the inner join keeps only genes present in both tables.
merge2 = (
    genesWitheQTLTissueCountBonferroniAndHaploStatus[['Ensembl Gene ID', 'haplo']]
    .merge(eQTLsWithSlopeMin, how="inner", on="Ensembl Gene ID")
)

In [84]:
# Show the merged table ('haplo' flag joined onto eQTLsWithSlopeMin) as this cell's output.
merge2

Unnamed: 0,Ensembl Gene ID,haplo,variant_id,slope,tissue,effectSize
0,ENSG00000138593,N,15_49209986_C_CT_b37,-0.791940,Esophagus - Gastroesophageal Junction,0.791940
1,ENSG00000138593,N,15_49376617_GC_G_b37,0.268425,Lung,0.268425
2,ENSG00000138593,N,15_49624784_G_C_b37,-0.122000,Nerve - Tibial,0.122000
3,ENSG00000138593,N,15_49376617_GC_G_b37,0.352287,Thyroid,0.352287
4,ENSG00000166351,N,21_14907589_T_G_b37,0.483956,Testis,0.483956
5,ENSG00000168675,N,18_13564699_G_GC_b37,0.256999,Artery - Aorta,0.256999
6,ENSG00000188992,N,21_15385969_C_A_b37,-0.458562,Adipose - Visceral (Omentum),0.458562
7,ENSG00000188992,N,21_15442450_CGTG_C_b37,-0.536872,Artery - Tibial,0.536872
8,ENSG00000188992,N,21_15312788_C_T_b37,-1.342220,Brain - Cerebellar Hemisphere,1.342220
9,ENSG00000188992,N,21_15432737_A_G_b37,-1.597840,Brain - Cerebellum,1.597840


In [85]:
# Summary statistics (count, mean, std, min, quartiles, max) of effectSize per 'haplo' group.
(
    merge2
    .groupby(by='haplo')['effectSize']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
haplo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
N,118068.0,0.54031,0.37676,0.068411,0.29934,0.446248,0.66749,12.2275
Y,16265.0,0.460046,0.404316,0.079915,0.230805,0.343046,0.549641,6.49997


In [86]:
# Save the merged table as an xz-compressed CSV without the index column.
# The output folder is created first: to_csv does not create missing parent
# directories and would otherwise fail here on a fresh checkout.
os.makedirs('../../outputFiles/eQTLEffectSize', exist_ok=True)
merge2.to_csv('../../outputFiles/eQTLEffectSize/effectSizeMostSignificantHaplo.csv.xz', index=False, compression='xz')

### Copy number conserved genes

#### All eQTLs

In [87]:
# Per-gene table carrying a Y/N flag in its 'CCN' column
# (the file is tab-separated despite its .csv extension).
genesWitheQTLTissueCountBonferroniAndCCNStatus = pd.read_csv(
    '../../outputFiles/genesWitheQTLTissueCountBonferroniAndCCNStatus.csv',
    sep="\t",
    index_col="Unnamed: 0",
)

# Label every significant eQTL with its gene's 'CCN' flag; the inner join
# keeps only genes present in both tables.
merge1 = (
    genesWitheQTLTissueCountBonferroniAndCCNStatus[['Ensembl Gene ID', 'CCN']]
    .merge(eQTLsWithSlope, how="inner", on="Ensembl Gene ID")
)

In [88]:
# Show the merged table ('CCN' flag joined onto all eQTLs) as this cell's output.
merge1

Unnamed: 0,Ensembl Gene ID,CCN,variant_id,slope,tissue,effectSize
0,ENSG00000138593,Y,15_49209986_C_CT_b37,-0.791940,Esophagus - Gastroesophageal Junction,0.791940
1,ENSG00000138593,Y,15_49253961_C_A_b37,-0.132532,Lung,0.132532
2,ENSG00000138593,Y,15_49260601_A_G_b37,0.145181,Lung,0.145181
3,ENSG00000138593,Y,15_49261516_A_C_b37,0.143796,Lung,0.143796
4,ENSG00000138593,Y,15_49263951_G_A_b37,0.148980,Lung,0.148980
5,ENSG00000138593,Y,15_49272084_C_T_b37,0.199498,Lung,0.199498
6,ENSG00000138593,Y,15_49273571_C_G_b37,0.198058,Lung,0.198058
7,ENSG00000138593,Y,15_49273619_C_A_b37,0.198058,Lung,0.198058
8,ENSG00000138593,Y,15_49276416_G_T_b37,0.154227,Lung,0.154227
9,ENSG00000138593,Y,15_49277795_C_A_b37,0.148026,Lung,0.148026


In [89]:
# Summary statistics (count, mean, std, min, quartiles, max) of effectSize per 'CCN' group.
(
    merge1
    .groupby(by='CCN')['effectSize']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
CCN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
N,9119049.0,0.488945,0.273456,0.058667,0.287607,0.42401,0.625331,12.2275
Y,4477809.0,0.420036,0.242219,0.066574,0.251447,0.359867,0.521282,6.50876


In [90]:
# Save the merged table as an xz-compressed CSV without the index column.
# The output folder is created first: to_csv does not create missing parent
# directories and would otherwise fail here on a fresh checkout.
os.makedirs('../../outputFiles/eQTLEffectSize', exist_ok=True)
merge1.to_csv('../../outputFiles/eQTLEffectSize/effectSizeAllCCN.csv.xz', index=False, compression='xz')

#### Only most significant eQTLs 

In [91]:
# Label the de-duplicated (one row per gene and tissue) eQTLs with the gene's
# 'CCN' flag; the inner join keeps only genes present in both tables.
merge2 = (
    genesWitheQTLTissueCountBonferroniAndCCNStatus[['Ensembl Gene ID', 'CCN']]
    .merge(eQTLsWithSlopeMin, how="inner", on="Ensembl Gene ID")
)

In [92]:
# Show the merged table ('CCN' flag joined onto eQTLsWithSlopeMin) as this cell's output.
merge2

Unnamed: 0,Ensembl Gene ID,CCN,variant_id,slope,tissue,effectSize
0,ENSG00000138593,Y,15_49209986_C_CT_b37,-0.791940,Esophagus - Gastroesophageal Junction,0.791940
1,ENSG00000138593,Y,15_49376617_GC_G_b37,0.268425,Lung,0.268425
2,ENSG00000138593,Y,15_49624784_G_C_b37,-0.122000,Nerve - Tibial,0.122000
3,ENSG00000138593,Y,15_49376617_GC_G_b37,0.352287,Thyroid,0.352287
4,ENSG00000166351,N,21_14907589_T_G_b37,0.483956,Testis,0.483956
5,ENSG00000168675,Y,18_13564699_G_GC_b37,0.256999,Artery - Aorta,0.256999
6,ENSG00000188992,N,21_15385969_C_A_b37,-0.458562,Adipose - Visceral (Omentum),0.458562
7,ENSG00000188992,N,21_15442450_CGTG_C_b37,-0.536872,Artery - Tibial,0.536872
8,ENSG00000188992,N,21_15312788_C_T_b37,-1.342220,Brain - Cerebellar Hemisphere,1.342220
9,ENSG00000188992,N,21_15432737_A_G_b37,-1.597840,Brain - Cerebellum,1.597840


In [93]:
# Summary statistics (count, mean, std, min, quartiles, max) of effectSize per 'CCN' group.
(
    merge2
    .groupby(by='CCN')['effectSize']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
CCN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
N,90408.0,0.54946,0.384248,0.068411,0.297673,0.451697,0.689063,12.2275
Y,52263.0,0.50342,0.373192,0.076504,0.277884,0.410656,0.613109,6.49997


In [94]:
# Save the merged table as an xz-compressed CSV without the index column.
# The output folder is created first: to_csv does not create missing parent
# directories and would otherwise fail here on a fresh checkout.
os.makedirs('../../outputFiles/eQTLEffectSize', exist_ok=True)
merge2.to_csv('../../outputFiles/eQTLEffectSize/effectSizeMostSignificantCCN.csv.xz', index=False, compression='xz')