# Broad tissue breadth eQTLs

Identify genes affected by broad tissue breadth eQTLs

## Housekeeping

### Imports

In [1]:
import os
import re

import mysql.connector
import pandas as pd
from sqlalchemy import create_engine

In [2]:
# SQLAlchemy engine for the GTEx MySQL database.
# The connection URL is taken from the GTEX_DB_URL environment variable when it
# is set, so real credentials do not have to live in the notebook; otherwise the
# original local development URL is used unchanged.
engine = create_engine(
    os.environ.get(
        'GTEX_DB_URL',
        'mysql+mysqlconnector://jupyter:password@localhost:3306/gtex',
    ),
    echo=False,
)

### Functions

Remove version numbers from Ensembl gene IDs, e.g. 'ENSG000000001.8' becomes 'ENSG000000001'. This enables comparison between IDs from different sources.

In [3]:
def removeGeneIDVersions(text):
    """Return the Ensembl gene ID found in ``text`` without its version suffix.

    Parameters
    ----------
    text : str
        String containing an Ensembl gene ID, e.g. 'ENSG00000138131.3'.

    Returns
    -------
    str
        The first 'ENSG' + digits run in ``text``, e.g. 'ENSG00000138131'.

    Raises
    ------
    IndexError
        If ``text`` contains no 'ENSG<digits>' substring. The exception type
        matches what indexing an empty ``re.findall`` result used to raise,
        but the message now names the offending value.
    """
    # Raw string so that \d is a regex digit class, not a string escape.
    match = re.search(r'(ENSG\d+)', text)
    if match is None:
        raise IndexError(
            'no Ensembl gene ID (ENSG<digits>) found in {!r}'.format(text)
        )
    return match.group(1)

### Constants

#### List of GTEx tissues

In [4]:
# Names of the 48 GTEx tissues, one string per tissue, in alphabetical order.
# NOTE(review): `tissues` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
tissues = ['Adipose - Subcutaneous',
 'Adipose - Visceral (Omentum)',
 'Adrenal Gland',
 'Artery - Aorta',
 'Artery - Coronary',
 'Artery - Tibial',
 'Brain - Amygdala',
 'Brain - Anterior cingulate cortex (BA24)',
 'Brain - Caudate (basal ganglia)',
 'Brain - Cerebellar Hemisphere',
 'Brain - Cerebellum',
 'Brain - Cortex',
 'Brain - Frontal Cortex (BA9)',
 'Brain - Hippocampus',
 'Brain - Hypothalamus',
 'Brain - Nucleus accumbens (basal ganglia)',
 'Brain - Putamen (basal ganglia)',
 'Brain - Spinal cord (cervical c-1)',
 'Brain - Substantia nigra',
 'Breast - Mammary Tissue',
 'Cells - EBV-transformed lymphocytes',
 'Cells - Transformed fibroblasts',
 'Colon - Sigmoid',
 'Colon - Transverse',
 'Esophagus - Gastroesophageal Junction',
 'Esophagus - Mucosa',
 'Esophagus - Muscularis',
 'Heart - Atrial Appendage',
 'Heart - Left Ventricle',
 'Liver',
 'Lung',
 'Minor Salivary Gland',
 'Muscle - Skeletal',
 'Nerve - Tibial',
 'Ovary',
 'Pancreas',
 'Pituitary',
 'Prostate',
 'Skin - Not Sun Exposed (Suprapubic)',
 'Skin - Sun Exposed (Lower leg)',
 'Small Intestine - Terminal Ileum',
 'Spleen',
 'Stomach',
 'Testis',
 'Thyroid',
 'Uterus',
 'Vagina',
 'Whole Blood']

---

## Analysis

In [15]:
# Load the saved table of per-(variant_id, gene_id) tissue counts and show it.
eQTLsAndNumTissuesAffected = pd.read_csv(
    filepath_or_buffer='../../outputFiles/GTExV7/eQTLsAndNumTissuesAffected.txt'
)
eQTLsAndNumTissuesAffected

Unnamed: 0,variant_id,gene_id,countUncorrected,countBonferroniCorrected,countMetasoft
0,10_100000625_A_G_b37,ENSG00000138131,5,4.0,5
1,10_100000625_A_G_b37,ENSG00000166024,4,3.0,9
2,10_100000625_A_G_b37,ENSG00000230928,4,3.0,9
3,10_100000645_A_C_b37,ENSG00000138131,1,1.0,2
4,10_100000645_A_C_b37,ENSG00000230928,11,10.0,29
5,10_100002841_C_CT_b37,ENSG00000230928,2,0.0,14
6,10_100003242_T_G_b37,ENSG00000166024,1,0.0,9
7,10_100003242_T_G_b37,ENSG00000230928,1,0.0,4
8,10_100003785_T_C_b37,ENSG00000138131,6,3.0,5
9,10_100003785_T_C_b37,ENSG00000166024,4,2.0,12


In [16]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
eQTLsAndNumTissuesAffected.describe()

Unnamed: 0,countUncorrected,countBonferroniCorrected,countMetasoft
count,7627598.0,7627598.0,7627598.0
mean,4.822141,2.978086,16.51631
std,7.782665,6.681433,15.61892
min,1.0,0.0,0.0
25%,1.0,0.0,2.0
50%,2.0,1.0,11.0
75%,4.0,2.0,30.0
max,48.0,48.0,48.0


### Broad tissue breadth eQTLs
[Back to top](#Outline)

##### Bonferroni-corrected

In [21]:
# 0.9 quantile of countBonferroniCorrected, restricted to rows where it is non-zero.
(
    eQTLsAndNumTissuesAffected['countBonferroniCorrected']
    .loc[lambda counts: counts > 0]
    .quantile(0.9)
)

14.0

In [22]:
# Share of rows with a non-zero countBonferroniCorrected that reach the
# threshold of 14.
(
    int(eQTLsAndNumTissuesAffected['countBonferroniCorrected'].ge(14).sum())
    / int(eQTLsAndNumTissuesAffected['countBonferroniCorrected'].gt(0).sum())
)

0.10122680383884824

In [26]:
# Unique gene IDs from rows whose countBonferroniCorrected is at least 14
# (14 is the 0.9 quantile of the non-zero counts reported in this notebook).
# `.unique()` already returns a new array, so no placeholder frame or extra
# copy is needed before wrapping it in a Series.
broadBonferronieQTLAffectedGenes = pd.Series(
    eQTLsAndNumTissuesAffected.loc[
        eQTLsAndNumTissuesAffected['countBonferroniCorrected'] >= 14, 'gene_id'
    ].unique()
)

In [27]:
# Display the Series of unique gene IDs that passed the >= 14 threshold.
broadBonferronieQTLAffectedGenes

0       ENSG00000119943
1       ENSG00000107521
2       ENSG00000095485
3       ENSG00000196072
4       ENSG00000075826
5       ENSG00000272572
6       ENSG00000186862
7       ENSG00000166167
8       ENSG00000269609
9       ENSG00000138111
10      ENSG00000214435
11      ENSG00000156398
12      ENSG00000166275
13      ENSG00000148843
14      ENSG00000197748
15      ENSG00000065621
16      ENSG00000148700
17      ENSG00000165806
18      ENSG00000134463
19      ENSG00000165868
20      ENSG00000165646
21      ENSG00000165650
22      ENSG00000151893
23      ENSG00000183605
24      ENSG00000065665
25      ENSG00000120008
26      ENSG00000226864
27      ENSG00000138152
28      ENSG00000121898
29      ENSG00000203791
             ...       
3826    ENSG00000205663
3827    ENSG00000205664
3828    ENSG00000234449
3829    ENSG00000215301
3830    ENSG00000183690
3831    ENSG00000065923
3832    ENSG00000130988
3833    ENSG00000147123
3834    ENSG00000221994
3835    ENSG00000204620
3836    ENSG0000

In [28]:
# Write the gene-ID Series to disk without the integer index.
broadBonferronieQTLAffectedGenes.to_csv('../../outputFiles/broadBonferronieQTLAffectedGenes.csv', index=False)

In [29]:
# Number of unique genes in the broad Bonferroni set.
broadBonferronieQTLAffectedGenes.shape[0]

3856

### Number of tissues per SNP/gene combo
[Back to top](#Outline)

##### Bonferroni-corrected

In [5]:
# For every (variant_id, gene_id) pair, count the rows of `v7` that have
# sigAfterBonferroni = 1, then show the resulting frame.
tissueCountPerComboBonferroni = pd.read_sql_query(
    sql='SELECT variant_id, gene_id, COUNT(*) as count FROM `v7` WHERE sigAfterBonferroni = 1 GROUP BY variant_id, gene_id',
    con=engine,
    coerce_float=True,
)
tissueCountPerComboBonferroni

Unnamed: 0,variant_id,gene_id,count
0,10_100000625_A_G_b37,ENSG00000138131.3,4
1,10_100000625_A_G_b37,ENSG00000166024.9,3
2,10_100000625_A_G_b37,ENSG00000230928.1,3
3,10_100000645_A_C_b37,ENSG00000138131.3,1
4,10_100000645_A_C_b37,ENSG00000230928.1,10
5,10_100003785_T_C_b37,ENSG00000138131.3,3
6,10_100003785_T_C_b37,ENSG00000166024.9,2
7,10_100003785_T_C_b37,ENSG00000230928.1,2
8,10_100004360_G_A_b37,ENSG00000138131.3,1
9,10_100004360_G_A_b37,ENSG00000230928.1,10


In [None]:
# Save the per-(variant, gene) Bonferroni tissue counts to their own file.
# The previous target, broadBonferronieQTLAffectedGenes.csv, is the path the
# broad-eQTL gene list is written to elsewhere in this notebook, so writing the
# count table there would overwrite that gene list.
tissueCountPerComboBonferroni.to_csv('../../outputFiles/tissueCountPerComboBonferroni.csv', index=False)

In [7]:
# Drop the version suffix from every Ensembl ID, then rename the column
# from 'gene_id' to 'gene'.
tissueCountPerComboBonferroni['gene_id'] = tissueCountPerComboBonferroni['gene_id'].map(removeGeneIDVersions)
tissueCountPerComboBonferroni = tissueCountPerComboBonferroni.rename(
    columns={'gene_id': 'gene'}
)

##### Metasoft

```sql
-- For every row of v7Metasoft, set mvalTissues to the number of per-tissue
-- mval_* columns (48 in total) whose value is greater than 0.9.
UPDATE v7Metasoft
SET mvalTissues =
(IF(mval_Adipose_Subcutaneous > 0.9, 1, 0)
+ IF(mval_Adipose_Visceral_Omentum > 0.9, 1, 0)
+ IF(mval_Adrenal_Gland > 0.9, 1, 0)
+ IF(mval_Artery_Aorta > 0.9, 1, 0)
+ IF(mval_Artery_Coronary > 0.9, 1, 0)
+ IF(mval_Artery_Tibial > 0.9, 1, 0)
+ IF(mval_Brain_Amygdala > 0.9, 1, 0)
+ IF(mval_Brain_Anterior_cingulate_cortex_BA24 > 0.9, 1, 0)
+ IF(mval_Brain_Caudate_basal_ganglia > 0.9, 1, 0)
+ IF(mval_Brain_Cerebellar_Hemisphere > 0.9, 1, 0)
+ IF(mval_Brain_Cerebellum > 0.9, 1, 0)
+ IF(mval_Brain_Cortex > 0.9, 1, 0)
+ IF(mval_Brain_Frontal_Cortex_BA9 > 0.9, 1, 0)
+ IF(mval_Brain_Hippocampus > 0.9, 1, 0)
+ IF(mval_Brain_Hypothalamus > 0.9, 1, 0)
+ IF(mval_Brain_Nucleus_accumbens_basal_ganglia > 0.9, 1, 0)
+ IF(mval_Brain_Putamen_basal_ganglia > 0.9, 1, 0)
+ IF(`mval_Brain_Spinal_cord_cervical_c-1` > 0.9, 1, 0)
+ IF(mval_Brain_Substantia_nigra > 0.9, 1, 0)
+ IF(mval_Breast_Mammary_Tissue > 0.9, 1, 0)
+ IF(`mval_Cells_EBV-transformed_lymphocytes` > 0.9, 1, 0)
+ IF(mval_Cells_Transformed_fibroblasts > 0.9, 1, 0)
+ IF(mval_Colon_Sigmoid > 0.9, 1, 0)
+ IF(mval_Colon_Transverse > 0.9, 1, 0)
+ IF(mval_Esophagus_Gastroesophageal_Junction > 0.9, 1, 0)
+ IF(mval_Esophagus_Mucosa > 0.9, 1, 0)
+ IF(mval_Esophagus_Muscularis > 0.9, 1, 0)
+ IF(mval_Heart_Atrial_Appendage > 0.9, 1, 0)
+ IF(mval_Heart_Left_Ventricle > 0.9, 1, 0)
+ IF(mval_Liver > 0.9, 1, 0)
+ IF(mval_Lung > 0.9, 1, 0)
+ IF(mval_Minor_Salivary_Gland > 0.9, 1, 0)
+ IF(mval_Muscle_Skeletal > 0.9, 1, 0)
+ IF(mval_Nerve_Tibial > 0.9, 1, 0)
+ IF(mval_Ovary > 0.9, 1, 0)
+ IF(mval_Pancreas > 0.9, 1, 0)
+ IF(mval_Pituitary > 0.9, 1, 0)
+ IF(mval_Prostate > 0.9, 1, 0)
+ IF(mval_Skin_Not_Sun_Exposed_Suprapubic > 0.9, 1, 0)
+ IF(mval_Skin_Sun_Exposed_Lower_leg > 0.9, 1, 0)
+ IF(mval_Small_Intestine_Terminal_Ileum > 0.9, 1, 0)
+ IF(mval_Spleen > 0.9, 1, 0)
+ IF(mval_Stomach > 0.9, 1, 0)
+ IF(mval_Testis > 0.9, 1, 0)
+ IF(mval_Thyroid > 0.9, 1, 0)
+ IF(mval_Uterus > 0.9, 1, 0)
+ IF(mval_Vagina > 0.9, 1, 0)
+ IF(mval_Whole_Blood > 0.9, 1, 0))
```

In [10]:
# Pull the SNP, gene and precomputed mvalTissues count for every row of
# `v7Metasoft`, then show the resulting frame.
tissueCountPerComboMetasoft = pd.read_sql_query(
    sql='SELECT snp, gene, mvalTissues FROM `v7Metasoft`',
    con=engine,
    coerce_float=True,
)
tissueCountPerComboMetasoft

Unnamed: 0,snp,gene,mvalTissues
0,1_13417_C_CGAGA_b37,ENSG00000227232.4,45
1,1_17559_G_C_b37,ENSG00000224956.5,3
2,1_54490_G_A_b37,ENSG00000237683.5,3
3,1_61920_G_A_b37,ENSG00000238009.2,12
4,1_64649_A_C_b37,ENSG00000238009.2,15
5,1_115746_C_T_b37,ENSG00000237683.5,9
6,1_115746_C_T_b37,ENSG00000238009.2,4
7,1_115746_C_T_b37,ENSG00000239906.1,8
8,1_115746_C_T_b37,ENSG00000241860.2,14
9,1_135203_G_A_b37,ENSG00000237683.5,10


In [11]:
# Rename 'snp' to 'variant_id' and 'mvalTissues' to 'count', the column names
# used by the Bonferroni table.
tissueCountPerComboMetasoft = tissueCountPerComboMetasoft.rename(
    columns={'snp': 'variant_id', 'mvalTissues': 'count'}
)

### Number of eQTLs that affect gene in only one tissue
[Back to top](#Outline)

In [12]:
# Number of Bonferroni variant/gene pairs whose tissue count is exactly 1.
int(tissueCountPerComboBonferroni['count'].eq(1).sum())

2073932

In [13]:
# Number of Metasoft variant/gene pairs whose tissue count is exactly 1.
int(tissueCountPerComboMetasoft['count'].eq(1).sum())

971435

### Number of eQTLs that affect gene in all 48 tissues
[Back to top](#Outline)

In [31]:
# Number of Bonferroni variant/gene pairs whose tissue count is 48 (every tissue).
int(tissueCountPerComboBonferroni['count'].eq(48).sum())

17338

In [32]:
# Number of Metasoft variant/gene pairs whose tissue count is 48 (every tissue).
int(tissueCountPerComboMetasoft['count'].eq(48).sum())

111380

### Broad tissue breadth eQTLs
[Back to top](#Outline)

##### Bonferroni-corrected

In [33]:
# Share of Bonferroni variant/gene pairs with a tissue count of at least 14.
(
    int(tissueCountPerComboBonferroni['count'].ge(14).sum())
    / len(tissueCountPerComboBonferroni)
)

0.10122680383884824

In [34]:
# Keep the variant/gene pairs with a tissue count of at least 14, strip version
# suffixes from their gene IDs, and save the distinct genes as a one-column CSV.
broadBonferronieQTLs = tissueCountPerComboBonferroni.loc[
    tissueCountPerComboBonferroni['count'] >= 14
].copy()
broadBonferronieQTLs['gene'] = broadBonferronieQTLs['gene'].map(removeGeneIDVersions)
broadBonferronieQTLAffectedGenes = pd.DataFrame(
    {'gene': broadBonferronieQTLs['gene'].unique()}
)
broadBonferronieQTLAffectedGenes.to_csv(
    'broadBonferronieQTLAffectedGenes.csv',
    index=False,
)

In [35]:
# Number of distinct genes in the broad Bonferroni set.
broadBonferronieQTLAffectedGenes.shape[0]

3856

##### Metasoft

In [45]:
# Share of Metasoft variant/gene pairs with a tissue count of at least 43.
(
    int(tissueCountPerComboMetasoft['count'].ge(43).sum())
    / len(tissueCountPerComboMetasoft)
)

0.09321715696081519

In [48]:
# Keep the variant/gene pairs with a tissue count of at least 43, strip version
# suffixes from their gene IDs, and save the distinct genes as a one-column CSV.
broadMetasofteQTLs = tissueCountPerComboMetasoft.loc[
    tissueCountPerComboMetasoft['count'] >= 43
].copy()
broadMetasofteQTLs['gene'] = broadMetasofteQTLs['gene'].map(removeGeneIDVersions)
broadMetasofteQTLAffectedGenes = pd.DataFrame(
    {'gene': broadMetasofteQTLs['gene'].unique()}
)
broadMetasofteQTLAffectedGenes.to_csv(
    'broadMetasofteQTLAffectedGenes.csv',
    index=False,
)

In [49]:
# Number of distinct genes in the broad Metasoft set.
broadMetasofteQTLAffectedGenes.shape[0]

5883