# Count SNPs tested for eQTL associations


### Imports

In [1]:
import os
import re
import subprocess
import urllib.request

import mysql.connector
import pandas as pd
from sqlalchemy import create_engine

In [2]:
# SQLAlchemy engine for the GTEx MySQL database used by the queries below.
# Set the GTEX_DB_URL environment variable to supply real credentials without
# committing them to the notebook; when it is unset, the original
# local-development URL is used so existing setups keep working.
engine = create_engine(
    os.environ.get('GTEX_DB_URL', 'mysql+mysqlconnector://jupyter:password@localhost:3306/gtex'),
    echo=False
)

### Functions

Remove version numbers from Ensembl Gene IDs, e.g. 'ENSG000000001.8' becomes 'ENSG000000001'. This enables comparison between IDs from different sources.

In [3]:
def removeGeneIDVersions(text):
    """Return the first Ensembl gene ID found in `text`, without its version suffix.

    For example 'ENSG00000138593.4' gives 'ENSG00000138593'.

    Parameters
    ----------
    text : str
        String containing an Ensembl gene ID ('ENSG' followed by digits).

    Returns
    -------
    str
        The 'ENSG<digits>' portion of the first ID in `text`.

    Raises
    ------
    IndexError
        If `text` contains no 'ENSG<digits>' pattern (same exception type as
        before, now with a message naming the offending value).
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    match = re.search(r'(ENSG\d+)', text)
    if match is None:
        raise IndexError('no Ensembl gene ID (ENSG<digits>) found in %r' % (text,))
    return match.group(1)

### Constants

#### List of GTEx tissues

In [4]:
# GTEx tissue names, in the order they are processed by every loop below.
# These exact strings are used in SQL filters and output file names.
tissues = [
    'Adipose - Subcutaneous',
    'Adipose - Visceral (Omentum)',
    'Adrenal Gland',
    'Artery - Aorta',
    'Artery - Coronary',
    'Artery - Tibial',
    'Brain - Amygdala',
    'Brain - Anterior cingulate cortex (BA24)',
    'Brain - Caudate (basal ganglia)',
    'Brain - Cerebellar Hemisphere',
    'Brain - Cerebellum',
    'Brain - Cortex',
    'Brain - Frontal Cortex (BA9)',
    'Brain - Hippocampus',
    'Brain - Hypothalamus',
    'Brain - Nucleus accumbens (basal ganglia)',
    'Brain - Putamen (basal ganglia)',
    'Brain - Spinal cord (cervical c-1)',
    'Brain - Substantia nigra',
    'Breast - Mammary Tissue',
    'Cells - EBV-transformed lymphocytes',
    'Cells - Transformed fibroblasts',
    'Colon - Sigmoid',
    'Colon - Transverse',
    'Esophagus - Gastroesophageal Junction',
    'Esophagus - Mucosa',
    'Esophagus - Muscularis',
    'Heart - Atrial Appendage',
    'Heart - Left Ventricle',
    'Liver',
    'Lung',
    'Minor Salivary Gland',
    'Muscle - Skeletal',
    'Nerve - Tibial',
    'Ovary',
    'Pancreas',
    'Pituitary',
    'Prostate',
    'Skin - Not Sun Exposed (Suprapubic)',
    'Skin - Sun Exposed (Lower leg)',
    'Small Intestine - Terminal Ileum',
    'Spleen',
    'Stomach',
    'Testis',
    'Thyroid',
    'Uterus',
    'Vagina',
    'Whole Blood',
]

---

## Analysis

### Download and count all tested SNP-gene associations for each GTEx tissue

In [55]:
# Location of the GTEx v7 per-tissue "all SNP-gene associations" files.
allpairs_base_url = 'https://storage.googleapis.com/gtex_analysis_v7/single_tissue_eqtl_data/all_snp_gene_associations/'

for tissue in tissues:
    print(tissue)
    # File-name form of the tissue: ' - ' and spaces become '_', parentheses are dropped.
    tissue_url = tissue.replace(' - ','_').replace('(','').replace(')','').replace(' ','_')
    allpairs_gz = tissue_url + '.allpairs.txt.gz'
    allpairs_txt = tissue_url + '.allpairs.txt'
    counts_path = '../../outputFiles/GTExV7/numTestedSNPs/' + tissue + '.txt'

    print('Downloading ' + allpairs_gz + '...')
    urllib.request.urlretrieve(allpairs_base_url + allpairs_gz, allpairs_gz)

    print('Unzipping ' + allpairs_gz + '...')
    # check=True aborts on a failed decompression instead of silently continuing;
    # -f overwrites a stale extracted file left behind by an interrupted run.
    subprocess.run(['gunzip', '-f', allpairs_gz], check=True)

    try:
        print('Counting...')
        # Equivalent of: xsv select -d "\t" gene_id FILE | xsv frequency --asc --limit 0 > counts_path
        # built from argument lists so no shell quoting is involved.
        with open(counts_path, 'w') as counts_file:
            select_proc = subprocess.Popen(
                ['xsv', 'select', '-d', '\t', 'gene_id', allpairs_txt],
                stdout=subprocess.PIPE
            )
            frequency_proc = subprocess.Popen(
                ['xsv', 'frequency', '--asc', '--limit', '0'],
                stdin=select_proc.stdout,
                stdout=counts_file
            )
            # Drop our copy of the pipe's read end so `xsv select` is not kept
            # alive by this process if `xsv frequency` exits early.
            select_proc.stdout.close()
            frequency_status = frequency_proc.wait()
            select_status = select_proc.wait()
        if select_status != 0 or frequency_status != 0:
            raise RuntimeError(
                'xsv failed for ' + tissue
                + ' (select exit status ' + str(select_status)
                + ', frequency exit status ' + str(frequency_status) + ')'
            )
    finally:
        # The extracted file is removed even when counting fails.
        print('Deleting ' + allpairs_txt + '...')
        os.remove(allpairs_txt)

Lung
Downloading Lung.allpairs.txt.gz...
Unzipping Lung.allpairs.txt.gz...
Counting...
Deleting Lung.allpairs.txt...
Minor Salivary Gland
Downloading Minor_Salivary_Gland.allpairs.txt.gz...
Unzipping Minor_Salivary_Gland.allpairs.txt.gz...
Counting...
Deleting Minor_Salivary_Gland.allpairs.txt...
Muscle - Skeletal
Downloading Muscle_Skeletal.allpairs.txt.gz...
Unzipping Muscle_Skeletal.allpairs.txt.gz...
Counting...
Deleting Muscle_Skeletal.allpairs.txt...
Nerve - Tibial
Downloading Nerve_Tibial.allpairs.txt.gz...
Unzipping Nerve_Tibial.allpairs.txt.gz...
Counting...
Deleting Nerve_Tibial.allpairs.txt...
Ovary
Downloading Ovary.allpairs.txt.gz...
Unzipping Ovary.allpairs.txt.gz...
Counting...
Deleting Ovary.allpairs.txt...
Pancreas
Downloading Pancreas.allpairs.txt.gz...
Unzipping Pancreas.allpairs.txt.gz...
Counting...
Deleting Pancreas.allpairs.txt...
Pituitary
Downloading Pituitary.allpairs.txt.gz...
Unzipping Pituitary.allpairs.txt.gz...
Counting...
Deleting Pituitary.allpairs.txt.

### Get number of significant eQTLs for each tissue and calculate proportion of significant SNPs

#### Ohnologs

##### Bonferroni-corrected eQTLs

In [5]:
# Gene-level ohnolog annotation does not depend on the tissue, so it is read once.
genesWitheQTLTissueCountBonferroniAndOhnologStatus = pd.read_csv('../../outputFiles/genesWitheQTLTissueCountBonferroniAndOhnologStatus.csv', sep="\t", index_col="Unnamed: 0")

# One table per tissue, concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and re-copied the data on every call).
ohnologs_bonferroni_frames = []
for tissue in tissues:
    print(tissue)
    # Per-gene count of rows flagged sigAfterBonferroni in this tissue.
    # `tissue` comes from the fixed `tissues` list above, not from user input.
    eQTLCountPerGeneBonferroni = pd.read_sql_query(
        'SELECT gene_id as "Ensembl Gene ID", COUNT(*) as eQTLs FROM `v7` WHERE sigAfterBonferroni = 1 AND tissue = "' + tissue + '" GROUP BY gene_id',
        engine,
        coerce_float=True
    )
    # Per-gene frequency table for this tissue (value = gene ID, count = tested SNPs).
    numTestedSNPs = pd.read_csv('../../outputFiles/GTExV7/numTestedSNPs/' + tissue + '.txt').rename(
        columns={'value': 'Ensembl Gene ID', 'count': 'SNPs'}
    )

    # Strip version suffixes so IDs from both sources can be joined.
    numTestedSNPs["Ensembl Gene ID"] = numTestedSNPs["Ensembl Gene ID"].apply(removeGeneIDVersions)
    eQTLCountPerGeneBonferroni["Ensembl Gene ID"] = eQTLCountPerGeneBonferroni["Ensembl Gene ID"].apply(removeGeneIDVersions)

    # Keep annotated genes that were tested in this tissue, then attach eQTL counts.
    merge1 = pd.merge(genesWitheQTLTissueCountBonferroniAndOhnologStatus[['Ensembl Gene ID','type']], numTestedSNPs, how = "inner", on = "Ensembl Gene ID")
    merge2 = pd.merge(merge1, eQTLCountPerGeneBonferroni, how = "left", on = "Ensembl Gene ID")
    # Genes absent from the eQTL query have no significant eQTLs.
    merge2['eQTLs'] = merge2['eQTLs'].fillna(0)

    merge2['propSignif'] = merge2['eQTLs']/merge2['SNPs']
    merge2['tissue'] = tissue
    ohnologs_bonferroni_frames.append(merge2)

allTissuesCombined_ohnologs_bonferroni = pd.concat(ohnologs_bonferroni_frames) if ohnologs_bonferroni_frames else pd.DataFrame()

Adipose - Subcutaneous
Adipose - Visceral (Omentum)
Adrenal Gland
Artery - Aorta
Artery - Coronary
Artery - Tibial
Brain - Amygdala
Brain - Anterior cingulate cortex (BA24)
Brain - Caudate (basal ganglia)
Brain - Cerebellar Hemisphere
Brain - Cerebellum
Brain - Cortex
Brain - Frontal Cortex (BA9)
Brain - Hippocampus
Brain - Hypothalamus
Brain - Nucleus accumbens (basal ganglia)
Brain - Putamen (basal ganglia)
Brain - Spinal cord (cervical c-1)
Brain - Substantia nigra
Breast - Mammary Tissue
Cells - EBV-transformed lymphocytes
Cells - Transformed fibroblasts
Colon - Sigmoid
Colon - Transverse
Esophagus - Gastroesophageal Junction
Esophagus - Mucosa
Esophagus - Muscularis
Heart - Atrial Appendage
Heart - Left Ventricle
Liver
Lung
Minor Salivary Gland
Muscle - Skeletal
Nerve - Tibial
Ovary
Pancreas
Pituitary
Prostate
Skin - Not Sun Exposed (Suprapubic)
Skin - Sun Exposed (Lower leg)
Small Intestine - Terminal Ileum
Spleen
Stomach
Testis
Thyroid
Uterus
Vagina
Whole Blood


In [9]:
# Preview the first rows only; the combined table has one row per gene per tissue.
allTissuesCombined_ohnologs_bonferroni.head(10)

Unnamed: 0,Ensembl Gene ID,type,field,SNPs,eQTLs,propSignif,tissue
0,ENSG00000138593,ohno,gene_id,5645,0.0,0.000000,Adipose - Subcutaneous
1,ENSG00000168675,SSD,gene_id,7911,0.0,0.000000,Adipose - Subcutaneous
2,ENSG00000188992,SSD,gene_id,8607,0.0,0.000000,Adipose - Subcutaneous
3,ENSG00000185272,ohno,gene_id,8622,0.0,0.000000,Adipose - Subcutaneous
4,ENSG00000155304,singleton,gene_id,8566,0.0,0.000000,Adipose - Subcutaneous
5,ENSG00000153575,SSD,gene_id,4601,89.0,0.019344,Adipose - Subcutaneous
6,ENSG00000180530,singleton,gene_id,8460,0.0,0.000000,Adipose - Subcutaneous
7,ENSG00000137824,ohno,gene_id,5804,25.0,0.004307,Adipose - Subcutaneous
8,ENSG00000137880,singleton,gene_id,5790,0.0,0.000000,Adipose - Subcutaneous
9,ENSG00000104129,SSD,gene_id,5741,1.0,0.000174,Adipose - Subcutaneous


In [10]:
# Summary statistics for the numeric columns of the combined table.
allTissuesCombined_ohnologs_bonferroni.describe()

Unnamed: 0,SNPs,eQTLs,propSignif
count,790141.0,790141.0,790141.0
mean,7199.385029,17.646458,0.002585
std,2238.421046,90.743363,0.012777
min,119.0,0.0,0.0
25%,6104.0,0.0,0.0
50%,7160.0,0.0,0.0
75%,8125.0,0.0,0.0
max,28260.0,4540.0,0.524256


In [11]:
# Save the combined table as CSV; index=False leaves the row index out of the file.
allTissuesCombined_ohnologs_bonferroni.to_csv('../../outputFiles/propSNPsSignificanteQTLsOhnologsBonferroni.csv', index=False)

##### Metasoft eQTLs

In [14]:
# Gene-level ohnolog annotation does not depend on the tissue, so it is read once.
genesWitheQTLTissueCountMetasoftAndOhnologStatus = pd.read_csv('../../outputFiles/genesWitheQTLTissueCountMetasoftAndOhnologStatus.csv', sep="\t", index_col="Unnamed: 0")

# One table per tissue, concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and re-copied the data on every call).
ohnologs_metasoft_frames = []
for tissue in tissues:
    print(tissue)
    # Column-name form of the tissue: ' - ' and spaces become '_', parentheses are dropped.
    tissue_url = tissue.replace(' - ','_').replace('(','').replace(')','').replace(' ','_')
    # Per-gene count of rows whose mval column for this tissue exceeds 0.9.
    eQTLCountPerGeneMetasoft = pd.read_sql_query(
        'SELECT gene as "Ensembl Gene ID", COUNT(*) as eQTLs FROM `v7Metasoft` WHERE `mval_' + tissue_url + '` > 0.9 GROUP BY gene',
        engine,
        coerce_float=True
    )
    # Per-gene frequency table for this tissue (value = gene ID, count = tested SNPs).
    numTestedSNPs = pd.read_csv('../../outputFiles/GTExV7/numTestedSNPs/' + tissue + '.txt').rename(
        columns={'value': 'Ensembl Gene ID', 'count': 'SNPs'}
    )

    # Strip version suffixes so IDs from both sources can be joined.
    numTestedSNPs["Ensembl Gene ID"] = numTestedSNPs["Ensembl Gene ID"].apply(removeGeneIDVersions)
    eQTLCountPerGeneMetasoft["Ensembl Gene ID"] = eQTLCountPerGeneMetasoft["Ensembl Gene ID"].apply(removeGeneIDVersions)

    # Keep annotated genes that were tested in this tissue, then attach eQTL counts.
    merge1 = pd.merge(genesWitheQTLTissueCountMetasoftAndOhnologStatus[['Ensembl Gene ID','type']], numTestedSNPs, how = "inner", on = "Ensembl Gene ID")
    merge2 = pd.merge(merge1, eQTLCountPerGeneMetasoft, how = "left", on = "Ensembl Gene ID")
    # Genes absent from the eQTL query have no significant eQTLs.
    merge2['eQTLs'] = merge2['eQTLs'].fillna(0)

    merge2['propSignif'] = merge2['eQTLs']/merge2['SNPs']
    merge2['tissue'] = tissue
    ohnologs_metasoft_frames.append(merge2)

allTissuesCombined_ohnologs_metasoft = pd.concat(ohnologs_metasoft_frames) if ohnologs_metasoft_frames else pd.DataFrame()

Adipose - Subcutaneous
Adipose - Visceral (Omentum)
Adrenal Gland
Artery - Aorta
Artery - Coronary
Artery - Tibial
Brain - Amygdala
Brain - Anterior cingulate cortex (BA24)
Brain - Caudate (basal ganglia)
Brain - Cerebellar Hemisphere
Brain - Cerebellum
Brain - Cortex
Brain - Frontal Cortex (BA9)
Brain - Hippocampus
Brain - Hypothalamus
Brain - Nucleus accumbens (basal ganglia)
Brain - Putamen (basal ganglia)
Brain - Spinal cord (cervical c-1)
Brain - Substantia nigra
Breast - Mammary Tissue
Cells - EBV-transformed lymphocytes
Cells - Transformed fibroblasts
Colon - Sigmoid
Colon - Transverse
Esophagus - Gastroesophageal Junction
Esophagus - Mucosa
Esophagus - Muscularis
Heart - Atrial Appendage
Heart - Left Ventricle
Liver
Lung
Minor Salivary Gland
Muscle - Skeletal
Nerve - Tibial
Ovary
Pancreas
Pituitary
Prostate
Skin - Not Sun Exposed (Suprapubic)
Skin - Sun Exposed (Lower leg)
Small Intestine - Terminal Ileum
Spleen
Stomach
Testis
Thyroid
Uterus
Vagina
Whole Blood


In [15]:
# Preview the first rows only; the combined table has one row per gene per tissue.
allTissuesCombined_ohnologs_metasoft.head(10)

Unnamed: 0,Ensembl Gene ID,type,field,SNPs,eQTLs,propSignif,tissue
0,ENSG00000138593,ohno,gene_id,5645,16.0,0.002834,Adipose - Subcutaneous
1,ENSG00000168675,SSD,gene_id,7911,0.0,0.000000,Adipose - Subcutaneous
2,ENSG00000188992,SSD,gene_id,8607,248.0,0.028814,Adipose - Subcutaneous
3,ENSG00000185272,ohno,gene_id,8622,111.0,0.012874,Adipose - Subcutaneous
4,ENSG00000155304,singleton,gene_id,8566,26.0,0.003035,Adipose - Subcutaneous
5,ENSG00000166200,SSD,gene_id,5891,30.0,0.005093,Adipose - Subcutaneous
6,ENSG00000155307,SSD,gene_id,8545,0.0,0.000000,Adipose - Subcutaneous
7,ENSG00000153575,SSD,gene_id,4601,130.0,0.028255,Adipose - Subcutaneous
8,ENSG00000180530,singleton,gene_id,8460,0.0,0.000000,Adipose - Subcutaneous
9,ENSG00000137824,ohno,gene_id,5804,141.0,0.024294,Adipose - Subcutaneous


In [16]:
# Summary statistics for the numeric columns of the combined table.
allTissuesCombined_ohnologs_metasoft.describe()

Unnamed: 0,SNPs,eQTLs,propSignif
count,790141.0,790141.0,790141.0
mean,7199.385029,102.445062,0.014918
std,2238.421046,266.765389,0.033716
min,119.0,0.0,0.0
25%,6104.0,0.0,0.0
50%,7160.0,13.0,0.00181
75%,8125.0,107.0,0.01518
max,28260.0,8955.0,0.653515


In [17]:
# Save the combined table as CSV; index=False leaves the row index out of the file.
allTissuesCombined_ohnologs_metasoft.to_csv('../../outputFiles/propSNPsSignificanteQTLsOhnologsMetasoft.csv', index=False)

#### CNVRs (Zarrei et al.)

##### Bonferroni-corrected eQTLs

In [19]:
# Gene-level CNV annotation does not depend on the tissue, so it is read once.
genesWitheQTLTissueCountBonferroniAndCNVZarreiStatus = pd.read_csv('../../outputFiles/genesWitheQTLTissueCountBonferroniAndCNVZarreiStatus.csv', sep="\t", index_col="Unnamed: 0")

# One table per tissue, concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and re-copied the data on every call).
cnv_zarrei_bonferroni_frames = []
for tissue in tissues:
    print(tissue)
    # Per-gene count of rows flagged sigAfterBonferroni in this tissue.
    # `tissue` comes from the fixed `tissues` list above, not from user input.
    eQTLCountPerGeneBonferroni = pd.read_sql_query(
        'SELECT gene_id as "Ensembl Gene ID", COUNT(*) as eQTLs FROM `v7` WHERE sigAfterBonferroni = 1 AND tissue = "' + tissue + '" GROUP BY gene_id',
        engine,
        coerce_float=True
    )
    # Per-gene frequency table for this tissue (value = gene ID, count = tested SNPs).
    numTestedSNPs = pd.read_csv('../../outputFiles/GTExV7/numTestedSNPs/' + tissue + '.txt').rename(
        columns={'value': 'Ensembl Gene ID', 'count': 'SNPs'}
    )

    # Strip version suffixes so IDs from both sources can be joined.
    numTestedSNPs["Ensembl Gene ID"] = numTestedSNPs["Ensembl Gene ID"].apply(removeGeneIDVersions)
    eQTLCountPerGeneBonferroni["Ensembl Gene ID"] = eQTLCountPerGeneBonferroni["Ensembl Gene ID"].apply(removeGeneIDVersions)

    # Keep annotated genes that were tested in this tissue, then attach eQTL counts.
    merge1 = pd.merge(genesWitheQTLTissueCountBonferroniAndCNVZarreiStatus[['Ensembl Gene ID','CNV']], numTestedSNPs, how = "inner", on = "Ensembl Gene ID")
    merge2 = pd.merge(merge1, eQTLCountPerGeneBonferroni, how = "left", on = "Ensembl Gene ID")
    # Genes absent from the eQTL query have no significant eQTLs.
    merge2['eQTLs'] = merge2['eQTLs'].fillna(0)

    merge2['propSignif'] = merge2['eQTLs']/merge2['SNPs']
    merge2['tissue'] = tissue
    cnv_zarrei_bonferroni_frames.append(merge2)

allTissuesCombined_CNVZarrei_bonferroni = pd.concat(cnv_zarrei_bonferroni_frames) if cnv_zarrei_bonferroni_frames else pd.DataFrame()

Adipose - Subcutaneous
Adipose - Visceral (Omentum)
Adrenal Gland
Artery - Aorta
Artery - Coronary
Artery - Tibial
Brain - Amygdala
Brain - Anterior cingulate cortex (BA24)
Brain - Caudate (basal ganglia)
Brain - Cerebellar Hemisphere
Brain - Cerebellum
Brain - Cortex
Brain - Frontal Cortex (BA9)
Brain - Hippocampus
Brain - Hypothalamus
Brain - Nucleus accumbens (basal ganglia)
Brain - Putamen (basal ganglia)
Brain - Spinal cord (cervical c-1)
Brain - Substantia nigra
Breast - Mammary Tissue
Cells - EBV-transformed lymphocytes
Cells - Transformed fibroblasts
Colon - Sigmoid
Colon - Transverse
Esophagus - Gastroesophageal Junction
Esophagus - Mucosa
Esophagus - Muscularis
Heart - Atrial Appendage
Heart - Left Ventricle
Liver
Lung
Minor Salivary Gland
Muscle - Skeletal
Nerve - Tibial
Ovary
Pancreas
Pituitary
Prostate
Skin - Not Sun Exposed (Suprapubic)
Skin - Sun Exposed (Lower leg)
Small Intestine - Terminal Ileum
Spleen
Stomach
Testis
Thyroid
Uterus
Vagina
Whole Blood


In [20]:
# Preview the first rows only; the combined table has one row per gene per tissue.
allTissuesCombined_CNVZarrei_bonferroni.head(10)

Unnamed: 0,Ensembl Gene ID,CNV,field,SNPs,eQTLs,propSignif,tissue
0,ENSG00000138593,N,gene_id,5645,0.0,0.000000,Adipose - Subcutaneous
1,ENSG00000168675,Y,gene_id,7911,0.0,0.000000,Adipose - Subcutaneous
2,ENSG00000188992,N,gene_id,8607,0.0,0.000000,Adipose - Subcutaneous
3,ENSG00000185272,N,gene_id,8622,0.0,0.000000,Adipose - Subcutaneous
4,ENSG00000155304,N,gene_id,8566,0.0,0.000000,Adipose - Subcutaneous
5,ENSG00000153575,Y,gene_id,4601,89.0,0.019344,Adipose - Subcutaneous
6,ENSG00000180530,Y,gene_id,8460,0.0,0.000000,Adipose - Subcutaneous
7,ENSG00000137824,N,gene_id,5804,25.0,0.004307,Adipose - Subcutaneous
8,ENSG00000137880,N,gene_id,5790,0.0,0.000000,Adipose - Subcutaneous
9,ENSG00000104129,N,gene_id,5741,1.0,0.000174,Adipose - Subcutaneous


In [21]:
# Summary statistics for the numeric columns of the combined table.
allTissuesCombined_CNVZarrei_bonferroni.describe()

Unnamed: 0,SNPs,eQTLs,propSignif
count,790141.0,790141.0,790141.0
mean,7199.385029,17.646458,0.002585
std,2238.421046,90.743363,0.012777
min,119.0,0.0,0.0
25%,6104.0,0.0,0.0
50%,7160.0,0.0,0.0
75%,8125.0,0.0,0.0
max,28260.0,4540.0,0.524256


In [22]:
# Save the combined table as CSV; index=False leaves the row index out of the file.
allTissuesCombined_CNVZarrei_bonferroni.to_csv('../../outputFiles/propSNPsSignificanteQTLsCNVZarreiBonferroni.csv', index=False)

##### Metasoft eQTLs

In [23]:
# Gene-level CNV annotation does not depend on the tissue, so it is read once.
genesWitheQTLTissueCountMetasoftAndCNVZarreiStatus = pd.read_csv('../../outputFiles/genesWitheQTLTissueCountMetasoftAndCNVZarreiStatus.csv', sep="\t", index_col="Unnamed: 0")

# One table per tissue, concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and re-copied the data on every call).
cnv_zarrei_metasoft_frames = []
for tissue in tissues:
    print(tissue)
    # Column-name form of the tissue: ' - ' and spaces become '_', parentheses are dropped.
    tissue_url = tissue.replace(' - ','_').replace('(','').replace(')','').replace(' ','_')
    # Per-gene count of rows whose mval column for this tissue exceeds 0.9.
    eQTLCountPerGeneMetasoft = pd.read_sql_query(
        'SELECT gene as "Ensembl Gene ID", COUNT(*) as eQTLs FROM `v7Metasoft` WHERE `mval_' + tissue_url + '` > 0.9 GROUP BY gene',
        engine,
        coerce_float=True
    )
    # Per-gene frequency table for this tissue (value = gene ID, count = tested SNPs).
    numTestedSNPs = pd.read_csv('../../outputFiles/GTExV7/numTestedSNPs/' + tissue + '.txt').rename(
        columns={'value': 'Ensembl Gene ID', 'count': 'SNPs'}
    )

    # Strip version suffixes so IDs from both sources can be joined.
    numTestedSNPs["Ensembl Gene ID"] = numTestedSNPs["Ensembl Gene ID"].apply(removeGeneIDVersions)
    eQTLCountPerGeneMetasoft["Ensembl Gene ID"] = eQTLCountPerGeneMetasoft["Ensembl Gene ID"].apply(removeGeneIDVersions)

    # Keep annotated genes that were tested in this tissue, then attach eQTL counts.
    merge1 = pd.merge(genesWitheQTLTissueCountMetasoftAndCNVZarreiStatus[['Ensembl Gene ID','CNV']], numTestedSNPs, how = "inner", on = "Ensembl Gene ID")
    merge2 = pd.merge(merge1, eQTLCountPerGeneMetasoft, how = "left", on = "Ensembl Gene ID")
    # Genes absent from the eQTL query have no significant eQTLs.
    merge2['eQTLs'] = merge2['eQTLs'].fillna(0)

    merge2['propSignif'] = merge2['eQTLs']/merge2['SNPs']
    merge2['tissue'] = tissue
    cnv_zarrei_metasoft_frames.append(merge2)

allTissuesCombined_CNVZarrei_metasoft = pd.concat(cnv_zarrei_metasoft_frames) if cnv_zarrei_metasoft_frames else pd.DataFrame()

Adipose - Subcutaneous
Adipose - Visceral (Omentum)
Adrenal Gland
Artery - Aorta
Artery - Coronary
Artery - Tibial
Brain - Amygdala
Brain - Anterior cingulate cortex (BA24)
Brain - Caudate (basal ganglia)
Brain - Cerebellar Hemisphere
Brain - Cerebellum
Brain - Cortex
Brain - Frontal Cortex (BA9)
Brain - Hippocampus
Brain - Hypothalamus
Brain - Nucleus accumbens (basal ganglia)
Brain - Putamen (basal ganglia)
Brain - Spinal cord (cervical c-1)
Brain - Substantia nigra
Breast - Mammary Tissue
Cells - EBV-transformed lymphocytes
Cells - Transformed fibroblasts
Colon - Sigmoid
Colon - Transverse
Esophagus - Gastroesophageal Junction
Esophagus - Mucosa
Esophagus - Muscularis
Heart - Atrial Appendage
Heart - Left Ventricle
Liver
Lung
Minor Salivary Gland
Muscle - Skeletal
Nerve - Tibial
Ovary
Pancreas
Pituitary
Prostate
Skin - Not Sun Exposed (Suprapubic)
Skin - Sun Exposed (Lower leg)
Small Intestine - Terminal Ileum
Spleen
Stomach
Testis
Thyroid
Uterus
Vagina
Whole Blood


In [24]:
# Preview the first rows only; the combined table has one row per gene per tissue.
allTissuesCombined_CNVZarrei_metasoft.head(10)

Unnamed: 0,Ensembl Gene ID,CNV,field,SNPs,eQTLs,propSignif,tissue
0,ENSG00000138593,N,gene_id,5645,16.0,0.002834,Adipose - Subcutaneous
1,ENSG00000168675,Y,gene_id,7911,0.0,0.000000,Adipose - Subcutaneous
2,ENSG00000188992,N,gene_id,8607,248.0,0.028814,Adipose - Subcutaneous
3,ENSG00000185272,N,gene_id,8622,111.0,0.012874,Adipose - Subcutaneous
4,ENSG00000155304,N,gene_id,8566,26.0,0.003035,Adipose - Subcutaneous
5,ENSG00000166200,N,gene_id,5891,30.0,0.005093,Adipose - Subcutaneous
6,ENSG00000155307,N,gene_id,8545,0.0,0.000000,Adipose - Subcutaneous
7,ENSG00000153575,Y,gene_id,4601,130.0,0.028255,Adipose - Subcutaneous
8,ENSG00000180530,Y,gene_id,8460,0.0,0.000000,Adipose - Subcutaneous
9,ENSG00000137824,N,gene_id,5804,141.0,0.024294,Adipose - Subcutaneous


In [25]:
# Summary statistics for the numeric columns of the combined table.
allTissuesCombined_CNVZarrei_metasoft.describe()

Unnamed: 0,SNPs,eQTLs,propSignif
count,790141.0,790141.0,790141.0
mean,7199.385029,102.445062,0.014918
std,2238.421046,266.765389,0.033716
min,119.0,0.0,0.0
25%,6104.0,0.0,0.0
50%,7160.0,13.0,0.00181
75%,8125.0,107.0,0.01518
max,28260.0,8955.0,0.653515


In [26]:
# Save the combined table as CSV; index=False leaves the row index out of the file.
allTissuesCombined_CNVZarrei_metasoft.to_csv('../../outputFiles/propSNPsSignificanteQTLsCNVZarreiMetasoft.csv', index=False)

#### CNVs (ExAC data)

##### Bonferroni-corrected eQTLs

In [27]:
# Gene-level CNV annotation does not depend on the tissue, so it is read once.
genesWitheQTLTissueCountBonferroniAndCNVExACStatus = pd.read_csv('../../outputFiles/genesWitheQTLTissueCountBonferroniAndCNVExACStatus.csv', sep="\t", index_col="Unnamed: 0")

# One table per tissue, concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and re-copied the data on every call).
cnv_exac_bonferroni_frames = []
for tissue in tissues:
    print(tissue)
    # Per-gene count of rows flagged sigAfterBonferroni in this tissue.
    # `tissue` comes from the fixed `tissues` list above, not from user input.
    eQTLCountPerGeneBonferroni = pd.read_sql_query(
        'SELECT gene_id as "Ensembl Gene ID", COUNT(*) as eQTLs FROM `v7` WHERE sigAfterBonferroni = 1 AND tissue = "' + tissue + '" GROUP BY gene_id',
        engine,
        coerce_float=True
    )
    # Per-gene frequency table for this tissue (value = gene ID, count = tested SNPs).
    numTestedSNPs = pd.read_csv('../../outputFiles/GTExV7/numTestedSNPs/' + tissue + '.txt').rename(
        columns={'value': 'Ensembl Gene ID', 'count': 'SNPs'}
    )

    # Strip version suffixes so IDs from both sources can be joined.
    numTestedSNPs["Ensembl Gene ID"] = numTestedSNPs["Ensembl Gene ID"].apply(removeGeneIDVersions)
    eQTLCountPerGeneBonferroni["Ensembl Gene ID"] = eQTLCountPerGeneBonferroni["Ensembl Gene ID"].apply(removeGeneIDVersions)

    # Keep annotated genes that were tested in this tissue, then attach eQTL counts.
    merge1 = pd.merge(genesWitheQTLTissueCountBonferroniAndCNVExACStatus[['Ensembl Gene ID','CNV']], numTestedSNPs, how = "inner", on = "Ensembl Gene ID")
    merge2 = pd.merge(merge1, eQTLCountPerGeneBonferroni, how = "left", on = "Ensembl Gene ID")
    # Genes absent from the eQTL query have no significant eQTLs.
    merge2['eQTLs'] = merge2['eQTLs'].fillna(0)

    merge2['propSignif'] = merge2['eQTLs']/merge2['SNPs']
    merge2['tissue'] = tissue
    cnv_exac_bonferroni_frames.append(merge2)

allTissuesCombined_CNVExAC_bonferroni = pd.concat(cnv_exac_bonferroni_frames) if cnv_exac_bonferroni_frames else pd.DataFrame()

Adipose - Subcutaneous
Adipose - Visceral (Omentum)
Adrenal Gland
Artery - Aorta
Artery - Coronary
Artery - Tibial
Brain - Amygdala
Brain - Anterior cingulate cortex (BA24)
Brain - Caudate (basal ganglia)
Brain - Cerebellar Hemisphere
Brain - Cerebellum
Brain - Cortex
Brain - Frontal Cortex (BA9)
Brain - Hippocampus
Brain - Hypothalamus
Brain - Nucleus accumbens (basal ganglia)
Brain - Putamen (basal ganglia)
Brain - Spinal cord (cervical c-1)
Brain - Substantia nigra
Breast - Mammary Tissue
Cells - EBV-transformed lymphocytes
Cells - Transformed fibroblasts
Colon - Sigmoid
Colon - Transverse
Esophagus - Gastroesophageal Junction
Esophagus - Mucosa
Esophagus - Muscularis
Heart - Atrial Appendage
Heart - Left Ventricle
Liver
Lung
Minor Salivary Gland
Muscle - Skeletal
Nerve - Tibial
Ovary
Pancreas
Pituitary
Prostate
Skin - Not Sun Exposed (Suprapubic)
Skin - Sun Exposed (Lower leg)
Small Intestine - Terminal Ileum
Spleen
Stomach
Testis
Thyroid
Uterus
Vagina
Whole Blood


In [28]:
# Preview the first rows only; the combined table has one row per gene per tissue.
allTissuesCombined_CNVExAC_bonferroni.head(10)

Unnamed: 0,Ensembl Gene ID,CNV,field,SNPs,eQTLs,propSignif,tissue
0,ENSG00000138593,N,gene_id,5645,0.0,0.000000,Adipose - Subcutaneous
1,ENSG00000168675,Y,gene_id,7911,0.0,0.000000,Adipose - Subcutaneous
2,ENSG00000185272,Y,gene_id,8622,0.0,0.000000,Adipose - Subcutaneous
3,ENSG00000155304,Y,gene_id,8566,0.0,0.000000,Adipose - Subcutaneous
4,ENSG00000180530,N,gene_id,8460,0.0,0.000000,Adipose - Subcutaneous
5,ENSG00000137824,Y,gene_id,5804,25.0,0.004307,Adipose - Subcutaneous
6,ENSG00000137880,Y,gene_id,5790,0.0,0.000000,Adipose - Subcutaneous
7,ENSG00000104129,Y,gene_id,5741,1.0,0.000174,Adipose - Subcutaneous
8,ENSG00000177150,Y,gene_id,8431,0.0,0.000000,Adipose - Subcutaneous
9,ENSG00000155313,Y,gene_id,7272,0.0,0.000000,Adipose - Subcutaneous


In [29]:
# Summary statistics for the numeric columns of the combined table.
allTissuesCombined_CNVExAC_bonferroni.describe()

Unnamed: 0,SNPs,eQTLs,propSignif
count,651494.0,651494.0,651494.0
mean,7325.415852,15.831474,0.002309
std,2079.277298,70.025695,0.010511
min,1154.0,0.0,0.0
25%,6270.0,0.0,0.0
50%,7224.0,0.0,0.0
75%,8125.0,0.0,0.0
max,28260.0,3610.0,0.524256


In [30]:
# Save the combined table as CSV; index=False leaves the row index out of the file.
allTissuesCombined_CNVExAC_bonferroni.to_csv('../../outputFiles/propSNPsSignificanteQTLsCNVExACBonferroni.csv', index=False)

##### Metasoft eQTLs

In [31]:
# Gene-level CNV annotation does not depend on the tissue, so it is read once.
genesWitheQTLTissueCountMetasoftAndCNVExACStatus = pd.read_csv('../../outputFiles/genesWitheQTLTissueCountMetasoftAndCNVExACStatus.csv', sep="\t", index_col="Unnamed: 0")

# One table per tissue, concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and re-copied the data on every call).
cnv_exac_metasoft_frames = []
for tissue in tissues:
    print(tissue)
    # Column-name form of the tissue: ' - ' and spaces become '_', parentheses are dropped.
    tissue_url = tissue.replace(' - ','_').replace('(','').replace(')','').replace(' ','_')
    # Per-gene count of rows whose mval column for this tissue exceeds 0.9.
    eQTLCountPerGeneMetasoft = pd.read_sql_query(
        'SELECT gene as "Ensembl Gene ID", COUNT(*) as eQTLs FROM `v7Metasoft` WHERE `mval_' + tissue_url + '` > 0.9 GROUP BY gene',
        engine,
        coerce_float=True
    )
    # Per-gene frequency table for this tissue (value = gene ID, count = tested SNPs).
    numTestedSNPs = pd.read_csv('../../outputFiles/GTExV7/numTestedSNPs/' + tissue + '.txt').rename(
        columns={'value': 'Ensembl Gene ID', 'count': 'SNPs'}
    )

    # Strip version suffixes so IDs from both sources can be joined.
    numTestedSNPs["Ensembl Gene ID"] = numTestedSNPs["Ensembl Gene ID"].apply(removeGeneIDVersions)
    eQTLCountPerGeneMetasoft["Ensembl Gene ID"] = eQTLCountPerGeneMetasoft["Ensembl Gene ID"].apply(removeGeneIDVersions)

    # Keep annotated genes that were tested in this tissue, then attach eQTL counts.
    merge1 = pd.merge(genesWitheQTLTissueCountMetasoftAndCNVExACStatus[['Ensembl Gene ID','CNV']], numTestedSNPs, how = "inner", on = "Ensembl Gene ID")
    merge2 = pd.merge(merge1, eQTLCountPerGeneMetasoft, how = "left", on = "Ensembl Gene ID")
    # Genes absent from the eQTL query have no significant eQTLs.
    merge2['eQTLs'] = merge2['eQTLs'].fillna(0)

    merge2['propSignif'] = merge2['eQTLs']/merge2['SNPs']
    merge2['tissue'] = tissue
    cnv_exac_metasoft_frames.append(merge2)

allTissuesCombined_CNVExAC_metasoft = pd.concat(cnv_exac_metasoft_frames) if cnv_exac_metasoft_frames else pd.DataFrame()

Adipose - Subcutaneous
Adipose - Visceral (Omentum)
Adrenal Gland
Artery - Aorta
Artery - Coronary
Artery - Tibial
Brain - Amygdala
Brain - Anterior cingulate cortex (BA24)
Brain - Caudate (basal ganglia)
Brain - Cerebellar Hemisphere
Brain - Cerebellum
Brain - Cortex
Brain - Frontal Cortex (BA9)
Brain - Hippocampus
Brain - Hypothalamus
Brain - Nucleus accumbens (basal ganglia)
Brain - Putamen (basal ganglia)
Brain - Spinal cord (cervical c-1)
Brain - Substantia nigra
Breast - Mammary Tissue
Cells - EBV-transformed lymphocytes
Cells - Transformed fibroblasts
Colon - Sigmoid
Colon - Transverse
Esophagus - Gastroesophageal Junction
Esophagus - Mucosa
Esophagus - Muscularis
Heart - Atrial Appendage
Heart - Left Ventricle
Liver
Lung
Minor Salivary Gland
Muscle - Skeletal
Nerve - Tibial
Ovary
Pancreas
Pituitary
Prostate
Skin - Not Sun Exposed (Suprapubic)
Skin - Sun Exposed (Lower leg)
Small Intestine - Terminal Ileum
Spleen
Stomach
Testis
Thyroid
Uterus
Vagina
Whole Blood


In [32]:
# Preview the first rows only; the combined table has one row per gene per tissue.
allTissuesCombined_CNVExAC_metasoft.head(10)

Unnamed: 0,Ensembl Gene ID,CNV,field,SNPs,eQTLs,propSignif,tissue
0,ENSG00000138593,N,gene_id,5645,16.0,0.002834,Adipose - Subcutaneous
1,ENSG00000168675,Y,gene_id,7911,0.0,0.000000,Adipose - Subcutaneous
2,ENSG00000185272,Y,gene_id,8622,111.0,0.012874,Adipose - Subcutaneous
3,ENSG00000155304,Y,gene_id,8566,26.0,0.003035,Adipose - Subcutaneous
4,ENSG00000166200,N,gene_id,5891,30.0,0.005093,Adipose - Subcutaneous
5,ENSG00000155307,Y,gene_id,8545,0.0,0.000000,Adipose - Subcutaneous
6,ENSG00000180530,N,gene_id,8460,0.0,0.000000,Adipose - Subcutaneous
7,ENSG00000137824,Y,gene_id,5804,141.0,0.024294,Adipose - Subcutaneous
8,ENSG00000137880,Y,gene_id,5790,0.0,0.000000,Adipose - Subcutaneous
9,ENSG00000104129,Y,gene_id,5741,295.0,0.051385,Adipose - Subcutaneous


In [33]:
# Summary statistics for the numeric columns of the combined table.
allTissuesCombined_CNVExAC_metasoft.describe()

Unnamed: 0,SNPs,eQTLs,propSignif
count,651494.0,651494.0,651494.0
mean,7325.415852,95.647765,0.013803
std,2079.277298,215.870015,0.029745
min,1154.0,0.0,0.0
25%,6270.0,0.0,0.0
50%,7224.0,12.0,0.001724
75%,8125.0,104.0,0.014501
max,28260.0,8955.0,0.653515


In [34]:
# Save the combined table as CSV; index=False leaves the row index out of the file.
allTissuesCombined_CNVExAC_metasoft.to_csv('../../outputFiles/propSNPsSignificanteQTLsCNVExACMetasoft.csv', index=False)

#### Haploinsufficient genes

##### Bonferroni-corrected eQTLs

In [36]:
# Gene-level haploinsufficiency annotation does not depend on the tissue, so it is read once.
genesWitheQTLTissueCountBonferroniAndHaploStatus = pd.read_csv('../../outputFiles/genesWitheQTLTissueCountBonferroniAndHaploStatus.csv', sep="\t", index_col="Unnamed: 0")

# One table per tissue, concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and re-copied the data on every call).
haplo_bonferroni_frames = []
for tissue in tissues:
    print(tissue)
    # Per-gene count of rows flagged sigAfterBonferroni in this tissue.
    # `tissue` comes from the fixed `tissues` list above, not from user input.
    eQTLCountPerGeneBonferroni = pd.read_sql_query(
        'SELECT gene_id as "Ensembl Gene ID", COUNT(*) as eQTLs FROM `v7` WHERE sigAfterBonferroni = 1 AND tissue = "' + tissue + '" GROUP BY gene_id',
        engine,
        coerce_float=True
    )
    # Per-gene frequency table for this tissue (value = gene ID, count = tested SNPs).
    numTestedSNPs = pd.read_csv('../../outputFiles/GTExV7/numTestedSNPs/' + tissue + '.txt').rename(
        columns={'value': 'Ensembl Gene ID', 'count': 'SNPs'}
    )

    # Strip version suffixes so IDs from both sources can be joined.
    numTestedSNPs["Ensembl Gene ID"] = numTestedSNPs["Ensembl Gene ID"].apply(removeGeneIDVersions)
    eQTLCountPerGeneBonferroni["Ensembl Gene ID"] = eQTLCountPerGeneBonferroni["Ensembl Gene ID"].apply(removeGeneIDVersions)

    # Keep annotated genes that were tested in this tissue, then attach eQTL counts.
    merge1 = pd.merge(genesWitheQTLTissueCountBonferroniAndHaploStatus[['Ensembl Gene ID','haplo']], numTestedSNPs, how = "inner", on = "Ensembl Gene ID")
    merge2 = pd.merge(merge1, eQTLCountPerGeneBonferroni, how = "left", on = "Ensembl Gene ID")
    # Genes absent from the eQTL query have no significant eQTLs.
    merge2['eQTLs'] = merge2['eQTLs'].fillna(0)

    merge2['propSignif'] = merge2['eQTLs']/merge2['SNPs']
    merge2['tissue'] = tissue
    haplo_bonferroni_frames.append(merge2)

allTissuesCombined_Haplo_bonferroni = pd.concat(haplo_bonferroni_frames) if haplo_bonferroni_frames else pd.DataFrame()

Adipose - Subcutaneous
Adipose - Visceral (Omentum)
Adrenal Gland
Artery - Aorta
Artery - Coronary
Artery - Tibial
Brain - Amygdala
Brain - Anterior cingulate cortex (BA24)
Brain - Caudate (basal ganglia)
Brain - Cerebellar Hemisphere
Brain - Cerebellum
Brain - Cortex
Brain - Frontal Cortex (BA9)
Brain - Hippocampus
Brain - Hypothalamus
Brain - Nucleus accumbens (basal ganglia)
Brain - Putamen (basal ganglia)
Brain - Spinal cord (cervical c-1)
Brain - Substantia nigra
Breast - Mammary Tissue
Cells - EBV-transformed lymphocytes
Cells - Transformed fibroblasts
Colon - Sigmoid
Colon - Transverse
Esophagus - Gastroesophageal Junction
Esophagus - Mucosa
Esophagus - Muscularis
Heart - Atrial Appendage
Heart - Left Ventricle
Liver
Lung
Minor Salivary Gland
Muscle - Skeletal
Nerve - Tibial
Ovary
Pancreas
Pituitary
Prostate
Skin - Not Sun Exposed (Suprapubic)
Skin - Sun Exposed (Lower leg)
Small Intestine - Terminal Ileum
Spleen
Stomach
Testis
Thyroid
Uterus
Vagina
Whole Blood


In [37]:
# Preview the first rows only; the combined table has one row per gene per tissue.
allTissuesCombined_Haplo_bonferroni.head(10)

Unnamed: 0,Ensembl Gene ID,haplo,field,SNPs,eQTLs,propSignif,tissue
0,ENSG00000138593,N,gene_id,5645,0.0,0.000000,Adipose - Subcutaneous
1,ENSG00000168675,N,gene_id,7911,0.0,0.000000,Adipose - Subcutaneous
2,ENSG00000188992,N,gene_id,8607,0.0,0.000000,Adipose - Subcutaneous
3,ENSG00000185272,N,gene_id,8622,0.0,0.000000,Adipose - Subcutaneous
4,ENSG00000155304,N,gene_id,8566,0.0,0.000000,Adipose - Subcutaneous
5,ENSG00000153575,N,gene_id,4601,89.0,0.019344,Adipose - Subcutaneous
6,ENSG00000180530,Y,gene_id,8460,0.0,0.000000,Adipose - Subcutaneous
7,ENSG00000137824,N,gene_id,5804,25.0,0.004307,Adipose - Subcutaneous
8,ENSG00000137880,N,gene_id,5790,0.0,0.000000,Adipose - Subcutaneous
9,ENSG00000104129,N,gene_id,5741,1.0,0.000174,Adipose - Subcutaneous


In [38]:
# Summary statistics for the numeric columns of the combined table.
allTissuesCombined_Haplo_bonferroni.describe()

Unnamed: 0,SNPs,eQTLs,propSignif
count,724409.0,724409.0,724409.0
mean,7334.491661,17.525046,0.00249
std,2132.637488,86.479499,0.011607
min,119.0,0.0,0.0
25%,6256.0,0.0,0.0
50%,7228.0,0.0,0.0
75%,8149.0,0.0,0.0
max,28260.0,4540.0,0.524256


In [39]:
# Save the combined table as CSV; index=False leaves the row index out of the file.
allTissuesCombined_Haplo_bonferroni.to_csv('../../outputFiles/propSNPsSignificanteQTLsHaploBonferroni.csv', index=False)

##### Metasoft eQTLs

In [40]:
# Gene-level haploinsufficiency annotation does not depend on the tissue, so it is read once.
genesWitheQTLTissueCountMetasoftAndHaploStatus = pd.read_csv('../../outputFiles/genesWitheQTLTissueCountMetasoftAndHaploStatus.csv', sep="\t", index_col="Unnamed: 0")

# One table per tissue, concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and re-copied the data on every call).
haplo_metasoft_frames = []
for tissue in tissues:
    print(tissue)
    # Column-name form of the tissue: ' - ' and spaces become '_', parentheses are dropped.
    tissue_url = tissue.replace(' - ','_').replace('(','').replace(')','').replace(' ','_')
    # Per-gene count of rows whose mval column for this tissue exceeds 0.9.
    eQTLCountPerGeneMetasoft = pd.read_sql_query(
        'SELECT gene as "Ensembl Gene ID", COUNT(*) as eQTLs FROM `v7Metasoft` WHERE `mval_' + tissue_url + '` > 0.9 GROUP BY gene',
        engine,
        coerce_float=True
    )
    # Per-gene frequency table for this tissue (value = gene ID, count = tested SNPs).
    numTestedSNPs = pd.read_csv('../../outputFiles/GTExV7/numTestedSNPs/' + tissue + '.txt').rename(
        columns={'value': 'Ensembl Gene ID', 'count': 'SNPs'}
    )

    # Strip version suffixes so IDs from both sources can be joined.
    numTestedSNPs["Ensembl Gene ID"] = numTestedSNPs["Ensembl Gene ID"].apply(removeGeneIDVersions)
    eQTLCountPerGeneMetasoft["Ensembl Gene ID"] = eQTLCountPerGeneMetasoft["Ensembl Gene ID"].apply(removeGeneIDVersions)

    # Keep annotated genes that were tested in this tissue, then attach eQTL counts.
    merge1 = pd.merge(genesWitheQTLTissueCountMetasoftAndHaploStatus[['Ensembl Gene ID','haplo']], numTestedSNPs, how = "inner", on = "Ensembl Gene ID")
    merge2 = pd.merge(merge1, eQTLCountPerGeneMetasoft, how = "left", on = "Ensembl Gene ID")
    # Genes absent from the eQTL query have no significant eQTLs.
    merge2['eQTLs'] = merge2['eQTLs'].fillna(0)

    merge2['propSignif'] = merge2['eQTLs']/merge2['SNPs']
    merge2['tissue'] = tissue
    haplo_metasoft_frames.append(merge2)

allTissuesCombined_Haplo_metasoft = pd.concat(haplo_metasoft_frames) if haplo_metasoft_frames else pd.DataFrame()

Adipose - Subcutaneous
Adipose - Visceral (Omentum)
Adrenal Gland
Artery - Aorta
Artery - Coronary
Artery - Tibial
Brain - Amygdala
Brain - Anterior cingulate cortex (BA24)
Brain - Caudate (basal ganglia)
Brain - Cerebellar Hemisphere
Brain - Cerebellum
Brain - Cortex
Brain - Frontal Cortex (BA9)
Brain - Hippocampus
Brain - Hypothalamus
Brain - Nucleus accumbens (basal ganglia)
Brain - Putamen (basal ganglia)
Brain - Spinal cord (cervical c-1)
Brain - Substantia nigra
Breast - Mammary Tissue
Cells - EBV-transformed lymphocytes
Cells - Transformed fibroblasts
Colon - Sigmoid
Colon - Transverse
Esophagus - Gastroesophageal Junction
Esophagus - Mucosa
Esophagus - Muscularis
Heart - Atrial Appendage
Heart - Left Ventricle
Liver
Lung
Minor Salivary Gland
Muscle - Skeletal
Nerve - Tibial
Ovary
Pancreas
Pituitary
Prostate
Skin - Not Sun Exposed (Suprapubic)
Skin - Sun Exposed (Lower leg)
Small Intestine - Terminal Ileum
Spleen
Stomach
Testis
Thyroid
Uterus
Vagina
Whole Blood


In [41]:
# Preview the first rows only; the combined frame has one row per gene per tissue
# and is far too large to display in full.
allTissuesCombined_Haplo_metasoft.head(10)

Unnamed: 0,Ensembl Gene ID,haplo,field,SNPs,eQTLs,propSignif,tissue
0,ENSG00000138593,N,gene_id,5645,16.0,0.002834,Adipose - Subcutaneous
1,ENSG00000168675,N,gene_id,7911,0.0,0.000000,Adipose - Subcutaneous
2,ENSG00000188992,N,gene_id,8607,248.0,0.028814,Adipose - Subcutaneous
3,ENSG00000185272,N,gene_id,8622,111.0,0.012874,Adipose - Subcutaneous
4,ENSG00000155304,N,gene_id,8566,26.0,0.003035,Adipose - Subcutaneous
5,ENSG00000166200,Y,gene_id,5891,30.0,0.005093,Adipose - Subcutaneous
6,ENSG00000155307,N,gene_id,8545,0.0,0.000000,Adipose - Subcutaneous
7,ENSG00000153575,N,gene_id,4601,130.0,0.028255,Adipose - Subcutaneous
8,ENSG00000180530,Y,gene_id,8460,0.0,0.000000,Adipose - Subcutaneous
9,ENSG00000137824,N,gene_id,5804,141.0,0.024294,Adipose - Subcutaneous


In [42]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric
# columns of the combined table: SNPs, eQTLs and propSignif.
allTissuesCombined_Haplo_metasoft.describe()

Unnamed: 0,SNPs,eQTLs,propSignif
count,724409.0,724409.0,724409.0
mean,7334.491661,103.041729,0.0146
std,2132.637488,262.663843,0.032225
min,119.0,0.0,0.0
25%,6256.0,0.0,0.0
50%,7228.0,14.0,0.001882
75%,8149.0,109.0,0.015187
max,28260.0,8955.0,0.653515


In [43]:
# Save the combined table as CSV; index=False leaves the row index out of the file.
allTissuesCombined_Haplo_metasoft.to_csv('../../outputFiles/propSNPsSignificanteQTLsHaploMetasoft.csv', index=False)

#### Conserved copy number genes

##### Bonferroni-corrected eQTLs

In [44]:
# Build one long table (one row per gene per tissue) giving the proportion of
# tested SNPs that are Bonferroni-significant eQTLs, alongside each gene's 'CCN' status.

# The gene table with the 'CCN' column is identical for every tissue, so it is
# read once here rather than once per loop iteration.
genesWitheQTLTissueCountBonferroniAndCCNStatus = pd.read_csv('../../outputFiles/genesWitheQTLTissueCountBonferroniAndCCNStatus.csv', sep="\t", index_col="Unnamed: 0")

# Per-tissue results are collected in a list and concatenated once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole
# accumulated frame on every iteration.
perTissue_CCN_bonferroni = []
for tissue in tissues:
    print(tissue)

    # Number of rows per gene in `v7` flagged sigAfterBonferroni for this tissue.
    # The `v7` table is filtered by the original tissue name, so no
    # underscore-normalised name is needed in this loop.
    eQTLCountPerGeneBonferroni = pd.read_sql_query(
        'SELECT gene_id as "Ensembl Gene ID", COUNT(*) as eQTLs FROM `v7` WHERE sigAfterBonferroni = 1 AND tissue = "' + tissue + '" GROUP BY gene_id',
        engine,
        coerce_float=True
    )

    # Number of SNPs tested per gene in this tissue.
    numTestedSNPs = pd.read_csv('../../outputFiles/GTExV7/numTestedSNPs/' + tissue + '.txt').rename(
        columns={'value': 'Ensembl Gene ID', 'count': 'SNPs'}
    )

    # Strip Ensembl version suffixes so IDs from the different sources match.
    numTestedSNPs["Ensembl Gene ID"] = numTestedSNPs["Ensembl Gene ID"].apply(removeGeneIDVersions)
    eQTLCountPerGeneBonferroni["Ensembl Gene ID"] = eQTLCountPerGeneBonferroni["Ensembl Gene ID"].apply(removeGeneIDVersions)

    # Inner join: keep only genes that have a 'CCN' status AND were tested in this tissue.
    merge1 = pd.merge(genesWitheQTLTissueCountBonferroniAndCCNStatus[['Ensembl Gene ID','CCN']], numTestedSNPs, how = "inner", on = "Ensembl Gene ID")
    # Left join: genes absent from the SQL result have no eQTLs, so their NaN count becomes 0.
    merge2 = pd.merge(merge1, eQTLCountPerGeneBonferroni, how = "left", on = "Ensembl Gene ID")
    merge2['eQTLs'] = merge2['eQTLs'].fillna(0)

    merge2['propSignif'] = merge2['eQTLs']/merge2['SNPs']
    merge2['tissue'] = tissue
    perTissue_CCN_bonferroni.append(merge2)

# pd.concat raises on an empty list; fall back to an empty frame as the original did.
allTissuesCombined_CCN_bonferroni = pd.concat(perTissue_CCN_bonferroni) if perTissue_CCN_bonferroni else pd.DataFrame()

Adipose - Subcutaneous
Adipose - Visceral (Omentum)
Adrenal Gland
Artery - Aorta
Artery - Coronary
Artery - Tibial
Brain - Amygdala
Brain - Anterior cingulate cortex (BA24)
Brain - Caudate (basal ganglia)
Brain - Cerebellar Hemisphere
Brain - Cerebellum
Brain - Cortex
Brain - Frontal Cortex (BA9)
Brain - Hippocampus
Brain - Hypothalamus
Brain - Nucleus accumbens (basal ganglia)
Brain - Putamen (basal ganglia)
Brain - Spinal cord (cervical c-1)
Brain - Substantia nigra
Breast - Mammary Tissue
Cells - EBV-transformed lymphocytes
Cells - Transformed fibroblasts
Colon - Sigmoid
Colon - Transverse
Esophagus - Gastroesophageal Junction
Esophagus - Mucosa
Esophagus - Muscularis
Heart - Atrial Appendage
Heart - Left Ventricle
Liver
Lung
Minor Salivary Gland
Muscle - Skeletal
Nerve - Tibial
Ovary
Pancreas
Pituitary
Prostate
Skin - Not Sun Exposed (Suprapubic)
Skin - Sun Exposed (Lower leg)
Small Intestine - Terminal Ileum
Spleen
Stomach
Testis
Thyroid
Uterus
Vagina
Whole Blood


In [45]:
# Preview the first rows only; the combined frame has one row per gene per tissue
# and is far too large to display in full.
allTissuesCombined_CCN_bonferroni.head(10)

Unnamed: 0,Ensembl Gene ID,CCN,field,SNPs,eQTLs,propSignif,tissue
0,ENSG00000138593,Y,gene_id,5645,0.0,0.000000,Adipose - Subcutaneous
1,ENSG00000168675,Y,gene_id,7911,0.0,0.000000,Adipose - Subcutaneous
2,ENSG00000188992,N,gene_id,8607,0.0,0.000000,Adipose - Subcutaneous
3,ENSG00000185272,N,gene_id,8622,0.0,0.000000,Adipose - Subcutaneous
4,ENSG00000155304,Y,gene_id,8566,0.0,0.000000,Adipose - Subcutaneous
5,ENSG00000153575,Y,gene_id,4601,89.0,0.019344,Adipose - Subcutaneous
6,ENSG00000180530,Y,gene_id,8460,0.0,0.000000,Adipose - Subcutaneous
7,ENSG00000137824,N,gene_id,5804,25.0,0.004307,Adipose - Subcutaneous
8,ENSG00000137880,Y,gene_id,5790,0.0,0.000000,Adipose - Subcutaneous
9,ENSG00000104129,N,gene_id,5741,1.0,0.000174,Adipose - Subcutaneous


In [46]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric
# columns of the combined table: SNPs, eQTLs and propSignif.
allTissuesCombined_CCN_bonferroni.describe()

Unnamed: 0,SNPs,eQTLs,propSignif
count,774180.0,774180.0,774180.0
mean,7198.019774,17.562916,0.00257
std,2234.033048,90.408335,0.012775
min,119.0,0.0,0.0
25%,6107.0,0.0,0.0
50%,7156.0,0.0,0.0
75%,8118.0,0.0,0.0
max,28260.0,4540.0,0.524256


In [47]:
# Save the combined table as CSV; index=False leaves the row index out of the file.
allTissuesCombined_CCN_bonferroni.to_csv('../../outputFiles/propSNPsSignificanteQTLsCCNBonferroni.csv', index=False)

##### Metasoft eQTLs

In [48]:
# Build one long table (one row per gene per tissue) giving the proportion of
# tested SNPs that are Metasoft eQTLs, alongside each gene's 'CCN' status.

# The gene table with the 'CCN' column is identical for every tissue, so it is
# read once here rather than once per loop iteration.
genesWitheQTLTissueCountMetasoftAndCCNStatus = pd.read_csv('../../outputFiles/genesWitheQTLTissueCountMetasoftAndCCNStatus.csv', sep="\t", index_col="Unnamed: 0")

# Per-tissue results are collected in a list and concatenated once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole
# accumulated frame on every iteration.
perTissue_CCN_metasoft = []
for tissue in tissues:
    print(tissue)
    # Tissue name rewritten into the form used by the `mval_<tissue>` column
    # names of the v7Metasoft table (underscores, no parentheses).
    tissue_url = tissue.replace(' - ','_').replace('(','').replace(')','').replace(' ','_')

    # Number of SNP-gene pairs per gene whose m-value in this tissue exceeds 0.9.
    eQTLCountPerGeneMetasoft = pd.read_sql_query(
        'SELECT gene as "Ensembl Gene ID", COUNT(*) as eQTLs FROM `v7Metasoft` WHERE `mval_' + tissue_url + '` > 0.9 GROUP BY gene',
        engine,
        coerce_float=True
    )

    # Number of SNPs tested per gene in this tissue (file is keyed by the
    # original tissue name, not tissue_url).
    numTestedSNPs = pd.read_csv('../../outputFiles/GTExV7/numTestedSNPs/' + tissue + '.txt').rename(
        columns={'value': 'Ensembl Gene ID', 'count': 'SNPs'}
    )

    # Strip Ensembl version suffixes so IDs from the different sources match.
    numTestedSNPs["Ensembl Gene ID"] = numTestedSNPs["Ensembl Gene ID"].apply(removeGeneIDVersions)
    eQTLCountPerGeneMetasoft["Ensembl Gene ID"] = eQTLCountPerGeneMetasoft["Ensembl Gene ID"].apply(removeGeneIDVersions)

    # Inner join: keep only genes that have a 'CCN' status AND were tested in this tissue.
    merge1 = pd.merge(genesWitheQTLTissueCountMetasoftAndCCNStatus[['Ensembl Gene ID','CCN']], numTestedSNPs, how = "inner", on = "Ensembl Gene ID")
    # Left join: genes absent from the SQL result have no eQTLs, so their NaN count becomes 0.
    merge2 = pd.merge(merge1, eQTLCountPerGeneMetasoft, how = "left", on = "Ensembl Gene ID")
    merge2['eQTLs'] = merge2['eQTLs'].fillna(0)

    merge2['propSignif'] = merge2['eQTLs']/merge2['SNPs']
    merge2['tissue'] = tissue
    perTissue_CCN_metasoft.append(merge2)

# pd.concat raises on an empty list; fall back to an empty frame as the original did.
allTissuesCombined_CCN_metasoft = pd.concat(perTissue_CCN_metasoft) if perTissue_CCN_metasoft else pd.DataFrame()

Adipose - Subcutaneous
Adipose - Visceral (Omentum)
Adrenal Gland
Artery - Aorta
Artery - Coronary
Artery - Tibial
Brain - Amygdala
Brain - Anterior cingulate cortex (BA24)
Brain - Caudate (basal ganglia)
Brain - Cerebellar Hemisphere
Brain - Cerebellum
Brain - Cortex
Brain - Frontal Cortex (BA9)
Brain - Hippocampus
Brain - Hypothalamus
Brain - Nucleus accumbens (basal ganglia)
Brain - Putamen (basal ganglia)
Brain - Spinal cord (cervical c-1)
Brain - Substantia nigra
Breast - Mammary Tissue
Cells - EBV-transformed lymphocytes
Cells - Transformed fibroblasts
Colon - Sigmoid
Colon - Transverse
Esophagus - Gastroesophageal Junction
Esophagus - Mucosa
Esophagus - Muscularis
Heart - Atrial Appendage
Heart - Left Ventricle
Liver
Lung
Minor Salivary Gland
Muscle - Skeletal
Nerve - Tibial
Ovary
Pancreas
Pituitary
Prostate
Skin - Not Sun Exposed (Suprapubic)
Skin - Sun Exposed (Lower leg)
Small Intestine - Terminal Ileum
Spleen
Stomach
Testis
Thyroid
Uterus
Vagina
Whole Blood


In [49]:
# Preview the first rows only; the combined frame has one row per gene per tissue
# and is far too large to display in full.
allTissuesCombined_CCN_metasoft.head(10)

Unnamed: 0,Ensembl Gene ID,CCN,field,SNPs,eQTLs,propSignif,tissue
0,ENSG00000138593,Y,gene_id,5645,16.0,0.002834,Adipose - Subcutaneous
1,ENSG00000168675,Y,gene_id,7911,0.0,0.000000,Adipose - Subcutaneous
2,ENSG00000188992,N,gene_id,8607,248.0,0.028814,Adipose - Subcutaneous
3,ENSG00000185272,N,gene_id,8622,111.0,0.012874,Adipose - Subcutaneous
4,ENSG00000155304,Y,gene_id,8566,26.0,0.003035,Adipose - Subcutaneous
5,ENSG00000166200,N,gene_id,5891,30.0,0.005093,Adipose - Subcutaneous
6,ENSG00000155307,Y,gene_id,8545,0.0,0.000000,Adipose - Subcutaneous
7,ENSG00000153575,Y,gene_id,4601,130.0,0.028255,Adipose - Subcutaneous
8,ENSG00000180530,Y,gene_id,8460,0.0,0.000000,Adipose - Subcutaneous
9,ENSG00000137824,N,gene_id,5804,141.0,0.024294,Adipose - Subcutaneous


In [50]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric
# columns of the combined table: SNPs, eQTLs and propSignif.
allTissuesCombined_CCN_metasoft.describe()

Unnamed: 0,SNPs,eQTLs,propSignif
count,774180.0,774180.0,774180.0
mean,7198.019774,101.934655,0.014792
std,2234.033048,265.26121,0.033449
min,119.0,0.0,0.0
25%,6107.0,0.0,0.0
50%,7156.0,13.0,0.0018
75%,8118.0,106.0,0.015088
max,28260.0,8955.0,0.653515


In [51]:
# Save the combined table as CSV; index=False leaves the row index out of the file.
allTissuesCombined_CCN_metasoft.to_csv('../../outputFiles/propSNPsSignificanteQTLsCCNMetasoft.csv', index=False)