# dbGAP 	(Database of Genotypes and Phenotypes)

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: http://www.ncbi.nlm.nih.gov/gap

In [4]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [5]:
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/dbGAP/untility_functions.py'>

# Load Data

In [22]:
df = pd.read_csv('Input/GWASCatalogDump.txt', sep='\t')

In [8]:
df.head()

Unnamed: 0,marker_accession,marker_type_name,chr_id,chr_pos,p_value,pubmed_id,author,journal,pubmed_link,study,...,platform,ci_95,chr_location,initial_size,replicate_size,merged,cur_snp_id,snp_gene_symbols,snp_gene_ids,snp_gene_validated
0,3934834,SNP,1,1005805,6e-07,19851299,Johansson A,Obesity (Silver Spring),http://www.ncbi.nlm.nih.gov/pubmed/19851299?it...,Linkage and genome-wide association analysis o...,...,"Illumina [318,237]",,1p36.33,"Up to 3,925 European individuals",,0,3934834,,,
1,425277,SNP,1,2069171,2e-08,20881960,Lango Allen H,Nature,http://www.ncbi.nlm.nih.gov/pubmed/20881960,Hundreds of variants clustered in genomic loci...,...,"Affymetrix, Illumina [2,834,208] (imputed)",,,"133,653 European ancestry individuals","50,074 European ancestry individuals",0,425277,PRKCZ,5590.0,1.0
2,3753242,SNP,1,2069680,1e-06,21107309,McClay JL,Neuropsychopharmacology,http://www.ncbi.nlm.nih.gov/pubmed/21107309,Genome-wide pharmacogenomic study of neurocogn...,...,"Affymetrix [492,900]",,1p36.33,738 schizophrenia cases,,0,3753242,PRKCZ,5590.0,1.0
3,2477686,SNP,1,2392647,6e-12,22197933,Hu Z,Nat Genet,http://www.ncbi.nlm.nih.gov/pubmed/22197933,A genome-wide association study in Chinese men...,...,"Affymetrix [587,347]",[1.26-1.52],,"981 Chinese ancestry cases, 1,657 Chinese ance...","1,946 Chinese ancestry cases, 4,077 Chinese an...",0,2477686,,,
4,734999,SNP,1,2513215,3e-09,21297633,Anderson CA,Nat Genet,http://www.ncbi.nlm.nih.gov/pubmed/21297633,Meta-analysis identifies 29 additional ulcerat...,...,Affymetrix &amp; Illumina [~1.1 million] (impu...,[1.01-1.09],1p36.32,"6,687 European ancestry cases, 19,718 European...","9,628 European ancestry cases, 12,917 European...",0,734999,,,


# Get Relevant Data

In [25]:
df = df[['trait', 'gene']]

In [26]:
df.head()

Unnamed: 0,trait,gene
0,Body mass index,
1,Height,PRKCZ
2,Reasoning,PRKCZ
3,Non-obstructive azoospermia,"PEX10, MMEL1"
4,Ulcerative colitis,"TNFRSF14, MMEL1, PLCH2, C1orf93"


In [31]:
df_interactions = pd.DataFrame()

for i, index in enumerate(df.index):
    
    progressPercent = ((i+1)/len(df.index))*100

    sys.stdout.write("Progeres: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(df.index)))
    sys.stdout.flush()
    
    if type(df.ix[index, 'gene']) != float:
        lst2 = df.ix[index, 'gene'].split(',')
        lst1 = [df.ix[index, 'trait'].split('(')[0]]*(len(lst2))
        temp = pd.DataFrame()
        temp['GeneSymbol'] = lst2
        temp['Complex'] = lst1
        df_interactions = pd.concat([df_interactions, temp]) 

Progeres: 100%  8913 Out of 8913   

In [32]:
df_interactions.head()

Unnamed: 0,GeneSymbol,Complex
0,PRKCZ,Height
0,PRKCZ,Reasoning
0,PEX10,Non-obstructive azoospermia
1,MMEL1,Non-obstructive azoospermia
0,TNFRSF14,Ulcerative colitis


In [33]:
df_interactions.shape

(10746, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols¶

In [34]:
df_interactions.set_index('GeneSymbol', inplace=True)

In [35]:
uf.mapgenesymbols(df_interactions)

Progeres: 100%  10746 Out of 10746   

# Drop Duplicates

In [36]:
df_interactions.reset_index(inplace=True)

In [37]:
df_interactions.drop_duplicates(inplace=True)

In [38]:
df_interactions.shape

(6088, 2)

# Create Binary Matrix

In [39]:
binary_matrix = uf.createBinaryMatix(df_interactions)

Progeres: 100%  3616 Out of 3616   

In [40]:
binary_matrix.head()

Unnamed: 0,Immunoglobulin A,Interstitial lung disease,Smoking cessation,Systemic lupus erythematosus,5-HTT brain serotonin transporter levels,Crohn&#39;s disease and sarcoidosis,Eye color,Nephrolithiasis,Soluble leptin receptor levels,Lung cancer,...,Response to iloperidone treatment,Intracranial aneurysm,Event-related brain oscillations,Response to angiotensin II receptor blocker therapy,Mean corpuscular hemoglobin,Biliary atresia,Idiopathic pulmonary fibrosis,Menopause,Fasting glucose-related traits,Fetal hemoglobin levels
AKR1C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RPP30,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FBXL19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ARID2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
binary_matrix.shape

(3616, 591)

# Save Binary Matrix

In [43]:
filename = '~/./Documents/Harmonizome/dbGAP/Output/dbgap_binary_matrix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [44]:
path = '/Users/moshesilverstein/Documents/Harmonizome/dbGAP/Output/'

In [45]:
name = 'dbgap_gene_set'

In [46]:
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  591 Out of 591   

# Create Attribute Library

In [47]:
path = '/Users/moshesilverstein/Documents/Harmonizome/dbGAP/Output/'

In [48]:
name = 'dbgap_attribute_set'

In [49]:
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  3616 Out of 3616   

# Create Gene Similarity Matrix

In [50]:
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [51]:
gene_similarity_matix.head()

Unnamed: 0,AKR1C1,RPP30,FBXL19,ARID2,LAT,VSNL1,IL4R,CHST12,IL23R,CCDC3,...,HERC5,ZBTB38,RSPO3,USP40,CBLB,CREM,PKHD1,PPARGC1B,RAD50,SKAP1
AKR1C1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RPP30,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FBXL19,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ARID2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LAT,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Save Gene Similarity Matrix

In [52]:
filename = '~/./Documents/Harmonizome/dbGAP/Output/dbgap_gene_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity matrix

In [53]:
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [54]:
attribute_similarity_matix.head()

Unnamed: 0,Immunoglobulin A,Interstitial lung disease,Smoking cessation,Systemic lupus erythematosus,5-HTT brain serotonin transporter levels,Crohn&#39;s disease and sarcoidosis,Eye color,Nephrolithiasis,Soluble leptin receptor levels,Lung cancer,...,Response to iloperidone treatment,Intracranial aneurysm,Event-related brain oscillations,Response to angiotensin II receptor blocker therapy,Mean corpuscular hemoglobin,Biliary atresia,Idiopathic pulmonary fibrosis,Menopause,Fasting glucose-related traits,Fetal hemoglobin levels
Immunoglobulin A,1.0,0.0,0.0,0.020408,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Interstitial lung disease,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Smoking cessation,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Systemic lupus erythematosus,0.020408,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5-HTT brain serotonin transporter levels,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Attribute Similarity Matrix

In [55]:
filename = '~/./Documents/Harmonizome/dbGAP/Output/dbgap_attribute_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [56]:
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  3616 Out of 3616   

In [57]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,AKR1C1,1645
1,RPP30,10556
2,FBXL19,54620
3,ARID2,196528
4,LAT,27040


In [58]:
gene_list.shape

(3616, 2)

# Save Gene List

In [59]:
filename = '~/./Documents/Harmonizome/dbGAP/Output/dbgap_gene_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [60]:
attribute_list = uf.createAttributeList(binary_matrix)

In [61]:
attribute_list.head()

Unnamed: 0,Attributes
0,Immunoglobulin A
1,Interstitial lung disease
2,Smoking cessation
3,Systemic lupus erythematosus
4,5-HTT brain serotonin transporter levels


In [62]:
attribute_list.shape

(591, 1)

# Save Attribute List

In [63]:
filename = '~/./Documents/Harmonizome/dbGAP/Output/dbgap_attribute_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [64]:
path = '/Users/moshesilverstein/Documents/Harmonizome/dbGAP/Output/'

In [65]:
name = 'dbgap_gene_attribute_edge_list'

In [66]:
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  591 Out of 591   

 The number of statisticaly relevent gene-attribute associations is: 6088
