# GAD (Genetic Association Database): Gene-Disease Associations

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: https://geneticassociationdb.nih.gov/

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import my_functions as mf
%matplotlib inline

In [2]:
# Re-import my_functions so that edits made to the module on disk are picked up
# without restarting the kernel.
importlib.reload(mf)

<module 'my_functions' from '/Users/moshesilverstein/Documents/Harmonizome/GAD/my_functions.py'>

# Load Data

In [3]:
# Load the tab-separated GAD export; pandas infers the zip compression from the
# file extension. low_memory=False makes pandas parse the file in one pass
# instead of in chunks.
df = pd.read_csv(
    'Input/data/GADCDC/GADCDC_data.zip',
    sep='\t',
    low_memory=False,
)

In [4]:
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,ID,ALTNAME,DISEASE,DIS_CLAS,CH_BAND,GENE,DNAPOST,P_VALUE,REFERENC,PUBMEDID,...,I_CHECK,UCSC,CHECKED,IPADDR,CNV,GEOLOC,MESHDIS,ACEVIEW,CTD,SNPEDIA
0,125158,||HLA-A||HLA-H||HLA-J||leukocyte antigen-A*010...,leukemia,CANCER,6p21.3,HLA-A,,,"Xiao, W. Y. et al. 2005",16120569.0,...,,http://genome.ucsc.edu/cgi-bin/hgTracks?org=Hu...,,,http://projects.tcag.ca/variation/search.asp?k...,,,http://www.ncbi.nlm.nih.gov/IEB/Research/Acemb...,http://ctd.mdibl.org/detail.go?type=gene&db=GE...,http://www.snpedia.com/index.php/HLA-A
1,125159,||HLA-A||HLA-H||HLA-J||leukocyte antigen-A*010...,leukemia,CANCER,6p21.3,HLA-A,,,"Zhou, L. X. et al. 2005",16143070.0,...,,http://genome.ucsc.edu/cgi-bin/hgTracks?org=Hu...,,,http://projects.tcag.ca/variation/search.asp?k...,,,http://www.ncbi.nlm.nih.gov/IEB/Research/Acemb...,http://ctd.mdibl.org/detail.go?type=gene&db=GE...,http://www.snpedia.com/index.php/HLA-A
2,125160,||HLA-A||HLA-H||HLA-J||leukocyte antigen-A*010...,alopecia areata,IMMUNE,6p21.3,HLA-A,,,"Xiao, F. L. et al. 2005",16185849.0,...,,http://genome.ucsc.edu/cgi-bin/hgTracks?org=Hu...,,,http://projects.tcag.ca/variation/search.asp?k...,,Alopecia Areata,http://www.ncbi.nlm.nih.gov/IEB/Research/Acemb...,http://ctd.mdibl.org/detail.go?type=gene&db=GE...,http://www.snpedia.com/index.php/HLA-A
3,125155,||HLA-A||HLA-H||HLA-J||leukocyte antigen-A*010...,leukemia,CANCER,6p21.3,HLA-A,,,"Kang, L. et al. 2005",15793795.0,...,,http://genome.ucsc.edu/cgi-bin/hgTracks?org=Hu...,,,http://projects.tcag.ca/variation/search.asp?k...,,,http://www.ncbi.nlm.nih.gov/IEB/Research/Acemb...,http://ctd.mdibl.org/detail.go?type=gene&db=GE...,http://www.snpedia.com/index.php/HLA-A
4,125157,||HLA-A||HLA-H||HLA-J||leukocyte antigen-A*010...,melanoma,CANCER,6p21.3,HLA-A,,,"Spinola, H. et al. 2005",16101833.0,...,,http://genome.ucsc.edu/cgi-bin/hgTracks?org=Hu...,,,http://projects.tcag.ca/variation/search.asp?k...,,,http://www.ncbi.nlm.nih.gov/IEB/Research/Acemb...,http://ctd.mdibl.org/detail.go?type=gene&db=GE...,http://www.snpedia.com/index.php/HLA-A


In [5]:
# (rows, columns) of the raw table.
df.shape

(167132, 89)

# Get Relevant Data

In [6]:
# Keep only the gene symbol and disease columns. Take an explicit copy: later
# cells modify this frame in place (dropna, set_index), and doing that on a
# bare column selection makes pandas emit SettingWithCopyWarning.
df = df[['GENE', 'DISEASE']].copy()

In [7]:
# Discard rows where either column is missing. Rebind the result instead of
# using inplace=True so the frame derived from the column selection is not
# mutated in place (which pandas flags with SettingWithCopyWarning).
df = df.dropna(how='any')

In [8]:
# Preview the retained columns after rows with missing values were removed.
df.head()

Unnamed: 0,GENE,DISEASE
0,HLA-A,leukemia
1,HLA-A,leukemia
2,HLA-A,alopecia areata
3,HLA-A,leukemia
4,HLA-A,melanoma


# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [10]:
# Use the gene symbol as the row index for the symbol-mapping step.
df = df.set_index('GENE')

In [11]:
# Map the gene symbols held in the index to up-to-date approved symbols.
# NOTE(review): the return value is not captured and later cells keep using df,
# so mapgenesymbols presumably rewrites df in place — confirm in my_functions.
mf.mapgenesymbols(df)

Progeres: 100%  163203 Out of 163203   

# Drop Duplicates

In [12]:
# Turn the index back into an ordinary column so that de-duplication considers
# it together with DISEASE.
df = df.reset_index()

In [13]:
# Remove rows that are identical across all columns, keeping the first occurrence.
df = df.drop_duplicates()

In [14]:
# (rows, columns) after symbol mapping and de-duplication.
df.shape

(109098, 2)

# Create Binary Matrix

In [15]:
# Build the gene-by-attribute matrix from the de-duplicated pairs in df.
# NOTE(review): 0/1 membership encoding with genes as rows and disease terms as
# columns is inferred from the helper's name and the displayed output — confirm
# in my_functions.
binary_matrix = mf.createBinaryMatix(df)

Progeres: 100%  14107 Out of 14107   

In [16]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,Renal cell carcinoma,congenital goiter and defective TG synthesis.,"diabetes, type 2; hypertension; glucose tolerance",carotid artery atherosclerosis,Crohn Disease|Inflammatory Bowel Diseases,"blood pressure; left ventricular mass; atherosclerosis, carotid",Body Weight|Hypertension|Insulin Resistance|Obesity,myocardial infarction; folate; homocysteine,side-effects of clozapine,Coronary Restenosis|Recurrence,...,"Chronic renal failure|Hypertension|Kidney Failure, Chronic|Nephrosclerosis","Adenoma|Colonic Neoplasms|Colonic Polyps|Neoplasm Recurrence, Local","myocardial infarct; C-reactive protein; stroke, ischemic",Alopecia Areata|,"retinopathy, diabetic; nephropathy in other diseases; neuropathy",obesity; bulimia nervosa,Bloom syndrome; Fanconi's anemia,Language Development Disorders|Mental Retardation,Graves disease,schizophrenia; body fat; weight gain
EGF,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MIR128-2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HLA-J,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LSAMP,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
KIAA0895,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# (rows, columns) of the binary matrix.
binary_matrix.shape

(14107, 15522)

# Save Binary Matrix

In [18]:
# Stamp the output file with the current year and month (YYYY_MM).
filename = '~/./Documents/Harmonizome/GAD/Output/gad_binary_matrix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
# The file name ends in .zip, so the compression must be 'zip' for the contents
# to match the extension; readers that infer compression from the name would
# otherwise fail to open it.
binary_matrix.to_csv(filename, sep='\t', compression='zip')

# Create Gene Set Library

In [19]:
# Output directory handed to the my_functions writers. Derived from the current
# user's home directory instead of a hard-coded /Users/<name> prefix, and
# expanded here so the helper receives a path with no '~' left in it.
path = os.path.expanduser('~/Documents/Harmonizome/GAD/Output/')

In [20]:
# Base name passed to the gene-set library writer.
name = 'gad_gene_set'

In [21]:
# Create the gene-set library from binary_matrix, passing the output directory
# and base name.
# NOTE(review): the exact file name and format are decided inside my_functions —
# confirm there.
mf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  15522 Out of 15522   

# Create Attribute Library

In [22]:
# Output directory handed to the my_functions writers. Derived from the current
# user's home directory instead of a hard-coded /Users/<name> prefix, and
# expanded here so the helper receives a path with no '~' left in it.
path = os.path.expanduser('~/Documents/Harmonizome/GAD/Output/')

In [23]:
# Base name passed to the attribute-set library writer.
name = 'gad_attribute_set'

In [24]:
# Create the attribute-set library from binary_matrix, passing the output
# directory and base name.
# NOTE(review): the exact file name and format are decided inside my_functions —
# confirm there.
mf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  14107 Out of 14107   

# Create Gene Similarity Matrix

In [25]:
# Compute a similarity matrix from binary_matrix using the 'jaccard' metric.
# NOTE(review): similarity is presumably taken between rows (genes), since the
# displayed result is gene-by-gene — confirm in my_functions.
gene_similarity_matix = mf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [26]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,EGF,MIR128-2,HLA-J,LSAMP,KIAA0895,TNFSF9,KCNIP2,FAT1P1,MYH6,HNF1B,...,PDZD2,SEPT9,GOT2P1,NANOG,FOXD2,MTMR8,HSPA1A,PIGG,PARK7,LILRB4
EGF,1.0,0.014706,0.0,0.013889,0.0,0.013158,0.0,0.0,0.013889,0.053191,...,0.014085,0.013333,0.0,0.0,0.0,0.0,0.055556,0.014286,0.0,0.0
MIR128-2,0.014706,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HLA-J,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.021739,0.0,0.0,0.0
LSAMP,0.013889,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.028571,...,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
KIAA0895,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Gene Similarity Matrix

In [27]:
# Stamp the output file with the current year and month (YYYY_MM).
filename = '~/./Documents/Harmonizome/GAD/Output/gad_gene_similarity_matix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
# The file name ends in .zip, so the compression must be 'zip' for the contents
# to match the extension; readers that infer compression from the name would
# otherwise fail to open it.
gene_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Attribute Similarity Matrix

In [28]:
# Same similarity computation on the transposed matrix, so the attributes
# (columns of binary_matrix) take the place of the genes.
attribute_similarity_matix = mf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [29]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,Renal cell carcinoma,congenital goiter and defective TG synthesis.,"diabetes, type 2; hypertension; glucose tolerance",carotid artery atherosclerosis,Crohn Disease|Inflammatory Bowel Diseases,"blood pressure; left ventricular mass; atherosclerosis, carotid",Body Weight|Hypertension|Insulin Resistance|Obesity,myocardial infarction; folate; homocysteine,side-effects of clozapine,Coronary Restenosis|Recurrence,...,"Chronic renal failure|Hypertension|Kidney Failure, Chronic|Nephrosclerosis","Adenoma|Colonic Neoplasms|Colonic Polyps|Neoplasm Recurrence, Local","myocardial infarct; C-reactive protein; stroke, ischemic",Alopecia Areata|,"retinopathy, diabetic; nephropathy in other diseases; neuropathy",obesity; bulimia nervosa,Bloom syndrome; Fanconi's anemia,Language Development Disorders|Mental Retardation,Graves disease,schizophrenia; body fat; weight gain
Renal cell carcinoma,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
congenital goiter and defective TG synthesis.,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02439,0.0
"diabetes, type 2; hypertension; glucose tolerance",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
carotid artery atherosclerosis,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Crohn Disease|Inflammatory Bowel Diseases,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Attribute Similarity Matrix

In [30]:
# Stamp the output file with the current year and month (YYYY_MM).
filename = '~/./Documents/Harmonizome/GAD/Output/gad_attribute_similarity_matix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
# The file name ends in .zip, so the compression must be 'zip' for the contents
# to match the extension; readers that infer compression from the name would
# otherwise fail to open it.
attribute_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Gene List

In [31]:
# Build the gene list from binary_matrix.
# NOTE(review): the displayed result has GeneSym and GeneID columns; how IDs are
# looked up is decided inside my_functions — confirm there.
gene_list = mf.createGeneList(binary_matrix)

Progeres: 100%  14107 Out of 14107   

In [32]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,EGF,1950
1,MIR128-2,406916
2,HLA-J,3137
3,LSAMP,4045
4,KIAA0895,23366


In [33]:
# (rows, columns) of the gene list.
gene_list.shape

(14107, 2)

# Save Gene List

In [34]:
# Stamp the output file with the current year and month (YYYY_MM).
filename = '~/./Documents/Harmonizome/GAD/Output/gad_gene_list_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
# The file name ends in .zip, so the compression must be 'zip' for the contents
# to match the extension; readers that infer compression from the name would
# otherwise fail to open it. The positional index is not written.
gene_list.to_csv(filename, sep='\t', index=False, compression='zip')

# Create Attribute List

In [35]:
# Build the attribute list from binary_matrix.
# NOTE(review): presumably one row per column of binary_matrix, going by the
# displayed output — confirm in my_functions.
attribute_list = mf.createAttributeList(binary_matrix)

In [36]:
# Preview the first rows of the attribute list.
attribute_list.head()

Unnamed: 0,Attributes
0,Renal cell carcinoma
1,congenital goiter and defective TG synthesis.
2,"diabetes, type 2; hypertension; glucose tolerance"
3,carotid artery atherosclerosis
4,Crohn Disease|Inflammatory Bowel Diseases


In [37]:
# (rows, columns) of the attribute list.
attribute_list.shape

(15522, 1)

# Save Attribute List

In [38]:
# Stamp the output file with the current year and month (YYYY_MM).
filename = '~/./Documents/Harmonizome/GAD/Output/gad_attribute_list_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
# The file name ends in .zip, so the compression must be 'zip' for the contents
# to match the extension; readers that infer compression from the name would
# otherwise fail to open it. The positional index is not written.
attribute_list.to_csv(filename, sep='\t', index=False, compression='zip')

# Create Gene-Attribute Edge List

In [39]:
# Output directory handed to the my_functions writers. Derived from the current
# user's home directory instead of a hard-coded /Users/<name> prefix, and
# expanded here so the helper receives a path with no '~' left in it.
path = os.path.expanduser('~/Documents/Harmonizome/GAD/Output/')

In [40]:
# Base name passed to the gene-attribute edge list writer.
name = 'gad_gene_attribute_edge_list'

In [41]:
# Create the gene-attribute edge list from binary_matrix and gene_list, passing
# the output directory and base name.
# NOTE(review): the exact file name/format and the association count the helper
# prints are determined inside my_functions — confirm there.
mf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  15522 Out of 15522   

 The number of statisticaly relevent gene-attribute associations is: 109097
