# GAD (High Level Gene-Disease Associations)

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: https://geneticassociationdb.nih.gov/

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
# Reload the helper module so edits to untility_functions.py take effect
# without restarting the kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/GAD/untility_functions.py'>

# Load Data

In [3]:
# Load the tab-delimited GAD export; pandas infers the compression from the
# .zip extension of the archive.
gad_archive = 'Input/data/GADCDC/GADCDC_data.zip'
df = pd.read_csv(gad_archive, delimiter='\t', low_memory=False)

In [4]:
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,ID,ALTNAME,DISEASE,DIS_CLAS,CH_BAND,GENE,DNAPOST,P_VALUE,REFERENC,PUBMEDID,...,I_CHECK,UCSC,CHECKED,IPADDR,CNV,GEOLOC,MESHDIS,ACEVIEW,CTD,SNPEDIA
0,125158,||HLA-A||HLA-H||HLA-J||leukocyte antigen-A*010...,leukemia,CANCER,6p21.3,HLA-A,,,"Xiao, W. Y. et al. 2005",16120569.0,...,,http://genome.ucsc.edu/cgi-bin/hgTracks?org=Hu...,,,http://projects.tcag.ca/variation/search.asp?k...,,,http://www.ncbi.nlm.nih.gov/IEB/Research/Acemb...,http://ctd.mdibl.org/detail.go?type=gene&db=GE...,http://www.snpedia.com/index.php/HLA-A
1,125159,||HLA-A||HLA-H||HLA-J||leukocyte antigen-A*010...,leukemia,CANCER,6p21.3,HLA-A,,,"Zhou, L. X. et al. 2005",16143070.0,...,,http://genome.ucsc.edu/cgi-bin/hgTracks?org=Hu...,,,http://projects.tcag.ca/variation/search.asp?k...,,,http://www.ncbi.nlm.nih.gov/IEB/Research/Acemb...,http://ctd.mdibl.org/detail.go?type=gene&db=GE...,http://www.snpedia.com/index.php/HLA-A
2,125160,||HLA-A||HLA-H||HLA-J||leukocyte antigen-A*010...,alopecia areata,IMMUNE,6p21.3,HLA-A,,,"Xiao, F. L. et al. 2005",16185849.0,...,,http://genome.ucsc.edu/cgi-bin/hgTracks?org=Hu...,,,http://projects.tcag.ca/variation/search.asp?k...,,Alopecia Areata,http://www.ncbi.nlm.nih.gov/IEB/Research/Acemb...,http://ctd.mdibl.org/detail.go?type=gene&db=GE...,http://www.snpedia.com/index.php/HLA-A
3,125155,||HLA-A||HLA-H||HLA-J||leukocyte antigen-A*010...,leukemia,CANCER,6p21.3,HLA-A,,,"Kang, L. et al. 2005",15793795.0,...,,http://genome.ucsc.edu/cgi-bin/hgTracks?org=Hu...,,,http://projects.tcag.ca/variation/search.asp?k...,,,http://www.ncbi.nlm.nih.gov/IEB/Research/Acemb...,http://ctd.mdibl.org/detail.go?type=gene&db=GE...,http://www.snpedia.com/index.php/HLA-A
4,125157,||HLA-A||HLA-H||HLA-J||leukocyte antigen-A*010...,melanoma,CANCER,6p21.3,HLA-A,,,"Spinola, H. et al. 2005",16101833.0,...,,http://genome.ucsc.edu/cgi-bin/hgTracks?org=Hu...,,,http://projects.tcag.ca/variation/search.asp?k...,,,http://www.ncbi.nlm.nih.gov/IEB/Research/Acemb...,http://ctd.mdibl.org/detail.go?type=gene&db=GE...,http://www.snpedia.com/index.php/HLA-A


In [5]:
# (rows, columns) of the raw table.
df.shape

(167132, 89)

# Get Relevant Data

In [6]:
# Keep only the gene symbol and the high-level disease class. The explicit
# copy makes this an independent frame, so later modifications do not act on
# a slice of the raw table (which raises SettingWithCopyWarning).
df = df[['GENE', 'DIS_CLAS']].copy()

In [7]:
# Discard rows where either remaining column is missing. Reassigning instead
# of using inplace=True keeps this step free of SettingWithCopyWarning.
df = df.dropna(how='any')

In [8]:
# Preview the reduced GENE / DIS_CLAS table.
df.head()

Unnamed: 0,GENE,DIS_CLAS
0,HLA-A,CANCER
1,HLA-A,CANCER
2,HLA-A,IMMUNE
3,HLA-A,CANCER
4,HLA-A,CANCER


# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [9]:
# Move the gene symbols into the index ahead of the symbol-mapping step.
# Reassignment replaces inplace=True, which pandas discourages and which can
# warn when the frame originated from a column selection.
df = df.set_index('GENE')

In [10]:
# Map the gene symbols to up-to-date approved symbols.
# NOTE(review): the return value is discarded, so the helper presumably
# updates `df` in place — confirm in untility_functions.mapgenesymbols.
uf.mapgenesymbols(df)

Progeres: 99%  166329 Out of 166447   

# Drop Duplicates

In [11]:
# Turn the gene index back into an ordinary column so that duplicates are
# judged on gene and disease class together. Reassignment replaces
# inplace=True.
df = df.reset_index()

In [12]:
# Collapse repeated rows so each gene / disease-class pair appears once.
# Reassignment replaces inplace=True.
df = df.drop_duplicates()

In [13]:
# (rows, columns) of the table after duplicates are removed.
df.shape

# Create Binary Matrix

In [14]:
# Build the gene-by-disease-class matrix from the de-duplicated table. The
# preview in the next cell shows genes as rows, disease classes as columns
# and 0/1 entries.
binary_matrix = uf.createBinaryMatix(df)

Progeres: 100%  14120 Out of 14120   

In [15]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,METABOLIC,CARDIOVASCULAR,NEUROLOGICAL,AGING,VISION,UNKNOWN,RENAL,MITOCHONDRIAL,DEVELOPMENTAL,INFECTION,OTHER,REPRODUCTION,HEMATOLOGICAL,PHARMACOGENOMIC,PSYCH,CANCER,NORMALVARIATION,IMMUNE,CHEMDEPENDENCY
BLOC1S5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
RPS3AP1,1,0,1,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0
HLA-DRB5,1,1,1,0,0,0,0,0,0,1,1,1,0,0,1,1,0,1,0
PPIAP15,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
SLC25A31,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [16]:
# (genes, disease classes) in the binary matrix.
binary_matrix.shape

(14120, 19)

# Save Binary Matrix

In [17]:
# Tag the output with the current year and month, e.g. 2017_08.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GAD/Output/gad_high_level_binary_matrix_%s.tsv.zip' % date_tag
# Write a real zip archive so the file content matches its .zip extension.
# The previous 'gzip' setting produced gzip data under a .zip name, which
# breaks extension-based decompression when the file is read back.
binary_matrix.to_csv(filename, sep='\t', compression='zip')

# Create Gene Set Library

In [18]:
# Output directory for the gene set library. Resolved from the current
# user's home directory instead of a hard-coded /Users/<name> prefix, in
# line with the '~'-based paths used by the save cells.
path = os.path.expanduser('~/Documents/Harmonizome/GAD/Output/')

In [19]:
# Name handed to the gene set library helper together with `path`.
# NOTE(review): presumably used as the output file's base name — confirm in uf.
name = 'gad_high_level_gene_set'

In [20]:
# Create the gene set library from the binary matrix.
# NOTE(review): nothing is returned to the notebook; the helper presumably
# writes its result under `path` using `name` — confirm in uf.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  19 Out of 19   

# Create Attribute Library

In [21]:
# Output directory for the attribute library. Resolved from the current
# user's home directory instead of a hard-coded /Users/<name> prefix, in
# line with the '~'-based paths used by the save cells.
path = os.path.expanduser('~/Documents/Harmonizome/GAD/Output/')

In [22]:
# Name handed to the attribute library helper together with `path`.
# NOTE(review): presumably used as the output file's base name — confirm in uf.
name = 'gad_high_level_attribute_set'

In [23]:
# Create the attribute library from the binary matrix.
# NOTE(review): nothing is returned to the notebook; the helper presumably
# writes its result under `path` using `name` — confirm in uf.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  14120 Out of 14120   

# Create Gene Similarity Matrix

In [24]:
# Pairwise similarity between genes (rows of the binary matrix) using the
# 'jaccard' metric; the preview below is gene-by-gene with 1.0 on the diagonal.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [25]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,BLOC1S5,RPS3AP1,HLA-DRB5,PPIAP15,SLC25A31,CNBP,ZMAT4,FTCD,LIN28AP2,LUZP4,...,NTSR2,LY9,ZGPAT,LYPD2,ADGRG1,TECRL,CSF1,HEMGN,C20orf194,MIRLET7B
BLOC1S5,1.0,0.166667,0.111111,0.0,0.0,0.0,0.25,0.0,0.0,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RPS3AP1,0.166667,1.0,0.25,0.125,0.0,0.0,0.428571,0.125,0.333333,0.0,...,0.142857,0.0,0.0,0.166667,0.166667,0.0,0.153846,0.0,0.0,0.0
HLA-DRB5,0.111111,0.25,1.0,0.333333,0.111111,0.0,0.444444,0.090909,0.1,0.111111,...,0.1,0.111111,0.1,0.111111,0.111111,0.090909,0.5,0.2,0.111111,0.111111
PPIAP15,0.0,0.125,0.333333,1.0,0.0,0.0,0.4,0.0,0.25,0.333333,...,0.0,0.333333,0.25,0.333333,0.333333,0.2,0.333333,0.2,0.0,0.0
SLC25A31,0.0,0.0,0.111111,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Save Gene Similarity Matrix

In [26]:
# Tag the output with the current year and month, e.g. 2017_08.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GAD/Output/gad_high_level_gene_similarity_matix_%s.tsv.zip' % date_tag
# Write a real zip archive so the file content matches its .zip extension.
# The previous 'gzip' setting produced gzip data under a .zip name, which
# breaks extension-based decompression when the file is read back.
gene_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Attribute Similarity Matrix

In [27]:
# Same similarity computation on the transposed matrix, so the disease
# classes (attributes) are compared with each other instead of the genes.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [28]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,METABOLIC,CARDIOVASCULAR,NEUROLOGICAL,AGING,VISION,UNKNOWN,RENAL,MITOCHONDRIAL,DEVELOPMENTAL,INFECTION,OTHER,REPRODUCTION,HEMATOLOGICAL,PHARMACOGENOMIC,PSYCH,CANCER,NORMALVARIATION,IMMUNE,CHEMDEPENDENCY
METABOLIC,1.0,0.359425,0.236038,0.094996,0.06652,0.152298,0.136891,0.000494,0.143238,0.119843,0.158212,0.100171,0.171606,0.193904,0.188712,0.210567,0.059855,0.220424,0.240031
CARDIOVASCULAR,0.359425,1.0,0.237573,0.101557,0.070139,0.158739,0.151608,0.000407,0.137739,0.123572,0.16183,0.114599,0.185641,0.20577,0.17316,0.201881,0.066732,0.221887,0.233084
NEUROLOGICAL,0.236038,0.237573,1.0,0.129453,0.089696,0.187752,0.147508,0.001284,0.125563,0.134162,0.179328,0.126888,0.140387,0.197488,0.181164,0.195994,0.085577,0.19286,0.18863
AGING,0.094996,0.101557,0.129453,1.0,0.079462,0.109244,0.1166,0.000957,0.079839,0.088235,0.116143,0.104461,0.090702,0.109069,0.106013,0.108372,0.085136,0.096691,0.087188
VISION,0.06652,0.070139,0.089696,0.079462,1.0,0.131323,0.110268,0.001368,0.08341,0.078723,0.133752,0.141717,0.068447,0.092068,0.071749,0.086909,0.141674,0.090203,0.058384


# Save Attribute Similarity Matrix

In [29]:
# Tag the output with the current year and month, e.g. 2017_08.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GAD/Output/gad_high_level_attribute_similarity_matix_%s.tsv.zip' % date_tag
# Write a real zip archive so the file content matches its .zip extension.
# The previous 'gzip' setting produced gzip data under a .zip name, which
# breaks extension-based decompression when the file is read back.
attribute_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Gene List

In [30]:
# Build the gene table for the genes in the binary matrix; the preview
# below shows one row per gene with GeneSym and GeneID columns.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  14120 Out of 14120   

In [31]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,BLOC1S5,63915.0
1,RPS3AP1,101242000.0
2,HLA-DRB5,3127.0
3,PPIAP15,5487.0
4,SLC25A31,83447.0


In [32]:
# (rows, columns) of the gene list.
gene_list.shape

(14120, 2)

# Save Gene List

In [33]:
# Tag the output with the current year and month, e.g. 2017_08.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GAD/Output/gad_high_level_gene_list_%s.tsv.zip' % date_tag
# Write a real zip archive so the file content matches its .zip extension.
# The previous 'gzip' setting produced gzip data under a .zip name, which
# breaks extension-based decompression when the file is read back.
# The row index is omitted from the file.
gene_list.to_csv(filename, sep='\t', index=False, compression='zip')

# Create Attribute List

In [34]:
# Build the attribute table from the binary matrix; the preview below shows
# a single Attributes column holding the disease classes.
attribute_list = uf.createAttributeList(binary_matrix)

In [35]:
# Preview the first rows of the attribute list.
attribute_list.head()

Unnamed: 0,Attributes
0,METABOLIC
1,CARDIOVASCULAR
2,NEUROLOGICAL
3,AGING
4,VISION


In [36]:
# (rows, columns) of the attribute list.
attribute_list.shape

(19, 1)

# Save Attribute List

In [37]:
# Tag the output with the current year and month, e.g. 2017_08.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GAD/Output/gad_high_level_attribute_list_%s.tsv.zip' % date_tag
# Write a real zip archive so the file content matches its .zip extension.
# The previous 'gzip' setting produced gzip data under a .zip name, which
# breaks extension-based decompression when the file is read back.
# The row index is omitted from the file.
attribute_list.to_csv(filename, sep='\t', index=False, compression='zip')

# Create Gene-Attribute Edge List

In [38]:
# Output directory for the gene-attribute edge list. Resolved from the
# current user's home directory instead of a hard-coded /Users/<name>
# prefix, in line with the '~'-based paths used by the save cells.
path = os.path.expanduser('~/Documents/Harmonizome/GAD/Output/')

In [39]:
# Name handed to the edge list helper together with `path`.
# NOTE(review): presumably used as the output file's base name — confirm in uf.
name = 'gad_high_level_gene_attribute_edge_list'

In [40]:
# Create the gene-attribute edge list from the binary matrix and the gene
# list; the helper prints the number of associations it found.
# NOTE(review): nothing is returned to the notebook; the helper presumably
# writes its result under `path` using `name` — confirm in uf.
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  19 Out of 19   

 The number of statisticaly relevent gene-attribute associations is: 42691
