# Public Health Genomics Knowledge Base (PHGKB)

Author: Moshe Silverstein <br/>
Date: 11-17 <br/>
Data Source: https://phgkb.cdc.gov/PHGKB/downloadCenter.action

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
# Re-import the local helper module so edits made to untility_functions.py
# are picked up without restarting the kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/Public Health Genomics Knowledge Base (PHGKB)/untility_functions.py'>

# Load Data

In [11]:
# Read the raw download one line per row. The records themselves are
# tab-delimited with a variable number of fields, so '~' is used as the
# separator to keep every line whole in a single column (label 0).
# NOTE(review): this relies on '~' never occurring in the file — confirm.
df = pd.read_csv(
    'Input/Disease-GeneID.txt',
    sep='~',
    header=None,
    skiprows=4,
)

In [12]:
df.head()

Unnamed: 0,0
0,22q11 Deletion Syndrome(C2936346)\tCTD(1283)\t...
1,"46, XX Disorders of Sex Development(C2936403)\..."
2,"46, XY Disorders of Sex Development(C2751824)\..."
3,"Abdomen, Acute(C0000727)\tMEFV(4210)"
4,Abdominal Neoplasms(C0000735)\tCHEK1(1111)\tPS...


In [15]:
# Expand every raw line "<phenotype>\t<gene>\t<gene>..." into one
# (Phenotype, Gene) row per gene.
#
# The pairs are accumulated in a plain list and turned into a DataFrame once
# at the end: growing a DataFrame with pd.concat inside the loop re-copies all
# previous rows on every iteration. Column 0 is read directly instead of via
# DataFrame.ix, which no longer exists in pandas >= 1.0.
total = len(df.index)
pairs = []

for i, line in enumerate(df[0]):

    progressPercent = ((i+1)/total)*100

    sys.stdout.write("Progeres: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), total))
    sys.stdout.flush()

    fields = line.split('\t')
    # First field is the phenotype; anything after an underscore is dropped.
    phenotype = fields[0].split('_')[0]
    # Remaining fields are the genes; a line without genes contributes no rows.
    pairs.extend((phenotype, gene) for gene in fields[1:])

df_interactions = pd.DataFrame(pairs, columns=['Phenotype', 'Gene'])

Progeres: 100%  3032 Out of 3032   

In [24]:
# Renumber the rows 0..n-1 in place. The previous row labels are kept as a
# new 'index' column, which the next cell removes.
df_interactions.reset_index(inplace=True)

In [26]:
# Discard the 'index' column left behind by reset_index(); only the
# Phenotype and Gene columns are kept.
df_interactions.drop('index', axis=1, inplace=True)

In [27]:
df_interactions.head()

Unnamed: 0,Phenotype,Gene
0,22q11 Deletion Syndrome(C2936346),CTD(1283)
1,22q11 Deletion Syndrome(C2936346),COMT(1312)
2,22q11 Deletion Syndrome(C2936346),JMJD1C(221037)
3,22q11 Deletion Syndrome(C2936346),FGF1(2246)
4,22q11 Deletion Syndrome(C2936346),FGF10(2255)


In [28]:
# Strip the "(ID)" suffix from both columns, e.g.
# "22q11 Deletion Syndrome(C2936346)" -> "22q11 Deletion Syndrome" and
# "CTD(1283)" -> "CTD".
#
# Done with vectorized string methods instead of a per-row loop over
# DataFrame.ix: .ix no longer exists in pandas >= 1.0, and a Python-level loop
# over every row is needlessly slow for a frame of this size.
#
# NOTE(review): as before, the text is cut at the FIRST '(' — a name that
# itself contains parentheses would be truncated. Confirm none exist.
df_interactions['Phenotype'] = df_interactions['Phenotype'].str.split('(').str[0]
df_interactions['Gene'] = df_interactions['Gene'].str.split('(').str[0]

Progeres: 99%  572067 Out of 572161   

In [29]:
df_interactions.head()

Unnamed: 0,Phenotype,Gene
0,22q11 Deletion Syndrome,CTD
1,22q11 Deletion Syndrome,COMT
2,22q11 Deletion Syndrome,JMJD1C
3,22q11 Deletion Syndrome,FGF1
4,22q11 Deletion Syndrome,FGF10


In [30]:
df_interactions.shape

(572161, 2)

In [31]:
# Remove repeated (Phenotype, Gene) rows in place.
df_interactions.drop_duplicates(inplace=True)

In [32]:
df_interactions.shape

(548395, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [34]:
# Make the gene symbol the row index (in place).
# NOTE(review): presumably uf.mapgenesymbols expects the symbols in the
# index — confirm in untility_functions.py.
df_interactions.set_index('Gene', inplace=True)

In [35]:
# Map gene symbols to up-to-date approved symbols.
# The return value is not captured, so the helper presumably updates
# df_interactions in place — TODO confirm in untility_functions.py.
uf.mapgenesymbols(df_interactions)

Progeres: 100%  548395 Out of 548395   

# Drop Duplicates

In [36]:
# Move 'Gene' from the index back into a regular column (in place) so that
# the duplicate check in the next cell compares gene and phenotype together.
df_interactions.reset_index(inplace=True)

In [37]:
# Remove duplicate rows again, in place.
# NOTE(review): new duplicates presumably arise when several old symbols map
# to the same approved symbol — confirm.
df_interactions.drop_duplicates(inplace=True)

In [38]:
df_interactions.shape

(531075, 2)

# Create Binary Matrix

In [39]:
# Build the association matrix from the (Gene, Phenotype) pairs.
# Per the preview below, rows are genes, columns are phenotypes and the
# cells hold 0/1.
binary_matrix = uf.createBinaryMatix(df_interactions)

Progeres: 100%  13221 Out of 13221   

In [40]:
binary_matrix.head()

Unnamed: 0,Forensic Psychiatry,Vomiting,Vaginal Neoplasms,Bone Marrow Neoplasms,"Gonadal Dysgenesis, 46,XY",Glaucoma,Respiratory Insufficiency,Tooth Demineralization,22q11 Deletion Syndrome,Prader-Willi Syndrome,...,"Amnesia, Transient Global",Accelerated Idioventricular Rhythm,Blepharoptosis,"Keratoderma, Palmoplantar","Dementia, Vascular","Lupus Erythematosus, Cutaneous","Pneumonia, Pneumocystis",Supratentorial Neoplasms,Genetic Determinism,Vitamin B 12 Deficiency
SOX14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MCM4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ACTA2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CYTL1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
COL23A1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
binary_matrix.shape

(13221, 2911)

# Save Binary Matrix

In [42]:
# Save the gene x phenotype binary matrix as a gzip-compressed TSV.
# The suffix is ".gz" so the name matches the gzip stream that is actually
# written; with the former ".zip" suffix, tools that choose the decompressor
# from the extension (including pandas' compression='infer') reject the file.
# The file name is stamped with the current year and month (YYYY_MM).
filename = '~/./Documents/Harmonizome/Public Health Genomics Knowledge Base (PHGKB)/Output/phgkb_binary_matrix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [43]:
# Output directory for the gene set library. Resolved from the current
# user's home directory rather than a hard-coded /Users/<name> prefix, which
# matches the '~'-relative paths used by the save cells and keeps the
# notebook runnable under other accounts.
path = os.path.expanduser('~/Documents/Harmonizome/Public Health Genomics Knowledge Base (PHGKB)/Output/')

In [44]:
# Base file name handed to uf.createUpGeneSetLib in the next cell.
name = 'phgkb_gene_set'

In [45]:
# Build the gene set library from the binary matrix.
# NOTE(review): presumably writes its output under `path` using `name`; the
# exact file name and format are decided inside the helper — confirm there.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  2911 Out of 2911   

# Create Attribute Library

In [46]:
# Output directory for the attribute library. Resolved from the current
# user's home directory rather than a hard-coded /Users/<name> prefix, which
# matches the '~'-relative paths used by the save cells and keeps the
# notebook runnable under other accounts.
path = os.path.expanduser('~/Documents/Harmonizome/Public Health Genomics Knowledge Base (PHGKB)/Output/')

In [47]:
# Base file name handed to uf.createUpAttributeSetLib in the next cell.
name = 'phgkb_attribute_set'

In [48]:
# Build the attribute library from the binary matrix.
# NOTE(review): presumably writes its output under `path` using `name`; the
# exact file name and format are decided inside the helper — confirm there.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  13221 Out of 13221   

# Create Gene Similarity Matrix

In [49]:
# Gene-by-gene similarity computed from the rows of the binary matrix, with
# 'jaccard' passed as the metric. The preview below is symmetric with 1.0 on
# the diagonal.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [50]:
gene_similarity_matix.head()

Unnamed: 0,SOX14,MCM4,ACTA2,CYTL1,COL23A1,ORAI1,CHAC1,NKPD1,ADA,KCTD13,...,CDKL1,CYP19A1,CYP8B1,SHE,STON2,COMP,MAP1LC3A,DGKB,HTRA2,EQTN
SOX14,1.0,0.040816,0.025641,0.25,0.032258,0.041667,0.222222,0.0,0.022222,0.0,...,0.0,0.012422,0.065217,0.0,0.0,0.034483,0.027778,0.053571,0.03125,0.0
MCM4,0.040816,1.0,0.160377,0.0,0.056338,0.081395,0.048387,0.0,0.135,0.0,...,0.02,0.111446,0.15,0.037736,0.046154,0.072917,0.333333,0.108696,0.055556,0.02
ACTA2,0.025641,0.160377,1.0,0.013158,0.142857,0.12963,0.093023,0.0,0.174312,0.012821,...,0.025641,0.156977,0.070796,0.076923,0.065934,0.32,0.068627,0.190909,0.129032,0.025641
CYTL1,0.25,0.0,0.013158,1.0,0.035714,0.021739,0.055556,0.0,0.005556,0.0,...,0.0,0.003106,0.022222,0.0,0.0,0.017857,0.0,0.018182,0.034483,0.0
COL23A1,0.032258,0.056338,0.142857,0.035714,1.0,0.138462,0.069767,0.0,0.072165,0.033333,...,0.142857,0.067073,0.140625,0.0,0.166667,0.150685,0.033898,0.202899,0.14,0.142857


# Save Gene Similarity Matrix

In [51]:
# Save the gene-by-gene similarity matrix as a gzip-compressed TSV.
# The suffix is ".gz" so the name matches the gzip stream that is actually
# written; with the former ".zip" suffix, tools that choose the decompressor
# from the extension (including pandas' compression='infer') reject the file.
# The file name is stamped with the current year and month (YYYY_MM).
filename = '~/./Documents/Harmonizome/Public Health Genomics Knowledge Base (PHGKB)/Output/phgkb_gene_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [52]:
# Same helper applied to the transposed matrix, so phenotypes (attributes)
# are compared with each other instead of genes; 'jaccard' is again passed
# as the metric.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [53]:
attribute_similarity_matix.head()

Unnamed: 0,Forensic Psychiatry,Vomiting,Vaginal Neoplasms,Bone Marrow Neoplasms,"Gonadal Dysgenesis, 46,XY",Glaucoma,Respiratory Insufficiency,Tooth Demineralization,22q11 Deletion Syndrome,Prader-Willi Syndrome,...,"Amnesia, Transient Global",Accelerated Idioventricular Rhythm,Blepharoptosis,"Keratoderma, Palmoplantar","Dementia, Vascular","Lupus Erythematosus, Cutaneous","Pneumonia, Pneumocystis",Supratentorial Neoplasms,Genetic Determinism,Vitamin B 12 Deficiency
Forensic Psychiatry,1.0,0.015873,0.0,0.0,0.071429,0.0,0.010582,0.0,0.030303,0.047619,...,0.0,0.0,0.0,0.0,0.015385,0.0,0.0,0.021277,0.0,0.029412
Vomiting,0.015873,1.0,0.0,0.007092,0.0,0.036232,0.033898,0.009852,0.013699,0.007407,...,0.008264,0.0,0.0,0.007874,0.034483,0.019868,0.015038,0.065789,0.014493,0.041958
Vaginal Neoplasms,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bone Marrow Neoplasms,0.0,0.007092,0.0,1.0,0.0,0.009202,0.009852,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0
"Gonadal Dysgenesis, 46,XY",0.071429,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.045455,...,0.0,0.0,0.0,0.0,0.015152,0.0,0.0,0.0,0.0,0.0


# Save Attribute Similarity Matrix

In [54]:
# Save the attribute-by-attribute similarity matrix as a gzip-compressed TSV.
# The suffix is ".gz" so the name matches the gzip stream that is actually
# written; with the former ".zip" suffix, tools that choose the decompressor
# from the extension (including pandas' compression='infer') reject the file.
# The file name is stamped with the current year and month (YYYY_MM).
filename = '~/./Documents/Harmonizome/Public Health Genomics Knowledge Base (PHGKB)/Output/phgkb_attribute_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [55]:
# Build the gene table for the genes in the binary matrix. Per the preview
# below it has one row per gene with columns GeneSym and GeneID.
# NOTE(review): where the IDs are looked up is decided inside the helper.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  13221 Out of 13221   

In [56]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,SOX14,8403
1,MCM4,4173
2,ACTA2,59
3,CYTL1,54360
4,COL23A1,91522


In [57]:
gene_list.shape

(13221, 2)

# Save Gene List

In [58]:
# Save the gene list (GeneSym, GeneID) as a gzip-compressed TSV without the
# row index.
# The suffix is ".gz" so the name matches the gzip stream that is actually
# written; with the former ".zip" suffix, tools that choose the decompressor
# from the extension (including pandas' compression='infer') reject the file.
# The file name is stamped with the current year and month (YYYY_MM).
filename = '~/./Documents/Harmonizome/Public Health Genomics Knowledge Base (PHGKB)/Output/phgkb_gene_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [59]:
# Build the attribute table from the binary matrix. Per the preview below it
# is a single 'Attributes' column holding the phenotype names.
attribute_list = uf.createAttributeList(binary_matrix)

In [60]:
attribute_list.head()

Unnamed: 0,Attributes
0,Forensic Psychiatry
1,Vomiting
2,Vaginal Neoplasms
3,Bone Marrow Neoplasms
4,"Gonadal Dysgenesis, 46,XY"


In [61]:
attribute_list.shape

(2911, 1)

# Save Attribute List

In [62]:
# Save the attribute list as a gzip-compressed TSV without the row index.
# The suffix is ".gz" so the name matches the gzip stream that is actually
# written; with the former ".zip" suffix, tools that choose the decompressor
# from the extension (including pandas' compression='infer') reject the file.
# The file name is stamped with the current year and month (YYYY_MM).
filename = '~/./Documents/Harmonizome/Public Health Genomics Knowledge Base (PHGKB)/Output/phgkb_attribute_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [63]:
# Output directory for the gene-attribute edge list. Resolved from the
# current user's home directory rather than a hard-coded /Users/<name>
# prefix, which matches the '~'-relative paths used by the save cells and
# keeps the notebook runnable under other accounts.
path = os.path.expanduser('~/Documents/Harmonizome/Public Health Genomics Knowledge Base (PHGKB)/Output/')

In [64]:
# Base file name handed to uf.createGeneAttributeEdgeList in the next cell.
name = 'phgkb_gene_attribute_edge_list'

In [65]:
# Build the gene-attribute edge list from the binary matrix and the gene
# table. The helper prints the number of associations when it finishes.
# NOTE(review): presumably writes its output under `path` using `name`; the
# exact file name and format are decided inside the helper — confirm there.
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  2911 Out of 2911   

 The number of statisticaly relevent gene-attribute associations is: 531075
