# Human Phenotype Ontology

Author: Moshe Silverstein <br/>
Date: 11-17 <br/>
Data Source: http://www.human-phenotype-ontology.org/

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
# Re-import the local helper module so that edits made to
# untility_functions.py while this kernel is running are picked up.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/Human Phenotype Ontology/untility_functions.py'>

# Load Data

In [3]:
# Read the tab-separated HPO phenotype-to-genes export. The first line of the
# file is skipped and no header row is parsed, so the columns receive integer
# labels, which the following cells index by.
phenotype_to_genes_file = 'Input/ALL_SOURCES_ALL_FREQUENCIES_phenotype_to_genes.txt'
df = pd.read_csv(
    phenotype_to_genes_file,
    sep='\t',
    skiprows=1,
    header=None,
)

In [4]:
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,0,1,2,3
0,HP:0001459,1-3 toe syndactyly,2737,GLI3
1,HP:0006088,1-5 finger complete cutaneous syndactyly,64327,LMBR1
2,HP:0010708,1-5 finger syndactyly,6469,SHH
3,HP:0010708,1-5 finger syndactyly,64327,LMBR1
4,HP:0010713,1-5 toe syndactyly,2737,GLI3


In [5]:
# (rows, columns) of the raw table.
df.shape

(419597, 4)

# Get Relevant Data

In [6]:
# Keep only column 1 (phenotype name) and column 3 (gene symbol);
# columns 0 and 2 are dropped.
columns_to_keep = [1, 3]
df = df[columns_to_keep]

In [7]:
# Preview the two retained columns.
df.head()

Unnamed: 0,1,3
0,1-3 toe syndactyly,GLI3
1,1-5 finger complete cutaneous syndactyly,LMBR1
2,1-5 finger syndactyly,SHH
3,1-5 finger syndactyly,LMBR1
4,1-5 toe syndactyly,GLI3


# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [8]:
# Index the table by column 3 (the gene-symbol column) before handing it to
# uf.mapgenesymbols. The result is rebound to df rather than mutating the
# frame with inplace=True; the value of df afterwards is the same.
df = df.set_index(3)

In [9]:
# Project helper from untility_functions; per the section heading it maps the
# gene symbols (the index of df) to current approved symbols.
# No return value is captured, so it presumably modifies df in place —
# TODO confirm against the helper's source.
uf.mapgenesymbols(df)

Progeres: 100%  419597 Out of 419597   

# Drop Duplicates

In [10]:
# Move the gene-symbol index back into a regular column so that duplicate
# detection in the next cell compares both the gene symbol and the phenotype.
# The result is rebound to df instead of mutating the frame with inplace=True.
df = df.reset_index()

In [11]:
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence. The result is rebound to df instead of mutating the frame with
# inplace=True; the surviving rows and their index labels are unchanged.
df = df.drop_duplicates()

In [12]:
# (rows, columns) after dropping duplicate rows.
df.shape

(415410, 2)

# Create Binary Matrix

In [13]:
# Project helper from untility_functions. Judging by the preview in the next
# cell, the result has gene symbols as rows and phenotype names as columns,
# with 0/1 entries — TODO confirm the exact semantics against the helper.
binary_matrix = uf.createBinaryMatix(df)

Progeres: 100%  3644 Out of 3644   

In [14]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,Astrocytosis,Alopecia universalis,Aplasia/Hypoplasia of the pubic bone,Thromboembolic stroke,Respiratory arrest,Lipomatous tumor,Enlarged cisterna magna,Spontaneous esophageal perforation,Cerebellar Purkinje layer atrophy,Post-angioplasty coronary artery restenosis,...,Craniofacial dysostosis,Prominent U wave,Abnormal hypothalamus morphology,Contractures of the interphalangeal joint of the thumb,Hepatocellular necrosis,Oligodactyly,Vertebral clefting,Anterior beaking of lumbar vertebrae,Glioblastoma,Abnormality of the musculature of the upper arm
SNRPB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OTULIN,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MAPK1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCF4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SHPK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# (rows, columns) of the binary matrix.
binary_matrix.shape

(3644, 7841)

# Save Binary Matrix

In [16]:
# Write the binary matrix as a gzip-compressed TSV whose name carries the
# current year and month (YYYY_MM).
# The extension is '.tsv.gz' so that it matches the gzip compression actually
# used; the earlier '.tsv.zip' name labelled a gzip stream as a zip archive,
# which breaks extension-based compression inference when the file is read.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/Human Phenotype Ontology/Output/hpo_binary_matrix_%s.tsv.gz' % date_stamp
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [17]:
# Output directory passed to uf.createUpGeneSetLib.
# Built from the current user's home directory instead of a hard-coded
# '/Users/<name>/...' prefix so the notebook is not tied to one machine; it
# resolves to the same '~/Documents/...' location the to_csv cells write to.
path = os.path.expanduser('~/Documents/Harmonizome/Human Phenotype Ontology/Output/')

In [18]:
# Name passed to uf.createUpGeneSetLib together with path.
name = 'hpo_gene_set'

In [19]:
# Project helper from untility_functions; no return value is captured, so it
# presumably writes the gene set library under path using name —
# TODO confirm the output file name and format against the helper.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  7841 Out of 7841   

# Create Attribute Library

In [20]:
# Output directory passed to uf.createUpAttributeSetLib.
# Built from the current user's home directory instead of a hard-coded
# '/Users/<name>/...' prefix so the notebook is not tied to one machine; it
# resolves to the same '~/Documents/...' location the to_csv cells write to.
path = os.path.expanduser('~/Documents/Harmonizome/Human Phenotype Ontology/Output/')

In [21]:
# Name passed to uf.createUpAttributeSetLib together with path.
name = 'hpo_attribute_set'

In [22]:
# Project helper from untility_functions; no return value is captured, so it
# presumably writes the attribute set library under path using name —
# TODO confirm the output file name and format against the helper.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  3644 Out of 3644   

# Create Gene Similarity Matrix

In [23]:
# Project helper called with the 'jaccard' metric on the binary matrix as-is
# (genes as rows); the preview in the next cell shows a gene-by-gene matrix
# with 1.0 on the diagonal.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [24]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,SNRPB,OTULIN,MAPK1,TCF4,SHPK,SLC39A8,KHDC3L,LOXHD1,AGTR1,CNTNAP1,...,PRRX1,CEP83,NR0B2,DGKE,GSC,CSF3R,NAGS,CDKN1C,SEC63,CYP19A1
SNRPB,1.0,0.051793,0.293801,0.14902,0.156667,0.107407,0.004505,0.017699,0.125,0.162698,...,0.209059,0.055777,0.012931,0.024194,0.302158,0.035433,0.036885,0.130542,0.11811,0.076433
OTULIN,0.051793,1.0,0.045296,0.062827,0.070513,0.053097,0.02381,0.020408,0.042105,0.066038,...,0.030864,0.024096,0.078431,0.042254,0.052023,0.238806,0.140625,0.02952,0.142857,0.07483
MAPK1,0.293801,0.045296,1.0,0.196154,0.153614,0.131757,0.0,0.023077,0.09375,0.089404,...,0.167683,0.04878,0.003704,0.010453,0.174041,0.038194,0.032143,0.143187,0.066667,0.065527
TCF4,0.14902,0.062827,0.196154,1.0,0.147887,0.130769,0.0,0.002695,0.082262,0.06357,...,0.131944,0.062663,0.010724,0.023256,0.122494,0.071429,0.047745,0.180747,0.112272,0.088435
SHPK,0.156667,0.070513,0.153614,0.147887,1.0,0.181287,0.0,0.0,0.238095,0.132948,...,0.111111,0.142857,0.014706,0.090278,0.122881,0.092105,0.04,0.186885,0.140244,0.071111


# Save Gene Similarity Matrix

In [25]:
# Write the gene similarity matrix as a gzip-compressed TSV whose name carries
# the current year and month (YYYY_MM).
# The extension is '.tsv.gz' so that it matches the gzip compression actually
# used; the earlier '.tsv.zip' name labelled a gzip stream as a zip archive,
# which breaks extension-based compression inference when the file is read.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/Human Phenotype Ontology/Output/hpo_gene_similarity_matix_%s.tsv.gz' % date_stamp
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [26]:
# Same project helper and 'jaccard' metric as for genes, applied to the
# transposed binary matrix so that attributes (phenotypes) are the rows
# being compared.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [27]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,Astrocytosis,Alopecia universalis,Aplasia/Hypoplasia of the pubic bone,Thromboembolic stroke,Respiratory arrest,Lipomatous tumor,Enlarged cisterna magna,Spontaneous esophageal perforation,Cerebellar Purkinje layer atrophy,Post-angioplasty coronary artery restenosis,...,Craniofacial dysostosis,Prominent U wave,Abnormal hypothalamus morphology,Contractures of the interphalangeal joint of the thumb,Hepatocellular necrosis,Oligodactyly,Vertebral clefting,Anterior beaking of lumbar vertebrae,Glioblastoma,Abnormality of the musculature of the upper arm
Astrocytosis,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alopecia universalis,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Aplasia/Hypoplasia of the pubic bone,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.025,0.057143,0.0,0.0,0.0
Thromboembolic stroke,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Respiratory arrest,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Attribute Similarity Matrix

In [28]:
# Write the attribute similarity matrix as a gzip-compressed TSV whose name
# carries the current year and month (YYYY_MM).
# The extension is '.tsv.gz' so that it matches the gzip compression actually
# used; the earlier '.tsv.zip' name labelled a gzip stream as a zip archive,
# which breaks extension-based compression inference when the file is read.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/Human Phenotype Ontology/Output/hpo_attribute_similarity_matix_%s.tsv.gz' % date_stamp
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [29]:
# Project helper from untility_functions; the preview in the next cell shows
# one row per gene with GeneSym and GeneID columns.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  3644 Out of 3644   

In [30]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,SNRPB,6628
1,OTULIN,90268
2,MAPK1,5594
3,TCF4,6925
4,SHPK,23729


In [31]:
# (rows, columns) of the gene list.
gene_list.shape

(3644, 2)

# Save Gene List

In [32]:
# Write the gene list (without the row index) as a gzip-compressed TSV whose
# name carries the current year and month (YYYY_MM).
# The extension is '.tsv.gz' so that it matches the gzip compression actually
# used; the earlier '.tsv.zip' name labelled a gzip stream as a zip archive,
# which breaks extension-based compression inference when the file is read.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/Human Phenotype Ontology/Output/hpo_gene_list_%s.tsv.gz' % date_stamp
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [33]:
# Project helper from untility_functions; the preview in the next cell shows
# a single Attributes column holding the phenotype names.
attribute_list = uf.createAttributeList(binary_matrix)

In [34]:
# Preview the first rows of the attribute list.
attribute_list.head()

Unnamed: 0,Attributes
0,Astrocytosis
1,Alopecia universalis
2,Aplasia/Hypoplasia of the pubic bone
3,Thromboembolic stroke
4,Respiratory arrest


In [35]:
# (rows, columns) of the attribute list.
attribute_list.shape

(7841, 1)

# Save Attribute List

In [36]:
# Write the attribute list (without the row index) as a gzip-compressed TSV
# whose name carries the current year and month (YYYY_MM).
# The extension is '.tsv.gz' so that it matches the gzip compression actually
# used; the earlier '.tsv.zip' name labelled a gzip stream as a zip archive,
# which breaks extension-based compression inference when the file is read.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/Human Phenotype Ontology/Output/hpo_attribute_list_%s.tsv.gz' % date_stamp
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [37]:
# Output directory passed to uf.createGeneAttributeEdgeList.
# Built from the current user's home directory instead of a hard-coded
# '/Users/<name>/...' prefix so the notebook is not tied to one machine; it
# resolves to the same '~/Documents/...' location the to_csv cells write to.
path = os.path.expanduser('~/Documents/Harmonizome/Human Phenotype Ontology/Output/')

In [38]:
# Name passed to uf.createGeneAttributeEdgeList together with path.
name = 'hpo_gene_attribute_edge_list'

In [39]:
# Project helper from untility_functions; takes the binary matrix and the gene
# list and prints the number of gene-attribute associations. No return value
# is captured, so it presumably writes the edge list under path using name —
# TODO confirm against the helper's source.
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  7841 Out of 7841   

 The number of statisticaly relevent gene-attribute associations is: 415410
