 # Human Phenotype Ontology

 Author: Moshe Silverstein <br/>
 Date: 11-17 <br/>
 Data Source: http://www.human-phenotype-ontology.org/

 Reviewer: Charles Dai <br>
 Updated: 6-20

In [1]:
# appyter init
# magic.init receives a zero-argument callable; calling it invokes the
# builtin `globals` from code defined in this notebook, so it returns the
# notebook's own module namespace (presumably what the `%%appyter` cell
# magics used below need — confirm against the appyter docs).
from appyter import magic
magic.init(lambda _=globals: _())

In [2]:
import sys
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [3]:
# from clustergrammer_widget import *
# net = Network(clustergrammer_widget)

In [4]:
# Mode 2 of autoreload re-imports modules before each cell executes, so live
# edits to imported helper code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

 ### Python Version

In [5]:
sys.version

'3.8.0 (default, Oct 28 2019, 16:14:01) \n[GCC 8.3.0]'

 # Initialization

 ### Load Mapping Dictionaries

In [6]:
# Build the two lookup tables consumed later in this notebook:
# `symbol_lookup` is passed to uf.mapgenesymbols and `geneid_lookup` to
# uf.createGeneList.
symbol_lookup, geneid_lookup = lookup.get_lookups()

Gathering sources: 100%|██████████| 3/3 [00:09<00:00,  3.13s/it]


 ### Output Path

In [7]:
# Prefix shared by every output file name built in this notebook.
output_name = 'hpo'

# Directory that receives all generated files.
path = 'Output/HPO'
# exist_ok=True makes this a no-op when the directory is already present and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)

In [8]:
%%appyter hide_code
{% do SectionField(
    name='data',
    title='Load Data',
    subtitle='Upload Files from the Human Phenotype Ontology Data Set',
) %}

 # Load Data

In [9]:
%%appyter code_exec

# Load the tab-separated phenotype-to-gene table from the chosen file.
# skiprows=1 with header=None drops the file's first line and leaves the
# columns labelled by integer position, which is how later cells select
# them (e.g. df[[1, 3]]).
df = pd.read_csv({{FileField(
    constraint='.*\.txt$',
    name='phenotype_gene_list', 
    label='Phenotypes to Genes', 
    default='Input/HPO/phenotype_to_genes.txt',
    section='data')
}}, skiprows=1, header=None, sep='\t')

```python

df = pd.read_csv('Input/HPO/phenotype_to_genes.txt', skiprows=1, header=None, sep='\t')
```

In [10]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,HP:0000002,Abnormality of body height,26229,B3GAT3,-,mim2gene,OMIM:245600
1,HP:0000002,Abnormality of body height,2892,GRIA3,,orphadata,ORPHA:364028
2,HP:0000002,Abnormality of body height,64374,SIL1,,orphadata,ORPHA:559
3,HP:0000002,Abnormality of body height,10588,MTHFS,-,mim2gene,OMIM:618367
4,HP:0000002,Abnormality of body height,51633,OTUD6B,-,mim2gene,OMIM:617452


In [11]:
df.shape

(841750, 7)

 # Pre-process Data

 ## Get Relevant Data

In [12]:
# Keep the phenotype name (column 1) and the gene symbol (column 3), then
# index the rows by gene symbol.
df = df[[1, 3]].set_index(3)

In [13]:
df.head()

Unnamed: 0_level_0,1
3,Unnamed: 1_level_1
B3GAT3,Abnormality of body height
GRIA3,Abnormality of body height
SIL1,Abnormality of body height
MTHFS,Abnormality of body height
OTUD6B,Abnormality of body height


 # Filter Data

 ## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [14]:
# Map the gene-symbol index through symbol_lookup. The row count shown
# after this step is lower than before it, so some rows appear to be
# dropped — presumably symbols with no mapping; confirm in uf.mapgenesymbols.
df = uf.mapgenesymbols(df, symbol_lookup)
df.shape

100%|██████████| 841750/841750 [00:01<00:00, 811950.44it/s]


(839426, 1)

 # Analyze Data

 ## Create Binary Matrix

In [15]:
# Turn the gene/phenotype pairs into a 0/1 matrix; in the preview the rows
# are gene symbols and the columns are phenotype names.
binary_matrix = uf.createBinaryMatrix(df)
binary_matrix.head()

Unnamed: 0,1-3 toe syndactyly,1-5 finger complete cutaneous syndactyly,1-5 finger syndactyly,1-5 toe syndactyly,1-minute APGAR score of 0,1-minute APGAR score of 1,11 pairs of ribs,2-3 finger syndactyly,2-3 toe cutaneous syndactyly,2-3 toe syndactyly,...,Yellow nails,Yellow papule,"Yellow subcutaneous tissue covered by thin, scaly skin",Yellow-brown discoloration of the teeth,Yellow/white lesions of the macula,Yellow/white lesions of the retina,Young adult onset,Z-band streaming,Zollinger-Ellison syndrome,Zonular cataract
A2M,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2ML1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A4GALT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAAS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAGAB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
binary_matrix.shape

(4251, 9059)

In [19]:
# Write the binary matrix into the output directory, requesting npz
# compression and a uint8 dtype.
uf.saveData(
    binary_matrix,
    path,
    f'{output_name}_binary_matrix',
    compression='npz',
    dtype=np.uint8,
)

 ## Create Gene List

In [20]:
# Build the gene table from the binary matrix and geneid_lookup; the
# preview shows one row per gene with GeneSym and GeneID columns.
gene_list = uf.createGeneList(binary_matrix, geneid_lookup)
gene_list.head()

100%|██████████| 4251/4251 [00:00<00:00, 295992.34it/s]


Unnamed: 0,GeneSym,GeneID
0,A2M,2
1,A2ML1,144568
2,A4GALT,53947
3,AAAS,8086
4,AAGAB,79719


In [21]:
gene_list.shape

(4251, 2)

In [22]:
# Write the gene list into the output directory as a gzip-compressed TSV,
# passing index=False through to the writer.
uf.saveData(
    gene_list,
    path,
    f'{output_name}_gene_list',
    ext='tsv',
    compression='gzip',
    index=False,
)

 ## Create Attribute List

In [23]:
# Build the attribute (phenotype) table from the binary matrix.
# The argument is `binary_matrix`, the matrix created earlier in this
# notebook; no variable named `matrix` is defined on a fresh kernel, so
# referring to it raises NameError under Restart & Run All.
attribute_list = uf.createAttributeList(binary_matrix)
attribute_list.head()

1-3 toe syndactyly
1-5 finger complete cutaneous syndactyly
1-5 finger syndactyly
1-5 toe syndactyly
1-minute APGAR score of 0


In [24]:
attribute_list.shape

(9059, 0)

In [25]:
# Write the attribute list into the output directory as a gzip-compressed
# TSV.
uf.saveData(
    attribute_list,
    path,
    f'{output_name}_attribute_list',
    ext='tsv',
    compression='gzip',
)

 ## Create Gene and Attribute Set Libraries

In [26]:
# Generate the gene set library from the binary matrix; `path` and the
# 'hpo_gene_up_set' name are passed along — presumably the output location
# and file stem; confirm in uf.createUpGeneSetLib.
uf.createUpGeneSetLib(binary_matrix, path, output_name + '_gene_up_set')

100%|██████████| 4911/4911 [00:00<00:00, 18073.46it/s]


In [27]:
# Generate the attribute set library from the binary matrix, handing the
# helper the output directory and the name for this library.
uf.createUpAttributeSetLib(
    binary_matrix,
    path,
    f'{output_name}_attribute_up_set',
)

100%|██████████| 4251/4251 [00:00<00:00, 12118.64it/s]


 ## Create Attribute Similarity Matrix

In [28]:
# Jaccard similarity between attributes: the matrix is transposed first so
# that phenotypes are the rows being compared (the preview is
# phenotype-by-phenotype).
attribute_similarity_matrix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

Unnamed: 0,1-3 toe syndactyly,1-5 finger complete cutaneous syndactyly,1-5 finger syndactyly,1-5 toe syndactyly,1-minute APGAR score of 0,1-minute APGAR score of 1,11 pairs of ribs,2-3 finger syndactyly,2-3 toe cutaneous syndactyly,2-3 toe syndactyly,...,Yellow nails,Yellow papule,"Yellow subcutaneous tissue covered by thin, scaly skin",Yellow-brown discoloration of the teeth,Yellow/white lesions of the macula,Yellow/white lesions of the retina,Young adult onset,Z-band streaming,Zollinger-Ellison syndrome,Zonular cataract
1-3 toe syndactyly,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-5 finger complete cutaneous syndactyly,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.012346,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-5 finger syndactyly,0.0,0.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012195,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-5 toe syndactyly,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-minute APGAR score of 0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Write the attribute similarity matrix into the output directory,
# requesting npz compression, float32 values and symmetric=True.
uf.saveData(
    attribute_similarity_matrix,
    path,
    f'{output_name}_attribute_similarity_matrix',
    compression='npz',
    symmetric=True,
    dtype=np.float32,
)

In [30]:
# net.load_df(attribute_similarity_matrix.iloc[:,:].copy())
# net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

 ## Create Gene Similarity Matrix

In [31]:
# Jaccard similarity between genes: the binary matrix is used untransposed,
# so gene rows are compared (the preview is gene-by-gene).
gene_similarity_matrix = uf.createSimilarityMatrix(binary_matrix, 'jaccard', sparse=True)
gene_similarity_matrix.head()

Unnamed: 0,A2M,A2ML1,A4GALT,AAAS,AAGAB,AARS1,AARS2,AASS,ABAT,ABCA1,...,ZNF644,ZNF687,ZNF711,ZNF750,ZNF81,ZNHIT3,ZP2,ZP3,ZP4,ZSWIM6
A2M,1.0,0.020161,0.034483,0.052023,0.022222,0.055556,0.13,0.140351,0.087912,0.056604,...,0.032258,0.03125,0.028037,0.025,0.030612,0.042056,0.0,0.032258,0.0,0.038462
A2ML1,0.020161,1.0,0.008734,0.136499,0.053957,0.159459,0.090278,0.047244,0.094891,0.135385,...,0.013043,0.047059,0.166205,0.034188,0.151429,0.177778,0.012658,0.017467,0.013043,0.155452
A4GALT,0.034483,0.008734,1.0,0.0,0.029412,0.004878,0.0,0.04878,0.0,0.020979,...,0.111111,0.023256,0.0,0.055556,0.0,0.0,0.0,0.111111,0.0,0.00365
AAAS,0.052023,0.136499,0.0,1.0,0.115578,0.260563,0.130233,0.107955,0.156566,0.120301,...,0.0125,0.026178,0.155116,0.036364,0.160839,0.184564,0.005952,0.0,0.006211,0.154054
AAGAB,0.022222,0.053957,0.029412,0.115578,1.0,0.03876,0.013245,0.039604,0.007246,0.045226,...,0.014085,0.081633,0.056911,0.094595,0.052174,0.01938,0.012821,0.028571,0.014085,0.056426


In [32]:
# Write the gene similarity matrix into the output directory, requesting
# npz compression, float32 values and symmetric=True.
uf.saveData(
    gene_similarity_matrix,
    path,
    f'{output_name}_gene_similarity_matrix',
    compression='npz',
    symmetric=True,
    dtype=np.float32,
)

 ## Create Gene-Attribute Edge List

In [35]:
# Produce the gene-attribute edge list from the binary matrix together with
# the attribute and gene tables built earlier in this notebook.
uf.createGeneAttributeEdgeList(
    binary_matrix,
    attribute_list,
    gene_list,
    path,
    f'{output_name}_gene_attribute_edge_list',
)

The number of statisticaly relevent gene-attribute associations is: 577116


 # Create Downloadable Save File

In [34]:
# Bundle the output directory for download — presumably this produces the
# archive that the download link below points to; confirm in
# uf.createArchive.
uf.createArchive(path)

 ### Link to download output files: [click here](./output_archive.zip)