# Importing Orphanet data into the knowledge graph  
Orphanet has Creative Commons-licensed data that we can use to identify patients with rare diseases.

HOOM is a module that qualifies the annotation between a clinical entity and phenotypic abnormalities according to a frequency and by integrating the notion of diagnostic criterion. See info on HOOM [here](http://www.orphadata.org/cgi-bin/img/PDF/WhatIsHOOM.pdf).

## References
Main Orphanet website: http://www.orphadata.org/cgi-bin/index.php  
Download data:
- Human Phenotype Ontology (HPO) database: https://hpo.jax.org/app/download/ontology
- Orphanet Rare Disease Ontology (ORDO) database: https://bioportal.bioontology.org/ontologies/ORDO
- HPO-ORDO Ontological Model (HOOM) database: https://bioportal.bioontology.org/ontologies/HOOM

Web service that allows one to browse RDF data: https://protege.stanford.edu/products.php  
XML parser documentation: https://docs.python.org/3/library/xml.etree.elementtree.html  
ORDO documentation: http://www.orphadata.org/cgi-bin/img/PDF/WhatIsORDO.pdf  
HOOM documentation: http://www.orphadata.org/cgi-bin/img/PDF/WhatIsHOOM.pdf  


## Exploring the ORDO Data

In [6]:
import xml.etree.ElementTree as ET
import re, requests, json
import pandas as pd

# Download the latest release of data from https://bioportal.bioontology.org/ontologies/ORDO
# The path below is relative, so 'ordo_orphanet.owl' must be in the notebook's working directory.
# ET.parse loads the whole OWL/XML document; ordo_root is its top-level element,
# and the cells below index and iterate over its direct children.
ordo_tree = ET.parse('ordo_orphanet.owl')
ordo_root = ordo_tree.getroot()

In [169]:
len(ordo_root)

40101

In [188]:
len(list(ordo_root.iter()))

5

In [7]:
# Prefix -> namespace URI table, passed to ElementTree's find/findall so that
# prefixed paths such as 'owl:AnnotationProperty' can be resolved.
ns = dict(
    base="http://www.orpha.net/ontology/ORDO_en_4.0.owl",
    dc="http://purl.org/dc/elements/1.1/",
    efo="http://www.ebi.ac.uk/efo/",
    obo="http://purl.obolibrary.org/obo/",
    owl="http://www.w3.org/2002/07/owl#",
    rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    xml="http://www.w3.org/XML/1998/namespace",
    xsd="http://www.w3.org/2001/XMLSchema#",
    ORDO="http://www.orpha.net/ORDO/",
    rdfs="http://www.w3.org/2000/01/rdf-schema#",
    skos="http://www.w3.org/2004/02/skos/core#",
    terms="http://purl.org/dc/terms/",
    licenses="https://creativecommons.org/licenses/",
    oboInOwl="http://www.geneontology.org/formats/oboInOwl#",
    Orphanet_="http://www.orpha.net/ORDO/Orphanet_#",
)

# Preview the first 50 children of the root: for each one, list the element
# itself followed by all of its descendants.
for position in range(50):
    child = ordo_root[position]
    print("ITEM ", position)
    print(list(child.iter()))
    print('####################################')

ITEM  0
[<Element '{http://www.w3.org/2002/07/owl#}Ontology' at 0x7fd458164bd0>, <Element '{http://www.w3.org/2002/07/owl#}versionIRI' at 0x7fd458164cc0>, <Element '{http://purl.org/dc/elements/1.1/}creator' at 0x7fd458164d60>, <Element '{http://purl.org/dc/elements/1.1/}creator' at 0x7fd458164db0>, <Element '{http://purl.org/dc/elements/1.1/}creator' at 0x7fd458164e00>, <Element '{http://purl.org/dc/elements/1.1/}creator' at 0x7fd458164ea0>, <Element '{http://purl.org/dc/elements/1.1/}creator' at 0x7fd458164ef0>, <Element '{http://purl.org/dc/elements/1.1/}creator' at 0x7fd458164f40>, <Element '{http://purl.org/dc/terms/}created' at 0x7fd45816e040>, <Element '{http://purl.org/dc/terms/}license' at 0x7fd45816e130>, <Element '{http://purl.org/dc/terms/}modified' at 0x7fd45816e1d0>, <Element '{http://www.w3.org/2002/07/owl#}versionInfo' at 0x7fd45816e2c0>]
####################################
ITEM  1
[<Element '{http://www.w3.org/2002/07/owl#}AnnotationProperty' at 0x7fd45816e310>, <Elem

In [8]:
# Dump the tag, attributes and text of one sample entity (child 40 of the root)
# and of every element nested inside it. Nothing is printed unless that
# child's tag ends in 'Class'.
sample_entity = ordo_root[40]
if sample_entity.tag.endswith('Class'):
    for node in sample_entity.iter():
        print('TAG:  ', node.tag)
        print('ATTRIBUTE:  ', node.attrib)
        print('TEXT:  ', node.text)
        print('\n####################################\n')

TAG:   {http://www.w3.org/2002/07/owl#}Class
ATTRIBUTE:   {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about': 'http://www.orpha.net/ORDO/Orphanet_10'}
TEXT:   
        

####################################

TAG:   {http://www.w3.org/2000/01/rdf-schema#}subClassOf
ATTRIBUTE:   {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource': 'http://www.orpha.net/ORDO/Orphanet_377789'}
TEXT:   None

####################################

TAG:   {http://www.w3.org/2000/01/rdf-schema#}subClassOf
ATTRIBUTE:   {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource': 'http://www.orpha.net/ORDO/Orphanet_557493'}
TEXT:   None

####################################

TAG:   {http://www.ebi.ac.uk/efo/}definition
ATTRIBUTE:   {'{http://www.w3.org/XML/1998/namespace}lang': 'en'}
TEXT:   A rare sex chromosome number anomaly disorder characterized, genetically, by the presence of an extra X and Y chromosome in males and, clinically, by tall stature, dysfunctional testes associated with infertility and ins

## Import the Orphanet disease entities into a graph database

In [24]:
# Age-of-onset category -> numeric lower/upper age bounds.
# Each row is (category, lower bound, upper bound).
_onset_bounds = [
    ("adolescent", 12, 18),
    ("adult", 19, 65),
    ("all ages", 0, 120),
    ("antenatal", 0, 0),
    ("childhood", 2, 11),
    ("elderly", 65, 120),
    ("infancy", 0, 2),
    ("neonatal", 0, 0),
    ("no age of onset data available", 0, 120),
]
age_of_onset_dict = {
    category: {"age_onset_lower": lower, "age_onset_upper": upper}
    for category, lower, upper in _onset_bounds
}

# Prevalence/incidence category -> floating point rate.
# Bounded categories "low-high / population" use the midpoint of the range
# divided by the population size; each row is (category, low, high, population).
_rate_ranges = [
    ("1-5 / 10 000", 1, 5, 10000),
    ("1-9 / 1 000 000", 1, 9, 1000000),
    ("1-9 / 100 000", 1, 9, 100000),
    ("6-9 / 10 000", 6, 9, 10000),
]
freq_in_pop = {
    category: (low + high) / 2 / population
    for category, low, high, population in _rate_ranges
}
# Open-ended categories have no midpoint; the stated limit itself is used.
freq_in_pop["<1 / 1 000 000"] = 1 / 1000000
freq_in_pop[">1 / 1000"] = 1 / 1000

# Make a dictionary that maps URIs to entity names.
# Only root children that have both an rdf:about attribute (the URI) and an
# rdfs:label child (the name) are recorded; all other elements are skipped
# explicitly rather than by swallowing whatever exception they would raise.
entity_names_dict = {}
for element in ordo_root:
    uri = element.attrib.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about')
    label = element.find('{http://www.w3.org/2000/01/rdf-schema#}label')
    if uri is None or label is None:
        continue
    entity_names_dict[uri] = label.text

# Make a dictionary of annotation property names, keyed by the property's URI.
# The name is derived from the text of the property's first child element:
# boilerplate phrases are stripped and spaces become underscores, producing
# keys such as 'age_of_onset' that the entity loop below matches against.
# The substitutions are applied in this order; later ones operate on the
# result of earlier ones, so the order must not be changed.
_property_name_substitutions = [
    ('Relationship between clinical entity and ', ''),
    ('Relationship between a clinical entity and ', ''),
    ('Relationship between the clinical entity and the ', ''),
    ('the geographical area for which epidemiological data (Epidemiology) is available', 'region where epidemiological data is available'),
    ('its ', ''),
    ('.', ''),
    (' ', '_'),
]

annotation_property_dict = {}
for item in ordo_root.findall('owl:AnnotationProperty', ns):
    key = item.attrib.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about')
    # Skip properties with no URI, no child element, or an empty first child;
    # these are the cases the previous blanket try/except silently ignored.
    if key is None or len(item) == 0 or item[0].text is None:
        continue
    val = item[0].text
    for old, new in _property_name_substitutions:
        val = val.replace(old, new)
    annotation_property_dict[key] = val

# Orphanet_C021 is deliberately excluded from the mapping.
# NOTE(review): the reason for excluding it is not recorded in this notebook.
# pop() with a default keeps this cell runnable if a release lacks the property.
annotation_property_dict.pop('http://www.orpha.net/ORDO/Orphanet_C021', None)


# Iterate through all top-level elements in ORDO, create a dictionary of each
# entity's properties, and add the dictionary to entity_list.
#
# Relies on names built earlier in this cell: annotation_property_dict,
# entity_names_dict, age_of_onset_dict and freq_in_pop.

# Prevalence/incidence annotations -> name of the column that receives the
# averaged value. The order here fixes the order in which columns are added.
rate_column_names = {
    "point_prevalence_range": "point_prevalence",
    "birth_prevalence_range": "birth_prevalence",
    "lifetime_prevalence_range": "lifetime_prevalence",
    "annual_incidence_range": "annual_incidence",
}
regions_key = 'region_where_epidemiological_data_is_available'

entity_list = []
for entity in ordo_root:

    item_prop_dict = {}

    # Only elements whose tag ends in 'Class' are read. Every other element
    # still contributes an (empty) dictionary so that row positions in the
    # dataframe line up with positions in ordo_root.
    if entity.tag.endswith('Class'):
        onset_lower_bounds = []
        onset_upper_bounds = []
        rate_samples = {annotation: [] for annotation in rate_column_names}
        regions_list = []

        # iter() yields the entity itself followed by all nested elements
        for item in entity.iter():

            # '{namespace}local' -> 'namespacelocal', which is the form of the
            # URI keys in annotation_property_dict
            item_prop = item.tag[1:].replace('}', '')
            item_val = item.text

            # Add the entity's ORPHA_ID and other vocab IDs to the entity's
            # dictionary. Values look like 'VOCAB:ID'; text without a colon
            # (or no text at all) is ignored.
            if item_prop.endswith(('notation', 'hasDbXref')) and item_val is not None:
                id_parts = item_val.split(':')
                if len(id_parts) > 1:
                    item_prop_dict[id_parts[0]] = id_parts[1]

            # Add the entity's definition to the entity's dictionary
            if item_prop.endswith('definition'):
                item_prop_dict['definition'] = item_val

            # Add the entity's name to the entity's dictionary
            if item_prop.endswith('label'):
                item_prop_dict['name'] = item_val

            # Everything below applies only to annotation properties whose
            # value is the URI of a named entity.
            if item_prop not in annotation_property_dict or item_val not in entity_names_dict:
                continue

            # Change the property and value names to human-friendly form
            item_prop = annotation_property_dict[item_prop]
            item_val = entity_names_dict[item_val]

            if item_prop == "age_of_onset":
                # Collect the numeric bounds of every onset category; the
                # widest range is taken after the loop. Categories missing
                # from age_of_onset_dict are skipped.
                bounds = age_of_onset_dict.get(item_val)
                if bounds is not None:
                    onset_lower_bounds.append(bounds['age_onset_lower'])
                    onset_upper_bounds.append(bounds['age_onset_upper'])

            elif item_prop in rate_column_names:
                # Change prevalence/incidence categories to floating point
                # values. Categories missing from freq_in_pop are skipped.
                rate = freq_in_pop.get(item_val)
                if rate is not None:
                    rate_samples[item_prop].append(rate)

            elif item_prop == regions_key:
                # Consolidate the regions into a list
                regions_list.append(item_val)

            else:
                # Add the item's property and value to the entity's dictionary
                item_prop_dict[item_prop] = item_val

        # Add the onset age: the overall range spans all reported categories
        if onset_upper_bounds:
            item_prop_dict["age_onset_upper"] = max(onset_upper_bounds)
        if onset_lower_bounds:
            item_prop_dict["age_onset_lower"] = min(onset_lower_bounds)

        # Add the prevalence/incidence values. When multiple values for the
        # same measure exist, take their arithmetic mean over all of them
        # (a running pairwise average would over-weight the later values).
        for annotation, column in rate_column_names.items():
            samples = rate_samples[annotation]
            if samples:
                item_prop_dict[column] = sum(samples) / len(samples)

        # Add the regions where epi data is available, stored as the string
        # form of the list so that it fits in a single CSV field
        if regions_list:
            item_prop_dict[regions_key] = str(regions_list)

    entity_list.append(item_prop_dict)

# Transform the data into a dataframe (it is written to CSV in a later cell)
ordo_entity_dataframe = pd.DataFrame(entity_list)
ordo_entity_dataframe.head()

Unnamed: 0,definition,ICD-10,MeSH,MedDRA,UMLS,modes_of_inheritance,name,ORPHA,age_onset_upper,age_onset_lower,...,OMIM,point_prevalence,annual_incidence,lifetime_prevalence,Ensembl,Genatlas,HGNC,SwissProt,Reactome,IUPHAR
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [25]:
ordo_entity_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40101 entries, 0 to 40100
Data columns (total 22 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   definition                                      6726 non-null   object 
 1   ICD-10                                          7389 non-null   object 
 2   MeSH                                            1724 non-null   object 
 3   MedDRA                                          1139 non-null   object 
 4   UMLS                                            4722 non-null   object 
 5   modes_of_inheritance                            5530 non-null   object 
 6   name                                            15205 non-null  object 
 7   ORPHA                                           10735 non-null  object 
 8   age_onset_upper                                 6249 non-null   float64
 9   age_onset_lower                        

In [26]:
ordo_entity_dataframe[ordo_entity_dataframe['ORPHA'].isnull()].name.value_counts()

calcium voltage-gated channel subunit alpha1 D                         1
CYLD lysine 63 deubiquitinase                                          1
syntaxin 3                                                             1
solute carrier family 7 member 14                                      1
minichromosome maintenance 9 homologous recombination repair factor    1
                                                                      ..
CD151 molecule (Raph blood group)                                      1
ubiquitin like modifier activating enzyme 5                            1
N-glycanase 1                                                          1
luteinizing hormone/choriogonadotropin receptor                        1
phenylalanine hydroxylase                                              1
Name: name, Length: 4470, dtype: int64

Looks like the things which lack an ORPHA ID number are not disease entities, so we can remove them before importing disease entities into a graph. We'll drop duplicate rows while we're at it.

In [29]:
# Clean the entity table in three steps, applied in this order:
#   1. keep only rows that have an ORPHA ID,
#   2. discard columns in which every remaining value is missing,
#   3. remove rows that are exact duplicates of an earlier row.
ordo_entity_dataframe = (
    ordo_entity_dataframe
    .dropna(subset=['ORPHA'])
    .dropna(axis=1, how='all')
    .drop_duplicates()
)

ordo_entity_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10735 entries, 40 to 40069
Data columns (total 16 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   definition                                      6659 non-null   object 
 1   ICD-10                                          7389 non-null   object 
 2   MeSH                                            1724 non-null   object 
 3   MedDRA                                          1139 non-null   object 
 4   UMLS                                            4722 non-null   object 
 5   modes_of_inheritance                            5530 non-null   object 
 6   name                                            10735 non-null  object 
 7   ORPHA                                           10735 non-null  object 
 8   age_onset_upper                                 6249 non-null   float64
 9   age_onset_lower                       

Okay, all the entities at least have a name and an ORPHA ID. Now we can write out to CSV and import to a graph database.

In [30]:
# Write the cleaned entity table to 'ordo_entities.csv' (a relative path, so it
# lands in the working directory); index=False leaves the dataframe index out
# of the file. The Cypher import below reads a file with this name.
ordo_entity_dataframe.to_csv('ordo_entities.csv', index=False)

Move the 'ordo_entities.csv' file we just created into the import folder for the graph database.

In [17]:
import getpass
import os

from neo4j import GraphDatabase

# Credentials must not be stored in the notebook. The password is read from the
# NEO4J_PASSWORD environment variable when it is set, otherwise the user is
# prompted for it (the input is not echoed).
# NOTE(review): a password was previously committed in this cell; it should be
# rotated on any database where it was used.
password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass(
    "\nPlease enter the Neo4j database password to continue \n"
)

# Connect to the local Neo4j instance over Bolt. The session is left open on
# purpose: the import cells that follow run their queries through it.
driver = GraphDatabase.driver(uri="bolt://localhost:7687", auth=('neo4j', password))
session = driver.session()

To-do:
- split the region_where_epidemiological_data_is_available property into a list on import

In [32]:
# Batch import the CSV into the database.
# The query reads 'ordo_entities.csv' through a file:/// URL and creates one
# OrphEntity node per row. Numeric columns are converted with toInteger/toFloat;
# the remaining columns are stored as read from the file.
# NOTE(review): CREATE always adds nodes, so running this cell twice will
# presumably load every entity twice - confirm before re-running.
# NOTE(review): the property key 'medra_id' looks like a misspelling of
# 'meddra_id'; renaming it would affect any query that reads the property.

command = '''
LOAD CSV WITH HEADERS FROM "file:///ordo_entities.csv" as row
CREATE (o:OrphEntity {
    name: row.name,
    definition: row.definition,
    icd_10_id: row['ICD-10'],
    mesh_id: row.MeSH,
    medra_id: row.MedDRA,
    umls_id: row.UMLS,
    omim_id: row.OMIM,
    modes_of_inheritance: row.modes_of_inheritance,
    orpha_id: row.ORPHA,
    age_onset_upper: toInteger(row.age_onset_upper),
    age_onset_lower: toInteger(row.age_onset_lower),
    birth_prevalence: toFloat(row.birth_prevalence),
    region_where_epidemiological_data_is_available: row.region_where_epidemiological_data_is_available,
    point_prevalence: toFloat(row.point_prevalence),
    annual_incidence: toFloat(row.annual_incidence),
    lifetime_prevalence: toFloat(row.lifetime_prevalence)
    })
'''
session.run(command)

<neo4j.work.result.Result at 0x7fd3e11a75b0>

## Exploring the HOOM Data

In [5]:
import xml.etree.ElementTree as ET
import re, requests, json
import pandas as pd

# Download the latest release of data from https://bioportal.bioontology.org/ontologies/HOOM
# The path below is relative, so 'hoom_orphanet.owl' must be in the notebook's working directory.
# ET.parse loads the whole OWL/XML document; hoom_root is its top-level element.
hoom_tree = ET.parse('hoom_orphanet.owl')
hoom_root = hoom_tree.getroot()

In [11]:
len(hoom_root)

516474

In [12]:
# Preview the first 50 children of the HOOM root: for each one, list the
# element itself followed by all of its descendants.
for position in range(50):
    child = hoom_root[position]
    print("ITEM ", position)
    print(list(child.iter()))
    print('####################################')

ITEM  0
[<Element '{http://www.w3.org/2002/07/owl#}Prefix' at 0x7f59aa92ad10>]
####################################
ITEM  1
[<Element '{http://www.w3.org/2002/07/owl#}Prefix' at 0x7f59aa92ac70>]
####################################
ITEM  2
[<Element '{http://www.w3.org/2002/07/owl#}Prefix' at 0x7f59aa92a4a0>]
####################################
ITEM  3
[<Element '{http://www.w3.org/2002/07/owl#}Prefix' at 0x7f59aa92a0e0>]
####################################
ITEM  4
[<Element '{http://www.w3.org/2002/07/owl#}Prefix' at 0x7f59aa92a450>]
####################################
ITEM  5
[<Element '{http://www.w3.org/2002/07/owl#}Prefix' at 0x7f59aa92a400>]
####################################
ITEM  6
[<Element '{http://www.w3.org/2002/07/owl#}Prefix' at 0x7f59aa92e0e0>]
####################################
ITEM  7
[<Element '{http://www.w3.org/2002/07/owl#}Prefix' at 0x7f59aa92e180>]
####################################
ITEM  8
[<Element '{http://www.w3.org/2002/07/owl#}Prefix' at 0x7f59aa92

Sample of a HOOM `EquivalentClasses` element:

    <EquivalentClasses>
        <Class IRI="#Orpha:91495_HP:0000667_Freq:OC"/>
        <ObjectIntersectionOf>
            <ObjectSomeValuesFrom>
                <ObjectProperty IRI="http://purl.org/oban/association_has_object"/>
                <Class IRI="http://purl.obolibrary.org/obo/HP_0000667"/>
            </ObjectSomeValuesFrom>
            <ObjectSomeValuesFrom>
                <ObjectProperty IRI="http://purl.org/oban/association_has_subject"/>
                <Class IRI="http://www.orpha.net/ORDO/Orphanet_91495"/>
            </ObjectSomeValuesFrom>
            <ObjectSomeValuesFrom>
                <ObjectProperty IRI="http://purl.org/oban/has_provenance"/>
                <Class IRI="#Source_120"/>
            </ObjectSomeValuesFrom>
 
            <ObjectSomeValuesFrom>
                <ObjectProperty IRI="#has_frequency"/>
                <Class IRI="#Occasional"/>
            </ObjectSomeValuesFrom>
            <DataHasValue>
                <DataProperty IRI="#validation_association_date"/>
                <Literal datatypeIRI="http://www.w3.org/2001/XMLSchema#dateTime">2021-10-14T00:00:00.0</Literal>
            </DataHasValue>
        </ObjectIntersectionOf>
    </EquivalentClasses>

In [5]:
# Dump the tag, attributes and text of one sample element (child 40000 of the
# HOOM root) and of every element nested inside it.
# NOTE(review): the guard only passes when the tag ends in 'Class'; a tag such
# as 'EquivalentClasses' does not, in which case this cell prints nothing.
sample_entity = hoom_root[40000]
if sample_entity.tag.endswith('Class'):
    for node in sample_entity.iter():
        print('TAG:  ', node.tag)
        print('ATTRIBUTE:  ', node.attrib)
        print('TEXT:  ', node.text)
        print('\n####################################\n')

## Exploring Human Phenotype Ontology entities

In [136]:
import xml.etree.ElementTree as ET
import re, requests, json
import pandas as pd

# Download the latest release of data. Using bash, do: curl https://raw.githubusercontent.com/obophenotype/human-phenotype-ontology/master/hp.owl > hp.owl
# The path below is relative, so 'hp.owl' must be in the notebook's working directory.
# ET.parse loads the whole OWL/XML document; hpo_root is its top-level element.
hpo_tree = ET.parse('hp.owl')
hpo_root = hpo_tree.getroot()

In [137]:
len(hpo_root)

107909

Example data:

    <owl:Class rdf:about="http://purl.obolibrary.org/obo/HP_0001510">
        <owl:equivalentClass>
            <owl:Restriction>
                <owl:onProperty rdf:resource="http://purl.obolibrary.org/obo/BFO_0000051"/>
                <owl:someValuesFrom>
                    <owl:Class>
                        <owl:intersectionOf rdf:parseType="Collection">
                            <rdf:Description rdf:about="http://purl.obolibrary.org/obo/PATO_0000502"/>
                            <owl:Restriction>
                                <owl:onProperty rdf:resource="http://purl.obolibrary.org/obo/RO_0000052"/>
                                <owl:someValuesFrom rdf:resource="http://purl.obolibrary.org/obo/GO_0040007"/>
                            </owl:Restriction>
                            <owl:Restriction>
                                <owl:onProperty rdf:resource="http://purl.obolibrary.org/obo/RO_0002573"/>
                                <owl:someValuesFrom rdf:resource="http://purl.obolibrary.org/obo/PATO_0000460"/>
                            </owl:Restriction>
                        </owl:intersectionOf>
                    </owl:Class>
                </owl:someValuesFrom>
            </owl:Restriction>
        </owl:equivalentClass>
        <rdfs:subClassOf rdf:resource="http://purl.obolibrary.org/obo/HP_0001507"/>
        <obo:IAO_0000115 rdf:datatype="http://www.w3.org/2001/XMLSchema#string">A deficiency or slowing down of growth pre- and postnatally.</obo:IAO_0000115>
        <oboInOwl:hasAlternativeId rdf:datatype="http://www.w3.org/2001/XMLSchema#string">HP:0001434</oboInOwl:hasAlternativeId>
        <oboInOwl:hasAlternativeId rdf:datatype="http://www.w3.org/2001/XMLSchema#string">HP:0001512</oboInOwl:hasAlternativeId>
        <oboInOwl:hasAlternativeId rdf:datatype="http://www.w3.org/2001/XMLSchema#string">HP:0001514</oboInOwl:hasAlternativeId>
        <oboInOwl:hasAlternativeId rdf:datatype="http://www.w3.org/2001/XMLSchema#string">HP:0001517</oboInOwl:hasAlternativeId>
        <oboInOwl:hasAlternativeId rdf:datatype="http://www.w3.org/2001/XMLSchema#string">HP:0001532</oboInOwl:hasAlternativeId>
        <oboInOwl:hasAlternativeId rdf:datatype="http://www.w3.org/2001/XMLSchema#string">HP:0008847</oboInOwl:hasAlternativeId>
        <oboInOwl:hasAlternativeId rdf:datatype="http://www.w3.org/2001/XMLSchema#string">HP:0008870</oboInOwl:hasAlternativeId>
        <oboInOwl:hasAlternativeId rdf:datatype="http://www.w3.org/2001/XMLSchema#string">HP:0008886</oboInOwl:hasAlternativeId>
        <oboInOwl:hasAlternativeId rdf:datatype="http://www.w3.org/2001/XMLSchema#string">HP:0008893</oboInOwl:hasAlternativeId>
        <oboInOwl:hasAlternativeId rdf:datatype="http://www.w3.org/2001/XMLSchema#string">HP:0008926</oboInOwl:hasAlternativeId>
        <oboInOwl:hasDbXref rdf:datatype="http://www.w3.org/2001/XMLSchema#string">SNOMEDCT_US:276617005</oboInOwl:hasDbXref>
        <oboInOwl:hasDbXref rdf:datatype="http://www.w3.org/2001/XMLSchema#string">SNOMEDCT_US:444896005</oboInOwl:hasDbXref>
        <oboInOwl:hasDbXref rdf:datatype="http://www.w3.org/2001/XMLSchema#string">SNOMEDCT_US:59576002</oboInOwl:hasDbXref>
        <oboInOwl:hasDbXref rdf:datatype="http://www.w3.org/2001/XMLSchema#string">UMLS:C0151686</oboInOwl:hasDbXref>
        <oboInOwl:hasDbXref rdf:datatype="http://www.w3.org/2001/XMLSchema#string">UMLS:C0456070</oboInOwl:hasDbXref>
        <oboInOwl:hasDbXref rdf:datatype="http://www.w3.org/2001/XMLSchema#string">UMLS:C0878787</oboInOwl:hasDbXref>
        <oboInOwl:hasDbXref rdf:datatype="http://www.w3.org/2001/XMLSchema#string">UMLS:C1837385</oboInOwl:hasDbXref>
        <oboInOwl:hasDbXref rdf:datatype="http://www.w3.org/2001/XMLSchema#string">UMLS:C3552463</oboInOwl:hasDbXref>
        <oboInOwl:hasExactSynonym rdf:datatype="http://www.w3.org/2001/XMLSchema#string">Delayed growth</oboInOwl:hasExactSynonym>
        <oboInOwl:hasExactSynonym rdf:datatype="http://www.w3.org/2001/XMLSchema#string">Growth deficiency</oboInOwl:hasExactSynonym>
        <oboInOwl:hasExactSynonym rdf:datatype="http://www.w3.org/2001/XMLSchema#string">Growth delay</oboInOwl:hasExactSynonym>
        <oboInOwl:hasExactSynonym rdf:datatype="http://www.w3.org/2001/XMLSchema#string">Growth failure</oboInOwl:hasExactSynonym>
        <oboInOwl:hasExactSynonym rdf:datatype="http://www.w3.org/2001/XMLSchema#string">Growth retardation</oboInOwl:hasExactSynonym>
        <oboInOwl:hasExactSynonym rdf:datatype="http://www.w3.org/2001/XMLSchema#string">Poor growth</oboInOwl:hasExactSynonym>
        <oboInOwl:hasExactSynonym rdf:datatype="http://www.w3.org/2001/XMLSchema#string">Retarded growth</oboInOwl:hasExactSynonym>
        <oboInOwl:hasRelatedSynonym rdf:datatype="http://www.w3.org/2001/XMLSchema#string">Very poor growth</oboInOwl:hasRelatedSynonym>
        <oboInOwl:id rdf:datatype="http://www.w3.org/2001/XMLSchema#string">HP:0001510</oboInOwl:id>
        <rdfs:comment rdf:datatype="http://www.w3.org/2001/XMLSchema#string">Poor or abnormally slow gains in weight or height in a child.</rdfs:comment>
        <rdfs:label rdf:datatype="http://www.w3.org/2001/XMLSchema#string">Growth delay</rdfs:label>
    </owl:Class>


In [142]:
print(list(hpo_root[40000].iter()))

[<Element '{http://www.w3.org/2002/07/owl#}Class' at 0x7f5a050c1b80>, <Element '{http://www.w3.org/2002/07/owl#}equivalentClass' at 0x7f5a050c1bd0>, <Element '{http://www.w3.org/2002/07/owl#}Restriction' at 0x7f5a050c1c20>, <Element '{http://www.w3.org/2002/07/owl#}onProperty' at 0x7f5a050c1cc0>, <Element '{http://www.w3.org/2002/07/owl#}someValuesFrom' at 0x7f5a050c1d60>, <Element '{http://www.w3.org/2002/07/owl#}Class' at 0x7f5a050c1e00>, <Element '{http://www.w3.org/2002/07/owl#}intersectionOf' at 0x7f5a050c1ea0>, <Element '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description' at 0x7f5a050c1f40>, <Element '{http://www.w3.org/2002/07/owl#}Restriction' at 0x7f5a050c5040>, <Element '{http://www.w3.org/2002/07/owl#}onProperty' at 0x7f5a050c5090>, <Element '{http://www.w3.org/2002/07/owl#}someValuesFrom' at 0x7f5a050c50e0>, <Element '{http://www.w3.org/2002/07/owl#}Restriction' at 0x7f5a050c51d0>, <Element '{http://www.w3.org/2002/07/owl#}onProperty' at 0x7f5a050c5220>, <Element '{ht

## Import Human Phenotype Ontology entities into a graph database

In [169]:
# Build one dictionary of properties per top-level element of the HPO tree and
# collect them into a dataframe (one row per element, one column per property).
entity_list = []

for entity in hpo_root:

    item_dict = {}

    # Only elements whose tag ends in 'Class' are read; every other element
    # contributes an empty dictionary, i.e. an all-missing row that the cleanup
    # cell removes when it drops rows without an HP ID.
    if entity.tag.endswith("Class"):

        # iter() yields the class itself followed by all nested elements
        for item in entity.iter():

            # Reduce '{namespace}local' to its last path segment,
            # e.g. 'oboInOwl#hasDbXref' or 'IAO_0000115'
            item_prop = item.tag[1:].replace('}', '').split('/')[-1]
            item_val = item.text

            # Add the entity's HPO_ID and other vocab IDs to the entity's
            # dictionary. Values look like 'VOCAB:ID'; text without a colon (or
            # no text at all) is ignored. When a class lists several IDs from
            # the same vocabulary, the last one read overwrites the others.
            if item_prop.endswith(('hasDbXref', 'id')):
                if item_val is None:
                    continue
                id_parts = item_val.split(':')
                if len(id_parts) > 1:
                    item_dict[id_parts[0]] = id_parts[1]

            # Add the entity's label to the entity's dictionary
            elif item_prop.endswith('label'):
                item_dict['name'] = item_val

            # Add the entity's definition to the entity's dictionary
            elif item_prop.endswith('IAO_0000115'):
                item_dict['definition'] = item_val

    entity_list.append(item_dict)

human_phenotype_entities = pd.DataFrame(entity_list)

In [170]:
human_phenotype_entities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107909 entries, 0 to 107908
Columns: 168 entries, name to HGNC
dtypes: object(168)
memory usage: 138.3+ MB


In [172]:
# Clean the phenotype table in three steps, applied in this order:
#   1. keep only rows that have an HP ID,
#   2. discard columns in which every remaining value is missing,
#   3. remove rows that are exact duplicates of an earlier row.
human_phenotype_entities = (
    human_phenotype_entities
    .dropna(subset=['HP'])
    .dropna(axis=1, how='all')
    .drop_duplicates()
)

human_phenotype_entities.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10653 entries, 27573 to 68192
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         10653 non-null  object
 1   definition   8168 non-null   object
 2   UMLS         10403 non-null  object
 3   HP           10653 non-null  object
 4   MSH          1939 non-null   object
 5   SNOMEDCT_US  3177 non-null   object
 6   MEDDRA       93 non-null     object
 7   Fyler        155 non-null    object
 8   NCIT         184 non-null    object
 9   COHD         1 non-null      object
 10  EFO          1 non-null      object
 11  ICD10        1 non-null      object
 12  ICD9         1 non-null      object
 13  ICD-10       38 non-null     object
 14  EPCC         13 non-null     object
 15  DOID         2 non-null      object
 16  MONDO        1 non-null      object
 17  ICD-O        3 non-null      object
 18  MP           9 non-null      object
 19  MPATH        4 non-nu

In [177]:
# Keep only the name, definition, UMLS CUI, Human Phenotype Ontology ID, MeSH ID, and SNOMED ID.
# These six columns are the ones the Cypher import below reads from the CSV.
human_phenotype_entities = human_phenotype_entities[["name", "definition", "UMLS", "HP", "MSH", "SNOMEDCT_US"]]

In [179]:
human_phenotype_entities.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10653 entries, 27573 to 68192
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         10653 non-null  object
 1   definition   8168 non-null   object
 2   UMLS         10403 non-null  object
 3   HP           10653 non-null  object
 4   MSH          1939 non-null   object
 5   SNOMEDCT_US  3177 non-null   object
dtypes: object(6)
memory usage: 582.6+ KB


In [178]:
# Write the phenotype table to 'human_phenotype_entities.csv' (a relative path,
# so it lands in the working directory); index=False leaves the dataframe index
# out of the file. The Cypher import below reads a file with this name.
human_phenotype_entities.to_csv('human_phenotype_entities.csv', index=False)

Move the CSV of human phenotype entities into the import folder for the graph database.

In [180]:
import getpass
import os

from neo4j import GraphDatabase

# Credentials must not be stored in the notebook. The password is read from the
# NEO4J_PASSWORD environment variable when it is set, otherwise the user is
# prompted for it (the input is not echoed).
# NOTE(review): a password was previously committed in this cell; it should be
# rotated on any database where it was used.
password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass(
    "\nPlease enter the Neo4j database password to continue \n"
)

# Connect to the local Neo4j instance over Bolt. The session is left open on
# purpose: the import cell that follows runs its query through it.
driver = GraphDatabase.driver(uri="bolt://localhost:7687", auth=('neo4j', password))
session = driver.session()

In [182]:
# Bulk-load the phenotype CSV: one HPOentity node is created per CSV row.
# NOTE(review): the query uses CREATE, so re-running this cell adds the rows again.
hpo_import_query = '''
LOAD CSV WITH HEADERS FROM "file:///human_phenotype_entities.csv" as row
CREATE (o:HPOentity {
    name: row.name,
    hpo_id: row.HP,
    definition: row.definition,
    mesh_id: row.MSH,
    umls_id: row.UMLS,
    snomedct_us_id: row.SNOMEDCT_US
    })
'''
session.run(hpo_import_query)

<neo4j.work.result.Result at 0x7f59ea9159a0>

## Import the Orphanet relationships into a graph database

In [6]:
# Namespace prefixes used when searching the HOOM OWL/XML tree with find()
ns = {
    "base":"http://www.semanticweb.org/ontology/HOOM",
    "rdf":"http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "xml":"http://www.w3.org/XML/1998/namespace",
    "xsd":"http://www.w3.org/2001/XMLSchema#",
    "rdfs":"http://www.w3.org/2000/01/rdf-schema#",
    "terms":"http://purl.org/dc/terms/",
    "dc":"http://purl.org/dc/elements/1.1/",
    "owl":"http://www.w3.org/2002/07/owl#",
    "licenses":"https://creativecommons.org/licenses/"
}

# Make a dictionary of frequency values: each category is approximated by the
# midpoint of its percentage range, expressed as a fraction
freq_dict = {
    "#VeryRare" : (1+4)/2/100,
    "#Occasional" : (5+29)/2/100,
    "#Frequent" : (30+79)/2/100,
    "#VeryFrequent" : (80+99)/2/100,
    "#Obligate" : 100/100
}

# Substrings that mark a class IRI as a literature reference
reference_markers = ('PMID', 'PMC', 'ISBN', 'doi', 'DOI')

# Make a dictionary of sources: source ID -> reference string
source_dict = {}
# Start unset so a reference met before any source ID is skipped rather than
# raising NameError.
# NOTE(review): the source ID deliberately carries over between elements, as in
# the original loop; confirm each reference always follows its own '#Source_' class.
source_ID = None
for element in hoom_root:
    if not element.tag.endswith('EquivalentClasses'):
        continue
    for item in element.iter():
        if not item.tag.endswith('Class'):
            continue
        iri = item.attrib['IRI']
        if '#Source_' in iri:
            source_ID = iri.split('/')[-1]
        if source_ID is not None and any(marker in iri for marker in reference_markers):
            if '#' in iri:
                # Keep everything after the first '#', so a reference that
                # itself contains '/' (e.g. a DOI) is not cut off at the slash
                source = iri.split('#', 1)[1]
            else:
                source = iri.split('/')[-1][1:]
            source_dict[source_ID] = source

# Get the subject, object, and properties of each relationship
relation_records = []
for element in hoom_root:
    item_dict = {}
    if element.tag.endswith('EquivalentClasses'):
        for item in element.iter():
            if not item.tag.endswith('ObjectSomeValuesFrom'):
                continue

            # Each restriction pairs a property with a class; keep the last
            # path segment of both IRIs
            item_prop = item.find('owl:ObjectProperty', ns).attrib['IRI'].split('/')[-1]
            item_val = item.find('owl:Class', ns).attrib['IRI'].split('/')[-1]

            if item_prop == 'association_has_subject':
                # Strip the 'Orphanet_' prefix from ORPHA IDs and set the column name as "ORPHA"
                item_val = item_val.split('_')[1]
                item_prop = 'ORPHA'
            elif item_prop == 'association_has_object':
                # Strip the "HP_" prefix from HPO IDs and set the column name as "HP"
                item_val = item_val.split('_')[1]
                item_prop = 'HP'
            elif item_prop == 'has_provenance':
                # Replace source ID number with list of sources
                item_val = source_dict[item_val]
                item_prop = 'evidence'
            elif item_prop == '#has_DC_attribute':
                # Drop the leading '#' and give the column a more descriptive name
                item_val = item_val[1:]
                item_prop = 'diagnostic_criterion_attribute'
            elif item_prop == '#has_frequency':
                # Replace frequency categories with their numerical meaning
                item_val = freq_dict[item_val]
                item_prop = 'approx_frequency'

            item_dict[item_prop] = item_val

        # Appended even when empty, so row positions match the original
        # element order; rows lacking HP/ORPHA are dropped in the next cell
        relation_records.append(item_dict)

relation_dataframe = pd.DataFrame(relation_records)

In [7]:
# Keep only rows that have both an HP ID and an ORPHA ID, then discard
# columns that are entirely empty and rows that are exact duplicates
relation_dataframe = (
    relation_dataframe
    .dropna(subset=['HP', 'ORPHA'])
    .dropna(axis=1, how='all')
    .drop_duplicates()
)

relation_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113929 entries, 13 to 116595
Data columns (total 5 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   diagnostic_criterion_attribute  1290 non-null    object 
 1   HP                              113929 non-null  object 
 2   ORPHA                           113929 non-null  object 
 3   evidence                        79308 non-null   object 
 4   approx_frequency                113223 non-null  float64
dtypes: float64(1), object(4)
memory usage: 5.2+ MB


In [8]:
# Display the relationship table
relation_dataframe

Unnamed: 0,diagnostic_criterion_attribute,HP,ORPHA,evidence,approx_frequency
13,,0410019,71272,PMID:855842_PMID:16614981_PMID:26697813,0.545
14,,0002036,71272,PMID:855842_PMID:16614981_PMID:26697813,0.545
15,,0100633,71272,PMID:855842_PMID:16614981_PMID:26697813,0.545
16,,0002020,71272,PMID:855842_PMID:16614981_PMID:26697813,0.895
17,,0002533,71272,PMID:855842_PMID:16614981_PMID:26697813,0.895
...,...,...,...,...,...
116591,,0002313,3208,PMID:6318158_PMID:2710360_PMID:7609447_PMID:81...,0.170
116592,,0001824,3208,PMID:6318158_PMID:2710360_PMID:7609447_PMID:81...,0.545
116593,,0001626,3208,PMID:6318158_PMID:2710360_PMID:7609447_PMID:81...,0.545
116594,,0011343,3208,PMID:6318158_PMID:2710360_PMID:7609447_PMID:81...,0.170


In [9]:
# Print a summary of the relationship table: column names, non-null counts, and dtypes
relation_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113929 entries, 13 to 116595
Data columns (total 5 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   diagnostic_criterion_attribute  1290 non-null    object 
 1   HP                              113929 non-null  object 
 2   ORPHA                           113929 non-null  object 
 3   evidence                        79308 non-null   object 
 4   approx_frequency                113223 non-null  float64
dtypes: float64(1), object(4)
memory usage: 5.2+ MB


In [13]:
# Count how many relationships carry each diagnostic-criterion category (missing values are not counted)
relation_dataframe["diagnostic_criterion_attribute"].value_counts()

Exclusion_DC        706
Criterion_DC        575
Pathognomomic_DC      9
Name: diagnostic_criterion_attribute, dtype: int64

In [14]:
# Show only the relationships whose diagnostic-criterion attribute is
# 'Pathognomomic_DC' (the filter uses that exact spelling)
is_pathognomonic = relation_dataframe["diagnostic_criterion_attribute"] == 'Pathognomomic_DC'
relation_dataframe[is_pathognomonic]

Unnamed: 0,diagnostic_criterion_attribute,HP,ORPHA,evidence,approx_frequency
5444,Pathognomomic_DC,7968,91495,PMID:30375202_PMID:33728125_PMID:29207745_PMID...,1.0
12549,Pathognomomic_DC,11763,300385,PMID:15741248_PMID:23400299_PMID:17287410_PMID...,1.0
21616,Pathognomomic_DC,149,206484,PMID:10999808_PMID:24826988_PMID:23933507_PMID...,0.895
31103,Pathognomomic_DC,25186,91412,PMID:27608283_PMID:33598261_PMID:32644484,1.0
38896,Pathognomomic_DC,30261,49,PMID:21204662_PMID:26673776_PMID:16142564_PMID...,0.895
48802,Pathognomomic_DC,8186,116,PMID:20301568_PMID:842515_PMID:1872243_PMID:76...,0.17
49168,Pathognomomic_DC,100247,396,PMID:28759492_PMID:1477559,0.895
96380,Pathognomomic_DC,10459,325345,PMID:24758178_PMID:23065160_PMID:16624884_PMID...,0.895
104868,Pathognomomic_DC,3005,251992,PMID:2917322_PMID:10421273_PMID:15827098_PMID:...,1.0


In [10]:
# Write the relationships to CSV with a header row but without the DataFrame index column
relation_dataframe.to_csv('hoom_relations.csv', index=False)

Move the CSV of relations into the graph database import folder.

In [2]:
import getpass
import os

# Never store the database password in the notebook: take it from the
# NEO4J_PASSWORD environment variable when set, otherwise prompt for it.
password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("\nPlease enter the Neo4j database password to continue \n")

from neo4j import GraphDatabase

# Open a driver and a session against the local Neo4j instance over Bolt
driver=GraphDatabase.driver(uri="bolt://localhost:7687", auth=('neo4j',password))
session=driver.session()

In [2]:
# Index the identifier properties used to look up nodes:
# hpo_id on HPOentity and orpha_id on OrphEntity
hpo_id_index_query = '''
CREATE INDEX hpo_id FOR (n:HPOentity) ON (n.hpo_id)
'''
orpha_id_index_query = '''
CREATE INDEX orpha_id FOR (n:OrphEntity) ON (n.orpha_id)
'''

session.run(hpo_id_index_query)
session.run(orpha_id_index_query)

<neo4j.work.result.Result at 0x7fd121c69340>

In [3]:
# Index the umls_id property (UMLS CUI) on both HPOentity and OrphEntity nodes
hpo_cui_index_query = '''
CREATE INDEX HPOentity_CUI FOR (n:HPOentity) ON (n.umls_id)
'''
orpha_cui_index_query = '''
CREATE INDEX OrphEntity_CUI FOR (n:OrphEntity) ON (n.umls_id)
'''

session.run(hpo_cui_index_query)
session.run(orpha_cui_index_query)

<neo4j.work.result.Result at 0x7fac35d6e7c0>

In [12]:
# Batch import the relations CSV into the database: for every row, match the
# HPOentity and OrphEntity nodes by ID and merge an ASSOC_WITH relationship
# between them.
# LOAD CSV reads every field as a string, so approx_frequency is converted with
# toFloat() to store it as a number (toFloat returns null for missing values).

command = '''
LOAD CSV WITH HEADERS FROM "file:///hoom_relations.csv" as row
MATCH (finding:HPOentity {hpo_id: row.HP})
MATCH (disease:OrphEntity {orpha_id: row.ORPHA})
MERGE (finding)-[r:ASSOC_WITH]->(disease)
SET r.approx_frequency=toFloat(row.approx_frequency), r.evidence=row.evidence, r.diagnostic_criterion_attribute=row.diagnostic_criterion_attribute
'''
session.run(command)

<neo4j.work.result.Result at 0x7fd096a505b0>

In [16]:
# Link each HPOentity node to the Concept node whose cui equals its umls_id
# (only concepts with a non-null cui_pref_term), tagging the SYNONYM
# relationship with source 'Orphanet'
hpo_synonym_query = '''
MATCH (finding:HPOentity)
MATCH (c:Concept {cui:finding.umls_id})
WHERE c.cui_pref_term IS NOT NULL
MERGE (finding)-[r:SYNONYM]->(c)
SET r.source='Orphanet'
'''
session.run(hpo_synonym_query)

In [17]:
# Link each OrphEntity (Orphanet disease) node to the Concept node whose cui
# equals its umls_id (only concepts with a non-null cui_pref_term), tagging
# the SYNONYM relationship with source 'Orphanet'
disease_synonym_query = '''
MATCH (disease:OrphEntity)
MATCH (c:Concept {cui:disease.umls_id})
WHERE c.cui_pref_term IS NOT NULL
MERGE (disease)-[r:SYNONYM]->(c)
SET r.source='Orphanet'
'''
session.run(disease_synonym_query)

<neo4j.work.result.Result at 0x7fd09697a8e0>