# Importing Orphanet data into the knowledge graph  
Orphanet has Creative Commons-licensed data that we can use to identify patients with rare diseases.

HOOM is a module that qualifies the annotation between a clinical entity and its phenotypic abnormalities with a frequency, and that also integrates the notion of diagnostic criterion. See info on HOOM [here](http://www.orphadata.org/cgi-bin/img/PDF/WhatIsHOOM.pdf).

## Exploring the ORDO Data

In [6]:
import xml.etree.ElementTree as ET
import re, requests, json
import pandas as pd

# The ORDO release file must already be present in the working directory;
# download the latest one from https://bioportal.bioontology.org/ontologies/ORDO
ordo_tree = ET.parse(source='ordo_orphanet.owl')
# Keep a handle on the document root: the cells below index into its children.
ordo_root = ordo_tree.getroot()

In [169]:
# Number of direct child elements of the ORDO root
len(ordo_root)

40101

In [188]:
# Count every element produced by a full traversal starting at the root
# (the root itself is included in the traversal).
sum(1 for _ in ordo_root.iter())

5

In [7]:
# Prefix -> namespace URI table; it is passed to findall() further down so
# that prefixed paths such as 'owl:AnnotationProperty' can be resolved.
ns = dict([
    ("base", "http://www.orpha.net/ontology/ORDO_en_4.0.owl"),
    ("dc", "http://purl.org/dc/elements/1.1/"),
    ("efo", "http://www.ebi.ac.uk/efo/"),
    ("obo", "http://purl.obolibrary.org/obo/"),
    ("owl", "http://www.w3.org/2002/07/owl#"),
    ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
    ("xml", "http://www.w3.org/XML/1998/namespace"),
    ("xsd", "http://www.w3.org/2001/XMLSchema#"),
    ("ORDO", "http://www.orpha.net/ORDO/"),
    ("rdfs", "http://www.w3.org/2000/01/rdf-schema#"),
    ("skos", "http://www.w3.org/2004/02/skos/core#"),
    ("terms", "http://purl.org/dc/terms/"),
    ("licenses", "https://creativecommons.org/licenses/"),
    ("oboInOwl", "http://www.geneontology.org/formats/oboInOwl#"),
    ("Orphanet_", "http://www.orpha.net/ORDO/Orphanet_#"),
])

# Preview the first 50 children of the root: for each one, list the child
# itself followed by all of its descendants.
for position in range(50):
    child = ordo_root[position]
    print("ITEM ", position)
    print([node for node in child.iter()])
    print('####################################')

ITEM  0
[<Element '{http://www.w3.org/2002/07/owl#}Ontology' at 0x7fd458164bd0>, <Element '{http://www.w3.org/2002/07/owl#}versionIRI' at 0x7fd458164cc0>, <Element '{http://purl.org/dc/elements/1.1/}creator' at 0x7fd458164d60>, <Element '{http://purl.org/dc/elements/1.1/}creator' at 0x7fd458164db0>, <Element '{http://purl.org/dc/elements/1.1/}creator' at 0x7fd458164e00>, <Element '{http://purl.org/dc/elements/1.1/}creator' at 0x7fd458164ea0>, <Element '{http://purl.org/dc/elements/1.1/}creator' at 0x7fd458164ef0>, <Element '{http://purl.org/dc/elements/1.1/}creator' at 0x7fd458164f40>, <Element '{http://purl.org/dc/terms/}created' at 0x7fd45816e040>, <Element '{http://purl.org/dc/terms/}license' at 0x7fd45816e130>, <Element '{http://purl.org/dc/terms/}modified' at 0x7fd45816e1d0>, <Element '{http://www.w3.org/2002/07/owl#}versionInfo' at 0x7fd45816e2c0>]
####################################
ITEM  1
[<Element '{http://www.w3.org/2002/07/owl#}AnnotationProperty' at 0x7fd45816e310>, <Elem

In [8]:
# Dump tag, attributes and text of one sample element (index 40) together
# with all of its descendants, provided the element's tag ends in 'Class'.
sample_entity = ordo_root[40]
if sample_entity.tag.endswith('Class'):
    for node in sample_entity.iter():
        print('TAG:  ', node.tag)
        print('ATTRIBUTE:  ', node.attrib)
        print('TEXT:  ', node.text)
        print('\n####################################\n')

TAG:   {http://www.w3.org/2002/07/owl#}Class
ATTRIBUTE:   {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about': 'http://www.orpha.net/ORDO/Orphanet_10'}
TEXT:   
        

####################################

TAG:   {http://www.w3.org/2000/01/rdf-schema#}subClassOf
ATTRIBUTE:   {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource': 'http://www.orpha.net/ORDO/Orphanet_377789'}
TEXT:   None

####################################

TAG:   {http://www.w3.org/2000/01/rdf-schema#}subClassOf
ATTRIBUTE:   {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource': 'http://www.orpha.net/ORDO/Orphanet_557493'}
TEXT:   None

####################################

TAG:   {http://www.ebi.ac.uk/efo/}definition
ATTRIBUTE:   {'{http://www.w3.org/XML/1998/namespace}lang': 'en'}
TEXT:   A rare sex chromosome number anomaly disorder characterized, genetically, by the presence of an extra X and Y chromosome in males and, clinically, by tall stature, dysfunctional testes associated with infertility and ins

## Import the Orphanet disease entities into a graph database

In [24]:
# Lower/upper onset-age bounds for every age-of-onset category
# (presumably expressed in years -- the unit is not stated anywhere).
_onset_bounds = [
    ("adolescent", 12, 18),
    ("adult", 19, 65),
    ("all ages", 0, 120),
    ("antenatal", 0, 0),
    ("childhood", 2, 11),
    ("elderly", 65, 120),
    ("infancy", 0, 2),
    ("neonatal", 0, 0),
    ("no age of onset data available", 0, 120),
]
age_of_onset_dict = {
    category: {"age_onset_lower": lower, "age_onset_upper": upper}
    for category, lower, upper in _onset_bounds
}


def _midpoint_rate(low, high, population):
    """Return the midpoint of the range low..high cases per `population` people."""
    return (low + high) / 2 / population


# Prevalence/incidence categories expressed as a single floating point rate:
# closed ranges use their midpoint, open-ended ranges use their boundary value.
freq_in_pop = {
    "1-5 / 10 000": _midpoint_rate(1, 5, 10000),
    "1-9 / 1 000 000": _midpoint_rate(1, 9, 1000000),
    "1-9 / 100 000": _midpoint_rate(1, 9, 100000),
    "6-9 / 10 000": _midpoint_rate(6, 9, 10000),
    "<1 / 1 000 000": 1 / 1000000,
    ">1 / 1000": 1 / 1000,
}

# Make a dictionary that maps URIs to entity names.
# Top-level elements without an rdf:about attribute or without an rdfs:label
# child are skipped explicitly; a bare `except: pass` would also have hidden
# unrelated errors and kernel interrupts.
entity_names_dict = {}
for element in ordo_root:
    uri = element.attrib.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about')
    label = element.find('{http://www.w3.org/2000/01/rdf-schema#}label')
    if uri is None or label is None:
        continue
    entity_names_dict[uri] = label.text

# Make a dictionary that maps annotation property URIs to short, snake_case names.
# The name is derived from the text of each property's first child element
# (presumably its textual description -- confirm against the OWL file) by
# stripping boilerplate phrases and replacing spaces with underscores.
annotation_property_dict = {}
for prop in ordo_root.findall('owl:AnnotationProperty', ns):
    key = prop.attrib.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about')
    # Skip properties that have no URI, no child element, or an empty first child
    # instead of hiding every possible error behind a bare `except`.
    if key is None or len(prop) == 0 or prop[0].text is None:
        continue
    annotation_property_dict[key] = (
        prop[0].text
        .replace('Relationship between clinical entity and ', '')
        .replace('Relationship between a clinical entity and ', '')
        .replace('Relationship between the clinical entity and the ', '')
        .replace('the geographical area for which epidemiological data (Epidemiology) is available', 'region where epidemiological data is available')
        .replace('its ', '')
        .replace('.', '')
        .replace(' ', '_')
    )

# Orphanet_C021 is deliberately excluded from the mapping.
# NOTE(review): the reason is not documented in this notebook -- confirm.
# pop() with a default keeps the cell working when the key is absent.
annotation_property_dict.pop('http://www.orpha.net/ORDO/Orphanet_C021', None)


# Iterate through all top-level elements of ORDO, build a dictionary of each
# entity's properties, and collect the dictionaries in entity_list.

# Annotation property name -> column name used for the aggregated value.
# The insertion order below is the order in which the columns are written.
prevalence_columns = {
    "point_prevalence_range": "point_prevalence",
    "birth_prevalence_range": "birth_prevalence",
    "lifetime_prevalence_range": "lifetime_prevalence",
    "annual_incidence_range": "annual_incidence",
}
regions_property = 'region_where_epidemiological_data_is_available'

entity_list = []
for element in ordo_root:

    # Elements that are not classes contribute an empty dictionary, so that
    # row i of the resulting dataframe always corresponds to ordo_root[i].
    item_prop_dict = {}

    if element.tag.endswith('Class'):
        onset_lowers = []
        onset_uppers = []
        measure_values = {measure: [] for measure in prevalence_columns}
        regions_list = []

        # iter() yields the class element itself followed by all its descendants
        for item in element.iter():

            # '{namespace}local' -> 'namespacelocal', i.e. the full URI of the tag
            item_prop = item.tag[1:].replace('}', '')
            item_val = item.text

            # Add the entity's ORPHA_ID and other vocab IDs ('PREFIX:value' strings)
            # to the entity's dictionary; values without text or without a colon are ignored
            if item_prop.endswith(('notation', 'hasDbXref')):
                id_parts = item_val.split(':') if item_val is not None else []
                if len(id_parts) > 1:
                    item_prop_dict[id_parts[0]] = id_parts[1]

            # Add the entity's definition to the entity's dictionary
            if item_prop.endswith('definition'):
                item_prop_dict['definition'] = item_val

            # Add the entity's name to the entity's dictionary
            if item_prop.endswith('label'):
                item_prop_dict['name'] = item_val

            # Everything below only applies to known annotation properties whose
            # value is the URI of a named entity
            if item_prop not in annotation_property_dict or item_val not in entity_names_dict:
                continue

            # Change the property and value names to human-friendly form
            item_prop = annotation_property_dict[item_prop]
            item_val = entity_names_dict[item_val]

            if item_prop == "age_of_onset":
                # Collect the numerical bounds of every known age-of-onset category
                bounds = age_of_onset_dict.get(item_val)
                if bounds is not None:
                    onset_lowers.append(bounds['age_onset_lower'])
                    onset_uppers.append(bounds['age_onset_upper'])

            elif item_prop in measure_values:
                # Change prevalence/incidence categories to floating point rates;
                # categories missing from freq_in_pop are skipped
                rate = freq_in_pop.get(item_val)
                if rate is not None:
                    measure_values[item_prop].append(rate)

            elif item_prop == regions_property:
                # Consolidate the regions into a list
                regions_list.append(item_val)

            else:
                # Any other annotation property is stored as-is (last value wins)
                item_prop_dict[item_prop] = item_val

        # Add the onset age: widest range covered by all categories of the entity
        if onset_uppers:
            item_prop_dict["age_onset_upper"] = max(onset_uppers)
            item_prop_dict["age_onset_lower"] = min(onset_lowers)

        # Add the prevalence/incidence values. When a measure has several values,
        # store their arithmetic mean (a running (previous + new) / 2 would weight
        # later values more heavily as soon as there are three or more).
        for measure, column in prevalence_columns.items():
            values = measure_values[measure]
            if values:
                item_prop_dict[column] = sum(values) / len(values)

        # Add the regions where epi data is available, serialised as a string
        if len(regions_list) > 0:
            item_prop_dict[regions_property] = str(regions_list)

    entity_list.append(item_prop_dict)

# Transform the data into a dataframe (it is written to CSV in a later cell)
ordo_entity_dataframe = pd.DataFrame(entity_list)
ordo_entity_dataframe.head()

Unnamed: 0,definition,ICD-10,MeSH,MedDRA,UMLS,modes_of_inheritance,name,ORPHA,age_onset_upper,age_onset_lower,...,OMIM,point_prevalence,annual_incidence,lifetime_prevalence,Ensembl,Genatlas,HGNC,SwissProt,Reactome,IUPHAR
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [25]:
# Show the columns of the entity dataframe with their non-null counts and dtypes
ordo_entity_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40101 entries, 0 to 40100
Data columns (total 22 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   definition                                      6726 non-null   object 
 1   ICD-10                                          7389 non-null   object 
 2   MeSH                                            1724 non-null   object 
 3   MedDRA                                          1139 non-null   object 
 4   UMLS                                            4722 non-null   object 
 5   modes_of_inheritance                            5530 non-null   object 
 6   name                                            15205 non-null  object 
 7   ORPHA                                           10735 non-null  object 
 8   age_onset_upper                                 6249 non-null   float64
 9   age_onset_lower                        

In [26]:
# Names of the rows that have no ORPHA identifier, with the number of
# occurrences of each name
missing_orpha = ordo_entity_dataframe['ORPHA'].isnull()
ordo_entity_dataframe.loc[missing_orpha, 'name'].value_counts()

calcium voltage-gated channel subunit alpha1 D                         1
CYLD lysine 63 deubiquitinase                                          1
syntaxin 3                                                             1
solute carrier family 7 member 14                                      1
minichromosome maintenance 9 homologous recombination repair factor    1
                                                                      ..
CD151 molecule (Raph blood group)                                      1
ubiquitin like modifier activating enzyme 5                            1
N-glycanase 1                                                          1
luteinizing hormone/choriogonadotropin receptor                        1
phenylalanine hydroxylase                                              1
Name: name, Length: 4470, dtype: int64

Looks like the things which lack an ORPHA ID number are not disease entities, so we can remove them before importing disease entities into a graph. We'll drop duplicate rows while we're at it.

In [29]:
# Clean-up in three steps: keep only the rows that carry an ORPHA ID, remove
# the columns left without any value, then remove exact duplicate rows.
ordo_entity_dataframe = (
    ordo_entity_dataframe
    .dropna(subset=['ORPHA'])
    .dropna(axis=1, how='all')
    .drop_duplicates()
)

ordo_entity_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10735 entries, 40 to 40069
Data columns (total 16 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   definition                                      6659 non-null   object 
 1   ICD-10                                          7389 non-null   object 
 2   MeSH                                            1724 non-null   object 
 3   MedDRA                                          1139 non-null   object 
 4   UMLS                                            4722 non-null   object 
 5   modes_of_inheritance                            5530 non-null   object 
 6   name                                            10735 non-null  object 
 7   ORPHA                                           10735 non-null  object 
 8   age_onset_upper                                 6249 non-null   float64
 9   age_onset_lower                       

Okay, all the entities at least have a name and an ORPHA ID. Now we can write out to CSV and import to a graph database.

In [30]:
# Write the dataframe to 'ordo_entities.csv' in the working directory, without the index column
ordo_entity_dataframe.to_csv('ordo_entities.csv', index=False)

Move the 'ordo_entities.csv' file we just created into the import folder for the graph database.

In [17]:
# Open a session on the local Neo4j instance.
# The password is never stored in the notebook: it is read from the
# NEO4J_PASSWORD environment variable when that is set (so the notebook can
# run unattended) and requested interactively otherwise.
import getpass
import os

from neo4j import GraphDatabase

password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass(
    "\nPlease enter the Neo4j database password to continue \n"
)

driver = GraphDatabase.driver(uri="bolt://localhost:7687", auth=('neo4j', password))
session = driver.session()

To-do:
- split the region_where_epidemiological_data_is_available property into a list on import

In [32]:
# Batch import the CSV into the database: one :OrphEntity node is created per
# row of 'ordo_entities.csv', which must be in the database's import folder.
# Age bounds and prevalence/incidence columns are converted with
# toInteger()/toFloat(); every other column is stored as read from the CSV.
# NOTE(review): CREATE adds new nodes on every execution, so re-running this
# cell duplicates the entities -- consider MERGE on orpha_id.
# NOTE(review): the property is spelled 'medra_id' although the column is
# 'MedDRA'; renaming it would affect any query already relying on that name.

command = '''
LOAD CSV WITH HEADERS FROM "file:///ordo_entities.csv" as row
CREATE (o:OrphEntity {
    name: row.name,
    definition: row.definition,
    icd_10_id: row['ICD-10'],
    mesh_id: row.MeSH,
    medra_id: row.MedDRA,
    umls_id: row.UMLS,
    omim_id: row.OMIM,
    modes_of_inheritance: row.modes_of_inheritance,
    orpha_id: row.ORPHA,
    age_onset_upper: toInteger(row.age_onset_upper),
    age_onset_lower: toInteger(row.age_onset_lower),
    birth_prevalence: toFloat(row.birth_prevalence),
    region_where_epidemiological_data_is_available: row.region_where_epidemiological_data_is_available,
    point_prevalence: toFloat(row.point_prevalence),
    annual_incidence: toFloat(row.annual_incidence),
    lifetime_prevalence: toFloat(row.lifetime_prevalence)
    })
'''
session.run(command)

<neo4j.work.result.Result at 0x7fd3e11a75b0>

## Exploring the HOOM Data

In [33]:
# The HOOM release file must already be present in the working directory;
# download the latest one from https://bioportal.bioontology.org/ontologies/HOOM
hoom_tree = ET.parse(source='hoom_orphanet.owl')
# Keep a handle on the document root: the cells below index into its children.
hoom_root = hoom_tree.getroot()

In [34]:
# Number of direct child elements of the HOOM root
len(hoom_root)

516474

In [35]:
# Preview the first 50 children of the HOOM root: for each one, list the
# child itself followed by all of its descendants.
for position in range(50):
    child = hoom_root[position]
    print("ITEM ", position)
    print([node for node in child.iter()])
    print('####################################')

ITEM  0
[<Element '{http://www.w3.org/2002/07/owl#}Prefix' at 0x7fd3dc9434f0>]
####################################
ITEM  1
[<Element '{http://www.w3.org/2002/07/owl#}Prefix' at 0x7fd3dc943540>]
####################################
ITEM  2
[<Element '{http://www.w3.org/2002/07/owl#}Prefix' at 0x7fd3dc9435e0>]
####################################
ITEM  3
[<Element '{http://www.w3.org/2002/07/owl#}Prefix' at 0x7fd3dc943630>]
####################################
ITEM  4
[<Element '{http://www.w3.org/2002/07/owl#}Prefix' at 0x7fd3dc943680>]
####################################
ITEM  5
[<Element '{http://www.w3.org/2002/07/owl#}Prefix' at 0x7fd3dc9436d0>]
####################################
ITEM  6
[<Element '{http://www.w3.org/2002/07/owl#}Prefix' at 0x7fd3dc943770>]
####################################
ITEM  7
[<Element '{http://www.w3.org/2002/07/owl#}Prefix' at 0x7fd3dc9437c0>]
####################################
ITEM  8
[<Element '{http://www.w3.org/2002/07/owl#}Prefix' at 0x7fd3dc94

In [39]:
# Inspect the components of a sample HOOM element (index 4000): print the tag,
# attributes and text of the element and of all its descendants.
# Only hoom_root is consulted here; a guard on ordo_root[4000] would test an
# element of the unrelated ORDO tree. No 'Class' tag filter is applied because
# the first HOOM children previewed earlier are owl:Prefix entries, so the
# top-level elements of this file are not guaranteed to be classes.
# NOTE(review): confirm whether a tag filter is wanted for HOOM at all.
sample_hoom_element = hoom_root[4000]
for item in sample_hoom_element.iter():
    print('TAG:  ', item.tag)
    print('ATTRIBUTE:  ', item.attrib)
    print('TEXT:  ', item.text)
    print('\n####################################\n')