## Data preprocessing

In [10]:
# imports for the whole notebook
from xml.etree import ElementTree as ET
import pandas as pd
import csv
import numpy as np
import math
from pathlib import Path
import re

### Convert XML dataset to CSV

In [5]:
# Dataset: clinical signs and symptoms in rare diseases
# http://www.orphadata.org/cgi-bin/index.php (Phenotypes associated with rare disorders)
# Parse the XML export once; `root` is the element the CSV conversion loop below starts from.
tree = ET.parse('data/en_product4.xml')
root = tree.getroot()


# Column order of the generated CSV (one entry per field filled in by the conversion loop)
headers = [
    # status entry and disorder description
    'HPODisorderSetStatus_id',
    'Disorder_id',
    'OrphaCode',
    'ExpertLink',
    'Name',
    'DisorderType_id',
    'DisorderType_name',
    'DisorderGroup_id',
    'DisorderGroup_Name',
    # one disorder <-> phenotype association
    'HPODisorderAssociation_id',
    'HPO_id',
    'HPOId',
    'HPOTerm',
    'HPOFrequency_id',
    'HPOFrequency_Name',
    'DiagnosticCriteria_id',
    'DiagnosticCriteria_Name',
    # fields read directly from the status entry
    'Source',
    'ValidationStatus',
    'Online',
    'ValidationDate',
]


def find_value(row_data, source_tag, target_tag_name, field, text=True):
    """Finds a sub-tag of a source tag and inputs its value into a dictionary containing the current row's data

    The field is always written. It is set to an empty string when the parent tag is
    None, when the sub-tag does not exist, when the sub-tag has no inner text, or when
    the sub-tag has no ``id`` attribute.

    Args:
        row_data (dict):
            The data for the current row associated with the csv fields
        source_tag (Element or None):
            XML parent tag to search from; None is accepted so that the result of a
            previous unsuccessful lookup can be passed straight back in
        target_tag_name (str):
            Name of the sub-tag to find
        field (str):
            Field in the csv file
        text (bool):
            Indicates if the value of the tag to retrieve is its inner text or its id attribute
    Returns:
        tag (Element or None):
            Returns the found tag, or None when there is nothing to find
    """
    # Compare with None explicitly: an Element without children is falsy.
    tag = source_tag.find(target_tag_name) if source_tag is not None else None
    tag_v = ''

    if tag is not None:  # retrieving either the inner text or the id attribute of the tag
        # .get() covers both "no attributes at all" and "attributes but no id"
        tag_v = tag.text if text else tag.attrib.get('id')

    # tag.text and attrib.get() may both be None; the csv field must still be a string
    row_data[field] = tag_v if tag_v is not None else ''

    return tag


# newline='' is what the csv module expects for files it writes to: the writer emits its
# own line terminators, and without it an extra '\r' is added on platforms that use
# '\r\n' line endings.
with open('data/en_product4.csv', 'w', encoding='utf-8', newline='') as fd:
    csvwriter = csv.DictWriter(fd, delimiter=',', fieldnames=headers)
    csvwriter.writeheader()

    # iterating through all the disorders
    for status in root.find('HPODisorderSetStatusList').findall('HPODisorderSetStatus'):
        # disorder-level fields are filled once and reused for every association row below
        row_data = {}
        row_data['HPODisorderSetStatus_id'] = status.attrib['id']

        disorder_tag = find_value(row_data, status, 'Disorder', 'Disorder_id', text=False)
        find_value(row_data, disorder_tag, 'OrphaCode', 'OrphaCode', text=True)
        find_value(row_data, disorder_tag, 'ExpertLink', 'ExpertLink', text=True)
        find_value(row_data, disorder_tag, 'Name', 'Name', text=True)

        disordertype_tag = find_value(row_data, disorder_tag, 'DisorderType', 'DisorderType_id', text=False)
        find_value(row_data, disordertype_tag, 'Name', 'DisorderType_name', text=True)
        disordergroup_tag = find_value(row_data, disorder_tag, 'DisorderGroup', 'DisorderGroup_id', text=False)
        find_value(row_data, disordergroup_tag, 'Name', 'DisorderGroup_Name', text=True)

        for field in ['Source', 'ValidationStatus', 'Online', 'ValidationDate']:
            find_value(row_data, status, field, field, text=True)

        # iterating through all the disorder associations and writing a row for each;
        # the path form returns an empty list when the disorder has no association list
        for association in disorder_tag.findall('HPODisorderAssociationList/HPODisorderAssociation'):
            # every association field is overwritten on each pass, so reusing row_data is safe
            row_data['HPODisorderAssociation_id'] = association.attrib['id']

            hpo_tag = find_value(row_data, association, 'HPO', 'HPO_id', text=False)
            find_value(row_data, hpo_tag, 'HPOId', 'HPOId', text=True)
            find_value(row_data, hpo_tag, 'HPOTerm', 'HPOTerm', text=True)
            hpofrequency_tag = find_value(row_data, association, 'HPOFrequency', 'HPOFrequency_id', text=False)
            find_value(row_data, hpofrequency_tag, 'Name', 'HPOFrequency_Name', text=True)

            diagnosticcriteria_tag = find_value(row_data, association, 'DiagnosticCriteria', 'DiagnosticCriteria_id', text=False)
            find_value(row_data, diagnosticcriteria_tag, 'Name', 'DiagnosticCriteria_Name', text=True)

            csvwriter.writerow(row_data)

In [39]:
# Reload the converted dataset and preview its first rows (head() shows 5 rows by default)
df = pd.read_csv('data/en_product4.csv')
df.head()

Unnamed: 0,HPODisorderSetStatus_id,Disorder_id,OrphaCode,ExpertLink,Name,DisorderType_id,DisorderType_name,DisorderGroup_id,DisorderGroup_Name,HPODisorderAssociation_id,...,HPOId,HPOTerm,HPOFrequency_id,HPOFrequency_Name,DiagnosticCriteria_id,DiagnosticCriteria_Name,Source,ValidationStatus,Online,ValidationDate
0,1,2,58,http://www.orpha.net/consor/cgi-bin/OC_Exp.php...,Alexander disease,21394,Disease,36547,Disorder,327485,...,HP:0000256,Macrocephaly,28412,Very frequent (99-80%),,,,y,y,2016-06-01 00:00:00.0
1,1,2,58,http://www.orpha.net/consor/cgi-bin/OC_Exp.php...,Alexander disease,21394,Disease,36547,Disorder,327486,...,HP:0001249,Intellectual disability,28412,Very frequent (99-80%),,,,y,y,2016-06-01 00:00:00.0
2,1,2,58,http://www.orpha.net/consor/cgi-bin/OC_Exp.php...,Alexander disease,21394,Disease,36547,Disorder,327487,...,HP:0001250,Seizures,28412,Very frequent (99-80%),,,,y,y,2016-06-01 00:00:00.0
3,1,2,58,http://www.orpha.net/consor/cgi-bin/OC_Exp.php...,Alexander disease,21394,Disease,36547,Disorder,327488,...,HP:0001257,Spasticity,28412,Very frequent (99-80%),,,,y,y,2016-06-01 00:00:00.0
4,1,2,58,http://www.orpha.net/consor/cgi-bin/OC_Exp.php...,Alexander disease,21394,Disease,36547,Disorder,327489,...,HP:0001274,Agenesis of corpus callosum,28412,Very frequent (99-80%),,,,y,y,2016-06-01 00:00:00.0


### Merge ORDO, HP and HOOM ontologies (OWL)

https://bioportal.bioontology.org/ontologies/ORDO?p=summary

https://bioportal.bioontology.org/ontologies/HP?p=summary

https://bioportal.bioontology.org/ontologies/HOOM?p=summary

Using Protégé, merge HP into HOOM and ORDO into HOOM

'HOOM is a module that qualifies the annotation between a clinical entity and phenotypic abnormalities according to a frequency and by integrating the notion of diagnostic criterion.'

### Merge ORDO and HP ontologies using the dataset (CSV)

In [None]:
# Load the two ontology exports and the converted dataset; every column is read as
# object (no numeric type inference) so the identifiers used as join keys stay textual.
df_ordo = pd.read_csv('data/ORDO.csv', dtype='object')
df_hp = pd.read_csv('data/HP.csv', dtype='object')
df_dataset = pd.read_csv('data/en_product4.csv', dtype='object')


# prefixes to distinguish the columns from the 2 ontologies
df_ordo = df_ordo.add_prefix('ORDO_')
df_hp = df_hp.add_prefix('HP_')

# normalizing the different columns for the merge
# The vectorised .str.replace strips the IRI prefix literally (regex=False) and leaves a
# missing Class ID as NaN instead of raising on it.
df_ordo['OrphaCode'] = df_ordo['ORDO_Class ID'].str.replace('http://www.orpha.net/ORDO/Orphanet_', '', regex=False)
df_hp['HPOId'] = df_hp['HP_http://www.w3.org/2004/02/skos/core#notation']
df_dataset['OrphaCode'] = df_dataset['OrphaCode'].astype(str)

# merge: left joins keep every dataset row even when an ontology has no matching entry
df_merged = pd.merge(df_dataset, df_hp, how='left', on='HPOId')
df_merged = pd.merge(df_merged, df_ordo, how='left', on='OrphaCode')

# only the first 1000 merged rows are written out
df_merged.head(1000).to_csv('data/merged_ontologies.csv', encoding='utf-8', index=False)

In [None]:
# Sanity check of the merge output against the source dataset.
# merged_ontologies.csv is written from `df_merged.head(1000)`, so df_res holds at most
# 1000 rows and its row count is not expected to match the full dataset.
df_dataset = pd.read_csv('data/en_product4.csv', dtype='object')
df_res = pd.read_csv('data/merged_ontologies.csv', dtype='object')
print(df_dataset.shape)
print(df_res.shape)
# Disorder name and HPO join key, next to one HP_-prefixed and one ORDO_-prefixed column
print(df_res[['Name', 'HPOId', 'HP_Class ID', 'ORDO_Class ID']])

### Dataset to triples, entities and relations

In [36]:
# Frequency label as it appears in the csv -> short frequency code
freq_assoc = dict([
    ('Obligate (100%)', 'O'),
    ('Very frequent (99-80%)', 'VF'),
    ('Frequent (79-30%)', 'F'),
    ('Occasional (29-5%)', 'OC'),
    ('Very rare (<4-1%)', 'VR'),
    ('Excluded (0%)', 'E'),
])

# Short frequency code -> output class
freq_code_assoc = dict(
    O='obligate',
    VF='very_frequent',
    F='frequent',
    OC='occasional',
    VR='very_rare',
    E='excluded',
)

# Diagnostic criteria label from the csv -> output class (default: exclusion)
dc_association = dict([
    ('Diagnostic criterion', 'diagnostic_criterion'),
    ('Pathognomonic sign', 'pathognomonic_sign'),
])


def get_association_subclass(orpha, freq, hp):
    """Builds the identifier of an association class

    Args:
        orpha (str):
            The prefixed Orphanet code
        freq (str):
            The frequency text; it is translated to its short code through `freq_assoc`
        hp (str):
            The prefixed HPO ID
    Returns:
        (str):
            '<orpha>_<hp>_FREQ:<frequency code>'
    """
    freq_code = freq_assoc.get(freq)
    # a frequency text missing from freq_assoc yields None and makes the concatenation fail
    return '_'.join((orpha, hp, 'FREQ:' + freq_code))


def get_association_name(orpha, freq, hp):
    """Builds the textual description of an association class

    The disorder and phenotype names are looked up in the module-level `orpha_entities`
    and `hpo_entities` dictionaries, so both ids must have been registered beforehand.

    Args:
        orpha (str):
            The prefixed Orphanet code
        freq (str):
            The frequency text
        hp (str):
            The prefixed HPO ID
    Returns:
        (str):
            '<disorder name> and <phenotype name> <frequency class> association' as a
            textual_description_with_underscores
    """
    disorder_name = orpha_entities.get(orpha)
    phenotype_name = hpo_entities.get(hp)
    frequency_class = freq_code_assoc.get(freq_assoc.get(freq))

    description = disorder_name + ' and ' + phenotype_name + ' ' + frequency_class + ' association'
    return get_normalized_string(description)


def get_normalized_string(s):
    """Lowercases a string and collapses every run of whitespace into a single underscore

    Args:
        s (str):
            String to normalize
    Returns:
        (str):
            Normalized string
    """
    # splitting on whitespace runs keeps empty leading/trailing pieces, so surrounding
    # whitespace still turns into an underscore once the pieces are joined back
    pieces = re.split(r"\s+", s.lower())
    return '_'.join(pieces)


df_dataset = pd.read_csv('data/en_product4.csv', dtype='object')
df_dataset['OrphaCode'] = df_dataset['OrphaCode'].map(lambda x: 'ORPHA:' + x)

# key is id, value is textual_description_with_underscores
assoc_entities = {}
dc_entities = {'diagnostic_criterion': 'diagnostic_criterion', 'pathognomonic_sign': 'pathognomonic_sign', 'exclusion':'exclusion'}
freq_assoc_entities = {'obligate': 'obligate', 'very_frequent': 'very_frequent', 'frequent': 'frequent', 
                       'occasional': 'occasional', 'very_rare': 'very_rare', 'excluded': 'excluded'}
hpo_entities = {}
orpha_entities = {}

has_object_triples = []  # association has_object HPOId
has_subject_triples = []  # association has_subject OrphaCode
has_frequency_triples = []  # association has_frequency FrequencyAssociation
has_diagnostic_criterion_triples = []  # association has_DC_attribute DC


# reading the dataset
for orpha, freq, hp, dc, orpha_name, hpo_name in zip(df_dataset['OrphaCode'], df_dataset['HPOFrequency_Name'], 
                                                     df_dataset['HPOId'], df_dataset['DiagnosticCriteria_Name'], 
                                                     df_dataset['Name'], df_dataset['HPOTerm']):
    # Validate the frequency label before anything else: the association helpers
    # concatenate its code into a string and would otherwise fail with an opaque
    # TypeError that does not say which label was unknown.
    freq_class = freq_code_assoc.get(freq_assoc.get(freq))
    if freq_class is None:
        raise ValueError('Unknown HPOFrequency_Name {!r} for {} / {}'.format(freq, orpha, hp))

    # register each phenotype / disorder once, under its first normalized name
    if hp not in hpo_entities: hpo_entities[hp] = get_normalized_string(hpo_name)
    if orpha not in orpha_entities: orpha_entities[orpha] = get_normalized_string(orpha_name)

    ac = get_association_subclass(orpha, freq, hp)
    ac_name = get_association_name(orpha, freq, hp)
    assoc_entities[ac] = ac_name

    has_object_triples.append((ac, 'association_has_object', hp))
    has_subject_triples.append((ac, 'association_has_subject', orpha))
    has_frequency_triples.append((ac, 'has_frequency', freq_class))
    # any diagnostic-criteria label missing from dc_association is recorded as 'exclusion'
    has_diagnostic_criterion_triples.append((ac, 'has_DC_attribute', dc_association.get(dc, 'exclusion')))

    
# lists corresponding to each output file
triples = []
triples_names = []

# subClassOf triples: every entity is attached to the parent class of its group
# NOTE(review): 'diagnostic_criterion' is both a dc_entities key and that group's parent,
# which produces a self-referencing subClassOf triple - confirm this is intended.
subclass_groups = [
    (assoc_entities, 'association'),
    (dc_entities, 'diagnostic_criterion'),
    (freq_assoc_entities, 'frequency_association'),
    (hpo_entities, 'HPO_Id'),
    (orpha_entities, 'OrphaCode'),
]
for entity_map, parent_class in subclass_groups:
    for entity_id, entity_name in entity_map.items():
        triples.append((entity_id, 'subClassOf', parent_class))
        triples_names.append((entity_name, 'subClassOf', parent_class))

# other properties triples: same triples twice, once with ids and once with names;
# frequency and diagnostic-criterion objects are already names, so they are kept as is
property_groups = [
    (has_object_triples, hpo_entities.get),
    (has_subject_triples, orpha_entities.get),
    (has_frequency_triples, lambda obj: obj),
    (has_diagnostic_criterion_triples, lambda obj: obj),
]
for property_triples, object_name in property_groups:
    for subj, rel, obj in property_triples:
        triples.append((subj, rel, obj))
        triples_names.append((assoc_entities.get(subj), rel, object_name(obj)))

# entities: one numeric index per id, shared between the id file and the name file
# NOTE(review): the parent classes 'association', 'frequency_association', 'HPO_Id' and
# 'OrphaCode' appear in the triples but get no index here - confirm consumers accept that.
entity_names_by_id = {**assoc_entities, **dc_entities, **freq_assoc_entities, **hpo_entities, **orpha_entities}
entities = list(enumerate(entity_names_by_id))
entities_names = list(enumerate(entity_names_by_id.values()))

# relations
relations = list(enumerate(['subClassOf', 'association_has_object', 'association_has_subject',
                            'has_frequency', 'has_DC_attribute']))
    

# writing to the different files
# Every file gets one tab-separated row per line. The encoding is pinned to UTF-8,
# matching the csv the data comes from, so the output does not depend on the platform's
# default encoding. str() is a no-op for the triples, whose elements are already strings;
# it is needed for the numeric indexes of the .dict files.
output_files = [
    ('data/triples.txt', triples),
    ('data/triples_names.txt', triples_names),
    ('data/entities.dict', entities),
    ('data/entities_names.dict', entities_names),
    ('data/relations.dict', relations),
]
for path, rows in output_files:
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines('\t'.join(str(e) for e in row) + '\n' for row in rows)