*MeNu GUIDE*
# Integrate Ontology and Database Info - Markers

In [1]:
import rdflib
import pandas as pd
from rdflib import URIRef, Literal, Namespace, RDF, XSD
import os

In [2]:
MeNuGUIDE = Namespace("http://MeNuGUIDE.local/")
ChEBI = Namespace("http://purl.obolibrary.org/obo/chebi/")
OBO = Namespace("http://purl.obolibrary.org/obo/")
GO = Namespace("http://www.geneontology.org/formats/oboInOwl#")
FOBI = Namespace("http://purl.obolibrary.org/obo/FOBI_")

In [3]:
MeNuGUIDE.Measurement

rdflib.term.URIRef('http://MeNuGUIDE.local/Measurement')

## Load Graph

In [None]:
markerdb_folder = "/path/to/markerdb/data/folder/"
ontology_folder = "/path/to/ontologies/"
processed_data_folder = "/path/to/processed/data/folder/"

In [4]:
onto_graph = rdflib.Graph()
onto_graph.parse(os.path.join(ontology_folder, "merged_with_foods_compounds_reactions.ttl"), format="turtle")

<Graph identifier=Na02196d5e708470a901ab294d3182d1d (<class 'rdflib.graph.Graph'>)>

## Prepare MarkerDB Data
### Chemical References

In [5]:
chemical_references = pd.read_csv(os.path.join(markerdb_folder, "markerdb_chemical_associations.csv"))

In [6]:
chemical_references

Unnamed: 0,markerdb_id,name_markerdb,condition_id,condition,cohort,sex,sample_type,concentration,unit,notes,reference_id,reference_name,pubmed_id
0,1,1-Methylhistidine,251,Obesity,Adult: >=18 yrs old,Both,Urine,10.9 (0.80-21.0) umol/mmol creatinine,umol/mmol creatinine,,654,"Tuma, P., Samcova, E. & Balinova, P. Determina...",15899597.0
1,1,1-Methylhistidine,1,Normal,Adult: >=18 yrs old,Both,Urine,85.8 (17.7-153.8) umol/mmol creatinine,umol/mmol creatinine,,367200,David F. Putnam Composition and Concentrative ...,
2,1,1-Methylhistidine,33,Alzheimer's Disease,Adult: >=18 yrs old,Both,Urine,15.7 (11.7-19.7) umol/mmol creatinine,umol/mmol creatinine,,129,"Fonteh, A. N., Harrington, R. J., Tsai, A., Li...",17031479.0
3,1,1-Methylhistidine,34,Pregnancy,Adult: >=18 yrs old,Female,Blood,50.0 uM,uM,,367357,,22494326.0
4,1,1-Methylhistidine,1,Normal,Adult: >=18 yrs old,Both,Blood,12.7 uM,uM,,116748,"Dohm, G. L., Williams, R. T., Kasperek, G. J. ...",7061274.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3766,5017,Triclosan,1,Normal,All,Both,Urine,0.0059 (0.0052-0.0068) umol/mmol creatinine,umol/mmol creatinine,,367558,National Health and Nutrition Examination Surv...,
3767,5017,Triclosan,6689,Triclosan Exposure,All,Both,Urine,0.25 (0.20-0.31) umol/mmol creatinine,umol/mmol creatinine,Total population (2011 - 2012),367558,National Health and Nutrition Examination Surv...,
3768,5017,Triclosan,1,Normal,All,Both,Urine,0.0052 (0.0046-0.0059) umol/mmol creatinine,umol/mmol creatinine,,367558,National Health and Nutrition Examination Surv...,
3769,5017,Triclosan,6689,Triclosan Exposure,All,Both,Urine,0.14 (0.11-0.17) umol/mmol creatinine,umol/mmol creatinine,Total population (2013 - 2014),367558,National Health and Nutrition Examination Surv...,


In [7]:
conditions = list(chemical_references.condition.unique())

In [8]:
exposures = []
diseases = []

for condition in conditions:
    if 'exposure' in condition.lower():
        exposures.append(condition)
    else:
        diseases.append(condition)

In [9]:
len(exposures)

59

In [10]:
len(diseases)

293

In [11]:
def find_term(search_term, label_type=''):
    query = f"""
    SELECT ?entity ?label
    WHERE {{
      ?entity rdfs:label "{search_term}"{label_type} .
    }}
    """
    
    query_2 = f"""
    SELECT ?entity ?label
    WHERE {{
      ?entity rdfs:label ?label .
      FILTER (LCASE(STR(?label)) = LCASE("{search_term}"))
    }}
    """
    
    # Execute the query
    results = onto_graph.query(query_2)
    return list(results)

def find_term_by_synonym(search_term):
    query = f"""
    SELECT ?entity ?label
    WHERE {{
      ?entity <http://www.geneontology.org/formats/obolnOwl#hasExactSynonym> ?label .
      FILTER (LCASE(STR(?label)) = LCASE("{search_term}"))
    }}
    """
    
    results = onto_graph.query(query)
    return list(results)

In [12]:
matched_diseases = {}
unmatched_diseases = [] 

for disease in diseases:
    print(disease)
    matches = find_term(disease)
    matches.extend(find_term_by_synonym(disease))
    
    if matches:
        matched_diseases[disease] = matches
    else:
        unmatched_diseases.append(disease)

Obesity
Normal
Alzheimer's Disease
Pregnancy
Preeclampsia/Eclampsia
Chronic Kidney Disease
Leukemia
Pyruvate Dehydrogenase Deficiency
Prolactinoma
Fumarase Deficiency
Pyruvate Carboxylase Deficiency
Diabetes Mellitus Type 2
Meningitis
Anoxia
Medium Chain Acyl Co A Dehydrogenase Deficiency
Pyruvate Dehydrogenase Phosphatase Deficiency
Long Chain 3 Hydroxyacyl Co A Dehydrogenase Deficiency
3 Hydroxyacyl Co A Dehydrogenase Deficiency
3 Hydroxy 3 Methylglutaryl Co A Lyase Deficiency
Carnitine Palmitoyltransferase I Deficiency
Schizophrenia
Eosinophilic Esophagitis
11 Beta Hydroxylase Deficiency
Congenital Adrenal Hyperplaia due to 17-alpha-Hydroxylase Deficiency
Maple Syrup Urine Disease
Phenylketonuria
Lung Cancer
Parkinson's Disease
Mild Metachromatic Leukodystrophy
Myocardial Infarction
Head Injury
Carnosinuria
Lewy Body Dementia
Lipoid Congenital Adrenal Hyperplasia
Antenatal Bartter Syndrome Type 2
Type 4B Bartter Syndrome
Antenatal Bartter Syndrome Type 1
Congenital Adrenal Insuffici

In [13]:
len(matched_diseases)

143

In [14]:
len(unmatched_diseases)

150

In [15]:
matched_diseases_copy = matched_diseases.copy()

In [16]:
matched_diseases_copy

{'Obesity': [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_9970'),
   rdflib.term.Literal('obesity', lang='en')),
  (rdflib.term.URIRef('http://purl.obolibrary.org/obo/HP_0001513'),
   rdflib.term.Literal('Obesity'))],
 'Normal': [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/PATO_0000461'),
   rdflib.term.Literal('normal'))],
 "Alzheimer's Disease": [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_10652'),
   rdflib.term.Literal("Alzheimer's disease", lang='en'))],
 'Chronic Kidney Disease': [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_784'),
   rdflib.term.Literal('chronic kidney disease', lang='en')),
  (rdflib.term.URIRef('http://purl.obolibrary.org/obo/HP_0012622'),
   rdflib.term.Literal('Chronic kidney disease'))],
 'Leukemia': [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/HP_0001909'),
   rdflib.term.Literal('Leukemia')),
  (rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_1240'),
   rdflib.term.Literal('leukemia', lang='en')

In [17]:
unmatched_diseases

['Pregnancy',
 'Preeclampsia/Eclampsia',
 'Pyruvate Dehydrogenase Deficiency',
 'Pyruvate Carboxylase Deficiency',
 'Diabetes Mellitus Type 2',
 'Medium Chain Acyl Co A Dehydrogenase Deficiency',
 'Pyruvate Dehydrogenase Phosphatase Deficiency',
 'Long Chain 3 Hydroxyacyl Co A Dehydrogenase Deficiency',
 '3 Hydroxyacyl Co A Dehydrogenase Deficiency',
 '3 Hydroxy 3 Methylglutaryl Co A Lyase Deficiency',
 '11 Beta Hydroxylase Deficiency',
 'Congenital Adrenal Hyperplaia due to 17-alpha-Hydroxylase Deficiency',
 'Mild Metachromatic Leukodystrophy',
 'Head Injury',
 'Lipoid Congenital Adrenal Hyperplasia',
 'Antenatal Bartter Syndrome Type 2',
 'Type 4B Bartter Syndrome',
 'Antenatal Bartter Syndrome Type 1',
 'Congenital Adrenal Insufficiency with 46,XY Sex Reversal',
 'Corticosterone Methyloxidase Type I Deficiency',
 'A.I.D.S.',
 'Peritoneal Dialysis',
 'Hemodialysis',
 'Hyperoxalemia',
 'Adenylosuccinate Lyase Deficiency',
 'Adult Onset Type II Citrullinemia',
 'N Acetylglutamate Synth

In [18]:
def get_all_triples_for_iri(iri_to_check):
    sparql_query = f"""
    SELECT ?s ?p ?o
    WHERE {{
        {{ <{iri_to_check}> ?p ?o . }}
        UNION
        {{ ?s ?p <{iri_to_check}> . }}
    }}
    """
    
    # Execute the SPARQL query
    results = onto_graph.query(sparql_query)
    return list(results)

In [19]:
for result in get_all_triples_for_iri(URIRef("http://purl.obolibrary.org/obo/DOID_10591")):
    print(result)

(None, rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Class'))
(None, rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label'), rdflib.term.Literal('pre-eclampsia', lang='en'))
(None, rdflib.term.URIRef('http://purl.obolibrary.org/obo/IAO_0000115'), rdflib.term.Literal('A hypertension occurring during pregnancy characterized by large amounts of protein in the urine (proteinuria) and edema, usually by the last trimester of pregnancy.', lang='en'))
(None, rdflib.term.URIRef('http://www.geneontology.org/formats/oboInOwl#hasAlternativeId'), rdflib.term.Literal('DOID:12684'))
(None, rdflib.term.URIRef('http://www.geneontology.org/formats/oboInOwl#hasDbXref'), rdflib.term.Literal('ICD10CM:O14'))
(None, rdflib.term.URIRef('http://www.geneontology.org/formats/oboInOwl#hasDbXref'), rdflib.term.Literal('MESH:D011225'))
(None, rdflib.term.URIRef('http://www.geneontology.org/formats/oboInOwl#hasDbXref'), rdflib.term.

In [20]:
unmatched_diseases_df = pd.DataFrame(unmatched_diseases, columns=['disease'])
unmatched_diseases_df['uri'] = ''

In [None]:
unmatched_diseases.to_csv(os.path.join(markerdb_folder, "diseases_to_match_manually.csv"))

$\rightarrow$ Manual matching

In [22]:
manually_matched_diseases = pd.read_csv(os.path.join(markerdb_folder, "manually_matched_diseases.csv"), sep=';')

In [23]:
matched_diseases_copy

{'Obesity': [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_9970'),
   rdflib.term.Literal('obesity', lang='en')),
  (rdflib.term.URIRef('http://purl.obolibrary.org/obo/HP_0001513'),
   rdflib.term.Literal('Obesity'))],
 'Normal': [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/PATO_0000461'),
   rdflib.term.Literal('normal'))],
 "Alzheimer's Disease": [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_10652'),
   rdflib.term.Literal("Alzheimer's disease", lang='en'))],
 'Chronic Kidney Disease': [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_784'),
   rdflib.term.Literal('chronic kidney disease', lang='en')),
  (rdflib.term.URIRef('http://purl.obolibrary.org/obo/HP_0012622'),
   rdflib.term.Literal('Chronic kidney disease'))],
 'Leukemia': [(rdflib.term.URIRef('http://purl.obolibrary.org/obo/HP_0001909'),
   rdflib.term.Literal('Leukemia')),
  (rdflib.term.URIRef('http://purl.obolibrary.org/obo/DOID_1240'),
   rdflib.term.Literal('leukemia', lang='en')

In [24]:
multiple_matches_count = 0

matched_diseases_cleaned = []

for disease, matches in matched_diseases_copy.items():
    if len(matches) > 1:
        multiple_matches_count += 1
        found_doid_term = False
        found_hp_term = False
        for match in matches:
            match_uri = match[0]
            if 'DOID'  in match_uri:
                found_doid_term = match_uri
            if 'HP_' in match_uri:
                found_hp_term = match_uri
        
        if found_doid_term:
            matched_diseases_cleaned.append([disease, found_doid_term])
        elif found_hp_term:
            matched_diseases_cleaned.append([disease, found_hp_term])
    else:
        matched_diseases_cleaned.append([disease, matches[0][0]])
        
multiple_matches_count

49

In [25]:
matched_diseases_cleaned_df = pd.DataFrame(matched_diseases_cleaned, columns=['disease', 'uri'])

In [26]:
manually_matched_diseases = manually_matched_diseases[manually_matched_diseases.uri.notna()]

In [27]:
matched_disease_uris = pd.concat([matched_diseases_cleaned_df, manually_matched_diseases])

In [28]:
chemical_references_with_uri = pd.merge(chemical_references, matched_disease_uris, left_on='condition', right_on='disease', how='inner')

In [29]:
chemical_references_with_uri

Unnamed: 0,markerdb_id,name_markerdb,condition_id,condition,cohort,sex,sample_type,concentration,unit,notes,reference_id,reference_name,pubmed_id,disease,uri
0,1,1-Methylhistidine,251,Obesity,Adult: >=18 yrs old,Both,Urine,10.9 (0.80-21.0) umol/mmol creatinine,umol/mmol creatinine,,654,"Tuma, P., Samcova, E. & Balinova, P. Determina...",15899597.0,Obesity,http://purl.obolibrary.org/obo/DOID_9970
1,1,1-Methylhistidine,1,Normal,Adult: >=18 yrs old,Both,Urine,85.8 (17.7-153.8) umol/mmol creatinine,umol/mmol creatinine,,367200,David F. Putnam Composition and Concentrative ...,,Normal,http://purl.obolibrary.org/obo/PATO_0000461
2,1,1-Methylhistidine,33,Alzheimer's Disease,Adult: >=18 yrs old,Both,Urine,15.7 (11.7-19.7) umol/mmol creatinine,umol/mmol creatinine,,129,"Fonteh, A. N., Harrington, R. J., Tsai, A., Li...",17031479.0,Alzheimer's Disease,http://purl.obolibrary.org/obo/DOID_10652
3,1,1-Methylhistidine,1,Normal,Adult: >=18 yrs old,Both,Blood,12.7 uM,uM,,116748,"Dohm, G. L., Williams, R. T., Kasperek, G. J. ...",7061274.0,Normal,http://purl.obolibrary.org/obo/PATO_0000461
4,1,1-Methylhistidine,5988,Preeclampsia/Eclampsia,Adult: >=18 yrs old,Female,Blood,50.7 uM,uM,,367357,,22494326.0,Preeclampsia/Eclampsia,http://purl.obolibrary.org/obo/DOID_10591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3094,5017,Triclosan,1,Normal,All,Both,Urine,0.0070 (0.0063-0.0079) umol/mmol creatinine,umol/mmol creatinine,,367558,National Health and Nutrition Examination Surv...,,Normal,http://purl.obolibrary.org/obo/PATO_0000461
3095,5017,Triclosan,1,Normal,All,Both,Urine,0.0061 (0.0054-0.0068) umol/mmol creatinine,umol/mmol creatinine,,367558,National Health and Nutrition Examination Surv...,,Normal,http://purl.obolibrary.org/obo/PATO_0000461
3096,5017,Triclosan,1,Normal,All,Both,Urine,0.0059 (0.0052-0.0068) umol/mmol creatinine,umol/mmol creatinine,,367558,National Health and Nutrition Examination Surv...,,Normal,http://purl.obolibrary.org/obo/PATO_0000461
3097,5017,Triclosan,1,Normal,All,Both,Urine,0.0052 (0.0046-0.0059) umol/mmol creatinine,umol/mmol creatinine,,367558,National Health and Nutrition Examination Surv...,,Normal,http://purl.obolibrary.org/obo/PATO_0000461


In [30]:
compound_identifiers = pd.read_csv(os.path.join(processed_data_folder, "compounds_all_databases_merged_ids.csv"), dtype={'markerdb_id': 'string', 'kegg_id': 'string', 'vmh_id': 'string'})
compound_identifiers = compound_identifiers[compound_identifiers.markerdb_id.notna()]

In [31]:
compound_identifiers

Unnamed: 0.1,Unnamed: 0,menuguide_id,name,hmdb_id,foodb_id,chebi_id,markerdb_id,kegg_id,exposome_explorer_id,vmh_id,foodb_id_internal
13,13,compound_14,cortexolone,HMDB0000015,FDB021872,86600.0,MDB00000009,C05488,,11docrtsl,22035.0
33,33,compound_34,12-hete,HMDB0006111,FDB001435,19138.0,MDB00001296,C14777,,12harachd,1435.0
46,46,compound_47,propylene glycol,HMDB0001881,FDB008274,28972.0,MDB00000688,C02912,,12ppd_r,8275.0
76,76,compound_77,17-hydroxyprogesterone,HMDB0000374,FDB021992,17252.0,MDB00000222,C01176,,17ahprgstrn,22155.0
110,110,compound_111,1-methylnicotinamide,HMDB0000699,FDB022188,16797.0,MDB00000374,C02918,,1mncam,22351.0
...,...,...,...,...,...,...,...,...,...,...,...
403982,403982,compound_403983,22:1-18:3-pc,HMDB0008567,FDB025757,191356.0,MDB00002859,,,,25920.0
404070,404070,compound_404071,17a-hydroxypregnenolone,HMDB0000363,FDB030278,28750.0,MDB00000219,C05138,,17ahprgnlone,30441.0
404192,404192,compound_404193,norepinephrine,HMDB0000216,FDB097324,18357.0,MDB00000136,C00547,2076.0,nrpphr,125156.0
404256,404256,compound_404257,acetic acid,HMDB0000042,FDB019725,15366.0,MDB00000029,C00033,1819.0,ac,19732.0


In [32]:
chemical_references_with_uri

Unnamed: 0,markerdb_id,name_markerdb,condition_id,condition,cohort,sex,sample_type,concentration,unit,notes,reference_id,reference_name,pubmed_id,disease,uri
0,1,1-Methylhistidine,251,Obesity,Adult: >=18 yrs old,Both,Urine,10.9 (0.80-21.0) umol/mmol creatinine,umol/mmol creatinine,,654,"Tuma, P., Samcova, E. & Balinova, P. Determina...",15899597.0,Obesity,http://purl.obolibrary.org/obo/DOID_9970
1,1,1-Methylhistidine,1,Normal,Adult: >=18 yrs old,Both,Urine,85.8 (17.7-153.8) umol/mmol creatinine,umol/mmol creatinine,,367200,David F. Putnam Composition and Concentrative ...,,Normal,http://purl.obolibrary.org/obo/PATO_0000461
2,1,1-Methylhistidine,33,Alzheimer's Disease,Adult: >=18 yrs old,Both,Urine,15.7 (11.7-19.7) umol/mmol creatinine,umol/mmol creatinine,,129,"Fonteh, A. N., Harrington, R. J., Tsai, A., Li...",17031479.0,Alzheimer's Disease,http://purl.obolibrary.org/obo/DOID_10652
3,1,1-Methylhistidine,1,Normal,Adult: >=18 yrs old,Both,Blood,12.7 uM,uM,,116748,"Dohm, G. L., Williams, R. T., Kasperek, G. J. ...",7061274.0,Normal,http://purl.obolibrary.org/obo/PATO_0000461
4,1,1-Methylhistidine,5988,Preeclampsia/Eclampsia,Adult: >=18 yrs old,Female,Blood,50.7 uM,uM,,367357,,22494326.0,Preeclampsia/Eclampsia,http://purl.obolibrary.org/obo/DOID_10591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3094,5017,Triclosan,1,Normal,All,Both,Urine,0.0070 (0.0063-0.0079) umol/mmol creatinine,umol/mmol creatinine,,367558,National Health and Nutrition Examination Surv...,,Normal,http://purl.obolibrary.org/obo/PATO_0000461
3095,5017,Triclosan,1,Normal,All,Both,Urine,0.0061 (0.0054-0.0068) umol/mmol creatinine,umol/mmol creatinine,,367558,National Health and Nutrition Examination Surv...,,Normal,http://purl.obolibrary.org/obo/PATO_0000461
3096,5017,Triclosan,1,Normal,All,Both,Urine,0.0059 (0.0052-0.0068) umol/mmol creatinine,umol/mmol creatinine,,367558,National Health and Nutrition Examination Surv...,,Normal,http://purl.obolibrary.org/obo/PATO_0000461
3097,5017,Triclosan,1,Normal,All,Both,Urine,0.0052 (0.0046-0.0059) umol/mmol creatinine,umol/mmol creatinine,,367558,National Health and Nutrition Examination Surv...,,Normal,http://purl.obolibrary.org/obo/PATO_0000461


In [33]:
markerdb_id_len = 8
markerdb_id_initials = "MDB"

def format_markerdb_id(markerdb_id):
    markerdb_id = str(markerdb_id)
    number_count = len(markerdb_id)
    
    markerdb_id_formatted = f"{markerdb_id_initials}{'0'*(markerdb_id_len-number_count)}{markerdb_id}"
    return markerdb_id_formatted

chemical_references_with_uri.loc[:, 'markerdb_id'] = chemical_references_with_uri.markerdb_id.apply(format_markerdb_id)

 'MDB00005017']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  chemical_references_with_uri.loc[:, 'markerdb_id'] = chemical_references_with_uri.markerdb_id.apply(format_markerdb_id)


In [34]:
measurements_with_ids = pd.merge(chemical_references_with_uri, compound_identifiers, on='markerdb_id', how='inner')

In [35]:
measurements_with_ids = measurements_with_ids[['menuguide_id', 'uri', 'cohort', 'sample_type', 'sex', 'concentration', 'unit', 'reference_name', 'pubmed_id']]
measurements_with_ids

Unnamed: 0,menuguide_id,uri,cohort,sample_type,sex,concentration,unit,reference_name,pubmed_id
0,compound_6317,http://purl.obolibrary.org/obo/DOID_9970,Adult: >=18 yrs old,Urine,Both,10.9 (0.80-21.0) umol/mmol creatinine,umol/mmol creatinine,"Tuma, P., Samcova, E. & Balinova, P. Determina...",15899597.0
1,compound_6317,http://purl.obolibrary.org/obo/PATO_0000461,Adult: >=18 yrs old,Urine,Both,85.8 (17.7-153.8) umol/mmol creatinine,umol/mmol creatinine,David F. Putnam Composition and Concentrative ...,
2,compound_6317,http://purl.obolibrary.org/obo/DOID_10652,Adult: >=18 yrs old,Urine,Both,15.7 (11.7-19.7) umol/mmol creatinine,umol/mmol creatinine,"Fonteh, A. N., Harrington, R. J., Tsai, A., Li...",17031479.0
3,compound_6317,http://purl.obolibrary.org/obo/PATO_0000461,Adult: >=18 yrs old,Blood,Both,12.7 uM,uM,"Dohm, G. L., Williams, R. T., Kasperek, G. J. ...",7061274.0
4,compound_6317,http://purl.obolibrary.org/obo/DOID_10591,Adult: >=18 yrs old,Blood,Female,50.7 uM,uM,,22494326.0
...,...,...,...,...,...,...,...,...,...
2982,compound_5681,http://purl.obolibrary.org/obo/DOID_9970,Adolescent:13-18 yrs old,Urine,Both,0.36 (0.32-0.40) umol/mmol creatinine,umol/mmol creatinine,,26910390.0
2983,compound_5681,http://purl.obolibrary.org/obo/PATO_0000461,Adolescent:13-18 yrs old,Urine,Both,0.23 (0.20-0.26) umol/mmol creatinine,umol/mmol creatinine,,26910390.0
2984,compound_5680,http://purl.obolibrary.org/obo/DOID_9970,Adolescent:13-18 yrs old,Urine,Both,2.2 (2.1-2.3) umol/mmol creatinine,umol/mmol creatinine,,26910390.0
2985,compound_5680,http://purl.obolibrary.org/obo/PATO_0000461,Adult: >=18 yrs old,Urine,Both,0.0023 (0.00090-0.0058) umol/mmol creatinine,umol/mmol creatinine,,24023812.0


In [36]:
measurements_with_ids = measurements_with_ids.reset_index(drop=True)
measurements_with_ids = measurements_with_ids.reset_index(names='menuguide_measurement_id')
measurements_with_ids

Unnamed: 0,menuguide_measurement_id,menuguide_id,uri,cohort,sample_type,sex,concentration,unit,reference_name,pubmed_id
0,0,compound_6317,http://purl.obolibrary.org/obo/DOID_9970,Adult: >=18 yrs old,Urine,Both,10.9 (0.80-21.0) umol/mmol creatinine,umol/mmol creatinine,"Tuma, P., Samcova, E. & Balinova, P. Determina...",15899597.0
1,1,compound_6317,http://purl.obolibrary.org/obo/PATO_0000461,Adult: >=18 yrs old,Urine,Both,85.8 (17.7-153.8) umol/mmol creatinine,umol/mmol creatinine,David F. Putnam Composition and Concentrative ...,
2,2,compound_6317,http://purl.obolibrary.org/obo/DOID_10652,Adult: >=18 yrs old,Urine,Both,15.7 (11.7-19.7) umol/mmol creatinine,umol/mmol creatinine,"Fonteh, A. N., Harrington, R. J., Tsai, A., Li...",17031479.0
3,3,compound_6317,http://purl.obolibrary.org/obo/PATO_0000461,Adult: >=18 yrs old,Blood,Both,12.7 uM,uM,"Dohm, G. L., Williams, R. T., Kasperek, G. J. ...",7061274.0
4,4,compound_6317,http://purl.obolibrary.org/obo/DOID_10591,Adult: >=18 yrs old,Blood,Female,50.7 uM,uM,,22494326.0
...,...,...,...,...,...,...,...,...,...,...
2982,2982,compound_5681,http://purl.obolibrary.org/obo/DOID_9970,Adolescent:13-18 yrs old,Urine,Both,0.36 (0.32-0.40) umol/mmol creatinine,umol/mmol creatinine,,26910390.0
2983,2983,compound_5681,http://purl.obolibrary.org/obo/PATO_0000461,Adolescent:13-18 yrs old,Urine,Both,0.23 (0.20-0.26) umol/mmol creatinine,umol/mmol creatinine,,26910390.0
2984,2984,compound_5680,http://purl.obolibrary.org/obo/DOID_9970,Adolescent:13-18 yrs old,Urine,Both,2.2 (2.1-2.3) umol/mmol creatinine,umol/mmol creatinine,,26910390.0
2985,2985,compound_5680,http://purl.obolibrary.org/obo/PATO_0000461,Adult: >=18 yrs old,Urine,Both,0.0023 (0.00090-0.0058) umol/mmol creatinine,umol/mmol creatinine,,24023812.0


In [37]:
measurements_with_ids.loc[:, 'menuguide_measurement_id'] = measurements_with_ids.menuguide_measurement_id.apply(lambda x: f"measurement_{x}")

 'measurement_2985' 'measurement_2986']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  measurements_with_ids.loc[:, 'menuguide_measurement_id'] = measurements_with_ids.menuguide_measurement_id.apply(lambda x: f"measurement_{x}")


In [38]:
measurements_with_ids

Unnamed: 0,menuguide_measurement_id,menuguide_id,uri,cohort,sample_type,sex,concentration,unit,reference_name,pubmed_id
0,measurement_0,compound_6317,http://purl.obolibrary.org/obo/DOID_9970,Adult: >=18 yrs old,Urine,Both,10.9 (0.80-21.0) umol/mmol creatinine,umol/mmol creatinine,"Tuma, P., Samcova, E. & Balinova, P. Determina...",15899597.0
1,measurement_1,compound_6317,http://purl.obolibrary.org/obo/PATO_0000461,Adult: >=18 yrs old,Urine,Both,85.8 (17.7-153.8) umol/mmol creatinine,umol/mmol creatinine,David F. Putnam Composition and Concentrative ...,
2,measurement_2,compound_6317,http://purl.obolibrary.org/obo/DOID_10652,Adult: >=18 yrs old,Urine,Both,15.7 (11.7-19.7) umol/mmol creatinine,umol/mmol creatinine,"Fonteh, A. N., Harrington, R. J., Tsai, A., Li...",17031479.0
3,measurement_3,compound_6317,http://purl.obolibrary.org/obo/PATO_0000461,Adult: >=18 yrs old,Blood,Both,12.7 uM,uM,"Dohm, G. L., Williams, R. T., Kasperek, G. J. ...",7061274.0
4,measurement_4,compound_6317,http://purl.obolibrary.org/obo/DOID_10591,Adult: >=18 yrs old,Blood,Female,50.7 uM,uM,,22494326.0
...,...,...,...,...,...,...,...,...,...,...
2982,measurement_2982,compound_5681,http://purl.obolibrary.org/obo/DOID_9970,Adolescent:13-18 yrs old,Urine,Both,0.36 (0.32-0.40) umol/mmol creatinine,umol/mmol creatinine,,26910390.0
2983,measurement_2983,compound_5681,http://purl.obolibrary.org/obo/PATO_0000461,Adolescent:13-18 yrs old,Urine,Both,0.23 (0.20-0.26) umol/mmol creatinine,umol/mmol creatinine,,26910390.0
2984,measurement_2984,compound_5680,http://purl.obolibrary.org/obo/DOID_9970,Adolescent:13-18 yrs old,Urine,Both,2.2 (2.1-2.3) umol/mmol creatinine,umol/mmol creatinine,,26910390.0
2985,measurement_2985,compound_5680,http://purl.obolibrary.org/obo/PATO_0000461,Adult: >=18 yrs old,Urine,Both,0.0023 (0.00090-0.0058) umol/mmol creatinine,umol/mmol creatinine,,24023812.0


In [39]:
chemical_references_with_uri.cohort.unique()

array(['Adult: >=18 yrs old', 'Infant: 1 wk-2 yrs old',
       'Children: 2-17 yrs old', 'Newborn: 0 day-1 wk', 'Unknown',
       'Adolescent:13-18 yrs old', 'Elderly: 65-100+ yrs old', 'All'],
      dtype=object)

In [40]:
chemical_references_with_uri.sex.unique()

array(['Both', 'Female', 'Male', 'Unknown'], dtype=object)

In [41]:
measurements_with_ids.unit.unique()

array(['umol/mmol creatinine', 'uM', 'nmol/g wet feces'], dtype=object)

In [42]:
measurements_with_ids.loc[:, 'concentration'] = measurements_with_ids.concentration.apply(lambda x: float(x.split(' ')[0]))

In [43]:
measurements_with_ids.loc[:, 'pubmed_id'] = measurements_with_ids.pubmed_id.apply(lambda x: f"PMID:{str(x).strip('.0')}" if pd.notna(x) else x)

 nan]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  measurements_with_ids.loc[:, 'pubmed_id'] = measurements_with_ids.pubmed_id.apply(lambda x: f"PMID:{str(x).strip('.0')}" if pd.notna(x) else x)


In [44]:
measurements_with_ids.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2987 entries, 0 to 2986
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   menuguide_measurement_id  2987 non-null   object
 1   menuguide_id              2987 non-null   object
 2   uri                       2987 non-null   object
 3   cohort                    2987 non-null   object
 4   sample_type               2987 non-null   object
 5   sex                       2987 non-null   object
 6   concentration             2987 non-null   object
 7   unit                      2987 non-null   object
 8   reference_name            1510 non-null   object
 9   pubmed_id                 2481 non-null   object
dtypes: object(10)
memory usage: 233.5+ KB


In [45]:
query = f"""
    SELECT ?entity ?label
    WHERE {{
      ?entity rdfs:label "HasBiomarker" .
    }}
    """
    
    # Execute the query
results = onto_graph.query(query)
list(results)

[(rdflib.term.URIRef('http://purl.obolibrary.org/obo/FOBI_00423'), None)]

In [46]:
sample_type_dict = {
    "Urine": "http://purl.obolibrary.org/obo/UBERON_0001088",
    "Blood": "http://purl.obolibrary.org/obo/UBERON_0000178",
    "Cerebrospinal_Fluid": "http://purl.obolibrary.org/obo/UBERON_0001359",
    "Cellular_Cytoplasm": "http://purl.obolibrary.org/obo/GO_0005737",
    "Saliva": "http://purl.obolibrary.org/obo/UBERON_0001836",
    "Feces": "http://purl.obolibrary.org/obo/UBERON_0001988",
    "Sweat": "http://purl.obolibrary.org/obo/UBERON_0001089",
    "Serum": "http://purl.obolibrary.org/obo/UBERON_0001977"
}

In [47]:
def add_measurement_to_graph(row):
    measurement = URIRef(MeNuGUIDE[row['menuguide_measurement_id']])
    compound = URIRef(MeNuGUIDE[row['menuguide_id']])
    condition = URIRef(row['uri'])
    cohort = row['cohort']
    sample_type = URIRef(sample_type_dict[row['sample_type']])
    sex = row['sex']
    concentration = Literal(row['concentration'], datatype=XSD.float)
    unit = Literal(row['unit'])
    reference = row['pubmed_id'] if pd.notna(row['pubmed_id']) else row['reference_name']
    
    onto_graph.add((measurement, RDF.type, MeNuGUIDE.Measurement))
    onto_graph.add((condition, RDF.type, MeNuGUIDE.Condition))
    
    # hasMeasurement - MeNuGUIDE.hasMeasurement
    onto_graph.add((condition, MeNuGUIDE.hasMeasurement, measurement))
    
    # isMeasurementOf - MeNuGUIDE.isMeasurementOf
    onto_graph.add((measurement, MeNuGUIDE.isMeasurementOf, condition))
    
    # amount - MeNuGUIDE.amount
    onto_graph.add((measurement, MeNuGUIDE.amount, concentration))
    
    # unit - MeNuGUIDE.unit
    onto_graph.add((measurement, MeNuGUIDE.unit, unit))
    
    # hasCohort - MeNuGUIDE.hasCohort
    if cohort != 'All' and cohort != 'Unknown':
        onto_graph.add((measurement, MeNuGUIDE.hasCohort, Literal(cohort)))
    
    # hasSex - MeNuGUIDE.hasSex
    if sex != 'Unknown' and sex != 'Both':
        onto_graph.add((measurement, MeNuGUIDE.hasSex, Literal(sex)))
    
    # hasSampleType - MeNuGUIDE.hasSampleType
    onto_graph.add((measurement, MeNuGUIDE.hasSampleType, sample_type))
    
    # reference or PMID - MeNuGUIDE.hasReference
    if pd.notna(reference):
        onto_graph.add((measurement, MeNuGUIDE.hasReference, Literal(reference)))
    
    # BiomarkerOf - http://purl.obolibrary.org/obo/FOBI_00422
    onto_graph.add((compound, OBO.FOBI_00422, measurement))
    
    # HasBiomarker - http://purl.obolibrary.org/obo/FOBI_00423
    onto_graph.add((measurement, OBO.FOBI_00423, compound))

In [48]:
measurements_with_ids.apply(add_measurement_to_graph, axis=1)

0       None
1       None
2       None
3       None
4       None
        ... 
2982    None
2983    None
2984    None
2985    None
2986    None
Length: 2987, dtype: object

In [49]:
measurements_with_ids

Unnamed: 0,menuguide_measurement_id,menuguide_id,uri,cohort,sample_type,sex,concentration,unit,reference_name,pubmed_id
0,measurement_0,compound_6317,http://purl.obolibrary.org/obo/DOID_9970,Adult: >=18 yrs old,Urine,Both,10.9,umol/mmol creatinine,"Tuma, P., Samcova, E. & Balinova, P. Determina...",PMID:15899597
1,measurement_1,compound_6317,http://purl.obolibrary.org/obo/PATO_0000461,Adult: >=18 yrs old,Urine,Both,85.8,umol/mmol creatinine,David F. Putnam Composition and Concentrative ...,
2,measurement_2,compound_6317,http://purl.obolibrary.org/obo/DOID_10652,Adult: >=18 yrs old,Urine,Both,15.7,umol/mmol creatinine,"Fonteh, A. N., Harrington, R. J., Tsai, A., Li...",PMID:17031479
3,measurement_3,compound_6317,http://purl.obolibrary.org/obo/PATO_0000461,Adult: >=18 yrs old,Blood,Both,12.7,uM,"Dohm, G. L., Williams, R. T., Kasperek, G. J. ...",PMID:7061274
4,measurement_4,compound_6317,http://purl.obolibrary.org/obo/DOID_10591,Adult: >=18 yrs old,Blood,Female,50.7,uM,,PMID:22494326
...,...,...,...,...,...,...,...,...,...,...
2982,measurement_2982,compound_5681,http://purl.obolibrary.org/obo/DOID_9970,Adolescent:13-18 yrs old,Urine,Both,0.36,umol/mmol creatinine,,PMID:2691039
2983,measurement_2983,compound_5681,http://purl.obolibrary.org/obo/PATO_0000461,Adolescent:13-18 yrs old,Urine,Both,0.23,umol/mmol creatinine,,PMID:2691039
2984,measurement_2984,compound_5680,http://purl.obolibrary.org/obo/DOID_9970,Adolescent:13-18 yrs old,Urine,Both,2.2,umol/mmol creatinine,,PMID:2691039
2985,measurement_2985,compound_5680,http://purl.obolibrary.org/obo/PATO_0000461,Adult: >=18 yrs old,Urine,Both,0.0023,umol/mmol creatinine,,PMID:24023812


In [50]:
for result in get_all_triples_for_iri(URIRef(MeNuGUIDE.measurement_1)):
    print(result)

(None, rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://MeNuGUIDE.local/Measurement'))
(None, rdflib.term.URIRef('http://MeNuGUIDE.local/isMeasurementOf'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/PATO_0000461'))
(None, rdflib.term.URIRef('http://MeNuGUIDE.local/amount'), rdflib.term.Literal('85.8', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#float')))
(None, rdflib.term.URIRef('http://MeNuGUIDE.local/unit'), rdflib.term.Literal('umol/mmol creatinine'))
(None, rdflib.term.URIRef('http://MeNuGUIDE.local/hasCohort'), rdflib.term.Literal('Adult: >=18 yrs old'))
(None, rdflib.term.URIRef('http://MeNuGUIDE.local/hasSampleType'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/UBERON_0001088'))
(None, rdflib.term.URIRef('http://MeNuGUIDE.local/hasReference'), rdflib.term.Literal('David F. Putnam Composition and Concentrative Properties of Human Urine. NASA Contractor Report. July 1971'))
(None, rdflib.term.URIRef

In [51]:
onto_graph.serialize(destination=os.path.join(ontology_folder, "merged_with_foods_compounds_reactions_biomarkers.ttl"), format="turtle")

<Graph identifier=Na02196d5e708470a901ab294d3182d1d (<class 'rdflib.graph.Graph'>)>

In [52]:
query = f"""
    SELECT ?entity ?label ?predicate ?object
    WHERE {{
      ?entity rdfs:label "Crohn's disease" .
      ?entity ?predicate ?object
    }}
    """
    
    # Execute the query
results = onto_graph.query(query)
list(results)

[(rdflib.term.URIRef('http://purl.obolibrary.org/obo/HP_0100280'),
  None,
  rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
  rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Class')),
 (rdflib.term.URIRef('http://purl.obolibrary.org/obo/HP_0100280'),
  None,
  rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label'),
  rdflib.term.Literal("Crohn's disease")),
 (rdflib.term.URIRef('http://purl.obolibrary.org/obo/HP_0100280'),
  None,
  rdflib.term.URIRef('http://purl.obolibrary.org/obo/IAO_0000115'),
  rdflib.term.Literal("A chronic granulomatous inflammatory disease of the intestines that may affect any part of the gastrointestinal tract from mouth to anus, causing a wide variety of symptoms. It primarily causes abdominal pain, diarrhea which may be bloody, vomiting, or weight loss, but may also cause complications outside of the gastrointestinal tract such as skin rashes, arthritis, inflammation of the eye, tiredness, and lack of concentration. Crohn'