*MeNu GUIDE*
# Integrate Ontology and Database Info - Foods

In [1]:
import rdflib
import os
import pandas as pd
from rdflib import URIRef, Literal, Namespace, RDF, XSD
from rdflib.plugins.sparql import prepareQuery

## Define Namespaces

In [2]:
MeNuGUIDE = Namespace("http://MeNuGUIDE.local/")
ChEBI = Namespace("http://purl.obolibrary.org/obo/chebi/")
OBO = Namespace("http://purl.obolibrary.org/obo/")
GO = Namespace("http://www.geneontology.org/formats/oboInOwl#")
FOBI = Namespace("http://purl.obolibrary.org/obo/FOBI_")

## Load Graph

In [None]:
foodb_folder = "/path/to/downloaded/FooDB/data/"
ontology_folder = "/path/to/ontologies/"
processed_data_folder = "/path/to/processed/data/folder/"

In [3]:
# Load the merged ontology RDF file
onto_graph = rdflib.Graph()
onto_graph.parse(os.path.join(ontology_folder, "merged_with_database_compounds.ttl"), format="turtle")

<Graph identifier=Na5625b5f59c14ee8b73a915ebca55f5d (<class 'rdflib.graph.Graph'>)>

### Find food IRIs

In [None]:
foods = pd.read_csv(os.path.join(foodb_folder, 'foods_to_map.csv'))

#### Match via scientific name

In [None]:
def find_term(search_term):
    query = f"""
    SELECT ?entity ?label
    WHERE {{
      ?entity rdfs:label "{search_term}".
    }}
    """
    
    # Execute the query
    results = onto_graph.query(query)
    return results

In [None]:
scientific_name_matches = {}

for idx, row in foods.iterrows():
    if pd.notna(row['name_scientific']):
        print(idx)
        matches = find_term(row['name_scientific'])
        if matches:
            scientific_name_matches[row['name_scientific']] = matches

In [None]:
print(f"Foods with scientific names: {len(foods[foods.name_scientific.notna()])}\nFoods that could be matched via scientific name: {len(scientific_name_matches)}")

In [None]:
for name, match in scientific_name_matches.items():
    scientific_name_matches[name] = list(match)

In [None]:
for name, match in scientific_name_matches.items():
    if len(match) > 1:
        print(f"{name}:\n{match}\n")
        scientific_name_matches[name] = [match[1]]

In [None]:
matches_dict = {}

for name, match in scientific_name_matches.items():
    matches_dict[name] = match[0][0]

In [None]:
foods['ncbi_uri'] = foods.name_scientific.apply(lambda x: str(matches_dict[x]) if x in matches_dict else None)

#### Match via name 

In [None]:
name_uri_dict = {}

def get_food_via_name(food):
    name_query = f"""
        SELECT ?iri
        WHERE {{
          ?iri rdfs:label "{food.lower()}"@en .
        }}
        """
    
    results = onto_graph.query(name_query)
    
    if results:
        result = str(list(results)[0][0])
        name_uri_dict[food] = result
    
foods['name'].apply(get_food_via_name)

In [None]:
foods['foodon_uri'] = foods['name'].apply(lambda x: str(name_uri_dict[x]) if x in name_uri_dict else None)

In [None]:
foods[foods.name_scientific.notna() & foods.name_scientific.str.startswith("Capsicum annuum 'Ja")]

In [None]:
foods.loc[954, 'name_scientific'] = "Capsicum annuum Jalapeno"

#### Match via scientific name & synonyms in ontology

In [None]:
name_scientific_uri_dict = {}

def get_food_via_scientific_name(food):
    if pd.notna(food):
        name_query = f"""
            SELECT ?iri
            WHERE {{
              ?iri <http://www.geneontology.org/formats/oboInOwl#hasSynonym> '{food.lower()}'@en .
            }}
            """
        
        results = onto_graph.query(name_query)

        if results:
            result = list(results)
                
            name_scientific_uri_dict[food] = result
    
foods['name_scientific'].apply(get_food_via_scientific_name)

In [None]:
def get_label(iri):
    test_query = f"""
        SELECT ?o
        WHERE {{
            <{iri}> rdfs:label ?o .
        }}
        """
    
    results = list(onto_graph.query(test_query))
    return results[0][0]

for name, match in name_scientific_uri_dict.items():
    updated_match = []
        
    for entry in match:
        label = get_label(entry[0])
        updated_match.append((entry[0], label))
        
    name_scientific_uri_dict[name] = updated_match

In [None]:
name_dict = foods[['name', 'name_scientific']].set_index('name_scientific').to_dict()
name_dict = name_dict['name']

In [None]:
ncbi_tax_dict = {}
foodon_dict = {}

for name, match in name_scientific_uri_dict.items():
    for entry in match:
        uri = entry[0]
        
        if 'FOODON' in uri:
            if name not in foodon_dict:
                foodon_dict[name] = [uri]
            else:
                foodon_dict[name].append(uri)
                
        elif 'NCBITaxon' in uri:
            if name not in ncbi_tax_dict:
                ncbi_tax_dict[name] = [uri]
            else:
                ncbi_tax_dict[name].append(uri)
        else:
            print(f"{name_dict[name]} - {name}: {entry}")

In [None]:
foods['ncbi_uri'] = foods.apply(lambda row: (ncbi_tax_dict[row['name_scientific']] if row['name_scientific'] in ncbi_tax_dict else None) if pd.isna(row['ncbi_uri']) else row['ncbi_uri'], axis=1)

In [None]:
def merge_foodon_matches(row):
    if row['name_scientific'] in foodon_dict:
        uris = set()
        if pd.notna(row['foodon_uri']):
            uris.add(row['foodon_uri'])
        for uri in foodon_dict[row['name_scientific']]:
            uris.add(uri)
        return list(uris)
    else:
        if pd.notna(row['foodon_uri']):
            return [row['foodon_uri']]
        else:
            return None

foods['foodon_uri'] = foods.apply(merge_foodon_matches, axis=1)

In [None]:
unmatched_foods = foods[foods.ncbi_uri.isna() & foods.foodon_uri.isna()]
unmatched_foods

In [None]:
unmatched_foods.to_csv(os.path.join(foodb_folder, "foods_to_map_manually.csv"), index=False)

#### Manual matching

In [None]:
manually_matched_foods = pd.read_csv(os.path.join(foodb_folder, "foods_mapped_manually.csv"), sep=';')

In [None]:
manually_matched_foods.loc[:, 'foodon_uri'] = manually_matched_foods.foodon_uri.apply(lambda x: [x] if pd.notna(x) else x)

In [None]:
automatically_matched_foods = foods[foods.ncbi_uri.notna() | foods.foodon_uri.notna()]

In [None]:
duplicated_names = set(manually_matched_foods.name.unique()) - set(unmatched_foods.name.unique())

In [None]:
automatically_matched_foods = automatically_matched_foods[~automatically_matched_foods.name.isin(duplicated_names)]

In [None]:
foods_matched = pd.concat([automatically_matched_foods, manually_matched_foods])

### Clean up matched foods and add identifiers

In [None]:
foods_matched = foods_matched[~(foods_matched.foodon_uri.isna() & foods_matched.ncbi_uri.isna())]

In [None]:
foods_matched = foods_matched.reset_index(drop=True)

In [None]:
foods_matched = foods_matched.reset_index(names='menuguide_id')
foods_matched.loc[:, 'menuguide_id'] = foods_matched.menuguide_id.apply(lambda x: f"food_{x}")

In [None]:
foods_matched.loc[:, 'foodon_uri'] = foods_matched.foodon_uri.apply(lambda x: None if (type(x) == float and pd.isna(x)) else x)
foods_matched.loc[:, 'ncbi_uri'] = foods_matched.ncbi_uri.apply(lambda x: None if (type(x) == float and pd.isna(x)) else x)

In [None]:
foods_matched.loc[987, 'ncbi_uri'] = None

In [None]:
foods_matched.to_csv(os.path.join(foodb_folder, "foods_matched_to_onto.csv"), index=False)

In [4]:
foods_matched = pd.read_csv(os.path.join(foodb_folder, "foods_matched_to_onto.csv"))

### Add foods to graph

In [5]:
foods_matched

Unnamed: 0,menuguide_id,id,name,name_scientific,description,food_type,ncbi_taxonomy_id,public_id,ncbi_uri,foodon_uri
0,food_0,2,Savoy cabbage,Brassica oleracea var. sabauda,Savoy cabbage (Brassica oleracea convar. capit...,Type 1,1216010.0,FOOD00002,http://purl.obolibrary.org/obo/NCBITaxon_1216010,[rdflib.term.URIRef('http://purl.obolibrary.or...
1,food_1,3,Silver linden,Tilia argentea,Tilia tomentosa (Silver Lime in the UK and Sil...,Type 1,,FOOD00003,,[rdflib.term.URIRef('http://purl.obolibrary.or...
2,food_2,4,Kiwi,Actinidia chinensis,"The kiwifruit, often shortened to kiwi in many...",Type 1,3625.0,FOOD00004,http://purl.obolibrary.org/obo/NCBITaxon_3625,[rdflib.term.URIRef('http://purl.obolibrary.or...
3,food_3,5,Allium,Allium,Allium haematochiton is a species of wild onio...,Type 1,4678.0,FOOD00005,http://purl.obolibrary.org/obo/NCBITaxon_4678,[rdflib.term.URIRef('http://purl.obolibrary.or...
4,food_4,6,Garden onion,Allium cepa,The onion (Allium cepa) (Latin 'cepa' = onion)...,Type 1,4679.0,FOOD00006,http://purl.obolibrary.org/obo/NCBITaxon_4679,[rdflib.term.URIRef('http://purl.obolibrary.or...
...,...,...,...,...,...,...,...,...,...,...
985,food_985,1014,Goji,,"Goji, goji berry or wolfberry is the fruit of ...",Type 1,,FOOD00982,,['http://purl.obolibrary.org/obo/FOODON_000043...
986,food_986,1015,Monk fruit,Siraitia grosvenorii,Siraitia grosvenorii (monk fruit or luo han gu...,Type 1,,FOOD00983,http://purl.obolibrary.org/obo/NCBITaxon_3650,
987,food_987,1016,Cantaloupe melon,Cucumis melo var. cantalupensis,"The cantaloupe, rockmelon (Australia), sweet m...",Type 1,,FOOD00984,,['http://purl.obolibrary.org/obo/FOODON_034114...
988,food_988,1017,Hawthorn,Crataegus,"Crataegus commonly called hawthorn, quickthorn...",Type 1,,FOOD00985,http://purl.obolibrary.org/obo/NCBITaxon_23159,['http://purl.obolibrary.org/obo/FOODON_034117...


In [6]:
def reformat_uri(uris):
    if type(uris) == str:
        if uris == ' ':
            return float('NaN')
        elif 'rdflib' in uris:
            if ',' in uris:
                uris = uris.split(',')
                cleaned_uris = []
                for uri in uris:
                    uri_temp = uri.split("'")[1]
                    cleaned_uris.append(uri_temp)
                return cleaned_uris
            else:
                return [uris.split("'")[1]]
        else:
            if '[' in uris:
                uris = uris.split("'")[1]
                return [uris]
            else:
                return [uris]
    else:
        if pd.isna(uris):
            return float('NaN')
        else:
            print(uris)

foods_matched.loc[:, "ncbi_uri"] = foods_matched.ncbi_uri.apply(reformat_uri)
foods_matched.loc[:, "foodon_uri"] = foods_matched.foodon_uri.apply(reformat_uri)

In [7]:
def add_triple_property(subject, predicate, object, prefix=''):
    if pd.notna(object):
        if prefix:
            object = prefix + object
        onto_graph.add((subject, predicate, Literal(object)))
        
def class_exists(iri):
    # SPARQL query to check if the IRI exists in the graph
    query = prepareQuery(
        """ASK
           WHERE {
              VALUES ?iri_to_check { <""" + str(iri) + """> }
              ?iri_to_check ?p ?o .
           }""",
    )
    
    exists = onto_graph.query(query)
    return exists
        
def add_identifier(subject, object_uri):
    if pd.notna(object_uri):
        iri_to_check = URIRef(object_uri)
        
        if class_exists(iri_to_check):
            onto_graph.add((subject, RDF.type, iri_to_check))

def add_food_to_graph(row):
    print(f"{row.name}: {row['name']}")
    
    food_name = row['name'].lower()
    scientific_name = row['name_scientific'].lower() if pd.notna(row['name_scientific']) else None
    description = row['description']
    foodb_id = row['public_id']
    foodon_connections = row['foodon_uri']
    ncbi_uri = row['ncbi_uri']
    
    instance_uri = URIRef(MeNuGUIDE[row['menuguide_id']])
    onto_graph.add((instance_uri, URIRef("http://www.w3.org/2000/01/rdf-schema#label"), Literal(food_name)))
    onto_graph.add((instance_uri, RDF.type, MeNuGUIDE.Food))
    
    # Description - http://purl.org/dc/terms/description or http://purl.obolibrary.org/obo/IAO_0000115 (used in ChEBI ontology)
    add_triple_property(instance_uri, OBO.IAO_0000115, description)
    
    # foodb_id - http://www.geneontology.org/formats/oboInOwl#hasDbXref 
    add_triple_property(instance_uri, GO.hasDbXref, foodb_id, 'FooDB:')
    
    # Scientific name - http://www.geneontology.org/formats/oboInOwl#hasSynonym    
    add_triple_property(instance_uri, GO.hasSynonym, scientific_name)
    
    # ncbi - rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
    add_identifier(instance_uri, ncbi_uri)
    
    # other foodon identifiers - rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
    if type(foodon_connections) == list:
        for item in foodon_connections:
            add_identifier(instance_uri, item)    

In [8]:
len(onto_graph)

22955648

In [9]:
foods_matched.apply(add_food_to_graph, axis=1)

0: Savoy cabbage
1: Silver linden
2: Kiwi
3: Allium
4: Garden onion
5: Leek
6: Garlic
7: Chives
8: Cashew nut
9: Pineapple
10: Dill
11: Custard apple
12: Wild celery
13: Peanut
14: Burdock
15: Horseradish
16: Tarragon
17: Mugwort
18: Asparagus
19: Oat
20: Star fruit
21: Brazil nut
22: Common beet
23: Borage
24: Chinese mustard
25: Swede
26: Rape
27: Common cabbage
28: Cauliflower
29: Brussel sprouts
30: Kohlrabi
31: Broccoli
32: Chinese cabbage
33: Turnip
34: Pigeon pea
35: Tea
36: Capers
37: Pepper
38: Papaya
39: Safflower
40: Caraway
41: Pecan nut
42: Chestnut
43: Roman camomile
44: Chickpea
45: Endive
46: Chicory
47: Chinese cinnamon
48: Ceylon cinnamon
49: Watermelon
50: Lime
51: Lemon
52: Pummelo
53: Mandarin orange (Clementine, Tangerine)
54: Sweet orange
55: Coffee
56: Arabica coffee
57: Coriander
58: Common hazelnut
59: Saffron
60: Muskmelon
61: Cucumber
62: Cucurbita
63: Cumin
64: Turmeric
65: Quince
66: Lemon grass
67: Wild carrot
68: Japanese persimmon
69: Cardamom
70: Black

0      None
1      None
2      None
3      None
4      None
       ... 
985    None
986    None
987    None
988    None
989    None
Length: 990, dtype: object

In [10]:
len(onto_graph)

22961549

## Add Food Compound Relations to Graph
### Clean up content table

In [11]:
food_content = pd.read_csv(os.path.join(foodb_folder, 'foodb_metabolite_food_relations.csv'))

In [12]:
food_ids = foods_matched[['menuguide_id', 'id']]

In [13]:
food_ids.rename(columns={'id': 'food_id'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  food_ids.rename(columns={'id': 'food_id'}, inplace=True)


In [14]:
food_content_merge = food_content.merge(food_ids, on='food_id', how='left', suffixes=('_compound', '_food'))

In [15]:
food_content_merge = food_content_merge[food_content_merge.menuguide_id_food.notna()] 
food_content_merge = food_content_merge[['menuguide_id_compound', 'menuguide_id_food', 'content', 'unit', 'min_content', 'max_content', 'citation']]

In [16]:
def adjust_units(unit):
    if ' g' in unit:
        unit = unit.replace(' g', 'g')
    
    if ' 100' in unit:
        unit = unit.replace(' 1', '1')
        
    if 'freshweight' in unit:
        unit = unit.replace('freshweight', 'fresh weight')
        
    if 'Fresh' in unit:
        unit = unit.replace('Fresh', 'fresh')
        
    return unit
        

food_content_merge.loc[:, 'unit'] = food_content_merge.unit.apply(adjust_units)

In [17]:
food_content_merge = food_content_merge.drop_duplicates(subset=['menuguide_id_compound', 'menuguide_id_food', 'content', 'unit'])

In [18]:
food_content_merge = food_content_merge.reset_index(drop=True)

In [19]:
food_content_merge = food_content_merge.reset_index(names='menuguide_id')

In [20]:
food_content_merge.loc[:, 'menuguide_id'] = food_content_merge.menuguide_id.apply(lambda x: f"content_{x}")
food_content_merge

 'content_223539']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  food_content_merge.loc[:, 'menuguide_id'] = food_content_merge.menuguide_id.apply(lambda x: f"content_{x}")


Unnamed: 0,menuguide_id,menuguide_id_compound,menuguide_id_food,content,unit,min_content,max_content,citation
0,content_0,compound_168666,food_242,1.08,mg/100g,,,DTU
1,content_1,compound_401782,food_242,0.58,mg/100g,,,DTU
2,content_2,compound_4138,food_242,1.80,NE,,,DTU
3,content_3,compound_4138,food_242,1.80,mg/100g,,,DTU
4,content_4,compound_900,food_242,26.00,mg/100g,,,DTU
...,...,...,...,...,...,...,...,...
223535,content_223535,compound_398532,food_804,993.00,uM,942.0,1044.0,PMID:30994344
223536,content_223536,compound_398536,food_804,993.00,uM,942.0,1044.0,PMID:30994344
223537,content_223537,compound_398543,food_804,993.00,uM,942.0,1044.0,PMID:30994344
223538,content_223538,compound_398565,food_804,993.00,uM,942.0,1044.0,PMID:30994344


### Add content to graph

In [21]:
def add_food_content_to_graph(row):
    compound = URIRef(MeNuGUIDE[row['menuguide_id_compound']])
    food = URIRef(MeNuGUIDE[row['menuguide_id_food']])
    quantity = Literal(row['content'], datatype=XSD.float)
    unit = Literal(row['unit'])
    min_content = Literal(row['min_content'], datatype=XSD.float) if pd.notna(row['min_content']) else None
    max_content = Literal(row['max_content'], datatype=XSD.float) if pd.notna(row['max_content']) else None
    citation = Literal(row['citation'])
    
    #content = BNode()
    content = URIRef(MeNuGUIDE[row['menuguide_id']])
    onto_graph.add((content, URIRef("http://www.w3.org/2000/01/rdf-schema#label"), Literal(row['menuguide_id'])))
    
    onto_graph.add((content, RDF.type, MeNuGUIDE.Content))
    
    onto_graph.add((food, MeNuGUIDE.hasContent, content))
    onto_graph.add((content, MeNuGUIDE.isContentOf, food))
    
    onto_graph.add((content, MeNuGUIDE.hasCompound, compound))
    onto_graph.add((compound, MeNuGUIDE.isCompoundOf, content))
    
    onto_graph.add((content, MeNuGUIDE.amount, quantity))
    onto_graph.add((content, MeNuGUIDE.unit, unit))
    
    if min_content:
        onto_graph.add((content, MeNuGUIDE.minimumAmount, min_content))
        
    if max_content:
        onto_graph.add((content, MeNuGUIDE.maximumAmount, max_content))
        
    if citation:
        onto_graph.add((content, MeNuGUIDE.hasReference, citation))

In [22]:
len(onto_graph)

22961549

In [23]:
food_content_merge.apply(add_food_content_to_graph, axis=1)

0         None
1         None
2         None
3         None
4         None
          ... 
223535    None
223536    None
223537    None
223538    None
223539    None
Length: 223540, dtype: object

In [24]:
len(onto_graph)

25019335

In [25]:
onto_graph.serialize(destination=os.path.join(ontology_folder, "merged_with_foods_and_compounds.ttl"),
    format="turtle")

<Graph identifier=Na5625b5f59c14ee8b73a915ebca55f5d (<class 'rdflib.graph.Graph'>)>