*MeNu GUIDE*
# Integrate Ontology and Database Info - Reactions

In [1]:
import rdflib
import pandas as pd
from rdflib import URIRef, Literal, Namespace, RDF
from rdflib.plugins.sparql import prepareQuery
import re
import os

In [2]:
MeNuGUIDE = Namespace("http://MeNuGUIDE.local/")
ChEBI = Namespace("http://purl.obolibrary.org/obo/chebi/")
OBO = Namespace("http://purl.obolibrary.org/obo/")
GO = Namespace("http://www.geneontology.org/formats/oboInOwl#")
FOBI = Namespace("http://purl.obolibrary.org/obo/FOBI_")

## Load Graph

In [None]:
kegg_folder = "/path/to/kegg/data/folder/"
vmh_folder = "/path/to/vmh/data/folder/"
ontology_folder = "/path/to/ontologies/"
processed_data_folder = "/path/to/processed/data/folder/"

In [3]:
onto_graph = rdflib.Graph()
onto_graph.parse(os.path.join(ontology_folder, "merged_with_foods_and_compounds.ttl"), format="turtle")

<Graph identifier=N05dff0e96c2649b7a861678b835c583e (<class 'rdflib.graph.Graph'>)>

## Prepare reactions data

In [4]:
reactions = pd.read_csv(os.path.join(processed_data_folder, "reactions_kegg_vmh.csv"))

In [5]:
reactions.loc[:, 'description'] = reactions.apply(lambda row: row['kegg_reaction_name'] if pd.notna(row['kegg_reaction_name']) else row['description'], axis=1)

In [6]:
vmh_reaction_enzymes  = reactions[['menuguide_id', 'ecnumber']]

In [7]:
reactions = reactions[['menuguide_id', 'vmh_reaction_id', 'kegg_reaction_id', 'description']]

In [8]:
reactions

Unnamed: 0,menuguide_id,vmh_reaction_id,kegg_reaction_id,description
0,reaction_0,,R00001,polyphosphate polyphosphohydrolase
1,reaction_1,,R00002,Reduced ferredoxin:dinitrogen oxidoreductase (...
2,reaction_2,PPAer,R00004,diphosphate phosphohydrolase
3,reaction_3,PPAm,R00004,diphosphate phosphohydrolase
4,reaction_4,PPAn,R00004,diphosphate phosphohydrolase
...,...,...,...,...
27197,reaction_27198,H2SO,,Hydrogen sulfide oxidation
27198,reaction_27199,ASPNH4Li,,"L Aspartate ammonia ligase ADP forming, irreve..."
27199,reaction_27200,NO2t3,,nitrite transport out via proton antiport
27200,reaction_27201,ICDHyr,,isocitrate dehydrogenase (NADP)


In [9]:
duplicated_reactions = set(reactions[(reactions.kegg_reaction_id.notna()) & (reactions.kegg_reaction_id.duplicated())].kegg_reaction_id.unique())

In [10]:
duplicates_cleaned = []

for duplicate in duplicated_reactions:
    affected_rows = reactions[reactions.kegg_reaction_id == duplicate]
    vmh_ids = list(affected_rows.vmh_reaction_id.unique())
    menuguide_id = list(affected_rows.menuguide_id)[0]
    description = list(affected_rows.description)[0]
    duplicates_cleaned.append({'menuguide_id': menuguide_id, 'vmh_reaction_id': vmh_ids, 'kegg_reaction_id': duplicate, 'description': description})

In [11]:
reactions_not_duplicated = reactions[~reactions.kegg_reaction_id.isin(duplicated_reactions)]

In [12]:
reactions_duplicates_cleaned = pd.DataFrame(duplicates_cleaned)

In [13]:
reactions = pd.concat([reactions_not_duplicated, reactions_duplicates_cleaned])

In [14]:
reactions

Unnamed: 0,menuguide_id,vmh_reaction_id,kegg_reaction_id,description
0,reaction_0,,R00001,polyphosphate polyphosphohydrolase
1,reaction_1,,R00002,Reduced ferredoxin:dinitrogen oxidoreductase (...
7,reaction_7,,R00005,urea-1-carboxylate amidohydrolase
8,reaction_8,,R00006,pyruvate:pyruvate acetaldehydetransferase (dec...
9,reaction_9,,R00008,4-hydroxy-4-methyl-2-oxoglutarate pyruvate-lya...
...,...,...,...,...
199,reaction_3997,"[PI345P3P, PI345P3Pn]",R04513,"Phosphatidylinositol-3,4,5-trisphosphate 3-pho..."
200,reaction_5079,"[C2M26DCOAHLm, C2M26DCOAHLx]",R06411,"3-hydroxy-2,6-dimethyl-5-methylene-heptanoyl-C..."
201,reaction_1932,"[HMR_3538, HMR_3539, HMR_3540, HMR_3541, HMR_3...",R02115,Steryl-ester acylhydrolase
202,reaction_7182,"[PI345P5P, PI345P5Pn]",R09827,"1-phosphatidyl-1D-myo-inositol-3,4,5-trisphosp..."


In [15]:
reactions = reactions.reset_index(drop=True)

In [16]:
reactions = reactions.drop(columns='menuguide_id')

In [17]:
reactions = reactions.reset_index(names='menuguide_id')

In [18]:
reactions.loc[:, "menuguide_id"] = reactions.menuguide_id.apply(lambda x: f"reaction_{x}")

 'reaction_26787' 'reaction_26788']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  reactions.loc[:, "menuguide_id"] = reactions.menuguide_id.apply(lambda x: f"reaction_{x}")


In [19]:
reactions

Unnamed: 0,menuguide_id,vmh_reaction_id,kegg_reaction_id,description
0,reaction_0,,R00001,polyphosphate polyphosphohydrolase
1,reaction_1,,R00002,Reduced ferredoxin:dinitrogen oxidoreductase (...
2,reaction_2,,R00005,urea-1-carboxylate amidohydrolase
3,reaction_3,,R00006,pyruvate:pyruvate acetaldehydetransferase (dec...
4,reaction_4,,R00008,4-hydroxy-4-methyl-2-oxoglutarate pyruvate-lya...
...,...,...,...,...
26784,reaction_26784,"[PI345P3P, PI345P3Pn]",R04513,"Phosphatidylinositol-3,4,5-trisphosphate 3-pho..."
26785,reaction_26785,"[C2M26DCOAHLm, C2M26DCOAHLx]",R06411,"3-hydroxy-2,6-dimethyl-5-methylene-heptanoyl-C..."
26786,reaction_26786,"[HMR_3538, HMR_3539, HMR_3540, HMR_3541, HMR_3...",R02115,Steryl-ester acylhydrolase
26787,reaction_26787,"[PI345P5P, PI345P5Pn]",R09827,"1-phosphatidyl-1D-myo-inositol-3,4,5-trisphosp..."


In [20]:
reactions_vmh = {}

def get_vmh_reaction_id(row):
    if type(row['vmh_reaction_id']) == list:
        for reaction_id in row['vmh_reaction_id']:
            reactions_vmh[reaction_id] = row['menuguide_id']
    elif pd.notna(row['vmh_reaction_id']):
        reactions_vmh[row['vmh_reaction_id']] = row['menuguide_id']

reactions.apply(get_vmh_reaction_id, axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
26784    None
26785    None
26786    None
26787    None
26788    None
Length: 26789, dtype: object

In [21]:
reactions_kegg = reactions[reactions.kegg_reaction_id.notna()]
reactions_kegg = reactions_kegg.set_index('kegg_reaction_id')['menuguide_id']
reactions_kegg_to_menuguide_id = reactions_kegg.to_dict()

In [22]:
reactions_dict = {}

def add_to_reaction_dict(row):
    key = row['menuguide_id']
    
    reactions_dict[key] = {}
    
    if pd.notna(row['kegg_reaction_id']):
        reactions_dict[key].update({'kegg_id': row['kegg_reaction_id']})
        
    if type(row['vmh_reaction_id']) == list or pd.notna(row['vmh_reaction_id']):
        reactions_dict[key].update({'vmh_id': row['vmh_reaction_id']})
        
    if pd.notna(row['description']):
        reactions_dict[key].update({'description': row['description']})

reactions.apply(add_to_reaction_dict, axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
26784    None
26785    None
26786    None
26787    None
26788    None
Length: 26789, dtype: object

### Add substrates

In [23]:
kegg_substrates = pd.read_csv(os.path.join(kegg_folder, "kegg_reaction_substrates.csv"))
vmh_substrates = pd.read_csv(os.path.join(vmh_folder, "vmh_reaction_substrates.csv"))
compound_identifiers = pd.read_csv(os.path.join(processed_data_folder, "compounds_all_databases_merged_ids.csv"), dtype={'markerdb_id': 'string', 'kegg_id': 'string', 'vmh_id': 'string'})

In [24]:
kegg_substrates = kegg_substrates.merge(compound_identifiers, left_on='substrate', right_on='kegg_id', how='left')[['substrate', 'reaction_id', 'menuguide_id']]
unmatched_kegg_reactions = set(kegg_substrates[kegg_substrates.menuguide_id.isna()].reaction_id.unique())
kegg_substrates = kegg_substrates[kegg_substrates.menuguide_id.notna()]

In [25]:
vmh_substrates.loc[:, "substrate"] = vmh_substrates.substrate.str.lower()
vmh_substrates = vmh_substrates.merge(compound_identifiers, left_on='substrate', right_on='vmh_id', how='left')[['substrate', 'reaction_abbreviation', 'menuguide_id']]
unmatched_vmh_reactions = set(vmh_substrates[vmh_substrates.substrate.isna()].reaction_abbreviation.unique())
vmh_substrates = vmh_substrates[vmh_substrates.substrate.notna()]

In [26]:
for reaction in unmatched_kegg_reactions:
    menuguide_reaction_id = reactions_kegg[reaction]
    if menuguide_reaction_id in reactions_dict:
        del(reactions_dict[menuguide_reaction_id])
        print(f"Reaction {reaction} deleted.")

Reaction R13111 deleted.
Reaction R13106 deleted.
Reaction R13132 deleted.
Reaction R13122 deleted.
Reaction R13125 deleted.
Reaction R13131 deleted.
Reaction R13130 deleted.
Reaction R13123 deleted.
Reaction R13109 deleted.
Reaction R13133 deleted.
Reaction R13134 deleted.
Reaction R13121 deleted.
Reaction R13129 deleted.
Reaction R13117 deleted.


In [27]:
for reaction in unmatched_vmh_reactions:
    menuguide_reaction_id = reactions_vmh[reaction]
    del(reactions_dict[menuguide_reaction_id])
    print(f"Reaction {reaction} deleted.")

Reaction dreplication deleted.
Reaction pbiosynthesis deleted.
Reaction rtranscription deleted.


In [28]:
vmh_substrates['menuguide_reaction_id'] = vmh_substrates.reaction_abbreviation.apply(lambda x: reactions_vmh[x])
kegg_substrates['menuguide_reaction_id'] = kegg_substrates.reaction_id.apply(lambda x: reactions_kegg[x])

In [29]:
def add_to_reaction_dict(row):
    reaction_id = row['menuguide_reaction_id']
    substrate_id = row['menuguide_id']
    if reaction_id in reactions_dict:
        entry = reactions_dict[reaction_id]
        if 'substrates' in entry:
            entry['substrates'].add(substrate_id)
        else:
            entry['substrates'] = {substrate_id}

In [30]:
kegg_substrates.apply(add_to_reaction_dict, axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
37048    None
37049    None
37050    None
37051    None
37052    None
Length: 37039, dtype: object

In [31]:
vmh_substrates.apply(add_to_reaction_dict, axis=1)

0          None
1          None
2          None
3          None
4          None
           ... 
1289969    None
1289970    None
1289971    None
1289972    None
1289973    None
Length: 93397, dtype: object

### Add products

In [32]:
kegg_products = pd.read_csv(os.path.join(kegg_folder, "kegg_reaction_products.csv"))
vmh_products = pd.read_csv(os.path.join(vmh_folder, "vmh_reaction_products.csv"))

In [33]:
vmh_products

Unnamed: 0,product,reaction_abbreviation
0,10fthf5glu,10FTHF5GLUtl
1,10fthf5glu,10FTHF5GLUtm
2,10fthf6glu,10FTHF6GLUtl
3,10fthf6glu,10FTHF6GLUtm
4,10fthf7glu,10FTHF7GLUtl
...,...,...
44597,co2,ICDHyr
44598,nadph,ICDHyr
44599,btcoa,BTCOADHi
44600,fdxrd,BTCOADHi


In [34]:
kegg_products = kegg_products.merge(compound_identifiers, left_on='product', right_on='kegg_id', how='left')[['product', 'reaction_id', 'menuguide_id']]
unmatched_kegg_reactions = set(kegg_products[kegg_products.menuguide_id.isna()].reaction_id.unique())
kegg_products = kegg_products[kegg_products.menuguide_id.notna()]

In [35]:
vmh_products.loc[:, "products"] = vmh_products['product'].str.lower()

In [36]:
compound_identifiers_vmh = compound_identifiers[compound_identifiers.vmh_id.notna()]

In [37]:
vmh_products = vmh_products.merge(compound_identifiers_vmh, left_on='products', right_on='vmh_id', how='left')[['products', 'reaction_abbreviation', 'menuguide_id']]

In [38]:
unmatched_vmh_reactions = set(vmh_products[vmh_products['products'].isna()].reaction_abbreviation.unique())

In [39]:
vmh_products = vmh_products[vmh_products['products'].notna()]

In [40]:
for reaction in unmatched_kegg_reactions:
    menuguide_reaction_id = reactions_kegg[reaction]
    if menuguide_reaction_id in reactions_dict:
        del(reactions_dict[menuguide_reaction_id])
        print(f"Reaction {reaction} deleted.")

Reaction R13102 deleted.
Reaction R13141 deleted.
Reaction R13110 deleted.
Reaction R13108 deleted.
Reaction R13115 deleted.
Reaction R11151 deleted.
Reaction R13104 deleted.
Reaction R13124 deleted.
Reaction R13100 deleted.
Reaction R13099 deleted.
Reaction R13128 deleted.
Reaction R11152 deleted.
Reaction R13103 deleted.
Reaction R13101 deleted.


In [41]:
for reaction in unmatched_vmh_reactions:
    menuguide_reaction_id = reactions_vmh[reaction]
    del(reactions_dict[menuguide_reaction_id])
    print(f"Reaction {reaction} deleted.")

Reaction EX_co2[e] deleted.
Reaction EX_HC01943(e) deleted.
Reaction EX_lstnm5[e] deleted.
Reaction EX_7klitchol(e) deleted.
Reaction EX_lneldc(e) deleted.
Reaction EX_fdp(e) deleted.
Reaction EX_lpchol_hs(e) deleted.
Reaction EX_sql(e) deleted.
Reaction EX_proargasp[e] deleted.
Reaction DM_akg[c] deleted.
Reaction EX_glucys[e] deleted.
Reaction EX_4hbz(e) deleted.
Reaction EX_fucacngalacglcgalgluside_hs(e) deleted.
Reaction EX_3hpp(e) deleted.
Reaction EX_bgly(e) deleted.
Reaction EX_eandrstrn[e] deleted.
Reaction EX_34dhoxmand[e] deleted.
Reaction EX_MGlcn131(e) deleted.
Reaction DM_tudca3s[c] deleted.
Reaction EX_bvite[e] deleted.
Reaction EX_lyslyslys[e] deleted.
Reaction EX_succ[e] deleted.
Reaction EX_c6dc(e) deleted.
Reaction sink_HC02194[c] deleted.
Reaction EX_MGlcn17_rl(e) deleted.
Reaction DM_pe_hs(r) deleted.
Reaction EX_HC00001[e] deleted.
Reaction EX_glyc[e] deleted.
Reaction EX_sT_antigen(e) deleted.
Reaction EX_ahdt[e] deleted.
Reaction EX_abt_D(e) deleted.
Reaction sin

In [42]:
vmh_products['menuguide_reaction_id'] = vmh_products.reaction_abbreviation.apply(lambda x: reactions_vmh[x])
kegg_products['menuguide_reaction_id'] = kegg_products.reaction_id.apply(lambda x: reactions_kegg[x])

In [43]:
def add_product_to_reaction_dict(row):
    reaction_id = row['menuguide_reaction_id']
    product_id = row['menuguide_id']
    if reaction_id in reactions_dict:
        entry = reactions_dict[reaction_id]
        if 'products' in entry:
            entry['products'].add(product_id)
        else:
            entry['products'] = {product_id}

In [44]:
kegg_products.apply(add_product_to_reaction_dict, axis=1)
vmh_products.apply(add_product_to_reaction_dict, axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
53179    None
53180    None
53181    None
53182    None
53183    None
Length: 49569, dtype: object

In [45]:
reactions_dict

{'reaction_0': {'kegg_id': 'R00001',
  'description': 'polyphosphate polyphosphohydrolase',
  'substrates': {'compound_2300', 'compound_5003', 'compound_5449'},
  'products': {'compound_4433', 'compound_7069'}},
 'reaction_1': {'kegg_id': 'R00002',
  'description': 'Reduced ferredoxin:dinitrogen oxidoreductase (ATP-hydrolysing)',
  'substrates': {'compound_2019',
   'compound_2300',
   'compound_2460',
   'compound_3505',
   'compound_5003',
   'compound_925'},
  'products': {'compound_2018',
   'compound_3473',
   'compound_4400',
   'compound_4401',
   'compound_5006',
   'compound_744',
   'compound_9361'}},
 'reaction_2': {'kegg_id': 'R00005',
  'description': 'urea-1-carboxylate amidohydrolase',
  'substrates': {'compound_2300',
   'compound_5003',
   'compound_6210',
   'compound_6211',
   'compound_808'},
  'products': {'compound_1654', 'compound_4158'}},
 'reaction_3': {'kegg_id': 'R00006',
  'description': 'pyruvate:pyruvate acetaldehydetransferase (decarboxylating)',
  'subst

### Add Genes & Proteins

In [46]:
genes = pd.read_csv(os.path.join(processed_data_folder, "genes_kegg_vmh.csv"), dtype={'gene_number': 'string'})
vmh_genes = pd.read_csv(os.path.join(vmh_folder, 'vmh_gene_reactions.csv'))
kegg_genes = pd.read_csv(os.path.join(processed_data_folder, "kegg_enzyme_gene_symbols.csv"))
kegg_enzymes = pd.read_csv(os.path.join(kegg_folder, "kegg_reaction_enzymes.csv"))

In [47]:
genes

Unnamed: 0,symbol,gene_number,chromosome,description,ensembl_gene,chebl_id,uniprot_gname,kegg_gene_id
0,36951,55016.1,4,membrane associated ring-CH-type finger 1 [Sou...,ENSG00000145416,,Q8TCQ1,
1,37316,51257.1,19,membrane associated ring-CH-type finger 2 [Sou...,ENSG00000099785,,Q9P0N8,
2,37681,115123.1,5,membrane associated ring-CH-type finger 3 [Sou...,ENSG00000173926,,Q86UD3,
3,38047,57574.1,2,membrane associated ring-CH-type finger 4 [Sou...,ENSG00000144583,,Q9P2E8,
4,38412,54708.1,10,membrane associated ring-CH-type finger 5 [Sou...,ENSG00000198060,,Q9NX47,
...,...,...,...,...,...,...,...,...
5107,ZNRF3,,,,,,,hsa:84133
5108,ZNRF4,,,,,,,hsa:148066
5109,ZRANB1,,,,,,,hsa:54764
5110,ZSWIM2,151112.1,2,"zinc finger, SWIM-type containing 2 [Source:HG...",ENSG00000163012,,Q8NEG5,hsa:151112


In [48]:
kegg_enzymes.loc[:, 'enzyme_id'] = kegg_enzymes.enzyme_id.apply(lambda x: f'EC {x}')

In [49]:
kegg_genes_to_reactions = kegg_genes.merge(kegg_enzymes, left_on='kegg_enzyme_id', right_on='enzyme_id', how='inner')

In [50]:
kegg_genes_to_reactions.loc[:, 'menuguide_id'] = kegg_genes_to_reactions.reaction_id.apply(lambda x: reactions_kegg[x])
kegg_genes_to_reactions

Unnamed: 0,symbol,kegg_enzyme_id,enzyme_id,reaction_id,menuguide_id
0,AOC3,EC 1.4.3.21,EC 1.4.3.21,R01853,reaction_1391
1,AOC3,EC 1.4.3.21,EC 1.4.3.21,R02382,reaction_1785
2,AOC3,EC 1.4.3.21,EC 1.4.3.21,R02529,reaction_1901
3,AOC3,EC 1.4.3.21,EC 1.4.3.21,R02613,reaction_1974
4,AOC3,EC 1.4.3.21,EC 1.4.3.21,R03139,reaction_2386
...,...,...,...,...,...
8826,COX1,EC 7.1.1.9,EC 7.1.1.9,R00081,reaction_62
8827,COX1,EC 7.1.1.9,EC 7.1.1.9,R00082,reaction_63
8828,CYB561D2,EC 7.2.1.3,EC 7.2.1.3,R09739,reaction_6487
8829,CYB561,EC 7.2.1.3,EC 7.2.1.3,R09739,reaction_6487


In [51]:
vmh_genes_symbols = genes[genes.gene_number.notna()]

In [52]:
vmh_genes = vmh_genes.merge(vmh_genes_symbols, on='gene_number', how='inner')[['reaction_abbreviation', 'symbol']]

In [53]:
vmh_genes.loc[:, 'menuguide_id'] = vmh_genes.reaction_abbreviation.apply(lambda x: reactions_vmh[x])
vmh_genes

Unnamed: 0,reaction_abbreviation,symbol,menuguide_id
0,13DAMPPOX,AOC3,reaction_2386
1,42A12BOOX,AOC3,reaction_3270
2,NMPTRCOX,AOC3,reaction_11118
3,13DAMPPOX,AOC1,reaction_2386
4,42A12BOOX,AOC1,reaction_3270
...,...,...,...
21916,MTHFD2,MTHFD2,reaction_18381
21917,MTHFD2,MTHFD2,reaction_18381
21918,AHC,AHCY,reaction_18383
21919,AHC,AHCYL1,reaction_18383


In [54]:
enzymes_to_add = kegg_genes_to_reactions[['enzyme_id', 'menuguide_id']]

In [55]:
genes_to_add = pd.concat([kegg_genes_to_reactions[['menuguide_id', 'symbol']], vmh_genes[['menuguide_id', 'symbol']]])

In [56]:
genes_to_add = genes_to_add.drop_duplicates()

In [57]:
enzymes_to_add = enzymes_to_add.drop_duplicates()

In [58]:
enzymes_to_add

Unnamed: 0,enzyme_id,menuguide_id
0,EC 1.4.3.21,reaction_1391
1,EC 1.4.3.21,reaction_1785
2,EC 1.4.3.21,reaction_1901
3,EC 1.4.3.21,reaction_1974
4,EC 1.4.3.21,reaction_2386
...,...,...
8814,EC 6.5.1.7,reaction_266
8815,EC 6.5.1.7,reaction_7390
8816,EC 6.5.1.7,reaction_7391
8826,EC 7.1.1.9,reaction_62


In [59]:
vmh_reaction_enzymes = vmh_reaction_enzymes[vmh_reaction_enzymes.ecnumber.notna()]
vmh_reaction_enzymes

Unnamed: 0,menuguide_id,ecnumber
2,reaction_2,3.6.1.1
3,reaction_3,3.6.1.1
4,reaction_4,3.6.1.1
5,reaction_5,3.6.1.1
6,reaction_6,3.6.1.1
...,...,...
22134,reaction_22135,EC:3.4.-.-
22135,reaction_22136,EC:6.3.4.14
22136,reaction_22137,EC:2.6.1.82
22137,reaction_22138,EC:5.1.3.2


In [60]:
vmh_reaction_enzymes[vmh_reaction_enzymes.ecnumber.str.contains('EC')]

Unnamed: 0,menuguide_id,ecnumber
18,reaction_18,EC:1.2.4.1
59,reaction_59,EC:2.4.1.95
70,reaction_70,EC:2.7.11.19
71,reaction_71,EC:3.1.3.17
96,reaction_96,EC:1.6.2.4
...,...,...
22134,reaction_22135,EC:3.4.-.-
22135,reaction_22136,EC:6.3.4.14
22136,reaction_22137,EC:2.6.1.82
22137,reaction_22138,EC:5.1.3.2


In [61]:
vmh_enzymes_to_add = []

def extract_vmh_enzymes(row):
    reaction_id = row['menuguide_id']
    ecnumber = row['ecnumber']
    ecnumber = ecnumber.strip()
    
    if ',' in ecnumber:
        ecnumber_list = ecnumber.split(',')
        for number in ecnumber_list:
            if number:
                if 'EC:' in number:
                    vmh_enzymes_to_add.append([reaction_id, number])
                else:
                    number_temp = f"EC:{number.strip()}"
                    vmh_enzymes_to_add.append([reaction_id, number_temp])
                
    elif ';' in ecnumber:
        ecnumber_list = ecnumber.split(';')
        for number in ecnumber_list:
            if number:
                if 'EC:' in number:
                    vmh_enzymes_to_add.append([reaction_id, number])
                else:
                    number_temp = f"EC:{number.strip()}"
                    vmh_enzymes_to_add.append([reaction_id, number_temp])
                    
    elif ' ' in ecnumber:
        ecnumber_list = ecnumber.split(' ')
        for number in ecnumber_list:
            if number:
                if 'EC:' in number:
                    vmh_enzymes_to_add.append([reaction_id, number])
                else:
                    number_temp = f"EC:{number.strip()}"
                    vmh_enzymes_to_add.append([reaction_id, number_temp])
    
    elif 'EC:' in ecnumber:
        vmh_enzymes_to_add.append([reaction_id, ecnumber])
    else:
        number_temp = f"EC:{ecnumber.strip()}"
        vmh_enzymes_to_add.append([reaction_id, number_temp])

vmh_reaction_enzymes.apply(extract_vmh_enzymes, axis=1)

2        None
3        None
4        None
5        None
6        None
         ... 
22134    None
22135    None
22136    None
22137    None
22139    None
Length: 6465, dtype: object

In [62]:
pattern = re.compile(r"EC:[1-9]{1}.[1-9]{1}[0-9]{0,1}.[-1-9]{1}[0-9]{0,1}.[-1-9]{1}")

In [63]:
cleaned_vmh_enzymes = []

for idx, enzyme in enumerate(vmh_enzymes_to_add):
    match = pattern.match(enzyme[1])
    
    if match:
        cleaned_vmh_enzymes.append(enzyme)

In [64]:
enzymes_to_add = pd.concat([enzymes_to_add, pd.DataFrame(cleaned_vmh_enzymes, columns=['menuguide_id', 'enzyme_id'])], axis=0)

In [65]:
enzymes_to_add = enzymes_to_add.drop_duplicates()

In [66]:
enzymes_to_add

Unnamed: 0,enzyme_id,menuguide_id
0,EC 1.4.3.21,reaction_1391
1,EC 1.4.3.21,reaction_1785
2,EC 1.4.3.21,reaction_1901
3,EC 1.4.3.21,reaction_1974
4,EC 1.4.3.21,reaction_2386
...,...,...
7183,EC:6.3.4.14,reaction_22136
7184,EC:2.6.1.82,reaction_22137
7185,EC:5.1.3.2,reaction_22138
7186,EC:1.1.1.79,reaction_22140


In [67]:
def add_gene_to_dict(row):
    reaction_id = row['menuguide_id']
    gene_symbol = row['symbol']
    if reaction_id in reactions_dict:
        entry = reactions_dict[reaction_id]
        if 'genes' in entry:
            entry['genes'].add(gene_symbol)
        else:
            entry['genes'] = {gene_symbol}
            
def add_enzyme_to_dict(row):
    reaction_id = row['menuguide_id']
    enzyme = row['enzyme_id']
    enzyme = enzyme.replace('EC ', 'EC:')
    if reaction_id in reactions_dict:
        entry = reactions_dict[reaction_id]
        if 'enzymes' in entry:
            entry['enzymes'].add(enzyme)
        else:
            entry['enzymes'] = {enzyme}

In [68]:
genes_to_add = genes_to_add[~genes_to_add.symbol.str.isnumeric()]

In [69]:
enzymes_to_add

Unnamed: 0,enzyme_id,menuguide_id
0,EC 1.4.3.21,reaction_1391
1,EC 1.4.3.21,reaction_1785
2,EC 1.4.3.21,reaction_1901
3,EC 1.4.3.21,reaction_1974
4,EC 1.4.3.21,reaction_2386
...,...,...
7183,EC:6.3.4.14,reaction_22136
7184,EC:2.6.1.82,reaction_22137
7185,EC:5.1.3.2,reaction_22138
7186,EC:1.1.1.79,reaction_22140


In [70]:
genes_to_add.apply(add_gene_to_dict, axis=1)
enzymes_to_add.apply(add_enzyme_to_dict, axis=1)

0       None
1       None
2       None
3       None
4       None
        ... 
7183    None
7184    None
7185    None
7186    None
7187    None
Length: 10009, dtype: object

In [71]:
reactions_dict

{'reaction_0': {'kegg_id': 'R00001',
  'description': 'polyphosphate polyphosphohydrolase',
  'substrates': {'compound_2300', 'compound_5003', 'compound_5449'},
  'products': {'compound_4433', 'compound_7069'}},
 'reaction_1': {'kegg_id': 'R00002',
  'description': 'Reduced ferredoxin:dinitrogen oxidoreductase (ATP-hydrolysing)',
  'substrates': {'compound_2019',
   'compound_2300',
   'compound_2460',
   'compound_3505',
   'compound_5003',
   'compound_925'},
  'products': {'compound_2018',
   'compound_3473',
   'compound_4400',
   'compound_4401',
   'compound_5006',
   'compound_744',
   'compound_9361'}},
 'reaction_2': {'kegg_id': 'R00005',
  'description': 'urea-1-carboxylate amidohydrolase',
  'substrates': {'compound_2300',
   'compound_5003',
   'compound_6210',
   'compound_6211',
   'compound_808'},
  'products': {'compound_1654', 'compound_4158'},
  'enzymes': {'EC:3.6.1.1'}},
 'reaction_3': {'kegg_id': 'R00006',
  'description': 'pyruvate:pyruvate acetaldehydetransferase

## Add reactions to graph

In [72]:
def find_enzyme(ec_number):
    query = prepareQuery(
        f"""SELECT ?iri
            WHERE {{
              ?iri <http://www.geneontology.org/formats/oboInOwl#hasDbXref> '{ec_number}' .
              FILTER(isIRI(?iri))
            }}
        """,
    )
    
    result = onto_graph.query(query)
    return list(result)

In [73]:
ec_test = list(reactions_dict['reaction_23']['enzymes'])[0]

In [74]:
results = find_enzyme(ec_test)

In [75]:
for term in results:
    print(term[0])

http://purl.obolibrary.org/obo/GO_0003834


In [76]:
mapped = set()
not_mapped = set()

for reaction_name, reaction in reactions_dict.items():
    if 'enzymes' in reaction:
        enzymes = reaction['enzymes']
        for enzyme in enzymes:
            results = find_enzyme(enzyme)
            if results:
                mapped.add(enzyme)
            else:
                not_mapped.add(enzyme)

In [77]:
len(mapped)

1735

In [78]:
len(not_mapped)

245

In [79]:
def add_triple_property(subject, predicate, object, prefix=''):
    if pd.notna(object):
        if prefix:
            object = prefix + object
        onto_graph.add((subject, predicate, Literal(object)))
        
def class_exists(iri):
    # SPARQL query to check if the IRI exists in the graph
    query = prepareQuery(
        """ASK
           WHERE {
              VALUES ?iri_to_check { <""" + str(iri) + """> }
              ?iri_to_check ?p ?o .
           }""",
    )
    
    exists = onto_graph.query(query)
    return exists
        
def add_reactant(subject, predicate, object_uri):
    if pd.notna(object_uri):
        iri_to_check = URIRef(object_uri)
        
        if class_exists(iri_to_check):
            onto_graph.add((subject, predicate, iri_to_check))

In [80]:
def add_reaction_content_to_graph(reaction_name, reaction_entry):
    reaction = URIRef(MeNuGUIDE[reaction_name])
    
    onto_graph.add((reaction, RDF.type, MeNuGUIDE.Reaction))
    
    # TODO: make it reaction abbreviation?
    onto_graph.add((reaction, URIRef("http://www.w3.org/2000/01/rdf-schema#label"), Literal(reaction_name)))
    
    # Description - http://purl.obolibrary.org/obo/IAO_0000115 (used in ChEBI ontology)
    if 'description' in reaction_entry:
        add_triple_property(reaction, OBO.IAO_0000115, reaction_entry['description'])
    
    # KEGG - http://www.geneontology.org/formats/oboInOwl#hasDbXref 
    if 'kegg_id' in reaction_entry:
        add_triple_property(reaction, GO.hasDbXref, reaction_entry['kegg_id'], 'KEGG:')
        
    # VMH - http://www.geneontology.org/formats/oboInOwl#hasDbXref 
    if 'vmh_id' in reaction_entry:
        if type(reaction_entry['vmh_id']) == list:
            for vmh_identifier in reaction_entry['vmh_id']:
                 add_triple_property(reaction, GO.hasDbXref, vmh_identifier, 'VMH:')
        else:
            add_triple_property(reaction, GO.hasDbXref, reaction_entry['vmh_id'], 'VMH:')
    
    # input of - http://purl.obolibrary.org/obo/RO_0002352
    # has input - http://purl.obolibrary.org/obo/RO_0002233
    if 'substrates' in reaction_entry: 
        for substrate in reaction_entry['substrates']:
            add_reactant(MeNuGUIDE[substrate], OBO.RO_0002352, reaction)
            add_reactant(reaction, OBO.RO_0002233, MeNuGUIDE[substrate])
    
    # output of - http://purl.obolibrary.org/obo/RO_0002353
    # has output - http://purl.obolibrary.org/obo/RO_0002234
    if 'products' in reaction_entry:
        for product in reaction_entry['products']:
            add_reactant(MeNuGUIDE[product], OBO.RO_0002353, reaction)
            add_reactant(reaction, OBO.RO_0002234, MeNuGUIDE[product])
    
    # has function - http://purl.obolibrary.org/obo/RO_0000085
    # participates in - http://purl.obolibrary.org/obo/RO_0000056
    # has participant - http://purl.obolibrary.org/obo/RO_0000057
    if 'enzymes' in reaction_entry:
        for enzyme in reaction_entry['enzymes']:
            enzyme_uri = URIRef(MeNuGUIDE[enzyme])
            onto_graph.add((enzyme_uri, RDF.type, MeNuGUIDE.Enzyme))
            onto_graph.add((enzyme_uri, URIRef("http://www.w3.org/2000/01/rdf-schema#label"), Literal(enzyme)))
            
            onto_graph.add((reaction, OBO.RO_0000057, enzyme_uri))
            onto_graph.add((enzyme_uri, OBO.RO_0000056, reaction))
            
            go_terms_found = find_enzyme(ec_test)
            if go_terms_found:
                for term in go_terms_found:
                    onto_graph.add((enzyme_uri, OBO.RO_0000085, term[0]))
    
    # involved in - http://purl.obolibrary.org/obo/RO_0002331
    if 'genes' in reaction_entry:
        for gene in reaction_entry['genes']:
            gene_uri = URIRef(MeNuGUIDE[gene])
            onto_graph.add((gene_uri, RDF.type, MeNuGUIDE.Gene))
            onto_graph.add((gene_uri, URIRef("http://www.w3.org/2000/01/rdf-schema#label"), Literal(gene)))
            
            onto_graph.add((gene_uri, OBO.RO_0002331, reaction))

In [81]:
for reaction_id, reaction_entry in reactions_dict.items():
    print(reaction_id)
    print(reaction_entry)
    if reaction_id == "reaction_15":
        break

reaction_0
{'kegg_id': 'R00001', 'description': 'polyphosphate polyphosphohydrolase', 'substrates': {'compound_5449', 'compound_2300', 'compound_5003'}, 'products': {'compound_7069', 'compound_4433'}}
reaction_1
{'kegg_id': 'R00002', 'description': 'Reduced ferredoxin:dinitrogen oxidoreductase (ATP-hydrolysing)', 'substrates': {'compound_2300', 'compound_925', 'compound_2019', 'compound_3505', 'compound_2460', 'compound_5003'}, 'products': {'compound_744', 'compound_4401', 'compound_9361', 'compound_3473', 'compound_4400', 'compound_5006', 'compound_2018'}}
reaction_2
{'kegg_id': 'R00005', 'description': 'urea-1-carboxylate amidohydrolase', 'substrates': {'compound_2300', 'compound_6210', 'compound_808', 'compound_6211', 'compound_5003'}, 'products': {'compound_4158', 'compound_1654'}, 'enzymes': {'EC:3.6.1.1'}}
reaction_3
{'kegg_id': 'R00006', 'description': 'pyruvate:pyruvate acetaldehydetransferase (decarboxylating)', 'substrates': {'compound_1654', 'compound_6130'}, 'products': {'c

In [82]:
for reaction_id, reaction_entry in reactions_dict.items():
    print(reaction_id)
    add_reaction_content_to_graph(reaction_id, reaction_entry)

reaction_0
reaction_1
reaction_2
reaction_3
reaction_4
reaction_5
reaction_6
reaction_7
reaction_8
reaction_9
reaction_10
reaction_11
reaction_12
reaction_13
reaction_14
reaction_15
reaction_16
reaction_17
reaction_18
reaction_19
reaction_20
reaction_21
reaction_22
reaction_23
reaction_24
reaction_25
reaction_26
reaction_27
reaction_28
reaction_29
reaction_30
reaction_31
reaction_32
reaction_33
reaction_34
reaction_35
reaction_36
reaction_37
reaction_38
reaction_39
reaction_40
reaction_41
reaction_42
reaction_43
reaction_44
reaction_45
reaction_46
reaction_47
reaction_48
reaction_49
reaction_50
reaction_51
reaction_52
reaction_53
reaction_54
reaction_55
reaction_56
reaction_57
reaction_58
reaction_59
reaction_60
reaction_61
reaction_62
reaction_63
reaction_64
reaction_65
reaction_66
reaction_67
reaction_68
reaction_69
reaction_70
reaction_71
reaction_72
reaction_73
reaction_74
reaction_75
reaction_76
reaction_77
reaction_78
reaction_79
reaction_80
reaction_81
reaction_82
reaction_83
re

In [83]:
len(onto_graph)

25577969

In [84]:
def get_all_triples_for_iri(iri_to_check):
    sparql_query = f"""
    SELECT ?s ?p ?o
    WHERE {{
        {{ <{iri_to_check}> ?p ?o . }}
        UNION
        {{ ?s ?p <{iri_to_check}> . }}
    }}
    """
    
    # Execute the SPARQL query
    results = onto_graph.query(sparql_query)
    return list(results)

In [88]:
iri_test = URIRef(MeNuGUIDE.reaction_9)

triples = get_all_triples_for_iri(iri_test)

for triple in triples:
    print(triple)

(None, rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://MeNuGUIDE.local/Reaction'))
(None, rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label'), rdflib.term.Literal('reaction_9'))
(None, rdflib.term.URIRef('http://purl.obolibrary.org/obo/IAO_0000115'), rdflib.term.Literal('pyruvate:thiamin diphosphate acetaldehydetransferase (decarboxylating)'))
(None, rdflib.term.URIRef('http://www.geneontology.org/formats/oboInOwl#hasDbXref'), rdflib.term.Literal('KEGG:R00014'))
(None, rdflib.term.URIRef('http://www.geneontology.org/formats/oboInOwl#hasDbXref'), rdflib.term.Literal('VMH:HMR_8746'))
(None, rdflib.term.URIRef('http://purl.obolibrary.org/obo/RO_0002233'), rdflib.term.URIRef('http://MeNuGUIDE.local/compound_5012'))
(None, rdflib.term.URIRef('http://purl.obolibrary.org/obo/RO_0002233'), rdflib.term.URIRef('http://MeNuGUIDE.local/compound_404077'))
(None, rdflib.term.URIRef('http://purl.obolibrary.org/obo/RO_0002233'), rdflib.ter

In [86]:
reactions_dict['reaction_9']

{'kegg_id': 'R00014',
 'vmh_id': 'HMR_8746',
 'description': 'pyruvate:thiamin diphosphate acetaldehydetransferase (decarboxylating)',
 'substrates': {'compound_404077',
  'compound_4529',
  'compound_4784',
  'compound_5012'},
 'products': {'compound_1654', 'compound_3106', 'compound_3107'},
 'genes': {'PDHA1', 'PDHA2', 'PDHB'},
 'enzymes': {'EC:1.2.4.1'}}

In [87]:
onto_graph.serialize(destination=os.path.join(ontology_folder, "merged_with_foods_compounds_reactions.ttl"),
    format="turtle") 

<Graph identifier=N05dff0e96c2649b7a861678b835c583e (<class 'rdflib.graph.Graph'>)>