*MeNu GUIDE*

# Match KEGG

In [1]:
import pandas as pd
import os

In [None]:
# Folder that contains the processed KEGG export (kegg_compounds_processed.csv).
kegg_folder = "/path/to/kegg/data/folder/"
# Folder that contains the merged input table; the result of this notebook is
# written back into the same folder.
processed_data_folder = "/path/to/processed/data/folder/"

In [2]:
# Load the table produced by the earlier merge steps. The listed identifier and
# free-text columns are forced to pandas' nullable 'string' dtype so that pandas
# does not infer a numeric or mixed dtype for them.
merged_data = pd.read_csv(
    os.path.join(processed_data_folder, 'foodb_hmdb_markerdb_chebi_exposomeexplorer.csv'),
    dtype=dict.fromkeys(
        [
            'drugbank_id', 'knapsack_id', 'wikipedia_id', 'biocyc_id', 'vmh_id',
            'pdb_id', 'description', 'cas_number', 'kingdom', 'superclass',
            'class', 'subclass', 'chebi_id', 'kegg_id', 'lipid_maps',
            'meta_cyc', 'synonym', 'foodb_id', 'markerdb_id', 'classification',
        ],
        'string',
    ),
)

# Load the processed KEGG compound table with pandas' default dtype inference.
kegg_compounds = pd.read_csv(
    os.path.join(kegg_folder, 'kegg_compounds_processed.csv')
)

## Prepare kegg dataframe

In [3]:
# Remove the KEGG columns that are not used in the rest of this notebook.
# NOTE(review): 'PubChem_subtance_id' (sic) presumably mirrors the spelling of the
# header in the KEGG CSV - confirm before "correcting" it.
kegg_compounds = kegg_compounds.drop(
    columns=[
        'molecular_weight',
        'NIKKAJI',
        '3DMET',
        'Drug group',
        'PubChem_subtance_id',
        'PDB-CCD',
        'pubchem_name',
    ]
)

In [4]:
# Rename the KEGG columns so that columns shared with the merged table get the
# same name; after the merge below they show up with a '_kegg' suffix.
kegg_compounds = kegg_compounds.rename(
    columns={
        'entry_id': 'kegg_id',
        'formula': 'chemical_formula',
        'exact_mass': 'mono_mass',
        'inchi_key': 'inchikey',
        'LIPIDMAPS': 'lipid_maps',
        'KNApSAcK': 'knapsack_id',
        'ChEBI': 'chebi_id',
        'CAS': 'cas_number',
    }
)

## Prepare merged dataframe: fill missing KEGG IDs by name

In [5]:
# Normalise the KEGG names (lower-case, surrounding whitespace removed) before
# they are used as lookup keys for the names in merged_data.
kegg_compounds['name'] = kegg_compounds['name'].str.lower().str.strip()

# Lookup table: normalised KEGG name -> KEGG ID.
# Rows without a name are excluded so that a missing name can never act as a
# lookup key (a NaN key could otherwise be hit by a merged row whose name is
# missing as well). If several KEGG entries share a name, the last one wins.
kegg_name_dict = (
    kegg_compounds.dropna(subset=['name'])
    .set_index('name')['kegg_id']
    .to_dict()
)

# Fill in a KEGG ID only where the merged table has none yet; existing IDs are
# never overwritten. Names without an exact match in the lookup stay NaN.
# The column is cast to object first so the result has the same dtype as the
# previous row-wise implementation (strings and float NaN).
has_kegg_id = merged_data['kegg_id'].notna()
name_matched_ids = merged_data['name'].map(kegg_name_dict)
merged_data['kegg_id'] = merged_data['kegg_id'].astype(object).where(has_kegg_id, name_matched_ids)

## Check duplicate KEGG IDs in merged dataset

In [6]:
# Show every row whose (non-missing) KEGG ID already appeared in an earlier row;
# the first occurrence of each ID is not flagged by duplicated().
merged_data.loc[
    merged_data['kegg_id'].duplicated() & merged_data['kegg_id'].notna(),
    ['name', 'kegg_id'],
]

Unnamed: 0,name,kegg_id
154,tannase,C05079
2292,cedrol,C09631
2394,sulfadimidine,C19530
10170,norharman,C20157
16767,2-oxopent-4-enoate,C00596
...,...,...
401903,sorbitol 6-phosphate,C01096
401932,sulfur,C00087
403746,tricin,C10193
403764,udp-glucose,C00029


In [7]:
# Example of a duplicated KEGG ID: all merged rows that carry C00596.
merged_data.loc[
    merged_data['kegg_id'] == "C00596",
    ['name', 'hmdb_id', 'foodb_id', 'chebi_id', 'kegg_id'],
]

Unnamed: 0,name,hmdb_id,foodb_id,chebi_id,kegg_id
11610,"cis-2-hydroxypenta-2,4-dienoic acid",,,1113,C00596
16767,2-oxopent-4-enoate,,,11641,C00596
148362,"2-hydroxypenta-2,4-dienoate",,,37319,C00596


In [8]:
# The KEGG entry for the same ID, for comparison with the merged rows.
kegg_compounds.loc[kegg_compounds['kegg_id'] == "C00596"]

Unnamed: 0,kegg_id,name,chemical_formula,mono_mass,lipid_maps,knapsack_id,chebi_id,cas_number,pubchem_compound_id,smiles,inchi,inchikey
545,C00596,"2-hydroxy-2,4-pentadienoate",C5H6O3,114.0317,,,1113 11641 37319,159694-16-3,5280361.0,C=CC=C(C(=O)O)O,"InChI=1S/C5H6O3/c1-2-3-4(6)5(7)8/h2-3,6H,1H2,(...",VHTQQDXPNUTMNB-ONEGZZNKSA-N


It seems that a single KEGG entry can map to several valid ChEBI IDs, probably because of hierarchy and stereoisomer differences between the two databases. None of these identifiers can safely be removed, so the duplicated KEGG IDs are left in place.

## Merge datasets

In [9]:
# Outer join on the KEGG ID: keeps merged rows without a KEGG match as well as
# KEGG compounds that are not in the merged table. Columns present in both
# frames keep their name for the merged side and get '_kegg' for the KEGG side.
merged_data_kegg = pd.merge(
    merged_data,
    kegg_compounds,
    how='outer',
    on='kegg_id',
    suffixes=('', '_kegg'),
)

In [10]:
# Rows that have a KEGG ID but received no KEGG name from the join, i.e. the ID
# was not matched by any row of kegg_compounds that has a name.
merged_data_kegg.loc[
    merged_data_kegg['kegg_id'].notna() & merged_data_kegg['name_kegg'].isna(),
    ['name', 'hmdb_id', 'foodb_id', 'chebi_id', 'kegg_id'],
]

Unnamed: 0,name,hmdb_id,foodb_id,chebi_id,kegg_id
1262,"pip(18:1(11z)/20:4(5z,8z,11z,14z))",HMDB0009967,FDB027154,142272,C00626
1263,pip2(16:0/18:0),HMDB0010035,FDB027218,145879,C00626
1264,"pip(16:0/20:4(5z,8z,11z,14z))",HMDB0009931,FDB027120,172859,C00626
1265,pip(16:1(9z)/16:1(9z)),HMDB0009939,FDB027126,191649,C00626
1266,pi(16:0/16:1(9z)),HMDB0009779,FDB026969,88396,C00626
...,...,...,...,...,...
22898,norbolethone,HMDB0006026,FDB023805,,D05204
22899,4-aminohippuric acid,HMDB0001867,FDB022720,104011,D06890
22900,difucosyllacto-n-hexaose a,HMDB0006622,FDB024011,88470,G01889
22901,p-lacto-n-hexaose,HMDB0006628,FDB024014,,G02993


In [11]:
# These KEGG IDs were not matched by any named compound in the KEGG table, so
# they are treated as invalid and blanked out. A missing 'name_kegg' is used as
# the signal that the outer join found no KEGG row for the ID.
# Done as one vectorised mask instead of a row-by-row apply.
missing_from_kegg = merged_data_kegg['kegg_id'].notna() & merged_data_kegg['name_kegg'].isna()
merged_data_kegg['kegg_id'] = merged_data_kegg['kegg_id'].mask(missing_from_kegg)

## Clean up data merge

In [12]:
# The ChEBI IDs coming from KEGG are discarded; only the merged table's own
# 'chebi_id' column is kept.
merged_data_kegg = merged_data_kegg.drop('chebi_id_kegg', axis='columns')

In [13]:
# Columns that exist on both sides of the merge. For each of them the value from
# the merged table is kept when present, otherwise the KEGG value
# ('<column>_kegg') is used; the '_kegg' helper columns are removed afterwards.
columns_to_clean = ['name', 'lipid_maps', 'knapsack_id', 'cas_number', 'pubchem_compound_id', 'chemical_formula', 'mono_mass', 'inchi', 'inchikey', 'smiles']

for column in columns_to_clean:
    column_kegg = column + '_kegg'

    # Vectorised coalesce (replaces a row-by-row apply over the whole frame).
    # Casting to object first lets values from differently typed columns be
    # combined (e.g. nullable 'string' on one side, inferred dtype on the other);
    # infer_objects() then restores a numeric dtype where every value is numeric.
    merged_data_kegg[column] = (
        merged_data_kegg[column]
        .astype(object)
        .where(merged_data_kegg[column].notna(), merged_data_kegg[column_kegg])
        .infer_objects()
    )

# Drop all helper columns in one go instead of copying the frame once per column.
merged_data_kegg = merged_data_kegg.drop(columns=[column + '_kegg' for column in columns_to_clean])

In [14]:
# Persist the KEGG-enriched table next to the input table; the DataFrame index
# is not written to the file.
merged_data_kegg.to_csv(
    os.path.join(
        processed_data_folder,
        "foodb_hmdb_markerdb_chebi_exposomeexplorer_kegg.csv",
    ),
    index=False,
)