*MeNu GUIDE*

# Matching MarkerDB
As a first step, HMDB was merged with FooDB. In this notebook, the merged dataset is additionally matched against MarkerDB.

In [1]:
import pandas as pd

In [None]:
# Folder holding the processed CSV files read and written by this notebook.
# Keep the trailing slash: the cells below build file paths by placing the
# file name directly after this string in an f-string.
processed_data_folder = "/path/to/processed/data/folder/"

In [72]:
# Columns of the merged FooDB/HMDB table that are read with pandas' "string" dtype
# instead of letting pandas infer a type for them.
foodb_hmdb_string_columns = [
    'drugbank_id', 'knapsack_id', 'wikipedia_id', 'biocyc_id', 'vmh_id', 'pdb_id',
    'description', 'cas_number', 'kingdom', 'superclass', 'class', 'subclass',
    'chebi_id', 'kegg_id', 'lipid_maps', 'meta_cyc', 'synonym', 'foodb_id',
]

# Result of the previous step (HMDB merged with FooDB).
foodb_hmdb = pd.read_csv(
    f'{processed_data_folder}foodb_hmdb.csv',
    dtype=dict.fromkeys(foodb_hmdb_string_columns, 'string'),
)

# MarkerDB compound table; column types are inferred by pandas.
markerdb = pd.read_csv(f"{processed_data_folder}markerdb_compounds.csv")

## Merge via hmdb_id

In [73]:
# MarkerDB column -> column name used for the merge and clean-up steps below.
# The dict order is also the column order of the resulting frame; only the
# columns listed here are kept.
markerdb_column_names = {
    'name': 'name',
    'description': 'description',
    'hmdb': 'hmdb_id',
    'moldb_smiles': 'smiles',
    'moldb_formula': 'chemical_formula',
    'moldb_inchi': 'inchi',
    'moldb_inchikey': 'inchikey',
    'moldb_iupac': 'iupac',
    'moldb_mono_mass': 'mono_mass',
    'mdbid': 'markerdb_id',
}

markerdb = (
    markerdb[list(markerdb_column_names)]
    .rename(columns=markerdb_column_names)
    # Compound names are lower-cased.
    .assign(name=lambda frame: frame['name'].str.lower())
)

In [74]:
# Outer-join the merged FooDB/HMDB table with MarkerDB on `hmdb_id`, so compounds that
# occur in only one of the two tables are kept. Columns present in both tables keep
# their name for the FooDB/HMDB side and get the `_markerdb` suffix for the MarkerDB side.
#
# The merge relies on every MarkerDB row having an HMDB ID. pandas matches null join
# keys against each other, so a MarkerDB row without an HMDB ID would be paired with
# any FooDB/HMDB row that also lacks one. Enforce the assumption instead of trusting it.
assert markerdb['hmdb_id'].notna().all(), \
    "MarkerDB contains rows without an HMDB ID; handle them before merging on hmdb_id"

df_merge = foodb_hmdb.merge(markerdb, how='outer', on='hmdb_id', suffixes=('', '_markerdb'))

## Clean up dataframe

In [77]:
# Show the column names of the merged frame.
# NOTE(review): the stored output contains no `*_markerdb` columns and this cell has a
# higher execution count than the clean-up cell below, so the output appears to have
# been recorded after that clean-up ran. Re-run the notebook top to bottom to confirm.
df_merge.columns

Index(['hmdb_id', 'name', 'chemical_formula', 'chemspider_id', 'drugbank_id',
       'pubchem_compound_id', 'knapsack_id', 'wikipedia_id', 'metlin_id',
       'biocyc_id', 'bigg_id', 'vmh_id', 'phenol_explorer_compound_id',
       'pdb_id', 'foodb_id_internal', 'kingdom', 'superclass', 'class',
       'subclass', 'lipid_maps', 'meta_cyc', 'synonym', 'foodb_id',
       'description', 'mono_mass', 'iupac', 'inchi', 'inchikey', 'cas_number',
       'smiles', 'chebi_id', 'kegg_id', 'markerdb_id'],
      dtype='object')

In [76]:
# Coalesce the columns that exist in both tables. Strategy: keep the value already
# present in the merged FooDB/HMDB column and fall back to the MarkerDB value only
# where that column is missing. The `_markerdb` helper column is dropped afterwards.
columns_to_clean = ['name', 'description', 'smiles', 'chemical_formula', 'inchi', 'inchikey', 'iupac', 'mono_mass']

for column in columns_to_clean:
    column_markerdb = column + '_markerdb'

    # Vectorised "first non-null" pick: where the existing value is present keep it,
    # otherwise take the MarkerDB value. This avoids calling a Python lambda once per
    # row (DataFrame.apply with axis=1) for each of the columns.
    df_merge[column] = df_merge[column].where(df_merge[column].notna(), df_merge[column_markerdb])
    df_merge = df_merge.drop(columns=[column_markerdb])

In [78]:
# Write the combined FooDB/HMDB/MarkerDB table to the processed-data folder.
# The DataFrame index is not written to the file.
merged_output_path = f'{processed_data_folder}foodb_hmdb_markerdb.csv'
df_merge.to_csv(merged_output_path, index=False)