#*MeNu GUIDE*

# Matching ChEBI
This merge is a bit more difficult, as ChEBI does not contain many FooDB, HMDB or MarkerDB identifiers. The three mentioned databases do contain ChEBI identifiers, but those do not necessarily match up.

At first I wanted to split the ChEBI dataframe into multiple ones, depending on whether a row has an HMDB ID, a FooDB ID or neither of the two, and then merge. Unfortunately though, there are unique ChEBI molecules with the same HMDB/FooDB identifiers. So I decided to merge just on the ChEBI ID. As merging based on the ChEBI ID provided by HMDB yields better results than the one provided by FooDB, the former was used.

In [1]:
import os.path
import pandas as pd

In [None]:
# Folder that the input CSVs are read from and the merged CSV is written to.
# The value below looks like a placeholder path; set it before running.
processed_data_folder = "/path/to/processed/data/folder/"

In [2]:
# ChEBI compounds; the listed identifier columns are read with the 'string' dtype.
chebi = pd.read_csv(
    os.path.join(processed_data_folder, 'chebi_compounds_with_accession.csv'),
    dtype=dict.fromkeys(['chebi_id', 'pdb_id'], 'string'),
)

# Combined FooDB / HMDB / MarkerDB table; the listed columns are read with
# the 'string' dtype, every other column is left to pandas' type inference.
merged_databases = pd.read_csv(
    os.path.join(processed_data_folder, 'foodb_hmdb_markerdb.csv'),
    dtype=dict.fromkeys(
        [
            'drugbank_id',
            'knapsack_id',
            'wikipedia_id',
            'biocyc_id',
            'vmh_id',
            'pdb_id',
            'description',
            'cas_number',
            'kingdom',
            'superclass',
            'class',
            'subclass',
            'chebi_id',
            'kegg_id',
            'lipid_maps',
            'meta_cyc',
            'synonym',
            'foodb_id',
            'markerdb_id',
        ],
        'string',
    ),
)

## Check for ChEBI IDs via name

In [3]:
# Normalise the ChEBI names (lower-case, surrounding whitespace removed) so
# that they can serve as lookup keys.
chebi['name'] = chebi.name.str.lower().str.strip()

# name -> chebi_id lookup table. Dictionary keys are unique, so ChEBI rows
# that share a name collapse into a single entry.
chebi_name_dict = chebi.set_index('name')['chebi_id'].to_dict()


def _chebi_id_by_name(row):
    """Return the ChEBI ID registered under the row's name, if there is one.

    Falls back to the ``chebi_id`` the row already carries when the name is
    missing or is not a key of ``chebi_name_dict``.
    """
    name = row['name']
    # A missing name is never a name match. NaN is a hashable float, so
    # without this guard a NaN key in the lookup table could hand a nameless
    # row the ChEBI ID of an unrelated nameless ChEBI entry.
    if pd.notna(name) and name in chebi_name_dict:
        return chebi_name_dict[name]
    return row['chebi_id']


# NOTE(review): merged_databases['name'] is compared as-is; this assumes it was
# already lower-cased and stripped upstream -- confirm.
merged_databases['chebi_id'] = merged_databases.apply(_chebi_id_by_name, axis=1)

## Merge dataframes

In [4]:
# Outer join on the ChEBI ID so that rows from both tables are retained.
# Columns present in both frames keep their plain name for the
# FooDB/HMDB/MarkerDB side and get a '_chebi' suffix for the ChEBI side.
merged_chebi = pd.merge(
    merged_databases,
    chebi,
    on='chebi_id',
    how='outer',
    suffixes=['', '_chebi'],
)

## Clean Up Data

### HMDB ID

In [5]:
# Distinct values of the hmdb_id column (the non-ChEBI side of the merge),
# kept in a set for fast membership tests.
hmdb_ids = set(merged_chebi.hmdb_id.unique())

In [6]:
# Blank out every ChEBI-provided HMDB ID that already occurs in the hmdb_id
# column, so that no duplicate HMDB IDs are introduced when the two columns
# are combined.
merged_chebi['hmdb_id_chebi'] = merged_chebi['hmdb_id_chebi'].apply(
    lambda hmdb_id: hmdb_id if hmdb_id not in hmdb_ids else None
)

### FooDB ID

In [7]:
# Distinct values of the foodb_id column (the non-ChEBI side of the merge),
# kept in a set for fast membership tests.
foodb_ids = set(merged_chebi.foodb_id.unique())

In [8]:
# Blank out every ChEBI-provided FooDB ID that already occurs in the foodb_id
# column; only IDs not seen there are kept.
merged_chebi['foodb_id_chebi'] = merged_chebi['foodb_id_chebi'].apply(
    lambda foodb_id: foodb_id if foodb_id not in foodb_ids else None
)

### Other columns

In [9]:
# Columns that exist on both sides of the merge; each one has a
# '<column>_chebi' counterpart that is folded into it further down.
columns_to_clean = [
    'name',
    'hmdb_id',
    'foodb_id',
    'description',
    'cas_number',
    'chemspider_id',
    'drugbank_id',
    'kegg_id',
    'knapsack_id',
    'lipid_maps',
    'pubchem_compound_id',
    'pdb_id',
    'wikipedia_id',
    'chemical_formula',
    'mono_mass',
    'inchi',
    'smiles',
]

In [10]:
# Fold every '<column>_chebi' helper column into its base column: the value
# that is already present is kept, and the ChEBI value is used only where the
# base column is null.
for column in columns_to_clean:
    column_chebi = column + '_chebi'

    # Series.combine_first fills the null entries of `column` with the entry
    # of `column_chebi` at the same index label. Both Series come from the
    # same frame, so this is the vectorised equivalent of evaluating
    # "row[column] if pd.notna(row[column]) else row[column_chebi]" per row,
    # without building a Series for every row of the frame once per column.
    merged_chebi[column] = merged_chebi[column].combine_first(merged_chebi[column_chebi])

# Drop all helper columns in one go instead of copying the frame per column.
merged_chebi = merged_chebi.drop(columns=[name + '_chebi' for name in columns_to_clean])

In [11]:
# Keep a row when it has no ChEBI ID or when it has a name; in other words,
# discard the rows that carry a ChEBI ID but no name.
merged_chebi = merged_chebi[merged_chebi['chebi_id'].isna() | merged_chebi['name'].notna()]

In [14]:
# Persist the merged table next to the input files, without the index column.
merged_chebi.to_csv(
    os.path.join(processed_data_folder, 'foodb_hmdb_markerdb_chebi.csv'),
    index=False,
)