*MeNu GUIDE*

# Match VMH/Recon3

In [33]:
import pandas as pd
import os

In [None]:
vmh_folder = '/path/to/vmh/data/folder/'
processed_data_folder = "/path/to/processed/data/folder/"

In [138]:
merged_data = pd.read_csv(os.path.join(processed_data_folder, "foodb_hmdb_markerdb_chebi_exposomeexplorer_kegg.csv"), dtype={'drugbank_id': 'string', 'knapsack_id': 'string', 'wikipedia_id': 'string', 'biocyc_id': 'string', 'vmh_id': 'string', 'pdb_id': 'string', 'description': 'string', 'cas_number': 'string', 'kingdom': 'string', 'superclass': 'string', 'class': 'string', 'subclass': 'string', 'chebi_id': 'string', 'kegg_id': 'string','lipid_maps': 'string', 'meta_cyc': 'string', 'synonym': 'string', 'foodb_id': 'string', 'markerdb_id': 'string', 'classification': 'string'})

## Clean up VMH ID duplicates

In [139]:
merged_data[(merged_data.vmh_id.duplicated()) & (merged_data.vmh_id.notna())][['hmdb_id', 'foodb_id', 'chebi_id', 'kegg_id', 'vmh_id', 'name']]

Unnamed: 0,hmdb_id,foodb_id,chebi_id,kegg_id,vmh_id,name
10,HMDB0000217,,25523.0,C00006,NADP,nadp
794,HMDB0242149,FDB001134,17151.0,C00379,XYLT,xylitol
1857,HMDB0302754,,37024.0,C00956,L2AADP,2-aminoadipic acid
94341,HMDB0012515,FDB029108,172659.0,,CE5855,11'-carboxy-alpha-chromanol
353289,HMDB0254327,,,,M03165,mannoheptulose


In [140]:
merged_data.loc[10, 'vmh_id'] = float('NaN') # HMDB0000217 
merged_data.loc[794, 'vmh_id'] = float('NaN') # HMDB0242149
merged_data.loc[1857, 'vmh_id'] = float('NaN') # HMDB0302754
merged_data.loc[94341, 'vmh_id'] = float('NaN') # HMDB0012515
merged_data.loc[353289, 'vmh_id'] = float('NaN') # HMDB0254327

## Match with VMH metabolites

In [141]:
vmh_metabolites = pd.read_csv(os.path.join(vmh_folder, "vmh_metabolites_all.csv"), dtype={'cheBlId': 'string'})

I have the following identifiers to merge with:
* iupac
* keggId
* pubChemId
* cheBlId
* inchiString
* inchiKey
* hmdb
* food_db

$\rightarrow$ start with hmdb, then foodb, then chebi

### Clean up VMH dataframe

In [142]:
vmh_metabolites['formula'] = vmh_metabolites.apply(lambda row: row.chargedFormula if pd.notna(row.chargedFormula) else row.neutralFormula, axis=1)

In [143]:
vmh_metabolites = vmh_metabolites.drop(columns=['Unnamed: 0', 'index', 'createdDate', 'updatedDate', 'lmId', 'ehmnId', 'hepatonetId', 'metanetx', 'seed', 'pdmapName', 'reconMap', 'golgimap', 'lysosomemap', 'mitochondrionmap', 'nucleusmap', 'reticulummap', 'peroxisomemap', 'epa_id', 'echa_id', 'iuphar_id', 'fda_id', 'mesh_id', 'chodb_id', 'isHuman', 'isMicrobe', 'charge', 'chargedFormula', 'neutralFormula', 'synonyms', 'met_id', 'miriam', 'avgmolweight', 'biggId', 'chembl'])

In [144]:
merged_data.columns

Index(['hmdb_id', 'name', 'chemical_formula', 'chemspider_id', 'drugbank_id',
       'pubchem_compound_id', 'knapsack_id', 'wikipedia_id', 'metlin_id',
       'biocyc_id', 'bigg_id', 'vmh_id', 'phenol_explorer_compound_id',
       'pdb_id', 'foodb_id_internal', 'kingdom', 'superclass', 'class',
       'subclass', 'lipid_maps', 'meta_cyc', 'synonym', 'foodb_id',
       'description', 'mono_mass', 'iupac', 'inchi', 'inchikey', 'cas_number',
       'smiles', 'chebi_id', 'kegg_id', 'markerdb_id', 'stars_chebi',
       'exposome_explorer_id', 'classification'],
      dtype='object')

In [145]:
vmh_metabolites = vmh_metabolites.rename(columns={'abbreviation': 'vmh_id', 'fullName': 'name', 'monoisotopicweight': 'mono_mass', 'keggId': 'kegg_id', 'pubChemId': 'pubchem_compound_id', 'cheBlId': 'chebi_id', 'inchiString': 'inchi', 'inchiKey': 'inchikey', 'hmdb': 'hmdb_id', 'reconMap3': 'recon3', 'food_db': 'foodb_id', 'chemspider': 'chemspider_id', 'biocyc': 'biocyc_id', 'wikipedia': 'wikipedia_id', 'drugbank': 'drugbank_id', 'knapsack': 'knapsack_id', 'phenolExplorer': 'phenol_explorer_compound_id', 'metlin': 'metlin_id', 'casRegistry': 'cas_number', 'formula': 'chemical_formula', 'smile': 'smiles'})
vmh_metabolites['name'] = vmh_metabolites['name'].str.lower()

In [146]:
vmh_metabolites_hmdb = vmh_metabolites[vmh_metabolites.hmdb_id.notna()][['vmh_id', 'hmdb_id']]
vmh_metabolites_foodb = vmh_metabolites[vmh_metabolites.foodb_id.notna()][['vmh_id', 'foodb_id']]
vmh_metabolites_chebi = vmh_metabolites[vmh_metabolites.chebi_id.notna()][['vmh_id', 'chebi_id']]
vmh_metabolites_name = vmh_metabolites[vmh_metabolites.name.notna()][['vmh_id', 'name']]

In [147]:
vmh_abbreviations_hmdb = dict(zip(vmh_metabolites_hmdb.hmdb_id, vmh_metabolites_hmdb.vmh_id))
vmh_abbreviations_foodb = dict(zip(vmh_metabolites_foodb.foodb_id, vmh_metabolites_foodb.vmh_id))
vmh_abbreviations_chebi = dict(zip(vmh_metabolites_chebi.chebi_id, vmh_metabolites_chebi.vmh_id))
vmh_abbreviations_name = dict(zip(vmh_metabolites_name.name, vmh_metabolites_name.vmh_id))

### Match as many VMH metabolites as possible

In [148]:
merged_data['vmh_id'] = merged_data.vmh_id.str.lower()
vmh_metabolites['vmh_id'] = vmh_metabolites.vmh_id.str.lower()

In [149]:
def match_vmh(row):
    hmdb_match = vmh_abbreviations_hmdb[row.hmdb_id] if (pd.notna(row.hmdb_id) and (row.hmdb_id in vmh_abbreviations_hmdb.keys())) else None
    foodb_match = vmh_abbreviations_foodb[row.foodb_id] if (pd.notna(row.foodb_id) and (row.foodb_id in vmh_abbreviations_foodb.keys())) else None
    chebi_match = vmh_abbreviations_chebi[row.chebi_id] if (pd.notna(row.chebi_id) and (row.chebi_id in vmh_abbreviations_chebi.keys())) else None
    name_match = vmh_abbreviations_name[row.name] if (pd.notna(row.name) and (row.name in vmh_abbreviations_name.keys())) else None

    if pd.notna(row.vmh_id):
        return row.vmh_id.lower()
    elif pd.notna(name_match):
        return name_match.lower()
    elif pd.notna(hmdb_match):
        return hmdb_match.lower()
    elif pd.notna(foodb_match):
        return foodb_match.lower()
    elif pd.notna(chebi_match):
        return chebi_match.lower()
    else:
        return None

In [150]:
print(merged_data.vmh_id.nunique())
print(merged_data.vmh_id.count())

1529
1529


In [151]:
merged_data['vmh_id'] = merged_data.apply(match_vmh, axis=1)

In [152]:
print(merged_data.vmh_id.nunique())
print(merged_data.vmh_id.count())

1859
2315


### Merge dataframes

In [158]:
merge_temp = merged_data.merge(vmh_metabolites, how='outer', on='vmh_id', suffixes=['', '_vmh'])

In [159]:
merge_temp = merge_temp.drop(columns=['hmdb_id_vmh', 'chebi_id_vmh', 'foodb_id_vmh'])

In [160]:
merge_temp.columns

Index(['hmdb_id', 'name', 'chemical_formula', 'chemspider_id', 'drugbank_id',
       'pubchem_compound_id', 'knapsack_id', 'wikipedia_id', 'metlin_id',
       'biocyc_id', 'bigg_id', 'vmh_id', 'phenol_explorer_compound_id',
       'pdb_id', 'foodb_id_internal', 'kingdom', 'superclass', 'class',
       'subclass', 'lipid_maps', 'meta_cyc', 'synonym', 'foodb_id',
       'description', 'mono_mass', 'iupac', 'inchi', 'inchikey', 'cas_number',
       'smiles', 'chebi_id', 'kegg_id', 'markerdb_id', 'stars_chebi',
       'exposome_explorer_id', 'classification', 'name_vmh', 'description_vmh',
       'iupac_vmh', 'mono_mass_vmh', 'kegg_id_vmh', 'pubchem_compound_id_vmh',
       'inchi_vmh', 'inchikey_vmh', 'smiles_vmh', 'recon3',
       'chemspider_id_vmh', 'biocyc_id_vmh', 'wikipedia_id_vmh',
       'drugbank_id_vmh', 'knapsack_id_vmh', 'phenol_explorer_compound_id_vmh',
       'metlin_id_vmh', 'cas_number_vmh', 'chemical_formula_vmh'],
      dtype='object')

In [161]:
columns_to_clean = ['name', 'description', 'iupac', 'kegg_id', 'drugbank_id', 'cas_number', 'pubchem_compound_id', 'chemspider_id', 'biocyc_id', 'wikipedia_id', 'knapsack_id', 'phenol_explorer_compound_id', 'metlin_id', 'chemical_formula', 'mono_mass', 'inchi', 'inchikey', 'smiles']

for column in columns_to_clean:
    column_vmh = column + '_vmh'

    merge_temp[column] = merge_temp.apply(lambda row: row[column] if pd.notna(row[column]) else row[column_vmh], axis=1)
    merge_temp = merge_temp.drop(columns=[column_vmh])

In [162]:
merge_temp.columns

Index(['hmdb_id', 'name', 'chemical_formula', 'chemspider_id', 'drugbank_id',
       'pubchem_compound_id', 'knapsack_id', 'wikipedia_id', 'metlin_id',
       'biocyc_id', 'bigg_id', 'vmh_id', 'phenol_explorer_compound_id',
       'pdb_id', 'foodb_id_internal', 'kingdom', 'superclass', 'class',
       'subclass', 'lipid_maps', 'meta_cyc', 'synonym', 'foodb_id',
       'description', 'mono_mass', 'iupac', 'inchi', 'inchikey', 'cas_number',
       'smiles', 'chebi_id', 'kegg_id', 'markerdb_id', 'stars_chebi',
       'exposome_explorer_id', 'classification', 'recon3'],
      dtype='object')

In [163]:
merge_temp.to_csv(os.path.join(processed_data_folder, "metabolites_all_databases_merged.csv"), index=False)