*MeNu GUIDE*

# Match Exposome Explorer

In [1]:
import os.path
import pandas as pd

In [None]:
exposome_explorer_folder = '/path/to/exposome_explorer/data/folder/'
processed_data_folder = "/path/to/processed/data/folder/"

In [2]:
merged_databases = pd.read_csv(os.path.join(processed_data_folder, 'foodb_hmdb_markerdb_chebi.csv'), dtype={'drugbank_id': 'string', 'knapsack_id': 'string', 'wikipedia_id': 'string', 'biocyc_id': 'string', 'vmh_id': 'string', 'pdb_id': 'string', 'description': 'string', 'cas_number': 'string', 'kingdom': 'string', 'superclass': 'string', 'class': 'string', 'subclass': 'string', 'chebi_id': 'string', 'kegg_id': 'string','lipid_maps': 'string', 'meta_cyc': 'string', 'synonym': 'string', 'foodb_id': 'string', 'markerdb_id': 'string'})

exposome_explorer = pd.read_csv(os.path.join(exposome_explorer_folder, 'biomarkers.csv'), dtype={'ChEBI ID': 'string', 'PubChem ID': 'string'})

## Prepare Exposome Explorer Data

In [3]:
exposome_explorer_single = exposome_explorer[exposome_explorer.Level == 'Single']
exposome_explorer_single = exposome_explorer_single.drop(columns=['Level', 'Synonyms', 'No. of Publications', 'No. of Concentration values', 'No. of Reproducibility values', 'No. of Correlation values', 'No. of Metabolomic associations', 'No. of Microbiota associations', 'No. of Cancer associations', 'Average mass'])

In [4]:
exposome_explorer_single = exposome_explorer_single.rename(columns={'ID': 'exposome_explorer_id', 'Name': 'name_exposome_explorer', 'Classification': 'classification', 'Description': 'description_exposome_explorer', 'CAS Number': 'cas_number_exposome_explorer', 'PubChem ID': 'pubchem_compound_id_exposome_explorer', 'ChEBI ID': 'chebi_id_exposome_explorer', 'FooDB ID': 'foodb_id_exposome_explorer', 'HMDB ID': 'hmdb_id_exposome_explorer', 'SMILES': 'smiles_exposome_explorer', 'Formula': 'chemical_formula_exposome_explorer', 'InChI': 'inchi_exposome_explorer', 'InChIKey': 'inchikey_exposome_explorer', 'Mono. mass': 'mono_mass_exposome_explorer'})

## Merge

In [5]:
matched_exposome_eplorer_ids = set()

def match_exposome_explorer_via_chebi(row):
    chebi_id_match = exposome_explorer_single[exposome_explorer_single['chebi_id_exposome_explorer'] == row.chebi_id]
    if len(chebi_id_match) > 1:
        print('Error: more than one ChEBI match!')
    elif len(chebi_id_match) == 1:
        matched_exposome_eplorer_ids.add(list(chebi_id_match['exposome_explorer_id'])[0])
        return chebi_id_match.iloc[0]
    elif len(chebi_id_match) == 0:
        return pd.Series(None, index=exposome_explorer_single.columns)

def match_exposome_explorer_via_hmdb(row):
    if pd.isna(row["exposome_explorer_id"]):
        hmdb_id_match = exposome_explorer_single[exposome_explorer_single['hmdb_id_exposome_explorer'] == row.hmdb_id]
        if len(hmdb_id_match) > 1:
            print('Error: more than one HMDB match!')
        elif len(hmdb_id_match) == 1:
            ee_id = list(hmdb_id_match['exposome_explorer_id'])[0]
            if ee_id not in matched_exposome_eplorer_ids:
                matched_exposome_eplorer_ids.add(ee_id)
                return hmdb_id_match.iloc[0]
            else:
                return row[list(exposome_explorer_single.columns)]
        elif len(hmdb_id_match) == 0:
            return row[list(exposome_explorer_single.columns)]
    else: 
        return row[list(exposome_explorer_single.columns)]

def match_exposome_explorer_via_foodb(row):
    if pd.isna(row["exposome_explorer_id"]):
        foodb_id_match = exposome_explorer_single[exposome_explorer_single['foodb_id_exposome_explorer'] == row.foodb_id]
        if len(foodb_id_match) > 1:
            print('Error: more than one FooDB match!')
        elif len(foodb_id_match) == 1:
            ee_id = list(foodb_id_match['exposome_explorer_id'])[0]
            if ee_id not in matched_exposome_eplorer_ids:
                matched_exposome_eplorer_ids.add(ee_id)
                return foodb_id_match.iloc[0]
            else:
                return row[list(exposome_explorer_single.columns)]
        elif len(foodb_id_match) == 0:
            return row[list(exposome_explorer_single.columns)]
    else: 
        return row[list(exposome_explorer_single.columns)]

def match_exposome_explorer_via_name(row):
    if pd.isna(row["exposome_explorer_id"]):
        name_match = exposome_explorer_single[exposome_explorer_single['name_exposome_explorer'].str.lower() == row.name]
        if len(name_match) > 1:
            print('Error: more than one name match!')
        elif len(name_match) == 1:
            ee_id = list(name_match['exposome_explorer_id'])[0]
            if ee_id not in matched_exposome_eplorer_ids:
                matched_exposome_eplorer_ids.add(ee_id)
                return name_match.iloc[0]
            else:
                return row[list(exposome_explorer_single.columns)]
        elif len(name_match) == 0:
            return row[list(exposome_explorer_single.columns)]
    else: 
        return row[list(exposome_explorer_single.columns)]

In [6]:
merged_databases[exposome_explorer_single.columns] = merged_databases.apply(match_exposome_explorer_via_chebi, axis=1)
merged_databases[exposome_explorer_single.columns] = merged_databases.apply(match_exposome_explorer_via_hmdb, axis=1)
merged_databases[exposome_explorer_single.columns] = merged_databases.apply(match_exposome_explorer_via_foodb, axis=1)
merged_databases[exposome_explorer_single.columns] = merged_databases.apply(match_exposome_explorer_via_name, axis=1)

In [7]:
len(matched_exposome_eplorer_ids)

697

In [8]:
len(exposome_explorer_single)

980

In [9]:
unmatched_exposome_eplorer = exposome_explorer_single[~exposome_explorer_single.exposome_explorer_id.isin(matched_exposome_eplorer_ids)]
unmatched_exposome_eplorer = unmatched_exposome_eplorer.reindex(columns=merged_databases.columns)
unmatched_exposome_eplorer

Unnamed: 0,hmdb_id,name,chemical_formula,chemspider_id,drugbank_id,pubchem_compound_id,knapsack_id,wikipedia_id,metlin_id,biocyc_id,...,cas_number_exposome_explorer,pubchem_compound_id_exposome_explorer,chebi_id_exposome_explorer,foodb_id_exposome_explorer,hmdb_id_exposome_explorer,smiles_exposome_explorer,chemical_formula_exposome_explorer,inchi_exposome_explorer,inchikey_exposome_explorer,mono_mass_exposome_explorer
47,,,,,,,,,,,...,26539-01-5,161525,,,,OC(=O)CCC1=CC(O)=CC(O)=C1,C9H10O4,InChI=1S/C9H10O4/c10-7-3-6(1-2-9(12)13)4-8(11)...,ITPFIKQWNDGDLG-UHFFFAOYSA-N,182.057909
56,,,,,,,,,,,...,,,,,,CC(=O)N[C@H](CSCC(O)C(N)=O)C(O)=O,C8H14N2O5S,InChI=1S/C8H14N2O5S/c1-4(11)10-5(8(14)15)2-16-...,GFVUOIIZUCFXSF-LWOQYNTDSA-N,250.062343
57,,,,,,,,,,,...,81690-92-8,71312850,,,,CC(=O)N[C@H](CSCCC(N)=O)C(O)=O,C8H14N2O4S,InChI=1S/C8H14N2O4S/c1-5(11)10-6(8(13)14)4-15-...,GGBCHNJZQQEQRX-ZCFIWIBFSA-N,234.067428
80,,,,,,,,,,,...,2528-16-7,31736,,,,OC(=O)C1=CC=CC=C1C(=O)OCC1=CC=CC=C1,C15H12O4,InChI=1S/C15H12O4/c16-14(17)12-8-4-5-9-13(12)1...,XIKIUQUXDNHBFR-UHFFFAOYSA-N,256.073559
82,,,,,,,,,,,...,40321-99-1,170295,,,,CCC(CCC(C)O)COC(=O)C1=CC=CC=C1C(O)=O,C16H22O5,InChI=1S/C16H22O5/c1-3-12(9-8-11(2)17)10-21-16...,RYPQSGURZSTFSX-UHFFFAOYSA-N,294.146724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1200,,,,,,,,,,,...,84-81-1,5283547,,,,[H]\C(CC\C(C)=C(/[H])CC\C(C)=C(/[H])CC\C(C)=C(...,C41H56O2,InChI=1S/C41H56O2/c1-30(2)16-11-17-31(3)18-12-...,PFRQBZFETXBLTP-RCIYGOBDSA-N,580.428031
1201,,,,,,,,,,,...,523-40-0,9988135,,,,CC(C)=CCC\C(C)=C\CC\C(C)=C\CC\C(C)=C\CC\C(C)=C...,C61H88O2,InChI=1S/C61H88O2/c1-46(2)24-15-25-47(3)26-16-...,OCQQATZYCNAKQB-UQUNHUMXSA-N,852.678432
1202,,,,,,,,,,,...,19228-10-5,6442190,,,,CC(C)=CCC\C(C)=C\CC\C(C)=C\CC\C(C)=C\CC\C(C)=C...,C66H96O2,InChI=1S/C66H96O2/c1-50(2)26-16-27-51(3)28-17-...,YYDMANIEKFAEJC-RYZSZPJESA-N,920.741032
1208,,,,,,,,,,,...,,,,,,,,,,


In [10]:
all_data_merge = pd.concat([merged_databases, unmatched_exposome_eplorer])

  all_data_merge = pd.concat([merged_databases, unmatched_exposome_eplorer])


In [11]:
all_data_merge.info()

<class 'pandas.core.frame.DataFrame'>
Index: 404154 entries, 0 to 1210
Data columns (total 48 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   hmdb_id                                218847 non-null  object 
 1   name                                   403871 non-null  object 
 2   chemical_formula                       264262 non-null  object 
 3   chemspider_id                          76904 non-null   float64
 4   drugbank_id                            5187 non-null    string 
 5   pubchem_compound_id                    104206 non-null  float64
 6   knapsack_id                            11328 non-null   string 
 7   wikipedia_id                           10538 non-null   string 
 8   metlin_id                              1554 non-null    float64
 9   biocyc_id                              2634 non-null    string 
 10  bigg_id                                682 non-null     float64

## Clean up merged dataframe

In [12]:
all_data_merge = all_data_merge.drop(columns=['hmdb_id_exposome_explorer', 'foodb_id_exposome_explorer', 'chebi_id_exposome_explorer'])

In [13]:
columns_to_clean = ['name', 'description', 'cas_number', 'pubchem_compound_id', 'chemical_formula', 'mono_mass', 'inchi', 'inchikey', 'smiles']

for column in columns_to_clean:
    column_ee = column + '_exposome_explorer'

    all_data_merge[column] = all_data_merge.apply(lambda row: row[column] if pd.notna(row[column]) else row[column_ee], axis=1)
    all_data_merge = all_data_merge.drop(columns=[column_ee])

In [14]:
all_data_merge['name'] = all_data_merge['name'].apply(lambda x: x.lower())

In [15]:
all_data_merge.to_csv(os.path.join(processed_data_folder, 'foodb_hmdb_markerdb_chebi_exposomeexplorer.csv'), index=False)