*MeNu GUIDE*

# Clean up final metabolite dataframe

In [52]:
import os
import pandas as pd

In [None]:
processed_data_folder = "/path/to/processed/data/folder/"

In [36]:
merged_data = pd.read_csv(os.path.join(processed_data_folder, "metabolites_all_databases_merged.csv"), dtype={'drugbank_id': 'string', 'knapsack_id': 'string', 'wikipedia_id': 'string', 'biocyc_id': 'string', 'vmh_id': 'string', 'pdb_id': 'string', 'description': 'string', 'cas_number': 'string', 'kingdom': 'string', 'superclass': 'string', 'class': 'string', 'subclass': 'string', 'chebi_id': 'string', 'kegg_id': 'string','lipid_maps': 'string', 'meta_cyc': 'string', 'synonym': 'string', 'foodb_id': 'string', 'markerdb_id': 'string', 'classification': 'string', 'recon3': 'string'})

## Inchi Duplicates

In [37]:
duplicates_inchi = set(merged_data[merged_data.inchi.duplicated() & merged_data.inchi.notna()].inchi.unique())
merged_inchi_duplicated = merged_data[merged_data.inchi.isin(duplicates_inchi)].sort_values(by='inchi')

In [38]:
merged_inchi_duplicated[['name', 'hmdb_id', 'foodb_id', 'chebi_id', 'kegg_id', 'vmh_id', 'exposome_explorer_id', 'markerdb_id', 'inchi']]

Unnamed: 0,name,hmdb_id,foodb_id,chebi_id,kegg_id,vmh_id,exposome_explorer_id,markerdb_id,inchi
193279,aluminium sulfate (anhydrous),,,74768,,,,,"InChI=1S/2Al.3H2O4S/c;;3*1-5(2,3)4/h;;3*(H2,1,..."
193284,aluminium sulfate,,FDB013276,74772,,,,,"InChI=1S/2Al.3H2O4S/c;;3*1-5(2,3)4/h;;3*(H2,1,..."
402565,calcium octadecanoate,HMDB0303352,FDB011517,,,,,,InChI=1S/2C18H36O2.Ca/c2*1-2-3-4-5-6-7-8-9-10-...
115330,calcium stearate,,,190296,,,,,InChI=1S/2C18H36O2.Ca/c2*1-2-3-4-5-6-7-8-9-10-...
21765,iron(ii) lactate,HMDB0303462,FDB013684,,C17382,,,,"InChI=1S/2C3H6O3.Fe/c2*1-2(4)3(5)6;/h2*2,4H,1H..."
...,...,...,...,...,...,...,...,...,...
6086,zinc cation,,,63056,C00038,,,,InChI=1S/Zn/q+2
6108,h+,,FDB030899,,C00080,,,,InChI=1S/p+1
2813,hydrogen ion,HMDB0059597,,15378,C00080,h,,,InChI=1S/p+1
6107,proton,,,24636,C00080,,,,InChI=1S/p+1/i/hH


In [39]:
def find_conflicts(rows):
    row_columns = rows.columns
    
    merged_row = {}
    
    merge_conflict = False
    
    for index, row in rows.iterrows():
        #print(f"{index}: {row['name']}")
        for column in row_columns:
            if column not in merged_row:
               merged_row[column] = row[column]
            elif pd.isna(merged_row[column]):
                merged_row[column] = row[column]
            elif pd.notna(row[column]) and merged_row[column] != row[column]:
                if column in ['hmdb_id', 'chebi_id', 'foodb_id', 'markerdb_id', 'kegg_id', 'vmh_id', 'exposome_explorer_id']:
                    #print(f'Merge conflict: {merged_row[column]} != {row[column]} for {column}')
                    merge_conflict = True
                elif column == 'name':
                    if len(row[column]) < len(merged_row[column]):
                        merged_row[column] = row[column]
                    
    return merged_row, merge_conflict

In [40]:
solvable_duplicates = 0
merge_conflict_count = 0

merged_rows = []

for inchi in duplicates_inchi:
    merged_row, merge_conflict = find_conflicts(merged_data[merged_data.inchi == inchi])
    if merge_conflict:
        merge_conflict_count += 1
    else:
        solvable_duplicates += 1
        merged_rows.append(merged_row)

In [41]:
print(f"Solvable {solvable_duplicates} vs. not solvable {merge_conflict_count}")

Solvable 4212 vs. not solvable 3085


In [42]:
merged_rows_df = pd.DataFrame(merged_rows)

In [43]:
merged_rows_inchis = set(merged_rows_df.inchi.unique())
merged_data = merged_data[~merged_data.inchi.isin(merged_rows_inchis)]
merged_data_inchi_clean = pd.concat([merged_data, merged_rows_df])

## Clean up name duplicates

In [44]:
duplicated_names = set(merged_data_inchi_clean[merged_data_inchi_clean.name.duplicated()].name.unique())
merged_name_duplicated_inchi_clean = merged_data_inchi_clean[merged_data_inchi_clean.name.isin(duplicated_names)].sort_values(by='name')

In [45]:
merged_name_duplicated_inchi_clean

Unnamed: 0,hmdb_id,name,chemical_formula,chemspider_id,drugbank_id,pubchem_compound_id,knapsack_id,wikipedia_id,metlin_id,biocyc_id,...,inchikey,cas_number,smiles,chebi_id,kegg_id,markerdb_id,stars_chebi,exposome_explorer_id,classification,recon3
8997,,"(-)-5-oxo-1,2-campholide",C10H14O3,,,439864.0,,,,,...,UDJVKSCOEHSXBZ-UHFFFAOYSA-N,,CC1(C)[C@H]2CC(=O)O[C@]1(C)CC2=O,18130,C02952,,3.0,,,
26371,,"(-)-5-oxo-1,2-campholide",C10H14O3,,,439864.0,,,,,...,UDJVKSCOEHSXBZ-UHFFFAOYSA-N,,CC1(C2CC(=O)OC1(CC2=O)C)C,,C21930,,,,,
7674,HMDB0034976,(-)-borneol,C10H18O,1013314.0,,1201518.0,C00011024,,,,...,DTGKSKDOIYIVQL-QXFUBDJGSA-N,507-70-0,CC1(C)[C@H]2CC[C@]1(C)[C@H](O)C2,15394,C01411,,3.0,,,
7978,,(-)-borneol,C10H18O,,,439569.0,C00011024,,,,...,DTGKSKDOIYIVQL-CCNFQMFXSA-N,464-45-9,CC1(C2CCC1(C(C2)O)C)C,,C01766,,,,,
21921,HMDB0301807,(-)-pinoresinol,C26H32O11,,,11168362.0,C00031451,Pinoresinol,,,...,QLJNETOQFQXTLI-JKUDBEEXSA-N,81446-29-9,[H][C@@]12CO[C@@H](C3=CC=C(O)C(OC)=C3)[C@]1([H...,67245,C17529,,3.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105024,HMDB0013118,xanthurenate-8-o-beta-d-glucoside,C16H17NO9,30776690.0,,53481609.0,,,,,...,MYFHOUJDPFBJLH-XGJKELJWSA-N,97451-32-6,OC[C@H]1O[C@@H](OC2=C3N=C(C=C(O)C3=CC=C2)C(O)=...,179661,,,2.0,,,
8499,HMDB0301774,xylan,C5H10O6,59696179.0,,50909243.0,,Xylan,,,...,HEHIOFQJTRFOKM-ASQQECOQSA-N,9014-63-5,[H][C@]1(O)O[C@@]([H])(O)[C@]([H])(O)C([H])(O)...,37166,C02352,,3.0,,,
7099,,xylan,,,,,,,,,...,,9014-63-5,,,C00707,,,,,
6060,HMDB0006271,zymosterol intermediate 2,C27H44O,83724.0,,92746.0,C00023749,,,,...,CGSJXLIKVBJVRY-XTGBIJOFSA-N,128-33-6,[H][C@@](C)(CCC=C(C)C)[C@@]1([H])CC[C@@]2([H])...,52386,C05437,,3.0,,,zymst


In [46]:
solvable_duplicates_name = 0
merge_conflict_count_name = 0

merged_rows_name = []

for name in duplicated_names:
    merged_row_name, merge_conflict_name = find_conflicts(merged_data_inchi_clean[merged_data_inchi_clean.name == name])
    if merge_conflict_name:
        merge_conflict_count_name += 1
    else:
        solvable_duplicates_name += 1
        merged_rows_name.append(merged_row_name)

In [47]:
print(f"Solvable {solvable_duplicates_name} vs. not solvable {merge_conflict_count_name} of total {len(duplicated_names)}")

Solvable 475 vs. not solvable 255 of total 730


In [48]:
merged_rows_name_df = pd.DataFrame(merged_rows_name)
merged_rows_names = set(merged_rows_name_df.name.unique())
merged_data_inchi_clean = merged_data_inchi_clean[~merged_data_inchi_clean.name.isin(merged_rows_names)]
merged_data_inchi_names_clean = pd.concat([merged_data_inchi_clean, merged_rows_name_df])

In [49]:
merged_data_inchi_names_clean

Unnamed: 0,hmdb_id,name,chemical_formula,chemspider_id,drugbank_id,pubchem_compound_id,knapsack_id,wikipedia_id,metlin_id,biocyc_id,...,inchikey,cas_number,smiles,chebi_id,kegg_id,markerdb_id,stars_chebi,exposome_explorer_id,classification,recon3
0,HMDB0000972,10-formyltetrahydrofolate,C20H23N7O7,109092.0,,122347.0,C00007251,10-Formyltetrahydrofolate,5912.0,10-FORMYL-THF,...,AUFGTPPARQZWDO-YUZLPWPTSA-N,2800-34-2,NC1=NC(=O)C2=C(NCC(CN(C=O)C3=CC=C(C=C3)C(=O)N[...,15637,C00234,,3.0,,,10fthf
1,,10-formyltetrahydrofolate-[glu](5),C40H45N11O19,,,,,,,,...,RGSLBOWMZCMTRA-UHFFFAOYSA-H,,[H]OC1=NC(=N[H])N([H])C2=C1N([H])C([H])(C([H])...,,,,,,,10fthf5glu
2,,10-formyltetrahydrofolate-[glu](6),C45H51N12O22,,,,,,,,...,SFFBEUFYOPMKMV-CWMPVMSNSA-G,,[H]OC(=O)C([H])(N=C([O-])C([H])([H])C([H])([H]...,,,,,,,10fthf6glu
3,,10-formyltetrahydrofolate-[glu](7),C50H57N13O25,,,,,,,,...,VTHHVZBTNSHUDD-UHFFFAOYSA-F,,[H]OC1=NC(=N[H])N([H])C2=C1N([H])C([H])(C([H])...,,,,,,,10fthf7glu
4,,10-methyl-3-hydroxy-dodecanoyl-acp,C24H45N2O9PRS,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
470,,4-n-(n-acetyl-d-glucosaminyl)-protein,C13H20N4O8R2,,,,,,,,...,,,[H]OC([H])([H])[C@@]1([H])O[C@@]([H])(N([H])C(...,16447,C04375,,1.0,,,
471,HMDB0252153,f1alpha,C22H37N2O15X,,,72813999.0,,,,,...,JPQYDUJDUQNSHE-UHFFFAOYSA-N,,CC(=O)NC1C(O)C(O)C(COC2OC(CO)C(OC3OC(CO)C(O)C(...,,,,,,,f1a
472,HMDB0060094,"12,20-dioxo-leukotriene b4",C20H27O5,,,122164848.0,,,,,...,CPTWPKCLDUXOKW-QPPAMNDKSA-M,,[H]O[C@]([H])(C(\[H])=C(\[H])/C(/[H])=C(\[H])/...,134520,,,3.0,,,
473,,trna(cys),X,,,,,,,,...,,,,29167,C01639,,3.0,,,


In [100]:
merged_data_inchi_names_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 404823 entries, 0 to 474
Data columns (total 37 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   hmdb_id                      218847 non-null  object 
 1   name                         404823 non-null  object 
 2   chemical_formula             274129 non-null  object 
 3   chemspider_id                76654 non-null   float64
 4   drugbank_id                  5200 non-null    object 
 5   pubchem_compound_id          117492 non-null  float64
 6   knapsack_id                  12358 non-null   object 
 7   wikipedia_id                 10637 non-null   object 
 8   metlin_id                    2033 non-null    float64
 9   biocyc_id                    3132 non-null    object 
 10  bigg_id                      682 non-null     float64
 11  vmh_id                       5964 non-null    object 
 12  phenol_explorer_compound_id  346 non-null     float64
 13  pdb_id 

In [101]:
merged_data_inchi_names_clean.to_csv(os.path.join(processed_data_folder, 'compounds_all_databases_merged.csv'), index=False)

In [104]:
set(merged_data_inchi_names_clean.columns) - {'name', 'inchi', 'inchikey', 'chemical_formula', 'smiles', 'mono_mass', 'cas_number', 'description', 'iupac'}

{'bigg_id',
 'biocyc_id',
 'chebi_id',
 'chemspider_id',
 'class',
 'classification',
 'drugbank_id',
 'exposome_explorer_id',
 'foodb_id',
 'foodb_id_internal',
 'hmdb_id',
 'kegg_id',
 'kingdom',
 'knapsack_id',
 'lipid_maps',
 'markerdb_id',
 'meta_cyc',
 'metlin_id',
 'pdb_id',
 'phenol_explorer_compound_id',
 'pubchem_compound_id',
 'recon3',
 'stars_chebi',
 'subclass',
 'superclass',
 'synonym',
 'vmh_id',
 'wikipedia_id'}