*MeNu GUIDE*

# Preprocessing HMDB Data

In [1]:
import os
import xml.etree.ElementTree as ET

import pandas as pd

In [None]:
# Folder that holds the downloaded HMDB export. Keep the trailing slash:
# file names are appended to this string directly.
hmdb_folder = "/path/to/downloaded/HMDB/data/"
# Folder the processed CSV is written to; the same trailing-slash rule applies.
processed_data_folder = "/path/to/processed/data/folder/"

## Load XML file
This can take a while, as the file is around 6 GB and `ET.parse` reads the whole document into memory.

In [2]:
# Parse the complete HMDB metabolite export and keep a handle on its root element.
hmdb_xml_path = f'{hmdb_folder}hmdb_metabolites.xml'
tree = ET.parse(hmdb_xml_path)
root = tree.getroot()

## Extract all metabolites and their properties

In [3]:
def _local_name(tag):
    """Return an element tag without its leading '{namespace}' part, if it has one."""
    # rsplit(...)[-1] also works for tags that carry no namespace, where
    # split('}')[1] would raise an IndexError.
    return tag.rsplit('}', 1)[-1]


# One dict per metabolite, mapping the tag name of each direct child element
# to its stripped text.
compounds = []

for metabolite in root:
    properties = {}

    for elem in metabolite:
        # Some texts are None, some only contain white space; both are skipped.
        text = (elem.text or '').strip()
        if text:
            properties[_local_name(elem.tag)] = text

    compounds.append(properties)

In [57]:
# One row per metabolite; the columns are the union of all tag names collected
# in `compounds`, with NaN where a metabolite has no value for a tag.
compounds_df = pd.DataFrame(compounds)

In [58]:
# Show the frame (pandas abbreviates the rendering to the first and last rows/columns).
compounds_df

Unnamed: 0,version,creation_date,update_date,accession,status,name,description,chemical_formula,average_molecular_weight,monisotopic_molecular_weight,...,knapsack_id,kegg_id,wikipedia_id,metlin_id,synthesis_reference,biocyc_id,bigg_id,vmh_id,phenol_explorer_compound_id,pdb_id
0,5.0,2005-11-16 15:48:42 UTC,2021-10-13 17:34:04 UTC,HMDB0000001,quantified,1-Methylhistidine,"1-Methylhistidine, also known as 1-MHis or 1MH...",C7H11N3O2,169.1811,169.085126611,...,C00052105,C01152,Methylhistidine,3741,"Jain, Rahul; Cohen, Louis A. Regiospecific alk...",,,,,
1,5.0,2005-11-16 15:48:42 UTC,2021-10-13 04:18:52 UTC,HMDB0000002,quantified,"1,3-Diaminopropane","1,3-Diaminopropane, also known as DAP or trime...",C3H10N2,74.1249,74.08439833,...,C00007404,C00986,"1,3-Diaminopropane",,"Takayanagi, Yasuyuki; Oohinata, Takahiro. Pre...",CPD-313,,,,
2,5.0,2005-11-16 15:48:42 UTC,2020-11-09 23:11:34 UTC,HMDB0000005,quantified,2-Ketobutyric acid,"2-Ketobutyric acid, also known as alpha-ketobu...",C4H6O3,102.0886,102.031694058,...,C00019675,C00109,Alpha-Ketobutyric_acid,,"Figge, Rainer; Lux, Fabien; Raynaud, Celine; S...",2-OXOBUTANOATE,33889,2OBUT,,
3,5.0,2005-11-16 15:48:42 UTC,2021-09-14 15:44:51 UTC,HMDB0000008,quantified,2-Hydroxybutyric acid,"2-Hydroxybutyric acid (CAS: 600-15-7), also kn...",C4H8O3,104.105,104.047344118,...,,C05984,2-Hydroxybutyric_acid,,"Carlier, J. P.; Henry, C.; Lorin, V.; Rouffign...",CPD-3564,,,,
4,5.0,2005-11-16 15:48:42 UTC,2021-09-14 15:41:25 UTC,HMDB0000010,quantified,2-Methoxyestrone,2-Methoxyestrone (or 2-ME1) belongs to the cla...,C19H24O3,300.3921,300.172544634,...,,C05299,2-Methoxyestrone,2578,"Stoelwinder, Johannes; Moers, Nicolaas Elisabe...",,,C05299,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217915,5.0,2021-10-01 18:05:59 UTC,2021-10-01 18:05:59 UTC,HMDB0304947,expected,Nordeoxycholic acid,,C23H38O4,378.553,378.277009704,...,,,,,,,,,,
217916,5.0,2021-10-01 18:07:16 UTC,2021-10-01 18:07:16 UTC,HMDB0304950,expected,3-Oxo-5beta-cholanoic acid,,C24H38O3,374.565,374.282095084,...,,,,,,,,,,
217917,5.0,2021-10-01 18:07:41 UTC,2021-10-01 18:07:41 UTC,HMDB0304951,expected,Glycerol 1-myristate,,C17H34O4,302.455,302.245709575,...,,,,,,,,,,
217918,5.0,2021-10-08 16:13:31 UTC,2021-10-08 16:22:12 UTC,HMDB0304953,expected,O-Phenolsulfonic acid,,C6H6O4S,174.17,173.998679847,...,,,,,,,,,,


In [59]:
# Columns that are discarded before the ChEBI clean-up.
columns_to_drop = [
    'version',
    'status',
    'creation_date',
    'update_date',
    'average_molecular_weight',
    'traditional_iupac',
    'state',
    'synthesis_reference',
]
compounds_df = compounds_df.drop(columns=columns_to_drop)

## Check for duplicates with the same ChEBI ID
HMDB contains some incorrect, duplicated annotations for certain external identifiers, among them ChEBI. As we want to merge the compounds from HMDB with ChEBI, we need to take care of this issue. Unfortunately, this has to be done manually. There may also be further wrongly assigned external identifiers that do not appear multiple times, but there is no easy way to check for them, as we cannot go through thousands of compounds by hand; this is something we may have to check at a later point in time.

In [60]:
# A ChEBI ID counts as duplicated from its second occurrence onwards; rows
# without a ChEBI ID are ignored.
has_chebi_id = compounds_df.chebi_id.notna()
is_repeated = compounds_df.chebi_id.duplicated()
duplicated_chebi_ids = compounds_df.loc[is_repeated & has_chebi_id, 'chebi_id'].tolist()
len(duplicated_chebi_ids)

139

In [14]:
# Collect every compound whose ChEBI ID occurs more than once, ordered by ID,
# so that the conflicting annotations can be checked by hand.
compounds_for_manual_inspection = compounds_df[compounds_df.chebi_id.isin(duplicated_chebi_ids)].sort_values(by='chebi_id')[['name', 'accession', 'chebi_id']]

# The manually curated file is read back from this very path further down, so
# an existing file must not be overwritten: re-running the notebook would
# otherwise replace the curated IDs with the raw export.
inspection_csv_path = f"{hmdb_folder}compounds_for_manual_chebi_id_inspection.csv"
if os.path.exists(inspection_csv_path):
    print(f"{inspection_csv_path} already exists - keeping it (delete the file to export again).")
else:
    compounds_for_manual_inspection.to_csv(inspection_csv_path)

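$\rightarrow$ Manual inspection step in between: the exported CSV is curated by hand. The following cells expect the curated file under the same name, separated by `;`, with an `hmdb_id` column and a `website check` column that holds the verified ID as `CHEBI:<number>` (it may be left empty).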

In [61]:
# Load the inspection table; this file is parsed with ';' as the separator.
manual_inspection_path = f"{hmdb_folder}compounds_for_manual_chebi_id_inspection.csv"
compounds_for_manual_inspection = pd.read_csv(manual_inspection_path, sep=';')
compounds_for_manual_inspection

Unnamed: 0.1,Unnamed: 0,name_hmdb,hmdb_id,chebi_id_hmdb,website check
0,10622,5-Hydroxy-7-(4-hydroxy-3-methoxyphenyl)-1-phen...,HMDB0029524,1030794,CHEBI:121564
1,14437,"1-Acetyl-3,14,20-trihydroxywitha-5,24-dienolid...",HMDB0033572,1030794,CHEBI:168690
2,216973,"(S)-2,3,4,5-tetrahydrodipicolinate",HMDB0303994,10980,CHEBI:16845
3,216972,(R+)-3-(4-hydroxyphenyl)lactate,HMDB0303993,10980,CHEBI:10980
4,6345,LysoPC(15:0/0:0),HMDB0010381,131924,CHEBI:131924
...,...,...,...,...,...
294,8332,SM(d18:0/24:1(15Z)(OH)),HMDB0013469,90006,CHEBI:90006
295,15238,S-Propyl 1-propanesulfinothioate,HMDB0034394,91021,CHEBI:91021
296,15242,Vinaginsenoside R17,HMDB0034398,91021,
297,11607,"Theaflavin 3,3'-digallate",HMDB0030551,975367,CHEBI:136608


### Integrate the manually corrected ChEBI IDs

In [62]:
def _chebi_number(value):
    """Return the bare ChEBI number of a 'CHEBI:<number>' entry; missing values stay NaN."""
    if pd.isna(value):
        return float('NaN')
    # Taking the last piece (rather than index 1) also accepts entries that no
    # longer carry the prefix, e.g. when this cell is run a second time.
    # Surrounding white space is removed because the IDs are compared as exact
    # strings afterwards.
    return str(value).split('CHEBI:', 1)[-1].strip()


compounds_for_manual_inspection['website check'] = compounds_for_manual_inspection['website check'].apply(_chebi_number)

In [63]:
# Map each inspected HMDB accession (column 'hmdb_id') to its manually verified
# ChEBI number (NaN where 'website check' is empty). The mapping is derived
# without changing compounds_for_manual_inspection itself, so this cell can be
# re-run; if an accession is listed more than once, the last entry wins.
compounds_correct_chebi_dict = (
    compounds_for_manual_inspection
    .set_index('hmdb_id')['website check']
    .to_dict()
)

In [64]:
# External identifier columns that cannot be trusted when HMDB's ChEBI ID for a
# compound turned out to be wrong.
columns_to_modify = ['chemspider_id', 'drugbank_id', 'foodb_id', 'pubchem_compound_id', 'knapsack_id', 'kegg_id', 'wikipedia_id', 'metlin_id', 'biocyc_id', 'bigg_id', 'vmh_id', 'phenol_explorer_compound_id', 'pdb_id']

adjusted_df = compounds_df.copy(deep=True).sort_values(by='chebi_id')

# Vectorised replacement for the row-wise apply calls (one pass per column over
# the whole frame); the outcome is the same.
accessions = adjusted_df['accession']
was_inspected = accessions.isin(list(compounds_correct_chebi_dict))
corrected_chebi_id = accessions.map(compounds_correct_chebi_dict)

# Inspected compounds whose original ChEBI ID differs from the verified one
# (an empty verified ID counts as different) lose their other external IDs.
chebi_id_was_wrong = was_inspected & (adjusted_df['chebi_id'] != corrected_chebi_id)
adjusted_df.loc[chebi_id_was_wrong, columns_to_modify] = float('NaN')

# Every inspected compound takes over the verified ChEBI ID, which may be NaN.
adjusted_df['chebi_id'] = adjusted_df['chebi_id'].mask(was_inspected, corrected_chebi_id)

In [65]:
# Rows that still repeat an already seen, non-missing ChEBI ID after the correction.
still_duplicated = adjusted_df.chebi_id.duplicated() & adjusted_df.chebi_id.notna()
adjusted_df.loc[still_duplicated, ['accession', 'name', 'chebi_id']]

Unnamed: 0,accession,name,chebi_id
39781,HMDB0059641,"4,4-Dimethyl-5-alpha-cholest-7-en-3-beta-ol",16455
41860,HMDB0062457,"cholest-5-en-3beta-yl (7Z,10Z,13Z,16Z,19Z-doco...",73910
40159,HMDB0060089,w Hydroxy testosterone,798
2873,HMDB0006759,3a-Hydroxy-5b-pregnane-20-one,1712


In [66]:
# Accessions removed from the final table.
accessions_to_remove = ['HMDB0059641', 'HMDB0006769', 'HMDB0006759', 'HMDB0010375']
adjusted_df = adjusted_df[~adjusted_df.accession.isin(accessions_to_remove)]

In [71]:
# Write the cleaned metabolite table without the index column.
processed_csv_path = f'{processed_data_folder}hmdb_metabolites.csv'
adjusted_df.to_csv(processed_csv_path, index=False)