# The Human Metabolome Database (HMDB)

Author: John Erol Evangelista<br/>
Adapted from: https://github.com/MaayanLab/HarmonizomePythonScripts/blob/master/HMDB/HMDB.ipynb <br/>
Date: 01-19 <br/>
Data Source: http://www.hmdb.ca/ <br/>
Notes: Used 2019 dataset and updated gene mapping values

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
if "/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts" not in sys.path:
    sys.path.append("/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts")
import utility_functions as uf
import xmltodict
import xml.etree.ElementTree as ET

%matplotlib inline

In [2]:
# Re-import utility_functions so that edits made to the module on disk are
# picked up without restarting the kernel.
importlib.reload(uf)

<module 'utility_functions' from '/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts/utility_functions.py'>

## Load XML file

In [3]:
# Directory holding the downloaded HMDB dump, and the dump itself.
data_path = '/Users/maayan/sigsets/Harmonizome/Data/'
hmdb_xml_file = os.path.join(data_path, "hmdb_metabolites.xml")

# Parse the whole XML document into nested dict-like objects; the file is
# opened in binary mode and handed to xmltodict as a file object.
with open(hmdb_xml_file, "rb") as fd:
    d = xmltodict.parse(fd)

In [11]:
# Convert every metabolite entry of the parsed document into a plain dict.
a = list(map(dict, d['hmdb']['metabolite']))

In [13]:
# List the field names present on the first metabolite record.
a[0].keys()

dict_keys(['version', 'creation_date', 'update_date', 'accession', 'status', 'secondary_accessions', 'name', 'cs_description', 'description', 'synonyms', 'chemical_formula', 'average_molecular_weight', 'monisotopic_molecular_weight', 'iupac_name', 'traditional_iupac', 'cas_registry_number', 'smiles', 'inchi', 'inchikey', 'taxonomy', 'ontology', 'state', 'experimental_properties', 'predicted_properties', 'spectra', 'cellular_locations', 'biospecimen_locations', 'tissue_locations', 'pathways', 'normal_concentrations', 'abnormal_concentrations', 'diseases', 'drugbank_id', 'drugbank_metabolite_id', 'phenol_explorer_compound_id', 'phenol_explorer_metabolite_id', 'foodb_id', 'knapsack_id', 'chemspider_id', 'kegg_id', 'biocyc_id', 'bigg_id', 'wikipidia', 'nugowiki', 'metagene', 'metlin_id', 'pubchem_compound_id', 'het_id', 'chebi_id', 'synthesis_reference', 'general_references', 'protein_associations'])

In [15]:
# Build a table with one row per metabolite record; columns come from the
# record keys. Nested fields stay as dict-like objects inside the cells.
df = pd.DataFrame.from_records(a)
df.head()

Unnamed: 0,abnormal_concentrations,accession,average_molecular_weight,bigg_id,biocyc_id,biological_properties,biospecimen_locations,cas_registry_number,cellular_locations,chebi_id,...,state,status,synonyms,synthesis_reference,taxonomy,tissue_locations,traditional_iupac,update_date,version,wikipidia
0,"{'concentration': [{'biospecimen': 'Blood', 'c...",HMDB0000001,169.1811,,CPD-1823,,"{'biospecimen': ['Blood', 'Cerebrospinal Fluid...",332-80-9,{'cellular': 'Cytoplasm'},50599,...,Solid,quantified,{'synonym': ['(2S)-2-amino-3-(1-Methyl-1H-imid...,"Jain, Rahul; Cohen, Louis A. Regiospecific alk...",{'description': 'This compound belongs to the ...,"{'tissue': ['Muscle', 'Skeletal Muscle']}",1 methylhistidine,2018-05-20 20:24:02 UTC,4.0,
1,"{'concentration': [{'biospecimen': 'Blood', 'c...",HMDB0000002,74.1249,36543.0,CPD-313,,"{'biospecimen': ['Blood', 'Feces', 'Urine']}",109-76-2,{'cellular': 'Cytoplasm'},15725,...,Liquid,quantified,"{'synonym': ['1,3-Propanediamine', '1,3-Propyl...","Takayanagi, Yasuyuki; Oohinata, Takahiro. Pre...",{'description': 'This compound belongs to the ...,,"α,ω-propanediamine",2018-05-20 06:57:20 UTC,4.0,
2,"{'concentration': {'biospecimen': 'Urine', 'co...",HMDB0000005,102.0886,33889.0,2-OXOBUTANOATE,,"{'biospecimen': ['Blood', 'Cerebrospinal Fluid...",600-18-0,{'cellular': 'Cytoplasm'},30831,...,Solid,quantified,"{'synonym': ['2-Ketobutanoic acid', '2-Oxobuty...","Figge, Rainer; Lux, Fabien; Raynaud, Celine; S...",{'description': 'This compound belongs to the ...,,2-oxobutanoic acid,2018-05-20 02:21:35 UTC,4.0,Alpha-ketobutyric_acid
3,"{'concentration': [{'biospecimen': 'Blood', 'c...",HMDB0000008,104.1045,47130.0,CPD-3564,,"{'biospecimen': ['Blood', 'Cerebrospinal Fluid...",600-15-7,"{'cellular': ['Cytoplasm', 'Extracellular']}",1148,...,Solid,quantified,"{'synonym': ['2-Hydroxybutanoic acid', 'alpha-...","Carlier, J. P.; Henry, C.; Lorin, V.; Rouffign...",{'description': 'This compound belongs to the ...,{'tissue': 'Prostate'},α-hydroxybutyric acid,2018-05-20 20:40:32 UTC,4.0,2-Hydroxybutyric acid
4,"{'concentration': {'biospecimen': 'Urine', 'co...",HMDB0000010,300.3921,,,,"{'biospecimen': ['Blood', 'Urine']}",362-08-3,"{'cellular': ['Extracellular', 'Membrane']}",1189,...,Solid,quantified,"{'synonym': ['2-(8S,9S,13S,14S)-3-Hydroxy-2-me...","Stoelwinder, Johannes; Moers, Nicolaas Elisabe...",{'description': 'This compound belongs to the ...,,2-methoxyestrone,2018-05-25 18:22:39 UTC,4.0,


In [16]:
# (rows, columns) of the full metabolite table.
df.shape

(114100, 54)

In [18]:
# Keep only the metabolite name and its protein-association data.
wanted_columns = ['name', 'protein_associations']
df = df.loc[:, wanted_columns]
df.head()

Unnamed: 0,name,protein_associations
0,1-Methylhistidine,{'protein': [{'protein_accession': 'HMDBP00473...
1,"1,3-Diaminopropane",{'protein': [{'protein_accession': 'HMDBP00217...
2,2-Ketobutyric acid,{'protein': [{'protein_accession': 'HMDBP00012...
3,2-Hydroxybutyric acid,{'protein': [{'protein_accession': 'HMDBP00054...
4,2-Methoxyestrone,{'protein': [{'protein_accession': 'HMDBP00272...


In [39]:
# Build a long-format table of metabolite -> gene associations
# (one row per metabolite/protein pair, columns 'Metabolite' and 'Gene').
#
# A populated `protein_associations` cell is a dict-like object whose 'protein'
# entry is either a list of protein records or, when there is only one
# protein, a single bare record. Both shapes are normalised to a list below.
#
# Records are collected in a plain list and turned into a DataFrame once at
# the end; concatenating DataFrames inside the loop would be quadratic over
# the number of metabolites.
interaction_records = []
n_metabolites = len(df.index)

for i, (metabolite_name, associations) in enumerate(
        zip(df['name'], df['protein_associations'])):

    # Report progress only every 1000 rows so the notebook output channel
    # is not flooded.
    if i % 1000 == 0:
        progressPercent = ((i + 1) / n_metabolites) * 100
        sys.stdout.write("Progress: %d%%  %d Out of %d   \r"
                         % (progressPercent, (i + 1), n_metabolites))
        sys.stdout.flush()

    # Metabolites without associations come through as None/NaN; a plain
    # truthiness test would let NaN through, so check the type explicitly.
    if not isinstance(associations, dict):
        continue

    proteins = associations.get('protein')
    if proteins is None:
        continue
    if not isinstance(proteins, list):
        # Single protein: wrap the bare record so it is handled uniformly.
        proteins = [proteins]

    for protein in proteins:
        # A protein record lacking a gene name yields Gene = None; such rows
        # are kept here so that any filtering is an explicit later step.
        interaction_records.append({
            'Metabolite': metabolite_name,
            'Gene': protein.get('gene_name'),
        })

# Passing `columns` keeps the expected schema even if no records were found.
df_interactions = pd.DataFrame(interaction_records, columns=['Metabolite', 'Gene'])

Progress: 77%  88416 Out of 114100   

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Progress: 97%  110727 Out of 114100   

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Progress: 100%  114100 Out of 114100   

In [38]:
# Number of top-level entries in the protein_associations value for row label 1.
# NOTE(review): this counts the keys of the outer container, not the number of
# associated proteins -- the preview above shows a single 'protein' key.
len(df.loc[1]['protein_associations'])

1

In [41]:
# Scratch cell: quick check of the modulo operator.
3%4

3