# The Human Metabolome Database (HMDB)

Author: John Erol Evangelista<br/>
Adapted from: https://github.com/MaayanLab/HarmonizomePythonScripts/blob/master/HMDB/HMDB.ipynb <br/>
Date: 01-19 <br/>
Data Source: http://www.hmdb.ca/ <br/>
Notes: Used 2019 dataset and updated gene mapping values

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
if "/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts" not in sys.path:
    sys.path.append("/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts")
import utility_functions as uf
import xmltodict
import xml.etree.ElementTree as ET

%matplotlib inline

In [434]:
import scipy.spatial.distance as dist

In [475]:
# Re-import utility_functions so edits made to the module are picked up
# without restarting the kernel.
importlib.reload(uf)

<module 'utility_functions' from '/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts/utility_functions.py'>

## Load XML file

In [3]:
# Directory holding the raw HMDB download. The trailing slash is kept because
# data_path is a plain string prefix.
data_path = '/Users/maayan/sigsets/Harmonizome/Data/'

# Parse the whole metabolite dump into nested mappings; the file is opened in
# binary mode and handed to xmltodict as a file object.
xml_path = os.path.join(data_path, "hmdb_metabolites.xml")
with open(xml_path, "rb") as fd:
    d = xmltodict.parse(fd)

In [336]:
# Convert each parsed <metabolite> record into a plain dict (top level only).
a = list(map(dict, dict(d['hmdb'])['metabolite']))

In [337]:
# One row per metabolite record; the columns are the top-level keys of the
# records (see the preview below). Nested fields stay as dict-like objects.
df = pd.DataFrame.from_records(a)
df.head()

Unnamed: 0,abnormal_concentrations,accession,average_molecular_weight,bigg_id,biocyc_id,biological_properties,biospecimen_locations,cas_registry_number,cellular_locations,chebi_id,...,state,status,synonyms,synthesis_reference,taxonomy,tissue_locations,traditional_iupac,update_date,version,wikipidia
0,"{'concentration': [{'biospecimen': 'Blood', 'c...",HMDB0000001,169.1811,,CPD-1823,,"{'biospecimen': ['Blood', 'Cerebrospinal Fluid...",332-80-9,{'cellular': 'Cytoplasm'},50599,...,Solid,quantified,{'synonym': ['(2S)-2-amino-3-(1-Methyl-1H-imid...,"Jain, Rahul; Cohen, Louis A. Regiospecific alk...",{'description': 'This compound belongs to the ...,"{'tissue': ['Muscle', 'Skeletal Muscle']}",1 methylhistidine,2018-05-20 20:24:02 UTC,4.0,
1,"{'concentration': [{'biospecimen': 'Blood', 'c...",HMDB0000002,74.1249,36543.0,CPD-313,,"{'biospecimen': ['Blood', 'Feces', 'Urine']}",109-76-2,{'cellular': 'Cytoplasm'},15725,...,Liquid,quantified,"{'synonym': ['1,3-Propanediamine', '1,3-Propyl...","Takayanagi, Yasuyuki; Oohinata, Takahiro. Pre...",{'description': 'This compound belongs to the ...,,"α,ω-propanediamine",2018-05-20 06:57:20 UTC,4.0,
2,"{'concentration': {'biospecimen': 'Urine', 'co...",HMDB0000005,102.0886,33889.0,2-OXOBUTANOATE,,"{'biospecimen': ['Blood', 'Cerebrospinal Fluid...",600-18-0,{'cellular': 'Cytoplasm'},30831,...,Solid,quantified,"{'synonym': ['2-Ketobutanoic acid', '2-Oxobuty...","Figge, Rainer; Lux, Fabien; Raynaud, Celine; S...",{'description': 'This compound belongs to the ...,,2-oxobutanoic acid,2018-05-20 02:21:35 UTC,4.0,Alpha-ketobutyric_acid
3,"{'concentration': [{'biospecimen': 'Blood', 'c...",HMDB0000008,104.1045,47130.0,CPD-3564,,"{'biospecimen': ['Blood', 'Cerebrospinal Fluid...",600-15-7,"{'cellular': ['Cytoplasm', 'Extracellular']}",1148,...,Solid,quantified,"{'synonym': ['2-Hydroxybutanoic acid', 'alpha-...","Carlier, J. P.; Henry, C.; Lorin, V.; Rouffign...",{'description': 'This compound belongs to the ...,{'tissue': 'Prostate'},α-hydroxybutyric acid,2018-05-20 20:40:32 UTC,4.0,2-Hydroxybutyric acid
4,"{'concentration': {'biospecimen': 'Urine', 'co...",HMDB0000010,300.3921,,,,"{'biospecimen': ['Blood', 'Urine']}",362-08-3,"{'cellular': ['Extracellular', 'Membrane']}",1189,...,Solid,quantified,"{'synonym': ['2-(8S,9S,13S,14S)-3-Hydroxy-2-me...","Stoelwinder, Johannes; Moers, Nicolaas Elisabe...",{'description': 'This compound belongs to the ...,,2-methoxyestrone,2018-05-25 18:22:39 UTC,4.0,


In [338]:
df.shape

(114100, 54)

In [339]:
# Keep only the metabolite name and its nested protein associations.
df = df.loc[:, ['name', 'protein_associations']]
df.head()

Unnamed: 0,name,protein_associations
0,1-Methylhistidine,{'protein': [{'protein_accession': 'HMDBP00473...
1,"1,3-Diaminopropane",{'protein': [{'protein_accession': 'HMDBP00217...
2,2-Ketobutyric acid,{'protein': [{'protein_accession': 'HMDBP00012...
3,2-Hydroxybutyric acid,{'protein': [{'protein_accession': 'HMDBP00054...
4,2-Methoxyestrone,{'protein': [{'protein_accession': 'HMDBP00272...


In [366]:
df.loc[3039]

name                    CPA(18:1(11Z)/0:0)
protein_associations                  None
Name: 3039, dtype: object

In [367]:
# Collect every distinct gene symbol referenced by any metabolite.
gene_set = set()
for associations in df['protein_associations']:
    # Metabolites without protein associations hold None (falsy): skip them.
    if not associations:
        continue
    proteins = associations['protein']
    # 'protein' is a list when there are several associations and a single
    # mapping when there is only one; normalise to a list.
    if not isinstance(proteins, list):
        proteins = [proteins]
    gene_set.update(protein["gene_name"] for protein in proteins)

In [368]:
len(gene_set)

5402

In [373]:
# Flatten the nested protein associations into one (Metabolite, Gene) row per
# association.
#
# Rows are accumulated in plain Python lists and the DataFrame is built once at
# the end. Concatenating onto a growing DataFrame inside the loop copies all
# previously collected rows on every iteration, which is quadratic in the
# number of rows. The resulting frame has a fresh 0..N-1 index.
metabolite_col = []
gene_col = []
n_metabolites = len(df.index)

for i, (metabolite, associations) in enumerate(zip(df['name'], df['protein_associations'])):

    # Lightweight progress indicator, refreshed every 100 metabolites.
    if i%100 == 0:
        progressPercent = ((i+1)/n_metabolites)*100
        sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), n_metabolites))
        sys.stdout.flush()

    # Metabolites without protein associations hold None (falsy): nothing to add.
    if not associations:
        continue

    proteins = associations['protein']
    # 'protein' is a list when there are several associations and a single
    # mapping when there is only one; normalise to a list.
    if not isinstance(proteins, list):
        proteins = [proteins]

    for protein in proteins:
        metabolite_col.append(metabolite)
        gene_col.append(protein['gene_name'])

# Passing `columns` fixes the column order and yields the two expected columns
# even when no association was found at all.
df_interactions = pd.DataFrame(
    {'Metabolite': metabolite_col, 'Gene': gene_col},
    columns=['Metabolite', 'Gene'],
)

Progress: 99%  114001 Out of 114100   

In [375]:
df_interactions.head()

Unnamed: 0,Metabolite,Gene
0,1-Methylhistidine,CNDP1
1,1-Methylhistidine,PRMT3
0,"1,3-Diaminopropane",SMS
1,"1,3-Diaminopropane",DHPS
2,"1,3-Diaminopropane",ABP1


In [376]:
df_interactions.shape

(865701, 2)

In [377]:
len(set(df_interactions["Gene"]))

5402

In [378]:
# Move the current index into an 'index' column and renumber rows from 0.
df_interactions = df_interactions.reset_index()

In [379]:
# Discard the 'index' column left behind by reset_index.
df_interactions = df_interactions.drop(columns='index')

In [380]:
df_interactions.head()

Unnamed: 0,Metabolite,Gene
0,1-Methylhistidine,CNDP1
1,1-Methylhistidine,PRMT3
2,"1,3-Diaminopropane",SMS
3,"1,3-Diaminopropane",DHPS
4,"1,3-Diaminopropane",ABP1


In [381]:
# Keep a single row per distinct (Metabolite, Gene) pair.
df_interactions = df_interactions.drop_duplicates()

In [382]:
df_interactions.shape

(856563, 2)

## Map Gene Symbols To Up-to-date Gene Symbols

In [383]:
# Index the pairs by gene symbol ahead of the symbol-mapping step.
df_interactions = df_interactions.set_index('Gene')

In [384]:
df_interactions.head()

Unnamed: 0_level_0,Metabolite
Gene,Unnamed: 1_level_1
CNDP1,1-Methylhistidine
PRMT3,1-Methylhistidine
SMS,"1,3-Diaminopropane"
DHPS,"1,3-Diaminopropane"
ABP1,"1,3-Diaminopropane"


In [385]:
# The return value is not used: df_interactions is modified in place. Judging
# by the outputs below, outdated symbols are replaced (e.g. ABP1 -> AOC1) and
# the row count drops (856563 -> 851232).
# NOTE(review): presumably rows whose symbol cannot be mapped are removed --
# confirm in utility_functions.
uf.mapgenesymbols_updated(df_interactions)

Progress: 99%  856001 Out of 856563   

In [386]:
df_interactions.head()

Unnamed: 0_level_0,Metabolite
Gene,Unnamed: 1_level_1
CNDP1,1-Methylhistidine
PRMT3,1-Methylhistidine
SMS,"1,3-Diaminopropane"
DHPS,"1,3-Diaminopropane"
AOC1,"1,3-Diaminopropane"


In [387]:
df_interactions.shape

(851232, 1)

## Remove Duplicates

In [388]:
# Turn Gene back into a regular column, then drop (Gene, Metabolite) pairs that
# are duplicated after the symbol mapping.
df_interactions = df_interactions.reset_index().drop_duplicates()
df_interactions.shape

(836222, 2)

## Create Binary Matrix

In [389]:
# Build the gene-by-metabolite matrix from the de-duplicated pairs. Per the
# preview below, rows are gene symbols and columns are metabolite names.
# NOTE(review): presumably 1.0 marks an association and 0.0 its absence --
# confirm in utility_functions.
binary_matrix = uf.createBinaryMatrix(df_interactions)

Progress: 100%  5263 Out of 5263   

In [390]:
binary_matrix.head()

Unnamed: 0,Selenomethionine,TG(22:0/22:1(13Z)/o-18:0),"PC(22:2(13Z,16Z)/22:2(13Z,16Z))",TG(20:0/15:0/24:1(15Z)),TG(20:2n6/24:0/20:2n6),"TG(15:0/20:2n6/18:3(6Z,9Z,12Z))",PE(22:0/18:0),"TG(20:4(5Z,8Z,11Z,14Z)/24:1(15Z)/20:4(8Z,11Z,14Z,17Z))",TG(20:0/18:1(11Z)/24:0),Tolbutamide,...,"TG(20:3(5Z,8Z,11Z)/18:3(9Z,12Z,15Z)/22:5(4Z,7Z,10Z,13Z,16Z))",Ganglioside GQ1c (d18:1/16:0),"TG(14:0/18:1(11Z)/18:3(6Z,9Z,12Z))","PC(o-18:0/22:6(4Z,7Z,10Z,13Z,16Z,19Z))","TG(20:3n6/20:5(5Z,8Z,11Z,14Z,17Z)/20:5(5Z,8Z,11Z,14Z,17Z))","TG(22:5(7Z,10Z,13Z,16Z,19Z)/18:0/22:6(4Z,7Z,10Z,13Z,16Z,19Z))",PIP(18:1(9Z)/20:1(11Z)),Temsirolimus,"TG(22:2(13Z,16Z)/22:6(4Z,7Z,10Z,13Z,16Z,19Z)/18:3(9Z,12Z,15Z))","TG(18:3(6Z,9Z,12Z)/14:0/20:3n6)"
LRRK2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HOMER2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NPSR1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
POLR3B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CLDN12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [391]:
binary_matrix.shape

(5263, 24503)

## Save Binary Matrix

In [394]:
# Output directory for the files written below. The trailing slash matters:
# file names are built by plain string concatenation (out_path + '...').
# NOTE(review): absolute, machine-specific path.
out_path = '/Users/maayan/sigsets/Harmonizome/Output/HMDB/'

In [396]:
# File names carry a YYYY_MM stamp taken from today's date.
# NOTE(review): the name ends in ".zip" but the content is gzip-compressed, so
# readers must request gzip explicitly instead of relying on the extension.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = out_path + 'hmdb_binary_matrix_%s.tsv.zip' % date_tag
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

## Create Gene Set Library

In [430]:
name = 'hmdb_gene_set'

In [431]:
# Build the gene set library from binary_matrix.
# NOTE(review): the return value is unused; the result is presumably written
# under out_path using `name` -- confirm in utility_functions.
uf.createUpGeneSetLib(binary_matrix, out_path, name)

Progress: 100%  24503 Out of 24503   

## Create Attribute Library

In [432]:
name = 'hmdb_attribute_set'

In [433]:
# Build the attribute set library from binary_matrix.
# NOTE(review): the return value is unused; the result is presumably written
# under out_path using `name` -- confirm in utility_functions.
uf.createUpAttributeSetLib(binary_matrix, out_path, name)

Progress: 100%  5263 Out of 5263   

## Create Gene Similarity Matrix

In [446]:
# Gene-by-gene similarity computed from the rows of binary_matrix with the
# 'jaccard' metric (see the preview below: genes on both axes, 1.0 on the
# diagonal).
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [447]:
gene_similarity_matix.head()

Unnamed: 0,LRRK2,HOMER2,NPSR1,POLR3B,CLDN12,SNAPIN,EFHC1,COMMD1,BTF3P11,FUT9,...,ISCA2,SLC13A4,P4HA2,SMPD2,AARS2,CHST15,GSTT2B,ZNF219,COX4I1,FUT3
,,,,,,,,,,,,,,,,,,,,,
LRRK2,1.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0
HOMER2,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NPSR1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
POLR3B,0.285714,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
CLDN12,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [449]:
# File names carry a YYYY_MM stamp taken from today's date.
# NOTE(review): the name ends in ".zip" but the content is gzip-compressed, so
# readers must request gzip explicitly instead of relying on the extension.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = out_path + 'hmdb_gene_similarity_matix_%s.tsv.zip' % date_tag
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

## Create Attribute Similarity Matrix

In [450]:
# Same similarity computation on the transposed matrix, so metabolites
# (attributes) are compared with each other instead of genes.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [451]:
attribute_similarity_matix.head()

Unnamed: 0,Selenomethionine,TG(22:0/22:1(13Z)/o-18:0),"PC(22:2(13Z,16Z)/22:2(13Z,16Z))",TG(20:0/15:0/24:1(15Z)),TG(20:2n6/24:0/20:2n6),"TG(15:0/20:2n6/18:3(6Z,9Z,12Z))",PE(22:0/18:0),"TG(20:4(5Z,8Z,11Z,14Z)/24:1(15Z)/20:4(8Z,11Z,14Z,17Z))",TG(20:0/18:1(11Z)/24:0),Tolbutamide,...,"TG(20:3(5Z,8Z,11Z)/18:3(9Z,12Z,15Z)/22:5(4Z,7Z,10Z,13Z,16Z))",Ganglioside GQ1c (d18:1/16:0),"TG(14:0/18:1(11Z)/18:3(6Z,9Z,12Z))","PC(o-18:0/22:6(4Z,7Z,10Z,13Z,16Z,19Z))","TG(20:3n6/20:5(5Z,8Z,11Z,14Z,17Z)/20:5(5Z,8Z,11Z,14Z,17Z))","TG(22:5(7Z,10Z,13Z,16Z,19Z)/18:0/22:6(4Z,7Z,10Z,13Z,16Z,19Z))",PIP(18:1(9Z)/20:1(11Z)),Temsirolimus,"TG(22:2(13Z,16Z)/22:6(4Z,7Z,10Z,13Z,16Z,19Z)/18:3(9Z,12Z,15Z))","TG(18:3(6Z,9Z,12Z)/14:0/20:3n6)"
,,,,,,,,,,,,,,,,,,,,,
Selenomethionine,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TG(22:0/22:1(13Z)/o-18:0),0.0,1.0,0.028037,1.0,1.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
"PC(22:2(13Z,16Z)/22:2(13Z,16Z))",0.0,0.028037,1.0,0.028037,0.028037,0.028037,0.469136,0.028037,0.028037,0.0,...,0.028037,0.0,0.028037,0.571429,0.028037,0.028037,0.198473,0.0,0.028037,0.028037
TG(20:0/15:0/24:1(15Z)),0.0,1.0,0.028037,1.0,1.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
TG(20:2n6/24:0/20:2n6),0.0,1.0,0.028037,1.0,1.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0


In [453]:
# File names carry a YYYY_MM stamp taken from today's date.
# NOTE(review): the name ends in ".zip" but the content is gzip-compressed, so
# readers must request gzip explicitly instead of relying on the extension.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = out_path + 'hmdb_attribute_similarity_matix_%s.tsv.zip' % date_tag
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

## Create Gene List

In [476]:
# Table of the genes in binary_matrix; per the preview below it has one row
# per gene with its symbol (GeneSym) and numeric identifier (GeneID).
gene_list = uf.createGeneList_updated(binary_matrix)

Progress: 100%  5263 Out of 5263   

In [477]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,LRRK2,120892
1,HOMER2,9455
2,NPSR1,387129
3,POLR3B,55703
4,CLDN12,9069


In [478]:
gene_list.shape

(5263, 2)

In [479]:
# File names carry a YYYY_MM stamp taken from today's date.
# NOTE(review): the name ends in ".zip" but the content is gzip-compressed, so
# readers must request gzip explicitly instead of relying on the extension.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = out_path + 'hmdb_gene_list_%s.tsv.zip' % date_tag
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

## Create Attribute List

In [505]:
# Table of the attributes (metabolite names) of binary_matrix; per the outputs
# below it has one row per metabolite.
attribute_list = uf.createAttributeList(binary_matrix)

In [506]:
attribute_list.head()

Selenomethionine
TG(22:0/22:1(13Z)/o-18:0)
"PC(22:2(13Z,16Z)/22:2(13Z,16Z))"
TG(20:0/15:0/24:1(15Z))
TG(20:2n6/24:0/20:2n6)


In [486]:
attribute_list.shape

(24503, 1)

In [487]:
# File names carry a YYYY_MM stamp taken from today's date.
# NOTE(review): the name ends in ".zip" but the content is gzip-compressed, so
# readers must request gzip explicitly instead of relying on the extension.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = out_path + 'hmdb_attribute_list_%s.tsv.zip' % date_tag
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

## Create Gene-Attribute Edge List

In [584]:
name = 'hmdb_gene_attribute_edge_list'

In [585]:
# Build the gene-attribute edge list from the matrix and the two lookup tables.
# The count it reports below (836222) equals the number of de-duplicated
# gene-metabolite pairs.
# NOTE(review): the return value is unused; the result is presumably written
# under out_path using `name` -- confirm in utility_functions.
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, out_path, name)

Progress: 100%  24503 Out of 24503   

 The number of statisticaly relevent gene-attribute associations is: 836222
