# Metabolomics Workbench Gene Metabolite Associations
This notebook contains the script used to process the Metabolomics Workbench Gene Metabolite Association dataset. An edgelist from Metabolomics Workbench containing 1068 genes and 734 metabolites was processed to map gene symbols to approved and up-to-date gene symbols and IDs. The final edgelist contains 1050 genes and 734 metabolites.

In [None]:
import pandas as pd
import datetime
import numpy as np
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc
from tqdm import tqdm

# UMAP
from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

## Load and Pre-process Data

In [None]:
# Read the raw Metabolomics Workbench gene-metabolite edge list (tab-separated).
raw_edge_file = 'MW_gene_metabolite.tsv'
metabolomicswb = pd.read_csv(raw_edge_file, delimiter='\t')
metabolomicswb

In [None]:
# Build a lookup from column 1 of the mapping file to column 2 and use it to
# replace every gene symbol in the edge list.
mapping_table = pd.read_csv('../../mapping/mappingFile_2023.tsv', sep='\t', header=None)
genemapping = mapping_table.set_index(1)[2].to_dict()
metabolomicswb['gene_symbol'] = metabolomicswb['gene_symbol'].map(genemapping)

# Symbols absent from the mapping come back as NaN from .map(); drop those rows
# so later steps (str.upper, foreign-key lookups) only see mapped symbols.
# The result must be assigned: drop_duplicates()/reset_index() return new
# frames and were previously discarded.
metabolomicswb = (
    metabolomicswb
    .dropna(subset=['gene_symbol'])
    .drop_duplicates()
    .reset_index(drop=True)
)
metabolomicswb

In [None]:
# Number of distinct gene symbols and metabolite names.
# dropna=False keeps the count identical to len(Series.unique()).
n_genes = metabolomicswb['gene_symbol'].nunique(dropna=False)
n_metabolites = metabolomicswb['kegg_name'].nunique(dropna=False)
n_genes, n_metabolites

## Process Data for SQL

### Resource

In [None]:
# Resource table row, one field per line in column order:
#(id, name, acronym, long_description, short_description, url, num_attributes, num_datasets)
(
    110,  # id
    'Metabolomics Workbench',  # name
    'MW',  # acronym
    'Metabolomics Workbench supports fast and sensitive identification of aspects of the cellular metabolome and increases the inventory of chemically identifiable metabolites',  # long_description
    'Catalyze the study of metabolomics ',  # short_description
    'https://www.metabolomicsworkbench.org/',  # url
    734,  # num_attributes
    1,  # num_datasets
)

### Dataset

In [None]:
# Dataset table row, one field per line in column order:
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, is_signed, is_continuous_valued, last_updated, directory, num_page_views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural, is_archived)
(
    149,  # id
    'MW Gene Metabolite Associations',  # name
    'Gene Metabolite Associations',  # name_without_resource
    # description (fixed the missing space in "proteins curated")
    'biomolecular interactions between metabolites and proteins curated from experimental studies',
    'protein-metabolite associations from low-throughput or high-throughput metabolomics studies',  # association
    'interacting proteins for {0} metabolite from the MW Gene Metabolite Associations dataset.',  # gene_set_description
    'sets of interacting proteins for metabolites from the MW Gene Metabolite Associations dataset.',  # gene_sets_description
    'interacting metabolites for {0} protein from the MW Gene Metabolite Associations dataset.',  # attribute_set_description
    0,  # is_signed
    0,  # is_continuous_valued
    '2023-10-24',  # last_updated
    'mwmetabolites',  # directory
    0,  # num_page_views
    110,  # resource_fk
    3,  # measurement_fk
    4,  # dataset_group_fk
    10,  # attribute_type_fk
    2,  # attribute_group_fk
    'association by data aggregation',  # evidence_type
    'curated experimental data',  # evidence_group
    'mixed',  # measurement_bias
    'metabolites',  # attribute_type_plural
    0,  # is_archived
)

### Publication

In [None]:
# Publication table row, one field per line in column order:
#(id, long_citation, short_citation, url, pmid, pubmed_url, first_author_last_name, first_author_initials, journal_abbreviation, year, title, volume, pages)
(
    148,  # id
    'Sud M, et al (2016). Metabolomics Workbench: An international repository for metabolomics data and metadata, metabolite standards, protocols, tutorials and training, and analysis tools. Nucleic Acids Res. 44:D463-77',  # long_citation
    'Sud, Nucleic Acids Res, 2016',  # short_citation
    'dx.doi.org/10.1093/nar/gkv1042',  # url
    26467476,  # pmid
    'https://www.ncbi.nlm.nih.gov/pubmed/26467476',  # pubmed_url
    'Sud',  # first_author_last_name
    'M',  # first_author_initials
    'Nucleic Acids Res',  # journal_abbreviation
    2016,  # year
    'Metabolomics Workbench: An international repository for metabolomics data and metadata, metabolite standards, protocols, tutorials and training, and analysis tools',  # title
    44,  # volume
    'D463-77',  # pages
)

### Gene

In [None]:
# Print an SQL-style row for every gene in the edge list that is missing from
# the gene table, and record its newly assigned id in genefks.
index = 57617  # id given to the first newly printed gene row
genes = pd.read_csv('../../tables/gene.csv')
genelist = genes['symbol'].tolist()
genefks = genes.set_index('symbol')['id'].to_dict()
geneinfo = (
    pd.read_csv('../../tables/gene_info', sep='\t')
    .drop_duplicates('Symbol')
    .set_index('Symbol')
    .get(['GeneID', 'description'])
)

# Set membership is O(1); testing against the list re-scanned it for every gene.
known_symbols = set(genelist)

# NOTE(review): symbols are upper-cased here, but the Association cell looks up
# genefks with the un-modified symbol — confirm mapped symbols are already
# upper-case.
for gene in metabolomicswb['gene_symbol'].apply(str.upper).unique():
    if gene in known_symbols:
        continue
    gene_id = geneinfo.loc[gene, 'GeneID']
    description = geneinfo.loc[gene, 'description']
    print((index, gene, gene_id, description, 'https://ncbi.nlm.nih.gov/gene/'+str(gene_id)), end=',\n')
    genefks[gene] = index
    index += 1

### Attribute

In [None]:
# Print one attribute row per distinct metabolite name and remember the id
# assigned to each in attributefks.
index = 367911
attributes = pd.read_csv('../../tables/attribute.csv')
attributeslist = attributes['name_from_naming_authority'].tolist()
attributefks = {}
metaboliteids = metabolomicswb.set_index('kegg_name')['kegg_id'].to_dict()
for metabolite in metabolomicswb['kegg_name'].unique():
    kegg_id = metaboliteids[metabolite]
    attribute_row = (index, metabolite, kegg_id, 'https://www.kegg.jp/entry/'+kegg_id, 72)
    print(attribute_row, end=',\n')
    attributefks[metabolite] = index
    index += 1

### Gene Set

In [None]:
# Print one gene set row per distinct metabolite name, linking it to the
# attribute id recorded in attributefks, and remember the gene set id.
index = 135000000
genesetfks = {}
for metabolite in metabolomicswb['kegg_name'].unique():
    kegg_id = metaboliteids[metabolite]
    gene_set_row = (
        index,
        metabolite,
        kegg_id,
        'https://www.kegg.jp/entry/'+kegg_id,
        149,
        10,
        attributefks[metabolite],
    )
    print(gene_set_row, end=',\n')
    genesetfks[metabolite] = index
    index += 1

### Association

In [None]:
# Translate each (gene symbol, metabolite name) pair into (gene_fk, gene_set_fk)
# with a constant threshold of 1, then write the de-duplicated table.
pair_columns = ['gene_symbol', 'kegg_name']
associations = metabolomicswb.get(pair_columns).copy()
associations['threshold'] = 1
associations['gene_symbol'] = [genefks[symbol] for symbol in associations['gene_symbol']]
associations['kegg_name'] = [genesetfks[name] for name in associations['kegg_name']]
associations = associations.set_axis(['gene_fk', 'gene_set_fk', 'threshold_value'], axis='columns')
associations = associations.drop_duplicates().reset_index(drop=True)
# Shift the row index so the written ids start at 31000000.
associations.index += 31000000
associations.to_csv('../../harmonizome-update/mwmetabolites.csv')
associations

## Create Downloads

In [None]:
# Directory prefix for all download files; later cells append file names to it.
output_path = 'downloads/'
download_columns = ['gene_symbol', 'gene_id', 'kegg_name', 'kegg_id']
metabolomicswb = metabolomicswb.get(download_columns).drop_duplicates()
metabolomicswb

### Gene-Attribute Binary Matrix

In [None]:
# Gene x metabolite incidence matrix.
# crosstab counts rows, so a gene/metabolite pair that appears more than once
# (e.g. the same symbol under different gene IDs) would yield a value above 1.
# Clip to 1 so the matrix is truly binary, as the `== 1` membership tests in the
# GMT cells assume.
binarymatrix = pd.crosstab(
    metabolomicswb['gene_symbol'],
    metabolomicswb['kegg_name'],
).clip(upper=1)
binarymatrixT = binarymatrix.T
binarymatrix.to_csv(output_path+'gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
binarymatrix

### Gene-Attribute Edge List

In [None]:
# Relabel the four download columns positionally and add a constant threshold
# before writing the gzipped edge list.
edge_headers = ['Gene', 'Gene ID', 'Metabolite', 'Metabolite ID']
edgelist = metabolomicswb.set_axis(edge_headers, axis='columns').assign(Threshold=1)
edgelist.to_csv(output_path+'gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Distinct (Gene, Gene ID) pairs from the edge list, renumbered from 0.
gene_columns = ['Gene', 'Gene ID']
geneslist = edgelist[gene_columns].drop_duplicates(ignore_index=True)
geneslist.to_csv(output_path+'gene_list_terms.txt.gz', sep='\t', compression='gzip')
geneslist

### Attribute List

In [None]:
# Distinct (Metabolite, Metabolite ID) pairs from the edge list, renumbered from 0.
metabolite_columns = ['Metabolite', 'Metabolite ID']
attributeslist = edgelist[metabolite_columns].drop_duplicates(ignore_index=True)
attributeslist.to_csv(output_path+'attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributeslist

### Gene Set Library

In [None]:
# Write one tab-separated line per metabolite: the metabolite name followed by
# every gene whose matrix entry equals 1. Sets with fewer than 5 genes are skipped.
with open(output_path+'gene_set_library_crisp.gmt', 'w') as f:
    arr = binarymatrix.reset_index(drop=True).to_numpy(dtype=np.int_)
    attributes = binarymatrix.columns

    w, h = arr.shape  # w = number of rows (genes), h = number of columns (metabolites)
    for col in tqdm(range(h)):
        members = binarymatrix.index[arr[:, col] == 1]
        if len(members) < 5:
            continue
        f.write('\t'.join(map(str, (attributes[col], *members))) + '\n')

### Attribute Set Library

In [None]:
# Write one tab-separated line per gene: the gene symbol followed by every
# metabolite whose matrix entry equals 1. Sets with fewer than 5 metabolites are skipped.
with open(output_path+'attribute_set_library_crisp.gmt', 'w') as f:
    arr = binarymatrixT.reset_index(drop=True).to_numpy(dtype=np.int_)
    genes = binarymatrixT.columns

    w, h = arr.shape  # w = number of rows (metabolites), h = number of columns (genes)
    for col in tqdm(range(h)):
        members = binarymatrixT.index[arr[:, col] == 1]
        if len(members) < 5:
            continue
        f.write('\t'.join(map(str, (genes[col], *members))) + '\n')

### Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between gene rows: pdist yields condensed cosine
# distances, squareform expands them, and 1 - distance converts to similarity.
gene_vectors = binarymatrix.to_numpy(dtype=np.int_)
gene_cosine_distance = dist.squareform(dist.pdist(gene_vectors, 'cosine'))

gene_similarity_matrix = pd.DataFrame(
    data=1 - gene_cosine_distance,
    index=binarymatrix.index,
    columns=binarymatrix.index,
)
# Clear the axis names so they are not written into the output header.
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None
gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between metabolite rows of the transposed matrix:
# condensed cosine distances -> square matrix -> 1 - distance.
attribute_vectors = binarymatrixT.to_numpy(dtype=np.int_)
attribute_cosine_distance = dist.squareform(dist.pdist(attribute_vectors, 'cosine'))

attribute_similarity_matrix = pd.DataFrame(
    data=1 - attribute_cosine_distance,
    index=binarymatrixT.index,
    columns=binarymatrixT.index,
)
# Clear the axis names so they are not written into the output header.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Knowledge Graph Serialization

In [None]:
# Build the in-memory knowledge graph: a dict of nodes keyed by id and a list
# of gene -> metabolite edges.
nodes = {}
edges = []

# Gene nodes, keyed by the integer gene id.
for _, gene in geneslist.iterrows():
    gene_id = int(gene['Gene ID'])
    nodes[gene_id] = {
        "type": "gene",
        "properties": {
            "id": gene_id,
            "label": gene['Gene'],
        },
    }

# Metabolite nodes, keyed by the metabolite id as stored in the table.
for _, metabolite in attributeslist.iterrows():
    metabolite_id = metabolite['Metabolite ID']
    nodes[metabolite_id] = {
        "type": "metabolite",
        "properties": {
            "label": metabolite['Metabolite'],
            "id": metabolite_id,
        },
    }

# One directed edge per row of the edge list, with a fixed threshold of 1.
for _, edge in edgelist.iterrows():
    source_id = int(edge['Gene ID'])
    target_id = edge['Metabolite ID']
    edges.append({
        "source": source_id,
        "relation": "moleculary interacts with",
        "target": target_id,
        "properties": {
            "id": str(edge['Gene ID'])+":"+target_id,
            "source_id": source_id,
            "source_label": edge['Gene'],
            "target_id": target_id,
            "target_label": edge['Metabolite'],
            "directed": True,
            "threshold": 1,
        },
    })

#### RDF

In [None]:
# Write the edges as prefixed triples: three @prefix header lines, a blank
# line, then one "gene:<id> RO:0002248 KEGG:<id> ." line per edge.
# NOTE(review): the prefix lines carry no <...> around the IRI and no trailing
# '.', so the file may not parse as strict Turtle — confirm the intended format.
prefix_lines = [
    '@prefix gene: ncbi.nlm.nih.gov/gene/\n',
    '@prefix RO: purl.obolibrary.org/RO_\n',
    '@prefix KEGG: kegg.jp/entry/\n',
    '\n',
]
with open(output_path+'kg_serializations/mwmetabolites.rdf', 'w') as f:
    f.writelines(prefix_lines)
    for edge in edges:
        props = edge['properties']
        subject = 'gene:'+str(props['source_id'])
        obj = 'KEGG:'+props['target_id']
        f.write(f"{subject} RO:0002248 {obj} .\n")

#### JSON

In [None]:
# Serialize the whole graph (version tag, nodes, edges) as indented JSON.
graph_payload = {
    "Version": "1",
    "nodes": nodes,
    "edges": edges,
}
with open(output_path+'kg_serializations/mwmetabolites.json', 'w') as f:
    # json.dump writes to f and returns None, so `serial` ends up as None.
    serial = json.dump(graph_payload, f, indent=4)

#### TSV

In [None]:
# Flatten the node dict into a (namespace, id, label) table and write it as TSV.
nodeframe = pd.DataFrame(nodes).T
node_properties = nodeframe['properties']
nodeframe['id'] = [props['id'] for props in node_properties]
nodeframe['label'] = [props['label'] for props in node_properties]
namespace_by_type = {'gene':'NCBI Entrez', 'metabolite':'KEGG'}
nodeframe['namespace'] = [namespace_by_type[kind] for kind in nodeframe['type']]
nodeframe = nodeframe[['namespace', 'id', 'label']].reset_index(drop=True)
nodeframe.to_csv(output_path+'kg_serializations/mwmetabolites_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# Flatten the edge list into a (source, relation, target, threshold) table and
# write it as TSV.
edgeframe = pd.DataFrame(edges)
edgeframe['threshold'] = [props['threshold'] for props in edgeframe['properties']]
edgeframe = edgeframe[['source', 'relation', 'target', 'threshold']]
edgeframe.to_csv(output_path+'kg_serializations/mwmetabolites_tsv/edges.tsv', sep='\t')
edgeframe

## Create Visualizations

### Gene-Attribute Clustered Heatmap

In [None]:
# Clustered heatmap of the gene x metabolite matrix.
sns.clustermap(
    binarymatrix,
    cmap='seismic',
    center=0,
    figsize=(25, 25),
)

### Gene-Gene Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the gene-gene cosine similarity matrix.
sns.clustermap(
    gene_similarity_matrix,
    cmap='seismic',
    center=0,
)

### Attribute-Attribute Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the metabolite-metabolite cosine similarity matrix.
sns.clustermap(
    attribute_similarity_matrix,
    cmap='seismic',
    center=0,
)

### UMAP

In [None]:
def load_gmt(file):
    """Parse a GMT-style file into an ordered term -> gene-string mapping.

    Each non-blank line is split on tabs: the first field is the term and the
    remaining fields are its members.

    Args:
        file: An iterable of text lines (e.g. an open file handle).

    Returns:
        OrderedDict mapping each term to its unique members joined by single
        spaces, in order of first appearance on the line. If a term occurs on
        several lines, the last line wins.
    """
    gmt = OrderedDict()
    for line in file:
        line = line.strip()
        # Skip blank lines; otherwise they would create an empty '' term.
        if not line:
            continue
        term, *geneset = line.split('\t')
        # dict.fromkeys de-duplicates while keeping a deterministic order,
        # unlike set(), whose string ordering varies between interpreter runs.
        gmt[term] = ' '.join(dict.fromkeys(geneset))
    return gmt

In [None]:
# Read the crisp gene set library back in. The context manager closes the file
# handle, which a bare open() passed straight to load_gmt never did.
with open(output_path+'/gene_set_library_crisp.gmt', 'r') as gmt_file:
    libdict = load_gmt(gmt_file)
# Directory prefix used when saving the interactive scatter plot.
scatterdir = 'images/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed the entries of ``libdict`` in 2-D with UMAP and label Leiden clusters.

    The values of ``libdict`` are TF-IDF vectorized, wrapped in an AnnData
    object, and passed through scanpy's neighbors, leiden and umap steps.

    Args:
        libdict: Mapping of term -> document string; the values are fed to
            ``TfidfVectorizer.fit_transform`` and the keys become the
            observation index.
        nneighbors: Passed as ``n_neighbors`` to ``sc.pp.neighbors``.
        mindist: Passed as ``min_dist`` to ``sc.tl.umap``.
        spread: Passed as ``spread`` to ``sc.tl.umap``.
        maxdf: Passed as ``max_df`` to ``TfidfVectorizer``.
        mindf: Passed as ``min_df`` to ``TfidfVectorizer``.

    Returns:
        DataFrame with columns ``x`` and ``y`` (UMAP coordinates), ``cluster``
        (``'Cluster <leiden label>'``), ``term`` and ``genes`` (the
        ``libdict`` value for the term), with rows ordered by Leiden label.

    Side effects:
        Prints progress messages and the shape of the TF-IDF matrix.
    """
    print("\tTF-IDF vectorizing gene set data...")
    vec = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    X = vec.fit_transform(libdict.values())
    print(X.shape)
    adata = anndata.AnnData(X)
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    ### the n_neighbors and min_dist parameters can be altered
    sc.pp.neighbors(adata, n_neighbors=nneighbors)
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder observations by cluster label, then prefix the labels for display.
    new_order = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[new_order, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    df = pd.DataFrame(adata.obsm['X_umap'])
    df.columns = ['x', 'y']

    df['cluster'] = adata.obs['leiden'].values
    df['term'] = adata.obs.index
    # Carry the original document string of each term alongside its coordinates.
    df['genes'] = [libdict[l] for l in df['term']]

    return df

In [None]:
def get_scatter_colors(df):
    """Assign a Category20 colour to each distinct value of ``df['cluster']``.

    Clusters are taken in order of first appearance. The palette is walked
    through its even-indexed entries first, then its odd-indexed ones, and
    wraps around after 20 clusters.

    Returns:
        dict mapping cluster label -> colour string.
    """
    palette = list(Category20[20])
    ordered_palette = palette[::2] + palette[1::2]
    color_mapper = {}
    for position, cluster in enumerate(pd.unique(df['cluster']).tolist()):
        color_mapper[cluster] = ordered_palette[position % 20]
    return color_mapper

def get_scatterplot(scatterdf):
    """Build the interactive Bokeh scatter plot of the UMAP embedding.

    Args:
        scatterdf: DataFrame with at least the columns ``x``, ``y``,
            ``cluster`` and ``term``. It is copied, not modified.

    Returns:
        The Bokeh figure, with one point per row coloured by cluster and a
        hover tooltip showing the gene set name, coordinates and cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Every tooltip row sits in its own properly closed <div>; the previous
    # markup left two of the row <div>s unclosed.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000,
        height=700,
        tools=tools_emb
    )

    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # Hide axis ticks and tick labels (UMAP coordinates carry no units).
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt'

    plot_emb.output_backend = "svg"

    plot_emb.title = 'Gene Sets in MW Gene Metabolite Associations Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'

    # The renderer returned by scatter() is not needed afterwards.
    plot_emb.scatter(
        'x',
        'y',
        size = 4,
        source = source,
        color = 'colors'
    )

    return plot_emb

In [None]:
## defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1
# Settings overridden for this dataset; maxdf stays at its default.
umap_settings = dict(nneighbors=15, mindist=0.01, spread=6.5, mindf=2)
scatter_df = process_scatterplot(libdict, **umap_settings)

# Display Scatter Plot
plot = get_scatterplot(scatter_df)
output_notebook()
show(plot)

In [None]:
# Save the interactive plot as a standalone HTML page under scatterdir.
plot_title = 'Gene Sets in MW Gene Metabolite Associations Library'
html_path = f"{scatterdir}/mwmetabolites.html"
output_file(filename=html_path, title=plot_title)
save(plot)