# Urine metabolome visualization

**Michiel Stock**

*Friday 21 July 2017*

Visualization of the different metabolites found in urine.

In [93]:
import pandas as pd
import json
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from collections import Counter
sns.set_style('white')

In [94]:
# One-character matplotlib marker codes, kept in a fixed order so they can be
# iterated over or looked up by integer position when plotting groups.
filled_markers = tuple('ov^<>8sp*hHDdPX')

In [95]:
# Load the binary fingerprint matrix: one row per metabolite, one column per
# fingerprint bit. `DataFrame.from_csv` was deprecated in pandas 0.21 and
# removed in 1.0; `read_csv` with `index_col=0` is the supported replacement.
# The first column holds metabolite names (used later as dictionary keys), so
# date parsing of the index is deliberately not requested.
fingerprints = pd.read_csv('../Data/metabolite_fingerprints.csv', index_col=0)
fingerprints.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
(+)-(S)-Carvone,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,1,1,0
(+)-4-Carene,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0
(-)-Epicatechin 3'-O-glucuronide,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,1,1,0
(-)-Epicatechin 7-O-glucuronide,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,1,1,0
(-)-Epicatechin sulfate,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,1,1,0


In [96]:
# Rows are molecules, columns are fingerprint bits.
n_molecules = len(fingerprints.index)
n_fingerprints = len(fingerprints.columns)

In [None]:
TSNE?

In [None]:
# Compare t-SNE embeddings of the fingerprints over a range of perplexities,
# drawing one stacked panel per setting.
perplexities = [2, 5, 30, 50, 100]
fig, axes = plt.subplots(nrows=len(perplexities), figsize=(7, 27))

for ax, perplexity in zip(axes, perplexities):
    tsne = TSNE(perplexity=perplexity)
    X = tsne.fit_transform(fingerprints)
    ax.scatter(X[:, 0], X[:, 1])
    ax.set_title('perplexity = {}'.format(perplexity))

In [None]:
# Embedding used by every later plot (perplexity 50).
tsne = TSNE(perplexity=50)
X = tsne.fit_transform(fingerprints)

# Persist the coordinates, labelled with the same index as the fingerprints.
projection = pd.DataFrame(data=X, index=fingerprints.index)
projection.to_csv('../Data/tsne_projection.csv')

In [None]:
# Quick look at the raw embedding: first column on x, second on y.
plt.scatter(x=X[:, 0], y=X[:, 1])

In [None]:
# Load the metabolite annotations. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('../Data/urine_metabolome.json', 'r') as handle:
    urine_metabolome = json.load(handle)

In [None]:
# Class label per metabolite, in fingerprint row order; entries without a
# 'class' field fall back to 'Varia'.
metabolite_classes = [urine_metabolome[met].get('class', 'Varia')
                      for met in fingerprints.index]

In [None]:
# Number of metabolites per class label.
c = Counter(metabolite_classes)

In [None]:
# Keep the 11 most frequent classes as (name, count) pairs, then order them by
# name (tuples compare on their first element first).
common_classes = c.most_common(11)
common_classes.sort()
common_classes

In [None]:
# use only common classes: every label outside the common set becomes 'Varia'.
# The lookup set is built once instead of once per metabolite.
common_class_names = {cc for cc, _ in common_classes}
metabolite_classes_subset = [cl if cl in common_class_names else 'Varia'
                             for cl in metabolite_classes]

In [None]:
colors = sns.color_palette("Set2", 11)
# put gray last: take out the fourth-from-last entry and append it at the end
gray = colors.pop(-4)
colors.append(gray)
sns.palplot(colors)

In [None]:
# t-SNE embedding with one colour/marker combination per common class.
fig, ax = plt.subplots(figsize=(9, 6))

for col, marker, (cl, _) in zip(colors, filled_markers, common_classes):
    mask = np.array([cl_met == cl for cl_met in metabolite_classes_subset], dtype=bool)
    # `color=` rather than `c=`: a bare RGB tuple passed as `c` is ambiguous
    # and is read as values to colormap when the class has 3 or 4 points.
    ax.scatter(X[mask, 0], X[mask, 1], marker=marker, color=col, label=cl,
               alpha=0.95 if cl != 'Varia' else 0.5)

# Legend below the axes, in two columns; ticks carry no meaning for t-SNE.
ax.legend(loc=9, bbox_to_anchor=(0.5, -0.1), ncol=2)
ax.set_yticks([])
ax.set_xticks([])
fig.savefig('../Figures/metabolites_tsne.pdf')

In [None]:
from sklearn.cluster import KMeans

# Number of clusters; the palette is sized to match.
n_clusters = 12
colors_cluster = sns.color_palette("Set1", n_clusters)

# Cluster on the raw fingerprint matrix (not on the 2-D embedding).
kmeans = KMeans(n_clusters=n_clusters)
cluster_ids = kmeans.fit_predict(fingerprints.values)

In [None]:
fig, ax = plt.subplots(figsize=(9, 6))
# White line through all points; presumably drawn only so the axes span the
# data before the text labels are placed.
ax.plot(X[:, 0], X[:, 1], color='w')

# Write each molecule's cluster id at its embedded position, coloured by cluster.
for (x0, x1), cl in zip(X, cluster_ids):
    ax.text(x0, x1, str(cl), color=colors_cluster[cl])

fig.savefig('../Figures/metabolites_clustering.pdf')

In [None]:
from random import randint

fig, ax = plt.subplots(figsize=(9, 6))
ax.scatter(X[:, 0], X[:, 1], color='gray', alpha=0.3)

# Draw random molecules until every cluster has exactly one highlighted,
# named example; draws from already-covered clusters are discarded.
clusters_to_pick = set(cluster_ids)

while clusters_to_pick:
    i = randint(0, n_molecules - 1)
    cl_ind = cluster_ids[i]
    if cl_ind not in clusters_to_pick:
        continue
    clusters_to_pick.discard(cl_ind)
    ax.scatter(X[i, 0], X[i, 1], marker=filled_markers[cl_ind],
               color=colors_cluster[cl_ind], label=fingerprints.index[i])

ax.legend(loc=9, bbox_to_anchor=(0.5, -0.1), ncol=2)
fig.savefig('../Figures/metabolites_cluster_examples.pdf')

In [None]:
# Flatten the optional per-metabolite annotation lists into three flat lists
# (duplicates are kept so they can be counted afterwards).
diseases, biofunctions, pathways = [], [], []

for metabolite in urine_metabolome.values():
    diseases.extend(metabolite.get('diseases', []))
    biofunctions.extend(metabolite.get('biofunctions', []))
    pathways.extend(metabolite.get('pathways', []))

In [None]:
# Occurrences of each disease over all metabolites; show the nine most frequent.
diseases_counter = Counter(diseases)
diseases_counter.most_common(9)

In [None]:
# Occurrences of each biofunction over all metabolites; show the nine most frequent.
biofunctions_counter = Counter(biofunctions)
biofunctions_counter.most_common(9)

In [None]:
# Occurrences of each pathway over all metabolites; show the nine most frequent.
pathways_counter = Counter(pathways)
pathways_counter.most_common(9)

In [None]:
# Shared size for the 3x3 annotation figures.
figsize = (12, 12)

# Nine colours, one per panel of those figures.
colors = sns.color_palette("Set1", 9)
sns.palplot(colors)

In [None]:
# One panel per top-9 disease: all metabolites in gray, with the metabolites
# annotated with that disease highlighted in the panel's colour.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=figsize)

for ax, col, (dis, _) in zip(axes.ravel(), colors, diseases_counter.most_common(n=9)):
    ax.set_title(dis)
    ax.scatter(X[:, 0], X[:, 1], alpha=0.2, color='gray')
    # Boolean row mask, so the highlighted points are drawn with one scatter
    # call instead of one call per molecule.
    mask = np.array([dis in urine_metabolome[met_name].get('diseases', ())
                     for met_name in fingerprints.index], dtype=bool)
    ax.scatter(X[mask, 0], X[mask, 1], color=col)
    # Ticks are cleared once per panel (previously once per molecule).
    ax.set_xticks([])
    ax.set_yticks([])

fig.savefig('../Figures/diseases.pdf')

In [None]:
# One panel per top-9 biofunction: all metabolites in gray, with the
# metabolites annotated with that biofunction highlighted in the panel's colour.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=figsize)

for ax, col, (biofun, _) in zip(axes.ravel(), colors, biofunctions_counter.most_common(n=9)):
    ax.set_title(biofun)
    ax.scatter(X[:, 0], X[:, 1], alpha=0.2, color='gray')
    # Boolean row mask, so the highlighted points are drawn with one scatter
    # call instead of one call per molecule.
    mask = np.array([biofun in urine_metabolome[met_name].get('biofunctions', ())
                     for met_name in fingerprints.index], dtype=bool)
    ax.scatter(X[mask, 0], X[mask, 1], color=col)
    # Ticks are cleared once per panel (previously once per molecule).
    ax.set_xticks([])
    ax.set_yticks([])

fig.savefig('../Figures/biofunction.pdf')

In [None]:
# One panel per top-9 pathway: all metabolites in gray, with the metabolites
# annotated with that pathway highlighted in the panel's colour.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=figsize)

for ax, col, (pathw, _) in zip(axes.ravel(), colors, pathways_counter.most_common(n=9)):
    ax.set_title(pathw)
    ax.scatter(X[:, 0], X[:, 1], alpha=0.2, color='gray')
    # Boolean row mask, so the highlighted points are drawn with one scatter
    # call instead of one call per molecule.
    mask = np.array([pathw in urine_metabolome[met_name].get('pathways', ())
                     for met_name in fingerprints.index], dtype=bool)
    ax.scatter(X[mask, 0], X[mask, 1], color=col)
    # Ticks are cleared once per panel (previously once per molecule).
    ax.set_xticks([])
    ax.set_yticks([])

fig.savefig('../Figures/pathways.pdf')

In [None]:
# Histogram of the average molecular weight recorded for each metabolite.
molecular_weigths = [urine_metabolome[name]["average_molecular_weight"]
                     for name in urine_metabolome]
plt.hist(molecular_weigths)