In [52]:
import os
import warnings

import cupy
import cudf
from cuml.manifold import UMAP
from cuml.decomposition import PCA
from cuml.cluster import KMeans
from cuml.neighbors import kneighbors_graph
from cugraph import louvain, leiden, Graph

from bokeh.io.export import export_png
from bokeh.plotting import figure
from bokeh.models.tickers import FixedTicker
from bokeh.io import output_notebook, push_notebook, show

# Ignore warnings whose message starts with 'Expected '.
# NOTE(review): the blanket simplefilter('ignore') on the next line ignores
# every warning, which appears to make this pattern filter redundant and also
# hides deprecation notices from the libraries used below -- confirm whether
# suppressing everything is intended.
warnings.filterwarnings('ignore', 'Expected ')
warnings.simplefilter('ignore')

# Configure Bokeh to render figures in the notebook output cells.
output_notebook()

## Plotting Function

In [39]:
# Palette of 24 hex colors; show_cluster_plot indexes into it with
# `cluster % len(COLORS)`, so colors repeat when there are more clusters
# than entries.
COLORS = [
    "#406278", "#e32636", "#9966cc", "#cd9575",
    "#915c83", "#008000", "#ff9966", "#848482",
    "#8a2be2", "#de5d83", "#800020", "#e97451",
    "#5f9ea0", "#36454f", "#008b8b", "#e9692c",
    "#f0b98d", "#ef9708", "#0fcfc0", "#9cded6",
    "#d5eae7", "#f3e1eb", "#f6c4e1", "#f79cd4",
]


def show_cluster_plot(df, title='UMAP'):
    """
    Draw a scatter plot of a 2-D embedding, one color per cluster.

    Parameters
    ----------
    df : DataFrame
        Must have the columns 'x', 'y' and 'cluster'. The 'x'/'y' columns of
        each group are converted with ``to_pandas()`` before plotting, so a
        cudf DataFrame is expected.
    title : str
        Title displayed above the figure.

    The figure is shown in the notebook; nothing is returned.
    """
    umap_fig = figure(title=title, width=800, output_backend="webgl")

    for cluster, dat in df.groupby('cluster'):
        # Wrap around the palette when there are more clusters than colors.
        color = COLORS[cluster % len(COLORS)]

        # `legend_label` always treats the string as literal legend text.
        # The bare `legend` keyword is deprecated in Bokeh and is ambiguous
        # (a string may be interpreted as a data-source column name).
        umap_fig.circle(dat['x'].to_pandas(),
                        dat['y'].to_pandas(),
                        size=2,
                        color=color,
                        alpha=0.5,
                        legend_label='Cluster ' + str(cluster))

    umap_fig.legend.location = 'top_right'
    umap_fig.legend.title = 'Clusters'

    umap_fig_handle = show(umap_fig, notebook_handle=True)
    push_notebook(handle=umap_fig_handle)
    
def standard_scaler(df):
    """
    Standardize each column: subtract the column mean, divide by the column
    standard deviation.

    Parameters
    ----------
    df : array-like or DataFrame
        Any object supporting ``mean(axis=0)``, ``std(axis=0)`` and
        element-wise arithmetic with broadcasting.

    Returns
    -------
    Object of the same kind as ``df`` with per-column z-scores.

    Note: a column with zero standard deviation leads to a division by zero.
    """
    # The mean is the offset and the std is the scale; the previous version
    # had the two swapped.
    return (df - df.mean(axis=0)) / df.std(axis=0)

## Settings

In [119]:
# Tunable parameters used by the pipeline cells below
pca_comps = 64       # n_components passed to PCA
n_clusters = 6       # n_clusters passed to KMeans
n_neighbors = 100    # n_neighbors passed to UMAP and kneighbors_graph
num_mols = 30_000    # number of leading MinHash rows used (mh[:num_mols])

## Load Data

In [120]:
# ECFP fingerprints
# NOTE(review): allow_pickle=True lets the loader unpickle arbitrary objects,
# which can execute code -- only load files from a trusted source.
# NOTE(review): `fp` is not referenced by the cells visible in this section.
fp = cupy.load('./tmp/fp.pkl', allow_pickle=True)

# MinHash version of fingerprints
mh = cupy.load('./tmp/minhash.pkl', allow_pickle=True)

MinHash fingerprints are large integer values -- they should be normalized later.

In [121]:
# Show the shape of the MinHash matrix and the first ten values of its first row
mh.shape, mh[0, :10]

((500000, 128),
 array([ 71131466,  69994144, 305801762, 361969177, 103568912,  43153142,
        122906807, 194869780,  73700223,  20475546]))

## ECFP -> MinHash -> Scale -> PCA -> KMeans + UMAP

Similar to the previous version -- separation of clusters is very poor -- this is consistent with the publication.

In [129]:
# Standardize the first `num_mols` MinHash rows
df_xf = standard_scaler(mh[:num_mols])

# Reduce to `pca_comps` principal components
pca = PCA(n_components=pca_comps)
df_xf = pca.fit_transform(df_xf)

# Embed the PCA output with UMAP; columns 0 and 1 of Xt are plotted below
umap_params = dict(n_neighbors=n_neighbors,
                   a=1.0,
                   b=1.0,
                   learning_rate=1.0)
umap = UMAP(**umap_params)
Xt = umap.fit_transform(df_xf)

# Cluster the PCA output (not the UMAP embedding) with KMeans
kmeans = KMeans(n_clusters=n_clusters)
clusters = kmeans.fit(df_xf).labels_

# One row per molecule: embedding coordinates plus cluster label
plot_df = cudf.DataFrame({
    'x': Xt[:, 0],
    'y': Xt[:, 1],
    'cluster': clusters,
})
show_cluster_plot(plot_df)

## ECFP -> MinHash -> NearestNeighbors -> Louvain + UMAP

It is still unclear how best to use the Louvain output when creating the UMAP embedding for the plot.

In [147]:
# Normalize the first `num_mols` MinHash rows
df_xf = standard_scaler(mh[:num_mols])

# PCA
pca = PCA(n_components=pca_comps)
df_xf = pca.fit_transform(df_xf)

# UMAP embedding of the PCA output; columns 0 and 1 of Xt are plotted below
umap = UMAP(n_neighbors=n_neighbors,
            a=0.5,
            b=1.0,
            learning_rate=1.0)
Xt = umap.fit_transform(df_xf)

# Calculate nearest neighbors graph and then extract indices (row/col) for edge list
df_g = kneighbors_graph(df_xf, n_neighbors=n_neighbors).tocoo()
edge_list = cudf.DataFrame({'row': df_g.row, 'col': df_g.col})

# Create a graph from the edgelist
G = Graph()
G.from_cudf_edgelist(edge_list, 'row', 'col')

# Perform clustering on the graph object
louvain_parts, _ = louvain(G)

# Order the partition labels by vertex id so that position i corresponds to
# row i of df_xf / Xt (assumes vertex ids are the row indices produced by
# kneighbors_graph). sort_values keeps the original, now shuffled, index;
# drop it so the labels line up with the embedding strictly by position when
# the plotting DataFrame is built.
clusters = louvain_parts.sort_values('vertex')['partition'].reset_index(drop=True)

plot_df = cudf.DataFrame({'x': Xt[:, 0], 'y': Xt[:, 1], 'cluster': clusters})
show_cluster_plot(plot_df)

Same Louvain clusters, different UMAP settings

In [161]:
# Re-embed with different UMAP settings. This cell reuses `df_xf` and
# `clusters` from the previous cell, so that cell must be run first.
# NOTE(review): `a` and `b` are given explicitly alongside `min_dist` and
# `spread`; the latter two may then have no effect -- confirm against the
# UMAP documentation.
umap_params = dict(n_neighbors=n_neighbors,
                   min_dist=0.05,
                   spread=1.0,
                   a=1.9,
                   b=1.5,
                   learning_rate=1.0)
umap = UMAP(**umap_params)
Xt = umap.fit_transform(df_xf)

# Same cluster labels as before, new embedding coordinates
plot_df = cudf.DataFrame({
    'x': Xt[:, 0],
    'y': Xt[:, 1],
    'cluster': clusters,
})
show_cluster_plot(plot_df)