In [1]:
import os
import shutil
import tarfile
import warnings
import urllib.request as request
from contextlib import closing

import cupy

import dask_cudf
import dask_ml

from cuml.manifold import UMAP as cuUMAP
from cuml.dask.decomposition import PCA as cuDaskPCA
from cuml.dask.cluster import KMeans as cuDaskKMeans
from cuml.dask.manifold import UMAP as cuDaskUMAP

from dask.distributed import Client, LocalCluster
from dask_cuda import initialize, LocalCUDACluster
from dask_cuda.local_cuda_cluster import cuda_visible_devices

from bokeh.io.export import export_png
from bokeh.plotting import figure
from bokeh.models.tickers import FixedTicker
from bokeh.io import output_notebook, push_notebook, show

from nvidia.cheminformatics.chembldata import ChEmblData

# Silence warnings whose message starts with 'Expected '.
warnings.filterwarnings('ignore', 'Expected ')
# NOTE(review): this blanket filter ignores every warning, which makes the
# targeted filter above redundant -- confirm hiding all warnings is intended.
warnings.simplefilter('ignore')

# Configure Bokeh to render figures inline in the notebook.
output_notebook()

### Functions

In [2]:
def show_cluster_plot(ldcudf, title='UMAP'):
    """
    Draws a scatter plot from the output of UMAP, one glyph series per cluster.

    ldcudf: distributed dataframe providing the 'x', 'y' and 'cluster'
            columns; it is materialised with ``.compute()`` and the resulting
            columns must expose ``values_host``.
    title:  title shown on the figure.

    The figure is displayed in the notebook; nothing is returned.
    Relies on the notebook-level ``COLORS`` palette.
    """
    umap_fig = figure(title=title, width=800)

    # Materialise only the plotted columns, once, instead of issuing one
    # distributed query (with every column) per cluster.
    plot_df = ldcudf[['x', 'y', 'cluster']].compute()
    clusters = plot_df['cluster'].unique().values_host

    for cluster in clusters:
        cdf = plot_df[plot_df['cluster'] == cluster]

        if cdf.shape[0] == 0:
            continue

        umap_fig.circle(cdf['x'].values_host,
                        cdf['y'].values_host,
                        size=2,
                        # Wrap around the palette so that more clusters than
                        # colours no longer raises IndexError.
                        color=COLORS[int(cluster) % len(COLORS)],
                        alpha=0.5,
                        # `legend=` is deprecated in Bokeh; `legend_label=`
                        # is the supported keyword for a fixed legend text.
                        legend_label='Cluster ' + str(cluster))

    umap_fig.legend.location = 'top_right'
    umap_fig.legend.title = 'Clusters'

    umap_fig_handle = show(umap_fig, notebook_handle=True)
    push_notebook(handle=umap_fig_handle)
    

def recluster(tcdf):
    """
    Recluster on a dataset

    Re-runs PCA, KMeans and UMAP on ``tcdf`` and returns the transformed
    dataframe with fresh 'x', 'y', 'cluster' and 'id' columns attached.

    tcdf: distributed dataframe; any of the bookkeeping columns 'x', 'y',
          'cluster' and 'id' left over from an earlier pass are removed
          before the features are processed.

    Relies on the notebook-level ``client``, ``pca_comps``, ``n_clusters``
    and ``n_neighbors``.
    """
    tcdf = tcdf.persist()

    print('Reclustering...')
    # Remove bookkeeping columns from a previous pass. 'id' is included
    # because this function adds it to its own output: without dropping it,
    # reclustering that output would feed row ids into PCA as a feature.
    # Columns that are absent are skipped instead of raising.
    stale_cols = [col for col in ('x', 'y', 'cluster', 'id')
                  if col in tcdf.columns]
    if stale_cols:
        tcdf = tcdf.drop(stale_cols, axis=1)

    print('Reduce dimensions...')
    pca = cuDaskPCA(client=client, n_components=pca_comps)
    tcdf = pca.fit_transform(tcdf)

    print('Computing KMEANS...')
    kmeans_float = cuDaskKMeans(client=client, n_clusters=n_clusters)
    kmeans_float.fit(tcdf)
    kmeans_labels = kmeans_float.predict(tcdf)

    print('Computing UMAP...')
    # The single-GPU model is fitted on the fully materialised data and then
    # wrapped for distributed inference.
    local_model = cuUMAP()
    X_train = tcdf.compute()
    local_model.fit(X_train)

    # NOTE(review): the hyper-parameters below go to the distributed wrapper,
    # while `local_model` was already fitted with defaults -- confirm they
    # actually influence the embedding.
    umap_model = cuDaskUMAP(local_model,
                            n_neighbors=n_neighbors,
                            a=1.0,
                            b=1.0,
                            learning_rate=1.0,
                            client=client)
    Xt = umap_model.transform(tcdf)

    # Add back the columns required for plotting and for correlating data
    # between re-clustering passes
    tcdf['x'] = Xt[0]
    tcdf['y'] = Xt[1]
    tcdf['cluster'] = kmeans_labels
    tcdf['id'] = tcdf.index

    return tcdf

# Download ChEMBL database

In [3]:
data_dir = "/data/db"
db_file = os.path.join(data_dir, 'chembl_27.db')
# Pinned release path: the 'latest' directory moves on with every ChEMBL release.
db_url = ('ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/'
          'chembl_27/chembl_27_sqlite.tar.gz')

if not os.path.exists(db_file):
    print('Downloading ChEMBL db...')

    os.makedirs(data_dir, exist_ok=True)

    # The download is a gzip-compressed tarball, so the SQLite file has to be
    # extracted from it; writing the raw bytes to db_file would store an
    # archive under a '.db' name. The tarball is read as a stream, so it is
    # never kept on disk in full.
    db_name = os.path.basename(db_file)
    partial_file = db_file + '.part'
    extracted = False
    with closing(request.urlopen(db_url)) as r:
        with tarfile.open(fileobj=r, mode='r|gz') as archive:
            for member in archive:
                if member.isfile() and os.path.basename(member.name) == db_name:
                    with open(partial_file, 'wb') as f:
                        shutil.copyfileobj(archive.extractfile(member), f)
                    extracted = True
                    break

    if not extracted:
        raise RuntimeError('%s not found in archive %s' % (db_name, db_url))

    # Rename only after a complete extraction, so an interrupted download is
    # never mistaken for a usable database on the next run.
    os.replace(partial_file, db_file)

    print('Download completed')
else:
    print('Reusing available ChEMBL db at', db_file)

Reusing available ChEMBL db at /data/db/chembl_27.db


In [4]:
# --- UCX transport options forwarded to the local CUDA cluster ---
enable_tcp_over_ucx = True
enable_nvlink = False
enable_infiniband = False

# --- Analysis parameters ---
pca_comps = 64       # number of PCA components kept before clustering
n_clusters = 3       # number of KMeans clusters
n_neighbors = 100    # n_neighbors value handed to the distributed UMAP

# Palette indexed by cluster label when plotting.
COLORS = [
    "#406278", "#e32636", "#9966cc", "#cd9575", "#915c83", "#008000",
    "#ff9966", "#848482", "#8a2be2", "#de5d83", "#800020", "#e97451",
    "#5f9ea0", "#36454f", "#008b8b", "#e9692c", "#f0b98d", "#ef9708",
    "#0fcfc0", "#9cded6", "#d5eae7", "#f3e1eb", "#f6c4e1", "#f79cd4",
]

# Device ids the cluster may use; edit this list to add or remove devices,
# e.g. CUDA_VISIBLE_DEVICES = [0]
CUDA_VISIBLE_DEVICES = cuda_visible_devices(0).split(',')

# Start the local multi-GPU cluster over UCX and connect a client to it.
cluster = LocalCUDACluster(
    protocol="ucx",
    dashboard_address=':9001',
    CUDA_VISIBLE_DEVICES=CUDA_VISIBLE_DEVICES,
    enable_tcp_over_ucx=enable_tcp_over_ucx,
    enable_nvlink=enable_nvlink,
    enable_infiniband=enable_infiniband,
)
client = Client(cluster)
client

0,1
Client  Scheduler: ucx://127.0.0.1:57469  Dashboard: http://127.0.0.1:9001/status,Cluster  Workers: 8  Cores: 8  Memory: 540.95 GB


# Generate fingerprints from ChEMBL

The 4 in ECFP4 corresponds to the diameter of the atom environments considered, while the Morgan fingerprints take a radius parameter. So a Morgan fingerprint with radius=2 is roughly equivalent to ECFP4 and FCFP4.

In [5]:
%%time
# Wrap the ChEMBL database file located at db_file.
chem_data = ChEmblData(db_file=db_file)

# Fetch properties and generate fingerprints (num_recs=100000).
# Presumably returns a lazy Dask dataframe, since the result is later passed
# to dask_cudf.from_dask_dataframe -- confirm against ChEmblData.
ddf = chem_data.fetch_all_props(num_recs=100000)

CPU times: user 80 ms, sys: 0 ns, total: 80 ms
Wall time: 78.8 ms


# Clustering

In [6]:
%%time
# Convert the Dask dataframe to dask_cudf and persist it on the cluster.
dcudf = dask_cudf.from_dask_dataframe(ddf).persist()

# Reduce the features to `pca_comps` principal components.
pca = cuDaskPCA(n_components=pca_comps, client=client)
dcudf = pca.fit_transform(dcudf)

# Fit KMeans on the reduced data, then label every row with its cluster.
kmeans_float = cuDaskKMeans(n_clusters=n_clusters, client=client)
kmeans_float.fit(dcudf)
kmeans_labels = kmeans_float.predict(dcudf)

CPU times: user 2.78 s, sys: 788 ms, total: 3.57 s
Wall time: 23.7 s


In [7]:
%%time
# Fit a single-GPU UMAP model on the fully materialised dataframe.
local_model = cuUMAP()
X_train = dcudf.compute()
local_model.fit(X_train)

# Wrap the fitted model so the transform runs across the cluster.
# NOTE(review): the hyper-parameters below go to the distributed wrapper,
# while `local_model` was already fitted with defaults -- confirm they
# actually influence the embedding.
umap_model = cuDaskUMAP(local_model,
                        n_neighbors=n_neighbors,
                        a=1.0,
                        b=1.0,
                        learning_rate=1.0,
                        client=client)
Xt = umap_model.transform(dcudf)

# Attach the two embedding columns and the KMeans labels; show_cluster_plot
# reads the 'x', 'y' and 'cluster' columns.
dcudf['x'] = Xt[0]
dcudf['y'] = Xt[1]
dcudf['cluster'] = kmeans_labels

dcudf.head(5)

CPU times: user 1.1 s, sys: 740 ms, total: 1.84 s
Wall time: 2.4 s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,57,58,59,60,61,62,63,x,y,cluster
0,-0.865112,-0.292498,0.786793,-0.095167,0.796131,0.656851,-0.295272,-0.621785,0.742622,0.438077,...,0.028807,-1.098561,-0.171039,0.309716,0.37655,-0.184357,-0.118622,1.419051,-2.540216,0
1,-1.20402,0.119179,-0.013176,0.055461,0.797991,0.426472,-0.399391,-0.880223,0.117578,-0.019923,...,-0.13571,-0.742768,0.154385,0.313549,-0.162728,-0.287286,-0.228559,1.206517,-2.464119,0
2,-0.850133,-0.352995,-0.017706,-0.182487,1.49636,-0.199621,0.545039,-0.290447,0.719919,0.412478,...,-0.181031,-0.345724,-0.168323,0.314587,-0.036333,-0.189226,0.078509,1.437063,-2.781623,0
3,-1.142299,-0.016498,-0.014625,-0.310365,0.534084,0.251172,-0.598171,-0.777275,0.082535,-0.105282,...,-0.202103,-0.818697,0.5553,-0.027742,-0.068905,-0.366029,-0.147182,1.215112,-2.315584,0
4,-1.13396,-0.142532,0.030312,-0.095378,1.681553,0.710912,-0.142615,-0.48475,1.036489,0.604442,...,-0.309506,-0.765516,-0.198503,0.188181,0.08133,-0.270295,0.011964,1.355321,-2.713211,0


In [8]:
%%time
# Plot the UMAP embedding of the full dataset, coloured by KMeans cluster.
show_cluster_plot(dcudf)

CPU times: user 1.69 s, sys: 308 ms, total: 2 s
Wall time: 3.83 s


# Re-clustering

In [11]:
%%time
# Cluster to zoom into. A dedicated name is used so that the notebook-level
# `cluster` variable holding the LocalCUDACluster handle is not overwritten.
cluster_id = 0                                      # <- PLEASE CHANGE CLUSTER AS REQUIRED

# Filter data before reclustering
query = 'cluster == ' + str(cluster_id)
tcdf = dcudf.query(query)

tcdf = recluster(tcdf)
print('Plotting UMAP...')
show_cluster_plot(tcdf, title='UMAP - cluster ' + str(cluster_id))

Reclustering...
Reduce dimensions...
Computing KMEANS...
Computing UMAP...
Plotting UMAP...


CPU times: user 1.79 s, sys: 624 ms, total: 2.42 s
Wall time: 4.48 s
