In [1]:
import os
import math
import shutil
import warnings
import importlib
import urllib.request as request
from contextlib import closing
from numba import cuda

import cupy
import cudf
import cuml

import dask_cudf
import dask_ml
import dask

from cuml.manifold import UMAP as cuUMAP
from cuml.dask.decomposition import PCA as cuDaskPCA
from cuml.dask.cluster import KMeans as cuDaskKMeans
from cuml.dask.manifold import UMAP as cuDaskUMAP

from dask.distributed import Client, LocalCluster
from dask_cuda import initialize, LocalCUDACluster
from dask_cuda.local_cuda_cluster import cuda_visible_devices

from bokeh.io.export import export_png
from bokeh.plotting import figure
from bokeh.models.tickers import FixedTicker
from bokeh.io import output_notebook, push_notebook, show

import nvidia.cheminformatics.chembldata as chembldata

warnings.filterwarnings('ignore', 'Expected ')
warnings.simplefilter('ignore')

output_notebook()

### Configurations and settings

In [2]:
# Please add or remove device ids that can be used
# CUDA_VISIBLE_DEVICES=[0]
CUDA_VISIBLE_DEVICES = cuda_visible_devices(0).split(',')

pca_comps = 64
n_clusters = 6
n_neighbors=100
num_mols=10000

enable_tcp_over_ucx = True
enable_nvlink = False
enable_infiniband = False

COLORS = ["#406278", "#e32636", "#9966cc", "#cd9575", "#915c83", "#008000",
          "#ff9966", "#848482", "#8a2be2", "#de5d83", "#800020", "#e97451",
          "#5f9ea0", "#36454f", "#008b8b", "#e9692c", "#f0b98d", "#ef9708",
          "#0fcfc0", "#9cded6", "#d5eae7", "#f3e1eb", "#f6c4e1", "#f79cd4"]
FINGER_PRINT_FILES = 'filter_*.h5'

### Functions

In [3]:
def show_cluster_plot(ldcudf, title='UMAP'):
    """
    Draws a scatter plots from output of UMAP.
    """
    umap_fig = figure(title=title, width=800, output_backend="webgl")
    clusters = ldcudf['cluster'].unique().values_host

    for cluster in clusters:
        query = 'cluster == %s' % (cluster)

        cdf = ldcudf.query(query)

        if cdf.shape[0] == 0:
            continue

        x_array = cupy.fromDlpack(cdf['x'].to_dlpack())
        y_array = cupy.fromDlpack(cdf['y'].to_dlpack())

        umap_fig.circle(x_array.get(),
                        y_array.get(),
                        size=2,
                        color=COLORS[cluster],
                        alpha=0.5, legend = 'Cluster ' + str(cluster))

    umap_fig.legend.location = 'top_right'
    umap_fig.legend.title = 'Clusters'
    
    umap_fig_handle = show(umap_fig, notebook_handle=True)
    push_notebook(handle=umap_fig_handle)

# Download ChEMBL database

In [4]:
data_dir = "/data/db"
db_file = os.path.join(data_dir, 'chembl_27.db')

if not os.path.exists(db_file):
    print('Downloading ChEMBL db...')

    os.makedirs(data_dir, exist_ok=True)
    with closing(request.urlopen('ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/chembl_27_sqlite.tar.gz')) as r:
        with open(db_file, 'wb') as f:
            shutil.copyfileobj(r, f)

    print('Download completed')
else:
    print('Reusing available ChEMBL db at', db_file)

Reusing available ChEMBL db at /data/db/chembl_27.db


In [5]:
cluster = LocalCUDACluster(protocol="ucx",
                           dashboard_address=':9001',
                           # TODO: automate visible device list
                           CUDA_VISIBLE_DEVICES=CUDA_VISIBLE_DEVICES,
                           enable_tcp_over_ucx=enable_tcp_over_ucx,
                           enable_nvlink=enable_nvlink,
                           enable_infiniband=enable_infiniband)
client = Client(cluster)
n_workers = len(client.scheduler_info()['workers'].keys())
client

0,1
Client  Scheduler: ucx://127.0.0.1:54813  Dashboard: http://127.0.0.1:9001/status,Cluster  Workers: 3  Cores: 3  Memory: 33.58 GB


# Generate fingerprint from ChEMBL

The 4 in ECFP4 corresponds to the diameter of the atom environments considered, while the Morgan fingerprints take a radius parameter. So a Morgan fingerprint with radius=2 is roughly equivalent to ECFP4 and FCFP4.

In [6]:
%%time
import nvidia.cheminformatics.chembldata
importlib.reload(nvidia.cheminformatics.chembldata)

# cache_directory = '/data/fp'
cache_directory = None
from nvidia.cheminformatics.fingerprint import MorganFingerprint

if cache_directory is None:
    chem_data = chembldata.ChEmblData(db_file=db_file, fp_type=MorganFingerprint)
    ddf = chem_data.fetch_all_props(num_recs=num_mols)
else:
    hdf_path = os.path.join(cache_directory, FINGER_PRINT_FILES)
    ddf = dask.dataframe.read_hdf(hdf_path, 'fingerprints')

    if num_mols > 0:
        ddf = ddf.head(num_mols, compute=False, npartitions=-1)

dcudf = dask_cudf.from_dask_dataframe(ddf)
dcudf = dcudf.persist()
df = dcudf.compute()

CPU times: user 2.02 s, sys: 432 ms, total: 2.45 s
Wall time: 10.6 s


In [7]:
# Compute Tanimoto similarities
fp = cupy.fromDlpack(df.to_dlpack())

In [8]:
%%time

@cuda.jit
def compute_norms(data, norms):
    i = cuda.grid(1)
    norms[i] = len(data[i])
    for j in range(len(data[i])):
        if data[i][j] != 0:
            data[i][j] = j + 1
            norms[i] = norms[i] + ((j + 1)**2)
    
    if norms[i] != 0:
        norms[i] = math.sqrt(norms[i])

@cuda.jit
def tanimotoSimilarity(data, norms, sim_array):
    x = cuda.grid(1)
    rows = len(data)
    
    i = x // rows
    j = x % rows
    
    a = data[i]
    b = data[j]
    
    prod = 0
    for k in range(len(data[i])):
        prod = prod + (a[k] * b[k])
        
    a_norm = norms[i]
    b_norm = norms[j]
    
    sim_array[i][j] = prod / ((a_norm**2 + b_norm**2) - prod)
    

norms = cupy.zeros(fp.shape[0])
compute_norms.forall(norms.shape[0], 1)(fp, norms)

sim_array = cupy.zeros((fp.shape[0], fp.shape[0]), cupy.float32)
tanimotoSimilarity.forall(fp.shape[0] * fp.shape[0], 1)(fp, norms, sim_array)

sim_array.shape

CPU times: user 758 ms, sys: 40.9 ms, total: 799 ms
Wall time: 766 ms


(10000, 10000)

In [9]:
# Reduce
df2 = cudf.from_dlpack(sim_array.toDlpack())
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0.999691,0.619004,0.441258,0.467084,0.687903,0.654308,0.774606,0.728659,0.566314,0.076509,...,0.247997,0.166112,0.206059,0.131143,0.091207,0.095801,0.069306,0.087932,0.087932,0.129583
1,0.619004,0.441258,0.467084,0.687903,0.654308,0.774606,0.728659,0.566314,0.076509,0.568478,...,0.166112,0.206059,0.131143,0.091207,0.095801,0.069306,0.087932,0.087932,0.129583,0.619004
2,0.441258,0.467084,0.687903,0.654308,0.774606,0.728659,0.566314,0.076509,0.568478,0.633135,...,0.206059,0.131143,0.091207,0.095801,0.069306,0.087932,0.087932,0.129583,0.619004,0.999706
3,0.467084,0.687903,0.654308,0.774606,0.728659,0.566314,0.076509,0.568478,0.633135,0.081918,...,0.131143,0.091207,0.095801,0.069306,0.087932,0.087932,0.129583,0.619004,0.999706,0.418302
4,0.687903,0.654308,0.774606,0.728659,0.566314,0.076509,0.568478,0.633135,0.081918,0.09657,...,0.091207,0.095801,0.069306,0.087932,0.087932,0.129583,0.619004,0.999706,0.418302,0.51762


In [10]:
%%time

# pca = cuml.PCA(n_components=pca_comps)
# df2 = pca.fit_transform(df2)
# df2.head()

CPU times: user 4.59 s, sys: 1.28 s, total: 5.87 s
Wall time: 5.82 s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,-0.637827,-0.010745,0.022649,0.557703,0.078834,0.516488,0.213425,0.294054,0.10552,0.071255,...,-0.302996,-0.143127,-0.173242,-0.19263,-0.119048,0.026319,0.114258,-0.177409,-0.035944,-0.230596
1,-0.631638,-0.012372,0.019212,0.552819,0.076975,0.51294,0.21985,0.290188,0.104151,0.067413,...,-0.300189,-0.161526,-0.155773,-0.177257,-0.139263,0.021728,0.070222,-0.197549,-0.026235,-0.22764
2,-0.634146,-0.013763,0.01497,0.556711,0.072457,0.52064,0.220989,0.291998,0.105123,0.061807,...,-0.307581,-0.182066,-0.12594,-0.152298,-0.166136,0.026087,0.012169,-0.20822,-0.013622,-0.233871
3,-0.633286,-0.015462,0.011287,0.556385,0.069396,0.521307,0.224372,0.290574,0.105143,0.057778,...,-0.308104,-0.195947,-0.097556,-0.12754,-0.18466,0.026896,-0.040126,-0.20412,-0.004806,-0.234138
4,-0.633012,-0.017112,0.007509,0.556802,0.066073,0.523303,0.227346,0.289741,0.105345,0.053488,...,-0.30843,-0.206444,-0.067071,-0.099981,-0.200632,0.031498,-0.090869,-0.187702,0.004685,-0.234825


# Clustering

In [11]:
%%time

kmeans_float = cuml.KMeans(n_clusters=n_clusters)
kmeans_float.fit(df2)

CPU times: user 31.5 ms, sys: 0 ns, total: 31.5 ms
Wall time: 28.7 ms


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,-0.637827,-0.010745,0.022649,0.557703,0.078834,0.516488,0.213425,0.294054,0.10552,0.071255,...,-0.302996,-0.143127,-0.173242,-0.19263,-0.119048,0.026319,0.114258,-0.177409,-0.035944,-0.230596
1,-0.631638,-0.012372,0.019212,0.552819,0.076975,0.51294,0.21985,0.290188,0.104151,0.067413,...,-0.300189,-0.161526,-0.155773,-0.177257,-0.139263,0.021728,0.070222,-0.197549,-0.026235,-0.22764
2,-0.634146,-0.013763,0.01497,0.556711,0.072457,0.52064,0.220989,0.291998,0.105123,0.061807,...,-0.307581,-0.182066,-0.12594,-0.152298,-0.166136,0.026087,0.012169,-0.20822,-0.013622,-0.233871
3,-0.633286,-0.015462,0.011287,0.556385,0.069396,0.521307,0.224372,0.290574,0.105143,0.057778,...,-0.308104,-0.195947,-0.097556,-0.12754,-0.18466,0.026896,-0.040126,-0.20412,-0.004806,-0.234138
4,-0.633012,-0.017112,0.007509,0.556802,0.066073,0.523303,0.227346,0.289741,0.105345,0.053488,...,-0.30843,-0.206444,-0.067071,-0.099981,-0.200632,0.031498,-0.090869,-0.187702,0.004685,-0.234825


In [12]:
%%time

umap = cuml.UMAP(n_neighbors=100,
                    a=1.0,
                    b=1.0,
                    learning_rate=1.0)
Xt = umap.fit_transform(df2)

df2['x'] = Xt[0]
df2['y'] = Xt[1]
df2['cluster'] = kmeans_float.labels_

df2.head(5)

CPU times: user 590 ms, sys: 175 ms, total: 765 ms
Wall time: 806 ms


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,57,58,59,60,61,62,63,x,y,cluster
0,-0.637827,-0.010745,0.022649,0.557703,0.078834,0.516488,0.213425,0.294054,0.10552,0.071255,...,-0.19263,-0.119048,0.026319,0.114258,-0.177409,-0.035944,-0.230596,-9.921024,-17.947638,0
1,-0.631638,-0.012372,0.019212,0.552819,0.076975,0.51294,0.21985,0.290188,0.104151,0.067413,...,-0.177257,-0.139263,0.021728,0.070222,-0.197549,-0.026235,-0.22764,-9.840667,-17.988644,0
2,-0.634146,-0.013763,0.01497,0.556711,0.072457,0.52064,0.220989,0.291998,0.105123,0.061807,...,-0.152298,-0.166136,0.026087,0.012169,-0.20822,-0.013622,-0.233871,-9.809313,-18.002487,0
3,-0.633286,-0.015462,0.011287,0.556385,0.069396,0.521307,0.224372,0.290574,0.105143,0.057778,...,-0.12754,-0.18466,0.026896,-0.040126,-0.20412,-0.004806,-0.234138,-9.729506,-18.040386,0
4,-0.633012,-0.017112,0.007509,0.556802,0.066073,0.523303,0.227346,0.289741,0.105345,0.053488,...,-0.099981,-0.200632,0.031498,-0.090869,-0.187702,0.004685,-0.234825,-9.764634,-18.022793,5


In [13]:
%%time
show_cluster_plot(df2)

CPU times: user 982 ms, sys: 8.44 ms, total: 990 ms
Wall time: 944 ms
