In [23]:
import os
import math
import shutil
import warnings

# Ignore warnings whose message starts with 'Expected '.
warnings.filterwarnings('ignore', 'Expected ')
# NOTE(review): this ignores every warning regardless of category or message,
# so it also covers the narrower filter above -- confirm that blanket
# suppression is really intended for a validation notebook.
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import cupy
# import cudf
# import cuml

from rdkit.Chem import DataStructs
from rdkit.DataManip.Metric import rdMetricMatrixCalc

from nvidia.cheminformatics.utils.distance import tanimoto_calculate
from nvidia.cheminformatics.utils.metrics import batched_silhouette_scores, spearman_rho

## Load Benchmark

In [10]:
# Number of leading rows to keep from each table.
num_molecules = 100

# Benchmark table: first `num_molecules` rows, keyed by `molregno`.
benchmark_df = (
    pd.read_csv('/workspace/nvidia/cheminformatics/data/benchmark_approved_drugs.csv')
    .iloc[:num_molecules]
    .set_index('molregno')
)

# Fingerprint table: same row selection and the same `molregno` index.
fp = (
    pd.read_csv('/workspace/nvidia/cheminformatics/data/fingerprints_approved_drugs.csv')
    .iloc[:num_molecules]
    .set_index('molregno')
)

## Validate GPU Version of Tanimoto Distance -- INCONSISTENT

Use RDKit's version (Rogers Tanimoto) as the benchmark.

In [11]:
def create_bitvect(array):
    """Convert one fingerprint row into an RDKit bit vector.

    Parameters
    ----------
    array : array-like
        Fingerprint values. Each value is cast to ``int`` and then rendered
        as text, so the resulting string has one token per input value.

    Returns
    -------
    The object returned by ``DataStructs.cDataStructs.CreateFromBitString``
    for the concatenated string.
    """
    digits = np.asarray(array).astype(int).astype(str)
    return DataStructs.cDataStructs.CreateFromBitString(''.join(digits))

# One RDKit bit vector per row of the fingerprint table.
fp_bitvect = fp.apply(create_bitvect, axis=1)
n = len(fp_bitvect)

# RDKit reference implementation of the distance matrix:
# https://github.com/rdkit/rdkit-orig/blob/master/Code/DataManip/MetricMatrixCalc/Wrap/rdMetricMatrixCalc.cpp#L169
# https://github.com/rdkit/rdkit-orig/blob/57058c886a49cc597b0c40641a28697ee3a57aee/Code/DataManip/MetricMatrixCalc/MetricFuncs.h#L32
# https://github.com/rdkit/rdkit-orig/blob/57058c886a49cc597b0c40641a28697ee3a57aee/Code/DataStructs/BitOps.h#L29-L67
tanimoto_dist = rdMetricMatrixCalc.GetTanimotoDistMat(fp_bitvect.to_list())

# Scatter the RDKit values into the strict lower triangle of a dense n x n
# matrix; the diagonal and upper triangle stay at zero.
# NOTE(review): assumes RDKit returns the lower triangle flattened in the same
# order that np.tril_indices enumerates it -- TODO confirm.
idx = np.tril_indices(n, k=-1)
cpu_tanimoto_dist_matrix = np.zeros((n, n), dtype=float)
cpu_tanimoto_dist_matrix[idx] = tanimoto_dist

Compare to GPU version

In [21]:
# Distance matrix from the GPU implementation, computed on the same fingerprints.
gpu_tanimoto_dist_matrix = tanimoto_calculate(
    cupy.array(fp.values),
    calc_distance=True,
)

# Spot check: print GPU and CPU distances side by side for the first 11
# lower-triangle (row, col) pairs.
for row, col in zip(idx[0][:11], idx[1][:11]):
    print(gpu_tanimoto_dist_matrix[row, col], cpu_tanimoto_dist_matrix[row, col])

0.8688952 0.8837209302325582
0.7980343 0.85
0.91975254 0.8793103448275862
0.7891815 0.835820895522388
0.91071063 0.8955223880597015
0.6922421 0.7297297297297297
0.6970374 0.803030303030303
0.87041837 0.8805970149253731
0.7239917 0.736842105263158
0.2837614 0.32352941176470584
0.85124236 0.8461538461538461


## Validate Batched Silhouette Score -- OK, BUT SEE NOTE

Note the slight variability based on batch size for the GPU version.

In [45]:
# Cluster the fingerprints into 8 groups. The seed is pinned so that the
# clustering -- and therefore every silhouette score computed from it below --
# is reproducible after a kernel restart instead of changing on each run.
km = KMeans(n_clusters=8, random_state=0).fit(fp)

# Output of `km.transform`: the coordinates passed to both silhouette implementations.
km_coords = km.transform(fp)

# Cluster label assigned to each molecule.
km_clusters = km.predict(fp)

In [46]:
# CPU reference: scikit-learn's silhouette score on the `km.transform` output
# and the predicted cluster labels.
silhouette_score(km_coords, km_clusters)

0.13464282639947014

In [47]:
# GPU implementation on the same inputs with batch_size=100, the same value as
# num_molecules -- presumably everything lands in a single batch (TODO confirm).
batched_silhouette_scores(cupy.array(km_coords), cupy.array(km_clusters), batch_size=100)

0.13464282639947014

In [48]:
# Same GPU call with batch_size=50, to see how the score responds to a smaller batch.
batched_silhouette_scores(cupy.array(km_coords), cupy.array(km_clusters), batch_size=50)

0.12065168973891105

In [49]:
# Same GPU call with batch_size=10, the smallest batch size tried here.
batched_silhouette_scores(cupy.array(km_coords), cupy.array(km_clusters), batch_size=10)

0.014661038373309707

## Validate GPU Version of Spearman's R