In [53]:
import os
import math
import shutil
import warnings

warnings.filterwarnings('ignore', 'Expected ')
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, euclidean_distances
from scipy.stats import spearmanr as spearman_rho_cpu

import cupy
# import cudf
# import cuml

from rdkit.Chem import DataStructs
from rdkit.DataManip.Metric import rdMetricMatrixCalc

from nvidia.cheminformatics.utils.distance import tanimoto_calculate
from nvidia.cheminformatics.utils.metrics import batched_silhouette_scores, spearman_rho

## Load Benchmark

In [10]:
num_molecules = 100

benchmark_df = pd.read_csv('/workspace/nvidia/cheminformatics/data/benchmark_approved_drugs.csv')
fp = pd.read_csv('/workspace/nvidia/cheminformatics/data/fingerprints_approved_drugs.csv')

benchmark_df = benchmark_df.iloc[:num_molecules]
fp = fp.iloc[:num_molecules]

benchmark_df.set_index('molregno', inplace=True)
fp.set_index('molregno', inplace=True)

## Validate GPU Version of Tanimoto Distance -- INCONSISTENT

Use RDKit's version (Rogers Tanimoto as the benchmark)

In [11]:
def create_bitvect(array):
    array = np.asarray(array).astype(int)
    bitstring = ''.join(array.astype(str))
    return DataStructs.cDataStructs.CreateFromBitString(bitstring)

fp_bitvect = fp.apply(create_bitvect, axis=1)

# https://github.com/rdkit/rdkit-orig/blob/master/Code/DataManip/MetricMatrixCalc/Wrap/rdMetricMatrixCalc.cpp#L169
# https://github.com/rdkit/rdkit-orig/blob/57058c886a49cc597b0c40641a28697ee3a57aee/Code/DataManip/MetricMatrixCalc/MetricFuncs.h#L32
# https://github.com/rdkit/rdkit-orig/blob/57058c886a49cc597b0c40641a28697ee3a57aee/Code/DataStructs/BitOps.h#L29-L67
tanimoto_dist = rdMetricMatrixCalc.GetTanimotoDistMat(fp_bitvect.to_list())

n = len(fp_bitvect)
idx = np.tril_indices(n, k=-1)
cpu_tanimoto_dist_matrix = np.zeros((n,n)).astype(float)
cpu_tanimoto_dist_matrix[idx] = tanimoto_dist

Compare to GPU version

In [21]:
gpu_tanimoto_dist_matrix = tanimoto_calculate(cupy.array(fp.values), calc_distance=True)

for ct,i in enumerate(zip(*idx)):
    if ct > 10:
        break
    print(gpu_tanimoto_dist_matrix[i], cpu_tanimoto_dist_matrix[i])

0.8688952 0.8837209302325582
0.7980343 0.85
0.91975254 0.8793103448275862
0.7891815 0.835820895522388
0.91071063 0.8955223880597015
0.6922421 0.7297297297297297
0.6970374 0.803030303030303
0.87041837 0.8805970149253731
0.7239917 0.736842105263158
0.2837614 0.32352941176470584
0.85124236 0.8461538461538461


## Validate Batched Silhouette Score -- OK, BUT SEE NOTE

Note sight variability based on batch size for the GPU version..

In [45]:
km = KMeans(n_clusters=8).fit(fp)
km_coords = km.transform(fp)
km_clusters = km.predict(fp)

In [46]:
silhouette_score(km_coords, km_clusters)

0.13464282639947014

In [47]:
batched_silhouette_scores(cupy.array(km_coords), cupy.array(km_clusters), batch_size=100)

0.13464282639947014

In [48]:
batched_silhouette_scores(cupy.array(km_coords), cupy.array(km_clusters), batch_size=50)

0.12065168973891105

In [49]:
batched_silhouette_scores(cupy.array(km_coords), cupy.array(km_clusters), batch_size=10)

0.014661038373309707

## Validate GPU Version of Spearman's R

In [87]:
from scipy.stats import rankdata

In [304]:
def rankdata_gpu(array, tol=1.0e-5):
    # assert dim = 1
    idx = cupy.argsort(array)
    
    #sorted_array = array[idx]
    #pairwise_diff = cupy.abs(sorted_array[:-1] - sorted_array[1:])
    #repeated_bool = pairwise_diff <= tol
    #repeated_values = cupy.unique(sorted_array[:-1][similar_bool]))
    return idx + 1.0


def spearman_rho(data_matrix1, data_matrix2, top_k=0):
    """Calculate spearman's Rho, ranked correlation coefficient

    Args:
        data_matrix1 (2D array or dataframe): matrix with samples as rows, the reference matrix
        data_matrix2 (2D array or dataframe): matrix with samples as rows

    Returns:
        matrix: ranked correlation coeffcients for data
    """
    
    #assert (data_matrix1.ndim in [1, 2]) & (data_matrix2.ndim in [1, 2])
    #data_matrix1 = data_matrix1[np.newaxis, :] if data_matrix1.ndim == 1 else data_matrix1
    #data_matrix2 = data_matrix2[np.newaxis, :] if data_matrix2.ndim == 1 else data_matrix2
    #assert data_matrix1.shape == data_matrix2.shape
    
    data_matrix1 = cupy.asnumpy(data_matrix1)
    data_matrix2 = cupy.asnumpy(data_matrix2)
    #n_samples, n_features = data_matrix1.shape
    #max_features = n_features
    
    # This is an auto correlation matrix -- need to skip the diagonal values
    #if n_samples == n_features:
    #    if cupy.allclose(data_matrix1, data_matrix2.T):
    #        print('auto')
    #        cupy.fill_diagonal(data_matrix1, cupy.inf)
    #        cupy.fill_diagonal(data_matrix2, cupy.inf)
    #        max_features -=1
    
    # TODO: fix ranking to handle identical values
    data_matrix1_sort = rankdata_gpu(data_matrix1)
    data_matrix2_sort = rankdata_gpu(data_matrix2)

    #top_k = max_features if (top_k==0) | (top_k>max_features) else top_k
    #mask_top_k = (data_matrix1_sort < top_k).reshape(n_samples, -1)
    #data_matrix1_top_k = data_matrix1_sort[mask_top_k].reshape(n_samples, -1) + 1
    #data_matrix2_top_k = data_matrix2_sort[mask_top_k].reshape(n_samples, -1) + 1
    
    data_matrix1_top_k = data_matrix1_sort
    data_matrix2_top_k = data_matrix2_sort
    
    return cupy.corrcoef(data_matrix1_sort, data_matrix2_sort)[0, 1]

In [305]:
a1 = np.random.rand(4)
b1 = np.random.rand(4)
a2 = cupy.array(a1)
b2 = cupy.array(b1)

print('CPU vs GPU')
print('Array A', a1, a2)
print('Array B', b1, b2)
print('Rank A', rankdata(a1), rankdata_gpu(a2))
print('Rank B', rankdata(b1), rankdata_gpu(b2))
print('Spearman Rho', spearmanr(a1, b1).correlation, spearman_rho(a2, b2))

CPU vs GPU
Array A [0.62591156 0.54648339 0.30253198 0.37325067] [0.62591156 0.54648339 0.30253198 0.37325067]
Array B [0.81755957 0.68942276 0.07536547 0.10987463] [0.81755957 0.68942276 0.07536547 0.10987463]
Rank A [4. 3. 1. 2.] [3. 4. 2. 1.]
Rank B [4. 3. 1. 2.] [3. 4. 2. 1.]
Spearman Rho 1.0 1.0


In [306]:
a1 = np.random.rand(4)
b1 = np.random.rand(4)
a2 = cupy.array(a1)
b2 = cupy.array(b1)

print('CPU vs GPU')
print('Array A', a1, a2)
print('Array B', b1, b2)
print('Rank A', rankdata(a1), rankdata_gpu(a2))
print('Rank B', rankdata(b1), rankdata_gpu(b2))
print('Spearman Rho', spearmanr(a1, b1).correlation, spearman_rho(a2, b2))

CPU vs GPU
Array A [0.5406782  0.54081674 0.33069447 0.56410849] [0.5406782  0.54081674 0.33069447 0.56410849]
Array B [0.31573929 0.63141851 0.90867346 0.80387163] [0.31573929 0.63141851 0.90867346 0.80387163]
Rank A [2. 3. 1. 4.] [3. 1. 2. 4.]
Rank B [1. 2. 4. 3.] [1. 2. 4. 3.]
Spearman Rho -0.19999999999999998 0.0
