# DBSCAN performance with scikit-learn (Python)

In [None]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import numpy as np
import time

In [None]:
# Generate 1,500 two-dimensional points grouped in Gaussian blobs; `y` holds
# the blob label of each point. No seed is fixed (random_state=None), so the
# dataset changes from run to run.
X, y = make_blobs(
    n_samples=1_500,
    n_features=2,
    cluster_std=0.5,
    center_box=(-10.0, 10.0),
    shuffle=True,
    random_state=None,
)
print(X.shape)

In [None]:
# Scatter plot of the two features, coloured by the labels stored in `y`.
plt.scatter(x=X[:, 0], y=X[:, 1], c=y)
plt.show()

In [None]:
import multiprocessing

from sklearn.cluster import DBSCAN

# Time scikit-learn's DBSCAN on X using every available core (n_jobs=-1).
# Both the estimator construction and the fit are inside the timed region.
t0 = time.perf_counter()
clustering = DBSCAN(eps=0.5, min_samples=5, n_jobs=-1)
clustering.fit(X)
python_time = time.perf_counter() - t0

print(f'Elapsed time {python_time}s with {multiprocessing.cpu_count()} cpu')

# Last expression of the cell: display the cluster label assigned to each point.
clustering.labels_

In [None]:
# Same scatter plot, this time coloured by the labels DBSCAN assigned.
plt.scatter(x=X[:, 0], y=X[:, 1], c=clustering.labels_)
plt.show()

In [None]:
from sklearn import metrics

# Agreement between the generating labels `y` and the DBSCAN labels,
# measured with the adjusted Rand index (displayed as the cell output).
metrics.adjusted_rand_score(labels_true=y, labels_pred=clustering.labels_)

## Phase 1: graph generation in Python

In [None]:
# Deterministic test data: the sequence 0..2999 folded into the range
# [-10, 9] with a period of 20, computed in one vectorised expression
# instead of a Python-level loop over the array.
X = (np.arange(2 * 1500, dtype=np.float32) % 20) - 10

# Column-major reshape: the first 1500 values form column 0 and the last 1500
# form column 1. Because 1500 is a multiple of 20, both columns hold the same
# values, so every point lies on the diagonal.
X = np.reshape(X, (1500, 2), order='F')

# NOTE: the colours reuse `y`, which is defined outside this cell and does not
# describe these synthetic points.
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()

In [None]:
# CuPy kernel invocation: build the `compute_degrees` and
# `compute_adjacency_list` kernels from the CUDA source stored on Drive and
# launch them on the points in X.
# Open question: why are the results from CuPy totally different from the
# native build and from the pure-Python check? (See the NOTE(review) comments
# below for two things to verify.)
import cupy as cp

# Distance threshold; the kernels receive it squared (threshold ** 2).
threshold = .1

# Read the CUDA source text that both RawKernels are built from.
code = None
with open('/content/drive/MyDrive/G-DBSCAN/gdbscan_cupy.cu', 'r') as f:
  code = f.read()

# One int32 slot per point. cp.empty leaves both buffers uninitialised, so
# they only hold meaningful values once something has written to them.
o_degrees = cp.empty(len(X), dtype=cp.int32)
adjListIx = cp.empty(len(X), dtype=cp.int32)

compute_degrees = cp.RawKernel(code, 'compute_degrees')

# Column-major flattening: for an (n, 2) array this places all first
# coordinates before all second coordinates.
x_gpu = cp.array(X.flatten('F'), dtype=cp.float32)

# Despite its name, `blocks` is used as the number of threads per block: it is
# passed as the block dimension in the launches below.
blocks = 1024
# Number of thread blocks; always enough to cover every point.
grid = (int(len(X) / blocks) + 1, 1, 1)

# NOTE(review): CuPy passes Python ints to a RawKernel as 64-bit integers and
# Python floats as doubles. The kernel signature is not visible here; if it
# declares `int` / `float` parameters, `2`, `len(X)` and `threshold ** 2`
# would need to be wrapped in np.int32 / np.float32 — confirm against the .cu
# file, as a width mismatch would corrupt every argument that follows.
args = (x_gpu, 2, len(X), o_degrees, threshold ** 2)
# Dynamic shared memory size in bytes: blocks * 2 * 4 — presumably two 4-byte
# floats per thread; confirm against the kernel.
shared_mem = blocks * 2 * 4

compute_degrees(grid, (blocks, 1, 1), args=args, shared_mem=shared_mem)
cp.cuda.Stream.null.synchronize()

# NOTE(review): cp.cumsum is an inclusive prefix sum, so adjListIx[i] already
# includes o_degrees[i] and adjListIx[-1] is the total of all degrees. Confirm
# whether the kernel expects inclusive or exclusive (start-offset) indices.
cp.cumsum(o_degrees, out=adjListIx)
cp.cuda.Stream.null.synchronize()


# The size o_degrees[-1] + adjListIx[-1] equals the total degree only for an
# exclusive scan; with the inclusive sum above it over-allocates by
# o_degrees[-1] entries.
adjList = cp.empty(int(o_degrees[-1]+adjListIx[-1]), dtype=cp.int32)

compute_adjacency_list = cp.RawKernel(code, 'compute_adjacency_list')
# Same scalar-width caveat as for the compute_degrees arguments.
args = (x_gpu, 2, len(X), o_degrees, adjListIx, adjList, threshold ** 2)

compute_adjacency_list(grid, (blocks, 1, 1), args=args, shared_mem=shared_mem)
cp.cuda.Stream.null.synchronize()

In [None]:
# Copy `o_degrees` from the GPU into a NumPy array and display it.
cp.asnumpy(o_degrees)

In [None]:
# Copy `adjListIx` (the cumulative sum of the degrees) to the host and display it.
cp.asnumpy(adjListIx)

In [None]:
# Copy `adjList` to the host and display it.
cp.asnumpy(adjList)

In [None]:
from scipy.spatial.distance import euclidean
from tqdm.notebook import tqdm

# CPU reference for the graph-building step: compare every pair of points and
# record, for each point, how many points lie strictly closer than `threshold`
# and which ones they are.
# NOTE: this rebinds `adjList`, the name the CuPy cell also assigns.
X = np.reshape(X, (1500, 2), order='F')

degrees = []
adjList = []

# TODO: add adjacency (note translated from Italian)

for xi in tqdm(X):
  # Indices of all points within the threshold of xi. With a positive
  # threshold this includes xi itself, whose distance is 0.
  neighbours = [j for j, xj in enumerate(X) if euclidean(xi, xj) < threshold]
  adjList.extend(neighbours)
  degrees.append(len(neighbours))

# Adj list seems much shorter than it should be. To check.
print(degrees, len(degrees))
print(adjList)

# Testing and profiling the CUDA implementation

In [None]:
# Show which GPU (if any) is attached to this runtime.
!nvidia-smi

In [None]:
# Work from the project folder (path looks like a Colab Google Drive mount).
%cd /content/drive/MyDrive/G-DBSCAN
# Alternative build targeting sm_75 GPUs:
#!nvcc --generate-line-info -arch=sm_75 gdbscan.cu && ./a.out
# Build for the K80 (sm_37) with source line information, then run the
# resulting a.out only if compilation succeeded.
!nvcc --generate-line-info -arch=sm_37 gdbscan.cu && ./a.out

In [None]:
# Run the binary under cuda-memcheck and keep only the first 100 lines of output.
!cuda-memcheck ./a.out | head -n 100

In [None]:
# Delete any *.qdrep files left in the working directory (presumably reports
# from earlier profiling runs); -f keeps rm from reporting an error when none exist.
!rm -f *.qdrep
# Profile the binary with Nsight Systems.
!nsys profile ./a.out

In [None]:
# Profile the compute_degrees kernel with Nsight Compute.
# NOTE(review): this path uses "My Drive" while the %cd cell uses "MyDrive" —
# confirm both resolve to the same folder.
# Variant limited to a single launch of the kernel:
#!ncu --kernel-name compute_degrees --launch-skip 0 --launch-count 1 "/content/drive/My Drive/G-DBSCAN/./a.out" 
!ncu --kernel-name compute_degrees "/content/drive/My Drive/G-DBSCAN/./a.out" 

In [None]:
# Profile the compute_adjacency_list kernel with Nsight Compute.
!ncu --kernel-name compute_adjacency_list "/content/drive/My Drive/G-DBSCAN/./a.out" 