### Comparing DBSCAN in scikit-learn, pyclustering and hdbscan

In [118]:
import itertools

import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean

from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN

from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import Label
from bokeh import palettes

from pyclustering.cluster.dbscan import dbscan
import hdbscan

%load_ext memory_profiler

output_notebook()

In [128]:
# helper functions
def plot_data(X):
    """Scatter-plot the rows of ``X`` in two dimensions and show the figure.

    Parameters
    ----------
    X : array with a ``shape`` attribute, indexed as ``X[:, j]``
        One row per point. Must have at least two columns. When it has more
        than two, the points are projected onto their first two principal
        components and the share of variance those components explain is
        put in the figure title.

    Returns
    -------
    None
        The figure is displayed with ``show`` as a side effect.
    """
    if X.shape[1] > 2:
        pca = PCA(n_components=2, random_state=0)
        # Fit on X itself (rows are samples) and plot the projected samples.
        # Fitting on X.transpose() and plotting pca.components_ treats the
        # samples as features, so neither the coordinates nor the explained
        # variance describe the data being plotted.
        projected = pca.fit_transform(X)
        x1, x2 = projected[:, 0], projected[:, 1]
        variance_explained = sum(pca.explained_variance_ratio_) * 100
        title = "first two components explained {0: .1f} % of variance".format(variance_explained)
        p = figure(title=title)
    else:
        p = figure()
        x1, x2 = X[:, 0], X[:, 1]
    p.scatter(x1, x2)
    show(p)
    
    
def plot_cluster(X, y_pred, centers=None, xlabel="component 1", ylabel="component 2", title=""):
    """Build a bokeh scatter plot of ``X`` coloured by cluster label.

    Parameters
    ----------
    X : array indexed as ``X[:, 0]`` / ``X[:, 1]``
        Points to draw; only the first two columns are used.
    y_pred : iterable of int
        One cluster label per point. Labels select a colour from
        ``palettes.Set1[9]``; ``-1`` (used in this notebook for unassigned
        points) selects the last palette entry, and labels beyond the
        palette size wrap around instead of raising ``IndexError``.
    centers : array-like of (x, y) pairs, optional
        Cluster centres. Each one is drawn as a black diamond and annotated
        with its index. ``None`` or an empty sequence draws no centres.
    xlabel, ylabel : str
        Axis labels.
    title : str
        Figure title.

    Returns
    -------
    The bokeh figure. It is not shown here; the caller decides when to
    call ``show``.
    """
    palette = palettes.Set1[9]
    # Modulo keeps -1 on the last entry (same as a plain negative index) and
    # wraps labels >= len(palette) rather than failing on large cluster counts.
    colors = np.array([palette[label % len(palette)] for label in y_pred])
    TOOLS = "hover,crosshair,box_zoom, reset,save"
    # NOTE(review): plot_height/plot_width are kept as written; newer bokeh
    # releases may expect height/width instead. Confirm before upgrading.
    p = figure(tools=TOOLS, plot_height=450, plot_width=500, title=title)
    p.scatter(X[:, 0], X[:, 1], fill_alpha=0.5, line_color=None, fill_color=colors)
    if centers is not None and len(centers) != 0:
        # Accept plain lists of pairs as well as arrays: the column slicing
        # below needs an ndarray.
        centers = np.asarray(centers)
        for num, center in enumerate(centers):
            p.add_layout(Label(x=center[0], y=center[1], text=str(num)))
        p.diamond(centers[:, 0], centers[:, 1], fill_color="black", line_color="black")
    p.xaxis.axis_label = xlabel
    p.yaxis.axis_label = ylabel
    return p


def convert_X_to_pyclustering_format(X):
    """Return the rows of ``X`` as a list of lists.

    Each element yielded by iterating ``X`` is itself turned into a ``list``;
    this is the form handed to pyclustering's ``dbscan`` in this notebook.
    """
    return list(map(list, X))

def convert_y_to_plot_format(X, y_pred):
    """Turn a list of clusters (each a list of point indices) into a label array.

    Parameters
    ----------
    X : sized collection
        Only ``len(X)`` is used, to size the output.
    y_pred : iterable of iterables of int
        ``y_pred[k]`` holds the indices of the points in cluster ``k``.

    Returns
    -------
    numpy.ndarray
        One label per point. Points that appear in no cluster get ``-1``;
        if an index appears in several clusters the last one wins.
    """
    labels = [-1] * len(X)
    # Flatten to (point index, cluster number) pairs in cluster order.
    assignments = (
        (point_index, cluster_num)
        for cluster_num, members in enumerate(y_pred)
        for point_index in members
    )
    for point_index, cluster_num in assignments:
        labels[point_index] = cluster_num
    return np.array(labels)

In [131]:
# Linear map applied to the blobs below (the result is named X_aniso,
# presumably because the map makes the blobs anisotropic).
transformation = [[ 0.60834549, -0.63667341], [-0.40887718, 0.85253229]]

# Simulate 1000 two-feature points around 3 centers with a fixed seed,
# transform them, and look at the raw point cloud.
X, y = make_blobs(n_samples=1000, n_features=2, centers=3, random_state=13)
X_aniso = X.dot(transformation)
plot_data(X_aniso)

In [133]:
# Baseline: k-means with 3 clusters, matching the 3 centers used to simulate
# the data. random_state is fixed so the clustering (and the colours in the
# plot) are the same on every Restart & Run All.
sk_kmeans = KMeans(n_clusters=3, random_state=0)
y_pred = sk_kmeans.fit_predict(X_aniso)
p0 = plot_cluster(X_aniso, y_pred, title="scikit learn Kmeans")
show(p0)

In [136]:
# scikit-learn DBSCAN on the transformed blobs, with eps=0.35 and min_samples=30.
sk_dbscan = DBSCAN(eps=0.35, min_samples=30)
# Measure peak memory and run time of a fit_predict call.
%memit y_pred = sk_dbscan.fit_predict(X_aniso)
%timeit y_pred = sk_dbscan.fit_predict(X_aniso)
# Run once more as a plain statement so y_pred is bound for the plot below.
# NOTE(review): presumably assignments made inside the magics do not persist
# in the notebook namespace — confirm.
y_pred = sk_dbscan.fit_predict(X_aniso)
p1 = plot_cluster(X_aniso, y_pred, title="scikit learn DBSCAN")
show(p1)

peak memory: 187.51 MiB, increment: 0.00 MiB
100 loops, best of 3: 6.23 ms per loop


In [137]:
def pc_dbscan_two_step():
    """Build and run pyclustering's ``dbscan`` on ``X_aniso``.

    Construction and ``process()`` are wrapped in one function so both steps
    (and the list conversion of the input) are measured together by the
    profiling magics. Uses radius 0.35 and 25 neighbors.

    Returns
    -------
    The processed pyclustering ``dbscan`` instance.
    """
    points = convert_X_to_pyclustering_format(X_aniso)
    model = dbscan(points, 0.35, 25)
    model.process()
    return model
# Measure peak memory and run time of the full build-and-process step.
%memit pc_dbscan_two_step()
%timeit pc_dbscan_two_step()

# Run once more to keep the fitted object, then turn pyclustering's
# list-of-index-lists result into one label per point for plotting.
pc_dbscan = pc_dbscan_two_step()
y_pred = convert_y_to_plot_format(X_aniso, pc_dbscan.get_clusters())
p2 = plot_cluster(X_aniso, y_pred, title="pyclustering dbscan")
show(p2)

peak memory: 187.51 MiB, increment: 0.00 MiB
1 loop, best of 3: 1.61 s per loop


In [138]:
# HDBSCAN on the same data; only min_samples=15 is set here (no eps is passed).
hdbscan_ = hdbscan.HDBSCAN(min_samples=15)
# Measure peak memory and run time of fit.
%memit hdbscan_.fit(X_aniso)
%timeit hdbscan_.fit(X_aniso)
# Fit once more as a plain statement, then read the per-point labels for plotting.
hdbscan_.fit(X_aniso)
y_pred = np.array(hdbscan_.labels_)
p3 = plot_cluster(X_aniso, y_pred, title="hdbscan")
show(p3)

peak memory: 187.76 MiB, increment: 0.00 MiB
100 loops, best of 3: 15.8 ms per loop


### Conclusion
* Clustering quality: scikit-learn and pyclustering do a good job of separating clusters that lie close together, while hdbscan has a hard time when clusters are too close. Otherwise all three give similar results.
* Speed: on the same data, scikit-learn takes about 6 ms per fit, hdbscan about 16 ms, and pyclustering about 1.6 seconds (see the `%timeit` outputs above).
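* Memory: all three methods report a peak of about 187 MiB. This might come from precomputing a pair-wise distance matrix, but `%memit` reports an increment of 0.00 MiB in every case, so at this data size the peak mostly reflects the notebook process's baseline and the measurement cannot confirm that explanation.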