In [3]:
import hdbscan
import numpy as np
import pandas as pd

from sklearn.datasets import make_blobs


In [4]:
# Synthetic dataset for the clustering demo: 2000 samples with 10 features each
# (number of blob centers is left at make_blobs' default).
# random_state is fixed so that Restart Kernel -> Run All regenerates the same
# points, and therefore the same cluster labels and probabilities below.
blobs, labels = make_blobs(n_samples=2000, n_features=10, random_state=42)

## Important parameters
<ul>
    <li> 
        min_cluster_size - the smallest grouping size that you wish to consider a cluster
    </li>
    <li> 
        min_samples - a measure of how conservative you want your clustering to be. The larger 
        the value of min_samples you provide, the more conservative the clustering – more points will be 
        declared as noise, and clusters will be restricted to progressively denser areas.
    </li>
</ul>

In [5]:
# Collect the clustering parameters in one place, build the clusterer from
# them, then fit it on the generated blobs.
#   min_cluster_size - smallest grouping that is considered a cluster
#   min_samples      - larger values give a more conservative clustering
hdbscan_params = {"min_cluster_size": 10, "min_samples": 10}
clusterer = hdbscan.HDBSCAN(**hdbscan_params)
clusterer.fit(blobs)

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=True, cluster_selection_method='eom',
    core_dist_n_jobs=4, gen_min_span_tree=False, leaf_size=40,
    match_reference_implementation=False, memory=Memory(location=None),
    metric='euclidean', min_cluster_size=5, min_samples=None, p=None,
    prediction_data=False)

In [7]:
# labels_ stores the cluster information: one cluster number per data sample.
# Note that a label of -1 marks the sample as noise.
clusterer.labels_

array([1, 0, 0, ..., 0, 1, 2], dtype=int64)

In [9]:
# Number of clusters: the largest cluster label plus one
# (assumes cluster labels run from 0 upward; noise is labelled -1).
np.max(clusterer.labels_) + 1

3

In [10]:
# hdbscan implements soft clustering: each data point is assigned a cluster
# membership score ranging from 0.0 to 1.0. A score of 0.0 represents a sample
# that is not in the cluster at all (all noise points get this score), while a
# score of 1.0 represents a sample that is at the heart of the cluster (note
# that this is not the spatial centroid notion of core).

clusterer.probabilities_

# One probability (membership score) for each data point.

array([0.63928435, 0.54577043, 0.68495415, ..., 0.75363376, 0.84837895,
       1.        ])

In [None]:
# hdbscan also lets us associate new data points with the fitted clusters via prediction (see hdbscan.approximate_predict)