In [26]:
from KMDHierarchicalClustering.KMDclustering.KMDClustering import KMDLinkage
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice
import cluster_scoring
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
import warnings
%matplotlib inline

In [27]:
### generate datasets ###
# Seed NumPy's global RNG up front: make_circles and make_moons below are
# called without an explicit random_state argument.
np.random.seed(1)
n_samples = 1000

# nested circle data
noisy_circles = datasets.make_circles(
    n_samples=n_samples,
    factor=0.3,
    noise=0.14,
)

# moons dataset
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=0.24)

# Both blob datasets below are generated with this explicit seed.
random_state = 170

# Anisotropically distributed data: blobs pushed through a fixed 2x2 linear map.
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)

# Blobs with a different standard deviation for each of the three clusters.
varied = datasets.make_blobs(
    n_samples=n_samples,
    cluster_std=[1.5, 2, 1],
    random_state=random_state,
)

In [28]:
# Set up cluster parameters
# Baseline values; every dataset entry may override any subset of these keys.
# Of these, the estimator setup in this notebook reads only 'eps',
# 'n_neighbors', 'n_clusters' and 'k'.
default_base = dict(
    quantile=.3,
    eps=0.15,
    damping=.9,
    preference=-200,
    n_neighbors=10,
    n_clusters=3,
    min_samples=20,
    xi=0.07,
    min_cluster_size=0.1,
    k=10,
)

# Each entry pairs an (X, y) dataset with the overrides that are merged on top
# of default_base for that dataset.
# NOTE(review): this rebinds `datasets`, which until now referred to the
# sklearn `datasets` module imported at the top of the notebook. Re-running the
# data-generation cell after this one fails unless the imports are re-run first.
datasets = [
    (
        noisy_circles,
        dict(damping=.60, preference=-240, quantile=.2, n_clusters=2,
             min_samples=20, xi=0.25, k=11),
    ),
    (
        noisy_moons,
        dict(damping=.75, preference=-220, n_clusters=2, k=40),
    ),
    (
        varied,
        dict(eps=.18, n_neighbors=2, min_samples=5, xi=0.035,
             min_cluster_size=.2, k=10),
    ),
    (
        aniso,
        dict(eps=.15, n_neighbors=2, min_samples=20, xi=0.1,
             min_cluster_size=.2, k=10),
    ),
]

In [29]:
# set up figure
# Figure number 1, 21 x 12.5 inches, with tight margins and almost no spacing
# between subplots.
# NOTE(review): with `%matplotlib inline` the figure appears to be rendered at
# the end of this cell (the captured output reports "0 Axes"), so subplots
# added from a later cell may land on a different figure - confirm.
f = plt.figure(num=1, figsize=(9 * 2 + 3, 12.5))
f.subplots_adjust(
    left=.02,
    right=.98,
    bottom=.001,
    top=.96,
    wspace=.05,
    hspace=.01,
)
# 1-based position of the next subplot to draw
plot_num = 1

<Figure size 1512x900 with 0 Axes>

In [30]:
# iterate on all datasets
for i_dataset, (dataset, algo_params) in enumerate(datasets):
    # defaults first, then the dataset-specific overrides (overrides win)
    params = {**default_base, **algo_params}
    n_clusters = params['n_clusters']

    X, y = dataset

    # standardise the features so the same parameters suit every dataset
    X = StandardScaler().fit_transform(X)

    # k-nearest-neighbour graph, averaged with its transpose to make it
    # symmetric; passed as the connectivity of the single-linkage model below
    knn_graph = kneighbors_graph(
        X, n_neighbors=params['n_neighbors'], include_self=False
    )
    connectivity = 0.5 * (knn_graph + knn_graph.T)

    # Create cluster objects
    HAC_single = cluster.AgglomerativeClustering(
        n_clusters=n_clusters,
        linkage='single',
        connectivity=connectivity,
    )
    spectral = cluster.SpectralClustering(
        n_clusters=n_clusters,
        eigen_solver='arpack',
        affinity="nearest_neighbors",
    )
    dbscan = cluster.DBSCAN(eps=params['eps'])
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average",
        n_clusters=n_clusters,
    )
    gmm = mixture.GaussianMixture(
        n_components=n_clusters,
        covariance_type='full',
    )

    # the two KMD models are configured identically except for `certainty`
    kmd_shared = dict(
        k=params['k'],
        n_clusters=n_clusters,
        min_cluster_size=10,
        affinity='compute',
        k_scan_range=(1, 100, 3),
    )
    kmd = KMDLinkage(X, certainty=0.5, **kmd_shared)
    kmd_core = KMDLinkage(X, certainty=1, **kmd_shared)

    # (panel title, estimator) pairs; the first entry carries 0 in place of an
    # estimator and stands for the ground-truth labels
    clustering_algorithms = (
        ('True labels', 0),
        ('SpectralClustering', spectral),
        ('HAC average', average_linkage),
        ('HAC single', HAC_single),
        ('DBSCAN', dbscan),
        ('GaussianMixture', gmm),
        ('KMD HAC', kmd),
        ('KMD HAC(core)', kmd_core),
    )
        

In [None]:
    # cluster each dataset and draw one panel per (dataset, algorithm) pair
    palette = ['#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628',
               '#984ea3', '#999999', '#e41a1c', '#dede00']
    for name, algorithm in clustering_algorithms:
        if name == 'True labels':
            # placeholder entry (its "algorithm" is 0): show the ground truth
            y_pred = y
        else:
            # Fit first, THEN read the labels. Previously these steps were
            # mutually exclusive branches of one if/elif chain, so a fitted
            # estimator never assigned y_pred and every panel re-used the
            # labels left over from the 'True labels' entry.
            algorithm.fit(X)
            if hasattr(algorithm, 'labels_'):
                y_pred = algorithm.labels_
            else:
                y_pred = algorithm.predict(X)

        # Integer labels are required to index the colour table below. The
        # builtin int replaces the np.int alias, which NumPy 1.24 removed.
        y_pred = np.asarray(y_pred).astype(int)

        # Points labelled -1 are excluded from the *_eval arrays; these are
        # not consumed within this cell (presumably meant for scoring).
        core_idx = y_pred != -1
        y_pred_eval = y_pred[core_idx]
        y_eval = y[core_idx]

        plt.subplot(len(datasets), len(clustering_algorithms), plot_num)

        # only the first row of panels carries the algorithm name
        if i_dataset == 0:
            plt.title(name, size=18)

        # one colour per label 0..max(y_pred), cycling through the palette
        colors = np.array(list(islice(cycle(palette), int(max(y_pred) + 1))))
        # add black color for outliers (if any): label -1 indexes the last entry
        colors = np.append(colors, ["#000000"])
        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)
        plt.xticks(())
        plt.yticks(())
        plot_num += 1


  affinity=affinity)
