In [9]:
import time
import warnings
from itertools import cycle, islice

import matplotlib.pyplot as plt
import numpy as np

from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler

import pandas as pd

import hvplot
import hvplot.pandas
import holoviews as hv
from holoviews import opts

In [10]:
import os

# Request a single OpenMP thread for this process.
# NOTE(review): this cell runs after numpy / scikit-learn were imported in the
# first cell; some OpenMP runtimes read the variable only when they are
# loaded -- confirm the setting takes effect, or move it above those imports.
os.environ.update({"OMP_NUM_THREADS": "1"})

In [11]:
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [12]:
# ============
# Build the toy datasets. The sample count is large enough to show how the
# algorithms scale, yet small enough to keep running times short.
# Every dataset is stored as an (X, y) pair.
# ============
n_samples = 500
seed = 30           # seed for circles, moons, plain blobs and the random points
random_state = 170  # seed for the anisotropic and varied-variance blobs

noisy_circles = datasets.make_circles(
    n_samples=n_samples,
    factor=0.5,
    noise=0.05,
    random_state=seed,
)
noisy_moons = datasets.make_moons(
    n_samples=n_samples,
    noise=0.05,
    random_state=seed,
)
blobs = datasets.make_blobs(
    n_samples=n_samples,
    random_state=seed,
)

# Random 2-D points drawn from `rng.rand`, paired with None in place of labels.
rng = np.random.RandomState(seed)
no_structure = (rng.rand(n_samples, 2), None)

# Anisotropically distributed data: blobs pushed through a fixed 2x2 linear map.
X, y = datasets.make_blobs(
    n_samples=n_samples,
    random_state=random_state,
)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = X.dot(transformation)
aniso = (X_aniso, y)

# Blobs whose three clusters are generated with different standard deviations.
varied = datasets.make_blobs(
    n_samples=n_samples,
    cluster_std=[1.0, 2.5, 0.5],
    random_state=random_state,
)

In [13]:
# Baseline hyper-parameters. Each dataset's override dict is layered on top of
# a copy of this mapping before the estimators are built.
default_base = dict(
    quantile=0.3,
    eps=0.3,
    damping=0.9,
    preference=-200,
    n_neighbors=3,
    n_clusters=3,
    min_samples=7,
    xi=0.05,
    min_cluster_size=0.1,
    allow_single_cluster=True,
    hdbscan_min_cluster_size=15,
    hdbscan_min_samples=3,
    random_state=42,
)

# Registry of datasets to benchmark.
# Each key is a display label; each value is a pair
# (dataset, overrides), where `dataset` is an (X, y) tuple and `overrides`
# holds the hyper-parameters that replace the baseline values for that dataset.
# NOTE(review): the numeric prefix in each label presumably fixes the display
# order of the datasets -- confirm before renaming keys.
data = {
    '1:noisy_circles': (
        noisy_circles,
        dict(
            damping=0.77,
            preference=-240,
            quantile=0.2,
            n_clusters=2,
            min_samples=7,
            xi=0.08,
        ),
    ),
    '2:noisy_moons': (
        noisy_moons,
        dict(
            damping=0.75,
            preference=-220,
            n_clusters=2,
            min_samples=7,
            xi=0.1,
        ),
    ),
    '3:varied': (
        varied,
        dict(
            eps=0.18,
            n_neighbors=2,
            min_samples=7,
            xi=0.01,
            min_cluster_size=0.2,
        ),
    ),
    '4:aniso': (
        aniso,
        dict(
            eps=0.15,
            n_neighbors=2,
            min_samples=7,
            xi=0.1,
            min_cluster_size=0.2,
        ),
    ),
    '5:blobs': (
        blobs,
        dict(
            min_samples=7,
            xi=0.1,
            min_cluster_size=0.2,
        ),
    ),
    # No overrides: the baseline parameters are used unchanged.
    '6:no_structure': (no_structure, dict()),
}

In [14]:
# Fit every clustering algorithm on every dataset. For each
# (algorithm, dataset) pair this cell stores
#   * a scatter plot coloured by predicted label in `plot_dict`, and
#   * the elapsed fit (+ predict, where needed) time in seconds in `perf_list`.
#
# The former `! pip uninstall progressbar` line was removed: it ran on every
# execution, could block waiting for confirmation (no `-y`) and changed the
# environment from inside an analysis cell.
# NOTE(review): `max_value` / `enable_colors` look like the progressbar2 API;
# make sure that distribution is what `import progressbar` resolves to.
import progressbar

# Expected number of entries in `clustering_algorithms`; used to size the
# progress bar up front and verified once the dict is built.
num_alg = 14
with progressbar.ProgressBar(max_value=len(data)*num_alg, enable_colors=False) as bar:
    plot_dict = {}
    perf_list = []

    for data_name in data:
        # update parameters with dataset-specific values
        ds, algo_params = data[data_name]

        params = default_base.copy()
        params.update(algo_params)
        X, y = ds
        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(X)

        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(X, quantile=params["quantile"])

        # k-nearest-neighbour connectivity matrix, made symmetric.
        # NOTE(review): `connectivity` is not passed to any estimator below;
        # either wire it into the agglomerative models or drop it.
        connectivity = kneighbors_graph(
            X, n_neighbors=params["n_neighbors"], include_self=False
        )
        connectivity = 0.5 * (connectivity + connectivity.T)

        # ============
        # Create cluster objects
        # ============
        ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        two_means = cluster.MiniBatchKMeans(
            n_clusters=params["n_clusters"],
            random_state=params["random_state"],
        )

        spectral = cluster.SpectralClustering(
            n_clusters=params["n_clusters"],
            eigen_solver="arpack",
            affinity="nearest_neighbors",
            random_state=params["random_state"],
        )
        dbscan = cluster.DBSCAN(eps=params["eps"])
        hdbscan = cluster.HDBSCAN(
            min_samples=params["hdbscan_min_samples"],
            min_cluster_size=params["hdbscan_min_cluster_size"],
            allow_single_cluster=params["allow_single_cluster"],
        )
        optics = cluster.OPTICS(
            min_samples=params["min_samples"],
            xi=params["xi"],
            min_cluster_size=params["min_cluster_size"],
        )
        affinity_propagation = cluster.AffinityPropagation(
            damping=params["damping"],
            preference=params["preference"],
            random_state=params["random_state"],
        )

        birch = cluster.Birch(n_clusters=params["n_clusters"])

        gmm = mixture.GaussianMixture(
            n_components=params["n_clusters"],
            covariance_type="full",
            random_state=params["random_state"],
        )

        ward = cluster.AgglomerativeClustering(
            n_clusters=params["n_clusters"], linkage="ward"
        )
        complete = cluster.AgglomerativeClustering(
            n_clusters=params["n_clusters"], linkage="complete"
        )
        average = cluster.AgglomerativeClustering(
            n_clusters=params["n_clusters"], linkage="average"
        )
        single = cluster.AgglomerativeClustering(
            n_clusters=params["n_clusters"], linkage="single"
        )

        # Seeded like the other randomised estimators so that a
        # Restart & Run All reproduces the same k-means partition.
        kmeans = cluster.KMeans(
            n_clusters=params["n_clusters"],
            init='k-means++',
            n_init=10,
            random_state=params["random_state"],
        )

        clustering_algorithms = {
            "kMeans": kmeans,
            "MiniBatch\nKMeans": two_means,
            "Affinity\nPropagation": affinity_propagation,
            "MeanShift": ms,
            "Spectral\nClustering": spectral,
            "Single\nLinkage": single,
            "Average\nLinkage": average,
            "Complete\nLinkage": complete,
            "Ward\nLinkage": ward,
            "DBSCAN": dbscan,
            "HDBSCAN": hdbscan,
            "OPTICS": optics,
            "BIRCH": birch,
            "Gaussian\nMixture": gmm,
        }
        # Fail early with a clear message if the progress-bar size above has
        # drifted from the actual number of algorithms.
        if len(clustering_algorithms) != num_alg:
            raise ValueError(
                f"num_alg is {num_alg} but {len(clustering_algorithms)} "
                "clustering algorithms are defined; update num_alg."
            )

        for algorithm_name in clustering_algorithms:
            # one tick per (dataset, algorithm) pair
            bar.next()

            algorithm = clustering_algorithms[algorithm_name]

            # perf_counter is monotonic, so the measured interval cannot be
            # distorted by system clock adjustments.
            t0 = time.perf_counter()

            # catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the "
                    + "connectivity matrix is [0-9]{1,2}"
                    + " > 1. Completing it to avoid stopping the tree early.",
                    category=UserWarning,
                )
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral embedding"
                    + " may not work as expected.",
                    category=UserWarning,
                )
                algorithm.fit(X)

            # Estimators without `labels_` need an explicit predict call;
            # its cost is included in the timing.
            if hasattr(algorithm, "labels_"):
                y_pred = algorithm.labels_.astype(int)
            else:
                y_pred = algorithm.predict(X)

            t1 = time.perf_counter()

            # Build a fresh frame per plot instead of overwriting a shared
            # 'Color' column, so no plot can observe a later algorithm's labels.
            plot_df = pd.DataFrame(X).assign(Color=y_pred)

            plot = plot_df.hvplot(kind='scatter', x='0', y='1', marker='circle', size=10, alpha=1, by='Color')

            # Keys are (algorithm, dataset) tuples.
            plot_dict[(algorithm_name, data_name)] = hv.Overlay(plot).opts(toolbar='disable')
            perf_list.append([algorithm_name, data_name, t1 - t0])

[0m

100% (84 of 84) |########################| Elapsed Time: 0:00:07 Time:  0:00:07


In [15]:
# Timing summary: one row per algorithm, one column per dataset. Each cell is
# the first recorded duration for that (algorithm, dataset) pair.
perf_df = pd.DataFrame(perf_list, columns=['Алгоритм', 'Датасет', 'Время'])
perf_df.pivot_table(
    index='Алгоритм',
    columns='Датасет',
    values='Время',
    aggfunc='first',
)

Датасет,1:noisy_circles,2:noisy_moons,3:varied,4:aniso,5:blobs,6:no_structure
Алгоритм,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Affinity\nPropagation,0.099547,0.090788,0.074762,0.092493,0.083775,0.089171
Average\nLinkage,0.003499,0.003399,0.003392,0.003299,0.005763,0.003896
BIRCH,0.006304,0.006506,0.007066,0.027433,0.026476,0.006589
Complete\nLinkage,0.002682,0.002747,0.002526,0.002541,0.00299,0.00326
DBSCAN,0.00235,0.002235,0.002261,0.002063,0.002647,0.002599
Gaussian\nMixture,0.004698,0.003863,0.006603,0.0298,0.017757,0.005635
HDBSCAN,0.005312,0.005183,0.00415,0.004702,0.004431,0.008813
MeanShift,0.061992,0.025533,0.067842,0.045816,0.268632,0.155224
MiniBatch\nKMeans,0.015129,0.038706,0.019305,0.015185,0.389222,0.446019
OPTICS,0.298677,0.297798,0.304547,0.326904,0.305437,0.320877


In [16]:
# Assemble all plots into one grid, with an extra row of text labels naming
# each algorithm.
# NOTE(review): the label row uses the dataset key 'a', presumably so it sorts
# after the numbered dataset keys ('1:...' to '6:...') -- confirm where the
# row ends up in the rendered grid.
for algorithm_name in clustering_algorithms:
    plot_dict[(algorithm_name, 'a')] = hv.Overlay(
        [hv.Text(.5, .5, algorithm_name, fontsize=10)])

# Every key in `plot_dict` is an (algorithm, dataset) tuple, so the key
# dimensions must be declared in that order. They were previously listed as
# (dataset, algorithm), which attached each label to the wrong dimension.
holomap = hv.HoloMap(plot_dict, kdims=['Алгоритм', 'Набор данных'])
grid = hv.GridSpace(holomap).opts(plot_size=85, xaxis=False, yaxis=False)
grid