In [25]:
from typing import Callable
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, MeanShift, Birch, AffinityPropagation, MiniBatchKMeans
from sklearn.preprocessing import LabelEncoder

In [8]:
df = pd.read_csv('datasets/iris.csv',
                      skiprows=1,
                      names = ['sepal-length',
                               'sepal-width',
                               'petal-length',
                               'petal-width',
                               'class'])

In [9]:
df = df.sample(frac=1).reset_index(drop=True)

In [12]:
df.sample(10)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
116,6.3,3.4,5.6,2.4,Iris-virginica
149,7.0,3.2,4.7,1.4,Iris-versicolor
109,4.8,3.4,1.9,0.2,Iris-setosa
148,6.3,3.3,4.7,1.6,Iris-versicolor
2,6.9,3.1,5.1,2.3,Iris-virginica
135,4.6,3.6,1.0,0.2,Iris-setosa
91,5.0,2.3,3.3,1.0,Iris-versicolor
112,5.0,3.4,1.5,0.2,Iris-setosa
85,5.8,4.0,1.2,0.2,Iris-setosa
88,6.7,3.3,5.7,2.1,Iris-virginica


In [14]:
encoder = LabelEncoder()
df['class'] = encoder.fit_transform(df['class'])

In [16]:
encoder.classes_

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [15]:
df.sample(5)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
127,5.7,3.8,1.7,0.3,0
117,6.8,3.0,5.5,2.1,2
29,5.7,3.0,4.2,1.2,1
51,5.6,2.9,3.6,1.3,1
102,5.5,3.5,1.3,0.2,0


In [23]:
iris_features = df.drop('class', axis=1)
iris_features

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width
0,5.7,4.4,1.5,0.4
1,5.4,3.9,1.7,0.4
2,6.9,3.1,5.1,2.3
3,6.0,2.7,5.1,1.6
4,4.9,2.5,4.5,1.7
...,...,...,...,...
145,7.2,3.0,5.8,1.6
146,5.7,2.5,5.0,2.0
147,6.0,2.9,4.5,1.5
148,6.3,3.3,4.7,1.6


In [29]:
iris_features = df.drop('class', axis=1).values
iris_features

array([[5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.7, 0.4],
       [6.9, 3.1, 5.1, 2.3],
       [6. , 2.7, 5.1, 1.6],
       [4.9, 2.5, 4.5, 1.7],
       [4.6, 3.4, 1.4, 0.3],
       [7.7, 3.8, 6.7, 2.2],
       [6.7, 3. , 5. , 1.7],
       [5.1, 3.7, 1.5, 0.4],
       [5. , 3.3, 1.4, 0.2],
       [6.1, 2.8, 4. , 1.3],
       [5.5, 2.5, 4. , 1.3],
       [5.1, 3.8, 1.5, 0.3],
       [6. , 2.2, 5. , 1.5],
       [5.8, 2.6, 4. , 1.2],
       [4.5, 2.3, 1.3, 0.3],
       [6.9, 3.1, 4.9, 1.5],
       [5. , 3.4, 1.6, 0.4],
       [7.1, 3. , 5.9, 2.1],
       [5.1, 3.5, 1.4, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [6.3, 3.3, 6. , 2.5],
       [6.3, 2.9, 5.6, 1.8],
       [6.7, 3.1, 4.4, 1.4],
       [4.7, 3.2, 1.6, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.6, 2.8, 4.9, 2. ],
       [6.1, 3. , 4.9, 1.8],
       [7.7, 3. , 6.1, 2.3],
       [5.7, 3. , 4.2, 1.2],
       [5. , 3.6, 1.4, 0.2],
       [5.6, 3. , 4.5, 1.5],
       [6.5, 3. , 5.8, 2.2],
       [7.9, 3.8, 6.4, 2. ],
       [6.1, 2

In [30]:
iris_labels = df['class'].values
iris_labels

array([0, 0, 2, 1, 2, 0, 2, 1, 0, 0, 1, 1, 0, 2, 1, 0, 1, 0, 2, 0, 0, 2,
       2, 1, 0, 0, 2, 2, 2, 1, 0, 1, 2, 2, 1, 2, 1, 2, 0, 1, 0, 2, 0, 0,
       1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 0, 2, 0, 1, 1, 0, 0, 2, 0, 0, 1, 2,
       1, 2, 0, 0, 2, 0, 1, 1, 0, 1, 0, 1, 2, 2, 0, 0, 2, 1, 0, 0, 2, 0,
       2, 1, 2, 1, 2, 2, 0, 1, 2, 2, 2, 2, 0, 0, 0, 2, 1, 2, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 2, 2, 2, 2, 2, 0, 0, 1, 2, 1, 2, 0, 0, 1, 0, 1, 1,
       2, 0, 1, 0, 2, 0, 2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1])

In [24]:
def k_means(
        data: np.ndarray,
        n_clusters: int = 3,
        max_iter: int = 1000,
        random_state: int | None = 42
) -> object:
    if not isinstance(data, np.ndarray):
        raise TypeError('data must be a numpy ndarray')

    return KMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        random_state=random_state,
        n_init='auto'
    ).fit(data)

In [26]:
ClusteringModel = KMeans | AgglomerativeClustering | DBSCAN | MeanShift | Birch | AffinityPropagation | MiniBatchKMeans

In [27]:
def evaluate_clustering_model(
        clustering_model: Callable[[np.ndarray, ...], object],
        data: np.ndarray,
        labels_true: np.ndarray
):

    model = clustering_model(data)
    labels_pred = model.labels_

    homogeneity = metrics.homogeneity_score(labels_true, labels_pred)
    completeness = metrics.completeness_score(labels_true, labels_pred)
    v_measure = metrics.v_measure_score(labels_true, labels_pred)
    ari = metrics.adjusted_rand_score(labels_true, labels_pred)
    ami = metrics.adjusted_mutual_info_score(labels_true, labels_pred)

    unique_labels = np.unique(labels_pred)

    if len(unique_labels) > 1:
        silhouette = metrics.silhouette_score(data, labels_pred)
    else:
        silhouette = np.nan

    metrics_dict = {
        "homogeneity": homogeneity,
        "completeness": completeness,
        "v_measure": v_measure,
        "ARI": ari,
        "AMI": ami,
        "silhouette": silhouette,
    }

    header = "homog\tcompl\tv-meas\tARI\tAMI\tsilhouette"
    print(header)
    print("-" * len(header.expandtabs()))
    print(
        "{homogeneity:.3f}\t{completeness:.3f}\t{v_measure:.3f}\t"
        "{ARI:.3f}\t{AMI:.3f}\t{silhouette:.3f}".format(**metrics_dict)
    )

    return metrics_dict

In [31]:
evaluate_clustering_model(k_means, iris_features, iris_labels)

homog	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.736	0.747	0.742	0.716	0.739	0.551


{'homogeneity': 0.7364192881252849,
 'completeness': 0.7474865805095324,
 'v_measure': 0.7419116631817836,
 'ARI': 0.7163421126838476,
 'AMI': 0.7386548254402864,
 'silhouette': 0.5509643746707443}

# MiniBatchKMeans

- fast version KMeans for the large datasets.
- Huge data, big data, streaming and when the KMeans is too slow.

In [32]:
def mini_batch_k_means(data: np.ndarray, n_cluster: int = 3, max_iter: int = 1000) -> object:
    if not isinstance(data, np.ndarray):
        raise TypeError()

    return MiniBatchKMeans(
        n_clusters=n_cluster,
        max_iter=max_iter,
        batch_size=20
    ).fit(data)

In [33]:
evaluate_clustering_model(mini_batch_k_means, iris_features, iris_labels)

homog	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.751	0.765	0.758	0.730	0.755	0.553


{'homogeneity': 0.7514854021988338,
 'completeness': 0.7649861514489815,
 'v_measure': 0.7581756800057784,
 'ARI': 0.7302382722834697,
 'AMI': 0.7551191675800484,
 'silhouette': 0.5525919445499758}