In [None]:
import os
import numpy as np
import pandas as pd
import clustbench
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from genieclust import Genie
from sklearn.metrics import (adjusted_rand_score, normalized_mutual_info_score, fowlkes_mallows_score, silhouette_score, calinski_harabasz_score, davies_bouldin_score)

In [None]:
# Base URL passed to clustbench when loading datasets (targets the v1.1.0 ref).
data_url = "https://github.com/gagolews/clustering-data-v1/raw/v1.1.0"

# Names of the datasets to benchmark.
datasets = [
    'a1', 'a2', 'a3',
    'aggregation', 'compound', 'd31', 'r15',
    'flame', 'jain', 'pathbased', 'spiral',
    's1', 's2', 's3', 's4',
    'unbalance',
]

# Parameter grids explored for each algorithm family.
genie_params = [0.1, 0.3, 0.5, 0.7, 0.9]
agg_linkages = ['single', 'average', 'complete', 'ward']
dbscan_params = dict(
    eps=[0.1, 0.2, 0.3, 0.5],
    min_samples=[3, 5, 10],
)

# Names of the reported metrics; see
# https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation
metrics = [
    'ARI',
    'NMI',
    'FMI',
    'Silhouette',
    'Calinski-Harabasz',
    'Davies-Bouldin',
]

In [None]:
def evaluate_clustering(X, y_true, clustering_algorithm):
    """
    Fit a clustering algorithm on X and score the resulting partition.

    Parameters
    ----------
    X : data passed to ``clustering_algorithm.fit_predict`` and to the
        internal (label-free) metrics.
    y_true : reference labels used by the external metrics.
    clustering_algorithm : object exposing ``fit_predict(X)``.

    Returns
    -------
    dict
        Keys, in order: 'ARI', 'NMI', 'FMI', 'Silhouette',
        'Calinski-Harabasz', 'Davies-Bouldin'. The last three are
        ``np.nan`` unless the number of distinct predicted labels is at
        least 2 and strictly smaller than the number of samples.

    Notes
    -----
    Every distinct predicted label counts towards the label total, so a
    special marker label (e.g. a noise label) is treated like any other.
    """
    labels = clustering_algorithm.fit_predict(X)

    # External metrics: compare the prediction against the reference labels.
    external_scorers = (
        ('ARI', adjusted_rand_score),
        ('NMI', normalized_mutual_info_score),
        ('FMI', fowlkes_mallows_score),
    )
    scores = {}
    for key, scorer in external_scorers:
        scores[key] = scorer(y_true, labels)

    # Internal metrics are only defined for 2 <= n_labels <= n_samples - 1,
    # so check the distinct-label count before calling them.
    distinct = np.unique(labels).size
    computable = 2 <= distinct < len(labels)

    internal_scorers = (
        ('Silhouette', silhouette_score),
        ('Calinski-Harabasz', calinski_harabasz_score),
        ('Davies-Bouldin', davies_bouldin_score),
    )
    for key, scorer in internal_scorers:
        scores[key] = scorer(X, labels) if computable else np.nan

    return scores

In [None]:
def run_experiment(X, y_true, n_clusters):
    """
    Run every configured clustering algorithm on one dataset and score it.

    The algorithm grids come from the module-level ``agg_linkages``,
    ``dbscan_params`` and ``genie_params``; scoring is delegated to
    ``evaluate_clustering``.

    Parameters
    ----------
    X : data to cluster.
    y_true : reference labels forwarded to ``evaluate_clustering``.
    n_clusters : number of clusters requested from KMeans, Agglomerative
        Clustering and Genie (DBSCAN does not take it).

    Returns
    -------
    pandas.DataFrame
        One row per (method, parameter setting) with columns 'method',
        'param' and one column per metric returned by
        ``evaluate_clustering``.
    """
    records = []

    def _add_record(method, param, algorithm):
        # Evaluate one configured estimator and store its labelled scores.
        scores = evaluate_clustering(X, y_true, algorithm)
        records.append({'method': method, 'param': param, **scores})

    # KMeans, with three different seeds
    for i in range(3):
        seed = 42 + i
        _add_record(
            'KMeans',
            f'random_state={seed}',
            KMeans(n_clusters=n_clusters, random_state=seed),
        )

    # Agglomerative Clustering
    for linkage in agg_linkages:
        _add_record(
            'Agglomerative',
            f'linkage={linkage}',
            AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage),
        )

    # DBSCAN
    for eps in dbscan_params['eps']:
        for min_samples in dbscan_params['min_samples']:
            _add_record(
                'DBSCAN',
                f'eps={eps}, min_samples={min_samples}',
                DBSCAN(eps=eps, min_samples=min_samples),
            )

    # Genie: the threshold must actually be passed to the estimator,
    # otherwise every run uses the default and the 'param' label is wrong.
    for t in genie_params:
        _add_record(
            'Genie',
            f'threshold={t}',
            Genie(n_clusters=n_clusters, gini_threshold=t),
        )

    # turn the list of dicts into a DataFrame
    return pd.DataFrame.from_records(records)
    

In [23]:
# Reuse the cached results when the CSV already exists; otherwise run the
# whole benchmark once and write the CSV so later runs can skip the work.
if os.path.exists('./benchmark_metrics.csv'):
    df = pd.read_csv('./benchmark_metrics.csv')
else:
    all_results = []

    for name in datasets:
        print(f"Dataset processing: {name}")
        bench = clustbench.load_dataset('sipu', name, url=data_url)

        # Only entry 0 of the labels / cluster counts is used
        # (presumably the primary reference labelling - confirm against
        # the clustbench documentation).
        X = bench.data
        y_true = bench.labels[0]
        n_clusters = bench.n_clusters[0]

        df_res = run_experiment(X, y_true, n_clusters)   # a DataFrame

        # Tag every result row with the dataset it came from.
        all_results.extend(
            {**row, 'dataset': name} for row in df_res.to_dict('records')
        )

    df = pd.DataFrame(all_results)
    df.to_csv('benchmark_metrics.csv', index=False)

Dataset processing: a1
Dataset processing: a2
Dataset processing: a3
Dataset processing: aggregation
Dataset processing: compound
Dataset processing: d31
Dataset processing: r15
Dataset processing: flame
Dataset processing: jain
Dataset processing: pathbased
Dataset processing: spiral
Dataset processing: s1
Dataset processing: s2
Dataset processing: s3
Dataset processing: s4
Dataset processing: unbalance
