In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import warnings
warnings.filterwarnings("ignore")

# Load dataset
iris = datasets.load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)

# Preprocessing
def preprocess_data(X, method):
    """Apply the preprocessing pipeline named by ``method`` to ``X``.

    Supported methods:

    * ``"none"``        -- return ``X`` itself, untouched.
    * ``"normalize"``   -- min-max scaling.
    * ``"standardize"`` -- standard (z-score) scaling.
    * ``"pca"``         -- standard scaling followed by a 2-component PCA.
    * ``"t+n"``         -- ``log1p`` transform followed by min-max scaling.
    * ``"t+n+pca"``     -- ``"t+n"`` followed by a 2-component PCA.

    Args:
        X: Feature matrix (the caller in this file passes a DataFrame).
        method: One of the method names listed above.

    Returns:
        ``X`` itself for ``"none"``; otherwise whatever the last
        transformer's ``fit_transform`` returns.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == "none":
        return X
    if method == "normalize":
        return MinMaxScaler().fit_transform(X)
    if method == "standardize":
        return StandardScaler().fit_transform(X)
    if method == "pca":
        return PCA(n_components=2).fit_transform(StandardScaler().fit_transform(X))
    if method in ("t+n", "t+n+pca"):
        # Both variants share the log1p + min-max stage; only the PCA differs.
        X_n = MinMaxScaler().fit_transform(np.log1p(X))
        if method == "t+n":
            return X_n
        return PCA(n_components=2).fit_transform(X_n)
    # Previously an unknown name fell through and returned None, which only
    # surfaced later as a confusing error inside the clustering code.
    raise ValueError(f"Unknown preprocessing method: {method!r}")

# Evaluation metrics
def evaluate_clustering(X, labels):
    """Score a clustering of ``X`` with three internal validity indices.

    Args:
        X: The data that was clustered.
        labels: Cluster label assigned to each sample of ``X``.

    Returns:
        A dict with the keys ``"Silhouette"``, ``"Calinski-Harabasz"`` and
        ``"Davies-Bouldin"`` (in that order), each holding the value returned
        by the corresponding scikit-learn metric function.
    """
    metric_functions = (
        ("Silhouette", silhouette_score),
        ("Calinski-Harabasz", calinski_harabasz_score),
        ("Davies-Bouldin", davies_bouldin_score),
    )
    return {name: score_fn(X, labels) for name, score_fn in metric_functions}

# Experiment grid
# Cluster counts to evaluate for every algorithm.
cluster_range = [3, 4, 5]
# Method names accepted by preprocess_data().
preprocess_methods = [
    "none",
    "normalize",
    "standardize",
    "pca",
    "t+n",
    "t+n+pca",
]

# Accumulator: one dict per (method, preprocessing, cluster count) combination.
results = []

# KMeans
# Fit KMeans for every preprocessing method and cluster count, recording the
# three validity scores (or NaN for all three when fitting/scoring fails).
for prep in preprocess_methods:
    X_prep = preprocess_data(X, prep)
    for c in cluster_range:
        try:
            # Fixed seed and n_init so repeated runs give the same labels.
            kmeans = KMeans(n_clusters=c, n_init=10, random_state=42)
            labels = kmeans.fit_predict(X_prep)
            scores = evaluate_clustering(X_prep, labels)
        except Exception:
            # Best effort: keep the row but mark the scores as missing.
            # `Exception` (not a bare `except`) lets Ctrl-C / SystemExit
            # still stop the run.
            scores = {"Silhouette": np.nan, "Calinski-Harabasz": np.nan, "Davies-Bouldin": np.nan}
        results.append({"Method": "KMeans", "Preprocessing": prep, "Clusters": c, **scores})

# Hierarchical
# Fit agglomerative clustering for every preprocessing method and cluster
# count, recording the three validity scores (or NaN for all three on failure).
for prep in preprocess_methods:
    X_prep = preprocess_data(X, prep)
    for c in cluster_range:
        try:
            hc = AgglomerativeClustering(n_clusters=c)
            labels = hc.fit_predict(X_prep)
            scores = evaluate_clustering(X_prep, labels)
        except Exception:
            # Best effort: keep the row but mark the scores as missing.
            # `Exception` (not a bare `except`) lets Ctrl-C / SystemExit
            # still stop the run.
            scores = {"Silhouette": np.nan, "Calinski-Harabasz": np.nan, "Davies-Bouldin": np.nan}
        results.append({"Method": "Hierarchical", "Preprocessing": prep, "Clusters": c, **scores})

# MeanShift
# The cluster count is derived from the fitted labels rather than requested,
# so the scores are reported only on the row whose "Clusters" value equals the
# count actually found; every other row for that preprocessing gets NaN.
nan_scores = {"Silhouette": np.nan, "Calinski-Harabasz": np.nan, "Davies-Bouldin": np.nan}
for prep in preprocess_methods:
    X_prep = preprocess_data(X, prep)
    try:
        bandwidth = estimate_bandwidth(X_prep, quantile=0.2)
        if bandwidth <= 0:
            bandwidth = 1  # fallback
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        labels = ms.fit_predict(X_prep)
        n_clusters_ = len(np.unique(labels))
        scores = evaluate_clustering(X_prep, labels)
    except Exception:
        # Best effort: a failed fit/scoring yields NaN rows for this
        # preprocessing. `Exception` (not a bare `except`) lets Ctrl-C /
        # SystemExit still stop the run.
        n_clusters_ = None
        scores = nan_scores
    for c in cluster_range:
        row_scores = scores if c == n_clusters_ else nan_scores
        results.append({"Method": "MeanShift", "Preprocessing": prep, "Clusters": c, **row_scores})

# Assemble the collected rows into a table and colour the score columns.
score_columns = ["Silhouette", "Calinski-Harabasz", "Davies-Bouldin"]
pd.set_option("display.precision", 3)
df_final = pd.DataFrame(results)
df_final_styled = df_final.style.background_gradient(
    subset=score_columns,
    cmap='coolwarm',
)

# Last expression of the cell, so the notebook renders the styled table.
df_final_styled

Unnamed: 0,Method,Preprocessing,Clusters,Silhouette,Calinski-Harabasz,Davies-Bouldin
0,KMeans,none,3,0.552819,561.627757,0.661972
1,KMeans,none,4,0.498051,530.765808,0.780307
2,KMeans,none,5,0.488749,495.541488,0.805965
3,KMeans,normalize,3,0.504769,359.845074,0.760277
4,KMeans,normalize,4,0.445065,314.472999,0.900449
5,KMeans,normalize,5,0.352571,289.505999,0.957021
6,KMeans,standardize,3,0.459948,241.904402,0.833595
7,KMeans,standardize,4,0.386941,207.265914,0.869814
8,KMeans,standardize,5,0.341947,203.268233,0.953046
9,KMeans,pca,3,0.509168,293.856516,0.709931
