In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth, AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Load the Wine dataset bundled with scikit-learn and keep only its feature
# matrix; the clustering below never uses the class labels.
wine = load_wine()
X = wine["data"]

# Preprocessing functions
def normalize(X):
    """Return a copy of ``X`` rescaled by a freshly fitted ``MinMaxScaler``."""
    scaler = MinMaxScaler()
    return scaler.fit_transform(X)

def log_transform(X):
    """Return ``X`` passed through ``np.log1p`` via a ``FunctionTransformer``."""
    transformer = FunctionTransformer(np.log1p)
    return transformer.fit_transform(X)

def apply_pca(X, n_components=2):
    """Fit a PCA on ``X`` and return ``X`` projected onto it.

    Args:
        X: Feature matrix to reduce.
        n_components: Value forwarded to ``PCA(n_components=...)``.

    Returns:
        The result of ``PCA.fit_transform`` on ``X``.
    """
    reducer = PCA(n_components=n_components)
    return reducer.fit_transform(X)

# Preprocessing variations
# Maps the row label used in the result tables to a one-argument callable
# that turns the raw feature matrix into the matrix that gets clustered.
# "T" stands for the log transform and "N" for normalization, applied in
# that order.
preprocessing_options = {
    "No Data Processing": lambda data: data,
    "Using Normalization": normalize,
    "Using Transform": log_transform,
    "Using PCA": lambda data: apply_pca(data),
    "Using T+N": lambda data: normalize(log_transform(data)),
    "T+N+PCA": lambda data: apply_pca(
        normalize(log_transform(data))
    ),
}

# Initialize result tables
def init_result_table():
    """Return a fresh results container: one empty dict per metric name."""
    metric_names = ("Silhouette", "Calinski-Harabasz", "Davies-Bouldin")
    # A comprehension gives every metric its own dict object.
    return {metric: {} for metric in metric_names}

def evaluate_clustering(X, labels):
    """Score a clustering of ``X`` with three internal validity metrics.

    Args:
        X: The data that was clustered.
        labels: Cluster label assigned to each row of ``X``.

    Returns:
        A 3-tuple ``(silhouette, calinski_harabasz, davies_bouldin)``.
    """
    silhouette = silhouette_score(X, labels)
    calinski_harabasz = calinski_harabasz_score(X, labels)
    davies_bouldin = davies_bouldin_score(X, labels)
    return (silhouette, calinski_harabasz, davies_bouldin)

# Clustering evaluation function
def _score_clustering(X, clusterer, params, description):
    """Build, fit and score one clusterer; return the three table entries.

    Args:
        X: Preprocessed data to cluster.
        clusterer: Callable returning an object with ``fit_predict``.
        params: Keyword arguments passed to ``clusterer``.
        description: Human-readable label used in the failure warning.

    Returns:
        ``(silhouette, calinski_harabasz, davies_bouldin)`` rounded/truncated
        for display, or ``("NA", "NA", "NA")`` if any step raised.
    """
    import warnings

    try:
        model = clusterer(**params)
        labels = model.fit_predict(X)
        sil, ch, db = evaluate_clustering(X, labels)
        return round(sil, 2), int(ch), round(db, 2)
    except Exception as exc:
        # Best-effort: one failing combination must not abort the whole
        # comparison, but the reason is surfaced instead of being swallowed.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate.
        warnings.warn(
            f"Clustering failed for {description}: {exc}",
            RuntimeWarning,
            stacklevel=2,
        )
        return "NA", "NA", "NA"


def run_clustering(X_original, clusterer, cluster_range=(3, 6), fixed=False):
    """Evaluate ``clusterer`` on every preprocessing variant of the data.

    Args:
        X_original: Raw feature matrix; each entry of
            ``preprocessing_options`` is applied to it independently.
        clusterer: Callable that builds a model exposing ``fit_predict``.
        cluster_range: Arguments for ``range(...)`` giving the ``c`` values
            that become the table columns.
        fixed: If true, the model is built as ``clusterer(n_clusters=c)`` for
            each ``c``. If false, ``clusterer()`` takes no cluster count, so
            it is fitted once per preprocessing and the same scores are
            recorded under every ``c``.

    Returns:
        A dict keyed by metric name; each value maps ``(name, c)`` to the
        formatted score, or to ``"NA"`` when clustering or scoring failed.
    """
    results = init_result_table()
    metric_names = ("Silhouette", "Calinski-Harabasz", "Davies-Bouldin")

    for name, preprocess in preprocessing_options.items():
        X = preprocess(X_original)
        shared_scores = None  # cache for the parameterless (fixed=False) fit

        for c in range(*cluster_range):
            if fixed:
                scores = _score_clustering(
                    X, clusterer, {"n_clusters": c}, f"{name!r} with c={c}"
                )
            else:
                # The model does not depend on c, so refitting it for every
                # column would only repeat identical work.
                if shared_scores is None:
                    shared_scores = _score_clustering(
                        X, clusterer, {}, f"{name!r}"
                    )
                scores = shared_scores

            for metric, value in zip(metric_names, scores):
                results[metric][(name, c)] = value

    return results

# Run KMeans Clustering.
# KMeans starts from random centroids, so without a seed the scores change
# from run to run; the seed (an arbitrary constant) makes the table
# reproducible.
kmeans_results = run_clustering(
    X,
    lambda n_clusters: KMeans(n_clusters=n_clusters, random_state=42),
    (3, 6),
    fixed=True,
)

# Run Hierarchical Clustering
hierarchical_results = run_clustering(X, AgglomerativeClustering, (3, 6), fixed=True)

# Run Mean Shift Clustering (built without a cluster count, hence fixed=False)
meanshift_results = run_clustering(X, MeanShift, (3, 6), fixed=False)

# Function to display results as table
def print_results_table(title, results):
    """Print one table per metric, preprocessing as rows and ``c`` as columns.

    Args:
        title: Heading shown in the banner line.
        results: Dict mapping metric name to a dict keyed by ``(name, c)``.
    """
    banner = "=" * 20
    print(f"\n{banner} {title} {banner}")

    for metric in results:
        print(f"\n{metric}:")
        # The (name, c) tuple keys become a two-level index; unstacking the
        # second level turns every c into its own column.
        table = pd.Series(results[metric]).unstack(level=1)
        table.columns = ["c={}".format(c) for c in table.columns]
        table.index.name = "Preprocessing"
        print(table.to_string())

# Show results: one banner plus three metric tables per algorithm.
for section_title, section_results in (
    ("Using K-Means Clustering", kmeans_results),
    ("Using Hierarchical Clustering", hierarchical_results),
    ("Using Mean Shift Clustering", meanshift_results),
):
    print_results_table(section_title, section_results)




Silhouette:
                      c=3   c=4   c=5
Preprocessing                        
No Data Processing   0.57  0.56  0.56
T+N+PCA              0.60  0.53  0.47
Using Normalization  0.30  0.26  0.20
Using PCA            0.56  0.56  0.51
Using T+N            0.33  0.24  0.25
Using Transform      0.39  0.34  0.33

Calinski-Harabasz:
                     c=3  c=4  c=5
Preprocessing                     
No Data Processing   561  702  703
T+N+PCA              426  392  353
Using Normalization   83   65   52
Using PCA            497  706  722
Using T+N             91   65   59
Using Transform      120   92   78

Davies-Bouldin:
                      c=3   c=4   c=5
Preprocessing                        
No Data Processing   0.53  0.55  0.48
T+N+PCA              0.55  0.69  0.79
Using Normalization  1.31  1.75  1.94
Using PCA            0.55  0.54  0.57
Using T+N            1.24  1.70  1.59
Using Transform      1.02  1.36  1.47


Silhouette:
                      c=3   c=4   c=5
Preproces