In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.cluster import KMeans, AgglomerativeClustering, MeanShift
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Wine dataset (UCI) as bundled with scikit-learn; only the feature matrix is
# kept, wrapped in a DataFrame so that columns carry the feature names.
data = load_wine()
X = pd.DataFrame(data=data["data"], columns=data["feature_names"])

# Define Preprocessing Techniques
def apply_preprocessing(X, method):
    """Apply the named preprocessing pipeline to a feature table.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature table. The log-based pipelines use ``np.log1p``,
        which is only defined for values greater than -1.
    method : str
        One of:

        * ``"No Data Processing"`` - return ``X`` itself (not a copy).
        * ``"Using Normalization"`` - standardise every column.
        * ``"Using Transform"`` - element-wise ``log1p``.
        * ``"Using PCA"`` - project onto 5 principal components.
        * ``"Using T+N"`` - ``log1p`` followed by standardisation.
        * ``"T+N+PCA"`` - ``log1p``, standardisation, then 5-component PCA.

    Returns
    -------
    pandas.DataFrame
        The processed features, indexed like ``X``. PCA outputs have
        integer column labels; the other pipelines keep ``X.columns``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the names listed above.
    """
    if method == "No Data Processing":
        return X
    if method == "Using Normalization":
        scaled = StandardScaler().fit_transform(X)
        return pd.DataFrame(scaled, columns=X.columns, index=X.index)
    if method == "Using Transform":
        return pd.DataFrame(np.log1p(X), columns=X.columns)  # Log Transform
    if method == "Using PCA":
        # Reduce to 5 components
        components = PCA(n_components=5).fit_transform(X)
        return pd.DataFrame(components, index=X.index)
    if method == "Using T+N":
        X_transformed = np.log1p(X)
        scaled = StandardScaler().fit_transform(X_transformed)
        return pd.DataFrame(scaled, columns=X.columns, index=X.index)
    if method == "T+N+PCA":
        X_transformed = np.log1p(X)
        X_normalized = StandardScaler().fit_transform(X_transformed)
        components = PCA(n_components=5).fit_transform(X_normalized)
        return pd.DataFrame(components, index=X.index)
    # Fail loudly instead of implicitly returning None, which would only
    # surface later as an unrelated error inside the clustering code.
    raise ValueError(f"Unknown preprocessing method: {method!r}")

# Clustering Algorithms
def perform_clustering(X, n_clusters, method):
    """Cluster ``X`` with the named algorithm and return the labels.

    Parameters
    ----------
    X : array-like
        Samples to cluster, passed unchanged to the estimator's
        ``fit_predict``.
    n_clusters : int or None
        Number of clusters for ``"K-Means"`` and ``"Hierarchical"``.
        Not used by ``"Mean-Shift"``, so ``None`` may be passed there.
    method : str
        ``"K-Means"``, ``"Hierarchical"`` or ``"Mean-Shift"``.

    Returns
    -------
    The cluster labels produced by ``fit_predict``, one per sample.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == "K-Means":
        # Fixed seed and explicit n_init keep runs reproducible.
        model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    elif method == "Hierarchical":
        model = AgglomerativeClustering(n_clusters=n_clusters)
    elif method == "Mean-Shift":
        # No cluster parameter for Mean-Shift; n_clusters is ignored.
        model = MeanShift()
    else:
        # Previously an unknown name left `model` unbound and surfaced as
        # an UnboundLocalError on the return line.
        raise ValueError(f"Unknown clustering method: {method!r}")
    return model.fit_predict(X)

# Define evaluation metrics
def evaluate_clustering(X, labels):
    """Score a clustering with three internal validity indices.

    Parameters
    ----------
    X : array-like
        The samples that were clustered.
    labels : sequence
        Cluster label of each sample, in the same order as ``X``.

    Returns
    -------
    tuple of float
        ``(silhouette, calinski_harabasz, davies_bouldin)``. All three are
        ``nan`` when the labelling is degenerate: fewer than two distinct
        labels, or as many distinct labels as samples.
    """
    n_samples = len(labels)
    n_labels = len(set(labels))
    # The sklearn metrics require 2 <= n_labels <= n_samples - 1 and raise
    # ValueError otherwise, so report "undefined" for every degenerate case
    # (not only the single-cluster one).
    if n_labels < 2 or n_labels >= n_samples:
        return np.nan, np.nan, np.nan
    silhouette = silhouette_score(X, labels)
    calinski = calinski_harabasz_score(X, labels)
    davies = davies_bouldin_score(X, labels)
    return silhouette, calinski, davies

# Names of the preprocessing pipelines to compare; each string is matched
# verbatim by apply_preprocessing.
preprocessing_methods = [
    "No Data Processing", "Using Normalization", "Using Transform",
    "Using PCA", "Using T+N", "T+N+PCA",
]

# Names of the clustering algorithms to compare; each string is matched
# verbatim by perform_clustering.
clustering_methods = [
    "K-Means",
    "Hierarchical",
    "Mean-Shift",
]

# Numbers of clusters to try (extend this list to evaluate more values).
cluster_counts = [3, 4]

# Run every (clustering method, preprocessing, cluster count) combination and
# print one table of scores per clustering method.
for cluster_method in clustering_methods:
    print(f"\n\nUsing {cluster_method} Clustering\n")

    # Maps "<preprocessing> (c=<count>)" -> [silhouette, calinski, davies]
    results_dict = {}

    for preprocessing in preprocessing_methods:
        # Preprocessing does not depend on the cluster count, so compute it
        # once per method instead of once per (method, count) pair.
        X_processed = apply_preprocessing(X, preprocessing)

        # Mean-Shift ignores the requested cluster count, so a single fit is
        # shared by every "(c=...)" column of the same preprocessing method.
        mean_shift_labels = None

        for c in cluster_counts:
            # Perform clustering
            if cluster_method == "Mean-Shift":
                if mean_shift_labels is None:
                    mean_shift_labels = perform_clustering(X_processed, None, cluster_method)
                labels = mean_shift_labels
            else:
                labels = perform_clustering(X_processed, c, cluster_method)

            # Compute metrics
            silhouette, calinski, davies = evaluate_clustering(X_processed, labels)

            # Store results
            results_dict[f"{preprocessing} (c={c})"] = [silhouette, calinski, davies]

    # Convert dictionary to DataFrame: one row per metric, one column per run
    df_results = pd.DataFrame(results_dict, index=["Silhouette", "Calinski-Harabasz", "Davies-Bouldin"])
    print(df_results.to_string(index=True))




Using K-Means Clustering

                   No Data Processing (c=3)  No Data Processing (c=4)  Using Normalization (c=3)  Using Normalization (c=4)  Using Transform (c=3)  Using Transform (c=4)  Using PCA (c=3)  Using PCA (c=4)  Using T+N (c=3)  Using T+N (c=4)  T+N+PCA (c=3)  T+N+PCA (c=4)
Silhouette                         0.571138                  0.562032                   0.284859                   0.260170               0.391821               0.337233         0.571237         0.562153         0.306927         0.295069       0.391030       0.378766
Calinski-Harabasz                561.815658                708.086676                  70.940008                  56.181355             120.025162              95.945856       561.877765       708.226941        76.019395        59.827978     116.347340      95.294050
Davies-Bouldin                     0.534243                  0.544345                   1.389188                   1.796892               1.018898               1.39981