In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# =============================
# Load the clustering results CSV
# =============================
# The file must provide an "Embedding" column; the label columns named in
# `algorithms` below are looked up by name when the evaluation runs.
df = pd.read_csv("Process_Clustering_Results/unique_processes_clusters.csv")

# Embeddings column = stringified list → convert to numpy arrays.
# NOTE(review): eval() executes whatever expression each cell contains, so this
# is only safe while the CSV comes from a trusted source. If the cells are plain
# Python literals, ast.literal_eval would be a safer replacement — confirm the
# stored format before switching.
df["Embedding"] = df["Embedding"].apply(lambda x: np.array(eval(x)))
# Stack the per-row vectors into a single matrix, one row per CSV row
# (np.vstack requires every embedding to have the same length).
X = np.vstack(df["Embedding"].values)

# Names of the label columns (one per clustering algorithm) to evaluate
algorithms = ["KMeans", "Agglo_or_MBK", "DBSCAN", "GMM"]

def evaluate_clustering(name, labels, features=None):
    """Print internal validation metrics for one clustering.

    Reports the silhouette score, the Calinski-Harabasz index and the
    Davies-Bouldin score of ``labels`` over the feature matrix.

    When ``name`` is ``"DBSCAN"``, points labelled ``-1`` are treated as noise:
    their share is printed and they are excluded from *all three* metrics, so
    that noise is never scored as if it were a real cluster.

    Args:
        name: Display name of the algorithm; ``"DBSCAN"`` enables noise handling.
        labels: Cluster label per sample (any array-like, e.g. a pandas Series).
        features: Optional feature matrix with one row per label. Defaults to
            the module-level ``X`` so existing two-argument calls keep working.

    Returns:
        None. Results are written to stdout.
    """
    print(f"\n=== {name} ===")
    labels = np.asarray(labels)

    suffix = ""
    keep = None
    if name == "DBSCAN":
        noise = labels == -1
        # Guard the empty case so the report never divides by zero.
        noise_fraction = noise.mean() if labels.size else 0.0
        print(f"Noise points: {noise_fraction*100:.2f}%")

        # Score only the clustered points; the noise bucket is not a cluster.
        keep = ~noise
        labels = labels[keep]
        suffix = " (no noise)"

    n_clusters = np.unique(labels).size

    # Fewer than two clusters → none of the metrics is defined.
    if n_clusters < 2:
        print("Not enough clusters to evaluate.")
        return

    # scikit-learn requires 2 <= n_clusters <= n_samples - 1 and raises
    # ValueError otherwise, so bail out with a message instead of crashing.
    if n_clusters >= labels.size:
        print("Every point is its own cluster; metrics are undefined.")
        return

    # Resolve the feature matrix only once we know metrics will be computed.
    data = X if features is None else np.asarray(features)
    if keep is not None:
        data = data[keep]

    sil = silhouette_score(data, labels)
    print(f"Silhouette Score{suffix}: {sil:.4f}")

    ch = calinski_harabasz_score(data, labels)
    db = davies_bouldin_score(data, labels)

    print(f"Calinski-Harabasz Index{suffix}: {ch:.2f}")
    print(f"Davies-Bouldin Score{suffix}: {db:.4f}")

# Run evaluation for each algorithm
for algo in algorithms:
    # Each algorithm's labels live in the CSV column that shares its name.
    evaluate_clustering(name=algo, labels=df[algo])



=== KMeans ===
Silhouette Score: 0.0357
Calinski-Harabasz Index: 7.48
Davies-Bouldin Score: 3.3103

=== Agglo_or_MBK ===
Silhouette Score: 0.0216
Calinski-Harabasz Index: 6.91
Davies-Bouldin Score: 3.4477

=== DBSCAN ===
Not enough clusters to evaluate.

=== GMM ===
Silhouette Score: 0.0341
Calinski-Harabasz Index: 7.45
Davies-Bouldin Score: 3.3184


: 