In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import StandardScaler

# Example: build a synthetic dataset of 300 points scattered around 4 blob
# centers. The ground-truth blob assignments are returned as the second value
# and deliberately discarded: clustering below is unsupervised.
from sklearn.datasets import make_blobs

X, _ = make_blobs(
    n_samples=300,
    centers=4,
    cluster_std=0.6,
    random_state=42,  # fixed seed so the generated points are reproducible
)

# Step 1: Standardize the data
# Learn the per-feature scaling statistics from X, then apply them to X.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Step 2: Apply KMeans clustering
n_clusters = 4  # Number of clusters
# n_init is pinned explicitly. scikit-learn changed its default from 10 to
# "auto" in version 1.4 (and emits a FutureWarning in 1.2-1.3 when it is left
# unset), so relying on the default makes the number of restarts -- and
# potentially the resulting labels -- depend on the installed version.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
# Fit on the standardized data and get one integer cluster label per sample.
cluster_labels = kmeans.fit_predict(X_scaled)

# Step 3: Calculate the silhouette score
# silhouette_score returns the mean silhouette coefficient over all samples.
sil_score = silhouette_score(X_scaled, cluster_labels)
print(f"Silhouette Score for {n_clusters} clusters: {sil_score:.2f}")

# Step 4: Visualize the silhouette scores for each sample
# One silhouette coefficient per sample (same order as cluster_labels).
silhouette_vals = silhouette_samples(X_scaled, cluster_labels)

# Create the silhouette plot: one horizontal band per cluster, stacked
# vertically, where each band's width at a given row is that sample's score.
plt.figure(figsize=(8, 6))
y_lower = 10  # Offset for plotting
tick_positions = []  # vertical midpoint of each cluster's band
tick_labels = []  # cluster number shown at that midpoint
for i in range(n_clusters):
    # Select silhouette scores for the current cluster, sorted ascending so
    # the band has a smooth profile. np.sort returns a sorted copy.
    ith_cluster_silhouette_vals = np.sort(silhouette_vals[cluster_labels == i])

    size_cluster_i = ith_cluster_silhouette_vals.shape[0]
    y_upper = y_lower + size_cluster_i

    plt.fill_betweenx(
        np.arange(y_lower, y_upper),
        0,
        ith_cluster_silhouette_vals,
        alpha=0.7,
        label=f"Cluster {i + 1}",
    )

    # Remember where this band is centred so the y-axis can name the cluster
    # instead of showing the arbitrary stacked row offsets.
    tick_positions.append(y_lower + 0.5 * size_cluster_i)
    tick_labels.append(str(i + 1))

    y_lower = y_upper + 10  # Add some spacing between clusters

# Replace the default numeric y ticks (meaningless row offsets) with one tick
# per cluster, matching the "Cluster" axis label below.
plt.yticks(tick_positions, tick_labels)

# Add labels and title
# The dashed vertical line marks the mean silhouette score across all samples.
plt.axvline(x=sil_score, color="red", linestyle="--", label="Average Silhouette Score")
plt.title("Silhouette Plot for KMeans Clustering", fontsize=14)
plt.xlabel("Silhouette Score", fontsize=12)
plt.ylabel("Cluster", fontsize=12)
plt.legend(loc="best")
plt.tight_layout()
plt.show()
