In [None]:
# ==========================================
# KMEANS CLUSTERING ON PCA COMPONENTS
# ==========================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score

In [None]:
# Read the PCA-transformed feature table from CSV into a DataFrame.
pca_csv_path = "pca_transformed_data.csv"
X_pca = pd.read_csv(pca_csv_path)

# Report the (rows, columns) shape, then show the first rows as the cell output.
print("Shape of PCA Data:", X_pca.shape)
X_pca.head()

In [None]:
# Sanity checks before clustering: count missing and infinite entries.
# NOTE(review): np.isinf requires numeric input — presumably every column
# of X_pca is numeric; confirm against the CSV.
null_count = X_pca.isnull().sum().sum()
print("Any nulls?", null_count)

inf_count = np.isinf(X_pca.values).sum()
print("Any infinite values?", inf_count)

Elbow Method

In [None]:
# Candidate cluster counts; the metric cells below iterate over this same range.
K_range = range(2, 11)

# Fit KMeans once per candidate K and record each fit's inertia
# (within-cluster sum of squared distances to the nearest centroid).
inertia = []
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=20).fit(X_pca)
    inertia.append(kmeans.inertia_)

# Plot inertia against K; the bend ("elbow") where the curve flattens
# is the usual visual cue for a reasonable number of clusters.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(K_range, inertia, marker='o')
ax.set_title("Elbow Method")
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Inertia")
ax.grid(True)
plt.show()

Silhouette Score

In [None]:
# Mean silhouette coefficient for every candidate K.
# Values lie in [-1, 1]; higher means tighter, better-separated clusters.
silhouette_scores = []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
    labels = kmeans.fit_predict(X_pca)
    score = silhouette_score(X_pca, labels)
    silhouette_scores.append(score)

# Plot the score curve so the best-scoring K stands out visually.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(K_range, silhouette_scores, marker='o')
ax.set_title("Silhouette Score vs K")
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Silhouette Score")
ax.grid(True)
plt.show()

# Also print the exact values; the arrow was previously mojibake ("â†’",
# i.e. UTF-8 "→" decoded as cp1252) and is restored here.
for k, score in zip(K_range, silhouette_scores):
    print(f"K={k} → Silhouette Score = {score:.4f}")

Calinski-Harabasz Score

In [None]:
# Calinski-Harabasz index for every candidate K
# (ratio of between-cluster to within-cluster dispersion; higher is better).
ch_scores = []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
    labels = kmeans.fit_predict(X_pca)
    score = calinski_harabasz_score(X_pca, labels)
    ch_scores.append(score)

# Plot the index against K.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(K_range, ch_scores, marker='o')
ax.set_title("Calinski-Harabasz Score vs K")
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("CH Score")
ax.grid(True)
plt.show()

Davies-Bouldin Score

In [None]:
# Davies-Bouldin index for every candidate K
# (average similarity of each cluster to its most similar one; lower is better).
db_scores = []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
    labels = kmeans.fit_predict(X_pca)
    score = davies_bouldin_score(X_pca, labels)
    db_scores.append(score)

# Plot the index against K.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(K_range, db_scores, marker='o')
ax.set_title("Davies-Bouldin Score vs K")
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("DB Score")
ax.grid(True)
plt.show()