## Aim

## Theory

## Code


In [None]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import (
    silhouette_score, 
    silhouette_samples,
    davies_bouldin_score, 
    calinski_harabasz_score
)
from sklearn.decomposition import PCA
from scipy.spatial.distance import cdist



# Load the mall-customer dataset and print a quick preview of it.
# NOTE(review): the path is absolute and specific to one machine; adjust it
# (or make it relative to the notebook) before running elsewhere.
df = pd.read_csv("/Users/jahanavisingh/Downloads/Mall_Customers.csv")
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() would add a stray "None" line.
df.info()



# Feature preparation: discard the identifier column, then one-hot encode the
# remaining categorical data (the Gender column, per the original note),
# dropping the first level of each encoded feature.
df = pd.get_dummies(df.drop(columns=["CustomerID"]), drop_first=True)

# Standardize every feature; the fitted scaler is kept so the same transform
# can be reused later.
scaler = StandardScaler().fit(df)
scaled_data = scaler.transform(df)



# Elbow method: record the K-Means inertia for each candidate cluster count
# so the "bend" in the curve can be inspected visually.
K = range(2, 10)
inertia_list = []

for k in K:
    # fit() returns the fitted estimator, so construction and fitting chain.
    km = KMeans(n_clusters=k, random_state=42).fit(scaled_data)
    inertia_list.append(km.inertia_)

plt.plot(K, inertia_list, marker='o')
plt.title("Elbow Method")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Inertia")
plt.show()


# Elbow method (WCSS view): fit K-Means for k = 2..10 and collect the
# within-cluster sum of squares, which scikit-learn exposes as `inertia_`.
cluster_counts = range(2, 11)  # shared by the fitting loop and the x-axis
wcss = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, random_state=42)
    # The standardized feature matrix in this notebook is `scaled_data`.
    kmeans.fit(scaled_data)
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(6, 4))
plt.plot(cluster_counts, wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()



# Fit the final 5-cluster K-Means model on the standardized features and read
# the per-row cluster assignments off the fitted estimator.
kmeans = KMeans(n_clusters=5, random_state=42).fit(scaled_data)
cluster_labels = kmeans.labels_

# Store each row's cluster assignment alongside its features.
df["Cluster"] = cluster_labels


# Internal (label-free) clustering quality metrics, all computed on the
# standardized features and the K-Means assignments.
sil = silhouette_score(scaled_data, cluster_labels)
db_index = davies_bouldin_score(scaled_data, cluster_labels)
ch_index = calinski_harabasz_score(scaled_data, cluster_labels)

# Assemble the report first, then emit it one line at a time.
metric_report = [
    "\nInternal Evaluation Metrics:",
    f"Silhouette Score: {sil:.4f}   (higher = better, range [-1,1])",
    f"Davies-Bouldin Index: {db_index:.4f}   (lower = better)",
    f"Calinski-Harabasz Index: {ch_index:.4f}   (higher = better)",
]
for report_line in metric_report:
    print(report_line)



# Silhouette plot: one horizontal band per cluster, each band showing that
# cluster's per-sample silhouette values in ascending order.
sample_silhouette_values = silhouette_samples(scaled_data, cluster_labels)

plt.figure(figsize=(10, 5))
band_gap = 10  # blank rows left before the first band and between bands
y_lower = band_gap
# Iterate over the labels actually present instead of a hard-coded count, so
# the plot stays correct if the number of clusters is changed elsewhere.
for i in np.unique(cluster_labels):
    ith_cluster = np.sort(sample_silhouette_values[cluster_labels == i])
    size_i = ith_cluster.shape[0]
    y_upper = y_lower + size_i

    plt.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster)
    # Label the band with its cluster id, vertically centred on the band.
    plt.text(-0.05, y_lower + 0.5 * size_i, str(i))

    y_lower = y_upper + band_gap

# Dashed reference line at the overall (mean) silhouette score.
plt.axvline(x=sil, color="red", linestyle="--")
plt.xlabel("Silhouette Value")
plt.ylabel("Clusters")
plt.title("Silhouette Plot for K-Means")
plt.show()


# Project the standardized features onto two principal components and draw
# the samples coloured by their K-Means cluster label.
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

# Transposing yields one row per component, which unpacks into the two axes.
first_component, second_component = pca_data.T
plt.scatter(first_component, second_component, c=cluster_labels)
plt.title("2D PCA Cluster Visualization")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()



# Same idea in three dimensions: reduce to three principal components and
# scatter the samples on a 3D axis, coloured by cluster label.
pca_3d = PCA(n_components=3)
pca_data_3d = pca_3d.fit_transform(scaled_data)

# One row per component after transposing: x, y and z coordinates.
xs_3d, ys_3d, zs_3d = pca_data_3d.T

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xs_3d, ys_3d, zs_3d, c=cluster_labels)
ax.set_title("3D PCA Cluster Visualization")
plt.show()




# Heatmap of the K-Means centroids: one row per cluster, one column per
# feature. The model was fit on standardized data, so the values shown are in
# standardized units, not the original feature scales.
# The feature columns are every column except the added "Cluster" label;
# dropping it by name avoids assuming it is the last column.
cluster_centers = pd.DataFrame(
    kmeans.cluster_centers_,
    columns=df.columns.drop("Cluster"),
)

# Drawn with matplotlib only, which this file already imports.
center_values = cluster_centers.to_numpy()
plt.figure(figsize=(8, 5))
heatmap_image = plt.imshow(center_values, cmap="viridis", aspect="auto")
plt.colorbar(heatmap_image)
plt.xticks(range(center_values.shape[1]), cluster_centers.columns)
plt.yticks(range(center_values.shape[0]), cluster_centers.index)

# Annotate every cell; switch the text colour at the midpoint of the value
# range so labels stay legible on both the dark and bright ends of viridis.
midpoint = (center_values.min() + center_values.max()) / 2
for (row, col), value in np.ndenumerate(center_values):
    plt.text(
        col,
        row,
        f"{value:.2g}",
        ha="center",
        va="center",
        color="black" if value > midpoint else "white",
    )

plt.title("Cluster Centroid Heatmap")
plt.show()


# Per-cluster summary: the mean of every numeric feature, followed by how
# many rows were assigned to each cluster.
cluster_means = df.groupby("Cluster").mean(numeric_only=True)
print("\nCluster Means (numeric features):")
print(cluster_means)

cluster_sizes = df["Cluster"].value_counts()
print("\nCluster Sizes:")
print(cluster_sizes)



## Result


## Learning Outcome