In [11]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

Load the Boston Housing dataset

In [12]:
# Download the Boston Housing data from OpenML as pandas objects
# (as_frame=True). Only the feature table is kept for the clustering below.
boston = fetch_openml("boston", version=1, as_frame=True)
X = boston["data"]

Scale the data

In [13]:
# Standardise every feature to zero mean and unit variance so that no single
# column dominates the Euclidean distances K-Means relies on.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Keep a DataFrame copy of the scaled matrix that carries the original
# column names, for label-based grouping later on.
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X.columns)

Perform K-Means clustering for k from 2 to 6

In [14]:
# Sweep candidate cluster counts and score each fit with the mean silhouette
# coefficient (range [-1, 1]; higher means tighter, better-separated clusters).
silhouette_scores = []   # silhouette_scores[i] belongs to k_values[i]
k_values = range(2, 7)   # k = 2..6 inclusive
kmeans_models = {}       # k -> fitted KMeans, reused once the best k is chosen

for k in k_values:
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' in version 1.4, so relying on the default makes the scores (and
    # possibly the chosen k) depend on the installed version. Several restarts
    # also reduce the chance that one unlucky initialisation distorts the
    # comparison across k.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(X_scaled)
    silhouette_avg = silhouette_score(X_scaled, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    kmeans_models[k] = kmeans
    print(f"K={k}, Silhouette Score: {silhouette_avg:.4f}")

K=2, Silhouette Score: 0.3601
K=3, Silhouette Score: 0.2448
K=4, Silhouette Score: 0.2275
K=5, Silhouette Score: 0.2389
K=6, Silhouette Score: 0.2291




Determine the optimal k value

In [15]:
# Pick the k whose fit scored highest. np.argmax returns the first position of
# the maximum, so a tie resolves to the smaller k.
best_position = int(np.argmax(silhouette_scores))
optimal_k = k_values[best_position]
best_score = silhouette_scores[best_position]
print(f"\nOptimal k: {optimal_k} (Highest Silhouette Score: {best_score:.4f})")


Optimal k: 2 (Highest Silhouette Score: 0.3601)


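Retrieve the fitted model for the optimal k and label each observation with its cluster (the labels are used below to get cluster means and compare them with the centroids)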

In [16]:
# Reuse the model that was already fitted for the best-scoring k (stored in
# kmeans_models during the sweep) instead of fitting again.
optimal_kmeans = kmeans_models[optimal_k]
# Assign every scaled observation to one of the fitted model's clusters.
cluster_labels = optimal_kmeans.predict(X_scaled)
# NOTE: this adds a 'Cluster' column to X_scaled_df in place, so from here on
# the frame holds the labels alongside the scaled feature columns.
X_scaled_df['Cluster'] = cluster_labels

Calculate feature means for each cluster

In [17]:
# Average each scaled feature within every cluster (one row per cluster label).
cluster_means = X_scaled_df.groupby(by='Cluster').mean()
print("\nCluster Means (Scaled Features):", cluster_means, sep="\n")


Cluster Means (Scaled Features):
             CRIM        ZN     INDUS      CHAS       NOX        RM       AGE  \
Cluster                                                                         
0       -0.390124  0.262392 -0.620368  0.002912 -0.584675  0.243315 -0.435108   
1        0.725146 -0.487722  1.153113 -0.005412  1.086769 -0.452263  0.808760   

              DIS       RAD       TAX   PTRATIO         B     LSTAT  
Cluster                                                              
0        0.457222 -0.583801 -0.631460 -0.285808  0.326451 -0.446421  
1       -0.849865  1.085145  1.173731  0.531248 -0.606793  0.829787  


Get centroids (KMeans cluster_centers_)

In [18]:
# Wrap the fitted cluster centres in a DataFrame, one row per centroid, with
# the original feature names as columns so it lines up with the cluster means.
centroid_labels = ['Centroid ' + str(i) for i in range(optimal_k)]
centroids = pd.DataFrame(
    data=optimal_kmeans.cluster_centers_,
    index=centroid_labels,
    columns=X.columns,
)
print("\nCentroids (Scaled Features):", centroids, sep="\n")


Centroids (Scaled Features):
                CRIM        ZN     INDUS      CHAS       NOX        RM  \
Centroid 0 -0.390124  0.262392 -0.620368  0.002912 -0.584675  0.243315   
Centroid 1  0.725146 -0.487722  1.153113 -0.005412  1.086769 -0.452263   

                 AGE       DIS       RAD       TAX   PTRATIO         B  \
Centroid 0 -0.435108  0.457222 -0.583801 -0.631460 -0.285808  0.326451   
Centroid 1  0.808760 -0.849865  1.085145  1.173731  0.531248 -0.606793   

               LSTAT  
Centroid 0 -0.446421  
Centroid 1  0.829787  


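Compare cluster means with centroids. Because each K-Means centroid is the mean of the points assigned to it, the differences should be zero up to floating-point rounding (on the order of 1e-15).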

In [19]:
# Subtract the centroid coordinates from the per-cluster means. The raw array
# is used (not the DataFrame) so the subtraction is positional: the row labels
# differ ('Centroid i' vs. the integer cluster label) and would not align.
centroid_matrix = centroids.values[:optimal_k]
difference = cluster_means.sub(centroid_matrix)
print("\nDifference (Cluster Means - Centroids):", difference, sep="\n")


Difference (Cluster Means - Centroids):
                 CRIM            ZN         INDUS          CHAS           NOX  \
Cluster                                                                         
0        5.551115e-17 -1.110223e-16 -3.330669e-16  3.339343e-16 -1.110223e-16   
1       -2.220446e-16 -4.996004e-16 -1.110223e-15  2.437286e-16 -6.661338e-16   

                   RM           AGE           DIS           RAD           TAX  \
Cluster                                                                         
0        5.551115e-17 -5.551115e-17 -3.885781e-16  3.330669e-16 -1.110223e-16   
1        2.220446e-16 -3.330669e-16  1.110223e-16 -1.110223e-15 -3.996803e-15   

              PTRATIO             B         LSTAT  
Cluster                                            
0        0.000000e+00  2.775558e-16  0.000000e+00  
1       -1.443290e-15  1.110223e-16 -3.330669e-16  
