In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.datasets import load_wine, fetch_california_housing
from sklearn.metrics import silhouette_score, homogeneity_score, completeness_score


In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                'acceleration', 'model_year', 'origin', 'car_name']
df_auto = pd.read_csv(url, names=column_names, na_values='?', 
                      comment='\t', sep=' ', skipinitialspace=True)

continuous_features = ['mpg', 'displacement', 'horsepower', 'weight', 'acceleration']
df_cont = df_auto[continuous_features]

df_cont.fillna(df_cont.mean(), inplace=True)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_cont)

agg_clust = AgglomerativeClustering(n_clusters=3, linkage='average')
clusters = agg_clust.fit_predict(X_scaled)
df_auto['cluster'] = clusters

print("Problem 1: Auto-mpg Dataset")
print("Cluster Means:")
print(df_auto.groupby('cluster')[continuous_features].mean())
print("\nCluster Variances:")
print(df_auto.groupby('cluster')[continuous_features].var())

print("\nOrigin Category Means:")
print(df_auto.groupby('origin')[continuous_features].mean())
print("\nOrigin Category Variances:")
print(df_auto.groupby('origin')[continuous_features].var())

cross_tab = pd.crosstab(df_auto['origin'], df_auto['cluster'])
print("\nCross-tabulation of Origin and Cluster:")
print(cross_tab)

Problem 1: Auto-mpg Dataset
Cluster Means:
               mpg  displacement  horsepower       weight  acceleration
cluster                                                                
0        26.177441    144.304714   86.120275  2598.414141     16.425589
1        14.528866    348.020619  161.804124  4143.969072     12.641237
2        43.700000     91.750000   49.000000  2133.750000     22.875000

Cluster Variances:
               mpg  displacement  horsepower         weight  acceleration
cluster                                                                  
0        41.303375   3511.485383  294.554450  299118.709664      4.875221
1         4.771033   2089.499570  674.075816  193847.051117      3.189948
2         0.300000     12.250000    4.000000   21672.916667      2.309167

Origin Category Means:
              mpg  displacement  horsepower       weight  acceleration
origin                                                                
1       20.083534    245.901606  119.0489

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cont.fillna(df_cont.mean(), inplace=True)


In [3]:

df_boston = pd.read_csv("boston.csv")

features = df_boston.drop(columns=['MEDV']) if 'MEDV' in df_boston.columns else df_boston
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

sil_scores = []
for k in range(2, 7):
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels)
    sil_scores.append(score)
    print(f"k={k}, Silhouette Score = {score:.4f}")

best_k = np.argmax(sil_scores) + 2
print(f"\nOptimal number of clusters: {best_k}")

kmeans_opt = KMeans(n_clusters=best_k, random_state=42)
labels_opt = kmeans_opt.fit_predict(X_scaled)
features['cluster'] = labels_opt

print("\nMean feature values for each cluster:")
print(features.groupby('cluster').mean())

print("\nKMeans cluster center coordinates:")
print(pd.DataFrame(kmeans_opt.cluster_centers_, columns=features.columns[:-1]))

k=2, Silhouette Score = 0.3482
k=3, Silhouette Score = 0.2389
k=4, Silhouette Score = 0.2154
k=5, Silhouette Score = 0.2213
k=6, Silhouette Score = 0.2645

Optimal number of clusters: 2

Mean feature values for each cluster:
         Unnamed: 0       crim         zn      indus      chas       nox  \
cluster                                                                    
0        192.703593   0.288011  17.215569   7.170659  0.068862  0.488754   
1        371.558140  10.071205   0.000000  18.838430  0.069767  0.682744   

               rm        age       dis       rad         tax    ptratio  \
cluster                                                                   
0        6.451398  57.000299  4.717151   4.45509  302.215569  17.789820   
1        5.960802  91.051163  2.004438  19.44186  614.116279  19.748256   

                  b      lstat       medv  
cluster                                    
0        384.764760   9.502545  25.697305  
1        302.125756  18.770930  16.38

In [4]:
wine = load_wine()
df_wine = pd.DataFrame(wine.data, columns=wine.feature_names)
X_wine = StandardScaler().fit_transform(df_wine)

kmeans_wine = KMeans(n_clusters=3, random_state=42)
wine_labels = kmeans_wine.fit_predict(X_wine)

true_labels = wine.target
homogeneity = homogeneity_score(true_labels, wine_labels)
completeness = completeness_score(true_labels, wine_labels)

print(f"\nProblem 3: Wine Dataset - Homogeneity = {homogeneity:.4f}, Completeness = {completeness:.4f}")

print("\nMetric Explanation:")
print("- Homogeneity indicates that each cluster contains only samples from the same class. The higher, the better.")
print("- Completeness indicates the degree to which all samples of the same class are assigned to the same cluster. The higher, the better.")


Problem 3: Wine Dataset - Homogeneity = 0.8788, Completeness = 0.8730

Metric Explanation:
- Homogeneity indicates that each cluster contains only samples from the same class. The higher, the better.
- Completeness indicates the degree to which all samples of the same class are assigned to the same cluster. The higher, the better.
