In [28]:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from ucimlrepo import fetch_ucirepo
from scipy.stats import chi2_contingency

Load the UCI Auto MPG dataset (UCI repository ID 9) directly from the archive's raw data file

In [35]:
# Read the raw Auto MPG file from the UCI archive. Column names are supplied
# explicitly via `names=`, and fields are split on runs of whitespace.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                'acceleration', 'model_year', 'origin', 'car_name']
# sep=r'\s+' is the documented replacement for delim_whitespace=True, which is
# deprecated as of pandas 2.2 and raises a FutureWarning.
df = pd.read_csv(url, names=column_names, sep=r'\s+')

Select continuous data and create a copy for modification.

In [36]:
# Columns treated as continuous measurements for the clustering step.
continuous_features = [
    'mpg',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
]
# Work on an independent copy so edits to X do not write through to df.
X = df.loc[:, continuous_features].copy()

Handle missing values: replace the '?' placeholders in horsepower with NaN and fill them with the column mean (no outlier treatment is applied)

In [37]:
# '?' is treated as the missing-value placeholder in horsepower: convert it to
# NaN, cast the column to float, then impute every NaN with its column mean.
X['horsepower'] = X['horsepower'].replace('?', np.nan).astype(float)
X = X.fillna(X.mean())

# Same cleaning on the full frame so the per-origin statistics see numeric data.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the in-place form acts on an intermediate object, warns in pandas 2.x,
# and leaves df unchanged once Copy-on-Write is enabled.
df['horsepower'] = df['horsepower'].replace('?', np.nan).astype(float)
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].mean())

Data standardization

In [38]:
# Standardize each feature before distance-based clustering.
scaler = StandardScaler()
# Select the feature columns explicitly: a later cell adds a 'cluster' column to
# X, so scaling all of X would include the labels if this cell were re-run.
X_scaled = scaler.fit_transform(X[continuous_features])

Perform hierarchical clustering

In [39]:
# Agglomerative clustering into three groups: Euclidean distance, average linkage.
clustering = AgglomerativeClustering(
    n_clusters=3,
    metric='euclidean',
    linkage='average',
)
# Fit on the standardized features and read the per-row cluster assignment.
cluster_labels = clustering.fit(X_scaled).labels_

Add cluster labels to the DataFrame

In [40]:
# Attach the same label vector to both the feature frame and the full frame.
for frame in (X, df):
    frame['cluster'] = cluster_labels

Calculate mean and variance for each cluster

In [41]:
# Per-cluster mean and variance of every continuous feature.
features_by_cluster = X.groupby('cluster')[continuous_features]
cluster_stats = features_by_cluster.agg(['mean', 'var'])
print("\nCluster Statistics (Mean and Variance):", cluster_stats, sep="\n")


Cluster Statistics (Mean and Variance):
               mpg            displacement               horsepower  \
              mean        var         mean          var        mean   
cluster                                                               
0        26.177441  41.303375   144.304714  3511.485383   86.490964   
1        14.528866   4.771033   348.020619  2089.499570  161.804124   
2        43.700000   0.300000    91.750000    12.250000   49.000000   

                          weight                acceleration            
                var         mean            var         mean       var  
cluster                                                                 
0        295.270673  2598.414141  299118.709664    16.425589  4.875221  
1        674.075816  4143.969072  193847.051117    12.641237  3.189948  
2          4.000000  2133.750000   21672.916667    22.875000  2.309167  


Calculate mean and variance for each origin class

In [42]:
# Per-origin mean and variance of every continuous feature, for comparison
# with the per-cluster table.
features_by_origin = df.groupby('origin')[continuous_features]
origin_stats = features_by_origin.agg(['mean', 'var'])
print("\nOrigin Class Statistics (Mean and Variance):", origin_stats, sep="\n")


Origin Class Statistics (Mean and Variance):
              mpg            displacement               horsepower  \
             mean        var         mean          var        mean   
origin                                                               
1       20.083534  40.997026   245.901606  9702.612255  118.814769   
2       27.891429  45.211230   109.142857   509.950311   81.241983   
3       30.450633  37.088685   102.708861   535.465433   79.835443   

                          weight                acceleration            
                var         mean            var         mean       var  
origin                                                                  
1       1569.532304  3361.931727  631695.128385    15.033735  7.568615  
2        410.659789  2423.300000  240142.328986    16.787143  9.276209  
3        317.523856  2221.227848  102718.485881    16.172152  3.821779  


Analyze relationship between clusters and origin labels

In [43]:
# Count rows for every (cluster, origin) pair: clusters as rows, origins as columns.
crosstab = pd.crosstab(index=df['cluster'], columns=df['origin'])
print("\nContingency Table (Cluster vs Origin):", crosstab, sep="\n")


Contingency Table (Cluster vs Origin):
origin     1   2   3
cluster             
0        152  66  79
1         97   0   0
2          0   4   0


Calculate standardized residuals to assess significance

In [44]:
# Chi-square test of independence on the cluster-by-origin contingency table.
chi2, p, dof, expected = chi2_contingency(crosstab)
# Residual per cell: (observed - expected) / sqrt(expected).
deviation = crosstab - expected
standardized_residuals = deviation / np.sqrt(expected)
print("\nStandardized Residuals:", standardized_residuals, sep="\n")
print(f"\nChi-square p-value: {p:.4f}")


Standardized Residuals:
origin          1         2         3
cluster                              
0       -2.480441  1.904378  2.611050
1        4.661556 -4.130412 -4.387912
2       -1.581933  3.930188 -0.891050

Chi-square p-value: 0.0000
