Problem 1

In [95]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [96]:
# Load the Auto MPG dataset into a Pandas dataframe.
# The file is whitespace-delimited, has no header row, and marks missing values with '?'.
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
columns_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']
# Raw string: '\s' is not a valid Python string escape, so a plain literal triggers an
# invalid-escape warning on recent Python versions; r'\s+' passes the same regex to pandas.
df_raw = pd.read_csv(data_url, sep=r'\s+', names=columns_names, na_values='?')
print(df_raw.head())

    mpg  cylinders  displacement  horsepower  weight  acceleration  \
0  18.0          8         307.0       130.0  3504.0          12.0   
1  15.0          8         350.0       165.0  3693.0          11.5   
2  18.0          8         318.0       150.0  3436.0          11.0   
3  16.0          8         304.0       150.0  3433.0          12.0   
4  17.0          8         302.0       140.0  3449.0          10.5   

   model_year  origin                   car_name  
0          70       1  chevrolet chevelle malibu  
1          70       1          buick skylark 320  
2          70       1         plymouth satellite  
3          70       1              amc rebel sst  
4          70       1                ford torino  


In [97]:
# Restrict the feature set to the continuous-valued columns
continuous_features = [
    'mpg',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
]
df_continuous = df_raw.loc[:, continuous_features]
print(df_continuous.head())

    mpg  displacement  horsepower  weight  acceleration
0  18.0         307.0       130.0  3504.0          12.0
1  15.0         350.0       165.0  3693.0          11.5
2  18.0         318.0       150.0  3436.0          11.0
3  16.0         304.0       150.0  3433.0          12.0
4  17.0         302.0       140.0  3449.0          10.5


In [98]:
# Fill every missing entry with the mean of its column
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(df_continuous)
df_imputed = pd.DataFrame(imputed_values, columns=continuous_features)
print(df_imputed, '\n')

# Report how many cells of the un-imputed frame were missing
missing_count = df_continuous.isnull().sum().sum()
print(f"{missing_count} missing values are imputed")

      mpg  displacement  horsepower  weight  acceleration
0    18.0         307.0       130.0  3504.0          12.0
1    15.0         350.0       165.0  3693.0          11.5
2    18.0         318.0       150.0  3436.0          11.0
3    16.0         304.0       150.0  3433.0          12.0
4    17.0         302.0       140.0  3449.0          10.5
..    ...           ...         ...     ...           ...
393  27.0         140.0        86.0  2790.0          15.6
394  44.0          97.0        52.0  2130.0          24.6
395  32.0         135.0        84.0  2295.0          11.6
396  28.0         120.0        79.0  2625.0          18.6
397  31.0         119.0        82.0  2720.0          19.4

[398 rows x 5 columns] 

6 missing values are imputed


In [99]:
# Standardize the imputed features, then run agglomerative (hierarchical) clustering
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_imputed)
df = pd.DataFrame(scaled_values, columns=continuous_features)

# Three clusters, average linkage, Euclidean distance
clustering = AgglomerativeClustering(
    n_clusters=3,
    linkage='average',
    metric='euclidean',
)
# The labels are computed from the feature-only frame before the column is attached
cluster_labels = clustering.fit_predict(df)
df['cluster'] = cluster_labels
print(df)

          mpg  displacement  horsepower    weight  acceleration  cluster
0   -0.706439      1.090604    0.669196  0.630870     -1.295498        1
1   -1.090751      1.503514    1.586599  0.854333     -1.477038        1
2   -0.706439      1.196232    1.193426  0.550470     -1.658577        1
3   -0.962647      1.061796    1.193426  0.546923     -1.295498        1
4   -0.834543      1.042591    0.931311  0.565841     -1.840117        1
..        ...           ...         ...       ...           ...      ...
393  0.446497     -0.513026   -0.484111 -0.213324      0.011586        0
394  2.624265     -0.925936   -1.375302 -0.993671      3.279296        2
395  1.087017     -0.561039   -0.536534 -0.798585     -1.440730        0
396  0.574601     -0.705077   -0.667591 -0.408411      1.100822        0
397  0.958913     -0.714680   -0.588957 -0.296088      1.391285        0

[398 rows x 6 columns]


In [103]:
# Per-group mean and variance of the scaled features: first grouped by the
# assigned cluster, then grouped by the 'origin' column taken as a class label
summary_funcs = ['mean', 'var']

cluster_stats = df.groupby('cluster')[continuous_features].agg(summary_funcs)
print("Mean and Variance values for each cluster:")
print(cluster_stats, '\n')

# Attach the origin column from the raw frame so it can be used as a grouping key
df['origin'] = df_raw['origin']
origin_stats = df.groupby('origin')[continuous_features].agg(summary_funcs)
print("Mean and Variance values for classes using origin as a label:")
print(origin_stats, '\n')

# Contingency table: rows are cluster assignments, columns are origin values
crosstab = pd.crosstab(df['cluster'], df['origin'])
print('Crosstab:')
print(crosstab)

Mean and Variance values for each cluster:
              mpg           displacement           horsepower            \
             mean       var         mean       var       mean       var   
cluster                                                                   
0        0.341124  0.677814    -0.471690  0.323792  -0.471242  0.202864   
1       -1.151105  0.078296     1.484507  0.192671   1.502830  0.463119   
2        2.585833  0.004923    -0.976350  0.001130  -1.453937  0.002748   

           weight           acceleration            
             mean       var         mean       var  
cluster                                             
0       -0.439845  0.418150     0.311340  0.642682  
1        1.387534  0.270986    -1.062679  0.420519  
2       -0.989238  0.030297     2.652985  0.304409   

Mean and Variance values for classes using origin as a label:
             mpg           displacement           horsepower            \
            mean       var         mean       var 

Problem 2

In [105]:
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [106]:
# Fetch the Boston dataset from OpenML and build a numeric Pandas dataframe
boston = fetch_openml(name='boston', version=1, parser='auto')
features = boston.feature_names
# Convert every column to a numeric dtype; values that cannot be parsed become NaN
df_raw = (
    pd.DataFrame(boston.data, columns=features)
    .apply(pd.to_numeric, errors='coerce')
)
print(df_raw.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  
0     15.3  396.90   4.98  
1     17.8  396.90   9.14  
2     17.8  392.83   4.03  
3     18.7  394.63   2.94  
4     18.7  396.90   5.33  


In [107]:
# Perform a K-Means analysis on scaled data with different numbers of clusters
scaler = StandardScaler()
# Scale only the feature columns, so that any extra column later attached to
# df_raw (e.g. a cluster label) can never be fed back in as a feature on re-run.
df_scaled = scaler.fit_transform(df_raw[features])

# Provide the Silhouette score to justify which value of k is optimal:
# one [k, score] pair is recorded for every candidate k in 2..6.
s_scores = []
for k in range(2, 7):
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(df_scaled)
    s_score = silhouette_score(df_scaled, labels)
    s_scores.append([k, s_score])
    print(f'k = {k}: silhouette score = {s_score:.4f}')

# Pick the pair with the highest score once and unpack it
# (on ties, max() keeps the first, i.e. the smallest k).
opt_k, opt_score = max(s_scores, key=lambda pair: pair[1])
print(f'\nThe optimal k is {opt_k}, its silhouette score is {opt_score:.4f}')

k = 2: sihouette score = 0.3601
k = 3: sihouette score = 0.2575
k = 4: sihouette score = 0.2898
k = 5: sihouette score = 0.2878
k = 6: sihouette score = 0.2625

The optimal k is 2, its silhouette score is 0.3601


In [108]:
# Calculate the mean values for all features, per cluster, in original (unscaled) units
opt_km = KMeans(n_clusters=opt_k, random_state=42, n_init=10)
# Work on a copy: a plain `df = df_raw` would alias the raw frame, so adding the
# 'cluster' column would silently modify df_raw as well.
df = df_raw.copy()
# fit_predict fits the model and returns the labels in one pass; no separate fit() is needed
df['cluster'] = opt_km.fit_predict(df_scaled)
cluster_means = df.groupby('cluster')[features].mean()
print("Means of each clusters:")
print(cluster_means)

Means of each clusters:
             CRIM         ZN      INDUS      CHAS       NOX        RM  \
cluster                                                                 
0        0.261172  17.477204   6.885046  0.069909  0.487011  6.455422   
1        9.844730   0.000000  19.039718  0.067797  0.680503  5.967181   

               AGE       DIS        RAD         TAX    PTRATIO           B  \
cluster                                                                      
0        56.339210  4.756868   4.471125  301.917933  17.837386  386.447872   
1        91.318079  2.007242  18.988701  605.858757  19.604520  301.331695   

             LSTAT  
cluster             
0         9.468298  
1        18.572768  


In [109]:
# Compare the per-cluster feature means with the K-Means centroids, after
# mapping the centroids from scaled space back to the original feature units
centroids_unscaled = scaler.inverse_transform(opt_km.cluster_centers_)
centroid_original = pd.DataFrame(centroids_unscaled, columns=features)
for cluster_id in range(opt_k):
    mean_row = cluster_means.loc[cluster_id]
    centroid_row = centroid_original.loc[cluster_id]
    print(f'\n cluster {cluster_id}:')
    print(mean_row - centroid_row)


 cluster 0:
CRIM       5.551115e-16
ZN        -3.552714e-15
INDUS     -2.664535e-15
CHAS       6.938894e-17
NOX       -5.551115e-17
RM         0.000000e+00
AGE       -7.105427e-15
DIS        0.000000e+00
RAD        2.664535e-15
TAX        0.000000e+00
PTRATIO    0.000000e+00
B          5.684342e-14
LSTAT      0.000000e+00
Name: 0, dtype: float64

 cluster 1:
CRIM      -1.776357e-15
ZN        -1.243450e-14
INDUS     -7.105427e-15
CHAS       5.551115e-17
NOX       -1.110223e-16
RM         0.000000e+00
AGE       -1.421085e-14
DIS        0.000000e+00
RAD       -7.105427e-15
TAX       -6.821210e-13
PTRATIO    0.000000e+00
B          0.000000e+00
LSTAT      0.000000e+00
Name: 1, dtype: float64


Problem 3

In [89]:
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import homogeneity_score, completeness_score

In [90]:
# Build a Pandas dataframe from the wine dataset and append the class label
wine = load_wine()
features = wine.feature_names
df_raw = pd.DataFrame(wine.data, columns=features).assign(true_label=wine.target)
print(df_raw.head())

   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  proline  true_label  
0      

In [91]:
# Perform a K-Means analysis on scaled data, with the number of clusters set to 3
scaler = StandardScaler()
# Only the feature columns are scaled; 'true_label' is excluded from clustering
df_scaled = scaler.fit_transform(df_raw[features])

km = KMeans(n_clusters=3, random_state=42, n_init=10)
# Copy instead of aliasing, so the 'cluster' column is not written into df_raw
df = df_raw.copy()
df['cluster'] = km.fit_predict(df_scaled)
print(df)

     alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0      14.23        1.71  2.43               15.6      127.0           2.80   
1      13.20        1.78  2.14               11.2      100.0           2.65   
2      13.16        2.36  2.67               18.6      101.0           2.80   
3      14.37        1.95  2.50               16.8      113.0           3.85   
4      13.24        2.59  2.87               21.0      118.0           2.80   
..       ...         ...   ...                ...        ...            ...   
173    13.71        5.65  2.45               20.5       95.0           1.68   
174    13.40        3.91  2.48               23.0      102.0           1.80   
175    13.27        4.28  2.26               20.0      120.0           1.59   
176    13.17        2.59  2.37               20.0      120.0           1.65   
177    14.13        4.10  2.74               24.5       96.0           2.05   

     flavanoids  nonflavanoid_phenols  proanthocyan

In [93]:
# Score the cluster assignments against the true labels with homogeneity and completeness
true_labels = df['true_label']
cluster_labels = df['cluster']
h_score = homogeneity_score(true_labels, cluster_labels)
c_score = completeness_score(true_labels, cluster_labels)
print(f'Homogeneity Score:{h_score:.4f}')
print(f'Completeness Score:{c_score:.4f}')

Homogeneity Score:0.8788
Completeness Score:0.8730
