In [1]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler


In [2]:
# Fetch the MNIST digits from OpenML (dataset 'mnist_784', version 1).
mnist = fetch_openml(name='mnist_784', version=1)

# Features: cast to float32 and divide by 255 so pixel values land in [0, 1].
X = mnist['data'].astype(np.float32) / 255.0
# Targets: convert the label column to plain integers.
y = mnist['target'].astype(int)

# Standardize every feature column; the fitted scaler is kept so the same
# transformation can be re-applied later if needed.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [3]:
# Reduce dimensions with PCA: project the standardized pixels onto the
# first 50 principal components.
# random_state is pinned because, for an input of this size, PCA may select
# the randomized SVD solver (depending on the scikit-learn version); without
# a seed the projection -- and every downstream clustering result -- would
# differ slightly between runs.
pca = PCA(n_components=50, random_state=42)
X_pca = pca.fit_transform(X_scaled)

print(f"Shape after PCA reduction: {X_pca.shape}")


Shape after PCA reduction: (70000, 50)


In [None]:
# Ward linkage WITHOUT a connectivity constraint builds the full condensed
# pairwise distance matrix: n*(n-1)/2 float64 values, i.e. roughly 20 GB for
# the 70,000 samples here. That does not fit in memory on typical hardware.
# Restricting candidate merges to a sparse k-nearest-neighbour graph keeps
# memory manageable while still assigning a cluster to every sample.
# NOTE: this is "structured" Ward clustering; results can differ from the
# unconstrained variant. n_neighbors=10 is a tunable trade-off between
# fidelity to unconstrained Ward (larger) and speed/memory (smaller).
connectivity = kneighbors_graph(
    X_pca,
    n_neighbors=10,
    include_self=False,
    n_jobs=-1,
)

# Agglomerative Clustering model with 10 clusters (one per digit class)
agg_clust = AgglomerativeClustering(
    n_clusters=10,
    linkage='ward',
    connectivity=connectivity,
)

# Fit and predict cluster labels (one label per row of X_pca)
cluster_labels = agg_clust.fit_predict(X_pca)

# Print the cluster ids and how many samples fell into each one
print(f"Cluster labels: {np.unique(cluster_labels)}")
print(f"Cluster sizes: {np.bincount(cluster_labels, minlength=10)}")


In [None]:
# Cross-tabulate true digit labels (rows) against assigned cluster ids
# (columns). Cluster ids are arbitrary, so the diagonal carries no special
# meaning; look for one dominant cell per row/column instead.
conf_mat = confusion_matrix(y, cluster_labels)

# Draw the table as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='viridis', ax=ax)
ax.set_title('Confusion Matrix for Agglomerative Clustering on MNIST')
ax.set_xlabel('Cluster Labels')
ax.set_ylabel('True Labels')
plt.show()


In [None]:
# Reduce the standardized data to 2D with PCA, for visualization only.
# random_state is pinned so the projection is reproducible if PCA selects
# the randomized SVD solver for an input of this size.
pca_2d = PCA(n_components=2, random_state=42)
X_2d = pca_2d.fit_transform(X_scaled)

# Plot every sample in the 2D projection, coloured by its cluster id.
# An explicit figure/axes pair keeps the plot independent of pyplot's
# implicit "current figure" state.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    x=X_2d[:, 0],
    y=X_2d[:, 1],
    hue=cluster_labels,
    palette='tab10',
    s=10,
    ax=ax,
)
ax.set_title('Agglomerative Clustering on MNIST (2D PCA Projection)')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
plt.show()
