In [26]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import fetch_openml
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Load the MNIST dataset
mnist = fetch_openml('mnist_784', version=1)
X, y = mnist.data, mnist.target
le = LabelEncoder()
y = le.fit_transform(y)
# Display basic information about the dataset
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

Shape of X: (70000, 784)
Shape of y: (70000,)


In [27]:
from sklearn.model_selection import train_test_split
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [42]:
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(X_train_scaled)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled, y_train)

In [43]:
y_pred_knn = knn.predict(X_test_scaled)
y_pred_kmeans = kmeans.predict(X_test_scaled)

In [None]:

labels = np.zeros_like(y_pred_kmeans)
for i in range(kmeans.n_clusters):
    mask = (y_pred_kmeans == i)
    true_labels = y_test[mask]

    if len(true_labels) == 0:
        continue

    values, counts = np.unique(true_labels, return_counts=True)
    most_common = values[np.argmax(counts)]
    labels[mask] = most_common
    

[4 7 4 ... 4 7 7]
[6 6 6 ... 6 6 6]
[3 1 1 ... 1 2 1]
[8 8 5 ... 8 3 5]
[8 9 9 4 4 9 9 7 9 9 4 9 4 4 9 9 5 4 9 4 4 4 8 4 4 7 4 3 9 9 9 7 4 4 4 9 3
 4 4 9 9 9 4 9 9 5 9 4 9 4 4 9 9 8 9 7 9 4 7 4 4 9 5 2 4 4 7 5 9 9 4 9 9 4
 9 9 9 4 6 7 4 9 8 7 4 2 4 4 4 9 9 4 9 9 5 9 9 5 9 4 9 4 8 9 7 4 8 9 7 8 8
 4 9 9 4 8 4 9 9 4 9 3 9 4 5 9 9 7 2 4 9 9 8 4 9 8 4 4 4 4 4 5 4 4 4 9 9 9
 9 9 5 4 7 3 9 9 4 4 5 2 9 0 9 4 5 4 4 5 4 9 9 8 9 4 4 4 2 4 4 4 4 9 3 4 4
 4 4 4 4 7 4 8 9 8 5 9 5 9 5 4 4 7 7 9 4 4 9 9 9 4 3 9 9 4 4 9 5 9 4 9 9 4
 9 9 4 5 9 9 9 8 5 9 4 7 4 9 5 4 4 5 6 9 9 4 9 4 4 4 4 4 9 4 9 4 4 9 4 9 9
 9 8 4 4 8 9 9 9 9 9 4 9 9 9 9 4 4 7 4 8 7 4 4 8 8 3 3 9 9 7 1 9 4 2 4 4 4
 5 4 4 5 4 7 4 9 4 9 4 8 8 4 8 2 9 9 9 4 7 4 4 7 4 4 7 9 9 9 8 4 9 4 8 4 8
 4 2 9 9 7 9 9 9 9 4 5 4 4 4 8 9 3 4 9 7 7 5 7 4 8 4 9 9 5 9 9 9 9 4 5 9 4
 4 9 4 4 4 9 2 3 9 5 4 4 9 4 8 4 4 9 9 5 9 4 9 8 9 9 9 4 8 3 4 8 9 8 3 9 3
 7 9 4 3 4 4 4 9 4 9 7 9 9 9 4 4 7 9 9 4 7 9 5 4 7 9 4 4 4 5 5 4 5 9 4 4 4
 4 9 4 5 9 4 8 7 9 3 9 9 4 7

In [45]:
print("KNN Classification Report:")
print(classification_report(y_test, y_pred_knn))

print("KMeans Classification Report:")
print(classification_report(y_test, labels, target_names=le.classes_))

KNN Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1343
           1       0.95      0.99      0.97      1600
           2       0.95      0.94      0.95      1380
           3       0.93      0.95      0.94      1433
           4       0.94      0.94      0.94      1295
           5       0.95      0.94      0.94      1273
           6       0.97      0.97      0.97      1396
           7       0.94      0.93      0.93      1503
           8       0.97      0.90      0.93      1357
           9       0.91      0.92      0.91      1420

    accuracy                           0.95     14000
   macro avg       0.95      0.95      0.95     14000
weighted avg       0.95      0.95      0.95     14000

KMeans Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.54      0.67      1343
           1       0.58      0.97      0.73      1600
           2       0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
