In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Seed for the notebook's stochastic steps; it is passed as `random_state`
# to the random forest so repeated runs build the same trees.
SEED = 51

In [None]:
# Download (or load from the local scikit-learn cache) the 70,000-image MNIST set.
# `as_frame=True` is requested explicitly because the following cells index the
# data with pandas accessors (`.iloc`); relying on the default would hand back
# plain NumPy arrays on scikit-learn releases where the default is not a DataFrame.
mnist = fetch_openml('mnist_784', version=1, as_frame=True)

In [None]:
# Pixel table and label column from the fetched bunch; labels are cast to
# unsigned 8-bit integers.
X = mnist.data
y = mnist.target.astype(np.uint8)

# The first 60,000 rows form the training set, the remaining rows the test set.
X_train, y_train = X[:60_000], y[:60_000]
X_test, y_test = X[60_000:], y[60_000:]

In [None]:
# Show the first sample as a 28x28 image, titled with its label.
example_image = X.iloc[0].values.reshape(28, 28)
example_label = y[0]

plt.imshow(example_image, cmap='binary')
plt.title(f'Example of digit {example_label}')
plt.axis('off')
plt.show()

In [None]:
# Random forest baseline: seeded for repeatable results, n_jobs=-1 to use all cores.
# The fit call stays as the cell's last expression so the fitted estimator is displayed.
model = RandomForestClassifier(n_jobs=-1, random_state=SEED)
model.fit(X=X_train, y=y_train)

In [None]:
# Out-of-fold predictions for every training sample using 3-fold cross-validation:
# each sample is predicted by a clone of the model that was not fitted on it.
y_train_predictions = cross_val_predict(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=3,
)

In [None]:
# One score per fold (3 folds) on the unscaled training data, using the
# estimator's default scorer. Left as the last expression so the array is displayed.
cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=3,
)

In [None]:
# Scaled copy of the full training matrix. Retained in case other cells refer to
# `X_train_scaled`; it is deliberately NOT used for the cross-validation below.
standard_scaler = StandardScaler()
X_train_scaled = standard_scaler.fit_transform(X_train)

# Cross-validate scaler + model as a single pipeline. cross_val_score clones the
# pipeline for every fold, so the scaler's mean/variance are estimated from that
# fold's training part only. Scaling all of X_train up front (as before) lets the
# held-out part of each fold influence the preprocessing, i.e. leaks validation
# data into training and makes the scores slightly optimistic.
scaled_model = make_pipeline(StandardScaler(), model)
cross_val_score(scaled_model, X_train, y_train, cv=3)

In [None]:
# Cross-tabulate the true training labels against the out-of-fold predictions,
# then display the raw counts as a colour-coded matrix.
train_confusion_matrix = confusion_matrix(
    y_true=y_train,
    y_pred=y_train_predictions,
)
plt.matshow(train_confusion_matrix)
plt.show()

In [None]:
# Row totals of the confusion matrix, kept as a column vector so the division
# below broadcasts row by row.
train_class_counts = np.sum(train_confusion_matrix, axis=1, keepdims=True)

# Turn each row into fractions of its row total.
normalised_train_confusion_matrix = np.divide(train_confusion_matrix, train_class_counts)

# Zero the diagonal so only the off-diagonal (misclassified) cells remain visible.
np.fill_diagonal(normalised_train_confusion_matrix, val=0)

plt.matshow(normalised_train_confusion_matrix)
plt.show()

# Denoising numbers

In [None]:
# Dedicated generator seeded from SEED so the noisy images are identical on every
# Restart & Run All (the global, unseeded np.random state was used before).
noise_rng = np.random.default_rng(SEED)

# Add uniform integer noise in [0, 255) to every pixel, then clip back into the
# valid 0-255 intensity range.
# The previous `X + noise % 255` applied `%` to the noise only (operator
# precedence), which was a no-op and let values grow up to 509. Clipping is used
# rather than wrapping: wrapping modulo the pixel range would make the noisy
# pixels uniformly distributed regardless of the original image, destroying the
# signal the denoiser is supposed to recover.
X_train_noise = (X_train + noise_rng.integers(0, 255, X_train.shape)).clip(0, 255)
X_test_noise = (X_test + noise_rng.integers(0, 255, X_test.shape)).clip(0, 255)

# Targets of the denoising task are the clean images themselves.
y_train_noise = X_train.copy()
y_test_noise = X_test.copy()

In [None]:
# Show the first noisy training sample as a 28x28 image.
noisy_example_image = X_train_noise.iloc[0].values.reshape(28, 28)

plt.imshow(noisy_example_image, cmap='binary')
plt.title('Example of noisy digit')
plt.axis('off')
plt.show()

In [None]:
# Rebind `model` to a k-nearest-neighbours estimator fitted to map noisy pixel
# rows (inputs) to the clean pixel rows (targets), i.e. one output per pixel.
# The fit call stays as the cell's last expression so the estimator is displayed.
model = KNeighborsClassifier(n_jobs=-1)
model.fit(X=X_train_noise, y=y_train_noise)

In [None]:
# Take the first noisy sample of each split as a one-row DataFrame. predict()
# then receives the same column names the model was fitted with; passing a plain
# list of rows drops them and triggers scikit-learn's
# "X does not have valid feature names" warning.
first_noisy_train = X_train_noise.iloc[[0]]
first_noisy_test = X_test_noise.iloc[[0]]

# Training sample: the model was fitted on this row.
noisy_digit_train = first_noisy_train.values.reshape(28, 28)
denoised_digit_train = model.predict(first_noisy_train).reshape(28, 28)
true_digit_train = y_train_noise.iloc[0].values.reshape(28, 28)

# Test sample: not part of the data passed to fit().
noisy_digit_test = first_noisy_test.values.reshape(28, 28)
denoised_digit_test = model.predict(first_noisy_test).reshape(28, 28)
true_digit_test = y_test_noise.iloc[0].values.reshape(28, 28)

# Panels in row-major order: one row per split, columns are
# noisy input / model output / clean target.
panels = [
    ('Train: noisy input', noisy_digit_train),
    ('Train: denoised', denoised_digit_train),
    ('Train: original', true_digit_train),
    ('Test: noisy input', noisy_digit_test),
    ('Test: denoised', denoised_digit_test),
    ('Test: original', true_digit_test),
]

fig, axs = plt.subplots(2, 3, figsize=(15, 10))
for ax, (panel_title, digit_image) in zip(axs.ravel(), panels):
    ax.imshow(digit_image, cmap='binary')
    ax.set_title(panel_title)
    ax.axis('off')  # pixel coordinates carry no information here
plt.show()