In [None]:
import numpy as np

In [None]:
from sklearn.datasets import fetch_openml

# Download (or load from the local scikit-learn cache) the 784-feature MNIST dataset.
# as_frame=False asks for plain NumPy arrays: depending on the installed scikit-learn
# version the default may return pandas objects instead, and later cells treat each
# row as an array (e.g. image.reshape(28, 28)).
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])

In [None]:
# Pull the feature matrix and the label vector out of the dataset bunch
X = mnist["data"]
y = mnist["target"]

# Show the dimensions of both, features first
for dataset_part in (X, y):
    print(dataset_part.shape)

(70000, 784)
(70000,)


In [None]:
# MNIST ships pre-shuffled and pre-split: the first 60,000 instances are the
# training set, everything after that is the test set.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

In [None]:
# Improve accuracy by scaling inputs (as discussed in Chapter 2)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# The book stores the result in a separate X_train_scaled variable, which only has an
# effect if every later cell uses that name. Here X_train / X_test are overwritten
# instead, so all downstream cells automatically see the scaled data.
#
# The scaler is fitted on the training set only; the test set is transformed with the
# same training statistics. Without scaling X_test as well, the model would be trained
# on standardized features but evaluated on raw pixel values.
X_train = scaler.fit_transform(X_train.astype(np.float64))
X_test = scaler.transform(X_test.astype(np.float64))

In [None]:
# Randomized search for KNeighborsClassifier over the hyperparameters n_neighbors and
# weights (an exhaustive grid search takes too long)
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()
hyperparameters = { 'n_neighbors': list(range(1, 11)), 'weights': ['uniform', 'distance'] }
# According to the book, n_neighbors=4, weights="distance" achieves enough accuracy,
# so even if the code runs into issues it has the potential to pass 97%+
rnd_search = RandomizedSearchCV(knn_clf, hyperparameters, random_state=42, n_iter=5, n_jobs=6, cv=3, verbose=3)
rnd_search.fit(X_train, y_train)

# best_params_ and best_score_ are set by fit(). No explicit scorer is needed: with
# scoring=None the search falls back to the estimator's own score method.
# NOTE(review): an earlier "best_score does not exist" error was most likely caused by
# reading the attribute before fit() had completed or by a misspelled name - confirm.
# Both values are returned as the cell's final expression so that both are displayed
# (a bare expression in the middle of a cell is silently discarded).
rnd_search.best_params_, rnd_search.best_score_

In [None]:
# Evaluate the fitted randomized search on the held-out test set
from sklearn.metrics import accuracy_score

rnd_search_predictions = rnd_search.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=rnd_search_predictions)
print("Accuracy: %.4f" % (accuracy,))

In [None]:
# Shift and add data
from scipy.ndimage import shift


def shift_image(image, dy, dx):
    """Return a flattened copy of `image` shifted by (dy, dx) pixels.

    Parameters
    ----------
    image : array of 784 values, interpreted as a 28x28 picture.
    dy : vertical offset in pixels (negative = up, positive = down).
    dx : horizontal offset in pixels (negative = left, positive = right).

    Returns
    -------
    1-D array of 784 values: the shifted picture, flattened again so it has the
    same layout as the rows of X_train.
    """
    # reshape must be applied to the array returned by shift(), not to the offset list
    shifted = shift(image.reshape(28, 28), [dy, dx], cval=0, mode="constant")
    return shifted.reshape([-1])


# (dy, dx) offsets for: up, right, down, left
SHIFT_OFFSETS = ((-1, 0), (0, 1), (1, 0), (0, -1))

# Create copy to prevent messing with original training set
X_train_augmented = [image for image in X_train]
y_train_augmented = [label for label in y_train]

# NOTE(review): X_train was standardized in an earlier cell, so the cval=0 fill value
# is 0 in scaled space rather than a raw black pixel - confirm this is acceptable.
for image, label in zip(X_train, y_train):
    for dy, dx in SHIFT_OFFSETS:
        X_train_augmented.append(shift_image(image, dy, dx))
        # Every shifted copy keeps the label of the image it was derived from;
        # without this the feature and label lists end up with different lengths.
        y_train_augmented.append(label)

In [None]:
# Turn both augmented lists into NumPy arrays so they can be indexed with a
# permutation when shuffling
X_train_augmented, y_train_augmented = (
    np.array(X_train_augmented),
    np.array(y_train_augmented),
)

In [None]:
# Shuffle training set (use shuffle_idx to shuffle the data and labels the same way).
# A dedicated, seeded generator keeps the shuffle reproducible across kernel restarts
# (same seed as the randomized search above).
shuffle_idx = np.random.RandomState(42).permutation(len(X_train_augmented))
X_train_augmented = X_train_augmented[shuffle_idx]
y_train_augmented = y_train_augmented[shuffle_idx]

In [None]:
# Create new classifier using the best hyperparameters found by the randomized search
# (the search object in this notebook is rnd_search; no grid_search is ever defined)
knn_clf = KNeighborsClassifier(**rnd_search.best_params_)

In [None]:
# Fit the fresh classifier on the augmented (original + shifted) training set
knn_clf.fit(X=X_train_augmented, y=y_train_augmented)

In [None]:
# Score the classifier trained on the augmented data against the test set
knn_clf_predictions = knn_clf.predict(X=X_test)
accuracy_score(y_true=y_test, y_pred=knn_clf_predictions)

In [None]:
# Error analysis: (leaving these next two blocks in because if they work that's a plus)
# Neither helper is imported anywhere earlier in the notebook, so import them here.
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions for the training set, then a confusion matrix built with
# the true labels as the first argument and the predictions as the second.
y_predicted = cross_val_predict(knn_clf, X_train, y_train, cv=3)
confusion_mx = confusion_matrix(y_train, y_predicted)
print(confusion_mx)

In [None]:
# Visualization:
# pyplot is not imported anywhere earlier in the notebook, so import it here.
import matplotlib.pyplot as plt

# normalize the confusion matrix so we see error rates instead of absolute numbers
row_sums = confusion_mx.sum(axis=1, keepdims=True)
confusion_mx_normed = confusion_mx / row_sums

# get rid of the diagonal (correct answers) so we can focus on the errors
np.fill_diagonal(confusion_mx_normed, 0)

plt.matshow(confusion_mx_normed, cmap=plt.cm.gray)
plt.show()