In [8]:
from scipy.io import loadmat
mnist = loadmat("mnist-original.mat")
mnist_data = mnist["data"].T
mnist_label = mnist["label"][0]


In [9]:
from sklearn.preprocessing import StandardScaler
import numpy as np
#preprocessing the data.  We must first reshape the data to a 4D array(samples,width,height,number of colors), where each image is a different 2D matrix
#Normalizing the values of the data, could experiment with minmax scaler too.  To do this, we must flatten the data to a 2D array, then turn it back into a 4D array
#I do not need to look for null values because this data does not have any
mnist_data = mnist_data.reshape(-1, 28, 28, 1)
mnist_data_2d = mnist_data.reshape(-1, 28*28)
scaler = StandardScaler()
mnist_data_normalized = scaler.fit_transform(mnist_data_2d)
#After the 2D data has been normalized, return it to a 4D array
mnist_data_normalized = mnist_data_normalized.reshape(-1, 28, 28, 1)

In [10]:
from sklearn.model_selection import train_test_split
#Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(mnist_data_normalized, mnist_label, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(56000, 28, 28, 1) (56000,)
(14000, 28, 28, 1) (14000,)


Preprocessing is now complete.  KNN is now to be implemented.

In [5]:
#Using cross fold validation to find the best N value for KNN
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

X_train_flattened = X_train.reshape(X_train.shape[0], -1)
X_test_flattened = X_test.reshape(X_test.shape[0], -1)

neighbors = [1, 5, 7, 10, 15, 20]
cv_scores = []

for n in neighbors:
    knn = KNeighborsClassifier(n_neighbors=n)
    scores = cross_val_score(knn, X_train_flattened, y_train, cv=5, scoring='accuracy')
    cv_scores.append(scores.mean())

best_n = neighbors[cv_scores.index(max(cv_scores))]
print("Best n_neighbors:", best_n)

knn = KNeighborsClassifier(n_neighbors=best_n)
knn.fit(X_train_flattened, y_train)
y_pred = knn.predict(X_test_flattened)
accuracy = accuracy_score(y_test, y_pred)
print("KNN accuracy:", accuracy)
f1 = f1_score(y_test, y_pred, average='macro')
print("Macro F1 score:", f1)
f1 = f1_score(y_test, y_pred, average='micro')
print("Micro F1 score:", f1)

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, mean_absolute_error

#for KNN to work, x_train must be a 2D array, so we must flatten it
#in the reshape, the -1 means that it will be a 2D array with the number of rows being the same as the original array, and the number of columns being 28*28
X_train_flattened = X_train.reshape(X_train.shape[0], -1)
X_test_flattened = X_test.reshape(X_test.shape[0], -1)

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train_flattened, y_train)
y_pred = knn.predict(X_test_flattened)
accuracy = accuracy_score(y_test, y_pred)
print("KNN accuracy:", accuracy)
f1 = f1_score(y_test, y_pred, average='macro')
print("Macro F1 score:", f1)
f1 = f1_score(y_test, y_pred, average='micro')
print("Micro F1 score:", f1)
precision_macro = precision_score(y_test, y_pred, average='macro')
print("Macro precision score:", precision_macro)
precision_micro = precision_score(y_test, y_pred, average='micro')
print("Micro precision score:", precision_micro)
recall_macro = recall_score(y_test, y_pred, average='macro')
print("Macro recall score:", recall_macro)
recall_micro = recall_score(y_test, y_pred, average='micro')
print("Micro recall score:", recall_micro)
mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)

KNN accuracy: 0.9459285714285715
Macro F1 score: 0.9452349162519578
Micro F1 score: 0.9459285714285716
Macro precision score: 0.9455855017929776
Micro precision score: 0.9459285714285715
Macro recall score: 0.9450852528902273
Micro recall score: 0.9459285714285715
MAE: 0.1935


In [16]:
from keras.preprocessing.image import ImageDataGenerator
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
#print(X_train.shape, y_train.shape)
additionalImages=[10000, 100000, 1000000]
for i in additionalImages:
    datagen = ImageDataGenerator(
            rotation_range=25,  
            zoom_range = 0.20,  
            )
    datagen.fit(X_train)

    gen_augmented = datagen.flow(X_train, y_train, batch_size=64)
    augmented_images = []
    augmented_labels = []
    for i in range(i // 64):
        batch_augmented_images, batch_augmented_labels = next(gen_augmented)
        augmented_images.append(batch_augmented_images)
        augmented_labels.append(batch_augmented_labels)

    augmented_images = np.concatenate(augmented_images, axis=0)
    augmented_labels = np.concatenate(augmented_labels, axis=0)
    X_train_augmented = np.concatenate([X_train, augmented_images], axis=0)
    y_train_augmented = np.concatenate([y_train, augmented_labels], axis=0)
    #print(X_train_augmented.shape, y_train_augmented.shape)
    #print(augmented_images.shape, augmented_labels.shape)
    X_train_flattened = X_train_augmented.reshape(X_train_augmented.shape[0], -1)
    X_test_flattened = X_test.reshape(X_test.shape[0], -1)
    #cross fold validation showed us 1 neighbor is the most accurate
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(X_train_flattened, y_train_augmented)
    y_pred = knn.predict(X_test_flattened)
    accuracy = accuracy_score(y_test, y_pred)
    print(i)
    print("KNN accuracy:", accuracy)
    f1 = f1_score(y_test, y_pred, average='macro')
    print("Macro F1 score:", f1)
    f1 = f1_score(y_test, y_pred, average='micro')
    print("Micro F1 score:", f1)
    precision_macro = precision_score(y_test, y_pred, average='macro')
    print("Macro precision score:", precision_macro)
    precision_micro = precision_score(y_test, y_pred, average='micro')
    print("Micro precision score:", precision_micro)
    recall_macro = recall_score(y_test, y_pred, average='macro')
    print("Macro recall score:", recall_macro)
    recall_micro = recall_score(y_test, y_pred, average='micro')
    print("Micro recall score:", recall_micro)
    mae = mean_absolute_error(y_test, y_pred)
    print("MAE:", mae)

155
KNN accuracy: 0.9466428571428571
Macro F1 score: 0.945982654965926
Micro F1 score: 0.9466428571428571
Macro precision score: 0.9464147738366508
Micro precision score: 0.9466428571428571
Macro recall score: 0.945801091575348
Micro recall score: 0.9466428571428571
MAE: 0.192
1561
KNN accuracy: 0.9499285714285715
Macro F1 score: 0.9492336547602637
Micro F1 score: 0.9499285714285715
Macro precision score: 0.9496376144304435
Micro precision score: 0.9499285714285715
Macro recall score: 0.9490259279897023
Micro recall score: 0.9499285714285715
MAE: 0.1792857142857143
15624
KNN accuracy: 0.9611428571428572
Macro F1 score: 0.960740376894659
Micro F1 score: 0.9611428571428572
Macro precision score: 0.9609978076976138
Micro precision score: 0.9611428571428572
Macro recall score: 0.960587378380613
Micro recall score: 0.9611428571428572
MAE: 0.13285714285714287
