#### Write a function that can shift an MNIST image in any direction (left, right, up, or down) by one pixel. Then, for each image in the training set, create four shifted copies (one per direction) and add them to the training set. Finally, train your best model on this expanded training set and measure its accuracy on the test set. You should observe that your model performs even better now! This technique of artificially growing the training set is called data augmentation or training set expansion.

In [1]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from scipy.ndimage import shift

# Step 1: Load MNIST
# Fetch version 1 of the 'mnist_784' dataset from OpenML as plain arrays
# (as_frame=False), then split it into features and targets.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X = mnist.data
y = mnist.target.astype(np.uint8)  # targets are cast to uint8

# Step 2: Shift function
def shift_image(image, dx, dy):
    """Return a flattened copy of a 28x28 image translated by (dx, dy) pixels.

    Args:
        image: Array of 784 values; it is viewed as a 28x28 grid.
        dx: Offset along the column axis (positive moves content right).
        dy: Offset along the row axis (positive moves content down).

    Returns:
        A 1-D array of length 784. Positions that have no source pixel
        after the translation are filled with 0.
    """
    grid = image.reshape(28, 28)
    # scipy takes one offset per axis, in (row, column) order.
    moved = shift(grid, [dy, dx], mode="constant", cval=0)
    return moved.reshape(-1)

# Step 3: Expand the training set
# Hold out 10000 samples for testing (stratified on the labels) before
# augmenting, so that only training images get shifted copies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=10000,
    random_state=42,
    stratify=y,
)

# Start from the original images and add one shifted copy per direction.
X_augmented = [X_train]
y_augmented = [y_train]

# (dx, dy) offsets, in order: right, left, down, up.
for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    # Apply shift_image to every row (one flattened image per row).
    shifted_images = np.apply_along_axis(
        shift_image, 1, X_train.reshape(-1, 784), dx=dx, dy=dy
    )
    X_augmented += [shifted_images]
    y_augmented += [y_train]  # shifting does not change the label

# Stack the five image sets row-wise and join the matching label vectors.
X_train_aug = np.vstack(X_augmented)
y_train_aug = np.hstack(y_augmented)

print("✅ Augmented training set size:", X_train_aug.shape)

# Step 4: Scale the data
# The scaler is fitted on the augmented training data only; the same fitted
# scaler is then applied to the test set.
# NOTE(review): k-NN is distance-based, so per-pixel standardization changes
# which neighbours are closest. It may be worth comparing against unscaled
# pixels; this has not been verified here.
scaler = StandardScaler().fit(X_train_aug)
X_train_scaled = scaler.transform(X_train_aug)
X_test_scaled = scaler.transform(X_test)

# Step 5: Train the model
# 3 nearest neighbours, with votes weighted by distance.
knn_clf = KNeighborsClassifier(weights='distance', n_neighbors=3).fit(
    X_train_scaled, y_train_aug
)

# Step 6: Evaluate
y_pred = knn_clf.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\n🎯 Accuracy on test set after data augmentation: {accuracy:.4f}")


✅ Augmented training set size: (300000, 784)

🎯 Accuracy on test set after data augmentation: 0.9636


##### Almost reached our 97% accuracy target; that was so close.