In [86]:
import numpy as np
import pandas as pd
from scipy.ndimage import rotate, shift, zoom
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [87]:
def load_and_split_data(file_path='mnist_train.csv', train_ratio=0.7, val_ratio=0.15):
    """Read a labelled CSV and split it into train / validation / test arrays.

    The CSV must contain a ``label`` column; every other column is used as a
    feature and divided by 255.0.

    Args:
        file_path: Path of the CSV file to read.
        train_ratio: Fraction of all rows assigned to the training split.
        val_ratio: Fraction of all rows assigned to the validation split.
            Whatever is left (``1 - train_ratio - val_ratio``) becomes the
            test split.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    frame = pd.read_csv(file_path)

    labels = frame['label'].values
    features = frame.drop('label', axis=1).values / 255.0

    # First cut: peel off the training portion, keep the remainder for later.
    X_train, X_rest, y_train, y_rest = train_test_split(
        features,
        labels,
        train_size=train_ratio,
        random_state=42,
        shuffle=True,
    )

    # Second cut: the remainder is divided between validation and test, so the
    # test share has to be expressed relative to the remainder, not the total.
    test_ratio = 1 - (train_ratio + val_ratio)
    test_share_of_rest = test_ratio / (val_ratio + test_ratio)
    X_val, X_test, y_val, y_test = train_test_split(
        X_rest,
        y_rest,
        test_size=test_share_of_rest,
        random_state=42,
        shuffle=True,
    )

    return X_train, y_train, X_val, y_val, X_test, y_test

In [88]:
def augment_rotation(images):
    """Rotate each flattened 28x28 image by a random angle.

    Every row of ``images`` is reshaped to 28x28, rotated by an angle drawn
    from ``np.random.uniform(-15, 15)`` (output kept at 28x28, borders filled
    with the nearest edge value) and flattened again.

    Args:
        images: Iterable of arrays with 784 elements each.

    Returns:
        ``np.array`` built from the rotated, flattened images.
    """
    rotated_rows = []
    for flat_image in images:
        grid = flat_image.reshape(28, 28)
        angle = np.random.uniform(-15, 15)
        turned = rotate(grid, angle, reshape=False, mode='nearest')
        rotated_rows.append(turned.flatten())
    return np.array(rotated_rows)

def augment_shift(images):
    """Translate each flattened 28x28 image by a small random offset.

    Every row of ``images`` is reshaped to 28x28 and shifted by a pair of
    integer offsets (one per axis) drawn from ``np.random.randint(-2, 3)``,
    i.e. between -2 and 2 inclusive. Vacated pixels are filled with the
    nearest edge value, and the result is flattened again.

    Args:
        images: Iterable of arrays with 784 elements each.

    Returns:
        ``np.array`` built from the shifted, flattened images.
    """
    shifted_rows = []
    for flat_image in images:
        grid = flat_image.reshape(28, 28)
        offsets = np.random.randint(-2, 3, size=2)
        moved = shift(grid, shift=offsets, mode='nearest')
        shifted_rows.append(moved.flatten())
    return np.array(shifted_rows)

def augment_flip(images):
    """Mirror each flattened 28x28 image along a randomly chosen axis.

    Every row of ``images`` is reshaped to 28x28, reversed along axis 0 or
    axis 1 (picked with ``np.random.choice([0, 1])``) and flattened again.

    Args:
        images: Iterable of arrays with 784 elements each.

    Returns:
        ``np.array`` built from the flipped, flattened images.
    """
    flipped_rows = []
    for flat_image in images:
        grid = flat_image.reshape(28, 28)
        flip_axis = np.random.choice([0, 1])
        mirrored = np.flip(grid, axis=flip_axis)
        flipped_rows.append(mirrored.flatten())
    return np.array(flipped_rows)

In [89]:
def relu(x):
    """Rectified linear unit: element-wise ``max(0, x)``."""
    lower_bound = 0
    return np.maximum(lower_bound, x)

def relu_derivative(x):
    """Element-wise derivative of ReLU: 1 where ``x > 0``, otherwise 0.

    The result is an integer array with the same shape as ``x``.
    """
    positive_mask = np.asarray(x > 0)
    return positive_mask.astype(int)

def softmax(x):
    """Row-wise softmax of a 2-D array.

    Each row's maximum is subtracted before exponentiating so that large
    inputs do not overflow; this does not change the result.

    Args:
        x: 2-D array; normalisation is performed along axis 1.

    Returns:
        Array of the same shape whose rows each sum to 1.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    exponentials = np.exp(x - row_max)
    row_total = np.sum(exponentials, axis=1, keepdims=True)
    return exponentials / row_total

In [90]:
def initialize_parameters(input_size=784, hidden1_size=256, hidden2_size=128,
                          output_size=10, weight_scale=0.01):
    """Create the weights and biases of a three-layer fully connected network.

    Weights are drawn from a standard normal distribution (via the global
    ``np.random`` state) and multiplied by ``weight_scale``; biases start at
    zero. Called with no arguments it builds the same 784-256-128-10 network
    as before, consuming random numbers in the same order (W1, W2, W3).

    Args:
        input_size: Number of input features.
        hidden1_size: Width of the first hidden layer.
        hidden2_size: Width of the second hidden layer.
        output_size: Number of output units.
        weight_scale: Factor applied to the normally distributed weights.

    Returns:
        ``(W1, b1, W2, b2, W3, b3)`` where each ``W`` has shape
        ``(fan_in, fan_out)`` and each ``b`` has shape ``(1, fan_out)``.
    """
    layer_sizes = (input_size, hidden1_size, hidden2_size, output_size)

    params = []
    for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
        params.append(np.random.randn(fan_in, fan_out) * weight_scale)
        # Biases are kept 2-D, shape (1, fan_out), so they broadcast over a batch.
        params.append(np.zeros((1, fan_out)))

    W1, b1, W2, b2, W3, b3 = params
    return W1, b1, W2, b2, W3, b3

In [91]:
def forward_propagation(X, W1, b1, W2, b2, W3, b3):
    """Run a batch through the three-layer network.

    Two ReLU hidden layers are followed by a softmax output layer.

    Args:
        X: 2-D input batch, one sample per row.
        W1, b1: Weights and bias of the first hidden layer.
        W2, b2: Weights and bias of the second hidden layer.
        W3, b3: Weights and bias of the output layer.

    Returns:
        ``(Z1, A1, Z2, A2, Z3, A3)``: the pre-activation ``Z`` and activation
        ``A`` of each layer; ``A3`` holds the softmax outputs.
    """
    def affine(inputs, weights, bias):
        # Linear part of a dense layer: inputs . weights + bias.
        return np.dot(inputs, weights) + bias

    Z1 = affine(X, W1, b1)
    A1 = relu(Z1)

    Z2 = affine(A1, W2, b2)
    A2 = relu(Z2)

    Z3 = affine(A2, W3, b3)
    A3 = softmax(Z3)

    return Z1, A1, Z2, A2, Z3, A3

In [92]:
def backward_propagation(X, y, Z1, A1, Z2, A2, Z3, A3, W1, W2, W3):
    """Compute batch-averaged gradients for the three-layer network.

    The output-layer error is ``A3 - one_hot(y)``, which is then propagated
    back through the two ReLU hidden layers.

    The number of classes is taken from ``A3.shape[1]`` rather than being
    fixed at 10, so the function works for any output width.

    Args:
        X: Input batch; ``X.shape[0]`` is the batch size ``m``.
        y: Integer class indices, one per sample (used for fancy indexing).
        Z1, A1: Pre-activation and activation of the first hidden layer.
        Z2, A2: Pre-activation and activation of the second hidden layer.
        Z3: Output-layer pre-activation (unused; kept for the call signature).
        A3: Output-layer activations, shape ``(m, n_classes)``.
        W1: First-layer weights (unused; kept for the call signature).
        W2, W3: Weights of the second hidden layer and the output layer.

    Returns:
        ``(dW1, db1, dW2, db2, dW3, db3)``, each already divided by ``m``.
    """
    m = X.shape[0]
    n_classes = A3.shape[1]

    # One-hot encode the labels with as many columns as the network has outputs.
    y_onehot = np.zeros((m, n_classes))
    y_onehot[np.arange(m), y] = 1

    # Output layer.
    dZ3 = A3 - y_onehot
    dW3 = np.dot(A2.T, dZ3) / m
    db3 = np.sum(dZ3, axis=0, keepdims=True) / m

    # Second hidden layer: the error only flows where the ReLU was active.
    dA2 = np.dot(dZ3, W3.T)
    dZ2 = dA2 * relu_derivative(Z2)
    dW2 = np.dot(A1.T, dZ2) / m
    db2 = np.sum(dZ2, axis=0, keepdims=True) / m

    # First hidden layer.
    dA1 = np.dot(dZ2, W2.T)
    dZ1 = dA1 * relu_derivative(Z1)
    dW1 = np.dot(X.T, dZ1) / m
    db1 = np.sum(dZ1, axis=0, keepdims=True) / m

    return dW1, db1, dW2, db2, dW3, db3

In [93]:
def train(X, y, iter=50, batch_size=64, learning_rate=0.1):
    """Train the three-layer network with mini-batch gradient descent.

    Parameters are freshly initialised, then for each epoch the samples are
    reshuffled and processed in consecutive batches; after every batch each
    parameter is moved against its gradient by ``learning_rate``.

    Args:
        X: Training inputs, indexable by an integer permutation array.
        y: Training labels aligned with ``X``.
        iter: Number of epochs (full passes over the data).
        batch_size: Number of samples per batch; the last batch of an epoch
            may be smaller.
        learning_rate: Step size of each gradient-descent update.

    Returns:
        The trained parameters ``(W1, b1, W2, b2, W3, b3)``.
    """
    # Order: W1, b1, W2, b2, W3, b3. The arrays are updated in place below.
    params = list(initialize_parameters())

    n_samples = len(X)
    for _ in range(iter):
        order = np.random.permutation(n_samples)
        X_epoch, y_epoch = X[order], y[order]

        for start in range(0, n_samples, batch_size):
            # Slicing clamps at the end of the array, so the final batch is
            # simply shorter when n_samples is not a multiple of batch_size.
            stop = start + batch_size
            batch_X = X_epoch[start:stop]
            batch_y = y_epoch[start:stop]

            cache = forward_propagation(batch_X, *params)
            grads = backward_propagation(
                batch_X, batch_y, *cache, params[0], params[2], params[4]
            )

            for param, grad in zip(params, grads):
                param -= learning_rate * grad

    return tuple(params)

In [94]:
def predict(X, W1, b1, W2, b2, W3, b3):
    """Return the predicted class index for every row of ``X``.

    Runs a forward pass and picks, per sample, the position of the largest
    output-layer activation.
    """
    layer_outputs = forward_propagation(X, W1, b1, W2, b2, W3, b3)
    class_scores = layer_outputs[-1]  # A3, the softmax outputs
    return np.argmax(class_scores, axis=1)

In [95]:
def evaluate(X, y, W1, b1, W2, b2, W3, b3, split_name):
    """Print the classification accuracy of the network on one data split.

    Args:
        X: Inputs of the split.
        y: True labels of the split.
        W1, b1, W2, b2, W3, b3: Network parameters passed on to ``predict``.
        split_name: Label used in the printed header (e.g. ``"train"``).

    Returns:
        None; the accuracy is only printed, formatted to four decimals.
    """
    network = (W1, b1, W2, b2, W3, b3)
    acc = accuracy_score(y, predict(X, *network))

    print(f"\nMetrics for {split_name} split:")
    print(f"Accuracy: {acc:.4f}")

In [96]:
# Seed NumPy's global RNG so that the random augmentations, the weight
# initialisation and the per-epoch shuffling give the same result on every
# "Restart Kernel -> Run All". 42 matches the random_state used for the split.
np.random.seed(42)

X_train, y_train, X_val, y_val, X_test, y_test = load_and_split_data()

# Build three augmented copies of the training images. Only the training split
# is augmented; validation and test data are left untouched.
X_train_rotated = augment_rotation(X_train)
X_train_shifted = augment_shift(X_train)
# NOTE(review): flipping mirrors the digits while keeping their labels; a
# mirrored digit may no longer look like its class - confirm this is intended.
X_train_flipped = augment_flip(X_train)

# Stack the originals and the augmented copies; labels are repeated in the same
# order so that row i of every block keeps its label.
X_train_aug = np.concatenate((X_train, X_train_rotated, X_train_shifted, X_train_flipped))
y_train_aug = np.concatenate((y_train, y_train, y_train, y_train))


W1, b1, W2, b2, W3, b3 = train(X_train_aug, y_train_aug)

# The "train" accuracy is reported on the original (un-augmented) images.
evaluate(X_train, y_train, W1, b1, W2, b2, W3, b3, "train")
evaluate(X_val, y_val, W1, b1, W2, b2, W3, b3, "validation")
evaluate(X_test, y_test, W1, b1, W2, b2, W3, b3, "test")


Metrics for train split:
Accuracy: 1.0000

Metrics for validation split:
Accuracy: 0.9640

Metrics for test split:
Accuracy: 0.9513
