In [52]:
import numpy as np
from tensorflow.keras.datasets import mnist

# Load MNIST Dataset

In [53]:
# Fetch the MNIST train/test split through the Keras dataset helper.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

print("Original training data shape:", X_train.shape)
print("Original test data shape:", X_test.shape)

# Flatten every image into a single column (features x examples) and divide
# the pixel values by 255.0, one expression per split.
X_train = X_train.reshape(len(X_train), -1).T / 255.0  # shape: (784, 60000)
X_test = X_test.reshape(len(X_test), -1).T / 255.0     # shape: (784, 10000)

def one_hot_encode(y, num_classes=10):
    """Return a one-hot matrix of shape (num_classes, n_labels).

    Args:
        y: Integer class labels. Any array-like is accepted (list, 1-D array,
            or a column/row vector); it is flattened before encoding so that
            a 2-D label array cannot trigger index broadcasting.
        num_classes: Number of rows (classes) in the result.

    Returns:
        Float array where column ``j`` has a 1 in row ``y[j]`` and 0 elsewhere.

    Raises:
        IndexError: If a label is not a valid integer row index for
            ``num_classes`` rows (raised by NumPy indexing).
    """
    labels = np.asarray(y).ravel()
    one_hot = np.zeros((num_classes, labels.size))
    # An empty Python list becomes a float array, which cannot be used as an
    # index; with nothing to mark, the all-zero (num_classes, 0) result is final.
    if labels.size:
        one_hot[labels, np.arange(labels.size)] = 1
    return one_hot

# One-hot encode both label vectors, then report the processed training shapes.
Y_train, Y_test = one_hot_encode(y_train), one_hot_encode(y_test)

print("Processed training data shape:", X_train.shape)
print("Processed training labels shape:", Y_train.shape)

Original training data shape: (60000, 28, 28)
Original test data shape: (10000, 28, 28)
Processed training data shape: (784, 60000)
Processed training labels shape: (10, 60000)


# Network Architecture & Parameter Initialization

In [54]:
def layer_sizes(input_size, hidden1_size, hidden2_size, output_size):
    """Collect the four layer widths into a list ordered from input to output."""
    dims = (input_size, hidden1_size, hidden2_size, output_size)
    return list(dims)

# 784 inputs -> 128 -> 64 hidden units -> 10 outputs.
layers = layer_sizes(input_size=784, hidden1_size=128, hidden2_size=64, output_size=10)

def initialize_parameters(layer_dims, seed=42):
    """Create the weight matrices and bias vectors for a dense network.

    Weights are standard-normal draws scaled by ``sqrt(2 / fan_in)`` (He-style
    scaling); biases start at zero.

    Args:
        layer_dims: Layer widths ordered from input to output.
        seed: Value passed to ``np.random.seed`` before drawing. The default
            (42) reproduces the previously hard-coded behaviour. Pass ``None``
            to leave the global NumPy random state untouched.

    Returns:
        Dict with ``W{l}`` of shape ``(layer_dims[l], layer_dims[l-1])`` and
        ``b{l}`` of shape ``(layer_dims[l], 1)`` for ``l = 1 .. len(layer_dims)-1``.
    """
    # The global generator is seeded (rather than a private one) so callers
    # that continue drawing from np.random afterwards see the same stream as
    # before this parameter existed.
    if seed is not None:
        np.random.seed(seed)

    parameters = {}
    for l in range(1, len(layer_dims)):
        fan_in, fan_out = layer_dims[l - 1], layer_dims[l]
        parameters[f"W{l}"] = np.random.randn(fan_out, fan_in) * np.sqrt(2. / fan_in)
        parameters[f"b{l}"] = np.zeros((fan_out, 1))

    return parameters

parameters = initialize_parameters(layers)

# List every weight/bias array together with its shape.
for name in parameters:
    print(f"{name}.shape = {parameters[name].shape}")

W1.shape = (128, 784)
b1.shape = (128, 1)
W2.shape = (64, 128)
b2.shape = (64, 1)
W3.shape = (10, 64)
b3.shape = (10, 1)


# Forward Propagation

In [55]:
import numpy as np

def relu(Z):
    """Element-wise rectifier: the larger of 0 and each entry of ``Z``."""
    floor = 0
    activated = np.maximum(floor, Z)
    return activated

def relu_derivative(Z):
    """Return 1.0 where ``Z`` is strictly positive and 0.0 elsewhere."""
    is_positive = Z > 0
    return is_positive.astype(float)

def softmax(Z):
    """Column-wise softmax.

    Each column's maximum is subtracted before exponentiating so that large
    inputs do not overflow; the result's columns each sum to 1.
    """
    column_peak = np.max(Z, axis=0, keepdims=True)
    exponentials = np.exp(Z - column_peak)
    column_total = np.sum(exponentials, axis=0, keepdims=True)
    return exponentials / column_total

def forward_propagation(X, parameters):
    """Run one forward pass through the network.

    Hidden layers (1 .. L-1) use ReLU; the last layer (L) uses softmax, where
    ``L`` is half the number of entries in ``parameters``.

    Args:
        X: Input matrix with one example per column.
        parameters: Dict holding ``W{l}`` and ``b{l}`` for every layer.

    Returns:
        Tuple ``(AL, caches)``: the softmax output and a dict containing
        ``A0`` (the input) plus ``Z{l}`` / ``A{l}`` for every layer.
    """
    n_layers = len(parameters) // 2
    caches = {"A0": X}

    def affine(idx, inputs):
        # Pre-activation of layer `idx`: W @ inputs + b.
        return np.dot(parameters[f"W{idx}"], inputs) + parameters[f"b{idx}"]

    activation = X
    for idx in range(1, n_layers):
        pre_activation = affine(idx, activation)
        activation = relu(pre_activation)
        caches[f"Z{idx}"] = pre_activation
        caches[f"A{idx}"] = activation

    # The output layer is handled outside the loop because it uses softmax.
    logits = affine(n_layers, activation)
    probabilities = softmax(logits)
    caches[f"Z{n_layers}"] = logits
    caches[f"A{n_layers}"] = probabilities

    return probabilities, caches

# Loss Function

In [56]:
def compute_loss(AL, Y):
    """Mean cross-entropy between predictions ``AL`` and one-hot targets ``Y``.

    A small constant (1e-12) is added inside the logarithm so that a predicted
    probability of exactly 0 does not produce ``-inf``. The sum is averaged
    over the number of columns of ``Y``.
    """
    eps = 1e-12
    n_examples = Y.shape[1]

    log_probs = np.log(AL + eps)
    summed = np.sum(Y * log_probs)

    return -summed / n_examples

# Backward Propagation

In [57]:
def backward_propagation(X, Y, parameters, caches):
    """Compute gradients of the mean loss for every weight and bias.

    The output-layer error is ``AL - Y`` (softmax output minus one-hot
    targets); hidden-layer errors are obtained by propagating it backwards
    through the transposed weights and the ReLU derivative.

    Args:
        X: Input matrix; only its column count (number of examples) is used.
        Y: One-hot targets with the same shape as the network output.
        parameters: Dict holding ``W{l}`` / ``b{l}`` for every layer.
        caches: Dict produced by the forward pass (``A0``, ``Z{l}``, ``A{l}``).

    Returns:
        Dict with ``dW{l}`` and ``db{l}`` for every layer, averaged over the
        examples.
    """
    n_layers = len(parameters) // 2
    scale = 1 / X.shape[1]
    grads = {}

    def record(idx, delta):
        # Store the averaged weight/bias gradients of layer `idx`.
        layer_input = caches[f"A{idx - 1}"]
        grads[f"dW{idx}"] = scale * np.dot(delta, layer_input.T)
        grads[f"db{idx}"] = scale * np.sum(delta, axis=1, keepdims=True)

    # Output layer.
    delta = caches[f"A{n_layers}"] - Y
    record(n_layers, delta)
    upstream = np.dot(parameters[f"W{n_layers}"].T, delta)

    # Hidden layers, from the last one down to the first.
    for idx in range(n_layers - 1, 0, -1):
        delta = upstream * relu_derivative(caches[f"Z{idx}"])
        record(idx, delta)

        # Nothing sits below layer 1, so its input gradient is never needed.
        if idx > 1:
            upstream = np.dot(parameters[f"W{idx}"].T, delta)

    return grads

# Parameter Update

In [58]:
def update_parameters(parameters, grads, learning_rate):
    """Apply one gradient-descent step to every weight and bias.

    The arrays stored in ``parameters`` are modified in place, and the same
    dict is returned.
    """
    n_layers = len(parameters) // 2

    for idx in range(1, n_layers + 1):
        for kind in ("W", "b"):
            parameters[f"{kind}{idx}"] -= learning_rate * grads[f"d{kind}{idx}"]

    return parameters

# Training Loop

In [59]:
import numpy as np

def predict(X, parameters):
    """Return the index of the highest-scoring class for every column of ``X``."""
    probabilities, _caches = forward_propagation(X, parameters)
    return np.argmax(probabilities, axis=0)

def compute_accuracy(X, Y, parameters):
    """Fraction of columns of ``X`` whose predicted class matches one-hot ``Y``."""
    expected = np.argmax(Y, axis=0)
    is_correct = predict(X, parameters) == expected
    return np.mean(is_correct)

In [60]:
def train_model(X_train, Y_train, X_test, Y_test, layer_dims, learning_rate=0.01, epochs=10, batch_size=64, print_every=1):
    """Train the network with mini-batch gradient descent.

    Args:
        X_train: Training inputs, one example per column.
        Y_train: One-hot training targets, one example per column.
        X_test: Test inputs, one example per column.
        Y_test: One-hot test targets, one example per column.
        layer_dims: Layer widths passed to ``initialize_parameters``.
        learning_rate: Step size for each parameter update.
        epochs: Number of full passes over the training data.
        batch_size: Number of examples per mini-batch; must be at least 1.
        print_every: Report loss/accuracy every this many epochs. ``0`` or
            ``None`` disables the report.

    Returns:
        The trained parameter dict.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if ``X_train`` and
            ``Y_train`` hold a different number of examples.
    """
    # Validate before any work is done: a negative batch size would make the
    # batch loop empty (silently training nothing), and 0 is rejected by range().
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if X_train.shape[1] != Y_train.shape[1]:
        raise ValueError(
            "X_train and Y_train must contain the same number of examples "
            f"(got {X_train.shape[1]} and {Y_train.shape[1]})"
        )

    np.random.seed(42)
    parameters = initialize_parameters(layer_dims)
    m = X_train.shape[1]

    for epoch in range(1, epochs + 1):
        # Reshuffle inputs and targets with the same permutation every epoch.
        permutation = np.random.permutation(m)
        X_shuffled = X_train[:, permutation]
        Y_shuffled = Y_train[:, permutation]

        # The final batch may be smaller than batch_size; slicing handles it.
        for i in range(0, m, batch_size):
            X_batch = X_shuffled[:, i:i + batch_size]
            Y_batch = Y_shuffled[:, i:i + batch_size]

            _, caches = forward_propagation(X_batch, parameters)
            grads = backward_propagation(X_batch, Y_batch, parameters, caches)
            parameters = update_parameters(parameters, grads, learning_rate)

        # A falsy print_every disables reporting instead of dividing by zero.
        if print_every and epoch % print_every == 0:
            # One forward pass over the training set serves both loss and accuracy.
            AL_train, _ = forward_propagation(X_train, parameters)
            train_loss = compute_loss(AL_train, Y_train)
            train_acc = np.mean(
                np.argmax(AL_train, axis=0) == np.argmax(Y_train, axis=0)
            )

            test_acc = compute_accuracy(X_test, Y_test, parameters)

            print(f"Epoch {epoch}/{epochs} "
                  f"- loss: {train_loss:.4f} "
                  f"- train_acc: {train_acc*100:.2f}% "
                  f"- test_acc: {test_acc*100:.2f}%")

    return parameters

In [61]:
# Train the 784-128-64-10 network for 10 epochs with mini-batches of 64.
layers = [784, 128, 64, 10]
params = train_model(X_train, Y_train, X_test, Y_test, layers,
                     learning_rate=0.01, epochs=10, batch_size=64)

Epoch 1/10 - loss: 0.3858 - train_acc: 89.34% - test_acc: 90.31%
Epoch 2/10 - loss: 0.2997 - train_acc: 91.48% - test_acc: 92.08%
Epoch 3/10 - loss: 0.2597 - train_acc: 92.68% - test_acc: 92.97%
Epoch 4/10 - loss: 0.2335 - train_acc: 93.39% - test_acc: 93.43%
Epoch 5/10 - loss: 0.2159 - train_acc: 93.91% - test_acc: 93.94%
Epoch 6/10 - loss: 0.1998 - train_acc: 94.37% - test_acc: 94.22%
Epoch 7/10 - loss: 0.1851 - train_acc: 94.78% - test_acc: 94.61%
Epoch 8/10 - loss: 0.1750 - train_acc: 95.02% - test_acc: 94.80%
Epoch 9/10 - loss: 0.1652 - train_acc: 95.34% - test_acc: 94.94%
Epoch 10/10 - loss: 0.1566 - train_acc: 95.54% - test_acc: 95.19%


# Save, Load and Test Model

In [62]:
def save_model(parameters, filename="mlp_model.npz"):
    """Write every array in ``parameters`` to an ``.npz`` archive.

    Args:
        parameters: Dict mapping names (e.g. ``W1``, ``b1``) to arrays; each
            entry is stored under its key.
        filename: Destination path (string or path-like) or an open file
            object. ``np.savez`` appends ``.npz`` to a path that lacks it, so
            the same suffix is applied here first; the printed message then
            names the file that is actually written.
    """
    import os

    target = filename
    if isinstance(filename, (str, os.PathLike)):
        target = os.fspath(filename)
        if isinstance(target, str) and not target.endswith(".npz"):
            target += ".npz"

    np.savez(target, **parameters)
    print(f"Model saved to {target}")

In [63]:
# Persist the trained parameters to disk.
save_model(params, filename="mnist_mlp.npz")

Model saved to mnist_mlp.npz


In [64]:
def load_model(filename="mlp_model.npz"):
    """Read every array from an ``.npz`` archive into a plain dict.

    Args:
        filename: Path of the archive to read.

    Returns:
        Dict mapping each stored name to its array.
    """
    # The archive object keeps a file handle open; the ``with`` block closes it
    # once every array has been read into memory.
    with np.load(filename) as data:
        parameters = {key: data[key] for key in data.files}
    print(f"Model loaded from {filename}")
    return parameters

In [65]:
# Read the saved parameters back from disk.
parameters = load_model(filename="mnist_mlp.npz")

Model loaded from mnist_mlp.npz


In [66]:
def test_model(parameters, X_test, Y_test):
    """Print and return the classification accuracy on the given test set.

    Args:
        parameters: Dict of network weights and biases.
        X_test: Test inputs, one example per column.
        Y_test: One-hot test targets, one example per column.

    Returns:
        The fraction of test examples classified correctly.
    """
    accuracy = compute_accuracy(X_test, Y_test, parameters)

    print(f"Test Accuracy: {accuracy*100:.2f}%")
    return accuracy

In [67]:
# Evaluate the reloaded parameters on the held-out test split.
test_model(parameters=parameters, X_test=X_test, Y_test=Y_test)

Test Accuracy: 95.19%


np.float64(0.9519)