In [2]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [3]:

mnist = fetch_openml('mnist_784', version=1)

In [4]:
X = mnist.data / 255.0  
y = mnist.target.astype(int)

In [7]:
encoder = OneHotEncoder()
y_onehot = encoder.fit_transform(y.values.reshape(-1,1))


In [8]:

X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size=0.2, random_state=42)

In [9]:
import numpy as np

input_size = 784
hidden1_size = 128
hidden2_size = 64
output_size = 10

def xavier_init(size_in, size_out):
    return np.random.randn(size_in, size_out) * np.sqrt(2 / (size_in + size_out))

W1 = xavier_init(input_size, hidden1_size)
b1 = np.zeros((1, hidden1_size))

W2 = xavier_init(hidden1_size, hidden2_size)
b2 = np.zeros((1, hidden2_size))

W3 = xavier_init(hidden2_size, output_size)
b3 = np.zeros((1, output_size))


In [10]:
def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)


In [13]:
import numpy as np
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

# -----------------------------
# 1️⃣ Load MNIST dataset
# -----------------------------
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten images: 28x28 → 784
X_train = X_train.reshape(-1, 28*28) / 255.0
X_test = X_test.reshape(-1, 28*28) / 255.0

# One-hot encode labels
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

In [14]:
print("Shapes:")
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_test:", X_test.shape, "y_test:", y_test.shape)

# -----------------------------
# 2️⃣ Network Architecture
# -----------------------------
input_size = 784
hidden1_size = 256
hidden2_size = 128
hidden3_size = 64
output_size = 10

Shapes:
X_train: (60000, 784) y_train: (60000, 10)
X_test: (10000, 784) y_test: (10000, 10)


In [15]:
def xavier_init(size_in, size_out):
    return np.random.randn(size_in, size_out) * np.sqrt(2 / (size_in + size_out))

W1 = xavier_init(input_size, hidden1_size)
b1 = np.zeros((1, hidden1_size))

W2 = xavier_init(hidden1_size, hidden2_size)
b2 = np.zeros((1, hidden2_size))

W3 = xavier_init(hidden2_size, hidden3_size)
b3 = np.zeros((1, hidden3_size))

W4 = xavier_init(hidden3_size, output_size)
b4 = np.zeros((1, output_size))

In [17]:

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

lr = 0.01
epochs = 10 
batch_size = 64

In [18]:
for epoch in range(epochs):
    perm = np.random.permutation(X_train.shape[0])
    X_train_shuffled = X_train[perm]
    y_train_shuffled = y_train[perm]

    for i in range(0, X_train.shape[0], batch_size):
        X_batch = X_train_shuffled[i:i+batch_size]
        y_batch = y_train_shuffled[i:i+batch_size]

        z1 = np.dot(X_batch, W1) + b1
        a1 = relu(z1)

        z2 = np.dot(a1, W2) + b2
        a2 = relu(z2)

        z3 = np.dot(a2, W3) + b3
        a3 = relu(z3)

        z4 = np.dot(a3, W4) + b4
        y_pred = softmax(z4)

        loss = -np.mean(np.sum(y_batch * np.log(y_pred + 1e-8), axis=1))

        dz4 = (y_pred - y_batch) / batch_size
        dW4 = np.dot(a3.T, dz4)
        db4 = np.sum(dz4, axis=0, keepdims=True)

        da3 = np.dot(dz4, W4.T)
        dz3 = da3 * relu_derivative(a3)
        dW3 = np.dot(a2.T, dz3)
        db3 = np.sum(dz3, axis=0, keepdims=True)

        da2 = np.dot(dz3, W3.T)
        dz2 = da2 * relu_derivative(a2)
        dW2 = np.dot(a1.T, dz2)
        db2 = np.sum(dz2, axis=0, keepdims=True)

        da1 = np.dot(dz2, W2.T)
        dz1 = da1 * relu_derivative(a1)
        dW1 = np.dot(X_batch.T, dz1)
        db1 = np.sum(dz1, axis=0, keepdims=True)

        W4 -= lr * dW4
        b4 -= lr * db4
        W3 -= lr * dW3
        b3 -= lr * db3
        W2 -= lr * dW2
        b2 -= lr * db2
        W1 -= lr * dW1
        b1 -= lr * db1

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss:.4f}")


Epoch 1/10, Loss: 0.3395
Epoch 2/10, Loss: 0.3963
Epoch 3/10, Loss: 0.1637
Epoch 4/10, Loss: 0.1690
Epoch 5/10, Loss: 0.1300
Epoch 6/10, Loss: 0.0714
Epoch 7/10, Loss: 0.0422
Epoch 8/10, Loss: 0.1119
Epoch 9/10, Loss: 0.3442
Epoch 10/10, Loss: 0.1285


In [19]:
a1 = relu(np.dot(X_test, W1) + b1)
a2 = relu(np.dot(a1, W2) + b2)
a3 = relu(np.dot(a2, W3) + b3)
y_test_pred = softmax(np.dot(a3, W4) + b4)

y_test_labels = np.argmax(y_test, axis=1)
y_pred_labels = np.argmax(y_test_pred, axis=1)

accuracy = np.mean(y_test_labels == y_pred_labels)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.9612
