In [1]:
from tensorflow.keras.datasets import mnist
import numpy as np

In [2]:
# Fetch MNIST as two (images, labels) pairs: a training split and a test split.
# NOTE(review): per the Keras docs the images arrive as uint8 arrays of shape
# (N, 28, 28) and the labels as integer vectors of shape (N,); the later
# reshape to 784 columns and the one-hot step rely on that — confirm if the
# dataset source ever changes.
(X_train,y_train),(X_test,y_test) = mnist.load_data()

In [3]:
# Rescale both splits by the same constant (1/255) so train and test inputs
# live on the same scale. Assumes raw 8-bit pixel intensities.
# Not idempotent: re-running this cell divides the data by 255 a second time.
X_train, X_test = X_train / 255.0, X_test / 255.0

In [4]:
# Flatten every image into one row of 784 values so a whole split can be fed
# to the first dense layer as a single (n_samples, 784) matrix.
X_train = np.reshape(X_train, (-1, 784))
X_test = np.reshape(X_test, (-1, 784))

In [5]:
def one_hot(y, num_classes=10):
    """Encode integer class labels as one-hot rows.

    Parameters
    ----------
    y : array-like of int
        Class labels. Lists and arrays of any shape are accepted; the labels
        are flattened first, so a column vector of shape (n, 1) is encoded the
        same way as a flat vector of shape (n,).
    num_classes : int, default 10
        Number of columns in the result; valid labels are 0 .. num_classes - 1.

    Returns
    -------
    numpy.ndarray
        Float array of shape (number_of_labels, num_classes) holding a single
        1 per row at the column given by that row's label, 0 elsewhere.

    Raises
    ------
    ValueError
        If any integer label is negative. NumPy would otherwise count it from
        the end and silently mark the wrong class.
    IndexError
        Raised by NumPy indexing if a label is >= num_classes or the labels
        are not of an integer type.
    """
    # Flatten so the (row, label) index pairs line up one-to-one; without this
    # a 2-D label array broadcasts against the row indices and sets extra cells.
    labels = np.asarray(y).reshape(-1)

    if labels.size == 0:
        return np.zeros((0, num_classes))

    if np.issubdtype(labels.dtype, np.integer) and labels.min() < 0:
        raise ValueError("class labels must be non-negative")

    onehot = np.zeros((labels.size, num_classes))
    onehot[np.arange(labels.size), labels] = 1
    return onehot


In [6]:
# Swap the integer label vectors for their one-hot matrices (one row per
# sample), which is the target format the loss and gradient code expect.
# Not idempotent: run once per kernel, right after loading the data.
y_train, y_test = one_hot(y_train), one_hot(y_test)

In [8]:
# Layer widths: 784 input features -> 128 -> 128 -> 10 class scores.
input_size = 784
hidden1 = 128
hidden2 = 128
output_size = 10

# Seed NumPy's global generator so the initial weights are reproducible.
np.random.seed(42)

# He-style initialisation: standard-normal draws scaled by sqrt(2 / fan_in).
# The draw order (W1, W2, W3) is part of the reproducible state; keep it.
W1 = np.random.randn(input_size, hidden1) * np.sqrt(2 / input_size)
W2 = np.random.randn(hidden1, hidden2) * np.sqrt(2 / hidden1)
W3 = np.random.randn(hidden2, output_size) * np.sqrt(2 / hidden2)

# Biases start at zero, stored as (1, width) rows so they broadcast over the
# batch dimension. They are sized from the layer widths declared above; the
# previous `hidden_size1` / `hidden_size2` names were never defined in this
# notebook and raised NameError on a fresh kernel.
b1 = np.zeros((1, hidden1))
b2 = np.zeros((1, hidden2))
b3 = np.zeros((1, output_size))

In [9]:
def leaky_relu(Z, alpha=0.01):
    """Leaky ReLU activation, applied element-wise.

    Parameters
    ----------
    Z : numpy.ndarray
        Pre-activation values.
    alpha : float, default 0.01
        Slope used for non-positive inputs. The default matches the constant
        that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        ``Z`` where ``Z > 0`` and ``alpha * Z`` everywhere else.
    """
    return np.where(Z > 0, Z, alpha * Z)

def leaky_relu_derivative(Z, alpha=0.01):
    """Element-wise derivative of leaky ReLU with respect to its input.

    Parameters
    ----------
    Z : numpy.ndarray
        Pre-activation values at which the derivative is evaluated.
    alpha : float, default 0.01
        Slope of the activation for non-positive inputs; pass the same value
        that was used in the forward activation.

    Returns
    -------
    numpy.ndarray
        1 where ``Z > 0`` and ``alpha`` everywhere else (including ``Z == 0``).
    """
    return np.where(Z > 0, 1, alpha)
def softmax(Z):
    """Row-wise softmax of a 2-D score matrix.

    Each row's maximum is subtracted before exponentiating; this leaves the
    result unchanged but keeps ``np.exp`` from overflowing on large scores.
    Every row of the returned array sums to 1.
    """
    row_max = np.max(Z, axis=1, keepdims=True)
    exp_shifted = np.exp(Z - row_max)
    row_totals = np.sum(exp_shifted, axis=1, keepdims=True)
    return exp_shifted / row_totals

In [10]:
def compute_loss(y_true, y_pred):
    """Mean cross-entropy between one-hot targets and predicted probabilities.

    The per-sample terms ``-sum(y_true * log(y_pred))`` are summed over the
    whole batch and divided by the number of rows in ``y_true``. A constant
    1e-9 is added inside the logarithm so a predicted probability of exactly
    zero does not produce ``-inf``.
    """
    n_samples = y_true.shape[0]
    log_probs = np.log(y_pred + 1e-9)
    total = np.sum(y_true * log_probs)
    return -total / n_samples

In [11]:
def forward(X):
    """Run a batch through the three-layer network.

    Reads the module-level parameters W1..W3 and b1..b3. The two hidden layers
    use leaky ReLU; the output layer uses softmax.

    Returns the tuple ``(Z1, A1, Z2, A2, Z3, A3)``: each layer's
    pre-activation followed by its activation, with ``A3`` being the
    per-class probabilities.
    """
    cache = []
    activation = X
    # Both hidden layers share the same affine -> leaky ReLU pattern.
    for weights, bias in ((W1, b1), (W2, b2)):
        pre_activation = activation @ weights + bias
        activation = leaky_relu(pre_activation)
        cache += [pre_activation, activation]

    logits = activation @ W3 + b3
    return (*cache, logits, softmax(logits))

In [12]:
def backward(X, y, Z1, A1, Z2, A2, Z3, A3, lr=0.1):
    """Backpropagate through the network and take one gradient-descent step.

    The module-level parameters W1..W3 and b1..b3 are updated in place.

    Parameters
    ----------
    X : numpy.ndarray
        Input batch, one sample per row.
    y : numpy.ndarray
        One-hot targets with the same shape as ``A3``.
    Z1, A1, Z2, A2, Z3, A3 : numpy.ndarray
        Values returned by ``forward(X)`` for this batch. ``Z3`` is accepted
        for symmetry with ``forward`` but is not needed by the computation.
    lr : float, default 0.1
        Learning rate (step size). The default equals the value that used to
        be hard-coded inside the function.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the batch is empty; dividing by a batch size of zero would
        otherwise fill every parameter with NaN.
    """
    global W1, W2, W3, b1, b2, b3

    m = X.shape[0]
    if m == 0:
        raise ValueError("backward() requires a non-empty batch")

    # Output layer. For softmax outputs trained with cross-entropy the
    # gradient with respect to the logits reduces to (probabilities - targets).
    dZ3 = A3 - y
    dW3 = A2.T @ dZ3 / m
    db3 = np.sum(dZ3, axis=0, keepdims=True) / m

    # Second hidden layer: push the error back through W3, then through the
    # leaky ReLU evaluated at this layer's pre-activations.
    dA2 = dZ3 @ W3.T
    dZ2 = dA2 * leaky_relu_derivative(Z2)
    dW2 = A1.T @ dZ2 / m
    db2 = np.sum(dZ2, axis=0, keepdims=True) / m

    # First hidden layer, same pattern.
    dA1 = dZ2 @ W2.T
    dZ1 = dA1 * leaky_relu_derivative(Z1)
    dW1 = X.T @ dZ1 / m
    db1 = np.sum(dZ1, axis=0, keepdims=True) / m

    # Apply the step only after every gradient has been computed, so the
    # gradients above are all based on the pre-update weights.
    W3 -= lr * dW3
    b3 -= lr * db3
    W2 -= lr * dW2
    b2 -= lr * db2
    W1 -= lr * dW1
    b1 -= lr * db1
    

In [13]:
def accuracy(X, y_true):
    """Fraction of samples whose predicted class matches the target class.

    Runs ``forward(X)`` and takes the last returned value as the class
    probabilities. The predicted class is the arg-max of each probability
    row; the target class is the arg-max of each row of ``y_true``.
    """
    probabilities = forward(X)[-1]
    predicted = np.argmax(probabilities, axis=1)
    expected = np.argmax(y_true, axis=1)
    return np.mean(predicted == expected)

# Baseline check before any training step has run: with freshly initialised
# weights the network should score close to chance, i.e. about 0.10 for ten
# classes (the recorded output below shows ~0.11 on both splits).
print("Train Accuracy:", accuracy(X_train, y_train))
print("Test Accuracy:", accuracy(X_test, y_test))

Train Accuracy: 0.1097
Test Accuracy: 0.1094


In [15]:
# Full-batch gradient descent: each epoch is exactly one forward pass and one
# parameter update computed over the entire training set.
epochs = 200

for epoch in range(epochs):
    # Forward pass; the intermediate values are kept because backward() needs them.
    Z1, A1, Z2, A2, Z3 , A3 = forward(X_train)
    # NOTE(review): the loss is computed every epoch but never printed or
    # stored in a history — consider logging it next to the accuracies.
    loss = compute_loss(y_train, A3)
    # Updates the module-level weights and biases in place.
    backward(X_train, y_train, Z1, A1,Z2, A2,Z3,A3)

    # Report every 10th epoch. accuracy() runs its own forward pass, so the
    # numbers printed here reflect the weights *after* this epoch's update.
    if epoch % 10 == 0:
        print("Epoch:", epoch)
        print("Train Acc:", accuracy(X_train, y_train))
        print("Test Acc:", accuracy(X_test, y_test))

Epoch: 0
Train Acc: 0.16121666666666667
Test Acc: 0.1631
Epoch: 10
Train Acc: 0.7174333333333334
Test Acc: 0.7193
Epoch: 20
Train Acc: 0.8055
Test Acc: 0.8065
Epoch: 30
Train Acc: 0.8412666666666667
Test Acc: 0.8457
Epoch: 40
Train Acc: 0.8598333333333333
Test Acc: 0.865
Epoch: 50
Train Acc: 0.8713333333333333
Test Acc: 0.8778
Epoch: 60
Train Acc: 0.8796833333333334
Test Acc: 0.8866
Epoch: 70
Train Acc: 0.8860666666666667
Test Acc: 0.8918
Epoch: 80
Train Acc: 0.8907
Test Acc: 0.8953
Epoch: 90
Train Acc: 0.89465
Test Acc: 0.8985
Epoch: 100
Train Acc: 0.8981833333333333
Test Acc: 0.9004
Epoch: 110
Train Acc: 0.90095
Test Acc: 0.9031
Epoch: 120
Train Acc: 0.9036
Test Acc: 0.9059
Epoch: 130
Train Acc: 0.9057666666666667
Test Acc: 0.9083
Epoch: 140
Train Acc: 0.9076166666666666
Test Acc: 0.9101
Epoch: 150
Train Acc: 0.9095
Test Acc: 0.912
Epoch: 160
Train Acc: 0.91105
Test Acc: 0.9134
Epoch: 170
Train Acc: 0.9122333333333333
Test Acc: 0.9149
Epoch: 180
Train Acc: 0.9138666666666667
Test Acc