In [7]:
import numpy as np
from tensorflow.keras.datasets import mnist

# 1. Load the train/test splits as (images, labels) pairs.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# 2. Flatten and normalize.
# Every image is unrolled into a single row, the matrix is then transposed so
# that each COLUMN holds one example, and all values are divided by 255
# (presumably to bring 8-bit pixel intensities into [0, 1] -- confirm dtype).
train_images_flat = train_images.reshape(len(train_images), -1).T / 255.
test_images_flat = test_images.reshape(len(test_images), -1).T / 255.

# Sanity check: rows are flattened pixels, columns are examples.
print(f"X_train shape: {train_images_flat.shape}")
print(f"X_test shape: {test_images_flat.shape}")


X_train shape: (784, 60000)
X_test shape: (784, 10000)


In [8]:
def one_hot(Y, C=10):
    """
    One-hot encode a vector of integer class labels.

    Y: Integer labels, shape (m,); each value is used as a row index into
       a C x C identity matrix.
    C: Number of classes (size of the identity matrix).
    Returns: Float matrix of shape (C, m) whose column j has a single 1
             in row Y[j] and zeros elsewhere.
    """
    # Row k of the identity matrix is the one-hot code for class k.
    identity = np.eye(C)
    # Fancy indexing picks one row per label -> shape (m, C).
    per_example_rows = identity[Y]
    # Transpose so that examples run along the columns -> shape (C, m).
    return per_example_rows.T

# One-hot encode the label vectors of both splits (default C=10 classes).
Y_train_encoded, Y_test_encoded = (
    one_hot(label_vector) for label_vector in (train_labels, test_labels)
)

# Compare the label shape before and after encoding.
print(f"Original shape: {train_labels.shape}")  # (60000,)
print(f"Encoded shape: {Y_train_encoded.shape}") # (10, 60000)

Original shape: (60000,)
Encoded shape: (10, 60000)


In [14]:
# Defining weights and biases

# Layer sizes: flattened-pixel inputs, hidden units, output classes.
N_INPUT = 784
N_HIDDEN = 128
N_OUTPUT = 10

# Use a dedicated, seeded generator so the initial weights (and therefore
# every number computed from them) are identical on each Restart & Run All.
# Change INIT_SEED to obtain a different random draw.
INIT_SEED = 42
init_rng = np.random.default_rng(INIT_SEED)

# Weights are drawn uniformly from [-0.5, 0.5); each matrix is laid out as
# (units in the next layer, units in the previous layer) so it can
# left-multiply a column-per-example input matrix.
W_i_to_h = init_rng.uniform(low=-0.5, high=0.5, size=(N_HIDDEN, N_INPUT))
W_h_to_o = init_rng.uniform(low=-0.5, high=0.5, size=(N_OUTPUT, N_HIDDEN))

# Biases start at zero and are stored as column vectors so they broadcast
# across the example (column) axis.
biases_i_to_h = np.zeros((N_HIDDEN, 1))
biases_h_to_o = np.zeros((N_OUTPUT, 1))


In [None]:
# Activation Functions

def ReLU(X):
    """
    Rectified linear unit, applied element-wise.

    X: Array (or scalar) of pre-activations.
    Returns: Element-wise maximum of 0 and X, i.e. negative entries become 0.
    """
    rectified = np.maximum(0 , X)
    return rectified


def Softmax(X):
    """
    Column-wise softmax that is safe against overflow.

    X: Array of logits; normalization runs along axis 0, so each column
       is treated as one set of class scores.
    Returns: Array of the same shape as X whose entries along axis 0 are
             non-negative and sum to 1.
    """
    # Softmax is invariant to adding a constant to every logit in a column.
    # Subtracting the column maximum makes the largest exponent exactly 0,
    # so np.exp can no longer overflow to inf (which would yield inf/inf = NaN).
    shifted = X - np.max(X, axis=0, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=0, keepdims=True)

def Forwardprop(X , W_i_to_h , W_h_to_o , biases_i_to_h , biases_h_to_o):
    """
    Forward pass through the two-layer network (ReLU hidden, Softmax output).

    X: Input matrix; it is left-multiplied by W_i_to_h, so its rows must
       match the columns of W_i_to_h.
    W_i_to_h, biases_i_to_h: Weights and biases of the input -> hidden layer.
    W_h_to_o, biases_h_to_o: Weights and biases of the hidden -> output layer.
    Returns: Tuple (A1, Z1, A2, Z2) -- hidden activations, hidden
             pre-activations, output activations, output pre-activations.
             Note the order: each activation comes before its pre-activation.
    """
    # Hidden layer: affine transform followed by ReLU.
    hidden_pre = W_i_to_h @ X + biases_i_to_h
    hidden_act = ReLU(hidden_pre)

    # Output layer: affine transform followed by Softmax.
    output_pre = W_h_to_o @ hidden_act + biases_h_to_o
    output_act = Softmax(output_pre)

    return hidden_act , hidden_pre , output_act , output_pre

def compute_cost(A2, Y, eps=1e-8):
    """
    Mean cross-entropy between predicted probabilities and one-hot targets.

    A2: Predicted probabilities, same shape as Y.
    Y: One-hot targets; the number of examples is read from Y.shape[1].
    eps: Lower bound applied to A2 before taking the log (default 1e-8).
    Returns: Scalar cost, -(1/m) * sum(Y * log(A2)).
    """
    m = Y.shape[1] # Number of examples
    # Clamp probabilities from below so log never sees an exact 0.
    # Without this, log(0) = -inf and the masked-out terms become
    # 0 * -inf = NaN, poisoning the whole sum. Probabilities >= eps are
    # left untouched, so ordinary costs are unchanged.
    log_probs = np.log(np.maximum(A2, eps))
    # Multiply by Y: the zeros in Y cancel out the wrong classes.
    cost = - (1/m) * np.sum(Y * log_probs)
    return cost
    

In [None]:
# Smoke test: run one forward pass on the flattened training images with the
# freshly initialized parameters, then evaluate the cost of its predictions.
A1 , Z1 , A2 , Z2 = Forwardprop(
    train_images_flat,
    W_i_to_h,
    W_h_to_o,
    biases_i_to_h,
    biases_h_to_o,
)
# Left as the cell's last expression so the cost is displayed as its output.
compute_cost(A2 , Y_train_encoded)


np.float64(9.046874363107467)

In [None]:
def backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    """
    Backward pass: gradients of the cost with respect to both layers'
    weights and biases.

    Z1, A1: Hidden-layer pre-activations and activations.
    Z2, A2: Output-layer pre-activations and activations (Z2 is not used).
    W1, W2: Layer weights (only W2 is used).
    X: Input matrix; the number of examples is read from X.shape[1].
    Y: Targets, same shape as A2.
    Returns: Tuple (dW1, db1, dW2, db2), each averaged over the examples.
    """
    n_examples = X.shape[1]
    scale = 1 / n_examples

    # ----- Output layer -----
    # Error signal at the output: dZ2 = A2 - Y
    delta_out = A2 - Y
    # dW2 = (1/m) * dZ2 . A1^T
    grad_W2 = scale * (delta_out @ A1.T)
    # db2 = (1/m) * sum over examples of dZ2, kept as a column vector
    grad_b2 = scale * np.sum(delta_out, axis=1, keepdims=True)

    # ----- Hidden layer -----
    # ReLU's derivative is 1 where Z1 > 0 and 0 elsewhere, so a boolean mask
    # is enough to gate the error propagated back through W2.
    relu_gate = Z1 > 0
    delta_hidden = (W2.T @ delta_out) * relu_gate
    # dW1 = (1/m) * dZ1 . X^T
    grad_W1 = scale * (delta_hidden @ X.T)
    # db1 = (1/m) * sum over examples of dZ1, kept as a column vector
    grad_b1 = scale * np.sum(delta_hidden, axis=1, keepdims=True)

    return grad_W1, grad_b1, grad_W2, grad_b2

# Gradients for the forward-pass results obtained on the training data.
dW1, db1, dW2, db2 = backward_prop(
    Z1, A1, Z2, A2,
    W_i_to_h, W_h_to_o,
    train_images_flat, Y_train_encoded,
)
def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha = 0.1):
    """
    One step of plain gradient descent on all four parameters.

    W1, b1, W2, b2: Current parameters.
    dW1, db1, dW2, db2: Gradients matching each parameter.
    alpha: Learning rate (step size), default 0.1.
    Returns: Tuple (W1, b1, W2, b2) of updated parameters. New values are
             returned; the inputs are not modified in place.
    """
    def descend(param, grad):
        # Move against the gradient, scaled by the learning rate.
        return param - alpha * grad

    return (
        descend(W1, dW1),
        descend(b1, db1),
        descend(W2, dW2),
        descend(b2, db2),
    )
