In [74]:
import pandas as pd
import numpy as np

In [75]:
train_set = np.array(pd.read_csv('/kaggle/input/mnist-in-csv/mnist_train.csv'))[:1000]
test_set = np.array(pd.read_csv('/kaggle/input/mnist-in-csv/mnist_test.csv'))[2000:3000]

In [76]:
train_labels = train_set[:, 0].T
test_labels = test_set[:, 0].T
X_train = train_set[:, 1:].T
X_test = test_set[:, 1:].T
print("X_train shape: ", X_train.shape)
print("X_label shape: ", train_labels.shape)
print("training labels shape: ", train_labels.shape)
print("test labels shape: ", test_labels.shape)

X_train shape:  (784, 1000)
X_label shape:  (1000,)
training labels shape:  (1000,)
test labels shape:  (1000,)


In [77]:
# initialize parameters
def init_params():
    w1 = np.random.rand(784, 10)
    b1 = np.random.rand(10, 1)
    w2 = np.random.rand(10, 10)
    b2 = np.random.rand(10, 1)
    return w1, b1, w2, b2

In [78]:
# Defining activation functions here
def ReLU(x):
    return np.maximum(0, x)

def Softmax(x):
    exp_x = np.exp(x - np.max(x))
    return exp_x/exp_x.sum()

In [79]:
# defining the derivatives of the activations used
def der_ReLU(x):
    return np.where(x <= 0, 0, 1)
    
def der_Softmax(z):
    exps = np.exp(z)
    sum_exps = np.sum(exps, axis=1, keepdims=True)  # Sum along the class dimension
    softmax = exps / sum_exps

    return softmax * (1 - softmax)

In [80]:
def one_hot_encode(Y):
    return np.eye(10)[Y].reshape(-1, 1)

print(one_hot_encode(5))

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [81]:
# Defining the loss function
# The labels have to be one hot encoded
def cost(output, label):
    epsilon = 1e-10  # small constant to avoid log(0)
    return -np.sum(label * np.log(output + epsilon)) / label.shape[1]

def cost_derivative(output, label):
    return 2*(output-label)

In [82]:
# Forward pass
def forward(x, w1, b1, w2, b2):
    Z1 = w1.T.dot(x) + b1
    A1 = ReLU(Z1)
    Z2 = w2.dot(A1) + b2
    A2 = Softmax(Z2)
    return Z1, A1, Z2, A2

In [83]:
# Backward pass
def backward(X, Z2, A2, A1, Z1, W2, label):
    dz2 = Z2 - label
    dw2 = dz2.dot(Z1.T)
    db2 = np.sum(dz2, axis=1, keepdims=True)
    
    #the derivative wrt z1
    dz1 = W2.T.dot(dz2) * der_ReLU(A1)
    
    dw1 = dz1.dot(X.T)
    db1 = np.sum(dz1, axis=1, keepdims=True)
    return dw1, db1, dw2, db2

In [84]:
# Update parameters
def update_params(w1, b1, w2, b2, dw1, db1, dw2, db2, alpha):
    w1 = w1 - (alpha * dw1.T)
    w2 = w2 - (alpha * dw2)
    b1 = b1 - (alpha * db1)
    b2 = b2 - (alpha * db2)
    
    return w1, b1, w2, b2

In [85]:
def gradient_descent(X, Y, num_epochs, alpha):
    """Performs stochastic gradient descent for a specified number of epochs."""

    w1, b1, w2, b2 = init_params()
    loss = 0.0
    for epoch in range(num_epochs):
        for i in range(X.shape[1]):  # Iterate over samples
            sample_X = X[:, i].reshape(-1, 1)  # Reshape to column vector
            sample_Y = Y[i]
            sample_Y = one_hot_encode(sample_Y)

            # Perform forward pass, backward pass, and updates for this sample
            Z1, A1, Z2, A2 = forward(sample_X, w1, b1, w2, b2)
            dw1, db1, dw2, db2 = backward(sample_X, Z2, A2, A1, Z1, w2, sample_Y)
            w1, b1, w2, b2 = update_params(w1, b1, w2, b2, dw1, db1, dw2, db2, alpha)
            loss = cost(Z2, sample_Y)

        # Print loss periodically
        if epoch % 50== 0:  # Print loss every 50 samples
            print("Epoch:", epoch, "Loss:", loss)

    return w1, b1, w2, b2


In [86]:
w1, b1, w2, b2 = gradient_descent(X_train, train_labels, 500, 0.01)

Epoch:

  return -np.sum(label * np.log(output + epsilon)) / label.shape[1]


 0 Loss: 2.868382809752485
Epoch: 50 Loss: 2.2615720040274017
Epoch: 100 Loss: 2.2615720040274017
Epoch: 150 Loss: 2.2615720040274017
Epoch: 200 Loss: 2.2615720040274017
Epoch: 250 Loss: 2.2615720040274017
Epoch: 300 Loss: 2.2615720040274017
Epoch: 350 Loss: 2.2615720040274017
Epoch: 400 Loss: 2.2615720040274017
Epoch: 450 Loss: 2.2615720040274017
