In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the CSV into a NumPy matrix with one row per sample. Column 0 is used
# as the label below; the remaining columns are used as input features.
data = np.array(pd.read_csv('/content/train.csv'))
m, n = data.shape

# Shuffle the rows in place so the split below is random.
np.random.shuffle(data)

# The first 80% of the shuffled rows are for training, the rest for validation.
train_data, val_data = data[:int(0.8 * m)], data[int(0.8 * m):]

# Labels come from column 0 of each split.
y_train, y_val = train_data[:, 0], val_data[:, 0]

# Features are transposed to (features, samples) and divided by 255
# (presumably 8-bit pixel intensities -- confirm against the dataset).
x_train = train_data[:, 1:].T / 255.0
x_val = val_data[:, 1:].T / 255.0

# ------------------ NN Functions ------------------

def initialize_parameters(n_input=784, n_hidden=10, n_output=10):
    """Create randomly initialised weights and biases for a two-layer network.

    Every entry is drawn from ``np.random.rand`` and shifted by -0.5, i.e.
    uniformly from [-0.5, 0.5).

    Args:
        n_input: Number of input features (columns of ``w1``).
        n_hidden: Number of hidden units (rows of ``w1``, columns of ``w2``).
        n_output: Number of output units (rows of ``w2``).

    Returns:
        Tuple ``(w1, b1, w2, b2)`` with shapes ``(n_hidden, n_input)``,
        ``(n_hidden, 1)``, ``(n_output, n_hidden)`` and ``(n_output, 1)``.
    """
    # The draw order (w1, b1, w2, b2) is kept fixed so that a seeded run with
    # the default sizes reproduces the same parameters as before.
    w1 = np.random.rand(n_hidden, n_input) - 0.5
    b1 = np.random.rand(n_hidden, 1) - 0.5
    w2 = np.random.rand(n_output, n_hidden) - 0.5
    b2 = np.random.rand(n_output, 1) - 0.5
    return w1, b1, w2, b2

def ReLU(x):
    """Apply the rectified linear unit element-wise.

    Args:
        x: Scalar or array-like of pre-activations.

    Returns:
        ``x`` with every value below zero replaced by zero.
    """
    floor = 0
    rectified = np.maximum(x, floor)
    return rectified

def softmax_calculator(z):
    """Compute a numerically stable softmax along axis 0.

    Args:
        z: Array of scores; each column (axis 0) is normalised independently.

    Returns:
        Array of the same shape as ``z`` whose entries along axis 0 are
        non-negative and sum to 1.
    """
    # Shift by each column's own maximum rather than the global maximum.
    # Softmax is invariant to a per-column shift, and this guarantees every
    # column has at least one exp(0) = 1 term, so the denominator can never
    # underflow to zero (which previously produced 0/0 = NaN for columns far
    # below the global maximum).
    shifted = z - np.max(z, axis=0, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=0, keepdims=True)

def forward_propagation(w1, b1, w2, b2, x):
    """Run one forward pass through the two-layer network.

    Args:
        w1: First-layer weight matrix.
        b1: First-layer bias, added to ``w1 . x``.
        w2: Second-layer weight matrix.
        b2: Second-layer bias, added to ``w2 . a1``.
        x: Input matrix (presumably features x samples -- confirm with caller).

    Returns:
        Tuple ``(z1, a1, z2, a2)``: hidden pre-activation, hidden ReLU
        activation, output pre-activation and output softmax.
    """
    # Hidden layer: affine transform followed by ReLU.
    hidden_pre = np.dot(w1, x) + b1
    hidden_act = ReLU(hidden_pre)

    # Output layer: affine transform followed by softmax.
    output_pre = np.dot(w2, hidden_act) + b2
    output_prob = softmax_calculator(output_pre)

    return hidden_pre, hidden_act, output_pre, output_prob

def one_hot_converter(y, num_classes=None):
    """Encode integer labels as a one-hot matrix with one column per sample.

    Args:
        y: 1-D NumPy array of non-negative integer class labels.
        num_classes: Number of rows (classes) in the result. Defaults to
            ``y.max() + 1``. Pass it explicitly when ``y`` may not contain
            the highest class, so the result always matches the network's
            output size.

    Returns:
        Array of shape ``(num_classes, y.size)`` holding 1 at
        ``[y[j], j]`` for every sample ``j`` and 0 elsewhere.
    """
    if num_classes is None:
        # Historical behaviour: infer the class count from the labels seen.
        num_classes = y.max() + 1
    one_hot_y = np.zeros((y.size, num_classes))
    one_hot_y[np.arange(y.size), y] = 1
    # Transpose so that classes are rows and samples are columns.
    return one_hot_y.T

def backward_propagation(w1, b1, w2, b2, z1, a1, z2, a2, x, y):
    """Compute parameter gradients for the two-layer network.

    The output error is ``a2 - one_hot(y)``. It is built by subtracting 1 at
    each sample's true-class entry of a copy of ``a2``, so its shape always
    matches ``a2`` even when the batch does not contain the highest class
    label.

    Args:
        w1, b1, b2, z2: Accepted for a uniform call signature; not read here.
        w2: Second-layer weight matrix, used to propagate the error back.
        z1: Hidden pre-activation; its sign gates the ReLU derivative.
        a1: Hidden activation.
        a2: Output activation with one column per sample.
        x: Input matrix with one column per sample.
        y: 1-D integer array of true class labels, one per column of ``x``.

    Returns:
        Tuple ``(dw1, db1, dw2, db2)`` of gradients averaged over the samples.
    """
    m = x.shape[1]  # number of samples

    # Output-layer error. Work on a float copy so the caller's a2 is untouched.
    dz2 = np.array(a2, dtype=float)
    dz2[y, np.arange(m)] -= 1.0
    dw2 = (1 / m) * dz2.dot(a1.T)
    db2 = (1 / m) * np.sum(dz2, axis=1, keepdims=True)

    # Hidden-layer error: ReLU passes gradient only where z1 was positive.
    dz1 = w2.T.dot(dz2) * (z1 > 0)
    dw1 = (1 / m) * dz1.dot(x.T)
    db1 = (1 / m) * np.sum(dz1, axis=1, keepdims=True)

    return dw1, db1, dw2, db2

def update_parameters(w1, b1, w2, b2, dw1, db1, dw2, db2, learning_rate):
    """Take one gradient-descent step on every parameter.

    Each parameter is updated with ``-=``, so NumPy arrays passed in are
    modified in place and the same objects are returned.

    Args:
        w1, b1, w2, b2: Current parameters.
        dw1, db1, dw2, db2: Matching gradients.
        learning_rate: Step size that scales each gradient.

    Returns:
        Tuple ``(w1, b1, w2, b2)`` of updated parameters.
    """
    stepped = []
    for param, grad in zip((w1, b1, w2, b2), (dw1, db1, dw2, db2)):
        param -= learning_rate * grad
        stepped.append(param)
    return tuple(stepped)

def get_predictions(a2):
    """Pick the highest-scoring row index for every column of ``a2``.

    Args:
        a2: Score matrix; the arg-max is taken along axis 0.

    Returns:
        Array of row indices, one per column of ``a2``.
    """
    best_rows = np.argmax(a2, axis=0)
    return best_rows

def get_accuracy(predictions, y):
    """Return the fraction of predictions that equal the true labels.

    Args:
        predictions: Array-like of predicted labels.
        y: Array-like of true labels, compared element-wise with
            ``predictions``.

    Returns:
        Mean of the element-wise equality, a value between 0 and 1.
    """
    # Coerce to arrays first: for plain Python lists ``==`` compares the two
    # lists as a whole and yields a single bool, which made the accuracy
    # collapse to 0.0 or 1.0.
    predictions = np.asarray(predictions)
    y = np.asarray(y)
    return np.mean(predictions == y)

def gradient_descent(x, y, learning_rate, iterations):
    """Train the two-layer network with full-batch gradient descent.

    Args:
        x: Input matrix handed to every forward and backward pass.
        y: True labels for the columns of ``x``.
        learning_rate: Step size forwarded to ``update_parameters``.
        iterations: Number of update steps to perform.

    Returns:
        Tuple ``(w1, b1, w2, b2)`` of the parameters after the last step.
    """
    params = initialize_parameters()

    for i in range(iterations):
        cache = forward_propagation(*params, x)
        grads = backward_propagation(*params, *cache, x, y)
        params = update_parameters(*params, *grads, learning_rate)

        # Report progress only on every 20th iteration.
        if i % 20 != 0:
            continue

        # The accuracy reflects the forward pass made before this step's
        # update (cache[-1] is the softmax output a2).
        acc = get_accuracy(get_predictions(cache[-1]), y)
        print(f"Iteration {i} - Accuracy: {acc:.4f}")

    w1, b1, w2, b2 = params
    return w1, b1, w2, b2

# ------------------ Run Training ------------------
# Fit the network on the training split: step size 0.1, 1000 iterations.
w1, b1, w2, b2 = gradient_descent(
    x_train, y_train, learning_rate=0.1, iterations=1000
)


Iteration 0 - Accuracy: 0.0998
Iteration 20 - Accuracy: 0.2315
Iteration 40 - Accuracy: 0.3850
Iteration 60 - Accuracy: 0.4660
Iteration 80 - Accuracy: 0.5437
Iteration 100 - Accuracy: 0.6013
Iteration 120 - Accuracy: 0.6456
Iteration 140 - Accuracy: 0.6762
Iteration 160 - Accuracy: 0.6998
Iteration 180 - Accuracy: 0.7189
Iteration 200 - Accuracy: 0.7336
Iteration 220 - Accuracy: 0.7465
Iteration 240 - Accuracy: 0.7573
Iteration 260 - Accuracy: 0.7686
Iteration 280 - Accuracy: 0.7778
Iteration 300 - Accuracy: 0.7854
Iteration 320 - Accuracy: 0.7932
Iteration 340 - Accuracy: 0.8006
Iteration 360 - Accuracy: 0.8073
Iteration 380 - Accuracy: 0.8137
Iteration 400 - Accuracy: 0.8187
Iteration 420 - Accuracy: 0.8239
Iteration 440 - Accuracy: 0.8285
Iteration 460 - Accuracy: 0.8326
Iteration 480 - Accuracy: 0.8361
Iteration 500 - Accuracy: 0.8399
Iteration 520 - Accuracy: 0.8431
Iteration 540 - Accuracy: 0.8454
Iteration 560 - Accuracy: 0.8488
Iteration 580 - Accuracy: 0.8509
Iteration 600 - 