In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Read the Kaggle digit-recognizer training set into a DataFrame and preview
# its first five rows (a `label` column followed by pixel0..pixel783).
data = pd.read_csv('Data/train.csv')
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


---
---
<br>

# **Introduction**
This notebook was inspired by and modeled after one created by Samson Zhang, found [here](https://www.kaggle.com/code/wwsalmon/simple-mnist-nn-from-scratch-numpy-no-tf-keras/notebook). I have added features and made some modifications to improve performance. Notably, I have significantly expanded and edited the accompanying text to improve clarity.

The MNIST dataset is from Kaggle, found [here](https://www.kaggle.com/c/digit-recognizer).
<br>
<br>

---
---
<br>

In [2]:
# Convert the DataFrame to a plain NumPy array: one row per image, with the
# label in the first column and the pixel values in the remaining columns.
data = np.array(data)
m, n = data.shape  # m = number of rows (images), n = number of columns (label + pixels)

# Shuffle the rows in place so the validation split below is a random sample
# rather than simply the first rows of the file.
np.random.shuffle(data)

# Hold out the first 1000 shuffled rows for validation. Transposing puts one
# example per column, which is the layout the network functions operate on.
validation_data = data[:1000].T
Y_validation = validation_data[0]            # Row 0: the correct digit for each example
X_validation = validation_data[1:] / 255.0   # Remaining rows: pixels divided by 255 (0-255 -> 0-1)

# Every row after the first 1000 is used for training.
training_data = data[1000:].T
Y_training = training_data[0]
X_training = training_data[1:] / 255.0
_, m_training = X_training.shape  # m_training = number of training examples (columns)

In [3]:
def initialize_parameters(n_inputs=784, n_hidden=10, n_outputs=10):
    """Create randomly initialised weights and biases for the two-layer network.

    Every entry is drawn uniformly from [-0.5, 0.5) using NumPy's global
    random state. The defaults reproduce the original fixed 784-10-10 layout.

    Args:
        n_inputs: Number of input features per example (784 for 28x28 images).
        n_hidden: Number of units in the hidden layer.
        n_outputs: Number of output classes.

    Returns:
        Tuple ``(W1, b1, W2, b2)`` with shapes ``(n_hidden, n_inputs)``,
        ``(n_hidden, 1)``, ``(n_outputs, n_hidden)`` and ``(n_outputs, 1)``.
    """
    # The draw order (W1, b1, W2, b2) is kept fixed so that a seeded run
    # produces the same parameters as before.
    W1 = np.random.rand(n_hidden, n_inputs) - 0.5
    b1 = np.random.rand(n_hidden, 1) - 0.5
    W2 = np.random.rand(n_outputs, n_hidden) - 0.5
    b2 = np.random.rand(n_outputs, 1) - 0.5
    return W1, b1, W2, b2

In [4]:
def ReLU(Z):
    """Rectified linear unit: element-wise maximum of 0 and ``Z``.

    Negative entries become 0; every other entry is passed through unchanged.
    """
    return np.maximum(0, Z)

def softmax(Z):
    """Numerically stable softmax taken along axis 0 (one distribution per column).

    Args:
        Z: Array of pre-activation values. For a 2-D input each column is
            normalised independently; a 1-D input is treated as one distribution.

    Returns:
        Array with the same shape as ``Z`` whose entries along axis 0 are
        non-negative and sum to 1.
    """
    Z = np.asarray(Z)
    # Subtracting the per-column maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (and inf / inf = NaN)
    # when the logits are large.
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_Z = np.exp(shifted)
    return exp_Z / np.sum(exp_Z, axis=0, keepdims=True)

def forward_propagation(W1, b1, W2, b2, X):
    """Run one forward pass through the two-layer network.

    Args:
        W1, b1: Weights and biases of the hidden layer.
        W2, b2: Weights and biases of the output layer.
        X: Input matrix; the rest of the notebook passes one example per column.

    Returns:
        Tuple ``(Z1, A1, Z2, A2)``: hidden pre-activations, hidden ReLU
        activations, output pre-activations and output softmax activations.
    """
    hidden_pre = np.dot(W1, X) + b1
    hidden_act = ReLU(hidden_pre)
    output_pre = np.dot(W2, hidden_act) + b2
    output_act = softmax(output_pre)
    return hidden_pre, hidden_act, output_pre, output_act

In [5]:
def one_hot_encode(Y, num_classes=None):
    """Convert a vector of integer class labels into a one-hot matrix.

    Args:
        Y: 1-D array of non-negative integer labels.
        num_classes: Number of rows (classes) in the result. When omitted it is
            inferred as ``Y.max() + 1``, which is only correct if the largest
            class actually occurs in ``Y``; pass it explicitly for subsets or
            mini-batches that may be missing the top class.

    Returns:
        Array of shape ``(num_classes, Y.size)`` in which column ``i`` holds a
        single 1 in row ``Y[i]`` and 0 everywhere else.

    Raises:
        IndexError: If a label is greater than or equal to ``num_classes``.
    """
    Y = np.asarray(Y)
    if num_classes is None:
        # Original behaviour: infer the class count from the largest label present.
        num_classes = Y.max() + 1
    # Built directly in (classes, examples) layout, so no transpose is needed.
    one_hot_Y = np.zeros((num_classes, Y.size))
    one_hot_Y[Y, np.arange(Y.size)] = 1
    return one_hot_Y

def deriv_ReLU(Z):
    """Derivative of ReLU: a boolean mask that is True where ``Z`` is positive.

    Multiplying a gradient by this mask keeps it where the unit was active
    and zeroes it elsewhere.
    """
    return 0 < Z

def backpropagation(Z1, A1, A2, W2, X, Y):
    """Compute the gradients of the loss with respect to every parameter.

    Args:
        Z1: Hidden-layer pre-activations from the forward pass.
        A1: Hidden-layer activations from the forward pass.
        A2: Output-layer (softmax) activations from the forward pass.
        W2: Output-layer weights.
        X: Input matrix used for the forward pass.
        Y: Vector of true integer labels.

    Returns:
        Tuple ``(dW1, db1, dW2, db2)`` of gradients averaged over the
        ``Y.size`` examples.
    """
    n_examples = Y.size
    targets = one_hot_encode(Y)

    # Output layer: gradient of the loss w.r.t. the output pre-activations
    # (for softmax with cross-entropy this is prediction minus target).
    grad_Z2 = A2 - targets
    grad_W2 = np.dot(grad_Z2, A1.T) / n_examples
    grad_b2 = np.sum(grad_Z2, axis=1, keepdims=True) / n_examples

    # Hidden layer: push the error back through W2, then gate it with the
    # ReLU derivative to get the gradient w.r.t. the hidden pre-activations.
    grad_Z1 = np.dot(W2.T, grad_Z2) * deriv_ReLU(Z1)
    grad_W1 = np.dot(grad_Z1, X.T) / n_examples
    grad_b1 = np.sum(grad_Z1, axis=1, keepdims=True) / n_examples

    return grad_W1, grad_b1, grad_W2, grad_b2

In [6]:
def get_predictions(A2):
    """Return, for each column of ``A2``, the row index holding the largest value."""
    return np.argmax(A2, axis=0)

def get_accuracy(predictions, Y):
    """Return the fraction of entries in ``predictions`` that equal ``Y``.

    The count of matching positions is divided by ``Y.size``.
    """
    matches = predictions == Y
    return np.sum(matches) / Y.size

def update_param(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    """Apply one gradient-descent step to every parameter.

    Each parameter is moved against its gradient, scaled by the learning
    rate ``alpha``: ``param - alpha * grad``.

    Returns:
        Tuple ``(W1, b1, W2, b2)`` of updated parameters; the inputs are not
        modified in place.
    """
    params = (W1, b1, W2, b2)
    grads = (dW1, db1, dW2, db2)
    new_W1, new_b1, new_W2, new_b2 = (
        param - alpha * grad for param, grad in zip(params, grads)
    )
    return new_W1, new_b1, new_W2, new_b2

def gradient_descent(X, Y, iterations, alpha):
    """Train the two-layer network with full-batch gradient descent.

    Args:
        X: Input matrix with one example per column.
        Y: Vector of true integer labels, one per column of ``X``.
        iterations: Number of gradient-descent steps to run.
        alpha: Learning rate applied to every gradient.

    Returns:
        Tuple ``(W1, b1, W2, b2)`` of trained parameters.

    Side effects:
        Prints the iteration number, accuracy and cost every 100 iterations.
        The values are computed from the forward pass made just before that
        iteration's parameter update.
    """
    W1, b1, W2, b2 = initialize_parameters()

    # Normalise by the number of examples actually passed in. The global `m`
    # is the row count of the full dataset (validation rows included), which
    # understated the reported cost.
    n_examples = Y.size
    example_idx = np.arange(n_examples)

    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_propagation(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = backpropagation(Z1, A1, A2, W2, X, Y)
        W1, b1, W2, b2 = update_param(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)

        if i % 100 == 0: # Print the progress every 100 iterations
            # Mean cross-entropy. Picking out the probability of the true class
            # equals sum(one_hot_Y * log(A2)), but never multiplies 0 by
            # log(0) = -inf for the other classes, which would make the cost NaN.
            cost = -np.sum(np.log(A2[Y, example_idx])) / n_examples
            predictions = get_predictions(A2)
            accuracy_percent = get_accuracy(predictions, Y) * 100

            print(f'Iteration: {i}')
            print(f'Accuracy: {accuracy_percent:.3f}%')
            print(f'Cost: {cost:,.3f}\n')

    return W1, b1, W2, b2

def make_predictions(X, W1, b1, W2, b2):
    """Predict a class for every column of ``X`` using the given parameters.

    Runs a forward pass and returns the index of the largest output
    activation for each example.
    """
    output_activations = forward_propagation(W1, b1, W2, b2, X)[-1]
    return get_predictions(output_activations)

def test_prediction(index, W1, b1, W2, b2):
    """Classify one training example and compare it with its true label.

    Reads the module-level ``X_training`` and ``Y_training`` arrays.

    Args:
        index: Column index of the example in ``X_training``.
        W1, b1, W2, b2: Network parameters.

    Returns:
        Tuple ``(current_image, prediction, label, is_correct)``. The image is
        a single-column slice of ``X_training``, ``prediction`` is the
        one-element array returned by ``make_predictions`` for that column, and
        ``is_correct`` is the element-wise comparison of the two.
    """
    # Indexing with None keeps the example as a column (shape (rows, 1)), so it
    # can be fed to the network exactly like a batch of size one.
    current_image = X_training[:, index, None]
    prediction = make_predictions(current_image, W1, b1, W2, b2)
    label = Y_training[index]
    return current_image, prediction, label, prediction == label

def display_predictions(n, W1, b1, W2, b2, display_correct=True):
    """Plot up to ``n`` random training images with predicted and true labels.

    Reads the module-level ``X_training`` and ``Y_training`` arrays.

    Args:
        n: Number of examples to show. Fewer are shown if not enough examples
            match; nothing is plotted if none match or ``n`` is not positive.
        W1, b1, W2, b2: Network parameters used to make the predictions.
        display_correct: If true, sample from correctly classified examples;
            otherwise sample from misclassified ones.

    Returns:
        None. Prints a heading and shows a matplotlib figure.
    """
    if display_correct:
        heading = 'Correct Predictions:\n'
    else:
        heading = 'Incorrect Predictions:\n'

    # Classify the whole training set once and sample from the matching
    # examples. The earlier rejection-sampling loop never terminated when no
    # example matched (e.g. no misclassifications) and could show duplicates.
    all_predictions = make_predictions(X_training, W1, b1, W2, b2)
    is_correct = all_predictions == Y_training
    candidates = np.flatnonzero(is_correct == bool(display_correct))

    n_shown = min(n, candidates.size)
    if n_shown <= 0:
        print(heading)
        print('No matching examples to display.')
        return

    chosen = np.random.choice(candidates, size=n_shown, replace=False)

    # Size the grid from the number of images instead of a fixed 2x5 layout,
    # which raised IndexError for n > 10. squeeze=False keeps `axes` 2-D even
    # when there is a single row.
    n_cols = 5
    n_rows = (n_shown + n_cols - 1) // n_cols
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                             figsize=(15, 3 * n_rows), squeeze=False)

    # Hide every axis up front so unused grid cells stay blank.
    for ax in axes.ravel():
        ax.axis('off')

    for ax, idx in zip(axes.ravel(), chosen):
        ax.imshow(X_training[:, idx].reshape((28, 28)) * 255, cmap='gray')
        # Scalar indexing renders the prediction as "3" rather than "[3]".
        ax.set_title(f"Prediction: {all_predictions[idx]}\nTrue Label: {Y_training[idx]}")

    print(heading)

    plt.tight_layout()
    plt.show()

def graph_data(data):
    """Placeholder for a plotting helper; not implemented yet.

    Currently ignores ``data`` and returns None.
    """
    return None

In [7]:
# Fit the network on the training split: 500 gradient-descent iterations
# with a learning rate of 0.25.
W1, b1, W2, b2 = gradient_descent(X_training, Y_training, iterations=500, alpha=0.25)

TypeError: gradient_descent() missing 1 required positional argument: 'alpha'

In [None]:
# Score the trained model on the held-out validation split
validation_predictions = make_predictions(X_validation, W1, b1, W2, b2)
validation_accuracy_percent = 100 * get_accuracy(validation_predictions, Y_validation)
print(f'Validation Data Accuracy: {validation_accuracy_percent:.3f}%\n')

# Show 10 randomly chosen training images the model classified correctly
display_predictions(10, W1, b1, W2, b2, display_correct=True)

# Show 10 randomly chosen training images the model got wrong
display_predictions(10, W1, b1, W2, b2, display_correct=False)