<a href="https://colab.research.google.com/github/MarkoMile/mlp-from-scratch/blob/master/MLP_minibatch_grad_descent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook I build a multi-layer perceptron (MLP) from scratch, meaning that I use only NumPy to create and train the neural network (Keras is used solely to load the MNIST dataset).

The goal of this project is to learn how an MLP "learns" and to understand the calculus behind basic deep-learning concepts such as gradient descent and backpropagation.

In [None]:
import numpy as np
import keras
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the MNIST training and test splits; each split is an
# (images, labels) pair.
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# Divide the images by 255 so pixel values land in the 0-1 range
# (assumes the raw pixels are 0-255 intensities).
x_train, x_test = x_train / 255, x_test / 255

In [None]:
# Hyperparameters for the network and its training run.
HIDDEN_LAYER_NEURONS = 16  # width of the single hidden layer
BATCH_SIZE = 16            # number of samples per mini-batch
EPOCHS = 20                # full passes over the training set
LEARN_RATE = 1e-2          # gradient-descent step size (reduced learning rate)

def sigmoid(x):
  """Logistic function 1 / (1 + e^-x), evaluated without overflow.

  The textbook form calls exp(-x), which overflows (RuntimeWarning) for
  large negative x.  Here only exp(-|x|) is ever computed, which always lies
  in (0, 1], and the algebraically equivalent expression is selected per
  element depending on the sign of x.

  Args:
    x: scalar or array-like of real numbers.

  Returns:
    The element-wise sigmoid: a scalar for scalar input, otherwise an
    ndarray with the same shape as x.
  """
  z = np.asarray(x)
  if not np.issubdtype(z.dtype, np.floating):
    # integer / boolean input: do the arithmetic in floating point
    z = z.astype(float)
  decay = np.exp(-np.abs(z))
  # x >= 0: 1 / (1 + e^-x)        with e^-x == decay
  # x <  0: e^x / (1 + e^x)       with e^x  == decay
  result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
  # [()] unwraps a 0-d result into a scalar and leaves real arrays as they are
  return result[()]

def d_sigmoid(x):
  """Derivative of the sigmoid at x, computed as s * (1 - s) with s = sigmoid(x)."""
  s = sigmoid(x)
  return s * (1 - s)

def cost_fn(y_hat,y):
  """Squared error between prediction y_hat and target y, element by element.

  No reduction is applied: for array inputs the result has one entry per
  output component.
  """
  error = y_hat - y
  return error ** 2

def d_cost_fn(y_hat,y):
  """Derivative of the squared error (y_hat - y)**2 with respect to y_hat."""
  error = y_hat - y
  return 2 * error

In [None]:
# create variables

# Seed NumPy's global random generator so that the weight initialisation
# below (and the per-epoch shuffling, which draws from the same generator)
# is reproducible on Restart & Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# we are creating a 1 hidden layer MLP:
# 784 inputs -> HIDDEN_LAYER_NEURONS hidden units -> 10 outputs

# neuron biases, initialised with small Gaussian noise
hidden_biases = np.random.randn(HIDDEN_LAYER_NEURONS) * 0.01
output_biases = np.random.randn(10) * 0.01

# weight matrices, stored as (units in this layer, units in previous layer)
input_weights = np.random.randn(HIDDEN_LAYER_NEURONS,784) * 0.01
output_weights = np.random.randn(10,HIDDEN_LAYER_NEURONS) * 0.01

# Hidden-layer placeholders. feedforward() returns fresh arrays, so these
# only pre-declare the names; zeros are used because a random initial value
# would carry no meaning.
hidden_layer_z = np.zeros(HIDDEN_LAYER_NEURONS)
hidden_layer_activation = np.zeros(HIDDEN_LAYER_NEURONS)

# slopes (gradients): each buffer has the shape of the parameter it belongs
# to, matching what backpropagation() returns
slope_w0 = np.zeros_like(input_weights)
slope_w1 = np.zeros_like(output_weights)
slope_b0 = np.zeros_like(hidden_biases)
slope_b1 = np.zeros_like(output_biases)

# output layer placeholders
y_hat = np.zeros(10)
y_hat_z = np.zeros(10)

In [None]:
def feedforward(input_layer):
    """Run a single flattened sample through the network.

    Reads the module-level parameters input_weights, hidden_biases,
    output_weights and output_biases.

    Args:
        input_layer: 1-D input vector whose length matches the number of
            columns of input_weights.

    Returns:
        Tuple (hidden activation, output pre-activation, output activation).
    """
    z_hidden = input_weights.dot(input_layer) + hidden_biases
    a_hidden = sigmoid(z_hidden)
    z_out = output_weights.dot(a_hidden) + output_biases
    return a_hidden, z_out, sigmoid(z_out)

def backpropagation(input_layer, hidden_layer_activation, y_hat_z, y_hat, y):
    """Compute the cost gradients for one sample.

    Reads the module-level output_weights to propagate the error back to the
    hidden layer.

    Args:
        input_layer: 1-D input vector that was fed to feedforward().
        hidden_layer_activation: hidden activation returned by feedforward().
        y_hat_z: output-layer pre-activation returned by feedforward().
        y_hat: network output returned by feedforward().
        y: target vector (same length as y_hat).

    Returns:
        Tuple (slope_w0, slope_w1, slope_b0, slope_b1): gradients of the cost
        with respect to input_weights, output_weights, hidden_biases and
        output_biases, each with the shape of its parameter.
    """
    # output layer: dC/db1 = dC/dy_hat * sigmoid'(z_out)
    slope_b1 = d_cost_fn(y_hat, y) * d_sigmoid(y_hat_z)
    # dC/dW1[j, k] = slope_b1[j] * hidden_activation[k]
    slope_w1 = np.outer(slope_b1, hidden_layer_activation)
    # sigmoid'(z) = a * (1 - a): reuse the activation from the forward pass
    # instead of recomputing the hidden pre-activation from the weights
    d_hidden = hidden_layer_activation * (1 - hidden_layer_activation)
    slope_b0 = np.dot(output_weights.T, slope_b1) * d_hidden
    # dC/dW0[j, k] = slope_b0[j] * input[k]
    slope_w0 = np.outer(slope_b0, input_layer)
    return slope_w0, slope_w1, slope_b0, slope_b1

def update_params(slope_w0, slope_w1, slope_b0, slope_b1, learn_rate):
    """Take one gradient-descent step on the module-level parameters.

    Each parameter array is modified in place:
    parameter -= learn_rate * gradient.

    Args:
        slope_w0: gradient for input_weights.
        slope_w1: gradient for output_weights.
        slope_b0: gradient for hidden_biases.
        slope_b1: gradient for output_biases.
        learn_rate: step size applied to every gradient.
    """
    param_grad_pairs = (
        (input_weights, slope_w0),
        (output_weights, slope_w1),
        (hidden_biases, slope_b0),
        (output_biases, slope_b1),
    )
    for param, grad in param_grad_pairs:
        # in-place subtraction mutates the shared global array
        param -= learn_rate * grad

In [None]:
# Training: mini-batch gradient descent.
# For every mini-batch the per-sample gradients are summed, averaged over the
# samples actually in the batch, and applied in a single parameter update.
cost_minibatch_arr = []  # mean cost of each mini-batch in the current epoch
cost_epoch_arr = []      # mean of the mini-batch costs, one entry per epoch

for epoch in range(EPOCHS):
    # shuffle the dataset so every epoch sees different mini-batches
    indices = np.random.permutation(x_train.shape[0])
    x_train_shuffled = x_train[indices]
    y_train_shuffled = y_train[indices]

    for start in range(0, x_train.shape[0], BATCH_SIZE):
        end = start + BATCH_SIZE
        batch_x = x_train_shuffled[start:end]
        batch_y = y_train_shuffled[start:end]
        # The last batch is shorter than BATCH_SIZE whenever the dataset size
        # is not a multiple of it, so average over the real sample count.
        batch_len = batch_x.shape[0]

        # initialize to 0, because they will be summed
        batch_slope_w0 = np.zeros(input_weights.shape)
        batch_slope_w1 = np.zeros(output_weights.shape)
        batch_slope_b0 = np.zeros(hidden_biases.shape)
        batch_slope_b1 = np.zeros(output_biases.shape)

        batch_cost = 0

        for i in range(batch_len):
            input_layer = batch_x[i].reshape(784)
            hidden_layer_activation, y_hat_z, y_hat = feedforward(input_layer)
            # one-hot encode the label
            y = np.zeros(10)
            y[batch_y[i]] = 1
            batch_cost += np.sum(cost_fn(y_hat, y))

            slope_w0, slope_w1, slope_b0, slope_b1 = backpropagation(
                input_layer, hidden_layer_activation, y_hat_z, y_hat, y
            )
            batch_slope_w0 += slope_w0
            batch_slope_w1 += slope_w1
            batch_slope_b0 += slope_b0
            batch_slope_b1 += slope_b1

        # update the params with the average gradients of the mini-batch
        update_params(batch_slope_w0 / batch_len, batch_slope_w1 / batch_len,
                      batch_slope_b0 / batch_len, batch_slope_b1 / batch_len,
                      LEARN_RATE)

        cost_minibatch_arr.append(batch_cost / batch_len)
    cost_epoch_arr.append(np.mean(cost_minibatch_arr))
    cost_minibatch_arr = []

# Plot the cost recorded for each epoch (mean of that epoch's mini-batch costs).
fig, ax = plt.subplots()
ax.plot(cost_epoch_arr)
ax.set(title="Training cost per epoch",
       xlabel="Epoch",
       ylabel="Mean cost per sample (sum of squared errors)")
plt.show()

In [None]:
# Evaluate: classify every test image and pair the predicted digit
# (index of the largest output) with the true label.
test_results = [
    (np.argmax(feedforward(image.reshape(784))[2]), label)
    for image, label in zip(x_test, y_test)
]

# percentage of test samples whose prediction equals the label
correct = sum(int(predicted == label) for predicted, label in test_results)
accuracy = correct / y_test.shape[0] * 100
print(f'{accuracy}% accuracy')