## Importing libraries

This MLP network is built from scratch: numpy is the only library the network itself depends on for its math (the standard `random` module is used solely to shuffle the training data). It focuses on introductory algorithms and techniques, and this minimalist approach highlights the fundamental concepts of neural networks while relying on numpy's numerical routines for the essential operations.


In [20]:
import mnist_loader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

## Activation Functions Class

In [21]:
class Activation:
  """
    - Activation functions.
    - And their derivatives.
    - Every method works element-wise on a scalar or a numpy array.
  """

  @classmethod
  def sigmoid(cls, vec):
    """
      Sigmoid activation function, 1 / (1 + exp(-vec)).

      Evaluated as exp(-log(1 + exp(-vec))) through np.logaddexp, so that
      large negative inputs do not overflow np.exp.
    """
    return np.exp(-np.logaddexp(0.0, -vec))

  @classmethod
  def sigmoid_drv(cls, vec):
    """
      Sigmoid function derivative, sigmoid(vec) * (1 - sigmoid(vec)).
    """
    # Evaluate the sigmoid once and reuse it for both factors
    sig = cls.sigmoid(vec)
    return sig * (1 - sig)

  @classmethod
  def step(cls, vec):
    """
      Binary Step activation function: 1 where vec > 0, otherwise 0.
    """
    return np.where(vec > 0, 1, 0)

  @classmethod
  def relu(cls, vec):
    """
      Rectified Linear Unit: max(0, vec).
    """
    return np.maximum(0, vec)

  @classmethod
  def relu_drv(cls, vec):
    """
      Rectified Linear Unit Derivative: 1 where vec > 0, otherwise 0.

      The derivative is undefined at exactly 0; 0 is returned there.
    """
    return np.where(vec > 0, 1, 0)

  @classmethod
  def tanh(cls, vec):
    """
      Hyperbolic Tangent
    """
    return np.tanh(vec)

  @classmethod
  def tanh_drv(cls, vec):
    """
      Hyperbolic Tangent Derivative, 1 - tanh(vec)^2.
    """
    return 1 - np.tanh(vec)**2

## Loss Function Class

In [22]:
class Cost:
  """
    - Cost (loss) functions and their gradients.
  """

  @classmethod
  def cost_derivative(cls, output_activations, y):
    """
      - Gradient of the cost with respect to the output activations.
      - Computed as the difference between the network output and the target "y".
      - That difference is the derivative of the quadratic cost 0.5 * ||a - y||^2.
    """
    error = output_activations - y
    return error

## Network Object

In [23]:
class Network:
  """
    - A Feed Forward Neural Network.
    - Uses Stochastic Gradient Descent learning algorithm.
    - Gradients are being calculated using back propagation.
    - Misses some optimizations and omits some desirable features.
  """
  def __init__(self, sizes, activation_function=Activation.sigmoid, actv_drv=Activation.sigmoid_drv):
    """
      - "sizes" lists the number of neurons of each layer, input layer first.
      - "activation_function" is applied at every layer after the input one.
      - "actv_drv" must be the derivative of "activation_function".
      - Biases and weights are initialized with np.random.randn.
    """
    self.num_layers = len(sizes)
    self.sizes = sizes
    # One bias column vector per non-input layer
    self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
    # One (neurons_out, neurons_in) weight matrix per pair of consecutive layers
    self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]
    self.activation_function = activation_function
    self.actv_drv = actv_drv

  def feed_foward(self, a):
    """
      Calculates the final result of an input going through the NN.
    """
    for b, w in zip(self.biases, self.weights):
      a = self.activation_function(np.dot(w, a) + b)
    return a

  def update_mini_batch(self, mini_batch, eta):
    """
      - Updates the network's weights and biases by applying gradient descent using backpropagation to a single mini batch.
      - The "mini_batch" is a list of tuples "(x, y)".
      - "eta" is the learning rate.
      - An empty "mini_batch" leaves the network unchanged.
    """

    # Nothing to learn from, and the averaging below would divide by zero
    if len(mini_batch) == 0:
      return

    # Nabla lists are lists of numpy arrays, in the same way as self.weights and self.biases
    # They store, by adding, all the changes in the weights calculated by the back propagation algorithm
    nabla_b = [np.zeros(b.shape) for b in self.biases]
    nabla_w = [np.zeros(w.shape) for w in self.weights]

    for x, y in mini_batch:
      delta_nabla_b, delta_nabla_w = self.back_propagation(x, y)

      nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
      nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]

    # Recalculate weights for a mini-batch run, using the averaged gradient
    step = eta / len(mini_batch)
    self.weights = [w - step * nw for w, nw in zip(self.weights, nabla_w)]
    self.biases = [b - step * nb for b, nb in zip(self.biases, nabla_b)]

  def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
    """
      - Utilizes mini-batch stochastic gradient descent for training the neural network.
      - The "training_data" consists of tuples "(x, y)" representing input and desired output.
      - "epochs" is the number of passes over the training data, "mini_batch_size" the number of samples per update and "eta" the learning rate.
      - If "test_data" is provided, the network evaluates against it after each epoch, allowing for progress tracking but at a slower pace.
      - Raises ValueError when "mini_batch_size" is not positive.
    """

    # A zero step makes range() fail with an unrelated message and a negative
    # one silently produces no mini batches, so reject both up front
    if mini_batch_size <= 0:
      raise ValueError("mini_batch_size must be a positive integer")

    # Compare against None: the truth value of an iterator is always True
    # and the truth value of a numpy array is ambiguous
    n_test = 0
    if test_data is not None:
      test_data = list(test_data)
      n_test = len(test_data)

    training_data = list(training_data)
    n = len(training_data)

    for j in range(epochs):
      random.shuffle(training_data)
      mini_batches = [training_data[k:k+mini_batch_size] for k in range(0, n, mini_batch_size)]

      for mini_batch in mini_batches:
        self.update_mini_batch(mini_batch, eta)

      # "test_data" is a list (or None) at this point, so plain truthiness is safe
      if test_data:
        print(f"Epoch {j}: {self.evaluate(test_data)} / {n_test}")

      else:
        print(f"Epoch {j}: complete")


  def back_propagation(self, x, y):
    """
      - Returns a tuple "(nabla_b, nabla_w)" representing the gradient for the chosen Cost Function C(x).
      - "nabla_b" & "nabla_w" are layer-by-layer lists of numpy arrays.
      - Similar to "self.biases" and "self.weights".
      - They tell how much each bias and weight should be adjusted for the sample "(x, y)".
      - Expects "x" to be a 2-D column vector, since the transposes below build the weight gradients as outer products.
    """

    nabla_b = [np.zeros(b.shape) for b in self.biases]
    nabla_w = [np.zeros(w.shape) for w in self.weights]

    # Feed-forward phase

    activation = x
    activations = [x] # Stores activations layer by layer
    zs = [] # Stores all the z vectors, layer-by-layer

    for b, w in zip(self.biases, self.weights):
      z = np.dot(w, activation) + b
      zs.append(z)
      activation = self.activation_function(z)
      activations.append(activation)

    # Backward pass -> delta rule
    delta = Cost.cost_derivative(activations[-1], y) * self.actv_drv(zs[-1])

    nabla_b[-1] = delta
    nabla_w[-1] = np.dot(delta, activations[-2].transpose())

    # In the following loop we want to iterate from the last to the first layer, excluding the output and the input layers
    # We are taking advantage that python can use negative indices

    for l in range(2, self.num_layers):
      actv_dv = self.actv_drv(zs[-l])
      # Propagate the error of layer -l+1 back through its weights
      delta = np.dot(self.weights[-l+1].transpose(), delta) * actv_dv
      nabla_b[-l] = delta
      nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())

    return (nabla_b, nabla_w)


  def evaluate(self, test_data):
    """
      - Evaluates the number of correct predictions of the NN.
      - The output is chosen by the "winner takes all" rule, where the highest number on the output vector is the prediction
      - Each "y" in "test_data" is compared directly with the predicted index.
    """
    test_results = [(np.argmax(self.feed_foward(x)), y) for (x, y) in test_data]

    return sum(int(x == y) for (x, y) in test_results)


  def debug(self):
    """
      - Write any code for debugging.
    """

## Training

In this section we're going to train this simple MLP on the MNIST dataset stored in the ".pkl" file, using its default data division.

## Tuning

Now, we're going to explore the hyperparameters of this MLP, in order to find the most adequate configuration for it.

### Activation Function

#### Tanh Function


The hyperbolic tangent (tanh) function did not give us good results. We reused the same hyperparameters as in the sigmoid run (30 epochs, mini-batch size 10, learning rate 4.0), but tanh did not work well with them. Accuracy improved only slightly during the first 5-7 epochs, peaking at 1715 / 10000 (about 17%), and after that the performance of the network either stayed the same or got worse. Overall, the tanh function performed poorly in this configuration.

In [24]:
# 784 inputs, one hidden layer of 30 neurons, 10 outputs; tanh in every layer
net = Network(
  sizes=[784, 30, 10],
  activation_function=Activation.tanh,
  actv_drv=Activation.tanh_drv,
)
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()

# Passing test_data makes SGD print the number of correct predictions after each epoch
net.SGD(training_data, epochs=30, mini_batch_size=10, eta=4.0, test_data=test_data)

Epoch 0: 1282 / 10000
Epoch 1: 1387 / 10000
Epoch 2: 1184 / 10000
Epoch 3: 965 / 10000
Epoch 4: 1041 / 10000
Epoch 5: 1715 / 10000
Epoch 6: 1670 / 10000
Epoch 7: 1670 / 10000
Epoch 8: 1670 / 10000
Epoch 9: 1670 / 10000
Epoch 10: 1670 / 10000
Epoch 11: 1670 / 10000
Epoch 12: 1670 / 10000
Epoch 13: 1670 / 10000
Epoch 14: 1670 / 10000
Epoch 15: 1670 / 10000
Epoch 16: 1670 / 10000
Epoch 17: 1670 / 10000
Epoch 18: 1670 / 10000
Epoch 19: 1610 / 10000
Epoch 20: 1373 / 10000
Epoch 21: 1368 / 10000
Epoch 22: 1375 / 10000
Epoch 23: 1370 / 10000
Epoch 24: 1370 / 10000
Epoch 25: 1370 / 10000
Epoch 26: 1370 / 10000
Epoch 27: 1369 / 10000
Epoch 28: 1369 / 10000
Epoch 29: 1368 / 10000


### ETA - learning rate

### Mini batch size

### Number of Epochs

### The network architecture

### Regularization / Dropout

### Machine Learning Algorithm