# 1 Importing Necessary Libraries
Before we begin, we need to import the necessary Python libraries and modules that will help us in matrix operations, data loading, and other mathematical functions.

In [21]:
import numpy as np
from keras.datasets import mnist
from keras.utils import np_utils

# 2 Loading and Preprocessing the Data
The MNIST dataset is available in many deep learning libraries. Here, we'll use Keras to load the dataset.

In [22]:
# fetch the MNIST train/test split through Keras
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Inputs: turn every 28x28 image into a (1, 784) row vector, cast to float32
# and divide by 255 to normalize. Train and test get the same treatment.
x_train = x_train.reshape(x_train.shape[0], 1, 28*28).astype('float32') / 255
x_test = x_test.reshape(x_test.shape[0], 1, 28*28).astype('float32') / 255

# Targets: encode each label (a number in range [0,9]) as a vector of size 10,
# e.g. number 3 becomes [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)

# 3 Activation and Loss Functions Definitions
Activation functions introduce non-linearity into the network. In this implementation, the hyperbolic tangent (tanh) function is utilized due to its zero-centered output, which can help speed up convergence during training. Additionally, the Mean Squared Error (MSE) is employed as the loss function. It measures the average squared difference between the actual and predicted values. MSE is most often used for regression problems; here it is applied to the one-hot encoded digit labels, which keeps the loss and its derivative simple to implement.


In [23]:
# activation function and its derivative
def tanh(x):
    """Hyperbolic tangent activation, computed element-wise with ``np.tanh``."""
    activated = np.tanh(x)
    return activated

def tanh_prime(x):
    """Derivative of tanh evaluated at ``x``, i.e. ``1 - tanh(x)**2``."""
    tanh_squared = np.square(np.tanh(x))
    return 1 - tanh_squared

# loss function and its derivative
def mse(y_true, y_pred):
    """Mean squared error: the squared difference between ``y_true`` and
    ``y_pred`` averaged over every element."""
    residual = y_true - y_pred
    return np.mean(np.square(residual))

def mse_prime(y_true, y_pred):
    """Gradient of the mean squared error with respect to ``y_pred``.

    The factor ``1 / y_true.size`` comes from the mean taken in the loss.
    """
    n_elements = y_true.size
    doubled_residual = 2 * (y_pred - y_true)
    return doubled_residual / n_elements

# 4 Building the Neural Network Components
The neural network consists of layers. Here, we define the fully connected layer and the activation layer.

## 4.1 Fully Connected Layer
The Fully Connected Layer (often abbreviated as FC layer) is a standard layer type that is used in many neural network architectures. Neurons in a fully connected layer have full connections to all activations in the previous layer, as seen in regular Neural Networks. Their activations can hence be computed with a matrix multiplication followed by a bias offset.

In [24]:
# Base class
class Layer:
    """Common interface shared by every layer of the network.

    ``input`` and ``output`` hold the values seen during the most recent
    forward pass so that subclasses can reuse them when back-propagating.
    """

    def __init__(self):
        self.input = None
        self.output = None

    # computes the output Y of a layer for a given input X
    def forward_propagation(self, input):
        raise NotImplementedError

    # computes dE/dX for a given dE/dY (and update parameters if any)
    def backward_propagation(self, output_error, learning_rate):
        raise NotImplementedError


# inherit from base class Layer
class FCLayer(Layer):
    """Fully connected layer computing ``Y = X . W + B``.

    ``input_size`` is the number of input neurons and ``output_size`` the
    number of output neurons. Weights have shape ``(input_size, output_size)``
    and the bias ``(1, output_size)``; both start as uniform random values
    shifted by -0.5.
    """

    def __init__(self, input_size, output_size):
        # initialise input/output placeholders defined by the base class
        super().__init__()
        self.weights = np.random.rand(input_size, output_size) - 0.5
        self.bias = np.random.rand(1, output_size) - 0.5

    def forward_propagation(self, input_data):
        """Return ``input_data . W + B`` and remember the input for the
        backward pass."""
        self.input = input_data
        self.output = np.dot(self.input, self.weights) + self.bias
        return self.output

    def backward_propagation(self, output_error, learning_rate):
        """Apply one gradient-descent step and return ``dE/dX``.

        ``output_error`` is ``dE/dY`` with one row per sample. The returned
        ``dE/dX`` is computed with the weights as they were before the update.
        """
        input_error = np.dot(output_error, self.weights.T)
        weights_error = np.dot(self.input.T, output_error)
        # dE/dB is dE/dY accumulated over the rows. For a single-row input
        # this equals output_error itself; summing also keeps the bias shape
        # (1, output_size) when several rows are propagated at once.
        bias_error = np.sum(output_error, axis=0, keepdims=True)

        # update parameters
        self.weights -= learning_rate * weights_error
        self.bias -= learning_rate * bias_error
        return input_error

## 4.2 Activation Layer
The Activation Layer is responsible for applying an activation function to its inputs. Activation functions introduce non-linear properties to the network; without them, a stack of fully connected layers would collapse into a single linear transformation, so this non-linearity is essential for learning complex patterns.

In [25]:
# inherit from base class Layer
class ActivationLayer(Layer):
    """Layer that applies an activation function to its input.

    ``activation`` is the function used in the forward pass and
    ``activation_prime`` its derivative, used in the backward pass.
    """

    def __init__(self, activation, activation_prime):
        # run the base initialiser so the layer's base state is set up
        # before the first forward pass
        super().__init__()
        self.activation = activation
        self.activation_prime = activation_prime

    # returns the activated input
    def forward_propagation(self, input_data):
        self.input = input_data
        self.output = self.activation(self.input)
        return self.output

    # Returns input_error=dE/dX for a given output_error=dE/dY.
    # learning_rate is not used because there are no "learnable" parameters.
    def backward_propagation(self, output_error, learning_rate):
        # chain rule: dE/dX = f'(X) * dE/dY (element-wise product)
        return self.activation_prime(self.input) * output_error

# 5 Constructing the Neural Network
The `Network` class represents the structure and functionality of a neural network. Upon initialization, it sets up an empty list for layers and placeholders for the loss function and its derivative. The `add` method allows for the addition of layers to the network, while the `use` method sets the loss function and its derivative to be used during training. The `predict` method performs forward propagation through the network for a given input, returning the network's output. The `fit` method trains the network using the provided training data. During training, the method iterates over the specified number of epochs, performing forward propagation to compute the network's output, calculating the loss, and then executing backward propagation to update the weights and biases of the layers. After each epoch, the average error across all samples is displayed.

In [27]:
class Network:
    """Ordered stack of layers trained with per-sample gradient descent.

    Layers are appended with ``add``, the loss and its derivative are set with
    ``use``, ``fit`` trains the layers and ``predict`` runs the forward pass.
    """

    def __init__(self):
        self.layers = []
        self.loss = None
        self.loss_prime = None

    # add layer to network
    def add(self, layer):
        self.layers.append(layer)

    # set loss to use
    def use(self, loss, loss_prime):
        self.loss = loss
        self.loss_prime = loss_prime

    # forward propagation of one sample through every layer, in order
    def _forward(self, sample):
        output = sample
        for layer in self.layers:
            output = layer.forward_propagation(output)
        return output

    def predict(self, input_data):
        """Return a list with the network output for each sample.

        ``input_data`` is iterated along its first (sample) dimension.
        """
        return [self._forward(sample) for sample in input_data]

    def fit(self, x_train, y_train, epochs, learning_rate):
        """Train the layers in place.

        For every epoch each sample is propagated forward, its loss is added
        to a running total (for display only), and the loss derivative is
        propagated backwards through the layers in reverse order, updating
        them with ``learning_rate``. The average loss is printed once per
        epoch.

        Raises:
            ValueError: if ``x_train`` contains no samples, since the average
                error would be undefined.
        """
        # sample dimension first
        samples = len(x_train)
        if samples == 0:
            raise ValueError("x_train must contain at least one sample")

        # training loop
        for epoch in range(epochs):
            err = 0
            for j, sample in enumerate(x_train):
                output = self._forward(sample)

                # compute loss (for display purpose only)
                err += self.loss(y_train[j], output)

                # backward propagation, last layer first
                error = self.loss_prime(y_train[j], output)
                for layer in reversed(self.layers):
                    error = layer.backward_propagation(error, learning_rate)

            # calculate average error on all samples
            err /= samples
            print('epoch %d/%d   error=%f' % (epoch + 1, epochs, err))

# 6 Training the Neural Network
Training involves feeding the data into the network and adjusting the weights using backpropagation.

In [28]:
# Network: 784 -> 100 -> 50 -> 10, with a tanh activation after every
# fully connected layer. Each sample has shape (1, 28*28) and each layer
# outputs a (1, n_out) row vector.
net = Network()
layer_sizes = [28*28, 100, 50, 10]
for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:]):
    net.add(FCLayer(n_in, n_out))
    net.add(ActivationLayer(tanh, tanh_prime))

# train on 1000 samples
# as we have not implemented mini-batch GD, training would be pretty slow if
# we updated at each iteration on all 60000 samples...
net.use(mse, mse_prime)
net.fit(x_train[:1000], y_train[:1000], epochs=35, learning_rate=0.1)

epoch 1/35   error=0.225384
epoch 2/35   error=0.103824
epoch 3/35   error=0.080902
epoch 4/35   error=0.066233
epoch 5/35   error=0.056656
epoch 6/35   error=0.049281
epoch 7/35   error=0.043336
epoch 8/35   error=0.038188
epoch 9/35   error=0.033800
epoch 10/35   error=0.030063
epoch 11/35   error=0.027024
epoch 12/35   error=0.024651
epoch 13/35   error=0.022683
epoch 14/35   error=0.020999
epoch 15/35   error=0.019545
epoch 16/35   error=0.018324
epoch 17/35   error=0.017207
epoch 18/35   error=0.016211
epoch 19/35   error=0.015293
epoch 20/35   error=0.014477
epoch 21/35   error=0.013789
epoch 22/35   error=0.013225
epoch 23/35   error=0.012712
epoch 24/35   error=0.012270
epoch 25/35   error=0.011825
epoch 26/35   error=0.011389
epoch 27/35   error=0.010992
epoch 28/35   error=0.010478
epoch 29/35   error=0.010119
epoch 30/35   error=0.009660
epoch 31/35   error=0.009361
epoch 32/35   error=0.008923
epoch 33/35   error=0.008660
epoch 34/35   error=0.008282
epoch 35/35   error=0.0

# 7 Evaluating the Neural Network
After training, it's essential to evaluate the model's performance on unseen data.

In [31]:
# run the trained network on the first 3 test samples and print its raw
# outputs followed by the matching one-hot labels
out = net.predict(x_test[:3])
print("\n")
print("predicted values : ")
print(out)
print("true values : ")
print(y_test[:3])



predicted values : 
[array([[-0.00348004,  0.01409909,  0.00895138,  0.0028811 ,  0.02987997,
         0.00805698,  0.02639108,  0.97298983,  0.03382498, -0.00432507]]), array([[ 2.41093956e-02, -2.73723194e-02,  7.99137300e-01,
         1.49083206e-01, -1.57058038e-04,  1.77398106e-01,
         1.42346556e-02, -5.61381684e-02,  3.40053372e-01,
         2.75526989e-01]]), array([[-0.02143487,  0.97517556,  0.03774701, -0.00111179,  0.03364395,
         0.00542629,  0.02291317, -0.0200364 ,  0.05049729, -0.0074744 ]])]
true values : 
[[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]


# 8 Conclusion
Using a simple fully connected neural network, we can achieve a decent accuracy on the MNIST dataset. For state-of-the-art results, more advanced techniques like Convolutional Neural Networks (CNNs) are typically used.