<div align=center>

# Implementing a Deep Neural Network

By Hamed Araab

Supervisor: Dr. Marzieh Zarinbal

</div>


## Introduction

This notebook demonstrates a basic implementation of deep neural networks (DNNs)
based on the specifications of `reference.pdf`.


## Required Libraries

First of all, let's import the required libraries:


In [61]:
import math
import numpy as np

## Deep Neural Network

In this section, we implement two main classes:

- `FullyConnectedLayer`, which will be used to declare a layer in the network,
  and
- `NeuralNetwork`, which will be used to declare the network itself.


In [62]:
class FullyConnectedLayer:
    """Description of one dense layer: its width and its activation.

    The layer itself holds no parameters; ``NeuralNetwork`` reads ``units`` to
    size the weight/bias arrays and calls ``activation_function`` during the
    forward and backward passes.
    """

    def __init__(self, units, activation_function):
        """Store the layer configuration.

        Args:
            units: Number of neurons in the layer.
            activation_function: Object providing ``get_activation(Z)`` and
                ``get_derivative(Z)``.
        """
        self.units, self.activation_function = units, activation_function


class NeuralNetwork:
    """Feed-forward network of fully connected layers trained with mini-batch
    gradient descent.

    Samples are stored column-wise: inputs have shape ``(input_size, m)`` and
    each layer's pre-activations ``Z[l]`` / activations ``A[l]`` have shape
    ``(units_l, m)``. Layers, weights and biases live in dicts keyed by the
    1-based layer index, with ``weights[l]`` of shape
    ``(units_l, units_{l-1})`` and ``biases[l]`` of shape ``(units_l, 1)``.
    """

    @staticmethod
    def get_initial_params(shape, low=-1, high=1):
        """Return an array of the given ``shape`` drawn uniformly from
        ``[low, high)`` using NumPy's global random state."""
        return np.random.rand(*shape) * (high - low) + low

    def __init__(self, input_size, layers, loss_function, learning_rate):
        """Build the network and randomly initialise its parameters.

        Args:
            input_size: Number of features per sample (rows of ``X``).
            layers: Sequence of layer objects exposing ``units`` and
                ``activation_function``; the last entry is the output layer.
            loss_function: Object exposing ``get_loss(Y, Y_hat)`` and
                ``get_derivative(Y, Y_hat)``.
            learning_rate: Step size multiplied onto every gradient.
        """
        self.input_size = input_size
        self.number_of_layers = len(layers)
        self.loss_function = loss_function
        self.learning_rate = learning_rate

        # Re-key the layers from 1 so that index 0 can denote the input.
        self.layers = {l + 1: layers[l] for l in range(self.number_of_layers)}

        # The first layer is fed by the input; every later layer is fed by the
        # layer before it.
        self.weights = {
            1: NeuralNetwork.get_initial_params(
                (self.layers[1].units, self.input_size)
            ),
        } | {
            l: NeuralNetwork.get_initial_params(
                (self.layers[l].units, self.layers[l - 1].units),
            )
            for l in range(2, self.number_of_layers + 1)
        }

        self.biases = {
            l: NeuralNetwork.get_initial_params((self.layers[l].units, 1))
            for l in range(1, self.number_of_layers + 1)
        }

    # Forward Propagation
    def predict(self, X, return_objects=False):
        """Run a forward pass.

        Args:
            X: Input array of shape ``(input_size, m)``.
            return_objects: When true, return the per-layer intermediates
                needed for back-propagation instead of the prediction.

        Returns:
            ``(Z, A)`` dicts keyed by layer index (``A[0]`` is ``X``) when
            ``return_objects`` is true, otherwise the output activations of
            the last layer.
        """
        Z = {}
        A = {0: X}

        for l in range(1, self.number_of_layers + 1):
            # The (units, 1) bias broadcasts across the m sample columns.
            Z[l] = self.weights[l] @ A[l - 1] + self.biases[l]
            A[l] = self.layers[l].activation_function.get_activation(Z[l])

        if return_objects:
            return Z, A

        return A[self.number_of_layers]

    # Training Phase (Forward and Backward Propagation)
    def train(self, X, Y, X_test=None, Y_test=None, epochs=10, batch_size=32):
        """Train with mini-batch gradient descent and print per-epoch costs.

        Args:
            X: Training inputs, shape ``(input_size, m)``.
            Y: Training targets, one column per sample.
            X_test: Optional held-out inputs; evaluated after every epoch
                when given together with ``Y_test``.
            Y_test: Optional held-out targets.
            epochs: Number of passes over the training data.
            batch_size: Maximum number of samples per mini-batch; the last
                batch of an epoch may be smaller.

        The reported training cost is the sum of the batch losses (each
        measured before that batch's parameter update) divided by ``m``.
        """
        _, m = X.shape

        for epoch in range(epochs):
            # Shuffle the training data for each epoch
            permutation = np.random.permutation(m)

            X_shuffled = X[:, permutation]
            Y_shuffled = Y[:, permutation]

            batches = math.ceil(m / batch_size)

            cost_epoch = 0

            for batch in range(batches):
                start_index = batch * batch_size
                end_index = start_index + batch_size

                X_batch = X_shuffled[:, start_index:end_index]
                Y_batch = Y_shuffled[:, start_index:end_index]

                Z_batch, A_batch = self.predict(X_batch, return_objects=True)

                Y_hat_batch = A_batch[self.number_of_layers]

                loss_batch = self.loss_function.get_loss(Y_batch, Y_hat_batch)
                cost_epoch += loss_batch / m

                self.update_parameters(Y_batch, Z_batch, A_batch)

            details = f"Epoch: {epoch + 1}/{epochs}, cost: {cost_epoch:.3f}"

            if X_test is not None and Y_test is not None:
                _, m_test = X_test.shape

                Y_hat_test = self.predict(X_test)

                cost_test_epoch = (
                    self.loss_function.get_loss(Y_test, Y_hat_test) / m_test
                )

                details += f", cost_test: {cost_test_epoch:.3f}"

            print(details)

    # Backward Propagation
    def update_parameters(self, Y, Z, A):
        """Back-propagate one batch and apply a gradient-descent step.

        Args:
            Y: Targets for the batch, one column per sample.
            Z: Pre-activations per layer, as returned by
                ``predict(..., return_objects=True)``.
            A: Activations per layer (``A[0]`` is the batch input), as
                returned by ``predict(..., return_objects=True)``.

        Each activation function's ``get_derivative(Z[l])`` must return a
        4-D array of shape ``(n, m, n, m)``; it is contracted with the
        incoming ``(n, m)`` gradient over two axes.
        """
        _, m = Y.shape

        Y_hat = A[self.number_of_layers]

        # Average the loss gradient over the samples of the batch.
        dC_dY_hat = self.loss_function.get_derivative(Y, Y_hat) / m

        dC_dA = {self.number_of_layers: dC_dY_hat}

        for l in range(self.number_of_layers, 0, -1):
            # tensordot's default axes=2 sums over both axes of dC_dA[l],
            # yielding the (n, m) gradient with respect to Z[l].
            delta = np.tensordot(
                dC_dA[l],
                self.layers[l].activation_function.get_derivative(Z[l]),
            )

            dC_dW = delta @ A[l - 1].T
            dC_db = delta @ np.ones((m, 1))

            # Propagate through the weights that produced this forward pass
            # BEFORE they are updated; using the already-updated weights
            # would give earlier layers a gradient of a different network.
            if l > 1:
                dC_dA[l - 1] = self.weights[l].T @ delta

            self.weights[l] -= self.learning_rate * dC_dW
            self.biases[l] -= self.learning_rate * dC_db

## Activation Functions

Now, we are going to write down the activation functions based on the following
class:


In [63]:
class ActivationFunction:
    """Interface shared by the activation functions of this notebook.

    The base implementations do nothing and return ``None``; concrete
    activations override both methods.
    """

    def get_activation(self, Z_l):
        """Return the activations for the pre-activation array ``Z_l``."""
        return None

    def get_derivative(self, Z_l):
        """Return the derivative of the activations with respect to ``Z_l``."""
        return None

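Further on, you will see that two versions of `get_derivative` are implemented
for each activation function. Both build the same 4-D array `D` of shape
`(n, m, n, m)`, where `D[i, j, u, v]` is the derivative of `A_l[i, j]` with
respect to `Z_l[u, v]`. The first version uses a for-loop and is commented out
because it is extremely slow. The second one is much faster because it uses a
vectorized approach.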


### Rectified Linear Unit (ReLU)


In [64]:
class ReLU(ActivationFunction):
    """Rectified linear unit: ``max(0, z)`` applied element-wise."""

    def get_activation(self, Z_l):
        """Return ``Z_l`` with every negative entry replaced by 0."""
        return np.maximum(0, Z_l)

    def get_derivative(self, Z_l):
        """Return the ``(n, m, n, m)`` derivative of ReLU at ``Z_l``.

        Only the entries ``[i, j, i, j]`` can be non-zero: they are 1 where
        ``Z_l[i, j] > 0`` and 0 where ``Z_l[i, j] <= 0``. ``Z_l`` itself is
        left untouched.
        """
        # Slow reference version (element-by-element loop):
        #
        # n, m = Z_l.shape
        #
        # derivative = np.zeros((n, m, n, m))
        #
        # for (i, j, u, v), _ in np.ndenumerate(derivative):
        #     if i == u and j == v and Z_l[i, j] > 0:
        #         derivative[i, j, u, v] = 1
        #
        # return derivative

        n, m = Z_l.shape
        i, j = np.indices((n, m))

        # Threshold a copy: writing the 0/1 slopes into Z_l itself would
        # destroy the caller's cached pre-activations.
        slope = Z_l.astype(float)
        slope[slope > 0] = 1
        slope[slope <= 0] = 0

        derivative = np.zeros((n, m, n, m))

        derivative[i, j, i, j] = slope

        return derivative

### Sigmoid


In [65]:
class Sigmoid(ActivationFunction):
    """Logistic sigmoid applied element-wise."""

    def get_activation(self, Z_l):
        """Return ``1 / (1 + exp(-Z_l))`` for every entry of ``Z_l``."""
        denominator = np.exp(-Z_l) + 1

        return 1 / denominator

    def get_derivative(self, Z_l):
        """Return the ``(n, m, n, m)`` derivative of the sigmoid at ``Z_l``.

        Only the entries ``[i, j, i, j]`` are non-zero; each holds
        ``a * (1 - a)`` with ``a`` the sigmoid of ``Z_l[i, j]``.
        """
        # Slow reference version (element-by-element loop):
        #
        # n, m = Z_l.shape
        #
        # derivative = np.zeros((n, m, n, m))
        #
        # A_l = self.get_activation(Z_l)
        #
        # for (i, j, u, v), _ in np.ndenumerate(derivative):
        #     if i == u and j == v:
        #         derivative[i, j, u, v] = A_l[i, j] * (1 - A_l[i, j])
        #
        # return derivative

        n, m = Z_l.shape

        sigma = self.get_activation(Z_l)
        local_slope = sigma * (1 - sigma)

        rows, cols = np.indices((n, m))
        jacobian = np.zeros((n, m, n, m))

        # Scatter the element-wise slopes onto the [i, j, i, j] "diagonal".
        jacobian[rows, cols, rows, cols] = local_slope

        return jacobian

### Softmax


In [66]:
class Softmax(ActivationFunction):
    """Softmax taken over the rows (axis 0) of each column independently."""

    def get_activation(self, Z_l):
        """Return the column-wise softmax of ``Z_l``.

        Each column of the result is non-negative and sums to 1.
        """
        # Subtracting each column's maximum does not change the softmax
        # mathematically, but keeps np.exp from overflowing to inf (and the
        # quotient from becoming nan) when the logits are large.
        shifted = Z_l - np.max(Z_l, axis=0, keepdims=True)
        exp_Z_l = np.exp(shifted)

        return exp_Z_l / np.sum(exp_Z_l, axis=0, keepdims=True)

    def get_derivative(self, Z_l):
        """Return the ``(n, m, n, m)`` derivative of the softmax at ``Z_l``.

        Entries that mix two different columns are 0. Within column ``j`` the
        entry ``[i, j, u, j]`` is ``A[i, j] * (1 - A[i, j])`` when ``i == u``
        and ``-A[i, j] * A[u, j]`` otherwise, with ``A`` the softmax output.
        """
        # Slow reference version (element-by-element loop):
        #
        # n, m = Z_l.shape
        #
        # derivative = np.zeros((n, m, n, m))
        #
        # A_l = self.get_activation(Z_l)
        #
        # for (i, j, u, v), _ in np.ndenumerate(derivative):
        #     if j == v:
        #         if i == u:
        #             derivative[i, j, u, v] = A_l[i, j] * (1 - A_l[i, j])
        #         else:
        #             derivative[i, j, u, v] = -A_l[i, j] * A_l[u, j]
        #
        # return derivative

        n, m = Z_l.shape

        derivative = np.zeros((n, m, n, m))

        A_l = self.get_activation(Z_l)

        # Fill every same-column pair (i, u) with -A[i, j] * A[u, j]; the
        # i == u entries are corrected below.
        i, j, u = np.indices((n, m, n))
        derivative[i, j, u, j] = -A_l[i, j] * A_l[u, j]

        # Add A[i, j] on the diagonal so it becomes A * (1 - A). The (n, m)
        # index grids address each diagonal entry exactly once, so the
        # in-place addition does not depend on how NumPy treats repeated
        # fancy indices.
        i, j = np.indices((n, m))
        derivative[i, j, i, j] += A_l

        return derivative

## Loss Functions

Similarly, we will be implementing the loss functions based on this class:


In [67]:
class LossFunction:
    """Interface shared by the loss functions of this notebook.

    The base implementations do nothing and return ``None``; concrete losses
    override both methods.
    """

    def get_loss(self, Y, Y_hat):
        """Return the loss of predictions ``Y_hat`` against targets ``Y``."""
        return None

    def get_derivative(self, Y, Y_hat):
        """Return the derivative of the loss with respect to ``Y_hat``."""
        return None

### Sum of Squared Errors (SSE)


In [68]:
class SSELoss(LossFunction):
    """Half the sum of squared errors over all entries."""

    def get_loss(self, Y, Y_hat):
        """Return ``0.5 * sum((Y_hat - Y) ** 2)`` as a single number."""
        residual = Y_hat - Y
        squared_error_total = np.sum(residual ** 2)

        return squared_error_total / 2

    def get_derivative(self, Y, Y_hat):
        """Return the element-wise derivative of the loss, ``Y_hat - Y``."""
        residual = Y_hat - Y

        return residual

### Binary Cross Entropy (BCE)


In [69]:
class BCELoss(LossFunction):
    """Binary cross entropy summed over all entries.

    Predictions are clipped to ``[EPSILON, 1 - EPSILON]`` before use, so a
    saturated output (exactly 0 or 1) yields a large finite loss/gradient
    instead of ``inf`` or ``nan``. Predictions strictly inside that interval
    are unaffected.
    """

    # Distance kept between the predictions and the values 0 and 1.
    # NOTE: 1 - 1e-12 is only distinguishable from 1 at float64 precision;
    # lower-precision predictions would need a larger value.
    EPSILON = 1e-12

    def get_loss(self, Y, Y_hat):
        """Return ``-sum(Y * log(Y_hat) + (1 - Y) * log(1 - Y_hat))``."""
        Y_hat = np.clip(Y_hat, self.EPSILON, 1 - self.EPSILON)

        return -np.sum((Y * np.log(Y_hat) + (1 - Y) * np.log(1 - Y_hat)))

    def get_derivative(self, Y, Y_hat):
        """Return the element-wise derivative of the loss with respect to
        ``Y_hat``: ``(Y_hat - Y) / (Y_hat * (1 - Y_hat))``."""
        Y_hat = np.clip(Y_hat, self.EPSILON, 1 - self.EPSILON)

        return (Y_hat - Y) / (Y_hat * (1 - Y_hat))

### Cross Entropy (CE)


In [70]:
class CELoss(LossFunction):
    """Categorical cross entropy summed over all entries.

    Predictions are clipped from below at ``EPSILON`` before use, so a
    predicted probability of exactly 0 yields a large finite loss/gradient
    instead of ``inf`` or ``nan`` (``0 * log(0)`` and ``0 / 0`` would
    otherwise poison the whole sum). Predictions of at least ``EPSILON`` are
    unaffected.
    """

    # Smallest prediction value passed to the logarithm / used as a divisor.
    EPSILON = 1e-12

    def get_loss(self, Y, Y_hat):
        """Return ``-sum(Y * log(Y_hat))``."""
        Y_hat = np.clip(Y_hat, self.EPSILON, None)

        return -np.sum((Y * np.log(Y_hat)))

    def get_derivative(self, Y, Y_hat):
        """Return the element-wise derivative of the loss with respect to
        ``Y_hat``: ``-Y / Y_hat``."""
        Y_hat = np.clip(Y_hat, self.EPSILON, None)

        return -Y / Y_hat