In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import matplotlib
from tqdm.notebook import trange

# The bare "seaborn" style name was deprecated in Matplotlib 3.6 (renamed to
# "seaborn-v0_8") and is missing from later releases.  Prefer the new name and
# fall back to the old one so the script keeps working on older installs.
try:
    matplotlib.style.use("seaborn-v0_8")
except OSError:
    matplotlib.style.use("seaborn")

# Relu function for the hidden layers
def relu(x, grad=False):
    """
    Rectified Linear Unit activation, or its element-wise derivative.

    Args:
        x: Input value(s); anything NumPy can compare against 0.
        grad: When True, return the derivative instead of the activation.

    Returns:
        Element-wise max(0, x) when grad is False. Otherwise an array holding
        1.0 where x > 0 and 0.0 elsewhere (the derivative at exactly 0 is
        taken to be 0).
    """
    if grad:
        # Derivative: 1 on the strictly positive side, 0 everywhere else
        return np.where(x > 0, 1.0, 0.0)

    # Activation: negative entries are replaced by zero
    return np.maximum(0, x)

# Softmax function for output layer
def softmax(x, grad=False):
    """
    Softmax over axis 0, or its element-wise derivative.

    Each column of a 2-D input is normalised independently, so every column of
    the result is a probability distribution that sums to 1.

    Args:
        x: Input value(s) (logits) with the classes laid out along axis 0.
        grad: Boolean flag indicating whether to return the gradient (True) or
            the activation (False).

    Returns:
        The softmax probabilities s if grad is False. Otherwise s * (1 - s),
        which is the diagonal of the softmax Jacobian only (the cross terms
        between different classes are not included).
    """
    x = np.asarray(x)

    # Softmax is invariant to subtracting a constant from every logit of a
    # sample.  Shifting by the per-column maximum makes the largest exponent
    # exp(0) == 1, so exp() cannot overflow and the denominator is always >= 1:
    # no clipping of the logits and no epsilon in the denominator are needed,
    # and the probabilities sum to exactly 1.
    shifted = x - np.max(x, axis=0, keepdims=True) if x.size else x
    exp = np.exp(shifted)

    # Normalise each column by the sum of its exponentials
    s = exp / np.sum(exp, axis=0, keepdims=True)

    # Return activation or gradient based on the grad flag
    return np.multiply(s, 1. - s) if grad else s

# Cross-entropy loss function
def cross_entropy(x, y, grad=False):
    """
    Cross-entropy between predicted probabilities and one-hot targets.

    Args:
        x: Predicted probabilities, classes along axis 0.
        y: True labels (one-hot encoded), same layout as x.
        grad: When True, return the gradient term instead of the loss.

    Returns:
        If grad is False, the loss -sum(y * log(x)) reduced over axis 0 (one
        value per column). If grad is True, the difference (clipped x) - y;
        for a softmax output this is the gradient of the loss with respect to
        the pre-softmax inputs.
    """
    # Keep the predictions strictly inside (0, 1) so log() never receives 0
    eps = 1e-10
    clipped = np.clip(x, eps, 1. - eps)

    if not grad:
        # Only the entries selected by the one-hot target contribute
        return -np.sum(y * np.log(clipped), axis=0)

    return clipped - y


class Layer:
    """
    A fully connected layer without bias: output = activation(w . x).

    Attributes:
        input_dim: Number of input features.
        output_dim: Number of output units.
        activation: Callable applied to the weighted sum; called as
            activation(z) in the forward pass.
        w: float32 weight matrix of shape (output_dim, input_dim).
        grad_w: float32 zero matrix with the same shape as w; this class only
            initialises it and never updates it.
        x: Input seen by the most recent forward pass (None before the first).
        z: Weighted sum from the most recent forward pass (None before the first).
        a: Activated output from the most recent forward pass (None before the first).
    """

    def __init__(self, input_dim, output_dim, activation=relu):
        """
        Create the layer and draw its initial weights.

        Args:
            input_dim: Number of input features.
            output_dim: Number of output units.
            activation: Activation function applied to the weighted sum (default: relu).
        """
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.activation = activation

        # Weights are drawn from a zero-mean normal distribution whose standard
        # deviation shrinks with the fan-in, 1 / sqrt(input_dim)
        weight_shape = (output_dim, input_dim)
        init_std = 1.0 / np.sqrt(input_dim)
        self.w = np.random.normal(scale=init_std, size=weight_shape).astype(np.float32)
        self.grad_w = np.zeros(weight_shape, dtype=np.float32)

        # Forward-pass cache, filled in by __call__
        self.x = self.z = self.a = None

    def __call__(self, x):
        """
        Run the forward pass and cache the intermediate values.

        Args:
            x: Input data; its first axis must have length input_dim.

        Returns:
            The activated output, also stored in self.a.
        """
        self.x = x                        # kept for backpropagation
        self.z = self.w.dot(x)            # weighted sum
        self.a = self.activation(self.z)  # non-linearity
        return self.a
    

class NeuralNetwork:
    """
    A feed-forward stack of `Layer` objects trained with mini-batch gradient descent.

    Attributes:
        learning_rate: Step size used for the weight updates in `backwards`.
        batch_size: Nominal mini-batch size. `backwards` takes the real number
            of samples from the stored predictions, so this value is
            informational for callers that build the batches.
        loss_function: Callable `f(predictions, actuals, grad=False)` returning
            the per-sample loss, or its gradient term when `grad=True`.
        layers: List of `Layer` objects in forward order.
        predictions: Predictions passed to the last `loss` call (internal use).
        actuals: True labels passed to the last `loss` call (internal use).
        current_loss: Loss value computed by the last `loss` call (internal use).
    """

    def __init__(self, learning_rate=0.01, batch_size=32, loss_function=cross_entropy):
        """
        Initializes a new NeuralNetwork object.

        Args:
            learning_rate: Learning rate for gradient updates during training (default: 0.01).
            batch_size: Size of the data batch used for training (default: 32).
            loss_function: The function used to calculate loss (default: cross_entropy).
        """
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.loss_function = loss_function
        self.layers = []  # List to hold network layers

        # Internal variables to store network outputs for loss calculation
        self.predictions = None
        self.actuals = None
        self.current_loss = None

    def __call__(self, x):
        """
        Performs a forward pass through the network.

        Args:
            x: Input data to the network (features along axis 0, samples along axis 1).

        Returns:
            Activated output of the last layer in the network.
        """
        for layer in self.layers:
            x = layer(x)  # Pass input through each layer
        return x

    def __add__(self, layer):
        """
        Appends a Layer to the network in place and returns the network.

        The network itself is mutated and returned, which is what makes
        `model += layer` work. Operands that are not `Layer` instances are
        ignored and the network is returned unchanged.

        Args:
            layer: The Layer object to be added to the network.

        Returns:
            This network (self).

        Raises:
            AssertionError: If the new layer's input dimension does not match
                the output dimension of the current last layer.
        """
        if isinstance(layer, Layer):
            if self.layers:
                # Ensure compatible dimensions between consecutive layers
                assert layer.w.shape[1] == self.layers[-1].w.shape[0], "Incompatible layer dimensions!"
            self.layers.append(layer)
        return self

    def loss(self, predictions, actuals):
        """
        Calculates and stores the loss between predicted and actual outputs.

        The arguments are remembered so that a following `backwards` call can
        compute gradients for the same batch.

        Args:
            predictions: Network's predicted outputs.
            actuals: True labels for the data.

        Returns:
            The mean of `loss_function(predictions, actuals)`.
        """
        self.predictions = predictions
        self.actuals = actuals
        self.current_loss = np.mean(self.loss_function(predictions, actuals))  # Average loss over the batch
        return self.current_loss

    def backwards(self):
        """
        Backpropagates the last computed loss and applies one gradient-descent step.

        Uses the predictions/labels stored by `loss` and the inputs and weighted
        sums cached by each layer's most recent forward pass. For every layer
        the batch-averaged weight gradient is stored in `layer.dw` (mirrored to
        `layer.grad_w`), then `layer.w` is updated with
        `w - learning_rate * dw`.

        Raises:
            RuntimeError: If `loss` has not been called yet.
        """
        if self.predictions is None or self.actuals is None:
            raise RuntimeError("loss() must be called before backwards()")

        output_layer = self.layers[-1]

        # Gradient term of the configured loss (not a hard-coded cross-entropy,
        # so a custom loss_function is honoured in the backward pass too)
        loss_grad = self.loss_function(self.predictions, self.actuals, grad=True)
        # Element-wise derivative of the output activation
        activation_grad = output_layer.activation(output_layer.z, grad=True)

        # NOTE(review): with the default softmax + cross_entropy pair,
        # loss_grad is already (predictions - actuals); multiplying it by the
        # element-wise softmax derivative is not the exact cross-entropy
        # gradient w.r.t. the logits. Left as is to preserve the existing
        # training behaviour -- confirm the intent before changing it.
        delta = loss_grad * activation_grad

        # Take the number of samples from the data itself rather than from
        # self.batch_size, so batches of any size (e.g. a short final batch)
        # are handled correctly.
        n_samples = delta.shape[1] if delta.ndim > 1 else 1

        # One column vector per sample: (n_samples, units, 1)
        delta = delta.T.reshape(n_samples, -1, 1)

        # Per-sample outer product delta . x^T, averaged over the batch
        prev_activation = output_layer.x.T.reshape(n_samples, 1, -1)
        output_layer.dw = np.mean(delta * prev_activation, axis=0)

        # Walk backwards from the second-to-last layer to the first. All
        # gradients are computed with the pre-update weights; the update itself
        # happens only after this loop.
        for i in range(2, len(self.layers) + 1):
            layer = self.layers[-i]
            next_layer = self.layers[-i + 1]

            # Derivative of this layer's activation, one column vector per sample
            activation_grad = layer.activation(layer.z, grad=True)
            activation_grad = activation_grad.T.reshape(n_samples, -1, 1)

            # Chain rule: pull delta back through the next layer's weights
            delta = np.matmul(next_layer.w.transpose(), delta) * activation_grad

            # Batch-averaged weight gradient of this layer
            prev_activation = layer.x.T.reshape(n_samples, 1, -1)
            layer.dw = np.mean(np.matmul(delta, prev_activation), axis=0)

        # Gradient-descent step for every layer
        for layer in self.layers:
            # Keep the attribute declared by Layer in sync with the computed gradient
            layer.grad_w = layer.dw
            layer.w = layer.w - self.learning_rate * layer.dw


# Load the MNIST training CSV. Column 0 is used as the class label and the
# remaining columns as the input features. Forward slashes are accepted on
# every platform, whereas the previous backslash path only worked on Windows.
file = "MNIST_CSV/mnist_train.csv"
data = pd.read_csv(file, header=None).values.astype(np.float32)

# Build the (features, one_hot_label) pairs with whole-array operations:
# features are scaled by 1/255, and the 10x10 identity matrix is created once
# and indexed by label instead of being rebuilt for every row.
pixels = data[:, 1:] / 255
targets = np.eye(10, dtype=np.float32)[data[:, 0].astype(int)]
samples = list(zip(pixels, targets))

# Seed both generators: NumPy drives the weight initialisation, `random` the
# mini-batch sampling.
np.random.seed(10_000)
random.seed(10_000)

model = NeuralNetwork(learning_rate=.95, batch_size=64)
l1 = Layer(784, 50, relu)
l2 = Layer(50, 20, relu)
l3 = Layer(20, 10, softmax)

model += l1
model += l2
model += l3

# NOTE: each iteration trains on a single random mini-batch, so `epochs` counts
# weight updates rather than full passes over the data set.
epochs = 10_000
losses = []
for i in trange(epochs, ncols=1000):
    batch = random.sample(samples, model.batch_size)
    # Stack the samples as columns: X is (features, batch), Y is (classes, batch)
    X = np.column_stack([b[0] for b in batch]).astype(np.float32)
    Y = np.column_stack([b[1] for b in batch]).astype(np.float32)

    pred = model(X)
    loss = model.loss(pred, Y)
    losses.append(loss)
    model.backwards()

# Plot the per-batch loss recorded at every update and save it to disk
plt.plot(losses)
plt.xlabel("Epoch")
plt.ylabel("Average Batch Error")
plt.title("Training Error through Time")
plt.savefig('error.png', dpi=600)

  matplotlib.style.use("seaborn")


  0%|                                                                                                         …