# Multi-layer Neural Networks

#### Define activation functions and derivatives

In [4]:
import numpy as np

# Base class for activation functions
class ActivationFunction:
    def __call__(self, x):
        raise NotImplementedError

    def derivative(self, x):
        raise NotImplementedError

# ReLU Activation Function
class ReLU(ActivationFunction):
    def __call__(self, x):
        return np.maximum(0, x)

    def derivative(self, x):
        return (x > 0).astype(float)

# Sigmoid Activation Function
class Sigmoid(ActivationFunction):
    def __call__(self, x):
        return 1 / (1 + np.exp(-x))

    def derivative(self, x):
        sigmoid_x = self.__call__(x)
        return sigmoid_x * (1 - sigmoid_x)

#### Multilayer neural network

In [7]:
class MultiLayerNet:
    def __init__(self, layer_sizes, activation_funcs, loss_function, reg_lambda=0.01):
        self.layer_sizes = layer_sizes
        self.activation_funcs = activation_funcs
        self.loss_function = loss_function
        self.reg_lambda = reg_lambda
        self.params = {}
        
        # Initialize weights and biases
        for i in range(1, len(layer_sizes)):
            self.params[f'W{i}'] = (
                    np.random.randn(layer_sizes[i-1], 
                                    layer_sizes[i]) / 
                    np.sqrt(layer_sizes[i-1]))
            self.params[f'b{i}'] = np.zeros((1, layer_sizes[i]))

    def forward(self, X):
        cache = {'A0': X}
        for i in range(1, len(self.layer_sizes)):
            W, b = self.params[f'W{i}'], self.params[f'b{i}']
            Z = np.dot(cache[f'A{i-1}'], W) + b
            activation_func = self.activation_funcs[i-1]
            A = activation_func(Z)
            cache[f'Z{i}'] = Z
            cache[f'A{i}'] = A
        return A, cache

    def backward(self, cache, Y):
        grads = {}
        output = cache[f'A{len(self.layer_sizes)-1}']
        dA_prev = self.loss_derivative(Y, output)
        
        for i in reversed(range(1, len(self.layer_sizes))):
            dZ = (dA_prev * 
                  self.activation_funcs[i-1].derivative(cache[f'Z{i}']))
            dA_prev = np.dot(dZ, self.params[f'W{i}'].T)
            grads[f'dW{i}'] = (np.dot(cache[f'A{i-1}'].T, dZ) + 
                               self.reg_lambda * self.params[f'W{i}'])
            grads[f'db{i}'] = np.sum(dZ, axis=0, keepdims=True)
        return grads

    def update_params(self, grads, learning_rate):
        for i in range(1, len(self.layer_sizes)):
            self.params[f'W{i}'] -= learning_rate * grads[f'dW{i}']
            self.params[f'b{i}'] -= learning_rate * grads[f'db{i}']

    # Adding a simple MSE loss function for demonstration
    def compute_loss(self, Y, output):
        return np.mean(np.square(Y - output))
    
    # Derivative of MSE loss with respect to output
    def loss_derivative(self, Y, output):
        return 2 * (output - Y) / Y.size

    def train(self, X, Y, epochs, learning_rate):
        for epoch in range(epochs):
            output, cache = self.forward(X)
            grads = self.backward(cache, Y)
            self.update_params(grads, learning_rate)
            if epoch % 100 == 0:
                loss = self.compute_loss(Y, output)
                print(f"Epoch {epoch}, Loss: {loss}")

In [None]:
#### Test

In [8]:
# Sample data generation for a simple linear relationship: y = 2x1 - 3x2 + 5
np.random.seed(42)  # For reproducibility
X_train = np.random.rand(100, 2)
Y_train = 2*X_train[:, 0] - 3*X_train[:, 1] + 5
Y_train = Y_train.reshape(-1, 1)  
# Reshape for consistency with our network's expected input

# Initialize the network
layer_sizes = [2, 4, 1]  
# Input layer (2 neurons), 
# one hidden layer (4 neurons), 
# output layer (1 neuron)

activation_funcs = [ReLU(), Sigmoid()]  
# Using ReLU for hidden layers and Sigmoid for the output layer

nn = MultiLayerNet(layer_sizes, activation_funcs, loss_function=None)

# Training the network
nn.train(X_train, Y_train, epochs=2000, learning_rate=0.01)

Epoch 0, Loss: 17.329284265847377
Epoch 100, Loss: 13.80542313681342
Epoch 200, Loss: 13.516063014546582
Epoch 300, Loss: 13.45837820447229
Epoch 400, Loss: 13.436331545397715
Epoch 500, Loss: 13.425149590372207
Epoch 600, Loss: 13.41851624253497
Epoch 700, Loss: 13.414180419127522
Epoch 800, Loss: 13.411146367981829
Epoch 900, Loss: 13.408920308228062
Epoch 1000, Loss: 13.407225978470265
Epoch 1100, Loss: 13.405898362495073
Epoch 1200, Loss: 13.404833656333055
Epoch 1300, Loss: 13.403964073058132
Epoch 1400, Loss: 13.403242769599933
Epoch 1500, Loss: 13.402636307371633
Epoch 1600, Loss: 13.402120490638245
Epoch 1700, Loss: 13.401677427614464
Epoch 1800, Loss: 13.401293449234881
Epoch 1900, Loss: 13.400958094163787


## Improvements

#### Mini-batch training

In [None]:
# Mini-batch training
batch_size = 64
num_batches = len(X) // batch_size
for epoch in range(num_epochs):
    for i in range(num_batches):
        # Select a random batch of data
        batch_mask = np.random.choice(len(X), batch_size)
        X_batch = X[batch_mask]
        y_batch = y[batch_mask]

        # Forward and backward propagation using the batch data
        # ...


#### Optimization with Adam

In [None]:
# Adam optimization
beta1, beta2 = 0.9, 0.999
eps = 1e-8
mW1, vW1 = 0, 0
mW2, vW2 = 0, 0
for epoch in range(num_epochs):
    # Forward and backward propagation
    # ...
    # Update parameters using Adam optimization
    mW1 = beta1 * mW1 + (1 - beta1) * dW1
    vW1 = beta2 * vW1 + (1 - beta2) * (dW1 ** 2)
    mW2 = beta1 * mW2 + (1 - beta1) * dW2
    vW2 = beta2 * vW2 + (1 - beta2) * (dW2 ** 2)
    self.params['W1'] -= learning_rate * mW1 / (np.sqrt(vW1) + eps)
    self.params['b1'] -= learning_rate * db1
    self.params['W2'] -= learning_rate * mW2 / (np.sqrt(vW2) + eps)
    self.params['b2'] -= learning_rate * db2
