<a href="https://colab.research.google.com/github/Jonathan-code-hub/MAT-422-Math-Methods-in-Data-Science/blob/main/Homework_3_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

3.7 Artificial Neural Networks

3.7.1 Mathematical Formulation

In [None]:
import numpy as np

# Sigmoid activation function #
def sigmoid(x):
    """
    Sigmoid function introduces non-linearity.
    Formula: σ(x) = 1 / (1 + exp(-x))

    The value is computed in an overflow-free form: exp() is only ever
    applied to non-positive numbers, so very negative inputs no longer
    overflow inside np.exp.

    Parameters: x - scalar or array-like of values to squash.
    Returns: σ(x) element-wise, with the same shape as x (a NumPy scalar
             when x is a scalar).
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # exp of a non-positive number: always in [0, 1] #
    # x >= 0: 1 / (1 + e^-x);  x < 0: the algebraically equal e^x / (1 + e^x) #
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # np.where returns a 0-d array for scalar input; unwrap it to a scalar #
    return result if result.ndim else result[()]

# Sigmoid derivative (for backpropagation) #
def sigmoid_derivative(x):
    """
    Derivative of the sigmoid, written in terms of the sigmoid's output.
    Formula: σ'(z) = σ(z) * (1 - σ(z))

    Note: x is taken to be an activation value σ(z) that has already been
    computed, not the raw input z; the function evaluates x * (1 - x).
    """
    activation = x
    complement = 1 - activation
    return activation * complement

# Binary cross-entropy loss function #
def binary_crossentropy(y_true, y_pred):
    """
    Binary cross-entropy loss, averaged over every entry.
    Formula: -mean(y_true * log(y_pred) + (1 - y_true) * log(1 - y_pred))

    Predictions are clipped to [1e-15, 1 - 1e-15] first so that neither
    logarithm is ever evaluated at 0.
    """
    tiny = 1e-15
    clipped = np.clip(y_pred, tiny, 1 - tiny)
    positive_term = y_true * np.log(clipped)
    negative_term = (1 - y_true) * np.log(1 - clipped)
    return -np.mean(positive_term + negative_term)

# Neural Network Class #
class NeuralNetwork:
    """
    Network with one hidden layer and sigmoid activations on both layers,
    trained with full-batch gradient descent.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """
        Create the parameters of both layers.
        Weights are drawn with np.random.rand (uniform on [0, 1));
        biases start at zero and are stored as (1, n) row vectors.
        """
        # Hidden layer parameters #
        self.bias_hidden = np.zeros((1, hidden_size))
        self.weights_input_hidden = np.random.rand(input_size, hidden_size)

        # Output layer parameters (drawn after the hidden weights) #
        self.bias_output = np.zeros((1, output_size))
        self.weights_hidden_output = np.random.rand(hidden_size, output_size)

    def forward(self, X):
        """
        Propagate X through both layers and return the output activations.
        Formula per layer: A = sigmoid(A_prev . W + b)
        The pre-activations and activations of both layers are cached on
        the instance for use by backward().
        """
        pre_hidden = np.dot(X, self.weights_input_hidden) + self.bias_hidden
        post_hidden = sigmoid(pre_hidden)

        pre_output = np.dot(post_hidden, self.weights_hidden_output) + self.bias_output
        post_output = sigmoid(pre_output)

        self.hidden_input, self.hidden_output = pre_hidden, post_hidden
        self.output_input, self.output = pre_output, post_output
        return post_output

    def backward(self, X, y, learning_rate):
        """
        One gradient-descent step based on the activations cached by the
        most recent forward() call. Parameters are modified in place.
        """
        # Output layer: prediction error scaled by the sigmoid slope #
        delta_out = (self.output - y) * sigmoid_derivative(self.output)

        # Hidden layer: push the output delta back through the (not yet updated) output weights #
        delta_hidden = np.dot(delta_out, self.weights_hidden_output.T)
        delta_hidden = delta_hidden * sigmoid_derivative(self.hidden_output)

        # Gradients, summed over the batch #
        grad_w_hidden = np.dot(X.T, delta_hidden)
        grad_w_out = np.dot(self.hidden_output.T, delta_out)
        grad_b_hidden = delta_hidden.sum(axis=0, keepdims=True)
        grad_b_out = delta_out.sum(axis=0, keepdims=True)

        # Gradient-descent update #
        self.weights_input_hidden -= grad_w_hidden * learning_rate
        self.weights_hidden_output -= grad_w_out * learning_rate
        self.bias_hidden -= grad_b_hidden * learning_rate
        self.bias_output -= grad_b_out * learning_rate

    def train(self, X, y, epochs, learning_rate):
        """
        Run `epochs` forward/backward passes over the whole of X.
        Every 1000th epoch (starting with epoch 0) the binary cross-entropy
        of the outputs from that epoch's forward pass is printed.
        """
        for epoch in range(epochs):
            self.forward(X)
            self.backward(X, y, learning_rate)

            if epoch % 1000 != 0:
                continue
            loss = binary_crossentropy(y, self.output)
            print(f"Epoch {epoch}, Loss: {loss}")

# Testing #

# Training data: XOR problem #
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])  # The four XOR input pairs, one per row #
y = np.array([[0], [1], [1], [0]])  # Expected XOR output for the matching row of X #

# Build a network with 2 inputs, 4 hidden neurons and 1 output neuron, then train it #
# on the whole truth table; train() prints the loss every 1000 epochs. #
# The weights come from np.random.rand and no seed is set here, so the numbers vary between runs. #
nn = NeuralNetwork(input_size=2, hidden_size=4, output_size=1)  # 2 inputs, 4 hidden neurons, 1 output neuron #
nn.train(X, y, epochs=10000, learning_rate=0.1)

# Run the trained network on the same four inputs and show its raw outputs #
predictions = nn.forward(X)
print("\nPredictions after training:")
print(predictions)

Epoch 0, Loss: 1.071208645248812
Epoch 1000, Loss: 0.6872738964694367
Epoch 2000, Loss: 0.6333346669080255
Epoch 3000, Loss: 0.4866074538371573
Epoch 4000, Loss: 0.2746997457412422
Epoch 5000, Loss: 0.15286127373626462
Epoch 6000, Loss: 0.10629882664579617
Epoch 7000, Loss: 0.08349518707257461
Epoch 8000, Loss: 0.06997241498750328
Epoch 9000, Loss: 0.060954064167824985

Predictions after training:
[[0.05586523]
 [0.94958945]
 [0.9486076 ]
 [0.05434364]]


3.7.2 Activation Functions

In [None]:
import numpy as np

# Sigmoid activation function #
def sigmoid(x):
    """
    # Sigmoid activation function: maps input between 0 and 1 #
    Formula: σ(x) = 1 / (1 + exp(-x))

    Computed in an overflow-free form: exp() is only applied to
    non-positive numbers, so very negative inputs do not overflow.

    Parameters: x - scalar or array-like of values to squash.
    Returns: σ(x) element-wise, same shape as x (NumPy scalar for scalar x).
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # Always in [0, 1], cannot overflow #
    # x >= 0: 1 / (1 + e^-x);  x < 0: the equivalent e^x / (1 + e^x) #
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Unwrap the 0-d array np.where produces for scalar input #
    return result if result.ndim else result[()]

# Sigmoid derivative #
def sigmoid_derivative(x):
    """
    # Slope of the sigmoid, used in backpropagation #
    Formula: σ'(z) = σ(z) * (1 - σ(z))

    Note: x must already be the sigmoid output σ(z); the function simply
    evaluates x * (1 - x) and does not apply the sigmoid itself.
    """
    one_minus = 1 - x
    slope = x * one_minus
    return slope

# ReLU activation function #
def relu(x):
    """
    # Rectified linear unit: negative values become 0, the rest pass through #
    Formula: ReLU(x) = max(0, x), applied element-wise
    """
    floor = 0
    rectified = np.maximum(floor, x)
    return rectified

# ReLU derivative #
def relu_derivative(x):
    """
    # Slope of ReLU, used in backpropagation #
    Formula: ReLU'(x) = 1 where x > 0, otherwise 0 (including at x == 0)
    """
    is_positive = x > 0
    return np.where(is_positive, 1, 0)

# Tanh activation function #
def tanh(x):
    """
    # Hyperbolic tangent activation: squashes input into the range -1 to 1 #
    Formula: tanh(x) = (e^x - e^-x) / (e^x + e^-x)
    Thin wrapper that delegates to np.tanh.
    """
    squashed = np.tanh(x)
    return squashed

# Tanh derivative #
def tanh_derivative(x):
    """
    # Slope of tanh, used in backpropagation #
    Formula: tanh'(x) = 1 - tanh(x)^2

    Note: x is the raw input (pre-activation); tanh is applied to it here.
    """
    activation = np.tanh(x)
    return 1 - activation ** 2

# Softmax activation function (used for multi-class classification) #
def softmax(x):
    """
    # Softmax activation function: converts logits to probabilities for multi-class classification #
    Formula: softmax(x_i) = exp(x_i) / sum(exp(x_j) for all j)

    Parameters: x - array of logits. Arrays with two or more dimensions are
                normalised along axis 1; a 1-D array is treated as a single
                vector and normalised along axis 0.
    Returns: array of the same shape whose entries along the normalised
             axis sum to 1.
    """
    x = np.asarray(x)
    axis = 1 if x.ndim > 1 else 0
    # Shift each slice by its OWN maximum: a single global maximum would let a slice #
    # lying far below it underflow to all zeros and produce 0 / 0 = NaN #
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_values = np.exp(shifted)
    return exp_values / np.sum(exp_values, axis=axis, keepdims=True)

# Example Neural Network with ReLU as activation function #
class NeuralNetwork:
    """
    # Network with one hidden layer whose two layers share one selectable activation #
    Supported activation_function values: 'sigmoid', 'relu', 'tanh'.
    """

    # Names accepted for activation_function #
    SUPPORTED_ACTIVATIONS = ('sigmoid', 'relu', 'tanh')

    def __init__(self, input_size, hidden_size, output_size, activation_function):
        """
        # Initialize the neural network with given sizes and selected activation function #
        Parameters:
            input_size, hidden_size, output_size - layer widths.
            activation_function - one of SUPPORTED_ACTIVATIONS.
        Raises: ValueError if activation_function is not supported
                (previously this only surfaced later, inside forward()).
        """
        if activation_function not in self.SUPPORTED_ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation_function {activation_function!r}; "
                f"expected one of {self.SUPPORTED_ACTIVATIONS}"
            )

        # Random initialization of weights, zero initialization of biases #
        self.weights_input_hidden = np.random.rand(input_size, hidden_size)  # Weights between input and hidden layers #
        self.weights_hidden_output = np.random.rand(hidden_size, output_size)  # Weights between hidden and output layers #
        self.bias_hidden = np.zeros((1, hidden_size))  # Bias for hidden layer #
        self.bias_output = np.zeros((1, output_size))  # Bias for output layer #

        # Choose the activation function #
        self.activation_function = activation_function

    def _activate(self, pre_activation):
        """
        # Apply the selected activation function to a layer's pre-activation values #
        """
        name = self.activation_function
        if name == 'sigmoid':
            return sigmoid(pre_activation)
        if name == 'relu':
            return relu(pre_activation)
        if name == 'tanh':
            return tanh(pre_activation)
        raise ValueError(f"Unsupported activation_function {name!r}")

    def _activation_gradient(self, pre_activation, activation):
        """
        # Slope of the selected activation function for one layer #
        Each derivative helper is handed the quantity it is written for:
        sigmoid_derivative works on the activation, tanh_derivative on the
        pre-activation (it applies tanh itself).
        """
        name = self.activation_function
        if name == 'sigmoid':
            return sigmoid_derivative(activation)
        if name == 'relu':
            # relu(z) > 0 exactly when z > 0, so the activation gives the same mask as z #
            return relu_derivative(activation)
        if name == 'tanh':
            # Passing the activation here would compute 1 - tanh(tanh(z))^2 #
            return tanh_derivative(pre_activation)
        raise ValueError(f"Unsupported activation_function {name!r}")

    def forward(self, X):
        """
        # Forward pass through the network #
        Caches the pre-activations and activations of both layers on the
        instance and returns the output activations.
        """
        # Input to hidden layer #
        self.hidden_input = np.dot(X, self.weights_input_hidden) + self.bias_hidden
        self.hidden_output = self._activate(self.hidden_input)

        # Hidden to output layer #
        self.output_input = np.dot(self.hidden_output, self.weights_hidden_output) + self.bias_output
        self.output = self._activate(self.output_input)

        return self.output

    def backward(self, X, y, learning_rate):
        """
        # Backpropagation to adjust weights and biases #
        Uses the values cached by the most recent forward() call and
        updates the parameters in place.
        """
        # Compute the error at the output layer #
        output_error = self.output - y
        output_delta = output_error * self._activation_gradient(self.output_input, self.output)

        # Compute the error at the hidden layer (uses the output weights before they are updated) #
        hidden_error = output_delta.dot(self.weights_hidden_output.T)
        hidden_delta = hidden_error * self._activation_gradient(self.hidden_input, self.hidden_output)

        # Update weights and biases using gradient descent #
        self.weights_input_hidden -= X.T.dot(hidden_delta) * learning_rate
        self.weights_hidden_output -= self.hidden_output.T.dot(output_delta) * learning_rate
        self.bias_hidden -= np.sum(hidden_delta, axis=0, keepdims=True) * learning_rate
        self.bias_output -= np.sum(output_delta, axis=0, keepdims=True) * learning_rate

    def train(self, X, y, epochs, learning_rate):
        """
        # Train the neural network using forward and backward passes #
        Every 1000th epoch (starting with epoch 0) the binary cross-entropy
        of that epoch's forward-pass outputs is printed.
        NOTE: binary_crossentropy is not defined next to this class; it must
        already be available in the enclosing namespace.
        """
        for epoch in range(epochs):
            self.forward(X)
            self.backward(X, y, learning_rate)

            if epoch % 1000 == 0:
                loss = binary_crossentropy(y, self.output)  # Loss function to calculate how well the model is performing #
                print(f"Epoch {epoch}, Loss: {loss}")

# Example usage with the XOR problem with ReLU activation:

# Training data: XOR problem #
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])  # The four XOR input pairs, one per row #
y = np.array([[0], [1], [1], [0]])  # Expected XOR output for the matching row of X #

# Initialize and train the neural network with ReLU activation #
# The weights come from np.random.rand and no seed is set here, so the numbers vary between runs. #
nn = NeuralNetwork(input_size=2, hidden_size=4, output_size=1, activation_function='relu')  # 2 inputs, 4 hidden neurons, 1 output neuron #
nn.train(X, y, epochs=10000, learning_rate=0.1)  # Prints the loss every 1000 epochs #

# Run the trained network on the same four inputs and show its raw outputs #
predictions = nn.forward(X)
print("\nPredictions after training:")
print(predictions)


Epoch 0, Loss: 8.69008163154445
Epoch 1000, Loss: 9.992007221626415e-16
Epoch 2000, Loss: 9.992007221626415e-16
Epoch 3000, Loss: 9.992007221626415e-16
Epoch 4000, Loss: 9.992007221626415e-16
Epoch 5000, Loss: 9.992007221626415e-16
Epoch 6000, Loss: 9.992007221626415e-16
Epoch 7000, Loss: 9.992007221626415e-16
Epoch 8000, Loss: 9.992007221626415e-16
Epoch 9000, Loss: 9.992007221626415e-16

Predictions after training:
[[4.45549659e-16]
 [1.00000000e+00]
 [1.00000000e+00]
 [3.60450649e-16]]


3.7.3 Cost Function

In [None]:
import numpy as np

# Mean Squared Error (MSE) cost function #
def mean_squared_error(y_true, y_pred):
    """
    # Mean Squared Error (MSE): average of the squared differences between targets and predictions #
    Formula: MSE = (1/n) * sum((y_true - y_pred)^2)
    The mean is taken over every entry of the difference.
    """
    residual = y_true - y_pred
    squared = residual ** 2
    return np.mean(squared)

# Binary Cross-Entropy (Log Loss) cost function #
def binary_crossentropy(y_true, y_pred):
    """
    # Binary Cross-Entropy (log loss): used for binary classification tasks #
    Formula: BCE = -mean(y_true * log(y_pred) + (1 - y_true) * log(1 - y_pred))
    Predictions are clipped to [1e-15, 1 - 1e-15] so no logarithm is taken at 0.
    """
    eps = 1e-15
    safe_pred = np.clip(y_pred, eps, 1 - eps)
    log_likelihood = y_true * np.log(safe_pred) + (1 - y_true) * np.log(1 - safe_pred)
    mean_log_likelihood = np.mean(log_likelihood)
    return -mean_log_likelihood

# Categorical Cross-Entropy cost function #
def categorical_crossentropy(y_true, y_pred):
    """
    # Categorical Cross-Entropy: used for multi-class classification tasks #
    Formula: CCE = -mean over rows of sum over axis 1 of (y_true * log(y_pred))
    Predictions are clipped to [1e-15, 1 - 1e-15] so no logarithm is taken at 0.
    """
    eps = 1e-15
    safe_pred = np.clip(y_pred, eps, 1 - eps)
    per_row = np.sum(y_true * np.log(safe_pred), axis=1)  # One value per row #
    return -np.mean(per_row)

# Example Neural Network using MSE as cost function #
class NeuralNetwork:
    """
    # Network with one hidden layer, a selectable activation and a selectable cost function #
    Supported activation_function values: 'sigmoid', 'relu', 'tanh'.
    NOTE: the activation helpers (sigmoid, relu, tanh and their derivatives)
    are not defined next to this class; they must already be available in
    the enclosing namespace.
    """

    # Names accepted for activation_function #
    SUPPORTED_ACTIVATIONS = ('sigmoid', 'relu', 'tanh')

    def __init__(self, input_size, hidden_size, output_size, activation_function, cost_function):
        """
        # Initialize the neural network with given sizes, selected activation function, and cost function #
        Parameters:
            input_size, hidden_size, output_size - layer widths.
            activation_function - one of SUPPORTED_ACTIVATIONS.
            cost_function - callable invoked as cost_function(y, output) by
                            train() to report progress.
        Raises: ValueError if activation_function is not supported
                (previously this only surfaced later, inside forward()).
        """
        if activation_function not in self.SUPPORTED_ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation_function {activation_function!r}; "
                f"expected one of {self.SUPPORTED_ACTIVATIONS}"
            )

        # Random initialization of weights, zero initialization of biases #
        self.weights_input_hidden = np.random.rand(input_size, hidden_size)  # Weights between input and hidden layers #
        self.weights_hidden_output = np.random.rand(hidden_size, output_size)  # Weights between hidden and output layers #
        self.bias_hidden = np.zeros((1, hidden_size))  # Bias for hidden layer #
        self.bias_output = np.zeros((1, output_size))  # Bias for output layer #

        # Choose the activation function and cost function #
        self.activation_function = activation_function
        self.cost_function = cost_function

    def _activate(self, pre_activation):
        """
        # Apply the selected activation function to a layer's pre-activation values #
        """
        name = self.activation_function
        if name == 'sigmoid':
            return sigmoid(pre_activation)
        if name == 'relu':
            return relu(pre_activation)
        if name == 'tanh':
            return tanh(pre_activation)
        raise ValueError(f"Unsupported activation_function {name!r}")

    def _activation_gradient(self, pre_activation, activation):
        """
        # Slope of the selected activation function for one layer #
        Each derivative helper is handed the quantity it is written for:
        sigmoid_derivative works on the activation, tanh_derivative on the
        pre-activation (it applies tanh itself).
        """
        name = self.activation_function
        if name == 'sigmoid':
            return sigmoid_derivative(activation)
        if name == 'relu':
            # relu(z) > 0 exactly when z > 0, so the activation gives the same mask as z #
            return relu_derivative(activation)
        if name == 'tanh':
            # Passing the activation here would compute 1 - tanh(tanh(z))^2 #
            return tanh_derivative(pre_activation)
        raise ValueError(f"Unsupported activation_function {name!r}")

    def forward(self, X):
        """
        # Forward pass through the network #
        Caches the pre-activations and activations of both layers on the
        instance and returns the output activations.
        """
        # Input to hidden layer #
        self.hidden_input = np.dot(X, self.weights_input_hidden) + self.bias_hidden
        self.hidden_output = self._activate(self.hidden_input)

        # Hidden to output layer #
        self.output_input = np.dot(self.hidden_output, self.weights_hidden_output) + self.bias_output
        self.output = self._activate(self.output_input)

        return self.output

    def backward(self, X, y, learning_rate):
        """
        # Backpropagation to adjust weights and biases #
        Uses the values cached by the most recent forward() call and
        updates the parameters in place. The update rule does not depend
        on self.cost_function, which is only used for reporting.
        """
        # Compute the error at the output layer #
        output_error = self.output - y
        output_delta = output_error * self._activation_gradient(self.output_input, self.output)

        # Compute the error at the hidden layer (uses the output weights before they are updated) #
        hidden_error = output_delta.dot(self.weights_hidden_output.T)
        hidden_delta = hidden_error * self._activation_gradient(self.hidden_input, self.hidden_output)

        # Update weights and biases using gradient descent #
        self.weights_input_hidden -= X.T.dot(hidden_delta) * learning_rate
        self.weights_hidden_output -= self.hidden_output.T.dot(output_delta) * learning_rate
        self.bias_hidden -= np.sum(hidden_delta, axis=0, keepdims=True) * learning_rate
        self.bias_output -= np.sum(output_delta, axis=0, keepdims=True) * learning_rate

    def train(self, X, y, epochs, learning_rate):
        """
        # Train the neural network using forward and backward passes #
        Every 1000th epoch (starting with epoch 0) the selected cost
        function is evaluated on that epoch's forward-pass outputs and printed.
        """
        for epoch in range(epochs):
            self.forward(X)
            self.backward(X, y, learning_rate)

            # Calculate and print the cost (loss) every 1000 epochs #
            if epoch % 1000 == 0:
                cost = self.cost_function(y, self.output)  # Use the selected cost function #
                print(f"Epoch {epoch}, Cost: {cost}")

# Example usage for XOR problem with MSE cost function:

# Training data: XOR problem #
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])  # The four XOR input pairs, one per row #
y = np.array([[0], [1], [1], [0]])  # Expected XOR output for the matching row of X #

# Initialize and train the neural network with MSE cost function #
# The weights come from np.random.rand and no seed is set here, so the numbers vary between runs. #
nn = NeuralNetwork(input_size=2, hidden_size=4, output_size=1, activation_function='relu', cost_function=mean_squared_error)  # 2 inputs, 4 hidden neurons, 1 output neuron #
nn.train(X, y, epochs=10000, learning_rate=0.1)  # Prints the cost every 1000 epochs #

# Run the trained network on the same four inputs and show its raw outputs #
predictions = nn.forward(X)
print("\nPredictions after training:")
print(predictions)


Epoch 0, Cost: 0.5319846010373317
Epoch 1000, Cost: 3.998820270181945e-31
Epoch 2000, Cost: 4.027413032227868e-31
Epoch 3000, Cost: 4.027413032227868e-31
Epoch 4000, Cost: 4.027413032227869e-31
Epoch 5000, Cost: 4.027413032227868e-31
Epoch 6000, Cost: 4.027413032227868e-31
Epoch 7000, Cost: 4.027413032227868e-31
Epoch 8000, Cost: 4.027413032227868e-31
Epoch 9000, Cost: 4.027413032227868e-31

Predictions after training:
[[8.48700474e-16]
 [1.00000000e+00]
 [1.00000000e+00]
 [3.72544853e-16]]


3.7.4 Backpropagation

In [None]:
import numpy as np

# Sigmoid activation function #
def sigmoid(x):
    """
    # Sigmoid activation function: maps input between 0 and 1 #
    Formula: σ(x) = 1 / (1 + exp(-x))

    Computed in an overflow-free form: exp() is only applied to
    non-positive numbers, so very negative inputs do not overflow.

    Parameters: x - scalar or array-like of values to squash.
    Returns: σ(x) element-wise, same shape as x (NumPy scalar for scalar x).
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # Always in [0, 1], cannot overflow #
    # x >= 0: 1 / (1 + e^-x);  x < 0: the equivalent e^x / (1 + e^x) #
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Unwrap the 0-d array np.where produces for scalar input #
    return result if result.ndim else result[()]

# Sigmoid derivative #
def sigmoid_derivative(x):
    """
    # Slope of the sigmoid, used in backpropagation #
    Formula: σ'(z) = σ(z) * (1 - σ(z))

    Note: x must already be the sigmoid output σ(z); the function simply
    evaluates x * (1 - x) and does not apply the sigmoid itself.
    """
    sig = x
    remainder = 1 - sig
    return sig * remainder

# Mean Squared Error (MSE) cost function #
def mean_squared_error(y_true, y_pred):
    """
    # Mean Squared Error (MSE): average of the squared differences between targets and predictions #
    Formula: MSE = (1/n) * sum((y_true - y_pred)^2)
    The mean is taken over every entry of the difference.
    """
    error = y_true - y_pred
    squared_error = error ** 2
    return np.mean(squared_error)

# Neural Network class using Backpropagation #
class NeuralNetwork:
    """
    # Sigmoid network with one hidden layer, trained by backpropagation #
    The learning rate is fixed at construction time.
    """

    def __init__(self, input_size, hidden_size, output_size, learning_rate):
        """
        # Set up the parameters of both layers and remember the learning rate #
        Weights are drawn with np.random.rand; biases start at zero and are
        stored as (1, n) row vectors.
        """
        self.learning_rate = learning_rate  # Step size used by backward() #

        # Hidden layer parameters #
        self.bias_hidden = np.zeros((1, hidden_size))
        self.weights_input_hidden = np.random.rand(input_size, hidden_size)

        # Output layer parameters (drawn after the hidden weights) #
        self.bias_output = np.zeros((1, output_size))
        self.weights_hidden_output = np.random.rand(hidden_size, output_size)

    def forward(self, X):
        """
        # Propagate X through both layers and return the output activations #
        The pre-activations and activations of both layers are cached on
        the instance for backward().
        """
        pre_hidden = np.dot(X, self.weights_input_hidden) + self.bias_hidden
        post_hidden = sigmoid(pre_hidden)

        pre_output = np.dot(post_hidden, self.weights_hidden_output) + self.bias_output
        post_output = sigmoid(pre_output)

        self.hidden_input, self.hidden_output = pre_hidden, post_hidden
        self.output_input, self.output = pre_output, post_output
        return post_output

    def backward(self, X, y):
        """
        # One gradient-descent step from the values cached by the last forward() #
        Parameters are modified in place using self.learning_rate.
        """
        step = self.learning_rate

        # Output layer: prediction error scaled by the sigmoid slope #
        delta_out = (self.output - y) * sigmoid_derivative(self.output)

        # Hidden layer: push the output delta back through the (not yet updated) output weights #
        delta_hidden = np.dot(delta_out, self.weights_hidden_output.T)
        delta_hidden = delta_hidden * sigmoid_derivative(self.hidden_output)

        # Gradients, summed over the batch #
        grad_w_hidden = np.dot(X.T, delta_hidden)
        grad_w_out = np.dot(self.hidden_output.T, delta_out)
        grad_b_hidden = delta_hidden.sum(axis=0, keepdims=True)
        grad_b_out = delta_out.sum(axis=0, keepdims=True)

        # Gradient-descent update #
        self.weights_input_hidden -= grad_w_hidden * step
        self.weights_hidden_output -= grad_w_out * step
        self.bias_hidden -= grad_b_hidden * step
        self.bias_output -= grad_b_out * step

    def train(self, X, y, epochs):
        """
        # Run `epochs` forward/backward passes over the whole of X #
        Every 1000th epoch (starting with epoch 0) the MSE of that epoch's
        forward-pass outputs is printed.
        """
        for epoch in range(epochs):
            self.forward(X)
            self.backward(X, y)

            if epoch % 1000 != 0:
                continue
            cost = mean_squared_error(y, self.output)
            print(f"Epoch {epoch}, Cost: {cost}")

# Example usage for XOR problem with backpropagation:

# Training data: XOR problem #
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])  # The four XOR input pairs, one per row #
y = np.array([[0], [1], [1], [0]])  # Expected XOR output for the matching row of X #

# Initialize the neural network with 2 inputs, 4 hidden neurons, and 1 output neuron #
# The weights come from np.random.rand and no seed is set here, so the numbers vary between runs. #
nn = NeuralNetwork(input_size=2, hidden_size=4, output_size=1, learning_rate=0.1)

# Train the neural network for 10000 epochs; the cost is printed every 1000 epochs #
nn.train(X, y, epochs=10000)

# Run the trained network on the same four inputs and show its raw outputs #
predictions = nn.forward(X)  # Get predictions after training #
print("\nPredictions after training:")
print(predictions)


Epoch 0, Cost: 0.3608962283912063
Epoch 1000, Cost: 0.24938002718812224
Epoch 2000, Cost: 0.24414817447705864
Epoch 3000, Cost: 0.2052733394704204
Epoch 4000, Cost: 0.12705484851328977
Epoch 5000, Cost: 0.039235449659557727
Epoch 6000, Cost: 0.015241735558838017
Epoch 7000, Cost: 0.008408863888079824
Epoch 8000, Cost: 0.00556349673451386
Epoch 9000, Cost: 0.004074358333295972

Predictions after training:
[[0.05979164]
 [0.94586745]
 [0.94599085]
 [0.05737543]]


3.7.5 Backpropagation Algorithm

1. Initialize the weights and biases randomly
2. For each epoch:
   a. For each training example (input x, true output y):
      i. Perform a forward pass:
         - Calculate the activations for each layer of the network
      ii. Calculate the loss between the predicted output and the true output
      iii. Perform a backward pass:
         - Compute the gradients for each layer's weights and biases
         - Propagate the error back through the network
      iv. Update the weights and biases using the gradients and the learning rate
3. Repeat step 2 for the specified number of epochs

In [None]:
import numpy as np

# Sigmoid activation function #
def sigmoid(x):
    """
    # Sigmoid activation function: maps input between 0 and 1 #
    Formula: σ(x) = 1 / (1 + exp(-x))

    Computed in an overflow-free form: exp() is only applied to
    non-positive numbers, so very negative inputs do not overflow.

    Parameters: x - scalar or array-like of values to squash.
    Returns: σ(x) element-wise, same shape as x (NumPy scalar for scalar x).
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # Always in [0, 1], cannot overflow #
    # x >= 0: 1 / (1 + e^-x);  x < 0: the equivalent e^x / (1 + e^x) #
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Unwrap the 0-d array np.where produces for scalar input #
    return result if result.ndim else result[()]

# Sigmoid derivative function #
def sigmoid_derivative(x):
    """
    # Slope of the sigmoid, used in backpropagation #
    Formula: σ'(z) = σ(z) * (1 - σ(z))

    Note: x must already be the sigmoid output σ(z); the function simply
    evaluates x * (1 - x) and does not apply the sigmoid itself.
    """
    output = x
    gap_to_one = 1 - output
    return output * gap_to_one

# Mean Squared Error (MSE) cost function #
def mean_squared_error(y_true, y_pred):
    """
    # Mean Squared Error (MSE): average of the squared differences between targets and predictions #
    Formula: MSE = (1/n) * sum((y_true - y_pred)^2)
    The mean is taken over every entry of the difference.
    """
    deviation = y_true - y_pred
    squared_deviation = deviation ** 2
    return np.mean(squared_deviation)

# Neural Network class with Backpropagation #
class NeuralNetwork:
    """
    # Sigmoid network with one hidden layer, implementing the backpropagation algorithm #
    The learning rate is fixed at construction time.
    """

    def __init__(self, input_size, hidden_size, output_size, learning_rate):
        """
        # Set up the parameters of both layers and remember the learning rate #
        Weights are drawn with np.random.rand; biases start at zero and are
        stored as (1, n) row vectors.
        """
        # The hidden weights are drawn before the output weights #
        hidden_weights = np.random.rand(input_size, hidden_size)
        output_weights = np.random.rand(hidden_size, output_size)

        self.weights_input_hidden = hidden_weights
        self.weights_hidden_output = output_weights
        self.bias_hidden = np.zeros((1, hidden_size))
        self.bias_output = np.zeros((1, output_size))
        self.learning_rate = learning_rate

    def forward(self, X):
        """
        # Propagate X through both layers and return the output activations #
        The pre-activations and activations of both layers are cached on
        the instance for backward().
        """
        # Layer 1: input -> hidden #
        z_hidden = np.dot(X, self.weights_input_hidden) + self.bias_hidden
        a_hidden = sigmoid(z_hidden)
        self.hidden_input = z_hidden
        self.hidden_output = a_hidden

        # Layer 2: hidden -> output #
        z_out = np.dot(a_hidden, self.weights_hidden_output) + self.bias_output
        a_out = sigmoid(z_out)
        self.output_input = z_out
        self.output = a_out

        return a_out

    def backward(self, X, y):
        """
        # One gradient-descent step from the values cached by the last forward() #
        Parameters are modified in place using self.learning_rate.
        """
        # Deltas, output layer first; the hidden delta uses the output weights before they change #
        delta_out = (self.output - y) * sigmoid_derivative(self.output)
        delta_hidden = np.dot(delta_out, self.weights_hidden_output.T) * sigmoid_derivative(self.hidden_output)

        # Pair every parameter array with its batch-summed gradient #
        updates = (
            (self.weights_input_hidden, np.dot(X.T, delta_hidden)),
            (self.weights_hidden_output, np.dot(self.hidden_output.T, delta_out)),
            (self.bias_hidden, delta_hidden.sum(axis=0, keepdims=True)),
            (self.bias_output, delta_out.sum(axis=0, keepdims=True)),
        )

        # `-=` on an ndarray works in place, so this changes the arrays held by the instance #
        for parameter, gradient in updates:
            parameter -= gradient * self.learning_rate

    def train(self, X, y, epochs):
        """
        # Run `epochs` forward/backward passes over the whole of X #
        Every 1000th epoch (starting with epoch 0) the MSE of that epoch's
        forward-pass outputs is printed.
        """
        for epoch in range(epochs):
            self.forward(X)
            self.backward(X, y)

            report_now = epoch % 1000 == 0
            if report_now:
                cost = mean_squared_error(y, self.output)
                print(f"Epoch {epoch}, Cost: {cost}")

# Example usage for XOR problem with backpropagation:

# Training data: XOR problem #
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])  # The four XOR input pairs, one per row #
y = np.array([[0], [1], [1], [0]])  # Expected XOR output for the matching row of X #

# Initialize the neural network with 2 inputs, 4 hidden neurons, and 1 output neuron #
# The weights come from np.random.rand and no seed is set here, so the numbers vary between runs. #
nn = NeuralNetwork(input_size=2, hidden_size=4, output_size=1, learning_rate=0.1)

# Train the neural network for 10000 epochs; the cost is printed every 1000 epochs #
nn.train(X, y, epochs=10000)

# Run the trained network on the same four inputs and show its raw outputs #
predictions = nn.forward(X)  # Get predictions after training #
print("\nPredictions after training:")
print(predictions)


Epoch 0, Cost: 0.3266735875006339
Epoch 1000, Cost: 0.24796296082765684
Epoch 2000, Cost: 0.22607733967232335
Epoch 3000, Cost: 0.15680648954609744
Epoch 4000, Cost: 0.06357736031223811
Epoch 5000, Cost: 0.022064028899517127
Epoch 6000, Cost: 0.010947610937981349
Epoch 7000, Cost: 0.00680689301708027
Epoch 8000, Cost: 0.004794309559946383
Epoch 9000, Cost: 0.0036421719184096613

Predictions after training:
[[0.06082689]
 [0.94913223]
 [0.94935672]
 [0.05275093]]
