# A classification neural network

### Two layers: sigmoid hidden activation, softmax output activation, cross-entropy loss

In [None]:
import numpy as np


def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))``, element-wise.

    The exponential is only ever evaluated at a non-positive argument, so
    large-magnitude negative inputs do not overflow ``np.exp`` (the naive
    form emits a RuntimeWarning for e.g. ``x = -1000``).

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        The sigmoid of ``x``, with the same shape as ``x``.
    """
    x = np.asarray(x)
    # z = exp(-|x|) can underflow to 0 but can never overflow.
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- same function.
    numerator = np.where(x >= 0, np.ones_like(z), z)
    return numerator / (1 + z)


def sigmoid_derivative(x):
    """Return the derivative of the sigmoid evaluated at ``x``.

    Uses the identity s'(x) = s(x) * (1 - s(x)), where s is ``sigmoid``.
    """
    s = sigmoid(x)
    return s * (1 - s)


# Define the softmax function
def softmax(x):
    """Return the row-wise softmax of a 2-D array of scores.

    Each row's maximum is subtracted before exponentiating; this leaves the
    result unchanged but keeps ``np.exp`` from overflowing on large scores.
    Normalisation runs along axis 1, so every output row sums to 1.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    exps = np.exp(x - row_max)
    row_totals = np.sum(exps, axis=1, keepdims=True)
    return exps / row_totals


# Define the cross-entropy loss and its derivative
def cross_entropy_loss(y_true, y_pred):
    """Return the mean cross-entropy between targets and predictions.

    For each row the loss is ``-sum(y_true * log(y_pred))`` taken along
    axis 1; the returned scalar is the mean of those row losses.
    """
    # Keep predictions strictly inside (0, 1) so log() never sees 0.
    eps = 1e-15
    clipped = np.clip(y_pred, eps, 1 - eps)
    per_sample = np.sum(y_true * np.log(clipped), axis=1)
    return -np.mean(per_sample)


def cross_entropy_derivative(y_true, y_pred):
    """Return ``-y_true / y_pred`` element-wise, with ``y_pred`` clipped.

    This is the derivative of ``-sum(y_true * log(y_pred))`` with respect
    to each entry of ``y_pred``.
    """
    # Clip predictions away from 0 (and 1) so the division stays finite.
    eps = 1e-15
    safe_pred = np.clip(y_pred, eps, 1 - eps)
    gradient = -y_true / safe_pred
    return gradient


# Initialize the network parameters
# Layer widths: input features -> hidden units -> output classes.
input_size = 3
hidden_size = 4
output_size = 2  # two output classes (binary classification assumed)

# Seed the global NumPy generator so the initial weights are reproducible.
np.random.seed(42)

# Input -> hidden layer: standard-normal weights, zero biases.
W1 = np.random.standard_normal((input_size, hidden_size))
b1 = np.zeros((hidden_size,))

# Hidden -> output layer: standard-normal weights, zero biases.
W2 = np.random.standard_normal((hidden_size, output_size))
b2 = np.zeros((output_size,))


# Forward pass of the network
def forward_pass(X):
    """Run the inputs ``X`` through both layers of the network.

    Uses the module-level parameters ``W1``, ``b1``, ``W2`` and ``b2``.

    Returns:
        A tuple ``(Z1, A1, Z2, A2)``: hidden pre-activations, hidden sigmoid
        activations, output pre-activations, and softmax outputs.
    """
    # Hidden layer: affine map followed by the sigmoid.
    hidden_pre = np.dot(X, W1) + b1
    hidden_act = sigmoid(hidden_pre)

    # Output layer: affine map followed by the softmax.
    output_pre = np.dot(hidden_act, W2) + b2
    output_prob = softmax(output_pre)

    return hidden_pre, hidden_act, output_pre, output_prob


# Backward pass of the network
def backward_pass(X, Y, Z1, A1, Z2, A2):
    """Compute the loss and the parameter gradients for one batch.

    Reads the module-level weight matrix ``W2`` to propagate the error back
    to the hidden layer.

    Args:
        X: Network inputs, as given to ``forward_pass``.
        Y: Targets with the same shape as ``A2``. The ``A2 - Y`` shortcut
            assumes every row of ``Y`` sums to 1 (e.g. one-hot labels) --
            TODO confirm against callers.
        Z1: Hidden-layer pre-activations from ``forward_pass``.
        A1: Hidden-layer activations from ``forward_pass``.
        Z2: Output-layer pre-activations; not used here, kept so the
            signature mirrors the tuple returned by ``forward_pass``.
        A2: Softmax outputs from ``forward_pass``.

    Returns:
        A tuple ``(loss, dLoss_dW1, dLoss_db1, dLoss_dW2, dLoss_db2)``.

    NOTE: ``loss`` is the batch mean (see ``cross_entropy_loss``), whereas
    the gradients are summed over the batch and not divided by the batch
    size. Account for that factor when choosing a learning rate.
    """
    loss = cross_entropy_loss(Y, A2)

    # Gradient with respect to Z2: for softmax combined with cross-entropy
    # the derivative collapses to A2 - Y, so dLoss/dA2 is never needed.
    dLoss_dZ2 = A2 - Y

    # Output-layer parameters (summed over the batch).
    dLoss_dW2 = np.dot(A1.T, dLoss_dZ2)
    dLoss_db2 = np.sum(dLoss_dZ2, axis=0)

    # Propagate the error back through W2 to the hidden activations.
    dLoss_dA1 = np.dot(dLoss_dZ2, W2.T)

    # Chain through the sigmoid to reach the hidden pre-activations.
    dLoss_dZ1 = dLoss_dA1 * sigmoid_derivative(Z1)

    # Hidden-layer parameters (summed over the batch).
    dLoss_dW1 = np.dot(X.T, dLoss_dZ1)
    dLoss_db1 = np.sum(dLoss_dZ1, axis=0)

    return loss, dLoss_dW1, dLoss_db1, dLoss_dW2, dLoss_db2