In [14]:
import numpy as np

class LinearActivation:
    def forward(self, x):
        return x

    def backward(self, output_gradient):
        return output_gradient

class ReLU:
    def forward(self, x):
        self.input = x
        return np.maximum(0, x)

    def backward(self, output_gradient):
        return output_gradient * (self.input > 0)

class Sigmoid:
    def forward(self, x):
        self.output = 1 / (1 + np.exp(-x))
        return self.output

    def backward(self, output_gradient):
        return output_gradient * (self.output * (1 - self.output))

class Tanh:
    def forward(self, x):
        self.output = np.tanh(x)
        return self.output

    def backward(self, output_gradient):
        return output_gradient * (1 - self.output ** 2)

class Softmax:
    def forward(self, x):
        exps = np.exp(x - np.max(x, axis=-1, keepdims=True))
        self.output = exps / np.sum(exps, axis=-1, keepdims=True)
        return self.output

class Layer:
    def __init__(self, input_dim, output_dim, activation=None):
        self.weights = np.random.randn(input_dim, output_dim) * 0.1
        self.bias = np.zeros(output_dim)
        self.activation = activation() if activation else None

    def forward(self, input_data):
        self.input = input_data
        self.z = np.dot(input_data, self.weights) + self.bias
        self.output = self.activation.forward(self.z) if self.activation else self.z
        return self.output

    def backward(self, output_gradient):
        """Computes gradients and returns input gradient for next layer (previous in terms of backprop)"""
        activation_grad = self.activation.backward(output_gradient) if self.activation else output_gradient
        self.weights_gradient = np.dot(self.input.T, activation_grad)
        self.bias_gradient = np.sum(activation_grad, axis=0)
        return np.dot(activation_grad, self.weights.T)

    def update_params(self, learning_rate):
        """Updates weights and biases using the computed gradients"""
        self.weights -= learning_rate * self.weights_gradient
        self.bias -= learning_rate * self.bias_gradient

class NeuralNetwork:
    def __init__(self):
        self.layers = []

    def add_layer(self, input_dim, output_dim, activation=None):
        self.layers.append(Layer(input_dim, output_dim, activation))

    def predict(self, input_data):
        for layer in self.layers:
            input_data = layer.forward(input_data)
        return input_data

    def train(self, X, Y, loss_fn, learning_rate=0.01):
        predictions = self.predict(X)
        loss = loss_fn.forward(predictions, Y)
        loss_grad = loss_fn.backward(predictions, Y)
        for layer in reversed(self.layers):
            loss_grad = layer.backward(loss_grad)
        for layer in self.layers:
            layer.update_params(learning_rate)
        return loss

class CrossEntropyLoss:
    def forward(self, predictions, labels):
        self.predictions = predictions
        self.labels = labels
        return -np.sum(labels * np.log(predictions + 1e-9)) / predictions.shape[0]

    def backward(self, predictions, labels):
        # Directly return the gradient for softmax with cross-entropy
        # This simplification works because dL/dz = y_pred - y_true for softmax + cross-entropy
        return predictions - labels

# Example usage
if __name__ == "__main__":
    nn = NeuralNetwork()
    nn.add_layer(input_dim=4, output_dim=5, activation=ReLU)
    nn.add_layer(input_dim=5, output_dim=3, activation=Sigmoid)

    X = np.array([[0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4]])
    Y = np.array([[1, 0, 0], [0, 1, 0]])

    loss_fn = CrossEntropyLoss()

    for epoch in range(1000):
        loss = nn.train(X, Y, loss_fn, learning_rate=0.01)
        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Loss: {loss}")


Epoch 0, Loss: 0.6930666078767866
Epoch 100, Loss: 0.692186232442549
Epoch 200, Loss: 0.6918591111226097
Epoch 300, Loss: 0.6915140894350671
Epoch 400, Loss: 0.6912341959759591
Epoch 500, Loss: 0.6910135049084263
Epoch 600, Loss: 0.6908470424355988
Epoch 700, Loss: 0.6907298954262662
Epoch 800, Loss: 0.6906569650214403
Epoch 900, Loss: 0.6906229894342951
