In [1]:
import numpy as np

def relu(x):
    """Apply the rectified linear unit element-wise.

    Every entry of ``x`` is compared against zero and the larger of the
    two is kept, so negative entries become 0 and the rest pass through.
    """
    floor = 0
    rectified = np.maximum(floor, x)
    return rectified

def relu_derivative(x):
    """Return the element-wise derivative of ReLU evaluated at ``x``.

    The result is 1.0 where ``x`` is strictly positive and 0.0 elsewhere
    (the derivative at exactly 0 is taken to be 0).

    Args:
        x: Pre-activation values; an ndarray, a (nested) list, or a scalar.

    Returns:
        A float array (or NumPy float scalar for scalar input) with the
        same shape as ``x``.
    """
    # Coerce first: a bare ``x > 0`` raises TypeError for a list and yields
    # a plain bool (which has no ``.astype``) for a Python scalar.
    is_positive = np.asarray(x) > 0
    return is_positive.astype(float)

def tanh(x):
    """Apply the hyperbolic tangent element-wise by delegating to NumPy."""
    squashed = np.tanh(x)
    return squashed

def tanh_derivative(x):
    """Return the element-wise derivative of tanh evaluated at ``x``.

    Uses the identity d/dx tanh(x) = 1 - tanh(x)^2.
    """
    activated = np.tanh(x)
    return 1 - activated ** 2

def softmax(x):
    """Compute a softmax along axis 1 of ``x``.

    Each row has its maximum subtracted before exponentiation so large
    entries do not overflow; the shift cancels out in the final ratio.
    Entries along axis 1 of the result sum to 1.
    """
    row_peak = np.max(x, axis=1, keepdims=True)
    unnormalized = np.exp(x - row_peak)
    row_total = np.sum(unnormalized, axis=1, keepdims=True)
    return unnormalized / row_total

def cross_entropy(y_true, y_pred):
    """Return the cross-entropy between ``y_true`` and ``y_pred``.

    The element-wise product of ``y_true`` with the log of ``y_pred`` is
    summed over all entries, negated, and divided by the size of the
    first dimension of ``y_true``. A small constant (1e-9) is added to
    ``y_pred`` so that log(0) is never evaluated.
    """
    sample_count = y_true.shape[0]
    log_pred = np.log(y_pred + 1e-9)
    weighted_total = np.sum(y_true * log_pred)
    return -weighted_total / sample_count

class MLP:
    """Two-layer perceptron with a softmax output, trained by gradient descent.

    Parameters held on the instance:
        W1: weights of shape (input_size, hidden_size).
        b1: biases of shape (1, hidden_size).
        W2: weights of shape (hidden_size, output_size).
        b2: biases of shape (1, output_size).

    ``forward`` caches ``z1``, ``a1``, ``z2`` and ``y_hat`` on the instance;
    ``backward`` reads those cached values, so it must be called after a
    ``forward`` pass on the same input.
    """

    def __init__(self, input_size, hidden_size, output_size, activation="relu"):
        """Initialise weights and select the hidden-layer activation.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of output classes.
            activation: ``"relu"`` or ``"tanh"``.

        Raises:
            ValueError: If ``activation`` is not one of the supported names.
        """
        # Fail loudly on an unknown name: previously anything other than
        # "relu" (including typos such as "Relu") silently selected tanh.
        if activation not in ("relu", "tanh"):
            raise ValueError(
                f"Unsupported activation {activation!r}; expected 'relu' or 'tanh'"
            )

        # Small random weights (standard normal scaled by 0.1), zero biases.
        self.W1 = np.random.randn(input_size, hidden_size) * 0.1
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.1
        self.b2 = np.zeros((1, output_size))

        if activation == "relu":
            self.activation = relu
            self.activation_derivative = relu_derivative
        else:
            self.activation = tanh
            self.activation_derivative = tanh_derivative

    def forward(self, X):
        """Run a forward pass and return the softmax output for ``X``.

        The intermediate values (``z1``, ``a1``, ``z2``) and the output
        (``y_hat``) are stored on the instance for use by ``backward``.
        """
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = self.activation(self.z1)
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.y_hat = softmax(self.z2)
        return self.y_hat

    def backward(self, X, y, lr):
        """Backpropagate from the cached forward pass and update parameters.

        Args:
            X: Inputs that were passed to the preceding ``forward`` call.
            y: Targets with the same shape as ``self.y_hat``.
            lr: Learning rate applied to every parameter update.

        The parameters are modified in place; nothing is returned.
        """
        m = X.shape[0]

        # Output-layer error: gradient of the cross-entropy loss with
        # respect to the softmax inputs when each row of y sums to 1.
        dz2 = self.y_hat - y
        dW2 = np.dot(self.a1.T, dz2) / m
        db2 = np.sum(dz2, axis=0, keepdims=True) / m

        # Propagate through W2 *before* it is updated below.
        da1 = np.dot(dz2, self.W2.T)
        dz1 = da1 * self.activation_derivative(self.z1)
        dW1 = np.dot(X.T, dz1) / m
        db1 = np.sum(dz1, axis=0, keepdims=True) / m

        self.W2 -= lr * dW2
        self.b2 -= lr * db2
        self.W1 -= lr * dW1
        self.b1 -= lr * db1

    def train(self, X, y, epochs=1000, lr=0.1):
        """Run full-batch gradient descent for ``epochs`` iterations.

        Each iteration does one forward pass, computes the cross-entropy
        loss, and applies one ``backward`` update. The loss is printed
        every 100 epochs, starting with epoch 0.
        """
        for epoch in range(epochs):
            y_hat = self.forward(X)
            loss = cross_entropy(y, y_hat)
            self.backward(X, y, lr)

            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.4f}")

# XOR demo: the four binary input pairs, one per row.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])

# Two-column targets: column 0 is set when the inputs match,
# column 1 when they differ.
y = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])

# Build a 2-8-2 network with a ReLU hidden layer and fit it to the table.
mlp = MLP(
    input_size=2,
    hidden_size=8,
    output_size=2,
    activation="relu",
)
mlp.train(X, y, epochs=1000, lr=0.1)

# Report the per-class probabilities and the most probable class per row.
predictions = mlp.forward(X)
print("\nPredicted probabilities:", predictions, sep="\n")
print("\nPredicted classes:", predictions.argmax(axis=1), sep="\n")


Epoch 0, Loss: 0.6915
Epoch 100, Loss: 0.6340
Epoch 200, Loss: 0.3722
Epoch 300, Loss: 0.1453
Epoch 400, Loss: 0.0712
Epoch 500, Loss: 0.0434
Epoch 600, Loss: 0.0299
Epoch 700, Loss: 0.0223
Epoch 800, Loss: 0.0175
Epoch 900, Loss: 0.0142

Predicted probabilities:
[[0.97581457 0.02418543]
 [0.00765418 0.99234582]
 [0.00757543 0.99242457]
 [0.9921216  0.0078784 ]]

Predicted classes:
[0 1 1 0]
