In [1]:
import numpy as np

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return (x > 0).astype(float)

def tanh(x):
    return np.tanh(x)

def tanh_derivative(x):
    return 1 - np.tanh(x)**2

def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))  
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

def mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)
np.random.seed(42)
input_dim = 2
hidden1_dim = 8
hidden2_dim = 6
output_dim = 3
lr = 0.49

W1 = np.random.randn(input_dim, hidden1_dim) * 0.01
b1 = np.zeros((1, hidden1_dim))

W2 = np.random.randn(hidden1_dim, hidden2_dim) * 0.01
b2 = np.zeros((1, hidden2_dim))

W3 = np.random.randn(hidden2_dim, output_dim) * 0.01
b3 = np.zeros((1, output_dim))

X = np.array([[0,0],[0,1],[1,0],[1,1]])
y = np.array([
    [1,0,0],  
    [0,1,0],
    [0,0,1], 
    [1,0,0]   
])
epochs = 100
for epoch in range(epochs):
    # Forward pass
    z1 = np.dot(X, W1) + b1
    a1 = relu(z1)

    z2 = np.dot(a1, W2) + b2
    a2 = tanh(z2)

    z3 = np.dot(a2, W3) + b3
    a3 = softmax(z3)

    # Error (MSE instead of loss)
    error = mse(y, a3)

    # Backpropagation
    dz3 = (a3 - y) / X.shape[0]   # gradient wrt MSE
    dW3 = np.dot(a2.T, dz3)
    db3 = np.sum(dz3, axis=0, keepdims=True)

    dz2 = np.dot(dz3, W3.T) * tanh_derivative(z2)
    dW2 = np.dot(a1.T, dz2)
    db2 = np.sum(dz2, axis=0, keepdims=True)

    dz1 = np.dot(dz2, W2.T) * relu_derivative(z1)
    dW1 = np.dot(X.T, dz1)
    db1 = np.sum(dz1, axis=0, keepdims=True)

    W3 -= lr * dW3
    b3 -= lr * db3
    W2 -= lr * dW2
    b2 -= lr * db2
    W1 -= lr * dW1
    b1 -= lr * db1

    if (epoch+1) % 10 == 0:
        print(f"Epoch {epoch+1}, Error: {error:.4f}")
predictions = np.argmax(a3, axis=1)
print("Final predictions:", predictions)


Epoch 10, Error: 0.2087
Epoch 20, Error: 0.2083
Epoch 30, Error: 0.2083
Epoch 40, Error: 0.2083
Epoch 50, Error: 0.2083
Epoch 60, Error: 0.2083
Epoch 70, Error: 0.2083
Epoch 80, Error: 0.2083
Epoch 90, Error: 0.2083
Epoch 100, Error: 0.2083
Final predictions: [0 0 0 0]
