In [6]:
import numpy as np

# Activation functions
def relu(x):
    """Return the element-wise rectified linear unit, max(0, x)."""
    rectified = np.maximum(0, x)
    return rectified
def relu_derivative(x):
    """Return the ReLU slope at x: 1.0 where x > 0, otherwise 0.0."""
    positive_mask = x > 0
    # Cast the boolean mask so it can scale float gradients directly.
    return positive_mask.astype(float)

def tanh(x):
    """Return the element-wise hyperbolic tangent of x."""
    squashed = np.tanh(x)
    return squashed
def tanh_derivative(x):
    """Return d/dx tanh(x) = 1 - tanh(x)**2, evaluated element-wise at x."""
    t = np.tanh(x)
    return 1 - t ** 2

def softmax(x, axis=1):
    """Return the softmax of x along `axis`.

    Args:
        x: Array of scores (logits).
        axis: Axis along which the outputs sum to 1. Defaults to 1, i.e.
            one probability distribution per row of a 2-D batch. Pass
            ``axis=-1`` (or 0) to normalize a 1-D vector.

    Returns:
        Array with the same shape as x whose entries along `axis` are
        non-negative and sum to 1.
    """
    # Subtracting the per-slice maximum leaves the result unchanged but keeps
    # np.exp from overflowing on large scores.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_x = np.exp(shifted)
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

# Training data: the four binary input pairs, labelled with 3 classes for demo.
# Rows are ordered (0,0), (0,1), (1,0), (1,1).
X = np.array([[a, b] for a in (0, 1) for b in (0, 1)])

# One-hot targets: selecting rows of the 3x3 identity encodes each class label.
# The first and last samples share class 0.
labels = [0, 1, 2, 0]
y = np.eye(3, dtype=int)[labels]

# Network architecture: input -> hidden1 -> hidden2 -> output (3 classes).
# activations[i] names the non-linearity applied after weight matrix i.
layers = [2, 4, 3, 3]
activations = ["relu", "tanh", "softmax"]
lr = 0.5      # step size used by the weight update
epochs = 10   # number of full-batch passes

# Initialize weights and biases. The seed is fixed so runs are reproducible;
# weights are drawn layer by layer, in order, from the standard normal.
np.random.seed(42)
layer_shapes = list(zip(layers[:-1], layers[1:]))
weights = [np.random.randn(n_in, n_out) for n_in, n_out in layer_shapes]
biases = [np.zeros((1, n_out)) for _, n_out in layer_shapes]

# Lookup tables from activation name to its forward function and, for hidden
# layers, to the derivative evaluated at the pre-activation z. Using tables
# lets an unsupported name fail loudly instead of silently reusing a stale
# activation or skipping the delta propagation.
_ACTIVATION_FUNCS = {"relu": relu, "tanh": tanh, "softmax": softmax}
_ACTIVATION_DERIVS = {"relu": relu_derivative, "tanh": tanh_derivative}

# Training loop: one full-batch gradient step on (X, y) per epoch.
for epoch in range(epochs):
    # Forward pass. zs[i] is the pre-activation of layer i; activs[i] is the
    # input fed to layer i (activs[0] is X) and activs[-1] is the output.
    zs, activs = [], [X]
    for i, (w_i, b_i) in enumerate(zip(weights, biases)):
        z = activs[-1].dot(w_i) + b_i
        zs.append(z)
        if activations[i] not in _ACTIVATION_FUNCS:
            raise ValueError(
                f"Unsupported activation {activations[i]!r} for layer {i + 1}"
            )
        a = _ACTIVATION_FUNCS[activations[i]](z)
        activs.append(a)

    # Error (target - output)
    error = y - activs[-1]

    # Backpropagation
    grads_w, grads_b = [None] * len(weights), [None] * len(biases)
    # The raw error is used directly as the output-layer delta. For a softmax
    # output trained with cross-entropy this is the negative gradient with
    # respect to the output pre-activation, which is why the update below
    # adds rather than subtracts.
    delta = error

    for i in reversed(range(len(weights))):
        grads_w[i] = activs[i].T.dot(delta)
        grads_b[i] = np.sum(delta, axis=0, keepdims=True)

        if i > 0:
            # Push delta back through layer i's weights, then through the
            # activation of the previous layer.
            if activations[i - 1] not in _ACTIVATION_DERIVS:
                raise ValueError(
                    f"No derivative available for hidden activation "
                    f"{activations[i - 1]!r} (layer {i})"
                )
            derivative = _ACTIVATION_DERIVS[activations[i - 1]]
            delta = delta.dot(weights[i].T) * derivative(zs[i - 1])

    # Update weights in place; += because error = target - output.
    for w_i, b_i, grad_w, grad_b in zip(weights, biases, grads_w, grads_b):
        w_i += lr * grad_w
        b_i += lr * grad_b

# NOTE: after the loop, `activs` and `error` still hold the values from the
# last epoch's forward pass, which ran before that epoch's weight update.

# Report the outcome of training.
# `activs` and `error` come from the last epoch's forward pass, which ran
# before that epoch's weight update; the weights and biases printed below are
# the values after the update.
print(f"Total Epochs: {epochs}")
print("Final Derived Output:\n", activs[-1])
print("Final Error:\n", error)

# Layers are labelled from 1 in the printed headings.
for layer_no, w_mat in enumerate(weights, start=1):
    print(f"Final Updated Weights W{layer_no}:\n", w_mat)
for layer_no, b_vec in enumerate(biases, start=1):
    print(f"Final Updated Biases b{layer_no}:\n", b_vec)


Total Epochs: 10
Final Derived Output:
 [[0.82431946 0.06147085 0.11420969]
 [0.22526869 0.38593694 0.38879437]
 [0.22208684 0.36292846 0.4149847 ]
 [0.21586051 0.36804326 0.41609623]]
Final Error:
 [[ 0.17568054 -0.06147085 -0.11420969]
 [-0.22526869  0.61406306 -0.38879437]
 [-0.22208684 -0.36292846  0.5850153 ]
 [ 0.78413949 -0.36804326 -0.41609623]]
Final Updated Weights W1:
 [[ 1.70840045 -0.1382643   1.49264767  0.87681967]
 [-0.29690438 -0.23413696  1.87221392  0.66193035]]
Final Updated Weights W2:
 [[-0.51349949  1.58973605 -0.47085281]
 [-0.46572975  0.24196227 -1.91328024]
 [-2.1629931   0.99172214 -0.77268431]
 [-0.71615633  0.02257655 -1.38315618]]
Final Updated Weights W3:
 [[ 1.51672369 -0.61169663  0.40237361]
 [-0.63813851 -1.01626855 -0.20380127]
 [-1.16375024  0.22960844 -0.44179245]]
Final Updated Biases b1:
 [[-0.07592143  0.         -0.59342585 -0.78454859]]
Final Updated Biases b2:
 [[ 0.80831402 -0.03667159 -0.55407691]]
Final Updated Biases b3:
 [[ 0.78565909 -