# Forward & Backward Pass in Neural Networks — with NumPy


### Objective

* Manually implement a 2-layer neural network (one hidden layer, one output layer)

* See how gradients flow backward to update the weights

* See how the ReLU and sigmoid activations affect learning

* Train the network using gradient descent

In [1]:
import numpy as np

# Seed NumPy's global RNG so the random weights below are the same on every run.
np.random.seed(42)

# Training inputs: one row per sample, one column per feature (2 x 2).
X = np.array([
    [0.5, 0.1],
    [0.9, 0.8],
])

# Binary target for each sample, stored as a column vector (2 x 1).
Y = np.array([[1], [0]])

# Layer 1 (input -> hidden): standard-normal weights (2 x 2), zero bias (1 x 2).
# The two randn calls must stay in this order so the seeded draws are unchanged.
W1 = np.random.randn(2, 2)
b1 = np.zeros(shape=(1, 2))

# Layer 2 (hidden -> output): standard-normal weights (2 x 1), zero bias (1 x 1).
W2 = np.random.randn(2, 1)
b2 = np.zeros(shape=(1, 1))

# Activation functions
def relu(z):
    """Return the element-wise maximum of 0 and *z* (rectified linear unit)."""
    floor = 0
    return np.maximum(floor, z)

def relu_derivative(z):
    """Return 1.0 where *z* is strictly positive and 0.0 elsewhere.

    *z* must be a NumPy array (or NumPy scalar); the result is a float array
    of the same shape. The derivative at exactly 0 is taken to be 0.
    """
    is_positive = z > 0
    return is_positive.astype(np.float64)

def sigmoid(z):
    """Return the logistic sigmoid ``1 / (1 + exp(-z))`` of *z*, element-wise.

    The value is computed from ``exp(-|z|)``, which never exceeds 1, so very
    negative inputs no longer overflow inside ``np.exp`` (the direct formula
    evaluates ``exp(-z)``, which becomes ``inf`` and emits an overflow
    warning for large negative *z*).

    Parameters
    ----------
    z : array_like
        Pre-activation value(s).

    Returns
    -------
    numpy.ndarray or NumPy scalar
        Sigmoid of *z*, with the same shape as *z*.
    """
    z = np.asarray(z)
    e = np.exp(-np.abs(z))  # always in [0, 1]; cannot overflow
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- same function.
    out = np.where(z >= 0, 1 / (1 + e), e / (1 + e))
    # Indexing with () turns a 0-d result back into a scalar; arrays pass through.
    return out[()]

def sigmoid_derivative(a):
    """Return ``a * (1 - a)``, element-wise.

    Note that *a* is the sigmoid *output* (the activation), not the
    pre-activation value.
    """
    complement = 1 - a
    return a * complement

# Gradient-descent settings: step size and number of passes over the data
learning_rate = 0.1
epochs = 1000

# NOTE(review): the reported loss is a mean over samples, while the gradients
# below are not divided by the number of samples, so they are the gradients of
# the summed loss. With this data that acts like a larger effective step size.

for epoch in range(epochs):
    # ---- Forward pass: input -> ReLU hidden layer -> sigmoid output ----
    Z1 = X.dot(W1) + b1
    A1 = relu(Z1)
    Z2 = A1.dot(W2) + b2
    A2 = sigmoid(Z2)  # predicted probability for each sample

    # ---- Binary cross-entropy; eps keeps log() away from log(0) ----
    eps = 1e-8
    log_likelihood = Y * np.log(A2 + eps) + (1 - Y) * np.log(1 - A2 + eps)
    loss = -np.mean(log_likelihood)

    # ---- Backward pass, output layer ----
    # For sigmoid + cross-entropy the gradient w.r.t. Z2 simplifies to A2 - Y.
    dZ2 = A2 - Y
    dW2 = A1.T.dot(dZ2)
    db2 = dZ2.sum(axis=0, keepdims=True)

    # ---- Backward pass, hidden layer (uses W2 before it is updated) ----
    dA1 = dZ2.dot(W2.T)
    dZ1 = dA1 * relu_derivative(Z1)
    dW1 = X.T.dot(dZ1)
    db1 = dZ1.sum(axis=0, keepdims=True)

    # ---- Parameter update: in-place step against each gradient ----
    for param, grad in ((W2, dW2), (b2, db2), (W1, dW1), (b1, db1)):
        param -= learning_rate * grad

    # ---- Progress report every 100 epochs ----
    if not epoch % 100:
        print(f"Epoch {epoch} - Loss: {loss:.4f}")

Epoch 0 - Loss: 0.6107
Epoch 100 - Loss: 0.1022
Epoch 200 - Loss: 0.0440
Epoch 300 - Loss: 0.0272
Epoch 400 - Loss: 0.0193
Epoch 500 - Loss: 0.0149
Epoch 600 - Loss: 0.0121
Epoch 700 - Loss: 0.0102
Epoch 800 - Loss: 0.0088
Epoch 900 - Loss: 0.0077


The loss is printed every 100 epochs; you’ll see it decrease steadily (from 0.6107 at epoch 0 to 0.0077 at epoch 900), showing that the network is learning.