The example MLP is defined with layer_dims = [2, 8, 4, 1], which corresponds to:
	•	Layer 1: W1 (2×8), b1 (1×8) (input 2 → hidden 8, ReLU)
	•	Layer 2: W2 (8×4), b2 (1×4) (hidden 8 → hidden 4, ReLU)
	•	Layer 3: W3 (4×1), b3 (1×1) (hidden 4 → output 1, sigmoid)

So there are 3 trainable (dense) layers in that code.

In [1]:
import numpy as np

# ----------------------------
# 1) Synthetic Binary Dataset
# ----------------------------
def generate_binary_data(n_per_class=300, seed=1):
    """
    Build a shuffled two-class toy dataset from a pair of 2-D Gaussian blobs.

    Class 0 is centred at (-1, -1) and class 1 at (1, 1); both share the
    isotropic covariance 0.5 * I. The global NumPy RNG is reseeded with
    `seed`, so repeated calls with the same arguments return the same data.

    Returns:
      X: (2 * n_per_class, 2) float array of points.
      Y: (2 * n_per_class, 1) float array of labels in {0, 1}.
    """
    np.random.seed(seed)
    shared_cov = 0.5 * np.eye(2)

    # Draw class 0 first, then class 1 -- the order fixes the RNG stream.
    blobs = [
        np.random.multivariate_normal(mean=center, cov=shared_cov, size=n_per_class)
        for center in ([-1, -1], [1, 1])
    ]
    points = np.concatenate(blobs, axis=0)
    labels = np.repeat([0.0, 1.0], n_per_class).reshape(-1, 1)

    # One shared permutation keeps every point aligned with its label.
    order = np.random.permutation(2 * n_per_class)
    return points[order], labels[order]

# ----------------------------
# 2) MLP Class Definition
# ----------------------------
class MLPBinary:
    """
    Fully connected binary classifier trained by gradient descent.

    Hidden layers use ReLU, the output layer uses a sigmoid, and the loss is
    binary cross-entropy. For k = 1..L, `self.params` holds the weight matrix
    "W{k}" with shape (n_in, n_out) and the bias row "b{k}" with shape
    (1, n_out). Inputs are row-major: one sample per row.
    """

    def __init__(self, layer_dims, lr=0.01, seed=0):
        """
        layer_dims: list of dims, e.g. [2, 8, 4, 1]
          -> input_dim=2, hidden1=8, hidden2=4, output=1.
        lr: gradient-descent step size used by update_params.
        seed: passed to np.random.seed (reseeds the global NumPy RNG).

        Raises:
          ValueError: if layer_dims has fewer than two entries.
        """
        if len(layer_dims) < 2:
            raise ValueError(
                "layer_dims needs at least an input and an output size, "
                f"got {list(layer_dims)!r}"
            )

        np.random.seed(seed)
        self.lr = lr
        self.L = len(layer_dims) - 1    # number of trainable layers

        # Initialize weights & biases
        self.params = {}
        for layer in range(1, self.L + 1):
            n_in = layer_dims[layer - 1]
            n_out = layer_dims[layer]
            if layer < self.L:
                # Hidden: He init (ReLU)
                scale = np.sqrt(2.0 / n_in)
            else:
                # Output: small random, so the sigmoid starts near 0.5
                scale = 0.01
            self.params[f"W{layer}"] = np.random.randn(n_in, n_out) * scale
            self.params[f"b{layer}"] = np.zeros((1, n_out))

    def relu(self, Z):
        """Element-wise max(0, Z)."""
        return np.maximum(0, Z)

    def relu_grad(self, Z):
        """Element-wise ReLU derivative: 1 where Z > 0, else 0 (same dtype as Z)."""
        dZ = np.zeros_like(Z)
        dZ[Z > 0] = 1
        return dZ

    def sigmoid(self, Z):
        """
        Element-wise logistic function, computed without overflow.

        exp(-|Z|) always lies in (0, 1], so neither branch can overflow the
        way a direct 1 / (1 + exp(-Z)) does for large negative Z.
        """
        Z = np.asarray(Z)
        decay = np.exp(-np.abs(Z))
        return np.where(Z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def forward(self, X):
        """
        Perform forward pass through L layers.
        Cache Z and A for every layer.
        Returns:
          A_L: final activation (batch, n_out of the last layer)
          cache: {"A0": X, "Z1":..., "A1":..., ..., "ZL":..., "AL":...}
        """
        cache = {"A0": X}
        A_prev = X

        # Hidden layers 1 to L-1
        for layer in range(1, self.L):
            Z = A_prev.dot(self.params[f"W{layer}"]) + self.params[f"b{layer}"]
            A = self.relu(Z)
            cache[f"Z{layer}"] = Z
            cache[f"A{layer}"] = A
            A_prev = A

        # Output layer L
        Z_out = A_prev.dot(self.params[f"W{self.L}"]) + self.params[f"b{self.L}"]
        A_out = self.sigmoid(Z_out)
        cache[f"Z{self.L}"] = Z_out
        cache[f"A{self.L}"] = A_out

        return A_out, cache

    def compute_loss_and_grad(self, AL, Y, cache):
        """
        Binary cross-entropy loss and backprop gradients.
        AL: predicted probabilities from forward, (batch, 1).
        Y: true labels; anything with the same number of elements as AL
           (e.g. (batch, 1) or (batch,)) -- it is reshaped to AL's shape.
        cache: from forward.
        Returns:
          loss: scalar
          grads: dictionary of gradients "dW{k}", "db{k}" for k = 1..L
        Raises:
          ValueError: if Y cannot be reshaped to AL.shape.
        """
        # Align Y with AL explicitly: a 1-D Y would otherwise broadcast
        # against (batch, 1) into a (batch, batch) matrix without any error.
        Y = np.asarray(Y).reshape(AL.shape)
        m = AL.shape[0]

        # Clip only for the log; the gradient below uses the raw AL.
        AL_clipped = np.clip(AL, 1e-8, 1 - 1e-8)
        loss = -np.sum(Y * np.log(AL_clipped) + (1 - Y) * np.log(1 - AL_clipped)) / m

        grads = {}
        # dA_L = -(Y/AL) + (1-Y)/(1-AL); times sigmoid' this simplifies to AL - Y
        dZ = AL - Y
        A_prev = cache[f"A{self.L - 1}"]
        grads[f"dW{self.L}"] = A_prev.T.dot(dZ) / m
        grads[f"db{self.L}"] = np.sum(dZ, axis=0, keepdims=True) / m

        # Backprop into hidden layers L-1 ... 1
        dA_prev = dZ.dot(self.params[f"W{self.L}"].T)

        for layer in reversed(range(1, self.L)):
            dZ = dA_prev * self.relu_grad(cache[f"Z{layer}"])
            A_prev = cache[f"A{layer - 1}"]          # layer - 1 == 0 means X
            grads[f"dW{layer}"] = A_prev.T.dot(dZ) / m
            grads[f"db{layer}"] = np.sum(dZ, axis=0, keepdims=True) / m
            if layer > 1:
                # The gradient w.r.t. the input X is never needed, so skip it.
                dA_prev = dZ.dot(self.params[f"W{layer}"].T)

        return loss, grads

    def update_params(self, grads):
        """Apply one in-place gradient-descent step: param -= lr * grad."""
        for layer in range(1, self.L + 1):
            self.params[f"W{layer}"] -= self.lr * grads[f"dW{layer}"]
            self.params[f"b{layer}"] -= self.lr * grads[f"db{layer}"]

    def predict(self, X):
        """Return integer 0/1 predictions: 1 where the output probability > 0.5."""
        AL, _ = self.forward(X)
        return (AL > 0.5).astype(int)

# ----------------------------
# 3) Training Loop
# ----------------------------
if __name__ == "__main__":
    # Synthetic data, already shuffled by the generator; first 80% is for training.
    X, Y = generate_binary_data(n_per_class=300, seed=1)
    split = int(0.8 * X.shape[0])
    X_train, X_val = X[:split], X[split:]
    Y_train, Y_val = Y[:split], Y[split:]

    # 2 inputs -> 8 hidden -> 4 hidden -> 1 output
    layer_dims = [2, 8, 4, 1]
    model = MLPBinary(layer_dims, lr=0.05, seed=2)
    epochs = 200

    # Full-batch gradient descent: one forward/backward/update per epoch.
    for epoch in range(1, epochs + 1):
        AL, cache = model.forward(X_train)
        loss, grads = model.compute_loss_and_grad(AL, Y_train, cache)
        model.update_params(grads)

        # Report on the first epoch and every 50th; the loss shown was
        # computed before this epoch's update, the accuracies after it.
        if epoch == 1 or epoch % 50 == 0:
            train_acc = np.mean(model.predict(X_train) == Y_train)
            val_acc = np.mean(model.predict(X_val) == Y_val)
            print(f"Epoch {epoch:3d} | Loss: {loss:.4f} | Train ∕ Val Acc: {train_acc:.4f} ∕ {val_acc:.4f}")

    # Accuracy of the fully trained model
    train_acc = np.mean(model.predict(X_train) == Y_train)
    val_acc = np.mean(model.predict(X_val) == Y_val)
    print(f"\nFinal Train Acc: {train_acc:.4f} | Final Val Acc: {val_acc:.4f}")

Epoch   1 | Loss: 0.6976 | Train ∕ Val Acc: 0.5458 ∕ 0.5917
Epoch  50 | Loss: 0.3189 | Train ∕ Val Acc: 0.9771 ∕ 0.9750
Epoch 100 | Loss: 0.2427 | Train ∕ Val Acc: 0.9771 ∕ 0.9750
Epoch 150 | Loss: 0.1976 | Train ∕ Val Acc: 0.9771 ∕ 0.9750
Epoch 200 | Loss: 0.1684 | Train ∕ Val Acc: 0.9771 ∕ 0.9750

Final Train Acc: 0.9771 | Final Val Acc: 0.9750
