03_mlp_binary_classification.ipynb: 2 layers (1 hidden + 1 output)

In [1]:
import numpy as np

# ----------------------------
# 1) Synthetic Binary Dataset
#    (reuse from Example 1)
# ----------------------------
def generate_binary_data(n_per_class=200, seed=0):
    """
    Build a shuffled 2-D, two-class Gaussian toy dataset.

    Class 0 is centred at (-1, -1) and class 1 at (1, 1); both share the
    covariance 0.5 * I.

    n_per_class: number of samples drawn for each class
    seed:        value passed to np.random.seed (this reseeds NumPy's
                 global generator as a side effect)

    Returns (X, Y) with X of shape (2 * n_per_class, 2) and Y of shape
    (2 * n_per_class, 1) holding float labels 0.0 / 1.0.
    """
    np.random.seed(seed)

    shared_cov = [[0.5, 0], [0, 0.5]]
    class_means = ([-1, -1], [1, 1])

    # Class 0 is sampled before class 1; the draw order fixes the random stream.
    clouds = [
        np.random.multivariate_normal(mean=mu, cov=shared_cov, size=n_per_class)
        for mu in class_means
    ]
    features = np.concatenate(clouds, axis=0)

    # n_per_class rows of 0.0 followed by n_per_class rows of 1.0.
    labels = np.repeat([[0.0], [1.0]], n_per_class, axis=0)

    # Shuffle samples and labels with the same permutation.
    order = np.random.permutation(2 * n_per_class)
    return features[order], labels[order]

# ----------------------------
# 2) Model: MLP with one hidden layer
# ----------------------------
class MLPBinary:
    """
    A 2-layer MLP for binary classification:
      Input -> Dense(hidden_dim) -> ReLU -> Dense(1) -> Sigmoid

    Parameters are updated with plain gradient descent on the mean binary
    cross-entropy loss.
    """

    def __init__(self, in_dim, hidden_dim, lr=0.1):
        """
        in_dim:     number of input features
        hidden_dim: number of hidden units
        lr:         learning rate used by update_params

        Weights are drawn from NumPy's global random generator, so call
        np.random.seed beforehand for a reproducible initialisation.
        """
        # Weights & biases
        self.W1 = np.random.randn(in_dim, hidden_dim) * np.sqrt(2 / in_dim)  # He init for ReLU
        self.b1 = np.zeros((1, hidden_dim))
        self.W2 = np.random.randn(hidden_dim, 1) * 0.01  # small random
        self.b2 = 0.0
        self.lr = lr

    def sigmoid(self, z):
        """
        Element-wise logistic function 1 / (1 + exp(-z)).

        Only exp(-|z|) is ever evaluated, so large-magnitude inputs cannot
        overflow: for z >= 0 the result is 1 / (1 + e), for z < 0 it is the
        algebraically equal e / (1 + e), with e = exp(-|z|).
        """
        z = np.asarray(z)
        e = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

    def sigmoid_grad(self, a):
        """Derivative of the sigmoid expressed via its output a = sigmoid(z)."""
        return a * (1 - a)

    def relu(self, z):
        """Element-wise max(0, z)."""
        return np.maximum(0, z)

    def relu_grad(self, z):
        """Derivative of ReLU: 1 where z > 0, otherwise 0 (including at z == 0)."""
        grad = np.zeros_like(z)
        grad[z > 0] = 1
        return grad

    def forward(self, X):
        """
        Run the network on a batch.

        X: (batch, in_dim)
        Returns (cache, A2) where cache = (X, Z1, A1, Z2, A2) holds the
        intermediates needed for backprop and A2 is the (batch, 1) array of
        predicted probabilities.
        """
        # Layer 1
        Z1 = X.dot(self.W1) + self.b1       # (batch, hidden_dim)
        A1 = self.relu(Z1)                  # (batch, hidden_dim)

        # Layer 2 (output)
        Z2 = A1.dot(self.W2) + self.b2      # (batch, 1)
        A2 = self.sigmoid(Z2)               # (batch, 1)

        cache = (X, Z1, A1, Z2, A2)
        return cache, A2

    def compute_loss_and_grad(self, cache, A2, Y):
        """
        Mean binary cross-entropy and its gradients for one batch.

        cache: (X, Z1, A1, Z2, A2) as returned by forward
        A2:    kept for interface compatibility; the activations stored in
               cache are the ones actually used
        Y:     labels, either (batch, 1) or (batch,); converted to a column
               so that a 1-D label vector cannot broadcast against the
               (batch, 1) activations into a (batch, batch) matrix

        Returns: loss, grads dict with keys "dW1", "db1", "dW2", "db2"
        Raises:  ValueError for an empty batch or when the number of labels
                 differs from the number of samples
        """
        X, Z1, A1, _, A2 = cache
        m = X.shape[0]
        if m == 0:
            # Dividing by m would yield NaN gradients and corrupt the weights.
            raise ValueError("cannot compute loss and gradients for an empty batch")

        Y = np.asarray(Y).reshape(-1, 1)
        if Y.shape[0] != m:
            raise ValueError(f"Y has {Y.shape[0]} labels but the batch has {m} samples")

        # Loss: binary cross-entropy (clipping keeps log() away from 0)
        A2_clipped = np.clip(A2, 1e-8, 1 - 1e-8)
        loss = -np.sum(Y * np.log(A2_clipped) + (1 - Y) * np.log(1 - A2_clipped)) / m

        # Backprop
        # For sigmoid + cross-entropy the gradient w.r.t. Z2 simplifies to A2 - Y.
        dZ2 = A2 - Y                      # (batch, 1)
        dW2 = A1.T.dot(dZ2) / m           # (hidden_dim, 1)
        db2 = np.sum(dZ2) / m             # scalar

        dA1 = dZ2.dot(self.W2.T)          # (batch, hidden_dim)
        dZ1 = dA1 * self.relu_grad(Z1)    # (batch, hidden_dim)
        dW1 = X.T.dot(dZ1) / m            # (in_dim, hidden_dim)
        db1 = np.sum(dZ1, axis=0, keepdims=True) / m  # (1, hidden_dim)

        grads = {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}
        return loss, grads

    def update_params(self, grads):
        """Apply one gradient-descent step: param -= lr * grad for every parameter."""
        self.W1 -= self.lr * grads["dW1"]
        self.b1 -= self.lr * grads["db1"]
        self.W2 -= self.lr * grads["dW2"]
        self.b2 -= self.lr * grads["db2"]

    def predict(self, X):
        """Return (batch, 1) integer class labels: 1 where the probability exceeds 0.5."""
        _, A2 = self.forward(X)
        return (A2 > 0.5).astype(int)

# ----------------------------
# 3) Training Loop
# ----------------------------
if __name__ == "__main__":
    # Build the dataset; the first 80% is used for training, the rest for validation
    X, Y = generate_binary_data(n_per_class=200, seed=0)
    split = int(0.8 * len(X))
    X_train, X_val = X[:split], X[split:]
    Y_train, Y_val = Y[:split], Y[split:]

    # Network: 2 inputs -> 8 hidden units -> 1 output
    epochs = 200
    model = MLPBinary(in_dim=2, hidden_dim=8, lr=0.05)

    for epoch in range(1, epochs + 1):
        # One gradient-descent step on the whole training set
        cache, A2 = model.forward(X_train)
        loss, grads = model.compute_loss_and_grad(cache, A2, Y_train)
        model.update_params(grads)

        # Log on the first epoch and on every 50th epoch.
        # The loss shown was computed before this epoch's update; the
        # accuracies are measured after it.
        if epoch == 1 or epoch % 50 == 0:
            train_preds = model.predict(X_train)
            val_preds = model.predict(X_val)
            train_acc = (train_preds == Y_train).mean()
            val_acc = (val_preds == Y_val).mean()
            print(f"Epoch {epoch:3d} | Loss: {loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

    # Accuracy of the fully trained model on both splits
    train_acc = (model.predict(X_train) == Y_train).mean()
    val_acc = (model.predict(X_val) == Y_val).mean()
    print(f"\nFinal Train Acc: {train_acc:.4f} | Final Val Acc: {val_acc:.4f}")

Epoch   1 | Loss: 0.6959 | Train Acc: 0.9594 | Val Acc: 0.9625
Epoch  50 | Loss: 0.1855 | Train Acc: 0.9563 | Val Acc: 0.9750
Epoch 100 | Loss: 0.1338 | Train Acc: 0.9625 | Val Acc: 0.9750
Epoch 150 | Loss: 0.1136 | Train Acc: 0.9625 | Val Acc: 0.9750
Epoch 200 | Loss: 0.1032 | Train Acc: 0.9625 | Val Acc: 0.9750

Final Train Acc: 0.9625 | Final Val Acc: 0.9750
