04_mlp_multiclass_classification.ipynb: 2 layers (1 hidden + 1 output)

In [1]:
import numpy as np

# ----------------------------
# 1) Synthetic Multiclass Dataset
#    (reuse from Example 2)
# ----------------------------
def generate_multiclass_data(n_per_class=200, seed=0):
    """
    Build a shuffled 3-class toy dataset of 2-D Gaussian blobs.

    Seeds NumPy's global random generator with `seed`, draws `n_per_class`
    points per class from Gaussians with covariance 0.3 * I centred at
    (-1, 0), (1, 0) and (0, 1.5), then shuffles points and labels together.

    Returns:
        X: (3 * n_per_class, 2) array of points.
        Y: (3 * n_per_class, 3) one-hot float array of class labels.
    """
    np.random.seed(seed)
    centers = ([-1, 0], [1, 0], [0, 1.5])
    covariance = [[0.3, 0], [0, 0.3]]

    # Draw the blobs in class order so the random stream is consumed for
    # class 0 first, then class 1, then class 2.
    features = np.vstack([
        np.random.multivariate_normal(mean=center, cov=covariance, size=n_per_class)
        for center in centers
    ])

    # Row k of the identity matrix is the one-hot code of class k; repeating
    # each row n_per_class times lines the labels up with the stacked blobs.
    labels = np.repeat(np.eye(len(centers)), n_per_class, axis=0)

    # A single permutation is applied to both arrays to keep them paired.
    order = np.random.permutation(len(centers) * n_per_class)
    return features[order], labels[order]

# ----------------------------
# 2) Model: MLP with one hidden layer
# ----------------------------
class MLPMulti:
    """
    2-layer MLP for K-class classification, trained by plain gradient descent:
      Input -> Dense(hidden_dim) -> ReLU -> Dense(out_dim) -> Softmax
    """

    def __init__(self, in_dim, hidden_dim, out_dim, lr=0.1):
        """
        in_dim:     number of input features
        hidden_dim: number of hidden ReLU units
        out_dim:    number of classes K
        lr:         gradient-descent step size

        Weights are drawn from NumPy's global random generator (W1 first,
        then W2), so seed it beforehand for a reproducible initialization.
        """
        # He initialization for hidden (ReLU)
        self.W1 = np.random.randn(in_dim, hidden_dim) * np.sqrt(2 / in_dim)
        self.b1 = np.zeros((1, hidden_dim))
        # Small random for output
        self.W2 = np.random.randn(hidden_dim, out_dim) * 0.01
        self.b2 = np.zeros((1, out_dim))
        self.lr = lr

    def relu(self, z):
        """Element-wise max(0, z)."""
        return np.maximum(0, z)

    def relu_grad(self, z):
        """Derivative of ReLU: 1 where z > 0, otherwise 0 (same dtype as z)."""
        return (z > 0).astype(z.dtype)

    def softmax(self, z):
        """Row-wise softmax of a (batch, K) array."""
        # Subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing on large logits.
        z_shift = z - np.max(z, axis=1, keepdims=True)
        exp_z = np.exp(z_shift)
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def forward(self, X):
        """
        X: (batch, in_dim) array, or a single sample of shape (in_dim,).
        Returns cache and output probabilities (batch, out_dim).
        The cache is the tuple (X, Z1, A1, Z2, A2) consumed by
        compute_loss_and_grad().
        """
        # Promote a single 1-D sample to a one-row batch so the cached X has
        # the (batch, in_dim) layout that backprop relies on.
        X = np.atleast_2d(X)

        # Hidden layer
        Z1 = X.dot(self.W1) + self.b1     # (batch, hidden_dim)
        A1 = self.relu(Z1)                # (batch, hidden_dim)

        # Output layer
        Z2 = A1.dot(self.W2) + self.b2    # (batch, out_dim)
        A2 = self.softmax(Z2)             # (batch, out_dim)

        cache = (X, Z1, A1, Z2, A2)
        return cache, A2

    def compute_loss_and_grad(self, cache, A2, Y_onehot):
        """
        cache:    tuple (X, Z1, A1, Z2, A2) returned by forward()
        A2:       kept for interface compatibility; the probabilities stored
                  in `cache` are the ones actually used
        Y_onehot: (batch, out_dim) one-hot targets
        Returns: loss (mean cross-entropy), grads dict with keys
                 "dW1", "db1", "dW2", "db2"
        Raises:  ValueError if the batch is empty or if Y_onehot does not
                 have the same shape as the predicted probabilities.
        """
        X, Z1, A1, Z2, A2 = cache
        Y_onehot = np.asarray(Y_onehot)
        m = A2.shape[0]

        if m == 0:
            raise ValueError("cannot compute loss on an empty batch")
        # Without this check a (batch, 1) or (out_dim,) target array would
        # broadcast silently and yield a wrong loss and wrong gradients.
        if Y_onehot.shape != A2.shape:
            raise ValueError(
                f"Y_onehot has shape {Y_onehot.shape}, expected {A2.shape}"
            )

        # Loss: multiclass cross-entropy (clipped to keep log() finite)
        A2_clipped = np.clip(A2, 1e-8, 1 - 1e-8)
        loss = -np.sum(Y_onehot * np.log(A2_clipped)) / m

        # Backprop. For softmax + cross-entropy the gradient w.r.t. the
        # logits is (A2 - Y); the 1/m averages it over the batch.
        dZ2 = (A2 - Y_onehot) / m         # (batch, out_dim)
        dW2 = A1.T.dot(dZ2)               # (hidden_dim, out_dim)
        db2 = np.sum(dZ2, axis=0, keepdims=True)  # (1, out_dim)

        dA1 = dZ2.dot(self.W2.T)          # (batch, hidden_dim)
        dZ1 = dA1 * self.relu_grad(Z1)    # (batch, hidden_dim)
        dW1 = X.T.dot(dZ1)                # (in_dim, hidden_dim)
        db1 = np.sum(dZ1, axis=0, keepdims=True)  # (1, hidden_dim)

        grads = {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}
        return loss, grads

    def update_params(self, grads):
        """Apply one gradient-descent step in place, using self.lr."""
        self.W1 -= self.lr * grads["dW1"]
        self.b1 -= self.lr * grads["db1"]
        self.W2 -= self.lr * grads["dW2"]
        self.b2 -= self.lr * grads["db2"]

    def predict(self, X):
        """Return the most probable class index for each row of X."""
        _, A2 = self.forward(X)
        return np.argmax(A2, axis=1)  # (batch,)

# ----------------------------
# 3) Training Loop
# ----------------------------
if __name__ == "__main__":
    # Build the toy dataset; integer labels are only needed for scoring.
    X, Y = generate_multiclass_data(n_per_class=200, seed=0)
    Y_int = np.argmax(Y, axis=1)

    # The generator already shuffles the samples, so the first 80% is used
    # for training and the remaining 20% for validation.
    split = int(0.8 * X.shape[0])
    X_train, X_val = X[:split], X[split:]
    Y_train, Y_val = Y[:split], Y[split:]
    Y_train_int, Y_val_int = Y_int[:split], Y_int[split:]

    # Model and training budget
    model = MLPMulti(in_dim=2, hidden_dim=8, out_dim=3, lr=0.1)
    epochs = 200

    for epoch in range(1, epochs + 1):
        # One full-batch gradient-descent step.
        cache, A2 = model.forward(X_train)
        loss, grads = model.compute_loss_and_grad(cache, A2, Y_train)
        model.update_params(grads)

        # Report only on the first epoch and on every 50th one.
        if epoch != 1 and epoch % 50 != 0:
            continue

        # The printed loss was measured before this epoch's update, while
        # the accuracies are measured with the updated weights.
        train_preds = model.predict(X_train)
        val_preds = model.predict(X_val)
        train_acc = (train_preds == Y_train_int).mean()
        val_acc = (val_preds == Y_val_int).mean()
        print(f"Epoch {epoch:3d} | Loss: {loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

    # Final evaluation with the fully trained weights
    train_acc = (model.predict(X_train) == Y_train_int).mean()
    val_acc = (model.predict(X_val) == Y_val_int).mean()
    print(f"\nFinal Train Acc: {train_acc:.4f} | Final Val Acc: {val_acc:.4f}")

Epoch   1 | Loss: 1.0826 | Train Acc: 0.7979 | Val Acc: 0.7500
Epoch  50 | Loss: 0.3472 | Train Acc: 0.9000 | Val Acc: 0.9000
Epoch 100 | Loss: 0.2255 | Train Acc: 0.9250 | Val Acc: 0.9167
Epoch 150 | Loss: 0.1962 | Train Acc: 0.9271 | Val Acc: 0.9333
Epoch 200 | Loss: 0.1854 | Train Acc: 0.9271 | Val Acc: 0.9333

Final Train Acc: 0.9271 | Final Val Acc: 0.9333
