In [1]:
import numpy as np

# Softmax function
def softmax(z):
    """Return the softmax of ``z`` taken along axis 1.

    The maximum along axis 1 is subtracted before exponentiating, so the
    largest exponent is ``exp(0)``. This does not change the result
    mathematically but keeps ``np.exp`` from overflowing on large scores.

    Args:
        z: Array of scores with at least two dimensions (axis 1 must exist).

    Returns:
        Array of the same shape as ``z`` whose entries along axis 1 sum to 1.
    """
    row_max = np.max(z, axis=1, keepdims=True)
    exps = np.exp(z - row_max)
    normalizer = exps.sum(axis=1, keepdims=True)
    return exps / normalizer

# Cross-entropy loss
def cross_entropy(y_true, y_pred):
    """Return the cross-entropy between targets and predictions, averaged over rows.

    Args:
        y_true: Target array (one-hot rows in this file); its first dimension
            is the number of samples used for averaging.
        y_pred: Predicted probabilities, broadcast-compatible with ``y_true``.

    Returns:
        ``-sum(y_true * log(y_pred + 1e-8)) / y_true.shape[0]``.
    """
    n_samples = y_true.shape[0]
    # The small constant keeps log() finite when a predicted probability is 0.
    log_probs = np.log(y_pred + 1e-8)
    total = np.sum(y_true * log_probs)
    return -total / n_samples

# One-hot encode labels
def one_hot(y, num_classes):
    """Return one-hot rows for the labels in ``y``.

    Each label is used as a row index into a ``num_classes`` x ``num_classes``
    identity matrix, so row ``i`` of the result has a 1.0 in column ``y[i]``.

    Args:
        y: Integer label(s) used to index the identity matrix.
        num_classes: Size of the identity matrix (width of each one-hot row).

    Returns:
        Float array of the selected identity rows.
    """
    identity = np.eye(num_classes)
    return identity[y]

# Batch GD with early stopping
def softmax_regression(X_train, y_train, X_val, y_val, lr=0.1, epochs=500, patience=10):
    """Fit softmax regression with full-batch gradient descent and early stopping.

    Each epoch takes one gradient step on the whole training set, then
    evaluates the cross-entropy on the validation set. The parameters with
    the lowest validation loss seen so far are remembered, and training stops
    once the validation loss has failed to improve for ``patience``
    consecutive epochs.

    Args:
        X_train: 2-D array of training features, shape ``(m, n)``.
        y_train: Training labels. The number of classes is the number of
            distinct values in ``y_train``, and labels are used as indices by
            ``one_hot``, so they should be integers ``0 .. num_classes - 1``.
        X_val: Validation features with ``n`` columns.
        y_val: Validation labels, encoded like ``y_train``.
        lr: Learning rate applied to each gradient step.
        epochs: Maximum number of gradient steps.
        patience: Number of consecutive non-improving epochs tolerated before
            stopping.

    Returns:
        Tuple ``(W, b)`` with ``W`` of shape ``(n, num_classes)`` and ``b`` of
        shape ``(1, num_classes)``: the parameters that achieved the lowest
        validation loss. If the validation loss never improves (for example
        ``epochs == 0`` or the loss is NaN), the zero-initialised parameters
        are returned.
    """
    m, n = X_train.shape
    num_classes = np.unique(y_train).size
    W = np.zeros((n, num_classes))
    b = np.zeros((1, num_classes))

    y_train_oh = one_hot(y_train, num_classes)
    y_val_oh = one_hot(y_val, num_classes)

    best_val_loss = np.inf
    # Seed the "best" parameters with the initial ones so that callers always
    # receive arrays, even when no epoch improves on the initial infinity.
    best_W, best_b = W.copy(), b.copy()
    wait = 0

    for epoch in range(epochs):
        # Forward pass
        probs = softmax(np.dot(X_train, W) + b)

        # Gradient of the mean cross-entropy w.r.t. the logits (up to 1/m);
        # computed once and shared by both parameter gradients.
        error = probs - y_train_oh
        grad_W = np.dot(X_train.T, error) / m
        grad_b = np.sum(error, axis=0, keepdims=True) / m

        # Update weights (in place)
        W -= lr * grad_W
        b -= lr * grad_b

        # Validation loss with the freshly updated parameters
        val_probs = softmax(np.dot(X_val, W) + b)
        val_loss = cross_entropy(y_val_oh, val_probs)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Copy, because W and b keep being modified in place.
            best_W, best_b = W.copy(), b.copy()
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                print(f"Early stopping at epoch {epoch}")
                break

    return best_W, best_b
