In [2]:
import numpy as np

def softmax(logits):
    """Return the softmax of *logits* taken over the last axis.

    For the 2-D ``(n_samples, n_classes)`` case this normalises each row.
    Working on the last axis (rather than a hard-coded ``axis=1``) also lets
    a single 1-D vector of class scores, or any batched N-D array, be passed.

    Args:
        logits: Array-like of raw class scores; classes lie on the last axis.

    Returns:
        ``numpy.ndarray`` with the same shape as *logits* whose entries along
        the last axis are non-negative and sum to 1.
    """
    # Subtracting the per-slice maximum leaves the result unchanged
    # mathematically but keeps np.exp from overflowing on large scores.
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    exp_logits = np.exp(shifted)
    return exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)

def compute_loss_and_gradients(X, y, theta):
    """Return the mean cross-entropy loss and its gradient w.r.t. *theta*.

    Log-probabilities are computed directly with the log-sum-exp form instead
    of taking ``np.log`` of the softmax output. When the true class's logit is
    far below the row maximum its probability underflows to exactly 0, and
    ``log(0)`` would turn the loss into ``inf``; the log-space form keeps the
    loss finite.

    Args:
        X: 2-D feature matrix with one row per instance.
        y: Integer class index for each row of *X*.
        theta: Weight matrix with one row per column of *X* and one column
            per class.

    Returns:
        Tuple ``(loss, gradients)``: the scalar mean negative log-likelihood
        and an array shaped like *theta*.
    """
    m = X.shape[0]
    rows = np.arange(m)

    logits = X.dot(theta)
    # Shift by the row maximum so np.exp cannot overflow.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    log_proba = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))

    # Pick the log-probability assigned to the true class of every row.
    loss = -np.mean(log_proba[rows, y])

    y_proba = np.exp(log_proba)
    y_one_hot = np.zeros_like(y_proba)
    y_one_hot[rows, y] = 1
    gradients = (1 / m) * X.T.dot(y_proba - y_one_hot)
    return loss, gradients

def batch_gradient_descent(
    X,
    y,
    learning_rate=0.01,
    n_epochs=1000,
    batch_size=32,
    early_stopping_rounds=10,
):
    """Fit softmax-regression weights with full-batch gradient descent.

    Each epoch evaluates the loss and gradient on the whole of ``X``/``y``
    and takes one step of size ``learning_rate``. Training stops early once
    the loss has failed to improve on its best value for
    ``early_stopping_rounds`` consecutive epochs.

    Args:
        X: 2-D feature matrix, one row per instance.
        y: Non-negative integer class index for each row of ``X``.
        learning_rate: Step size applied to the gradient.
        n_epochs: Maximum number of update steps.
        batch_size: Accepted for interface compatibility but not used;
            every update is computed on the full dataset.
        early_stopping_rounds: Number of consecutive non-improving epochs
            that triggers early stopping.

    Returns:
        The weight matrix after the last update performed, with one row per
        column of ``X`` and one column per class.
    """
    n_features = X.shape[1]
    # Size the weights from the largest label rather than the number of
    # distinct labels, so a label set with gaps (e.g. {0, 2}) still maps
    # every label to a valid column.
    n_classes = int(np.max(y)) + 1
    theta = np.random.randn(n_features, n_classes)
    best_loss = np.inf
    epochs_no_improvement = 0

    for epoch in range(n_epochs):
        # The loss reported here belongs to theta *before* this epoch's step.
        loss, gradients = compute_loss_and_gradients(X, y, theta)
        theta -= learning_rate * gradients

        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss}")

        if loss < best_loss:
            best_loss = loss
            epochs_no_improvement = 0
        else:
            epochs_no_improvement += 1

        # ">=" so that exactly `early_stopping_rounds` stale epochs stop the
        # run (a strict ">" would wait for one epoch more than requested).
        if epochs_no_improvement >= early_stopping_rounds:
            print("Early stopping...")
            break

    return theta

# Fix the global NumPy seed so the synthetic data and the initial weights
# are reproducible from run to run.
np.random.seed(42)

# Synthetic problem: 1000 instances with 3 uniform features, each labelled
# with one of 3 classes drawn at random.
X = np.random.rand(1000, 3)
y = np.random.randint(0, 3, size=1000)

# Prepend a column of ones so the first row of theta acts as the bias term.
X_bias = np.hstack((np.ones((len(X), 1)), X))

theta = batch_gradient_descent(X_bias, y)
print("Training complete.")

Epoch 0, Loss: 1.6817859304548384
Epoch 10, Loss: 1.628480953990373
Epoch 20, Loss: 1.5796805155007905
Epoch 30, Loss: 1.535215639901795
Epoch 40, Loss: 1.4948728023419178
Epoch 50, Loss: 1.458405248834145
Epoch 60, Loss: 1.4255444611825523
Epoch 70, Loss: 1.396010840780338
Epoch 80, Loss: 1.3695229693406223
Epoch 90, Loss: 1.3458051039228396
Epoch 100, Loss: 1.3245928238657652
Epoch 110, Loss: 1.3056369393839782
Epoch 120, Loss: 1.288705889712266
Epoch 130, Loss: 1.2735869123057537
Epoch 140, Loss: 1.2600862703869111
Epoch 150, Loss: 1.2480288015980923
Epoch 160, Loss: 1.237257010453709
Epoch 170, Loss: 1.227629882399196
Epoch 180, Loss: 1.2190215543167984
Epoch 190, Loss: 1.2113199388380085
Epoch 200, Loss: 1.2044253691793414
Epoch 210, Loss: 1.1982493073966365
Epoch 220, Loss: 1.192713141214395
Epoch 230, Loss: 1.1877470818986202
Epoch 240, Loss: 1.1832891669368601
Epoch 250, Loss: 1.1792843656000334
Epoch 260, Loss: 1.1756837819754242
Epoch 270, Loss: 1.1724439481330264
Epoch 280, 