### Batch Gradient Descent with early stopping for Softmax Regression

In [1]:
from sklearn.datasets import load_iris
import numpy as np

# Load the iris dataset; with as_frame=True the `data` entry can be indexed by
# column name (as done in the next cell).
iris = load_iris(as_frame=True)
# Listing the returned object shows the keys it exposes.
list(iris)

['data',
 'target',
 'frame',
 'target_names',
 'DESCR',
 'feature_names',
 'filename',
 'data_module']

In [2]:
# Features: only the two petal measurements. Labels: the integer class ids.
X = iris.data[["petal length (cm)", "petal width (cm)"]].to_numpy()
y = iris["target"].to_numpy()

Adding the bias term (a column of ones) to every instance

In [3]:
# Prepend a constant column of ones (the bias input) to the feature matrix.
X_with_bias = np.column_stack([np.ones(len(X)), X])

Making a train / validation / test split

In [4]:
# Fractions of the dataset held out for testing and for validation; whatever
# remains is used for training.
test_ratio = 0.2
validation_ratio = 0.2
total_size = len(X_with_bias)

test_size = int(total_size * test_ratio)
validation_size = int(total_size * validation_ratio)
train_size = total_size - test_size - validation_size

# Fixed seed so the shuffled split is the same on every run.
np.random.seed(42)
rnd_indices = np.random.permutation(total_size)

# Slice the shuffled indices with explicit, non-negative boundaries.
# Negative-offset slices ([train_size:-test_size] and [-test_size:]) silently
# misbehave when test_size == 0: "-0" is just 0, so the validation slice would
# be empty and the test slice would be the entire dataset.
valid_end = train_size + validation_size

X_train = X_with_bias[rnd_indices[:train_size]]
y_train = y[rnd_indices[:train_size]]
X_valid = X_with_bias[rnd_indices[train_size:valid_end]]
y_valid = y[rnd_indices[train_size:valid_end]]
X_test = X_with_bias[rnd_indices[valid_end:]]
y_test = y[rnd_indices[valid_end:]]

# The three splits must partition the dataset exactly.
assert len(X_train) + len(X_valid) + len(X_test) == total_size
assert (len(X_valid), len(X_test)) == (validation_size, test_size)

In [5]:
def to_one_hot(y, n_classes=None):
    """Convert a 1-D array of integer class ids into a one-hot matrix.

    Parameters
    ----------
    y : array-like of int, shape (m,)
        Class ids; each is used as a column index into the result.
    n_classes : int, optional
        Number of columns in the result. When omitted it is inferred as
        ``y.max() + 1``, which is only correct if ``y`` contains the largest
        class id. Pass it explicitly for subsets (for example a small
        validation split) that may be missing some classes.

    Returns
    -------
    numpy.ndarray of float, shape (m, n_classes)
        Row ``i`` is all zeros except for a 1.0 in column ``y[i]``.

    Raises
    ------
    ValueError
        If ``y`` is empty and ``n_classes`` is not given, since the number of
        classes cannot be inferred from nothing.
    """
    y = np.asarray(y)
    if y.size == 0:
        if n_classes is None:
            raise ValueError("n_classes is required when y is empty")
        return np.zeros((0, n_classes))
    if n_classes is None:
        n_classes = y.max() + 1
    # Row k of the identity matrix is the one-hot vector for class k, so
    # indexing the identity with the labels yields one row per instance.
    return np.eye(n_classes)[y]

In [6]:
# Peek at the first ten training labels.
y_train[:10]

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1])

In [7]:
# The same ten labels, one-hot encoded (one row per label).
to_one_hot(y_train[:10])

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [8]:
# One-hot encode the labels of each split.
# NOTE(review): to_one_hot is called without an explicit class count here, so
# each result's width comes from that split's own largest label; this assumes
# every split contains the highest class id. Confirm for small splits.
Y_train_one_hot, Y_valid_one_hot, Y_test_one_hot = map(
    to_one_hot, (y_train, y_valid, y_test)
)

In [9]:
# Standardize the feature columns; column 0 (the bias column of ones) is left
# untouched. The statistics are computed on the training split only and then
# reused, unchanged, for the validation and test splits.
mean = X_train[:, 1:].mean(axis=0)
std = X_train[:, 1:].std(axis=0)

# Centre, then scale, each split in place.
X_train[:, 1:] -= mean
X_train[:, 1:] /= std
X_valid[:, 1:] -= mean
X_valid[:, 1:] /= std
X_test[:, 1:] -= mean
X_test[:, 1:] /= std

In [10]:
def softmax(logits):
    """Row-wise softmax of a 2-D array of scores.

    Parameters
    ----------
    logits : array-like, shape (m, k)
        One row of ``k`` raw class scores per instance.

    Returns
    -------
    numpy.ndarray, shape (m, k)
        Non-negative values where every row sums to 1.

    Notes
    -----
    The maximum of each row is subtracted before exponentiating. Softmax is
    unchanged by adding a constant to a row, so the result is mathematically
    the same, but the largest exponent becomes ``exp(0) == 1`` and ``np.exp``
    can no longer overflow to ``inf`` (which would produce ``inf / inf = nan``).
    """
    logits = np.asarray(logits)
    shifted = logits - logits.max(axis=1, keepdims=True)
    exps = np.exp(shifted)
    exp_sums = exps.sum(axis=1, keepdims=True)
    return exps / exp_sums

In [11]:
# Model dimensions, read off the training data.
n_inputs = X_train.shape[-1]  # == 3 (2 features plus the bias term)
n_outputs = np.unique(y_train).size  # == 3 (there are 3 iris classes)

In [20]:
# Softmax regression trained with batch gradient descent: every epoch uses the
# whole training set to take one gradient step.
# NOTE(review): this cell always runs all n_epochs; the validation loss is only
# printed, it is not used to stop early.
eta = 0.5  # learning rate (step size applied to the gradient)
n_epochs = 5001  # 5001 rather than 5000 so that epoch 5000 is also reported
m = len(X_train)  # number of training instances
epsilon = 1e-5  # added inside np.log below so log(0) cannot occur

# Seeded random initialisation so the run is repeatable.
np.random.seed(42)
Theta = np.random.randn(n_inputs, n_outputs)  # one column of parameters per class

for epoch in range(n_epochs):
    # Forward pass on the training set: scores, then per-class probabilities.
    logits = X_train @ Theta
    Y_proba = softmax(logits)
    if epoch % 1000 == 0:
        # Progress report every 1000 epochs: mean cross-entropy on the
        # validation split, using Theta as it is before this epoch's update.
        Y_proba_valid = softmax(X_valid @ Theta)
        xentropy_losses = -(Y_valid_one_hot * np.log(Y_proba_valid + epsilon))
        print(epoch, xentropy_losses.sum(axis=1).mean())
    # Residuals (predicted probability minus one-hot target), averaged into a
    # gradient of shape (n_inputs, n_outputs), followed by the descent step.
    error = Y_proba - Y_train_one_hot
    gradients = 1 / m * X_train.T @ error
    Theta = Theta - eta * gradients

0 3.7085808486476917
1000 0.14519367480830644
2000 0.1301309575504088
3000 0.12009639326384539
4000 0.11372961364786884
5000 0.11002459532472425


In [22]:
# Inspect the residuals (predicted probability minus one-hot target) left in
# `error` by the most recent run of the training loop.
# NOTE(review): this displays the whole array; a slice such as error[:5] would
# keep the output short.
error

array([[ 4.66800613e-05, -5.88018995e-03,  5.83350989e-03],
       [-1.90755932e-03,  1.90755932e-03,  1.05295323e-15],
       [ 2.23643901e-18,  1.66893328e-07, -1.66893328e-07],
       [ 1.00652123e-05, -2.77405207e-02,  2.77304555e-02],
       [ 7.61172155e-06, -5.23982815e-02,  5.23906697e-02],
       [-2.04768137e-03,  2.04768137e-03,  9.70581574e-16],
       [ 8.12331322e-04, -8.63656122e-04,  5.13247994e-05],
       [ 7.42398502e-12,  1.60323392e-03, -1.60323393e-03],
       [ 1.00652123e-05, -2.77405207e-02,  2.77304555e-02],
       [ 6.30387562e-04, -7.29891140e-04,  9.95035771e-05],
       [ 8.51176665e-10,  2.11675000e-02, -2.11675008e-02],
       [-1.70741651e-04,  1.70741651e-04,  3.61035371e-18],
       [-2.53616664e-04,  2.53616664e-04,  7.66511406e-18],
       [-2.36232711e-04,  2.36232711e-04,  8.31461676e-18],
       [-9.97277324e-04,  9.97277324e-04,  1.98684103e-16],
       [ 2.21441154e-06, -1.58258172e-01,  1.58255957e-01],
       [ 1.05770932e-13,  1.07901098e-04