Task4. Add l2 regularization (or l1-regularization) of the weights to your loss function. For neural networks this is
called weight decay. An alternate form of regularization that is sometimes used is early stopping, which is
stopping training when the error on a validation set stops decreasing.

To check whether the model is overfitting, we print both the training and validation losses.

In [1]:
import numpy as np
from scipy.io import loadmat


def standardizeCols(M, mu=None, sigma2=None):
    M = M.astype(float)  # Ensure M is float for precision
    nrows, ncols = M.shape
    if mu is None or sigma2 is None:
        mu = np.mean(M, axis=0)
        sigma2 = np.std(M, axis=0)
        sigma2[sigma2 < np.finfo(float).eps] = 1  # Avoid division by zero
    S = M - mu  # Subtract mean
    if ncols > 0:
        S = S / sigma2  # Divide by standard deviation
    return S, mu, sigma2


def form_weights(w, nVars, nHidden, nLabels):
    offset = 0
    inputWeights = w[offset:nVars * nHidden[0]].reshape(nVars, nHidden[0])
    offset += nVars * nHidden[0]
    hiddenWeights = []
    for h in range(1, len(nHidden)):
        size = nHidden[h-1] * nHidden[h]
        hiddenWeights.append(
            w[offset:offset+size].reshape(nHidden[h-1], nHidden[h]))
        offset += size
    outputWeights = w[offset:offset + nHidden[-1]
                      * nLabels].reshape(nHidden[-1], nLabels)
    return inputWeights, hiddenWeights, outputWeights


def sech2(x):
    return 1 - np.tanh(x)**2

In [2]:
def MLPclassificationPredict(w, X, nHidden, nLabels):
    nInstances, nVars = X.shape
    inputWeights, hiddenWeights, outputWeights = form_weights(w, nVars, nHidden, nLabels)
    activations = X
    for h in range(len(nHidden)):
        activations = np.tanh(activations @ (inputWeights if h == 0 else hiddenWeights[h-1]))
    y = activations @ outputWeights
    y = np.argmax(y, axis=1, keepdims=True) + 1
    return y

def MLPclassificationLoss(w, X, y, nHidden, nLabels):
    nInstances, nVars = X.shape
    inputWeights, hiddenWeights, outputWeights = form_weights(w, nVars, nHidden, nLabels)

    activations = [X]
    for h in range(len(nHidden)):
        z = activations[-1] @ (inputWeights if h == 0 else hiddenWeights[h-1])
        a = np.tanh(z)
        activations.append(a)
    yhat = activations[-1] @ outputWeights
    f = np.sum((yhat - y)**2)  # Loss

    gOutput = 2 * activations[-1].T @ (yhat - y)
    gHidden = []
    delta = 2 * (yhat - y) @ outputWeights.T * sech2(activations[-1])
    for h in range(len(nHidden) - 1, 0, -1):
        gHidden.append(activations[h].T @ delta)
        delta = (delta @ hiddenWeights[h-1].T) * sech2(activations[h])
    gHidden.append(activations[0].T @ delta)
    gHidden.reverse()

    gradients = [gHidden[0].flatten()]
    for g in gHidden[1:]:
        gradients.append(g.flatten())
    gradients.append(gOutput.flatten())
    g = np.concatenate(gradients)

    return f, g.reshape(-1, 1)

### Early stopping

In [9]:
data = loadmat('digits.mat')
X = data['X']
y = data['y'].flatten()
yvalid = data['yvalid']
ytest = data['ytest']

n, d = X.shape  # 5000, 256
nLabels = np.max(y)  # 10
yExpanded = 2 * np.eye(nLabels)[y - 1] - 1  # turn into one-hot vector
t = data['Xvalid'].shape[0]  # 5000
t2 = data['Xtest'].shape[0]  # 1000


# Standardize columns and add bias
X, mu, sigma = standardizeCols(X)
X = np.hstack([np.ones((n, 1)), X])

# Apply the same transformation to the validation/test data
Xvalid, _, _ = standardizeCols(data['Xvalid'], mu, sigma)
Xvalid = np.hstack([np.ones((t, 1)), Xvalid])

Xtest, _, _ = standardizeCols(data['Xtest'], mu, sigma)
Xtest = np.hstack([np.ones((t2, 1)), Xtest])


nHidden = [10]
d += 1

nParams = d * nHidden[0]
nParams += sum(nHidden[h-1] * nHidden[h] for h in range(1, len(nHidden)))
nParams += nHidden[-1] * nLabels
maxIter = 100000
initialStepSize = 1e-3
stepSize = initialStepSize
decayRate = 1e-5
momentum = 0.9

w = np.random.randn(nParams, 1)
w_diff = np.zeros_like(w)
best_w = np.copy(w)
best_validation_error = float('inf')
patience = 5  # Number of iterations to wait before early stopping
patience_counter = 0


for iter in range(0, maxIter):
    stepSize = initialStepSize * (1 / (1 + decayRate * iter))
    if (iter + 1) % (maxIter // 10) == 0:
        yhat1 = MLPclassificationPredict(w, X, nHidden, nLabels)
        train_error = np.mean(yhat1 != y.reshape(-1, 1))
        yhat = MLPclassificationPredict(w, Xvalid, nHidden, nLabels)
        validation_error = np.mean(yhat != yvalid)
        print(f'Training iteration = {iter + 1}, training error = {train_error:.6f}, validation error = {validation_error:.6f}')
            
        # Early Stopping check
        if validation_error < best_validation_error:
            best_validation_error = validation_error
            best_w = np.copy(w)
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered after {iter + 1} iterations with validation error {best_validation_error:.6f}")
                break
    i = np.random.randint(n)
    f, g = MLPclassificationLoss(
        w, X[i:i+1], yExpanded[i:i+1], nHidden, nLabels)
    w_diff = momentum * w_diff - stepSize * g
    w += w_diff

# Evaluate test error
yhat = MLPclassificationPredict(best_w, Xtest, nHidden, nLabels)
test_error = np.mean(yhat != ytest)
print(f'Test error with final model = {test_error:.6f}')

Training iteration = 10000, training error = 0.295600, validation error = 0.315600
Training iteration = 20000, training error = 0.290000, validation error = 0.313800
Training iteration = 30000, training error = 0.186800, validation error = 0.197600
Training iteration = 40000, training error = 0.206000, validation error = 0.227600
Training iteration = 50000, training error = 0.187600, validation error = 0.214600
Training iteration = 60000, training error = 0.174800, validation error = 0.200200
Training iteration = 70000, training error = 0.176800, validation error = 0.202600
Training iteration = 80000, training error = 0.199200, validation error = 0.221800
Early stopping triggered after 80000 iterations with validation error 0.197600
Test error with final model = 0.211000


To show that the code actually works, we set the `patience` parameter to a small value. And indeed this gets better test accuracy than without early stopping.

### Add L2 regularization

In [10]:
def MLPclassificationLoss_l2(w, X, y, nHidden, nLabels, reg_lambda):
    nInstances, nVars = X.shape
    inputWeights, hiddenWeights, outputWeights = form_weights(
        w, nVars, nHidden, nLabels)

    activations = [X]
    for h in range(len(nHidden)):
        z = activations[-1] @ (inputWeights if h == 0 else hiddenWeights[h-1])
        a = np.tanh(z)
        activations.append(a)
    yhat = activations[-1] @ outputWeights

    # Add L2 regularization term to the loss function
    reg_loss = (reg_lambda / 2) * np.sum(w**2)
    total_loss = np.sum((yhat - y)**2) + reg_loss

    gOutput = 2 * activations[-1].T @ (yhat - y) + reg_lambda * outputWeights
    gHidden = []
    delta = 2 * (yhat - y) @ outputWeights.T * sech2(activations[-1])
    for h in range(len(nHidden) - 1, 0, -1):
        gHidden.append(activations[h].T @ delta +
                       reg_lambda * hiddenWeights[h-1])
        delta = (delta @ hiddenWeights[h-1].T) * sech2(activations[h])
    gHidden.append(activations[0].T @ delta + reg_lambda * inputWeights)
    gHidden.reverse()

    gradients = [gHidden[0].flatten()]
    for g in gHidden[1:]:
        gradients.append(g.flatten())
    gradients.append(gOutput.flatten())
    g = np.concatenate(gradients)

    return total_loss, g.reshape(-1, 1)

In [11]:
data = loadmat('digits.mat')
X = data['X']
y = data['y'].flatten()
yvalid = data['yvalid']
ytest = data['ytest']
n, d = X.shape  # 5000, 256
nLabels = np.max(y)  # 10
yExpanded = 2 * np.eye(nLabels)[y - 1] - 1  # turn into one-hot vector
t = data['Xvalid'].shape[0]  # 5000
t2 = data['Xtest'].shape[0]  # 1000
X, mu, sigma = standardizeCols(X)
X = np.hstack([np.ones((n, 1)), X])
Xvalid, _, _ = standardizeCols(data['Xvalid'], mu, sigma)
Xvalid = np.hstack([np.ones((t, 1)), Xvalid])
Xtest, _, _ = standardizeCols(data['Xtest'], mu, sigma)
Xtest = np.hstack([np.ones((t2, 1)), Xtest])

nHidden = [10]
d += 1

nParams = d * nHidden[0]
nParams += sum(nHidden[h-1] * nHidden[h] for h in range(1, len(nHidden)))
nParams += nHidden[-1] * nLabels
maxIter = 100000
initialStepSize = 1e-3
decayRate = 1e-5
momentum = 0.9
reg_lambdas = [1.0, 0.5, 0.2, 0.1, 0.01, 0.001]

w = np.random.randn(nParams, 1)
w_diff = np.zeros_like(w)
best_w = np.copy(w)
best_validation_error = float('inf')

for reg_lambda in reg_lambdas:
    print(f"-----Training with regularization lambda = {reg_lambda}-----")
    for iter in range(0, maxIter):
        stepSize = initialStepSize * (1 / (1 + decayRate * iter))
        if (iter + 1) % (maxIter // 10) == 0:
            yhat1 = MLPclassificationPredict(w, X, nHidden, nLabels)
            train_error = np.mean(yhat1 != y.reshape(-1, 1))
            yhat = MLPclassificationPredict(w, Xvalid, nHidden, nLabels)
            validation_error = np.mean(yhat != yvalid)
            print(f'Training iteration = {iter + 1}, training error = {train_error:.6f}, validation error = {validation_error:.6f}')

            if validation_error < best_validation_error:
                best_validation_error = validation_error
                best_w = np.copy(w)
        
        i = np.random.randint(n)
        _, g = MLPclassificationLoss_l2(
            w, X[i:i+1], yExpanded[i:i+1], nHidden, nLabels, reg_lambda)
        w_diff = momentum * w_diff - stepSize * g
        w += w_diff

    # Evaluate test error
    yhat = MLPclassificationPredict(best_w, Xtest, nHidden, nLabels)
    test_error = np.mean(yhat != ytest)
    print(f'Test error with final model = {test_error:.6f}')

-----Training with regularization lambda = 1.0-----
Training iteration = 10000, training error = 0.588800, validation error = 0.596200
Training iteration = 20000, training error = 0.503400, validation error = 0.524800
Training iteration = 30000, training error = 0.463800, validation error = 0.461600
Training iteration = 40000, training error = 0.529200, validation error = 0.540400
Training iteration = 50000, training error = 0.365400, validation error = 0.370000
Training iteration = 60000, training error = 0.480400, validation error = 0.510400
Training iteration = 70000, training error = 0.493200, validation error = 0.527600
Training iteration = 80000, training error = 0.455400, validation error = 0.463600
Training iteration = 90000, training error = 0.376400, validation error = 0.377000
Training iteration = 100000, training error = 0.388000, validation error = 0.393800
Test error with final model = 0.386000
-----Training with regularization lambda = 0.5-----
Training iteration = 10000

Note that a small value of `l2` is preferred, as a large value will make the model underfit.