Task2. Change the training procedure by modifying the sequence of step-sizes or using different step-sizes for different
variables (using momentum).

In [23]:
import numpy as np
from scipy.io import loadmat


def standardizeCols(M, mu=None, sigma2=None):
    M = M.astype(float)  # Ensure M is float for precision
    nrows, ncols = M.shape
    if mu is None or sigma2 is None:
        mu = np.mean(M, axis=0)
        sigma2 = np.std(M, axis=0)
        sigma2[sigma2 < np.finfo(float).eps] = 1  # Avoid division by zero
    S = M - mu  # Subtract mean
    if ncols > 0:
        S = S / sigma2  # Divide by standard deviation
    return S, mu, sigma2


def form_weights(w, nVars, nHidden, nLabels):
    offset = 0
    inputWeights = w[offset:nVars * nHidden[0]].reshape(nVars, nHidden[0])
    offset += nVars * nHidden[0]
    hiddenWeights = []
    for h in range(1, len(nHidden)):
        size = nHidden[h-1] * nHidden[h]
        hiddenWeights.append(
            w[offset:offset+size].reshape(nHidden[h-1], nHidden[h]))
        offset += size
    outputWeights = w[offset:offset + nHidden[-1]
                      * nLabels].reshape(nHidden[-1], nLabels)
    return inputWeights, hiddenWeights, outputWeights


def sech2(x):
    return 1 - np.tanh(x)**2

In [24]:
# Forward pass
def MLPclassificationPredict(w, X, nHidden, nLabels):
    # w: 1-D, stores all weights
    nInstances, nVars = X.shape

    # Form Weights
    inputWeights, hiddenWeights, outputWeights = form_weights(w, nVars, nHidden, nLabels)

    # Compute Output in batch
    activations = X
    for h in range(len(nHidden)):
        activations = np.tanh(activations @ (inputWeights if h == 0 else hiddenWeights[h-1]))
    y = activations @ outputWeights

    # Pick the class with the highest score
    y = np.argmax(y, axis=1, keepdims=True) + 1
    return y

In [25]:
def MLPclassificationLoss(w, X, y, nHidden, nLabels):
    # y should be one-hot encoded
    nInstances, nVars = X.shape
    inputWeights, hiddenWeights, outputWeights = form_weights(w, nVars, nHidden, nLabels)

    # Forward pass
    activations = [X]
    for h in range(len(nHidden)):
        z = activations[-1] @ (inputWeights if h == 0 else hiddenWeights[h-1])
        a = np.tanh(z)
        activations.append(a)

    # Output layer
    yhat = activations[-1] @ outputWeights
    f = np.sum((yhat - y)**2)  # Loss

    # Backpropagation
    gOutput = 2 * activations[-1].T @ (yhat - y)

    # Gradients for hidden and input weights
    gHidden = []
    delta = 2 * (yhat - y) @ outputWeights.T * sech2(activations[-1])
    for h in range(len(nHidden) - 1, 0, -1):
        gHidden.append(activations[h].T @ delta)
        delta = (delta @ hiddenWeights[h-1].T) * sech2(activations[h])
    gHidden.append(activations[0].T @ delta)  # Input weights gradient
    gHidden.reverse()

    # Flatten gradients into vector
    gradients = [gHidden[0].flatten()]
    for g in gHidden[1:]:
        gradients.append(g.flatten())
    gradients.append(gOutput.flatten())
    g = np.concatenate(gradients)

    return f, g.reshape(-1, 1)

### Modify the sequence of step-sizes

We implement learning rate decay: current_lr = initial_lr / (1 + decayRate * iteration), and test with different initial learning rates.

In [26]:
data = loadmat('digits.mat')
X = data['X']
y = data['y'].flatten()
yvalid = data['yvalid']
ytest = data['ytest']

n, d = X.shape  # 5000, 256
nLabels = np.max(y)  # 10
yExpanded = 2 * np.eye(nLabels)[y - 1] - 1  # turn into one-hot vector
t = data['Xvalid'].shape[0]  # 5000
t2 = data['Xtest'].shape[0]  # 1000


# Standardize columns and add bias
X, mu, sigma = standardizeCols(X)
X = np.hstack([np.ones((n, 1)), X])

# Apply the same transformation to the validation/test data
Xvalid, _, _ = standardizeCols(data['Xvalid'], mu, sigma)
Xvalid = np.hstack([np.ones((t, 1)), Xvalid])

Xtest, _, _ = standardizeCols(data['Xtest'], mu, sigma)
Xtest = np.hstack([np.ones((t2, 1)), Xtest])


nHidden = [10]  # Single hidden layer with 10 neurons
d += 1  # Adjust (n, d) = X.shape

# Count number of parameters
nParams = d * nHidden[0]  # Input layer and first hidden layer
nParams += sum(nHidden[h-1] * nHidden[h] for h in range(1, len(nHidden)))
nParams += nHidden[-1] * nLabels  # Last hidden layer and output layer


# Train with stochastic gradient
maxIter = 100000
initialStepSizes = [0.1, 0.01, 1e-3, 1e-4, 1e-5]
decayRate = 1e-5

for initialStepSize in initialStepSizes:
    print(f'\n-----Training with initialStepSize = {initialStepSize}-----')
    w = np.random.randn(nParams, 1)
    for iter in range(0, maxIter):
        stepSize = initialStepSize * (1 / (1 + decayRate * iter))
        if (iter + 1) % (maxIter // 5) == 0:
            yhat = MLPclassificationPredict(w, Xvalid, nHidden, nLabels)
            validation_error = np.mean(yhat != yvalid)
            print(f'Training iteration = {iter + 1}, \
                validation error = {validation_error:.6f}')

        # batch = 1
        i = np.random.randint(n)
        f, g = MLPclassificationLoss(w, X[i:i+1], yExpanded[i:i+1], nHidden, nLabels)
        w -= stepSize * g

    # Evaluate test error
    yhat = MLPclassificationPredict(w, Xtest, nHidden, nLabels)
    test_error = np.mean(yhat != ytest)
    print(f'Test error with final model = {test_error:.6f}')


-----Training with initialStepSize = 0.1-----
Training iteration = 20000,                 validation error = 0.879600
Training iteration = 40000,                 validation error = 0.895400
Training iteration = 60000,                 validation error = 0.881800
Training iteration = 80000,                 validation error = 0.873200
Training iteration = 100000,                 validation error = 0.901200
Test error with final model = 0.900000

-----Training with initialStepSize = 0.01-----
Training iteration = 20000,                 validation error = 0.212200
Training iteration = 40000,                 validation error = 0.210200
Training iteration = 60000,                 validation error = 0.228600
Training iteration = 80000,                 validation error = 0.220600
Training iteration = 100000,                 validation error = 0.214600
Test error with final model = 0.230000

-----Training with initialStepSize = 0.001-----
Training iteration = 20000,                 validation e

An initial learning rate of 0.01 or 0.001 is good for this problem.

## Use momentum

In [36]:
data = loadmat('digits.mat')
X = data['X']
y = data['y'].flatten()
yvalid = data['yvalid']
ytest = data['ytest']

n, d = X.shape  # 5000, 256
nLabels = np.max(y)  # 10
yExpanded = 2 * np.eye(nLabels)[y - 1] - 1  # turn into one-hot vector
t = data['Xvalid'].shape[0]  # 5000
t2 = data['Xtest'].shape[0]  # 1000


# Standardize columns and add bias
X, mu, sigma = standardizeCols(X)
X = np.hstack([np.ones((n, 1)), X])

# Apply the same transformation to the validation/test data
Xvalid, _, _ = standardizeCols(data['Xvalid'], mu, sigma)
Xvalid = np.hstack([np.ones((t, 1)), Xvalid])

Xtest, _, _ = standardizeCols(data['Xtest'], mu, sigma)
Xtest = np.hstack([np.ones((t2, 1)), Xtest])


nHidden = [10]  # Single hidden layer with 10 neurons
d += 1  # Adjust (n, d) = X.shape

nParams = d * nHidden[0]
nParams += sum(nHidden[h-1] * nHidden[h] for h in range(1, len(nHidden)))
nParams += nHidden[-1] * nLabels
maxIter = 100000
initialStepSize = 1e-3
stepSize = initialStepSize
decayRate = 1e-5
momentums = [0.5, 0.6, 0.7, 0.8, 0.9]

for momentum in momentums:
    print(f'\n-----Training with momentum = {momentum}-----')
    w = np.random.randn(nParams, 1)
    w_diff = np.zeros_like(w)
    for iter in range(0, maxIter):
        stepSize = initialStepSize * (1 / (1 + decayRate * iter))
        if (iter + 1) % (maxIter // 5) == 0:
            yhat = MLPclassificationPredict(w, Xvalid, nHidden, nLabels)
            validation_error = np.mean(yhat != yvalid)
            print(f'Training iteration = {iter + 1}, \
                validation error = {validation_error:.6f}')
        i = np.random.randint(n)
        f, g = MLPclassificationLoss(
            w, X[i:i+1], yExpanded[i:i+1], nHidden, nLabels)
        w_diff = momentum * w_diff - stepSize * g
        w += w_diff

    # Evaluate test error
    yhat = MLPclassificationPredict(w, Xtest, nHidden, nLabels)
    test_error = np.mean(yhat != ytest)
    print(f'Test error with final model = {test_error:.6f}')


-----Training with momentum = 0.5-----
Training iteration = 20000,                 validation error = 0.295000
Training iteration = 40000,                 validation error = 0.240200
Training iteration = 60000,                 validation error = 0.228600
Training iteration = 80000,                 validation error = 0.212200
Training iteration = 100000,                 validation error = 0.213800
Test error with final model = 0.214000

-----Training with momentum = 0.6-----
Training iteration = 20000,                 validation error = 0.306600
Training iteration = 40000,                 validation error = 0.244000
Training iteration = 60000,                 validation error = 0.216000
Training iteration = 80000,                 validation error = 0.214400
Training iteration = 100000,                 validation error = 0.202400
Test error with final model = 0.211000

-----Training with momentum = 0.7-----
Training iteration = 20000,                 validation error = 0.254400
Training

A momentum strength of 0.9 is good for this problem.