Task8. Do ‘fine-tuning’ of the last layer. Fix the parameters of all the layers except the last one, and solve for the parameters of the last layer exactly as a convex optimization problem. E.g., treat the input to the last layer as the features and use techniques from earlier in the course (this is particularly fast if you use the squared error, since it has a closed-form solution).

To gain a closed-form solution, we can use the squared error loss function, as used in Tasks 1-4.

In [16]:
import numpy as np
from scipy.io import loadmat


def standardizeCols(M, mu=None, sigma2=None):
    M = M.astype(float)  # Ensure M is float for precision
    nrows, ncols = M.shape
    if mu is None or sigma2 is None:
        mu = np.mean(M, axis=0)
        sigma2 = np.std(M, axis=0)
        sigma2[sigma2 < np.finfo(float).eps] = 1  # Avoid division by zero
    S = M - mu  # Subtract mean
    if ncols > 0:
        S = S / sigma2  # Divide by standard deviation
    return S, mu, sigma2


def form_weights(w, nVars, nHidden, nLabels):
    offset = 0
    inputWeights = w[offset:nVars * nHidden[0]].reshape(nVars, nHidden[0])
    offset += nVars * nHidden[0]
    hiddenWeights = []
    for h in range(1, len(nHidden)):
        size = nHidden[h-1] * nHidden[h]
        hiddenWeights.append(
            w[offset:offset+size].reshape(nHidden[h-1], nHidden[h]))
        offset += size
    outputWeights = w[offset:offset + nHidden[-1]
                      * nLabels].reshape(nHidden[-1], nLabels)
    return inputWeights, hiddenWeights, outputWeights


def sech2(x):
    return 1 - np.tanh(x)**2

In [17]:
def MLPclassificationPredict(w, X, nHidden, nLabels):
    nInstances, nVars = X.shape
    inputWeights, hiddenWeights, outputWeights = form_weights(
        w, nVars, nHidden, nLabels)
    activations = X
    for h in range(len(nHidden)):
        activations = np.tanh(
            activations @ (inputWeights if h == 0 else hiddenWeights[h-1]))
    y = activations @ outputWeights
    y = np.argmax(y, axis=1, keepdims=True) + 1
    return y


def MLPclassificationLoss(w, X, y, nHidden, nLabels, reg_lambda):
    nInstances, nVars = X.shape
    inputWeights, hiddenWeights, outputWeights = form_weights(
        w, nVars, nHidden, nLabels)

    activations = [X]
    for h in range(len(nHidden)):
        z = activations[-1] @ (inputWeights if h == 0 else hiddenWeights[h-1])
        a = np.tanh(z)
        activations.append(a)
    yhat = activations[-1] @ outputWeights

    # Add L2 regularization term to the loss function
    reg_loss = (reg_lambda / 2) * np.sum(w**2)
    total_loss = 1/2 * np.sum((yhat - y)**2) + reg_loss

    gOutput = activations[-1].T @ (yhat - y) + reg_lambda * outputWeights
    gHidden = []
    delta = (yhat - y) @ outputWeights.T * sech2(activations[-1])
    for h in range(len(nHidden) - 1, 0, -1):
        gHidden.append(activations[h].T @ delta +
                       reg_lambda * hiddenWeights[h-1])
        delta = (delta @ hiddenWeights[h-1].T) * sech2(activations[h])
    gHidden.append(activations[0].T @ delta + reg_lambda * inputWeights)
    gHidden.reverse()

    gradients = [gHidden[0].flatten()]
    for g in gHidden[1:]:
        gradients.append(g.flatten())
    gradients.append(gOutput.flatten())
    g = np.concatenate(gradients)

    return total_loss, g.reshape(-1, 1)

In [18]:
def fine_tuned_w(w, X, y, nHidden, nLabels, reg_lambda):
    nInstances, nVars = X.shape
    offset = 0
    inputWeights = w[offset:nVars * nHidden[0]].reshape(nVars, nHidden[0])
    offset += nVars * nHidden[0]
    hiddenWeights = []
    for h in range(1, len(nHidden)):
        size = nHidden[h-1] * nHidden[h]
        hiddenWeights.append(
            w[offset:offset+size].reshape(nHidden[h-1], nHidden[h]))
        offset += size
    outputWeights = w[offset:offset + nHidden[-1]
                      * nLabels].reshape(nHidden[-1], nLabels)

    # Compute activations of the last hidden layer
    activations = X
    for h in range(len(nHidden)):
        activations = np.tanh(activations @ (inputWeights if h == 0 else hiddenWeights[h-1]))

    # Solve for the output weights using the normal equation
    A = activations
    A_T_A = A.T @ A
    regularization_term = reg_lambda * np.eye(A_T_A.shape[0])
    inverse_term = np.linalg.inv(A_T_A + regularization_term)
    solution = inverse_term @ A.T @ y
    w[offset:offset + nHidden[-1]
      * nLabels] = solution.reshape(-1, 1)
    return w

In [23]:
data = loadmat('digits.mat')
X = data['X']
y = data['y'].flatten()
yvalid = data['yvalid']
ytest = data['ytest']
n, d = X.shape  # 5000, 256
nLabels = np.max(y)  # 10
yExpanded = 2 * np.eye(nLabels)[y - 1] - 1  # turn into one-hot vector
t = data['Xvalid'].shape[0]  # 5000
t2 = data['Xtest'].shape[0]  # 1000
X, mu, sigma = standardizeCols(X)
X = np.hstack([np.ones((n, 1)), X])
Xvalid, _, _ = standardizeCols(data['Xvalid'], mu, sigma)
Xvalid = np.hstack([np.ones((t, 1)), Xvalid])
Xtest, _, _ = standardizeCols(data['Xtest'], mu, sigma)
Xtest = np.hstack([np.ones((t2, 1)), Xtest])

nHidden = [10]
d += 1

nParams = d * nHidden[0]
nParams += sum(nHidden[h-1] * nHidden[h] for h in range(1, len(nHidden)))
nParams += nHidden[-1] * nLabels
maxIter = 100000
initialStepSize = 1e-3
decayRate = 1e-5
momentum = 0.9
reg_lambda = 0.1

w = np.random.randn(nParams, 1)
w_diff = np.zeros_like(w)
best_w = np.copy(w)
best_validation_error = float('inf')


for iter in range(0, maxIter):
    stepSize = initialStepSize * (1 / (1 + decayRate * iter))
    if (iter + 1) % (maxIter // 10) == 0:
        yhat1 = MLPclassificationPredict(w, X, nHidden, nLabels)
        train_error = np.mean(yhat1 != y.reshape(-1, 1))
        yhat = MLPclassificationPredict(w, Xvalid, nHidden, nLabels)
        validation_error = np.mean(yhat != yvalid)
        print(f'Training iteration = {iter + 1}, training error = {train_error:.6f}, validation error = {validation_error:.6f}')

        if validation_error < best_validation_error:
            best_validation_error = validation_error
            best_w = np.copy(w)

    i = np.random.randint(n)
    _, g = MLPclassificationLoss(
        w, X[i:i+1], yExpanded[i:i+1], nHidden, nLabels, reg_lambda)
    w_diff = momentum * w_diff - stepSize * g
    w += w_diff

# Evaluate test error
yhat = MLPclassificationPredict(best_w, Xtest, nHidden, nLabels)
test_error = np.mean(yhat != ytest)
print(f'Test error with final model = {test_error:.6f}')

# Finetuning the output weights
fine_tuned_w(best_w, X, yExpanded, nHidden, nLabels, reg_lambda)
yhat = MLPclassificationPredict(best_w, Xtest, nHidden, nLabels)
test_error = np.mean(yhat != ytest)
print(f'Test error after fine-tuning = {test_error:.6f}')

Training iteration = 10000, training error = 0.165000, validation error = 0.174000
Training iteration = 20000, training error = 0.177400, validation error = 0.202200
Training iteration = 30000, training error = 0.157800, validation error = 0.167800
Training iteration = 40000, training error = 0.143400, validation error = 0.163000
Training iteration = 50000, training error = 0.146800, validation error = 0.170800
Training iteration = 60000, training error = 0.164800, validation error = 0.190200
Training iteration = 70000, training error = 0.136000, validation error = 0.146800
Training iteration = 80000, training error = 0.138000, validation error = 0.156800
Training iteration = 90000, training error = 0.129200, validation error = 0.143000
Training iteration = 100000, training error = 0.151200, validation error = 0.174800
Test error with final model = 0.149000
Test error after fine-tuning = 0.151000
