Task1. Change the network structure: the vector *nHidden* specifies the number of hidden units in each layer.


Some points to note:
1. We use as many matrix operations as possible to make the code more efficient, such as making forward passes for all the training data at once (eliminating the need for a for loop of instances).
2. We encapsulate the weight forming process in a function.

In [5]:
import numpy as np
from scipy.io import loadmat


def standardizeCols(M, mu=None, sigma2=None):
    """
    Standardize each column of matrix M to have zero mean and unit standard deviation.
    
    Parameters:
    M (numpy.ndarray): The input matrix.
    mu (numpy.ndarray, optional): Precomputed mean of the columns.
    sigma2 (numpy.ndarray, optional): Precomputed standard deviation of the columns.
    
    Returns:
    S (numpy.ndarray): The standardized matrix.
    mu (numpy.ndarray): Mean of the columns.
    sigma2 (numpy.ndarray): Standard deviation of the columns.
    """
    M = M.astype(float)  # Ensure M is float for precision
    nrows, ncols = M.shape

    if mu is None or sigma2 is None:
        mu = np.mean(M, axis=0)
        sigma2 = np.std(M, axis=0)
        sigma2[sigma2 < np.finfo(float).eps] = 1  # Avoid division by zero

    S = M - mu  # Subtract mean
    if ncols > 0:
        S = S / sigma2  # Divide by standard deviation

    return S, mu, sigma2


def form_weights(w, nVars, nHidden, nLabels):
    offset = 0
    inputWeights = w[offset:nVars * nHidden[0]].reshape(nVars, nHidden[0])
    offset += nVars * nHidden[0]
    hiddenWeights = []
    for h in range(1, len(nHidden)):
        size = nHidden[h-1] * nHidden[h]
        hiddenWeights.append(
            w[offset:offset+size].reshape(nHidden[h-1], nHidden[h]))
        offset += size
    outputWeights = w[offset:offset + nHidden[-1]
                      * nLabels].reshape(nHidden[-1], nLabels)
    return inputWeights, hiddenWeights, outputWeights


def sech2(x):
    return 1 - np.tanh(x)**2

In [6]:
# Forward pass
def MLPclassificationPredict(w, X, nHidden, nLabels):
    # w: 1-D, stores all weights
    nInstances, nVars = X.shape

    # Form Weights
    inputWeights, hiddenWeights, outputWeights = form_weights(w, nVars, nHidden, nLabels)

    # Compute Output in batch
    activations = X
    for h in range(len(nHidden)):
        activations = np.tanh(activations @ (inputWeights if h == 0 else hiddenWeights[h-1]))
    y = activations @ outputWeights

    # Pick the class with the highest score
    y = np.argmax(y, axis=1, keepdims=True) + 1
    return y

In [7]:
def MLPclassificationLoss(w, X, y, nHidden, nLabels):
    # y should be one-hot encoded
    nInstances, nVars = X.shape
    inputWeights, hiddenWeights, outputWeights = form_weights(w, nVars, nHidden, nLabels)

    # Forward pass
    activations = [X]
    for h in range(len(nHidden)):
        z = activations[-1] @ (inputWeights if h == 0 else hiddenWeights[h-1])
        a = np.tanh(z)
        activations.append(a)

    # Output layer
    yhat = activations[-1] @ outputWeights
    f = np.sum((yhat - y)**2)  # Loss

    # Backpropagation
    gOutput = 2 * activations[-1].T @ (yhat - y)

    # Gradients for hidden and input weights
    gHidden = []
    delta = 2 * (yhat - y) @ outputWeights.T * sech2(activations[-1])
    for h in range(len(nHidden) - 1, 0, -1):
        gHidden.append(activations[h].T @ delta)
        delta = (delta @ hiddenWeights[h-1].T) * sech2(activations[h])
    gHidden.append(activations[0].T @ delta)  # Input weights gradient
    gHidden.reverse()

    # Flatten gradients into vector
    gradients = [gHidden[0].flatten()]
    for g in gHidden[1:]:
        gradients.append(g.flatten())
    gradients.append(gOutput.flatten())
    g = np.concatenate(gradients)

    return f, g.reshape(-1, 1)

In [8]:
data = loadmat('digits.mat')
X = data['X']
y = data['y'].flatten()
yvalid = data['yvalid']
ytest = data['ytest']

n, d = X.shape  # 5000, 256
nLabels = np.max(y)  # 10
yExpanded = 2 * np.eye(nLabels)[y - 1] - 1  # turn into one-hot vector
t = data['Xvalid'].shape[0]  # 5000
t2 = data['Xtest'].shape[0]  # 1000


# Standardize columns and add bias
X, mu, sigma = standardizeCols(X)
X = np.hstack([np.ones((n, 1)), X])

# Apply the same transformation to the validation/test data
Xvalid, _, _ = standardizeCols(data['Xvalid'], mu, sigma)
Xvalid = np.hstack([np.ones((t, 1)), Xvalid])

Xtest, _, _ = standardizeCols(data['Xtest'], mu, sigma)
Xtest = np.hstack([np.ones((t2, 1)), Xtest])


# Choose network structure
nHiddens = [[10], [128], [10, 10], [128, 128], [10, 10, 10], [128, 128, 128]]
d += 1  # Adjust (n, d) = X.shape

for nHidden in nHiddens:
    print(f'\n-----Training with nHiddens = {nHidden}-----')
    nParams = d * nHidden[0]  # Input layer and first hidden layer
    nParams += sum(nHidden[h-1] * nHidden[h] for h in range(1, len(nHidden)))
    nParams += nHidden[-1] * nLabels  # Last hidden layer and output layer
    # initialize weights
    w = np.random.randn(nParams, 1)

    # Train with stochastic gradient
    maxIter = 100000
    stepSize = 1e-3

    for iter in range(0, maxIter):
        if (iter + 1) % (maxIter // 5) == 0:
            yhat = MLPclassificationPredict(w, Xvalid, nHidden, nLabels)
            validation_error = np.mean(yhat != yvalid)
            print(f'Training iteration = {iter + 1}, \
                validation error = {validation_error:.6f}')

        # batch = 1
        i = np.random.randint(n)
        f, g = MLPclassificationLoss(w, X[i:i+1], yExpanded[i:i+1], nHidden, nLabels)
        w -= stepSize * g

    # Evaluate test error
    yhat = MLPclassificationPredict(w, Xtest, nHidden, nLabels)
    test_error = np.mean(yhat != ytest)
    print(f'Test error with final model = {test_error:.6f}')


-----Training with nHiddens = [10]-----
Training iteration = 20000,                 validation error = 0.420000
Training iteration = 40000,                 validation error = 0.308400
Training iteration = 60000,                 validation error = 0.250800
Training iteration = 80000,                 validation error = 0.229400
Training iteration = 100000,                 validation error = 0.225000
Test error with final model = 0.213000

-----Training with nHiddens = [128]-----
Training iteration = 20000,                 validation error = 0.224400
Training iteration = 40000,                 validation error = 0.205400
Training iteration = 60000,                 validation error = 0.195400
Training iteration = 80000,                 validation error = 0.210600
Training iteration = 100000,                 validation error = 0.174000
Test error with final model = 0.167000

-----Training with nHiddens = [10, 10]-----
Training iteration = 20000,                 validation error = 0.469200


We observe that validation error decreases as the number of hidden units increases or the number of layers increases. The increase of unit numbers affects the result more significantly.