In [53]:
# imports
import numpy as np
import matplotlib.pyplot as plt
import time, random, numpy as np, torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import zipfile
import os

In [None]:
if not os.path.exists("data"):
    with zipfile.ZipFile("data.zip", "r") as zip_ref:
        zip_ref.extractall("data")

In [None]:
# hyperparameters
NUM_EPOCHS = 5
NUM_TRAINING = 1000
NUM_TESTING = 500
NUM_VALIDATION = 500

NUM_FACE_TRAINING = 451
NUM_FACE_VALIDATION = 301
NUM_FACE_TESTING = 150

IMAGE_HEIGHT = 28
IMAGE_WIDTH = 28
NUM_CLASSES = 10

In [None]:
# filepaths
train_data_file = "data/digitdata/trainingimages"
train_label_file = "data/digitdata/traininglabels"
val_data_file = "data/digitdata/validationimages"
val_label_file = "data/digitdata/validationlabels"
test_data_file = "data/digitdata/testimages"
test_label_file = "data/digitdata/testlabels"

face_train_data_file = "data/facedata/facedatatrain"
face_train_label_file = "data/facedata/facedatatrainlabels"
face_val_data_file   = "data/facedata/facedatavalidation"
face_val_label_file  = "data/facedata/facedatavalidationlabels"
face_test_data_file  = "data/facedata/facedatatest"
face_test_label_file = "data/facedata/facedatatestlabels"


## Data Loading and Preprocessing

In [None]:

def read_data_file(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
    return [line.rstrip("\n") for line in lines]

def extract_features(raw_data):
    features = []
    for i in range(0, len(raw_data), 28):
        image = raw_data[i:i+28]
        feature = [1 if ch != ' ' else 0 for row in image for ch in row]
        features.append(feature)
    return features


def read_labels(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
    return [int(line.strip()) for line in lines]

def load_dataset(data_file, label_file, size=None):
    raw_data = read_data_file(data_file)
    raw_labels = read_labels(label_file)

    features = extract_features(raw_data)
    if size is not None:
        combined = list(zip(features, raw_labels))
        random.shuffle(combined)
        features, raw_labels = zip(*combined[:size])

    return list(features), list(raw_labels)

def one_hot_encode(y, num_classes=10):
    encoded = np.zeros((num_classes, len(y)))
    for idx, val in enumerate(y):
        encoded[val][idx] = 1
    return encoded

def evaluate(predictions, labels):
    correct = sum(p == t for p, t in zip(predictions, labels))
    return correct / len(labels)


In [None]:
def extract_face_features(raw_data):
    features = []
    for i in range(0, len(raw_data), 70):  # 70 rows per image
        image = raw_data[i:i+70]
        assert all(len(row) == 60 for row in image), "Expected 60 columns per row in face image"
        feature = [1 if ch != ' ' else 0 for row in image for ch in row]
        features.append(feature)
    return features

def load_face_dataset(data_file, label_file, size=None):
    raw_data = read_data_file(data_file)
    raw_labels = read_labels(label_file)

    features = extract_face_features(raw_data)
    if size is not None:
        combined = list(zip(features, raw_labels))
        random.shuffle(combined)
        features, raw_labels = zip(*combined[:size])

    return list(features), list(raw_labels)

def one_hot_encode_face(y, num_classes=2):
    encoded = np.zeros((num_classes, len(y)))
    for idx, val in enumerate(y):
        encoded[val][idx] = 1
    return encoded


# Three-Layer Neural Network: Manual Implementations of Forward Pass, Back-Propagation, and Weight Update

## Neural Network Functions

In [45]:
import numpy as np

# Activation functions
def relu(z):
    return np.maximum(0, z)

def relu_derivative(z):
    return (z > 0).astype(float)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_derivative(a):
    return a * (1 - a)

def softmax(Z):
    e_Z = np.exp(Z - np.max(Z, axis=0, keepdims=True))
    return e_Z / np.sum(e_Z, axis=0, keepdims=True)


# Initialize weights and biases
def initialize_parameters(input_size, hidden1_size, hidden2_size, output_size):
    np.random.seed(42)
    return {
        'W1': np.random.randn(hidden1_size, input_size) * 0.01,
        'b1': np.zeros((hidden1_size, 1)),
        'W2': np.random.randn(hidden2_size, hidden1_size) * 0.01,
        'b2': np.zeros((hidden2_size, 1)),
        'W3': np.random.randn(output_size, hidden2_size) * 0.01,
        'b3': np.zeros((output_size, 1))
    }

# Forward pass
def forward_propagation(X, parameters):
    W1, b1 = parameters['W1'], parameters['b1']
    W2, b2 = parameters['W2'], parameters['b2']
    W3, b3 = parameters['W3'], parameters['b3']

    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)

    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)

    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    cache = (Z1, A1, Z2, A2, Z3, A3)
    return A3, cache


def forward_propagation_face(X, parameters, dropout_rate=0.5, training=True):
    W1, b1 = parameters['W1'], parameters['b1']
    W2, b2 = parameters['W2'], parameters['b2']
    W3, b3 = parameters['W3'], parameters['b3']

    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)

    if training:
        D1 = (np.random.rand(*A1.shape) < dropout_rate).astype(float)
        A1 *= D1
        A1 /= dropout_rate
    else:
        D1 = None

    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)

    if training:
        D2 = (np.random.rand(*A2.shape) < dropout_rate).astype(float)
        A2 *= D2
        A2 /= dropout_rate
    else:
        D2 = None

    Z3 = np.dot(W3, A2) + b3
    A3 = softmax(Z3)

    # Include dropout masks in the cache
    cache = (Z1, A1, D1, Z2, A2, D2, Z3, A3)
    return A3, cache



# Loss
def compute_loss(Y_hat, Y):
    m = Y.shape[1]
    return -np.sum(Y * np.log(Y_hat + 1e-8) + (1 - Y) * np.log(1 - Y_hat + 1e-8)) / m

def compute_loss_l2(Y_hat, Y, parameters, lambda_reg=0.1):
    m = Y.shape[1]
    cross_entropy = -np.sum(Y * np.log(Y_hat + 1e-8)) / m
    l2 = (lambda_reg / (2 * m)) * (
        np.sum(np.square(parameters['W1'])) +
        np.sum(np.square(parameters['W2'])) +
        np.sum(np.square(parameters['W3']))
    )
    return cross_entropy + l2

# Backward pass
def backward_propagation(X, Y, parameters, cache):
    m = X.shape[1]
    W2, W3 = parameters['W2'], parameters['W3']
    Z1, A1, Z2, A2, Z3, A3 = cache

    dZ3 = A3 - Y
    dW3 = (1/m) * np.dot(dZ3, A2.T)
    db3 = (1/m) * np.sum(dZ3, axis=1, keepdims=True)

    dA2 = np.dot(W3.T, dZ3)
    dZ2 = dA2 * relu_derivative(Z2)
    dW2 = (1/m) * np.dot(dZ2, A1.T)
    db2 = (1/m) * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = dA1 * relu_derivative(Z1)
    dW1 = (1/m) * np.dot(dZ1, X.T)
    db1 = (1/m) * np.sum(dZ1, axis=1, keepdims=True)

    return {
        'dW1': dW1, 'db1': db1,
        'dW2': dW2, 'db2': db2,
        'dW3': dW3, 'db3': db3
    }


def backward_propagation_face(X, Y, parameters, cache, dropout_rate=0.5):
    m = X.shape[1]
    W2, W3 = parameters['W2'], parameters['W3']
    Z1, A1, D1, Z2, A2, D2, Z3, A3 = cache

    dZ3 = A3 - Y
    dW3 = (1/m) * np.dot(dZ3, A2.T)
    db3 = (1/m) * np.sum(dZ3, axis=1, keepdims=True)

    dA2 = np.dot(W3.T, dZ3)
    dA2 *= D2
    dA2 /= dropout_rate
    dZ2 = dA2 * relu_derivative(Z2)
    dW2 = (1/m) * np.dot(dZ2, A1.T)
    db2 = (1/m) * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)
    dA1 *= D1
    dA1 /= dropout_rate
    dZ1 = dA1 * relu_derivative(Z1)
    dW1 = (1/m) * np.dot(dZ1, X.T)
    db1 = (1/m) * np.sum(dZ1, axis=1, keepdims=True)

    return {
        'dW1': dW1, 'db1': db1,
        'dW2': dW2, 'db2': db2,
        'dW3': dW3, 'db3': db3
    }

# Gradient descent update
def update_parameters(params, grads, lr):
    for key in params:
        params[key] -= lr * grads['d' + key]
    return params

# Prediction
def predict_nn(X, parameters):
    Y_hat, _ = forward_propagation(X, parameters)
    return np.argmax(Y_hat, axis=0)

def predict_nn_face(X, parameters):
    Y_hat, _ = forward_propagation_face(X, parameters, training=False)
    return np.argmax(Y_hat, axis=0)

# Training loop
def train_neural_net(X_train, y_train, X_test, y_test,
                     input_size, h1, h2, output_size,
                     epochs=1000, lr=0.1, print_loss=True,
                     X_val=None, y_val=None, early_stopping=False, patience=10):
    
    parameters = initialize_parameters(input_size, h1, h2, output_size)
    best_params = None
    best_val_acc = 0
    val_acc_counter = 0

    for epoch in range(epochs):
        # Forward and backpropagation
        Y_hat, cache = forward_propagation(X_train, parameters)
        loss = compute_loss(Y_hat, y_train)
        grads = backward_propagation(X_train, y_train, parameters, cache)
        parameters = update_parameters(parameters, grads, lr)

        # Check performance every 100 epochs
        if epoch % 100 == 0 or epoch == epochs - 1:
            train_preds = predict_nn(X_train, parameters)
            train_acc = evaluate(train_preds, np.argmax(y_train, axis=0))
            
            if X_val is not None and y_val is not None:
                val_preds = predict_nn(X_val, parameters)
                val_acc = evaluate(val_preds, np.argmax(y_val, axis=0))

                if print_loss:
                    print(f"Epoch {epoch}: Loss = {loss:.4f} | Train Acc = {train_acc:.4f} | Val Acc = {val_acc:.4f}")

                # Save best model
                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    best_params = {k: v.copy() for k, v in parameters.items()}
                    val_acc_counter = 0
                else:
                    val_acc_counter += 1
                    if early_stopping and val_acc_counter >= patience:
                        print("Early stopping triggered.")
                        break
            else:
                if print_loss:
                    print(f"Epoch {epoch}: Loss = {loss:.4f} | Train Acc = {train_acc:.4f}")

    final_params = best_params if best_params is not None else parameters

    # Final test evaluation
    test_preds = predict_nn(X_test, final_params)
    test_acc = evaluate(test_preds, np.argmax(y_test, axis=0))
    print(f"Final Test Accuracy: {test_acc:.4f}")
    return final_params



def train_neural_net_face(X_train, y_train, X_test, y_test,
                     input_size, h1, h2, output_size,
                     epochs=1000, lr=0.1, print_loss=True,
                     X_val=None, y_val=None, early_stopping=False,
                     patience=10, dropout_rate=0.5, lambda_reg=0.1):  
    
    parameters = initialize_parameters(input_size, h1, h2, output_size)
    best_params = None
    best_val_acc = 0
    val_acc_counter = 0

    for epoch in range(epochs):
        # === DROPOUT + L2 ===
        Y_hat, cache = forward_propagation_face(X_train, parameters, dropout_rate=dropout_rate, training=True)
        loss = compute_loss_l2(Y_hat, y_train, parameters, lambda_reg=lambda_reg)
        grads = backward_propagation_face(X_train, y_train, parameters, cache, dropout_rate=dropout_rate)
        parameters = update_parameters(parameters, grads, lr)

        if epoch % 100 == 0 or epoch == epochs - 1:
            train_preds = predict_nn_face(X_train, parameters)
            train_acc = evaluate(train_preds, np.argmax(y_train, axis=0))

            if X_val is not None and y_val is not None:
                val_preds = predict_nn_face(X_val, parameters)
                val_acc = evaluate(val_preds, np.argmax(y_val, axis=0))

                if print_loss:
                    print(f"Epoch {epoch}: Loss = {loss:.4f} | Train Acc = {train_acc:.4f} | Val Acc = {val_acc:.4f}")

                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    best_params = {k: v.copy() for k, v in parameters.items()}
                    val_acc_counter = 0
                else:
                    val_acc_counter += 1
                    if early_stopping and val_acc_counter >= patience:
                        print("Early stopping triggered.")
                        break
            else:
                if print_loss:
                    print(f"Epoch {epoch}: Loss = {loss:.4f} | Train Acc = {train_acc:.4f}")

    final_params = best_params if best_params is not None else parameters
    test_preds = predict_nn_face(X_test, final_params)
    test_acc = evaluate(test_preds, np.argmax(y_test, axis=0))
    print(f"Final Test Accuracy: {test_acc:.4f}")
    return final_params


## Digit Classification

In [94]:
print("Testing neural net on digit data")

X_train_raw, y_train_raw = load_dataset(train_data_file, train_label_file, size=NUM_TRAINING)
X_val_raw, y_val_raw = load_dataset(val_data_file, val_label_file, size=NUM_VALIDATION)
X_test_raw, y_test_raw = load_dataset(test_data_file, test_label_file, size=NUM_TESTING)

X_train = np.array(X_train_raw).T
X_val = np.array(X_val_raw).T
X_test = np.array(X_test_raw).T

y_train = one_hot_encode(y_train_raw)
y_val = one_hot_encode(y_val_raw)
y_test = one_hot_encode(y_test_raw)

# Train on increasing percentages of DIGIT data 
percentages = [0.1 * i for i in range(1, 11)]  # 10% to 100%
total_digit_samples = X_train.shape[1]

digit_results = []

for pct in percentages:
    n = int(pct * total_digit_samples)    
    X_subset = X_train[:, :n]
    y_subset = y_train[:, :n]

    print(f"\n DIGITS: Training on {n} samples ({int(pct * 100)}%)")
    
    trained_params = train_neural_net(
        X_subset, y_subset,
        X_test, y_test,
        input_size=784, h1=128, h2=64, output_size=10,
        epochs=1000, lr=0.1,
        X_val=X_val, y_val=y_val,
        early_stopping=True, patience=10
    )

    test_preds = predict_nn(X_test, trained_params)
    test_acc = evaluate(test_preds, np.argmax(y_test, axis=0))
    digit_results.append((n, test_acc))
    print(f"DIGITS Test Accuracy with {n} samples: {test_acc:.4f}")

Testing neural net on digit data

 DIGITS: Training on 100 samples (10%)
Epoch 0: Loss = 6.9313 | Train Acc = 0.1200 | Val Acc = 0.0900
Epoch 100: Loss = 3.2256 | Train Acc = 0.1400 | Val Acc = 0.0960
Epoch 200: Loss = 3.1577 | Train Acc = 0.2000 | Val Acc = 0.1460
Epoch 300: Loss = 1.5617 | Train Acc = 0.7700 | Val Acc = 0.5240
Epoch 400: Loss = 0.3130 | Train Acc = 1.0000 | Val Acc = 0.6300
Epoch 500: Loss = 0.0584 | Train Acc = 1.0000 | Val Acc = 0.6380
Epoch 600: Loss = 0.0249 | Train Acc = 1.0000 | Val Acc = 0.6400
Epoch 700: Loss = 0.0146 | Train Acc = 1.0000 | Val Acc = 0.6420
Epoch 800: Loss = 0.0099 | Train Acc = 1.0000 | Val Acc = 0.6380
Epoch 900: Loss = 0.0074 | Train Acc = 1.0000 | Val Acc = 0.6400
Epoch 999: Loss = 0.0058 | Train Acc = 1.0000 | Val Acc = 0.6380
Final Test Accuracy: 0.5900
DIGITS Test Accuracy with 100 samples: 0.5900

 DIGITS: Training on 200 samples (20%)
Epoch 0: Loss = 6.9313 | Train Acc = 0.1100 | Val Acc = 0.0840
Epoch 100: Loss = 3.2451 | Train Acc 

## Face Classification

In [None]:

print("Testing neural net on face data")

# Load and process face data
X_face_train_raw, y_face_train_raw = load_face_dataset(face_train_data_file, face_train_label_file, size=NUM_FACE_TRAINING)
X_face_val_raw, y_face_val_raw = load_face_dataset(face_val_data_file, face_val_label_file, size=NUM_FACE_VALIDATION)
X_face_test_raw, y_face_test_raw = load_face_dataset(face_test_data_file, face_test_label_file, size=NUM_FACE_TESTING)

X_face_train = np.array(X_face_train_raw).T
X_face_val = np.array(X_face_val_raw).T
X_face_test = np.array(X_face_test_raw).T

y_face_train = one_hot_encode_face(y_face_train_raw)
y_face_val = one_hot_encode_face(y_face_val_raw)
y_face_test = one_hot_encode_face(y_face_test_raw)

# Train on increasing percentages of FACE data 
percentages = [0.1 * i for i in range(1, 11)]  # 10% to 100%
total_face_samples = X_face_train.shape[1]

face_results = []

for pct in percentages:
    n = int(pct * total_face_samples)

    X_subset = X_face_train[:, :n]
    y_subset = y_face_train[:, :n]

    print(f"\n FACES: Training on {n} samples ({int(pct * 100)}%)")

    trained_params = train_neural_net_face(
        X_subset, y_subset,
        X_face_test, y_face_test,
        input_size=4200, h1=32, h2=16, output_size=2,
        epochs=1000, lr=0.1,
        X_val=X_face_val, y_val=y_face_val,
        early_stopping=True, patience=10,
        dropout_rate=0.5,  # dropout to decrease overfitting   
        lambda_reg=0.5  #l2 regularization to decrease overfitting
    )


    test_preds = predict_nn_face(X_face_test, trained_params)
    test_acc = evaluate(test_preds, np.argmax(y_face_test, axis=0))
    face_results.append((n, test_acc))
    print(f"FACES Test Accuracy with {n} samples: {test_acc:.4f}")

Testing neural net on face data

 FACES: Training on 45 samples (10%)
Epoch 0: Loss = 0.7682 | Train Acc = 0.5556 | Val Acc = 0.5183
Epoch 100: Loss = 0.7618 | Train Acc = 0.5556 | Val Acc = 0.5183
Epoch 200: Loss = 0.7194 | Train Acc = 0.5556 | Val Acc = 0.5183
Epoch 300: Loss = 0.1878 | Train Acc = 1.0000 | Val Acc = 0.7143
Epoch 400: Loss = 0.1487 | Train Acc = 1.0000 | Val Acc = 0.6910
Epoch 500: Loss = 0.1312 | Train Acc = 1.0000 | Val Acc = 0.6146
Epoch 600: Loss = 0.1297 | Train Acc = 1.0000 | Val Acc = 0.6478
Epoch 700: Loss = 0.1343 | Train Acc = 1.0000 | Val Acc = 0.6744
Epoch 800: Loss = 0.1378 | Train Acc = 1.0000 | Val Acc = 0.6512
Epoch 900: Loss = 0.1354 | Train Acc = 1.0000 | Val Acc = 0.6512
Epoch 999: Loss = 0.1407 | Train Acc = 1.0000 | Val Acc = 0.6512
Final Test Accuracy: 0.6733
FACES Test Accuracy with 45 samples: 0.6733

 FACES: Training on 90 samples (20%)
Epoch 0: Loss = 0.7307 | Train Acc = 0.5000 | Val Acc = 0.4751
Epoch 100: Loss = 0.7306 | Train Acc = 0.744

# Three-Layer Neural Network: PyTorch Implementation

## Helper Functions

In [100]:
def to_tensor(x, dtype=torch.float32):
    return torch.tensor(np.asarray(x), dtype=dtype)

def make_loader(X, y, batch_size=32, shuffle=True):
    ds = TensorDataset(to_tensor(X), torch.tensor(y, dtype=torch.long))
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle)

def accuracy(logits, y):
    return (logits.argmax(1) == y).float().mean().item()

## Neural Network Functions

In [95]:
# three-layer neural network w/ dropout
class MLP(nn.Module):
    def __init__(self, dim_in, h1, h2, n_out, dropout=0.5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim_in, h1),
            nn.BatchNorm1d(h1),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(h1, h2),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(h2, n_out)
        )
    def forward(self, x):
        return self.net(x)

In [123]:
class ClassificationNetwork(nn.Module):
    def __init__(self, in_shape, n_out, dropout=0.5): # in_shape formatted like (channels, height, width)
        super().__init__()
        C, H, W = in_shape
        self.features = nn.Sequential(
            nn.Unflatten(dim=1, unflattened_size=in_shape),
            nn.Conv2d(C, 32, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2)
        )

        H2, W2 = H // 2, W // 2 # after pool, divide by 2
        fc_in  = 32 * H2 * W2

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(p=dropout),
            nn.Linear(fc_in, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, n_out) # output layer
        )

    def forward(self, x):
        x = self.features(x)
        return self.classifier(x)

In [126]:
def get_accuracy(model, loader, device="cpu"):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb).argmax(1)
            correct += (pred == yb).sum().item()
            total += yb.size(0)
    return correct / total

# training loop
def train(model, train_loader, val_loader, epochs=200, lr=1e-3, patience=15, weight_decay=1e-4, log_every=10, device="cpu"):
    model.to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    loss_func = nn.CrossEntropyLoss()

    best_val_acc, best_state, bad_epochs = 0.0, None, 0
    for epoch in range(1, epochs + 1):
        model.train()
        running_loss = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            loss = loss_func(model(xb), yb)
            loss.backward()
            opt.step()
            running_loss += loss.item() * yb.size(0)

        train_acc = get_accuracy(model, train_loader, device)
        val_acc = get_accuracy(model, val_loader, device)

        if epoch % log_every == 0 or epoch == epochs:
            avg_loss = running_loss / len(train_loader.dataset)
            print(f"Epoch {epoch:3d}: "
                  f"Loss = {avg_loss:.4f} | "
                  f"Train Acc = {train_acc:.4f} | "
                  f"Val Acc = {val_acc:.4f}")

        if val_acc > best_val_acc + 1e-5:
            best_val_acc, best_state = val_acc, model.state_dict()
            bad_epochs = 0
        else:
            bad_epochs += 1
            if bad_epochs >= patience:
                print("Early stopping triggered.")
                break

    model.load_state_dict(best_state)
    return model, best_val_acc

def test_neural_net(X_train, y_train, X_val, y_val, X_test, y_test, channels, height, width, n_out,
                    dropout, lr, epochs, weight_decay, patience):
    results = []
    fractions = np.linspace(0.1, 1.0, 10)
    for frac in fractions:
        n = int(frac * len(y_train))
        print(f"\nTraining on {n} samples ({frac:.0%})")
        train_loader = make_loader(X_train[:n], y_train[:n])
        val_loader = make_loader(X_val, y_val, shuffle=True)
        test_loader = make_loader(X_test, y_test, shuffle=True)

        # model = MLP(dim_in, h1, h2, n_out, dropout)
        model = ClassificationNetwork((channels, height, width), n_out)
        model, _ = train(model, train_loader, val_loader, lr=lr, epochs=epochs, weight_decay=weight_decay, patience=patience)

        # final test
        model.eval()
        with torch.no_grad():
            test_acc = np.mean([
                accuracy(model(xb), yb) for xb, yb in test_loader
            ])

        print(f"Final Test Accuracy: {test_acc:.4f}")
        results.append((n, test_acc))
    return results

## Digit Classification

In [131]:
X_digit_train_raw, y_digit_train_raw = load_dataset(train_data_file, train_label_file, size=NUM_TRAINING)
X_digit_val_raw, y_digit_val_raw = load_dataset(val_data_file, val_label_file, size=NUM_VALIDATION)
X_digit_test_raw, y_digit_test_raw = load_dataset(test_data_file, test_label_file, size=NUM_TESTING)

Xd_train, yd_train = np.array(X_digit_train_raw), np.array(y_digit_train_raw)
Xd_val, yd_val = np.array(X_digit_val_raw), np.array(y_digit_val_raw)
Xd_test, yd_test = np.array(X_digit_test_raw), np.array(y_digit_test_raw)

digit_results = test_neural_net(
    Xd_train, yd_train, Xd_val, yd_val, Xd_test, yd_test,
    channels=1, height=28, width=28, n_out=10,
    dropout=0.1,
    lr=0.005, epochs=100,
    weight_decay=1e-4, patience=15
)


Training on 100 samples (10%)
Epoch  10: Loss = 0.0349 | Train Acc = 1.0000 | Val Acc = 0.6840
Epoch  20: Loss = 0.0031 | Train Acc = 1.0000 | Val Acc = 0.7300
Epoch  30: Loss = 0.0013 | Train Acc = 1.0000 | Val Acc = 0.7320
Epoch  40: Loss = 0.0011 | Train Acc = 1.0000 | Val Acc = 0.7220
Epoch  50: Loss = 0.0013 | Train Acc = 1.0000 | Val Acc = 0.7220
Epoch  60: Loss = 0.0011 | Train Acc = 1.0000 | Val Acc = 0.7280
Epoch  70: Loss = 0.2841 | Train Acc = 0.6300 | Val Acc = 0.4520
Early stopping triggered.
Final Test Accuracy: 0.6770

Training on 200 samples (20%)
Epoch  10: Loss = 0.0300 | Train Acc = 1.0000 | Val Acc = 0.8020
Epoch  20: Loss = 0.0057 | Train Acc = 1.0000 | Val Acc = 0.7860
Early stopping triggered.
Final Test Accuracy: 0.7324

Training on 300 samples (30%)
Epoch  10: Loss = 0.0265 | Train Acc = 1.0000 | Val Acc = 0.8440
Epoch  20: Loss = 0.0148 | Train Acc = 1.0000 | Val Acc = 0.8420
Epoch  30: Loss = 0.0034 | Train Acc = 1.0000 | Val Acc = 0.8580
Early stopping trig

## Face Classification

In [134]:
X_face_train_raw, y_face_train_raw = load_face_dataset(face_train_data_file, face_train_label_file, size=NUM_FACE_TRAINING)
X_face_val_raw, y_face_val_raw = load_face_dataset(face_val_data_file, face_val_label_file, size=NUM_FACE_VALIDATION)
X_face_test_raw, y_face_test_raw = load_face_dataset(face_test_data_file, face_test_label_file, size=NUM_FACE_TESTING)

Xd_train, yd_train = np.array(X_face_train_raw), np.array(y_face_train_raw)
Xd_val, yd_val = np.array(X_face_val_raw), np.array(y_face_val_raw)
Xd_test, yd_test = np.array(X_face_test_raw), np.array(y_face_test_raw)

digit_results = test_neural_net(
    Xd_train, yd_train, Xd_val, yd_val, Xd_test, yd_test,
    channels=1, height=70, width=60, n_out=2,
    dropout=0.3,
    lr=0.01, epochs=100,
    weight_decay=1e-3, patience=15
)


Training on 45 samples (10%)
Epoch  10: Loss = 0.3688 | Train Acc = 1.0000 | Val Acc = 0.5216
Epoch  20: Loss = 0.0000 | Train Acc = 1.0000 | Val Acc = 0.8904
Early stopping triggered.
Final Test Accuracy: 0.9125

Training on 90 samples (20%)
Epoch  10: Loss = 0.6484 | Train Acc = 0.4889 | Val Acc = 0.5183
Epoch  20: Loss = 0.0010 | Train Acc = 1.0000 | Val Acc = 0.9402
Epoch  30: Loss = 0.0058 | Train Acc = 1.0000 | Val Acc = 0.8970
Early stopping triggered.
Final Test Accuracy: 0.8443

Training on 135 samples (30%)
Epoch  10: Loss = 0.0021 | Train Acc = 1.0000 | Val Acc = 0.8870
Epoch  20: Loss = 0.0029 | Train Acc = 1.0000 | Val Acc = 0.9402
Early stopping triggered.
Final Test Accuracy: 0.9472

Training on 180 samples (40%)
Epoch  10: Loss = 0.0039 | Train Acc = 1.0000 | Val Acc = 0.9302
Epoch  20: Loss = 0.0051 | Train Acc = 1.0000 | Val Acc = 0.9468
Epoch  30: Loss = 0.0138 | Train Acc = 1.0000 | Val Acc = 0.9402
Early stopping triggered.
Final Test Accuracy: 0.6750

Training on