In [61]:
# unzip data
import zipfile
import os

if not os.path.exists("data"):
    with zipfile.ZipFile("data.zip", "r") as zip_ref:
        zip_ref.extractall("data")

In [62]:
# hyperparameters
NUM_EPOCHS = 5
NUM_TRAINING = 1000
NUM_TESTING = 500
NUM_VALIDATION = 500

NUM_FACE_TRAINING = 3000
NUM_FACE_VALIDATION = 800
NUM_FACE_TESTING = 800


IMAGE_HEIGHT = 28
IMAGE_WIDTH = 28
NUM_CLASSES = 10

In [63]:
# filepaths
train_data_file = "data/digitdata/trainingimages"
train_label_file = "data/digitdata/traininglabels"
val_data_file = "data/digitdata/validationimages"
val_label_file = "data/digitdata/validationlabels"
test_data_file = "data/digitdata/testimages"
test_label_file = "data/digitdata/testlabels"



face_train_data_file = "data/facedata/facedatatrain"
face_train_label_file = "data/facedata/facedatatrainlabels"
face_val_data_file   = "data/facedata/facedatavalidation"
face_val_label_file  = "data/facedata/facedatavalidationlabels"
face_test_data_file  = "data/facedata/facedatatest"
face_test_label_file = "data/facedata/facedatatestlabels"


In [64]:

# imports
import numpy as np
import matplotlib.pyplot as plt
import random
import time


## Data Loading and Preprocessing

In [65]:

def read_data_file(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
    return [line.rstrip("\n") for line in lines]

def extract_features(raw_data):
    features = []
    for i in range(0, len(raw_data), 28):
        image = raw_data[i:i+28]
        feature = [1 if ch != ' ' else 0 for row in image for ch in row]
        features.append(feature)
    return features

def read_labels(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
    return [int(line.strip()) for line in lines]

def load_dataset(data_file, label_file, size=None):
    raw_data = read_data_file(data_file)
    raw_labels = read_labels(label_file)

    features = extract_features(raw_data)
    if size is not None:
        combined = list(zip(features, raw_labels))
        random.shuffle(combined)
        features, raw_labels = zip(*combined[:size])

    return list(features), list(raw_labels)

def one_hot_encode(y, num_classes=10):
    encoded = np.zeros((num_classes, len(y)))
    for idx, val in enumerate(y):
        encoded[val][idx] = 1
    return encoded

def evaluate(predictions, labels):
    correct = sum(p == t for p, t in zip(predictions, labels))
    return correct / len(labels)


## Neural Network Functions

In [None]:
import numpy as np

# Activation functions
def relu(z):
    return np.maximum(0, z)

def relu_derivative(z):
    return (z > 0).astype(float)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_derivative(a):
    return a * (1 - a)

# Initialize weights and biases
def initialize_parameters(input_size, hidden1_size, hidden2_size, output_size):
    np.random.seed(42)
    return {
        'W1': np.random.randn(hidden1_size, input_size) * 0.01,
        'b1': np.zeros((hidden1_size, 1)),
        'W2': np.random.randn(hidden2_size, hidden1_size) * 0.01,
        'b2': np.zeros((hidden2_size, 1)),
        'W3': np.random.randn(output_size, hidden2_size) * 0.01,
        'b3': np.zeros((output_size, 1))
    }

# Forward pass
def forward_propagation(X, parameters):
    W1, b1 = parameters['W1'], parameters['b1']
    W2, b2 = parameters['W2'], parameters['b2']
    W3, b3 = parameters['W3'], parameters['b3']

    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)

    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)

    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    cache = (Z1, A1, Z2, A2, Z3, A3)
    return A3, cache

# Loss
def compute_loss(Y_hat, Y):
    m = Y.shape[1]
    return -np.sum(Y * np.log(Y_hat + 1e-8) + (1 - Y) * np.log(1 - Y_hat + 1e-8)) / m

# Backward pass
def backward_propagation(X, Y, parameters, cache):
    m = X.shape[1]
    W2, W3 = parameters['W2'], parameters['W3']
    Z1, A1, Z2, A2, Z3, A3 = cache

    dZ3 = A3 - Y
    dW3 = (1/m) * np.dot(dZ3, A2.T)
    db3 = (1/m) * np.sum(dZ3, axis=1, keepdims=True)

    dA2 = np.dot(W3.T, dZ3)
    dZ2 = dA2 * relu_derivative(Z2)
    dW2 = (1/m) * np.dot(dZ2, A1.T)
    db2 = (1/m) * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = dA1 * relu_derivative(Z1)
    dW1 = (1/m) * np.dot(dZ1, X.T)
    db1 = (1/m) * np.sum(dZ1, axis=1, keepdims=True)

    return {
        'dW1': dW1, 'db1': db1,
        'dW2': dW2, 'db2': db2,
        'dW3': dW3, 'db3': db3
    }

# Gradient descent update
def update_parameters(params, grads, lr):
    for key in params:
        params[key] -= lr * grads['d' + key]
    return params

# Prediction
def predict_nn(X, parameters):
    Y_hat, _ = forward_propagation(X, parameters)
    return np.argmax(Y_hat, axis=0)

# Training loop
def train_neural_net(X_train, y_train, X_test, y_test,
                     input_size, h1, h2, output_size,
                     epochs=1000, lr=0.1, print_loss=True,
                     X_val=None, y_val=None, early_stopping=False, patience=10):
    
    parameters = initialize_parameters(input_size, h1, h2, output_size)
    best_params = None
    best_val_acc = 0
    val_acc_counter = 0

    for epoch in range(epochs):
        # Forward and backpropagation
        Y_hat, cache = forward_propagation(X_train, parameters)
        loss = compute_loss(Y_hat, y_train)
        grads = backward_propagation(X_train, y_train, parameters, cache)
        parameters = update_parameters(parameters, grads, lr)

        # Check performance every 100 epochs
        if epoch % 100 == 0 or epoch == epochs - 1:
            train_preds = predict_nn(X_train, parameters)
            train_acc = evaluate(train_preds, np.argmax(y_train, axis=0))
            
            if X_val is not None and y_val is not None:
                val_preds = predict_nn(X_val, parameters)
                val_acc = evaluate(val_preds, np.argmax(y_val, axis=0))

                if print_loss:
                    print(f"Epoch {epoch}: Loss = {loss:.4f} | Train Acc = {train_acc:.4f} | Val Acc = {val_acc:.4f}")

                # Save best model
                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    best_params = {k: v.copy() for k, v in parameters.items()}
                    val_acc_counter = 0
                else:
                    val_acc_counter += 1
                    if early_stopping and val_acc_counter >= patience:
                        print("Early stopping triggered.")
                        break
            else:
                if print_loss:
                    print(f"Epoch {epoch}: Loss = {loss:.4f} | Train Acc = {train_acc:.4f}")

    final_params = best_params if best_params is not None else parameters

    # Final test evaluation
    test_preds = predict_nn(X_test, final_params)
    test_acc = evaluate(test_preds, np.argmax(y_test, axis=0))
    print(f"Final Test Accuracy: {test_acc:.4f}")
    return final_params


In [None]:
print("Testing neural net on digit data")

X_train_raw, y_train_raw = load_dataset(train_data_file, train_label_file, size=NUM_TRAINING)
X_val_raw, y_val_raw     = load_dataset(val_data_file, val_label_file, size=NUM_VALIDATION)
X_test_raw, y_test_raw   = load_dataset(test_data_file, test_label_file, size=NUM_TESTING)

X_train = np.array(X_train_raw).T
X_val   = np.array(X_val_raw).T
X_test  = np.array(X_test_raw).T

y_train = one_hot_encode(y_train_raw)
y_val   = one_hot_encode(y_val_raw)
y_test  = one_hot_encode(y_test_raw)

# Train on increasing percentages of DIGIT data 
percentages = [0.1 * i for i in range(1, 11)]  # 10% to 100%
total_digit_samples = X_train.shape[1]

digit_results = []

for pct in percentages:
    n = int(pct * total_digit_samples)
    X_subset = X_train[:, :n]
    y_subset = y_train[:, :n]

    print(f"\n DIGITS: Training on {n} samples ({int(pct * 100)}%)")
    
    trained_params = train_neural_net(
        X_subset, y_subset,
        X_test, y_test,
        input_size=784, h1=128, h2=64, output_size=10,
        epochs=1000, lr=0.1,
        X_val=X_val, y_val=y_val,
        early_stopping=True, patience=10
    )

    test_preds = predict_nn(X_test, trained_params)
    test_acc = evaluate(test_preds, np.argmax(y_test, axis=0))
    digit_results.append((n, test_acc))
    print(f"DIGITS Test Accuracy with {n} samples: {test_acc:.4f}")

Testing neural net on digit data

 DIGITS: Training on 100 samples (10%)
Epoch 0: Loss = 6.9313 | Train Acc = 0.1300 | Val Acc = 0.1180
Epoch 100: Loss = 3.2307 | Train Acc = 0.1300 | Val Acc = 0.1280
Epoch 200: Loss = 3.1547 | Train Acc = 0.2200 | Val Acc = 0.2280
Epoch 300: Loss = 2.2593 | Train Acc = 0.5700 | Val Acc = 0.3740
Epoch 400: Loss = 0.6183 | Train Acc = 0.9600 | Val Acc = 0.6280
Epoch 500: Loss = 0.0800 | Train Acc = 1.0000 | Val Acc = 0.6220
Epoch 600: Loss = 0.0308 | Train Acc = 1.0000 | Val Acc = 0.6180
Epoch 700: Loss = 0.0174 | Train Acc = 1.0000 | Val Acc = 0.6180
Epoch 800: Loss = 0.0116 | Train Acc = 1.0000 | Val Acc = 0.6180
Epoch 900: Loss = 0.0085 | Train Acc = 1.0000 | Val Acc = 0.6180
Epoch 999: Loss = 0.0067 | Train Acc = 1.0000 | Val Acc = 0.6180
Final Test Accuracy: 0.5980
DIGITS Test Accuracy with 100 samples: 0.5980

 DIGITS: Training on 200 samples (20%)
Epoch 0: Loss = 6.9313 | Train Acc = 0.1350 | Val Acc = 0.1180
Epoch 100: Loss = 3.2361 | Train Acc 

In [None]:
print("Testing neural net on face data")

X_face_train_raw, y_face_train_raw = load_dataset(face_train_data_file, face_train_label_file, size=NUM_FACE_TRAINING)
X_face_test_raw, y_face_test_raw   = load_dataset(face_test_data_file, face_test_label_file, size=NUM_FACE_TESTING)
X_face_val_raw, y_face_val_raw   = load_dataset(face_test_data_file, face_test_label_file, size=NUM_FACE_VALIDATION)

X_face_train = np.array(X_face_train_raw).T
X_face_test = np.array(X_face_test_raw).T
X_face_val = np.array(X_face_val_raw).T
y_face_train = one_hot_encode(y_face_train_raw, num_classes=2)
y_face_test = one_hot_encode(y_face_test_raw, num_classes=2)
y_face_val = one_hot_encode(y_face_val_raw, num_classes=2)

# Train on increasing percentages of FACE data 
percentages = [0.1 * i for i in range(1, 11)]  # 10% to 100%
total_face_samples = X_face_train.shape[1]

face_results = []

for pct in percentages:
    n = int(pct * total_face_samples)
    X_face_subset = X_face_train[:, :n]
    y_face_subset = y_face_train[:, :n]

    print(f"\n FACES: Training on {n} samples ({int(pct * 100)}%)")

    trained_face_params = train_neural_net( # ??? what should this be, still overfitting ???
        X_face_subset, y_face_subset,
        X_face_test, y_face_test,
        input_size=1680, 
        h1=64, h2=32, output_size=2,
        epochs=1000, lr=0.1,
        X_val=X_face_val, y_val=y_face_val,
        early_stopping=True, patience=10
    )

    test_preds = predict_nn(X_face_test, trained_face_params)
    test_acc = evaluate(test_preds, np.argmax(y_face_test, axis=0))
    face_results.append((n, test_acc))
    print(f"FACES Test Accuracy with {n} samples: {test_acc:.4f}")


Testing neural net on face data

 FACES: Training on 45 samples (10%)
Epoch 0: Loss = 1.3863 | Train Acc = 0.5111 | Val Acc = 0.5133
Epoch 100: Loss = 1.3851 | Train Acc = 0.5111 | Val Acc = 0.5133
Epoch 200: Loss = 1.0757 | Train Acc = 1.0000 | Val Acc = 0.4933
Epoch 300: Loss = 0.0199 | Train Acc = 1.0000 | Val Acc = 0.5067
Epoch 400: Loss = 0.0109 | Train Acc = 1.0000 | Val Acc = 0.4867
Epoch 500: Loss = 0.0076 | Train Acc = 1.0000 | Val Acc = 0.4933
Epoch 600: Loss = 0.0056 | Train Acc = 1.0000 | Val Acc = 0.5000
Epoch 700: Loss = 0.0044 | Train Acc = 1.0000 | Val Acc = 0.5000
Epoch 800: Loss = 0.0035 | Train Acc = 1.0000 | Val Acc = 0.5067
Epoch 900: Loss = 0.0028 | Train Acc = 1.0000 | Val Acc = 0.5067
Epoch 999: Loss = 0.0024 | Train Acc = 1.0000 | Val Acc = 0.4933
Early stopping triggered.
Final Test Accuracy: 0.5133
FACES Test Accuracy with 45 samples: 0.5133

 FACES: Training on 90 samples (20%)
Epoch 0: Loss = 1.3863 | Train Acc = 0.5333 | Val Acc = 0.4867
Epoch 100: Loss = 