In [38]:
# Imports and Consts
SEED = 42
import struct as st
import numpy as np

In [39]:
# Helper function to parse IDX files
def parse_idx(file_path):
    with open(file_path, 'rb') as file:
        magic = st.unpack('>I', file.read(4))[0]  # Magic number (4 bytes)
        num_items = st.unpack('>I', file.read(4))[0]  # Number of items (4 bytes)

        if magic == 2051:  # Magic number for images
            num_rows = st.unpack('>I', file.read(4))[0]
            num_cols = st.unpack('>I', file.read(4))[0]
            num_bytes = num_items * num_rows * num_cols
            data = np.frombuffer(file.read(num_bytes), dtype=np.uint8)
            return data.reshape(num_items, num_rows, num_cols)
        elif magic == 2049:  # Magic number for labels
            data = np.frombuffer(file.read(num_items), dtype=np.uint8)
            return data
        else:
            raise ValueError(f"Unknown magic number: {magic}")

# Parse the training data
x_train = parse_idx('DigitData/train-images.idx3-ubyte')
y_train = parse_idx('DigitData/train-labels.idx1-ubyte')

x_test = parse_idx('DigitData/t10k-images.idx3-ubyte')
y_test = parse_idx('DigitData/t10k-labels.idx1-ubyte')

# Reshape and scale down
x_train = x_train.reshape(x_train.shape[0], -1) / 255.0
x_test = x_test.reshape(x_test.shape[0], -1) / 255.0

In [40]:
# Normalize
train_mean = np.mean(x_train, axis=0)
train_std = np.std(x_train, axis=0) + 1e-12

x_train = (x_train - train_mean) / train_std
x_test = (x_test - train_mean) / train_std

# Add extra ones column
x_train = np.hstack([x_train, np.ones((len(x_train), 1)) ])
x_test = np.hstack([ x_test, np.ones((len(x_test), 1)) ])

In [None]:
# Helpers

def Pr(arr: np.ndarray, param: np.ndarray) -> np.ndarray:
    z = arr @ param
    z_clipped = np.clip(z, -500, 500)
    return 1.0 / (1.0 + np.exp(-z_clipped))

def construct_w(p: np.ndarray) -> np.ndarray:
    return (1 - p) * p

def compute_cholesky(X, w, lam=1e-2):
    H = X.T @ (X * w[:, None]) + lam * np.eye(X.shape[1])

    return np.linalg.cholesky(H)

def compute_delta(X: np.ndarray, L: np.ndarray, Y: np.ndarray, p: np.ndarray, lam=1e-2) -> np.ndarray:
    B = X.T @ (Y-p)
    
    # Solve first only with L
    temp = np.linalg.solve(L, B)

    # Return final solution by solving for temp with L.T
    return np.linalg.solve(L.T, temp)

In [54]:
def train(X, Y, epochs=10, batch_size=1000):
    thetas = np.random.randn(10, X.shape[1])

    # To Shuffle Batches:
    gen = np.random.default_rng(SEED)

    # Build accumulated gradient
    for epoch in range(epochs):
        idx = gen.permutation(len(X))
        X_epoch = X[idx]
        Y_epoch = Y[idx]
        for digit in range(len(thetas)):
            theta = thetas[digit]
            p_full = Pr(X_epoch, thetas[digit])
            w_full = construct_w(p_full)
            L = compute_cholesky(X_epoch, w_full)
            for start in range(0, len(X), batch_size):
                end = start + batch_size
                X_batch = X_epoch[start:end]
                Y_batch = (Y_epoch[start:end] == digit).astype(float)
                
                p = Pr(X_batch, theta)
                theta += compute_delta(X_batch, L, Y_batch, p)
            thetas[digit] = theta
        print(f"Epoch: {epoch + 1}/{epochs} done.")
    return thetas

thetas = train(x_train, y_train)

Epoch: 1/10 done.
Epoch: 2/10 done.
Epoch: 3/10 done.
Epoch: 4/10 done.
Epoch: 5/10 done.
Epoch: 6/10 done.
Epoch: 7/10 done.
Epoch: 8/10 done.
Epoch: 9/10 done.
Epoch: 10/10 done.


In [55]:
# Make predictions

def predict(X, thetas):
    probs = np.array([Pr(X, theta) for theta in thetas])
    return np.argmax(probs, axis=0)

def accuracy(X, Y, thetas):
    preds = predict(X, thetas)
    return np.mean(preds==Y)

print(accuracy(x_test, y_test, thetas))


0.8452
