In [1]:
# Imports and Consts
SEED = 42
import struct as st
import numpy as np

In [None]:
# Helper function to parse IDX files
def parse_idx(file_path):
    """Read an IDX image or label file into a NumPy array.

    The file begins with two big-endian unsigned 32-bit integers: a magic
    number and the item count. Image files (magic 2051) continue with the row
    and column counts followed by one unsigned byte per pixel; label files
    (magic 2049) continue with one unsigned byte per label.

    Parameters
    ----------
    file_path : str or path-like
        Location of the IDX file to read.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``(num_items, num_rows, num_cols)`` for image
        files, or ``(num_items,)`` for label files.

    Raises
    ------
    ValueError
        If the magic number is neither 2049 nor 2051, or if the file holds
        fewer data bytes than its header declares.
    """
    with open(file_path, 'rb') as file:
        # Magic number and item count: two 4-byte big-endian unsigned ints.
        magic, num_items = st.unpack('>II', file.read(8))

        if magic == 2051:  # Magic number for images
            num_rows, num_cols = st.unpack('>II', file.read(8))
            shape = (num_items, num_rows, num_cols)
            num_bytes = num_items * num_rows * num_cols
        elif magic == 2049:  # Magic number for labels
            shape = (num_items,)
            num_bytes = num_items
        else:
            raise ValueError(f"Unknown magic number: {magic}")

        payload = file.read(num_bytes)

    # read() returns whatever is left when the file is short, so a truncated
    # label file would otherwise yield a silently shortened array.
    if len(payload) != num_bytes:
        raise ValueError(
            f"{file_path}: header declares {num_bytes} data bytes "
            f"but only {len(payload)} are present"
        )

    return np.frombuffer(payload, dtype=np.uint8).reshape(shape)

# Load the image and label arrays for the training and test splits
x_train, y_train = (
    parse_idx('DigitData/train-images.idx3-ubyte'),
    parse_idx('DigitData/train-labels.idx1-ubyte'),
)
x_test, y_test = (
    parse_idx('DigitData/t10k-images.idx3-ubyte'),
    parse_idx('DigitData/t10k-labels.idx1-ubyte'),
)

# Flatten every image into one row and divide the byte values by 255
x_train = x_train.reshape(len(x_train), -1) / 255.0
x_test = x_test.reshape(len(x_test), -1) / 255.0

In [None]:
# Standardise each feature column using statistics from the training split only
train_mean = x_train.mean(axis=0)
train_std = x_train.std(axis=0) + 1e-12  # small offset avoids dividing by zero on constant columns

x_train, x_test = [(split - train_mean) / train_std for split in (x_train, x_test)]

# Append a column of ones to each split (presumably the intercept feature).
# NOTE(review): x_train and x_test are rebound in this cell, so running it a
# second time appends another column; re-run the loading cell first.
x_train = np.concatenate([x_train, np.ones((x_train.shape[0], 1))], axis=1)
x_test = np.concatenate([x_test, np.ones((x_test.shape[0], 1))], axis=1)

(60000, 787)


In [None]:
# Full Probability Matrix
def Pr(X, theta):
    """
    Softmax class probabilities, with the last class used as the reference.

    X: (n, d)
    theta: (d, c)
    Returns: (n, c + 1). Columns 0..c-1 come from the logits X @ theta; the
    final column is the extra class whose logit is fixed at 0. Every row
    sums to 1.
    """
    logits = X @ theta
    logits = np.hstack([logits, np.zeros((X.shape[0], 1))])  # Add last class
    # Subtracting the row maximum leaves the softmax unchanged but keeps
    # exp() from overflowing on large logits.
    logits -= logits.max(axis=1, keepdims=True)
    exp = np.exp(logits)
    return exp / exp.sum(axis=1, keepdims=True)

# One Hot Encoding
def one_hot(Y, d=10):
    """
    Turn integer class labels into indicator rows.

    Y: (n) -> (n, d), dtype int64; row i holds a single 1 in column Y[i]
    and zeros elsewhere.
    """
    n = len(Y)
    encoded = np.zeros((n, d), dtype=np.int64)
    encoded[np.arange(n), Y] = 1
    return encoded

def jacobian(X, y_onehot, P):
    """
    Compute X.T @ (y_onehot - P) and drop its last column.

    X: (n, d)
    y_onehot, P: (n, k) with matching shapes
    Returns: (d, k - 1)

    NOTE(review): presumably the gradient of the log-likelihood with respect
    to theta, the dropped column being the reference class -- confirm the
    sign convention against the optimiser that consumes it.
    """
    residual = y_onehot - P
    return (X.T @ residual)[:, :-1]

y_onehot = one_hot(y_train)
# Size theta from the data instead of hard-coding the feature count, so it
# always matches x_train; the last class is the reference and gets no column.
# The draw is seeded with SEED so a fresh run reproduces the same start point.
theta = np.random.default_rng(SEED).standard_normal(
    (x_train.shape[1], y_onehot.shape[1] - 1)
)
P = Pr(x_train, theta)
J = jacobian(x_train, y_onehot, P)


(60000, 10)
(60000, 787) (60000, 10) (60000, 10)


In [55]:
# Conjugate Gradient Helper
def H_dot(X, P, V):
    """
    Hessian-vector product for the softmax model, without forming the Hessian.

    X: (n, d)
    P: (n, c) --> doesn't contain probability of last class
    V: (d * c), the row-major flattening of a (d, c) array (same layout as theta)

    Returns: (d * c), the row-major flattening of X.T @ R, where row i of R is
    (diag(p_i) - p_i p_i^T) applied to row i of X @ V.
    """
    d = X.shape[1]
    c = P.shape[1]
    # ndarray.reshape takes the target shape positionally, not as `shape=`.
    V = np.asarray(V).reshape(d, c)

    Z = X @ V                          # (n, c): directional change of the logits
    A = P * Z                          # diag(p_i) z_i, row by row
    s = A.sum(axis=1, keepdims=True)   # p_i . z_i for every row
    Z_next = A - P * s                 # (diag(p_i) - p_i p_i^T) z_i
    Hv = X.T @ Z_next                  # (d, c)
    return Hv.reshape(d * c)

def conj_grad(X, P, theta, g):
    """Placeholder for the conjugate-gradient routine; nothing is implemented yet.

    All four arguments are accepted but ignored, and the call returns ``None``.
    """
    return None