In [640]:
# Imports and Consts
SEED = 42
import numpy as np
from util import get_data

In [641]:

# Read data and construct data sets
NAME = "uciml/iris"
data = get_data("uciml/iris")

In [642]:
data = data[1:]
xs = data[:, 1:-1]
xs = xs.astype(float)
ys = data[:, -1]
stoi = {y: i for i, y in enumerate(list(set(ys)))}
for i in range(ys.size):
    ys[i] = stoi[ys[i]]
ys = ys.astype(int)
itos = {stoi[y]: y for y in stoi}

In [643]:
# Shuffle
np.random.seed(SEED)
idxs = np.random.permutation(len(xs))

xs = xs[idxs]
ys = ys[idxs]

# Split Data
x_train = xs[:120]
y_train = ys[:120]
x_test = xs[120:]
y_test = ys[120:]

# Normalizing data
train_mean = np.mean(x_train, axis=0)
train_std = np.std(x_train, axis=0) + 1e-12

x_train = (x_train - train_mean) / train_std
x_test = (x_test - train_mean) / train_std

# Extra column for bias term
x_train = np.hstack([x_train, np.ones((len(x_train), 1)) ])
x_test = np.hstack([ x_test, np.ones((len(x_test), 1)) ])

In [644]:
# Full Probability Matrix
def Pr(X, theta):
    """ 
    X: (n, d)
    theta: (d, c)
    """
    logits = X @ theta
    logits = np.hstack([logits, np.zeros((X.shape[0], 1))]) # Add Last class
    logits -= logits.max(axis=1, keepdims=True)
    # logits = np.clip(logits, -500, 500)
    exp = np.exp(logits)
    row_sums = np.sum(exp, axis=1, keepdims=True)
    return exp / row_sums

# One Hot Encoding
def one_hot(Y, d):
    """
    Y: (n) -> (n, d) 
    """
    Y_oh = np.zeros(shape=(len(Y), d), dtype=np.int64)
    idxs = np.arange(start=0, stop=len(Y), step=1)
    Y_oh[idxs, Y] = 1
    return Y_oh 

def jacobian(X, y_onehot, P):
    total = X.T @ (y_onehot - P)
    return total[:, :-1]

def cross_entropy(P, Y_oh):
    return -np.mean(np.sum(Y_oh * np.log(P + 1e-12), axis=1))


In [645]:
# Conjugate Gradient Helper
def H_dot(X, P, V, _lambda):
    """
    X: (n, d)
    P: (n, c) --> doesn't contain probability of last class
    V: (d * c)
    """
    d = X.shape[1]
    c = P.shape[1]
    V = np.reshape(V, shape=(d, c))

    Z = X @ V
    A = P * Z
    s = A.sum(axis=1, keepdims=True)
    ZW = A - P * s
    Hv = X.T @ ZW 
    Hv += _lambda * V
    return Hv.flatten()

def conj_grad(X, P, g, _lambda, tol=1e-6):
    d = X.shape[1]
    c = P.shape[1]
    res = g
    p = g
    k = 0
    x = np.zeros(g.shape)
    while k < g.shape[0]:
        Ap =  H_dot(X, P, p, _lambda)
        rr = res @ res
        alpha = (res @ res) / (p @ Ap + 1e-12)
        x += alpha * p
        res -= alpha * Ap

        if np.linalg.norm(res) < tol:
            break
        beta = (res @ res) / (rr + 1e-12)
        p = res + beta * p
        k += 1
    return np.reshape(x, shape=(d, c))

In [646]:
def train(X, Y, epochs=200, _lambda=1e-4, _alpha=0.5):
    C = np.unique(Y).size
    d = X.shape[1]
    y_onehot = one_hot(Y, C)
    thetas = np.random.randn(X.shape[1], C-1)

    # To Shuffle Batches:
    gen = np.random.default_rng(SEED)

    for epoch in range(epochs):
        if epoch % 25 == 0:
            _alpha /= 2
        idx = gen.permutation(len(X))
        X_epoch = X[idx]
        Y_epoch = y_onehot[idx]

        P = Pr(X_epoch, thetas)
        J = jacobian(X_epoch, Y_epoch, P)
        J += thetas * _lambda
        J = np.reshape(J, shape=(d * (C-1)))

        if (epoch + 1) % 25 == 0:
            loss = cross_entropy(P, Y_epoch)
            print(f"Epoch {epoch + 1}, loss: {loss:.4f}")

        P = P[:, :-1]
        thetas += _alpha * conj_grad(X_epoch, P, J, _lambda)
    return thetas
thetas = train(x_train, y_train)

Epoch 25, loss: 0.0551
Epoch 50, loss: 0.0475
Epoch 75, loss: 0.0475
Epoch 100, loss: 0.0475
Epoch 125, loss: 0.0475
Epoch 150, loss: 0.0475
Epoch 175, loss: 0.0475
Epoch 200, loss: 0.0475


In [647]:
# Predictions
def predict(xs, thetas):
    probs = Pr(xs, thetas)
    preds = np.argmax(probs, axis=1)
    return preds

def acc(xs, ys, thetas):
    preds = predict(xs, thetas)
    preds = preds.astype(int)
    return np.mean(preds == ys)

print(acc(x_test, y_test, thetas))

1.0


In [648]:
# Load from local file
X_train = np.loadtxt("data/PenData/pendigits.tra", delimiter=",")
X_test = np.loadtxt("data/PenData/pendigits.tes", delimiter=",")

# Split features and labels
x_train, y_train = X_train[:, :-1], X_train[:, -1].astype(int)
x_test, y_test = X_test[:, :-1], X_test[:, -1].astype(int)

In [649]:
# Normalizing data
train_mean = np.mean(x_train, axis=0)
train_std = np.std(x_train, axis=0) + 1e-12

x_train = (x_train - train_mean) / train_std
x_test = (x_test - train_mean) / train_std

# Extra column for bias term
x_train = np.hstack([x_train, np.ones((len(x_train), 1)) ])
x_test = np.hstack([ x_test, np.ones((len(x_test), 1)) ])

In [650]:
thetas = train(x_train, y_train, _lambda=1e-4)
print(acc(x_test, y_test, thetas))

Epoch 25, loss: 3.0197
Epoch 50, loss: 3.0087
Epoch 75, loss: 3.0087
Epoch 100, loss: 3.0087
Epoch 125, loss: 3.0087
Epoch 150, loss: 3.0087
Epoch 175, loss: 3.0087
Epoch 200, loss: 3.0087
0.8622069754145226
