In [29]:
# Imports and Consts
SEED = 42
import numpy as np
from util import get_data

In [30]:

# Read data and construct data sets
NAME = "uciml/iris"
data = get_data("uciml/iris")

In [54]:
data = data[1:]
xs = data[:, 1:-1]
xs = xs.astype(float)
ys = data[:, -1]
stoi = {y: i for i, y in enumerate(list(set(ys)))}
for i in range(ys.size):
    ys[i] = stoi[ys[i]]
ys = ys.astype(int)
itos = {stoi[y]: y for y in stoi}

In [55]:
# Shuffle
np.random.seed(SEED)
idxs = np.random.permutation(len(xs))

xs = xs[idxs]
ys = ys[idxs]

# Split Data
x_train = xs[:120]
y_train = ys[:120]
x_test = xs[120:]
y_test = ys[120:]

# Normalizing data
train_mean = np.mean(x_train, axis=0)
train_std = np.std(x_train, axis=0) + 1e-12

x_train = (x_train - train_mean) / train_std
x_test = (x_test - train_mean) / train_std

# Extra column for bias term
x_train = np.hstack([x_train, np.ones((len(x_train), 1)) ])
x_test = np.hstack([ x_test, np.ones((len(x_test), 1)) ])

In [57]:
# Full Probability Matrix
def Pr(X, theta):
    """ 
    X: (n, d)
    theta: (d, c)
    """
    logits = X @ theta
    logits = np.hstack([logits, np.zeros((X.shape[0], 1))]) # Add Last class
    logits -= logits.max(axis=1, keepdims=True)
    # logits = np.clip(logits, -500, 500)
    exp = np.exp(logits)
    row_sums = np.sum(exp, axis=1, keepdims=True)
    return exp / row_sums

# One Hot Encoding
def one_hot(Y, d=3):
    """
    Y: (n) -> (n, d) 
    """
    Y_oh = np.zeros(shape=(len(Y), d), dtype=np.int64)
    idxs = np.arange(start=0, stop=len(Y), step=1)
    Y_oh[idxs, Y] = 1
    return Y_oh 

def jacobian(X, y_onehot, P):
    total = X.T @ (y_onehot - P)
    return total[:, :-1]

theta = np.random.randn(5, 2)
P = Pr(x_train, theta)
y_onehot = one_hot(y_train)
J = jacobian(x_train, y_onehot, P)


In [58]:
# Conjugate Gradient Helper
def H_dot(X, P, V):
    """
    X: (n, d)
    P: (n, c) --> doesn't contain probability of last class
    V: (d * c)
    """
    d = X.shape[1]
    c = P.shape[1]
    V = np.reshape(V, shape=(d, c))

    Z = X @ V
    W = P * (1 - P)
    ZW = P * Z
    Hv = X.T @ ZW
    return np.reshape(Hv, d * c)

def conj_grad(X, P, g, tol=1e-1):
    d = X.shape[1]
    c = P.shape[1]
    res = g
    p = g
    k = 0
    x = np.zeros(g.shape)
    while k < g.shape[0]:
        Ap =  H_dot(X, P, p)
        rr = res @ res
        alpha = (res @ res) / (p @ Ap + 1e-12)
        x += alpha * p
        res -= alpha * Ap

        if np.linalg.norm(res) < tol:
            break
        beta = (res @ res) / (rr + 1e-12)
        p = res + beta * p
        k += 1
    return np.reshape(x, shape=(d, c))

In [60]:
def train(X, Y, epochs=10):
    C = np.unique(Y).size
    d = X.shape[1]
    y_onehot = one_hot(Y) # Remove last class
    thetas = np.random.randn(X.shape[1], C-1)

    # To Shuffle Batches:
    gen = np.random.default_rng(SEED)

    for epoch in range(epochs):
        idx = gen.permutation(len(X))
        X_epoch = X[idx]
        Y_epoch = y_onehot[idx]

        P = Pr(X_epoch, thetas)
        J = jacobian(X_epoch, Y_epoch, P)
        J = np.reshape(J, shape=(d * (C-1)))
        P = P[:, :-1]

        thetas += conj_grad(X_epoch, P, J)
        print(f"Epoch: {epoch + 1}/{epochs} done.")
    return thetas
train(x_train, y_train)

Epoch: 1/10 done.
Epoch: 2/10 done.
Epoch: 3/10 done.
Epoch: 4/10 done.
Epoch: 5/10 done.
Epoch: 6/10 done.
Epoch: 7/10 done.
Epoch: 8/10 done.
Epoch: 9/10 done.
Epoch: 10/10 done.


array([[ 1.64165751,  2.26464457],
       [-1.54891715, -1.4441551 ],
       [ 4.38765728,  1.24717958],
       [ 3.00097108, -0.4692936 ],
       [-2.62668675,  1.64162948]])

In [None]:
# Predictions
