In [None]:
# Imports and Consts
SEED = 42 # For deterministic testing
import numpy as np
from src.Binary.util import get_data

In [126]:
# Load the raw table; its first row holds the column names.
data = get_data()

# Columns in which a stored 0 is treated as a missing measurement.
cols_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
header = list(data[0])
cols = [idx for idx, name in enumerate(header) if name in cols_missing]

# Drop the header row and convert the remaining rows to floats.
data = data[1:].astype(float)

# Every column but the last is a feature; the last column is the label.
xs, ys = data[:, :-1], data[:, -1]

# Median imputation: zeros are replaced by the median of the column's
# strictly positive entries.
# NOTE(review): the median is taken over all rows, i.e. before the
# train/dev/test split performed later -- confirm this is intended.
for i in cols:
    column = xs[:, i]  # basic slice, so writes below land in xs
    m = np.median(column[column > 0])
    column[column == 0] = m

In [127]:
# Shuffle
# Seed from the notebook-level SEED constant so the seed value lives in one place.
np.random.seed(SEED)
idxs = np.random.permutation(len(xs))

# NOTE: xs / ys are overwritten, so re-running only this cell permutes the
# already shuffled arrays again and yields a different split. Re-run from the
# data-loading cell to get the same split.
xs = xs[idxs]
ys = ys[idxs]

# Split Data
# Row boundaries of the train / dev / test partitions.
TRAIN_END = 614
DEV_END = 691

x_train = xs[:TRAIN_END]
y_train = ys[:TRAIN_END]
x_dev = xs[TRAIN_END:DEV_END]
y_dev = ys[TRAIN_END:DEV_END]
x_test = xs[DEV_END:]
y_test = ys[DEV_END:]

# Normalizing data
# Statistics come from the training rows only and are reused for dev / test.
train_mean = np.mean(x_train, axis=0)
train_std = np.std(x_train, axis=0) + 1e-12  # epsilon guards against a zero std

x_train = (x_train - train_mean) / train_std
x_test = (x_test - train_mean) / train_std
x_dev = (x_dev - train_mean) / train_std


# Extra column for bias term
x_train = np.hstack([ x_train, np.ones((len(x_train), 1)) ])
x_dev = np.hstack([ x_dev, np.ones((len(x_dev), 1)) ])
x_test = np.hstack([ x_test, np.ones((len(x_test), 1)) ])


In [128]:
# Helpers

def Pr(arr: np.ndarray, param: np.ndarray) -> np.ndarray:
    """Return the logistic sigmoid of ``arr @ param``.

    Args:
        arr: Left operand of the matrix product (the design matrix).
        param: Right operand of the matrix product (the parameter vector).

    Returns:
        ``1 / (1 + exp(-z))`` evaluated element-wise, where ``z`` is
        ``arr @ param`` limited to the range [-500, 500].
    """
    # Limiting the logits keeps np.exp from overflowing.
    logits = np.clip(arr @ param, -500, +500)
    return 1.0 / (1.0 + np.exp(-logits))

def construct_w(p: np.ndarray) -> np.ndarray:
    """Build the square matrix whose diagonal is ``p * (1 - p)``.

    Args:
        p: 1-D array of probabilities.

    Returns:
        A ``len(p) x len(p)`` array that is zero off the diagonal.
    """
    variances = p * (1 - p)
    return np.diag(variances)

def compute_delta(X: np.ndarray, diag: np.ndarray, Y: np.ndarray, p: np.ndarray) -> np.ndarray:
    """Solve ``H @ delta = X.T @ (Y - p)`` for the Newton step ``delta``.

    ``H`` is ``X.T @ diag @ X`` plus ``1e-2`` times the identity.

    Args:
        X: Design matrix.
        diag: Square weight matrix placed between ``X.T`` and ``X``.
        Y: Target vector.
        p: Predicted probabilities, subtracted from ``Y``.

    Returns:
        The solution vector ``delta``.

    Raises:
        numpy.linalg.LinAlgError: If the Cholesky factorisation of ``H`` fails.
    """
    n_params = X.shape[1]

    # The 1e-2 multiple of the identity is added to the matrix only; the
    # right-hand side carries no matching term.
    hessian = X.T @ diag @ X + 1e-2 * np.eye(n_params)
    score = X.T @ (Y - p)

    # Factor H = L @ L.T, then solve L @ y = score followed by L.T @ delta = y.
    # np.linalg.solve is a general solver, so the triangular structure of the
    # factor is not exploited here.
    lower = np.linalg.cholesky(hessian)
    forward = np.linalg.solve(lower, score)
    return np.linalg.solve(lower.T, forward)

def loglik(p, y, tol=1e-15):
    """Return the Bernoulli log-likelihood of labels ``y`` under probabilities ``p``.

    Args:
        p: Predicted probabilities of the positive class.
        y: Labels (0 or 1).
        tol: Small constant added inside each logarithm so that ``log(0)`` is
            never evaluated.

    Returns:
        The sum over all entries of ``y*log(p + tol) + (1 - y)*log(1 - p + tol)``.
    """
    positive_part = y * np.log(p + tol)
    negative_part = (1 - y) * np.log(1 - p + tol)
    return np.sum(positive_part + negative_part)

def scale_const(X, Y, p_old, theta, delta, alpha):
    """Halve a step size until the step no longer lowers the log-likelihood.

    Args:
        X: Design matrix passed to ``Pr``.
        Y: Labels passed to ``loglik``.
        p_old: Probabilities at the current parameters; they define the
            log-likelihood the trial step has to match or beat.
        theta: Current parameter vector.
        delta: Proposed update direction.
        alpha: Step size to try first.

    Returns:
        The first step size in the sequence ``alpha, alpha/2, alpha/4, ...``
        for which the log-likelihood at ``theta + step * delta`` is at least
        the log-likelihood of ``p_old``. Returns ``None`` when a rejected
        step size is already below ``1e-8``.
    """
    baseline = loglik(p_old, Y)
    step = alpha

    # "not (... >= ...)" rather than "<" so a NaN log-likelihood is rejected.
    while not (loglik(Pr(X, theta + step * delta), Y) >= baseline):
        if step < 1e-8:
            return None
        step *= 0.5

    return step

In [129]:
# Model Training

def newton_method(X, Y, iters=1000, step_tol=1e-10, seed=42):
    """Fit logistic-regression parameters with a damped Newton method.

    Each iteration computes a Newton direction with ``compute_delta`` and
    scales it with the step size returned by ``scale_const``.

    Args:
        X: Design matrix; one parameter is fitted per column.
        Y: Label vector.
        iters: Maximum number of Newton iterations.
        step_tol: Stop once the Euclidean norm of the Newton direction is at
            most this value.
        seed: Seed for the random initial parameter vector. Note that this
            re-seeds NumPy's global random state.

    Returns:
        The fitted parameter vector, of length ``X.shape[1]``.
    """
    # Initialize Theta
    np.random.seed(seed)
    theta = np.random.randn(X.shape[1])

    for _ in range(iters):
        # Calc Values
        p = Pr(X, theta)
        W = construct_w(p)
        delta = compute_delta(X, W, Y, p)

        # Converged: the proposed update is negligible, so further iterations
        # would only rebuild and re-solve the same system.
        if np.linalg.norm(delta) <= step_tol:
            break

        # Prevent exploding step. The search starts from a full step every
        # iteration; carrying a shrunken step over would shorten every later
        # update even where a full Newton step is acceptable.
        alpha = scale_const(X, Y, p, theta, delta, 1.0)

        # No step size down to the search floor kept the log-likelihood from
        # dropping, so stop here.
        if alpha is None:
            break

        # Apply Newton method
        theta += alpha * delta
    return theta


In [130]:
# Predictor

def log_regression(X, theta, tol=0.5):
    """Predict hard 0/1 labels from logistic-regression parameters.

    Args:
        X: Design matrix; ``X @ theta`` must be defined.
        theta: Parameter vector.
        tol: Decision threshold on the probability of class 1. A row is
            labelled 1 only when its probability is strictly greater than
            ``tol``.

    Returns:
        Integer array with one 0/1 prediction per row of ``X``.
    """
    # Clip the logits to the same [-500, 500] range that Pr uses, so np.exp
    # cannot overflow and predictions use the same probabilities as training.
    # As a consequence the probability is never exactly 0.
    logits = np.clip(X @ theta, -500, +500)

    # Calculate probability of a 1
    probs = 1.0 / (1.0 + np.exp(-logits))

    # Threshold to get preds
    return np.where(probs > tol, 1, 0)

def accuracy(X, Y, theta, tol=0.5):
    """Return the fraction of rows whose predicted label equals the true label.

    Args:
        X: Design matrix passed to ``log_regression``.
        Y: True labels, compared element-wise with the predictions.
        theta: Parameter vector.
        tol: Decision threshold forwarded to ``log_regression``.

    Returns:
        The mean of the element-wise comparison ``predictions == Y``.
    """
    hits = log_regression(X, theta, tol) == Y
    return np.mean(hits)

In [131]:
# Fit on the training split, then pick the decision threshold that gives the
# highest accuracy on the dev split.
theta = newton_method(x_train, y_train)

# 100 evenly spaced candidate thresholds covering [0, 1].
tols = np.linspace(0, 1, 100)
accs = np.array([accuracy(x_dev, y_dev, theta, threshold) for threshold in tols])

# argmax returns the first maximum, so ties go to the smallest threshold.
opt_tol = tols[accs.argmax()]

In [132]:
# Refit on the training split and report accuracy on the held-out test split
# using the threshold selected on the dev split.
theta = newton_method(x_train, y_train)
test_accuracy = accuracy(x_test, y_test, theta, opt_tol)
print(test_accuracy)

0.7792207792207793
