In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the raw URL feature table from disk.
url = pd.read_csv("../data/urls.csv", encoding='latin1')

# Encode the boolean flag as 0/1. The column may hold real booleans or their
# string spellings; any value that is not a key of the mapping becomes NaN and
# the row is removed by the NaN filter further down.
suspicious_word_codes = {True: 1, False: 0, "True": 1, "False": 0}
url["has_suspicious_word"] = url["has_suspicious_word"].map(suspicious_word_codes)

# Columns used as model inputs, in design-matrix column order.
feature_cols = [
    'url_length',
    'num_digits',
    'digit_ratio',
    'special_char_ratio',
    'num_hyphens',
    'num_underscores',
    'num_slashes',
    'num_dots',
    'num_question_marks',
    'num_equals',
    'num_at_symbols',
    'num_percent',
    'num_hashes',
    'num_ampersands',
    'num_subdomains',
    'is_https',
    'has_suspicious_word',
]

url_features = url[feature_cols].astype(float)
# NOTE(review): a missing `status` is replaced by 0 before the integer cast,
# so rows without a label are kept and trained on as class 0. Confirm that
# this is intended rather than dropping those rows.
url_response = url['status'].fillna(0).astype(int)

# Standardize every feature to zero mean / unit variance. The small constant
# keeps the division finite for a column whose standard deviation is 0.
x_mean = url_features.mean(axis=0)
x_std_vec = url_features.std(axis=0) + 1e-8
x_std = (url_features - x_mean) / x_std_vec

# Keep only rows whose standardized features are all present. Because the
# response was already filled with 0 above, its notna() test is always True.
features_complete = x_std.notna().all(axis=1)
response_present = url_response.notna()
mask = features_complete & response_present

# Design matrix A (rows x features) and float response vector b.
A = x_std[mask].values
b = url_response[mask].values.astype(float)

print("A shape:", A.shape, "b shape:", b.shape)


  url = pd.read_csv("../data/urls.csv", encoding='latin1')


A shape: (822010, 17) b shape: (822010,)


$$\text{Sigmoid function for logistic regression: }\sigma(z) = \frac{1}{1 + e^{-z}}$$
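$$\text{Logistic loss (using the negative log-likelihood): }f(x) = -\frac{1}{n}\sum_{i=1}^{n} \left[ b_i \log(p_i) + (1-b_i) \log(1-p_i) \right] + \frac{\lambda}{2} \|x\|^2$$
Here $p_i = \sigma(z_i)$ is the predicted probability for observation $i$, $z_i$ is the linear score (weights plus intercept) for row $i$ of $A$, and $n$ is the number of observations.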

We will use batch gradient descent, mini-batch stochastic gradient descent, and Newton's method, with the function above as the objective to minimize. Logistic regression classifies each observation as 1 or 0, and our response is already coded that way, so the data are well suited to this model. The negative log-likelihood is convex, which means we can rely on these methods to find a global minimizer instead of stopping at a non-global local minimizer. We will compare convergence and accuracy to assess the methods.

In [85]:
# Helpers
def sigmoid(z):
    """
    Numerically stable logistic function sigma(z) = 1 / (1 + exp(-z)).

    The direct formula evaluates exp(-z), which overflows (with a NumPy
    RuntimeWarning) once z is a large negative number. This version only ever
    exponentiates a non-positive value, so it cannot overflow.

    Parameters
    ----------
    z : scalar or array-like of numbers

    Returns
    -------
    NumPy float scalar for scalar input, otherwise an ndarray with the same
    shape as z, with values in [0, 1].
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) lies in (0, 1], so neither branch below can overflow.
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^{-z});  z < 0: e^{z} / (1 + e^{z})  (same function).
    out = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional results as arrays.
    return out[()]

def logistic_loss(w, b0, A, b, lam=0.0):
    """
    Mean logistic negative log-likelihood plus an L2 penalty on w.

        loss = mean_i[ log(1 + exp(z_i)) - b_i * z_i ] + 0.5 * lam * ||w||^2
        z    = A @ w + b0

    The per-sample term is algebraically equal to
    -(b_i * log(p_i) + (1 - b_i) * log(1 - p_i)) with p_i = sigmoid(z_i), but
    it is evaluated with np.logaddexp so that no probability is ever passed to
    log(). That avoids both the exp overflow for large |z| and the bias that
    an epsilon inside the logarithm introduces for saturated predictions.

    Parameters
    ----------
    w   : (d,) weight vector
    b0  : scalar intercept (not penalized)
    A   : (n, d) design matrix
    b   : (n,) labels in {0, 1}
    lam : L2 regularization strength applied to w

    Returns
    -------
    Scalar loss value.
    """
    z = A @ w + b0
    # logaddexp(0, z) == log(1 + exp(z)), computed without overflow.
    nll = np.mean(np.logaddexp(0.0, z) - b * z)
    reg = 0.5 * lam * np.sum(w**2)
    return nll + reg

def logistic_grad(w, b0, A, b, lam=0.0):
    """
    Gradient of the mean logistic loss with an L2 penalty on w.

        grad_w = A^T (p - b) / n + lam * w
        grad_b = mean(p - b)
        p      = sigmoid(A @ w + b0)

    Returns (grad_w, grad_b): grad_w has the shape of w, grad_b is a Python
    float. The intercept is not regularized.
    """
    residual = sigmoid(A @ w + b0) - b   # p - b, one entry per row of A
    num_rows = A.shape[0]
    grad_w = A.T @ residual / num_rows + lam * w
    grad_b = float(np.mean(residual))
    return grad_w, grad_b

def predict_proba(A, w, b0):
    """
    Predicted probability of class 1 for every row of A.

    Evaluates sigmoid(A @ w + b0). The intercept b0 must be part of the score:
    the loss and gradient used for training are defined on A @ w + b0, so
    leaving it out would score a different model than the one that was fitted.

    Parameters
    ----------
    A  : (n, d) design matrix
    w  : (d,) weight vector
    b0 : scalar intercept

    Returns
    -------
    (n,) array of probabilities.
    """
    return sigmoid(A @ w + b0)

def predict_label(A, w, b0, threshold=0.5):
    """
    Hard 0/1 predictions for every row of A.

    A row is labelled 1 when the value returned by predict_proba(A, w, b0) is
    at least `threshold`, otherwise 0. Returns an integer array.
    """
    probabilities = predict_proba(A, w, b0)
    is_positive = probabilities >= threshold
    return is_positive.astype(int)

def accuracy(A, b, w, b0):
    """
    Fraction of rows whose predicted label (default threshold) equals b.

    Returns a Python float.
    """
    predicted = predict_label(A, w, b0)
    matches = predicted == b
    return float(matches.mean())

def logistic_hessian_w(w, b0, A, b, lam=0.0):
    """
    Hessian of the regularized mean logistic loss with respect to w only.

        H = (1/n) * A^T diag(p * (1 - p)) A + lam * I      (d x d)
        p = sigmoid(A @ w + b0)

    The labels b do not enter the Hessian; the argument is accepted so the
    signature matches the loss and gradient helpers.
    """
    num_rows, num_features = A.shape
    prob = sigmoid(A @ w + b0)
    curvature = prob * (1 - prob)               # per-row weight, shape (n,)

    # Scale row i of A by curvature[i] instead of building an n x n diagonal.
    weighted_A = curvature[:, None] * A         # shape (n, d)

    hessian = (weighted_A.T @ A) / num_rows     # shape (d, d)
    hessian += lam * np.eye(num_features)
    return hessian



In [86]:
# Gradient Descent
def gd_step(w, b0, A, b, lr, lam=0.0):
    """
    One full-batch gradient-descent update of (w, b0) with step size lr.

    Returns the updated (w, b0); the inputs are not modified.
    """
    grad_w, grad_b = logistic_grad(w, b0, A, b, lam)
    w_next = w - lr * grad_w
    b0_next = b0 - lr * grad_b
    return w_next, b0_next

def fit_batch_gd(A, b, lr=0.1, n_iters=40, lam=0.0):
    """
    Train logistic regression with full-batch gradient descent.

    Starts from w = 0, b0 = 0 and performs n_iters updates on the whole data
    set. After every update the loss and accuracy on (A, b) are recorded.

    Returns (w, b0, loss_history, acc_history); both histories are lists with
    one entry per iteration.
    """
    _, num_features = A.shape
    w, b0 = np.zeros(num_features), 0.0
    loss_hist, acc_hist = [], []

    for _ in range(n_iters):
        w, b0 = gd_step(w, b0, A, b, lr, lam)
        loss_hist.append(logistic_loss(w, b0, A, b, lam))
        acc_hist.append(accuracy(A, b, w, b0))

    return w, b0, loss_hist, acc_hist


In [87]:
# Mini-batch SGD
def make_minibatches(n, batch_size, rng):
    """
    Split a random permutation of range(n) into consecutive index batches.

    `rng` must provide a `permutation(n)` method (e.g. a NumPy Generator).
    Returns a list of index arrays; every batch has batch_size entries except
    possibly the last, which holds the remainder.
    """
    shuffled = rng.permutation(n)
    batches = []
    for start in range(0, n, batch_size):
        batches.append(shuffled[start:start + batch_size])
    return batches

def sgd_minibatch_step(w, b0, A_batch, b_batch, lr, lam=0.0):
    """
    One stochastic-gradient update of (w, b0) computed from a single batch.

    The gradient is evaluated on (A_batch, b_batch) only. Returns the updated
    (w, b0); the inputs are not modified.
    """
    batch_grad_w, batch_grad_b = logistic_grad(w, b0, A_batch, b_batch, lam)
    updated_w = w - lr * batch_grad_w
    updated_b0 = b0 - lr * batch_grad_b
    return updated_w, updated_b0

def fit_minibatch_sgd(A, b, lr=0.05, n_epochs=10, batch_size=512, lam=0.0, seed=0):
    """
    Train logistic regression with mini-batch stochastic gradient descent.

    Starts from w = 0, b0 = 0. Every epoch reshuffles the row indices with a
    generator seeded by `seed`, then applies one update per batch. The loss
    and accuracy on the full (A, b) are recorded once at the end of each epoch.

    Returns (w, b0, loss_history, acc_history); both histories are lists with
    one entry per epoch.
    """
    num_rows, num_features = A.shape
    w, b0 = np.zeros(num_features), 0.0
    rng = np.random.default_rng(seed)
    loss_hist, acc_hist = [], []

    for _ in range(n_epochs):
        # A fresh permutation per epoch so batches differ between epochs.
        for idx in make_minibatches(num_rows, batch_size, rng):
            w, b0 = sgd_minibatch_step(w, b0, A[idx], b[idx], lr, lam)
        loss_hist.append(logistic_loss(w, b0, A, b, lam))
        acc_hist.append(accuracy(A, b, w, b0))

    return w, b0, loss_hist, acc_hist



In [88]:
# Newton's Method
def newton_step(w, b0, A, b, lam=0.0):
    """
    One Newton-style update of (w, b0), treating w and b0 separately.

    w is updated with the d x d Hessian in w; b0 is updated with its own
    scalar second derivative. The mixed w/b0 curvature is not used, so this
    is a block-diagonal approximation of the joint Newton step. Both updates
    are evaluated at the incoming (w, b0).

    Returns the updated (w, b0).
    """
    grad_w, grad_b = logistic_grad(w, b0, A, b, lam)
    hess_w = logistic_hessian_w(w, b0, A, b, lam)

    # Newton direction for w: solve hess_w @ step_w = grad_w.
    try:
        step_w = np.linalg.solve(hess_w, grad_w)
    except np.linalg.LinAlgError:
        # The solve failed (e.g. singular Hessian): use the raw gradient as
        # the step instead, i.e. a gradient-descent step with step size 1.
        step_w = grad_w

    # Second derivative of the mean loss with respect to b0: mean(p * (1 - p)).
    prob = sigmoid(A @ w + b0)
    curvature_b = float(np.mean(prob * (1 - prob)))

    if curvature_b > 0:
        b0_next = b0 - grad_b / curvature_b
    else:
        # Degenerate (zero or NaN) curvature: leave the intercept unchanged.
        b0_next = b0

    return w - step_w, b0_next

def fit_newton(A, b, n_iters=10, lam=0.0):
    """
    Train logistic regression by repeated calls to newton_step.

    Starts from w = 0, b0 = 0 and performs n_iters updates. After every update
    the loss and accuracy on (A, b) are recorded.

    Returns (w, b0, loss_history, acc_history); both histories are lists with
    one entry per iteration.
    """
    _, num_features = A.shape
    w, b0 = np.zeros(num_features), 0.0
    loss_hist, acc_hist = [], []

    for _ in range(n_iters):
        w, b0 = newton_step(w, b0, A, b, lam)
        loss_hist.append(logistic_loss(w, b0, A, b, lam))
        acc_hist.append(accuracy(A, b, w, b0))

    return w, b0, loss_hist, acc_hist


In [None]:

# Fit all three optimizers on the same standardized data, without L2 penalty.
lam = 0.0
x0 = np.zeros(A.shape[1])  # not used by any call in this cell

w_gd, b_gd, loss_gd, acc_gd = fit_batch_gd(
    A, b, lr=0.1, n_iters=500, lam=lam
)
w_sgd, b_sgd, loss_sgd, acc_sgd = fit_minibatch_sgd(
    A, b, lr=0.05, n_epochs=500, batch_size=1024, lam=lam
)
w_new, b_new, loss_new, acc_new = fit_newton(A, b, n_iters=500, lam=lam)

# Report the last recorded training accuracy of each method.
for report_label, final_acc in [
    ("Batch GD final accuracy:    ", acc_gd[-1]),
    ("Mini-batch SGD final acc:   ", acc_sgd[-1]),
    ("Newton final acc:           ", acc_new[-1]),
]:
    print(report_label, final_acc)

# One figure per metric; each shows the three methods on shared axes. The
# x-axis counts iterations for batch GD / Newton and epochs for mini-batch SGD.
curve_labels = ["Batch GD", "Mini-batch SGD (per epoch)", "Newton"]
plot_specs = [
    # (histories, title, y-axis label, output path)
    (
        (loss_gd, loss_sgd, loss_new),
        "Training Loss: GD vs SGD vs Newton",
        "Loss",
        "../img/loss_comparison.png",
    ),
    (
        (acc_gd, acc_sgd, acc_new),
        "Training Accuracy: GD vs SGD vs Newton",
        "Accuracy",
        "../img/accuracy_comparison.png",
    ),
]

for histories, plot_title, y_label, out_path in plot_specs:
    fig, ax = plt.subplots()
    for history, curve_label in zip(histories, curve_labels):
        ax.plot(history, label=curve_label)
    ax.set_title(plot_title)
    ax.set_xlabel("Iteration / Epoch")
    ax.set_ylabel(y_label)
    ax.legend()
    fig.savefig(out_path, dpi=300, bbox_inches="tight")
    plt.show()



BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.