Logistic Regression on breast cancer dataset.

In [252]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

data = load_breast_cancer()
X = data.data
y = data.target

print(f"X shape: {X.shape}, y shape: {y.shape}")


X shape: (569, 30), y shape: (569,)


In [253]:
unique, counts = np.unique(y, return_counts=True)
for cls, cnt in zip(unique, counts):
    print(f"class {cls}: {cnt}")

class 0: 212
class 1: 357


Slight imbalance.

In [254]:
X_train_pre, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=6)
X_val_pre, X_test_pre, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=6)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_pre)
X_val = scaler.fit_transform(X_val_pre)
X_test = scaler.fit_transform(X_test_pre)

![Gradients](figures/paperwork.jpeg)

With the L2 regularization, I switched to dividing by 2, instead of 2m.

In [255]:
w = np.random.random((1, X.shape[1]))
b = np.random.random(1)

In [256]:
def sigmoid(z):
    return 1 / (1+np.exp(-z))

In [257]:
epochs = 200
lambda_ = 0.1
bs = 4
lr = 0.001
eps = 1e-12
M = X_train.shape[0]
for epoch in range(epochs):
    total_loss = 0
    c = 0
    for i in range(int(np.ceil(M / bs))):
        x, y = X_train[c:c+bs], y_train[c:c+bs].reshape(-1, 1)
        m = x.shape[0]
        p = sigmoid(x @ w.T + b)
        p = np.clip(p, eps, 1-eps)
        loss = np.sum(-y * np.log(p) - (1-y) * np.log(1 - p)) / m + (lambda_ / (2)) * np.sum(w**2)
        d_w = (((p - y).T @ x) / m) + lambda_ * w
        d_b = np.mean(p - y)

        w -= lr * d_w
        b -= lr * d_b

        total_loss += loss
        c += bs
    
    val_p = sigmoid(X_val @ w.T + b).flatten()
    val_p = np.clip(val_p, eps, 1-eps)
    val_loss = np.mean(-y_val * np.log(val_p) - (1-y_val) * np.log(1 - val_p)) + (lambda_ / 2) * np.sum(w**2)

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1} train loss: {total_loss / int(np.ceil(M / bs))}, validation loss: {val_loss}")




Epoch 10 train loss: 0.8425136817315212, validation loss: 0.9479056436095008
Epoch 20 train loss: 0.3590239509140598, validation loss: 0.4529945997112484
Epoch 30 train loss: 0.2890649968787759, validation loss: 0.3708312255660373
Epoch 40 train loss: 0.26157479261184774, validation loss: 0.33996885768294854
Epoch 50 train loss: 0.24471527327670523, validation loss: 0.3225622870415416
Epoch 60 train loss: 0.23257965392465862, validation loss: 0.3107879909496402
Epoch 70 train loss: 0.2233248421447084, validation loss: 0.30215451404430765
Epoch 80 train loss: 0.2160952484527097, validation loss: 0.29556773039790557
Epoch 90 train loss: 0.21038085447968186, validation loss: 0.29042943756850703
Epoch 100 train loss: 0.20583353480363667, validation loss: 0.28636360880976786
Epoch 110 train loss: 0.2021989925826744, validation loss: 0.283113115479323
Epoch 120 train loss: 0.1992848905498313, validation loss: 0.28049299832338614
Epoch 130 train loss: 0.19694288270417482, validation loss: 0.2

0 = malignant, 1 = benign

In [258]:
val_p = sigmoid(X_val @ w.T + b).flatten()

target_recall = 1.0
best_t = None

for t in np.linspace(0, 1, 1001):
    y_pred = (val_p > t).astype(int)

    tp = np.sum((y_val == 0) & (y_pred == 0))
    fn = np.sum((y_val == 0) & (y_pred == 1))
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    if recall >= target_recall:
        best_t = t
        break

print("Chosen threshold:", best_t)


Chosen threshold: 0.901


Round down to 0.9

In [260]:
test_p = sigmoid(X_test @ w.T + b).flatten()
y_pred = (test_p > 0.90).astype(int) # If p(benign) > 0.90 predict benign

tp = np.sum((y_test == 0) & (y_pred == 0))
fn = np.sum((y_test == 0) & (y_pred == 1))
fp = np.sum((y_test == 1) & (y_pred == 0))
recall = tp / (tp + fn)
precision = tp / (tp + fp)

from sklearn.metrics import roc_auc_score
print(f"ROC AUC score: {roc_auc_score(y_test, test_p)}")
print(f"Recall score: {recall}")
print(f"Precision score: {precision}")
print(f"False Negatives: {fn}")
print(f"False Positives: {fp}")

ROC AUC score: 0.996606334841629
Recall score: 1.0
Precision score: 0.85
False Negatives: 0
False Positives: 6


While these results are good, they are evaluated on a small test set. Recall is perfect on this test split, meaning no malignant tumors are missed. Precision is lower at 0.85, 6 benign cases were incorrectly flagged as malignant. This is intentional, this model prioritizes as little false negatives as possible at the cost of a small number of false positives which is acceptable.

In [268]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, recall_score, precision_score, confusion_matrix

model = LogisticRegression(penalty='l2')

model.fit(X_train, y_train)
p = model.predict_proba(X_test)[:, 1]
t = 0.9
y_pred = (p > t).astype(int)

recall = recall_score(y_test, y_pred, pos_label=0)
precision = precision_score(y_test, y_pred, pos_label=0)
roc = roc_auc_score(y_test, y_pred)

cm = confusion_matrix(y_test, y_pred)
fn = cm[0, 1]
fp = cm[1, 0]

print(f"ROC AUC score: {roc}")
print(f"Recall score: {recall}")
print(f"Precision score: {precision}")
print(f"False Negatives: {fn}")
print(f"False Positives: {fp}")

ROC AUC score: 0.9660633484162896
Recall score: 0.9705882352941176
Precision score: 0.9428571428571428
False Negatives: 1
False Positives: 2


