# Breast Cancer Detection with Logistic Regression (Step by Step)

We model the probability that a tumor is malignant (label 1) given features $x \in \mathbb{R}^n$.

**Sigmoid (logistic) function**
\[
\sigma(z) = \frac{1}{1 + e^{-z}}
\]

**Linear score**
\[
z = w_0 + w_1 x_1 + \cdots + w_n x_n = \mathbf{w}^\top \tilde{\mathbf{x}}
\]
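where $\tilde{\mathbf{x}} = (1, x_1, \dots, x_n)^\top$ is $x$ with a leading 1 for the intercept and $\mathbf{w} = (w_0, w_1, \dots, w_n)^\top$ collects the weights; the formulas below write this parameter vector as $\theta$.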

**Hypothesis**
\[
h_\theta(x) = \sigma(z)
\]

**Binary cross-entropy (log loss) over $m$ samples**
\[
J(\theta) = -\frac{1}{m}\sum_{i=1}^m \left[ y^{(i)} \log h_\theta(x^{(i)}) + (1 - y^{(i)}) \log \big(1 - h_\theta(x^{(i)})\big) \right]
\]

**Gradient (vectorized)**
\[
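\nabla_{\theta} J = \frac{1}{m}\, X^\top \big( \hat{\mathbf{y}} - \mathbf{y} \big),
\quad \text{where } \hat{\mathbf{y}}=\sigma(X\theta)
\]
Here $X \in \mathbb{R}^{m \times (n+1)}$ is the design matrix whose $i$-th row is $\tilde{\mathbf{x}}^{(i)\top}$ (the sample with its leading 1), $\mathbf{y}$ is the vector of labels, and $\sigma$ is applied element-wise.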

**Gradient descent update**
\[
\theta \leftarrow \theta - \alpha \, \nabla_{\theta} J
\]

In [ ]:
import io, zipfile, requests
import pandas as pd
import numpy as np

# Direct ZIP from UCI (contains wdbc.data and wdbc.names)
UCI_ZIP_URL = "https://archive.ics.uci.edu/static/public/17/breast%2Bcancer%2Bwisconsin%2Bdiagnostic.zip"

# The file is read with header=None, so these names are assigned to columns
# purely by position: ID, Diagnosis, then every base measurement once per
# statistic (statistic is the outer loop, measurement the inner one).
base = ['radius','texture','perimeter','area','smoothness',
        'compactness','concavity','concave_points','symmetry','fractal_dimension']
feature_names = [f"{b}_{stat}" for stat in ['mean','se','worst'] for b in base]
cols = ['ID','Diagnosis'] + feature_names

# Always pass a timeout: without one, requests can wait indefinitely on a
# stalled connection and the cell never finishes.
resp = requests.get(UCI_ZIP_URL, timeout=30)
resp.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page

# The archive is unpacked in memory; nothing is written to disk.
with zipfile.ZipFile(io.BytesIO(resp.content)) as zf, zf.open('wdbc.data') as f:
    df = pd.read_csv(f, header=None, names=cols)

# Binary target: 1 where Diagnosis is 'M', 0 otherwise.
df['target'] = (df['Diagnosis'] == 'M').astype(int)
df.head()

In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Feature matrix and label vector as plain NumPy arrays.
y = df['target'].values
X = df[feature_names].values

# 80/20 hold-out split; stratifying on y keeps the class balance comparable
# in both parts, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the test rows so no test information leaks in.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
X_train_std.shape, X_test_std.shape, y_train.shape

In [ ]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    The input is first clipped to [-500, 500] so that ``np.exp`` stays within
    float64 range for extreme scores.
    """
    bounded = np.clip(z, -500, 500)
    denominator = 1.0 + np.exp(-bounded)
    return 1.0 / denominator

def log_loss(y_true, y_hat, eps=1e-12):
    """Return the mean binary cross-entropy of predictions against labels.

    Args:
        y_true: Array-like of 0/1 labels (list, tuple, or NumPy array).
        y_hat: Array-like of predicted probabilities for label 1.
        eps: Probabilities are clipped to ``[eps, 1 - eps]`` so that neither
            logarithm is ever evaluated at 0.

    Returns:
        The average of ``-[y*log(p) + (1 - y)*log(1 - p)]`` over all samples.
    """
    # Coerce up front: `1 - y_true` raises TypeError on a plain Python list.
    y_true = np.asarray(y_true, dtype=float)
    y_hat = np.clip(y_hat, eps, 1 - eps)
    per_sample = y_true * np.log(y_hat) + (1 - y_true) * np.log(1 - y_hat)
    return -np.mean(per_sample)

def add_intercept(X):
    """Return X with a column of ones prepended as the first column.

    The ones column multiplies the intercept weight, so the bias term can be
    learned as part of a single parameter vector.
    """
    n_rows = X.shape[0]
    ones = np.ones(n_rows)
    # column_stack turns the 1-D ones vector into a column before joining.
    return np.column_stack((ones, X))

# Design matrices: standardized features with a leading column of ones.
Xtr, Xte = add_intercept(X_train_std), add_intercept(X_test_std)
n_samples, n_params = Xtr.shape

# Small random starting weights from a seeded generator, for reproducibility.
rng = np.random.default_rng(0)
theta = rng.normal(scale=0.01, size=n_params)

alpha, epochs = 0.1, 2000  # learning rate, number of full-batch steps
last_epoch = epochs - 1

loss_history = []
for t in range(epochs):
    # Forward pass, then the batch gradient X^T (y_hat - y) / m.
    yhat = sigmoid(Xtr @ theta)
    grad = (Xtr.T @ (yhat - y_train)) / n_samples
    theta -= alpha * grad
    # Log every 50th step plus the final one. yhat was computed before this
    # step's update, so the recorded value is the loss at the start of step t.
    if t == last_epoch or t % 50 == 0:
        loss_history.append((t, log_loss(y_train, yhat)))

loss_history[-5:]

In [ ]:
import matplotlib.pyplot as plt

# Split the recorded (iteration, loss) pairs into two parallel sequences.
its, losses = zip(*loss_history)

# One figure with a single axes; markers show the individual logged points.
fig, ax = plt.subplots()
ax.plot(its, losses, marker='o')
ax.set_xlabel("Iteration")
ax.set_ylabel("Train Log Loss")
# The trailing semicolon keeps the notebook from echoing the return value.
ax.set_title("From-scratch logistic regression: training loss");

In [ ]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# Predicted probability of label 1 on the held-out rows, cut at 0.5.
proba_te = sigmoid(Xte @ theta)
pred_te = (proba_te >= 0.5).astype(int)

# The label-based metrics share one call shape, so drive them from a table.
for label, metric in (
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1       :", f1_score),
):
    print(label, metric(y_test, pred_te))

# ROC AUC is computed from the scores themselves, not the thresholded labels.
print("ROC AUC  :", roc_auc_score(y_test, proba_te))
print("\nConfusion matrix:\n", confusion_matrix(y_test, pred_te))
print("\nClassification report:\n", classification_report(y_test, pred_te))

In [ ]:
def _scores_at(thr):
    """Return precision, recall and F1 on the test set for one cut-off."""
    pred = (proba_te >= thr).astype(int)
    return {
        "threshold": thr,
        "precision": precision_score(y_test, pred),
        "recall":    recall_score(y_test, pred),
        "f1":        f1_score(y_test, pred),
    }


# Nine evenly spaced cut-offs from 0.1 to 0.9, one table row per cut-off.
thresholds = np.linspace(0.1, 0.9, 9)
rows = [_scores_at(thr) for thr in thresholds]
pd.DataFrame(rows)

In [ ]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Reference model: the same standardize-then-logistic-regression recipe,
# fitted on the raw training features so the pipeline does its own scaling.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=500, solver="lbfgs")),
])
pipe.fit(X_train, y_train)

# Second predict_proba column, thresholded at 0.5 exactly like the
# from-scratch model so the two sets of metrics are comparable.
sk_proba = pipe.predict_proba(X_test)[:, 1]
sk_pred = (sk_proba >= 0.5).astype(int)

for label, metric in (
    ("sklearn Accuracy :", accuracy_score),
    ("sklearn Precision:", precision_score),
    ("sklearn Recall   :", recall_score),
    ("sklearn F1       :", f1_score),
):
    print(label, metric(y_test, sk_pred))
print("sklearn ROC AUC  :", roc_auc_score(y_test, sk_proba))

In [ ]:
# Skip theta[0] (the intercept) and label each remaining weight by feature.
coefs = pd.Series(theta[1:], index=feature_names)
# Rank by absolute size, largest first, so strong negative weights rank high too.
from_scratch = coefs.sort_values(key=np.abs, ascending=False)
from_scratch.head(10)