# Logistic Regression (Binary & Multinomial)

**Scenario**: Lead qualification — predict whether a marketing lead will convert.

Use a chronological split, evaluate ROC/PR, pick an operating threshold, and check how well the predicted probabilities are calibrated.

Diagnostics
- Base rate & class balance
- PR‑AUC / ROC‑AUC + calibration curves
- Thresholds, confusion matrix & expected‑value rationale
- Top coefficients with odds‑ratio translations

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve, precision_recall_curve
from sklearn.calibration import calibration_curve
from sklearn.metrics import confusion_matrix, f1_score

!wget -q https://raw.githubusercontent.com/Jihun-ust/ust-mail-557/main/Classification/classification_utils.py
import classification_utils as utils
# Remote CSV holding the lead-conversion data used throughout this notebook.
csv_path = "https://raw.githubusercontent.com/Jihun-ust/ust-mail-557/main/Classification/classification.csv"

# Load with parsed timestamps and order rows by time before splitting.
df = pd.read_csv(csv_path, parse_dates=["ts"])
df = df.sort_values("ts")
# NOTE(review): chrono_split lives in classification_utils; presumably it
# holds out the latest 20% of rows by "ts" — confirm against the helper.
train, test = utils.chrono_split(df, "ts", test_frac=0.2)

# Column groups, named once and reused for both `features` and the
# preprocessor so the two cannot drift apart.
categorical_cols = ["ad_channel", "device", "region", "campaign"]
numeric_cols = [
    "spend_l7",
    "pages_per_session",
    "sessions_l30",
    "time_on_site_s",
    "pricing_views_l7",
    "email_opens_l30",
    "past_purchases",
    "tenure_days",
]
passthrough_cols = ["discount_flag", "competitor_visits"]

features = categorical_cols + numeric_cols + passthrough_cols
target = "converted"

X_train = train[features]
y_train = train[target]
X_test = test[features]
y_test = test[target]

# Scale numeric columns, one-hot encode categoricals (categories unseen at
# fit time are ignored at predict time), and pass the remaining two through.
pre = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("bin", "passthrough", passthrough_cols),
    ]
)

# Baseline: unweighted logistic regression on the preprocessed features.
clf = Pipeline(
    steps=[
        ("pre", pre),
        ("lr", LogisticRegression(max_iter=500, class_weight=None)),
    ]
)
clf.fit(X_train, y_train)

# Second predict_proba column = probability of the class at clf.classes_[1].
probs = clf.predict_proba(X_test)[:, 1]
print("Log loss:", log_loss(y_test, probs))
metrics = utils.evaluate_classifier(y_test, probs, threshold=0.5, title_prefix="Logistic (baseline)")

### Advanced Diagnostics

#### Base rate & class balance

In [None]:
def class_balance(y):
    """Summarise how often each label occurs in *y*.

    Returns a DataFrame with one row per distinct label and the columns
    ``class`` (the label), ``count`` (occurrences) and ``rate`` (share of
    all counted labels), ordered as ``Series.value_counts`` orders them.
    """
    labels = pd.Series(y)
    counts = labels.value_counts().rename("count")
    rates = labels.value_counts(normalize=True).rename("rate")
    # Both series share the same label index, so they line up column-wise.
    table = pd.concat([counts, rates], axis=1)
    table.index.name = "class"
    return table.reset_index()

cb_train = class_balance(y_train)
cb_test = class_balance(y_test)

# Show each split's balance table with a leading "split" column naming it.
for heading, split_name, balance in (
    ("Train set", "train", cb_train),
    ("Test set", "test", cb_test),
):
    print(heading)
    labelled = balance.copy()  # leave cb_train / cb_test untouched
    labelled.insert(0, "split", split_name)
    display(labelled)

# Quick line plot: positive class rate per week in the test split (optional)
if "ts" in test.columns:
    tmp = test.loc[:, ["ts", target]].copy()
    # Bucket each timestamp into its week, keyed by the week's start time.
    tmp = tmp.assign(week=tmp["ts"].dt.to_period("W").dt.to_timestamp())
    agg = tmp.groupby("week", as_index=False)[target].mean()

    fig, ax = plt.subplots(figsize=(8, 3))
    ax.plot(agg["week"], agg[target], marker="o")
    ax.set_ylim(0, 1)
    ax.set_title("Weekly positive rate (test)")
    ax.set_xlabel("week")
    ax.set_ylabel("P(y=1)")
    fig.tight_layout()
    plt.show()

#### PR‑AUC / ROC‑AUC + calibration curves

In [None]:
# Threshold-free ranking metrics on the held-out split.
roc_auc = roc_auc_score(y_test, probs)
pr_auc = average_precision_score(y_test, probs)
print(f"ROC‑AUC: {roc_auc:.3f} | PR‑AUC: {pr_auc:.3f}")

# ROC (dashed diagonal = chance level)
fpr, tpr, roc_th = roc_curve(y_test, probs)
fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(fpr, tpr, label=f"AUC={roc_auc:.3f}")
ax.plot([0, 1], [0, 1], linestyle="--", linewidth=1)
ax.set_xlabel("FPR")
ax.set_ylabel("TPR")
ax.set_title("ROC curve")
ax.legend()
fig.tight_layout()
plt.show()

# PR
prec, rec, pr_th = precision_recall_curve(y_test, probs)
fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(rec, prec, label=f"AP={pr_auc:.3f}")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision‑Recall curve")
ax.legend()
fig.tight_layout()
plt.show()

# Calibration curve (reliability): 10 quantile bins of predicted probability,
# observed positive frequency plotted against the mean prediction per bin.
prob_true, prob_pred = calibration_curve(y_test, probs, n_bins=10, strategy="quantile")
fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(prob_pred, prob_true, marker="o", label="model")
ax.plot([0, 1], [0, 1], linestyle="--", linewidth=1, label="perfect")
ax.set_xlabel("Predicted probability (bin avg)")
ax.set_ylabel("Observed frequency")
ax.set_title("Calibration curve (test)")
ax.legend()
fig.tight_layout()
plt.show()

#### Thresholds, confusion matrix & expected‑value rationale

In [None]:
# Candidate thresholds
# t_f1: grid value with the highest F1 on the test split (first one on ties)
grid = np.linspace(0.01, 0.99, 99)
f1s = [f1_score(y_test, (probs >= cutoff).astype(int)) for cutoff in grid]
best_f1_idx = int(np.argmax(f1s))
t_f1 = float(grid[best_f1_idx])

# t_ks: Youden/KS (maximize TPR - FPR) on ROC; reuses the arrays returned by
# roc_curve in the ROC/PR cell.
tpr_m = tpr
fpr_m = fpr
th_m = roc_th
ks_vals = tpr_m - fpr_m
best_ks_idx = int(np.argmax(ks_vals))
t_ks = float(th_m[best_ks_idx])

# t_cost: expected value threshold (set your economics here)
# EV = TP * gain_tp - FP * cost_fp - FN * cost_fn - TN * cost_tn (defaults)
gain_tp = 100.0   # e.g., expected profit if we act and a user converts
cost_fp = 10.0    # e.g., cost of acting on a non‑converter
cost_fn = 40.0    # e.g., opportunity cost of missing a converter
cost_tn = 0.0     # cost of correctly not acting
def expected_value(y_true, p, t, g_tp=gain_tp, c_fp=cost_fp, c_fn=cost_fn, c_tn=cost_tn):
    """Return the total expected value of acting on every case with ``p >= t``.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    p : array-like of predicted positive-class probabilities.
    t : decision threshold; predictions are ``p >= t``.
    g_tp, c_fp, c_fn, c_tn : per-case gain for a true positive and costs for a
        false positive, false negative and true negative. The defaults are
        bound to the module-level constants when this function is defined.

    Returns
    -------
    ``tp*g_tp - fp*c_fp - fn*c_fn - tn*c_tn`` over the whole sample.
    """
    yhat = (np.asarray(p) >= t).astype(int)
    # Pin the label set: without it confusion_matrix collapses to 1x1 when
    # only one class appears in both y_true and yhat, and the four-way
    # unpack below raises.
    tn, fp, fn, tp = confusion_matrix(y_true, yhat, labels=[0, 1]).ravel()
    return tp*g_tp - fp*c_fp - fn*c_fn - tn*c_tn

# Expected value at every grid threshold; keep the first maximiser.
evs = [expected_value(y_test, probs, cutoff) for cutoff in grid]
best_ev_idx = int(np.argmax(evs))
t_cost = float(grid[best_ev_idx])

print(f"Proposed thresholds → F1-opt: {t_f1:.2f} | KS/Youden: {t_ks:.2f} | EV-opt: {t_cost:.2f}")

# Confusion matrices & EV at proposed thresholds
def cm_report(y_true, p, t, label):
    """Print confusion counts, TPR/FPR/precision and expected value at *t*.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    p : array-like of predicted positive-class probabilities.
    t : decision threshold; predictions are ``p >= t``.
    label : text shown in square brackets at the start of the line.

    Prints one line and returns ``None``.
    """
    yhat = (np.asarray(p) >= t).astype(int)
    # labels=[0, 1] keeps the matrix 2x2 even when a class is absent from
    # both y_true and yhat, so the unpack cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_true, yhat, labels=[0, 1]).ravel()
    ev = expected_value(y_true, p, t)
    # The 1e-12 in each denominator avoids division by zero when a row or
    # column of the matrix is empty.
    print(f"[{label}] thresh={t:.2f} | TP={tp} FP={fp} FN={fn} TN={tn} | "
          f"TPR={tp/(tp+fn+1e-12):.3f} FPR={fp/(fp+tn+1e-12):.3f} Precision={tp/(tp+fp+1e-12):.3f} | EV={ev:,.2f}")

# One report line per candidate operating point, baseline first.
for cutoff, tag in (
    (0.50, "Baseline 0.5"),
    (t_f1, "F1‑opt"),
    (t_ks, "KS/Youden"),
    (t_cost, "EV‑opt"),
):
    cm_report(y_test, probs, cutoff, tag)

# Plot EV vs threshold, marking the EV-optimal threshold with a dashed line
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(grid, evs)
ax.axvline(t_cost, linestyle="--")
ax.set_xlabel("threshold")
ax.set_ylabel("Expected value")
ax.set_title("Expected value vs threshold (test)")
fig.tight_layout()
plt.show()

#### Top coefficients with odds‑ratio translations

In [None]:
def get_feature_names(preprocessor, input_features):
    """Return the expanded output feature names of a fitted ColumnTransformer.

    Walks ``preprocessor.transformers_`` in order and concatenates the names
    each entry contributes, without the ``<name>__`` prefix.

    Parameters
    ----------
    preprocessor : fitted object exposing ``transformers_`` as an iterable of
        ``(name, transformer, columns)`` triples.
    input_features : sequence of input column names, or None. Used only to
        translate integer column positions into names.

    Returns
    -------
    numpy.ndarray of feature names in transformer order.
    """
    def _as_names(cols):
        # A bare string selects a single column; wrap it so it is not later
        # iterated character by character.
        if isinstance(cols, str):
            return [cols]
        names = []
        for col in cols:
            is_position = (
                isinstance(col, (int, np.integer))
                and not isinstance(col, bool)
            )
            if (
                is_position
                and input_features is not None
                and 0 <= col < len(input_features)
            ):
                names.append(input_features[col])
            else:
                names.append(col)
        return names

    out = []
    for _name, trans, cols in preprocessor.transformers_:
        # Any dropped entry contributes no output columns, whatever its name
        # (not only the implicit "remainder").
        if isinstance(trans, str) and trans == "drop":
            continue
        names = _as_names(cols)
        if hasattr(trans, "get_feature_names_out"):
            # Let the transformer expand its own names where it can.
            names = trans.get_feature_names_out(names)
        # Otherwise ("passthrough" string, or a transformer without
        # get_feature_names_out) assume one output column per input column.
        out.extend(np.asarray(names).tolist())
    return np.array(out)

# Pull the fitted preprocessor and LR out of the pipeline, then pair each
# coefficient with its expanded feature name.
assert isinstance(clf, Pipeline)
assert "pre" in clf.named_steps
assert "lr" in clf.named_steps
preproc = clf.named_steps["pre"]
lr: LogisticRegression = clf.named_steps["lr"]

feat_names = get_feature_names(preproc, features)
coef = lr.coef_.ravel()
# exp(coefficient) = multiplicative change in odds per unit of the feature.
odds_ratio = np.exp(coef)

coef_df = pd.DataFrame(
    {
        "feature": feat_names,
        "coef": coef,
        "odds_ratio": odds_ratio,
    }
)
coef_df = coef_df.sort_values(by="coef", ascending=False)

# Top positive & negative drivers (negatives re-sorted most negative first)
top_k = 12
top_pos = coef_df.head(top_k)
top_neg = coef_df.tail(top_k).sort_values(by="coef")

for heading, drivers in (
    (f"Top {top_k} Positive Drivers", top_pos),
    (f"Top {top_k} Negative Drivers", top_neg),
):
    print(heading)
    display(drivers)

# Quick text translation helper
def plain_explain(row):
    """Turn one coefficient row into a plain-language odds-ratio sentence.

    Parameters
    ----------
    row : mapping (dict or pandas row) with keys ``"feature"`` (expanded
        feature name) and ``"odds_ratio"`` (numeric).

    Returns
    -------
    str. The wording depends on whether the name starts with a one-hot
    prefix (``ad_channel_``, ``device_``, ``region_``, ``campaign_``), is one
    of the two pass-through columns, or is anything else (described as a
    scaled numeric feature).
    """
    # NOTE(review): the sentences say "vs baseline/reference", but the
    # OneHotEncoder in this notebook is built without dropping a level, so
    # there is no explicit reference category — read these as approximate.
    f = row["feature"]
    orv = row["odds_ratio"]
    categorical_templates = (
        ("ad_channel_", "If ad_channel = '{base}', odds of conversion are ×{orv:.2f} vs baseline channel (holding others constant)."),
        ("device_", "Using device '{base}' multiplies conversion odds by ×{orv:.2f} vs the reference device."),
        ("region_", "Region '{base}' changes conversion odds by ×{orv:.2f} vs the reference region."),
        ("campaign_", "Campaign '{base}' changes conversion odds by ×{orv:.2f} vs the reference campaign."),
    )
    for prefix, template in categorical_templates:
        # Match the prefix only at the start, and strip it only once, so a
        # numeric feature that merely contains e.g. "device_" is not mistaken
        # for a category level and level names keep any repeated text.
        if f.startswith(prefix):
            return template.format(base=f[len(prefix):], orv=orv)
    if f in ("discount_flag", "competitor_visits"):
        return f"Flag '{f}=1' multiplies odds by ×{orv:.2f} compared with {f}=0."
    return f"A one‑SD increase in '{f}' multiplies conversion odds by ×{orv:.2f}."

print("\nPlain-language explanations (sample):")
# Six strongest positive and six strongest negative drivers, in that order.
sample_rows = pd.concat([top_pos.head(6), top_neg.head(6)])
for _, r in sample_rows.iterrows():
    print(" •", plain_explain(r))

print("\n")
print("Warning: The dataset used in this example was programmatically generated, which does not reflect real-world information.")