# Decision Trees

Fit a DecisionTreeClassifier, tune depth minimally, and inspect feature importance and decision boundaries via partial plots.

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, warnings
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, plot_tree

!wget -q https://raw.githubusercontent.com/Jihun-ust/ust-mail-557/main/Classification/classification_utils.py
import classification_utils as utils
csv_path = "https://raw.githubusercontent.com/Jihun-ust/ust-mail-557/main/Classification/classification.csv"
# Silences every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Read the CSV with `ts` parsed as datetimes and order rows by time before splitting.
df = pd.read_csv(csv_path, parse_dates=["ts"])
df = df.sort_values("ts")
# NOTE(review): chrono_split lives in the downloaded helper module; presumably it
# holds out the last 20% of rows by `ts` — confirm against classification_utils.py.
train, test = utils.chrono_split(df, "ts", test_frac=0.2)

# Column groups, named once so the feature list and the preprocessor cannot drift apart.
categorical_cols = ["ad_channel", "device", "region", "campaign"]
numeric_cols = [
    "spend_l7",
    "pages_per_session",
    "sessions_l30",
    "time_on_site_s",
    "pricing_views_l7",
    "past_purchases",
]
passthrough_cols = ["discount_flag", "competitor_visits"]

features = categorical_cols + numeric_cols + passthrough_cols
target = "converted"

X_train = train[features]
y_train = train[target]
X_test = test[features]
y_test = test[target]

# Scale numeric columns, one-hot encode categoricals (unseen categories are
# ignored at transform time), and forward the remaining columns untouched.
pre = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("bin", "passthrough", passthrough_cols),
    ]
)

# Depth- and leaf-size-limited tree behind the preprocessing step.
tree = Pipeline(
    [
        ("pre", pre),
        ("clf", DecisionTreeClassifier(max_depth=6, min_samples_leaf=50, random_state=42)),
    ]
)
tree.fit(X_train, y_train)

# Probability of the second class column for each test row.
probs = tree.predict_proba(X_test)[:, 1]
_ = utils.evaluate_classifier(y_test, probs, title_prefix="Decision Tree")

# Show a small tree plot (only the top two levels are drawn)
est = tree.named_steps["clf"]
plt.figure(figsize=(12, 6))
plot_tree(
    est,
    fontsize=10,
    max_depth=2,
    filled=True,
    feature_names=tree.named_steps["pre"].get_feature_names_out(),
    class_names=["no", "yes"],
)
plt.tight_layout()
plt.show()

### Advanced Diagnostics
#### The top of the tree (depth ≤ 3) with sample counts & class probabilities

In [None]:
# Text view of the top of the tree (depth ≤ 3) with counts & probabilities
import numpy as np
from sklearn.tree import _tree

# Fitted classifier and the feature names produced by the preprocessing step;
# the tree's feature indices refer to these transformed columns.
est = tree.named_steps["clf"]
fn = tree.named_steps["pre"].get_feature_names_out()

# Low-level tree structure; the last axis of `value` has one slot per class.
T = est.tree_
n_classes = T.value.shape[2]
if n_classes != 2:
    # Raised explicitly rather than asserted: assert statements are removed
    # when Python runs with -O, which would let a multi-class tree through.
    raise ValueError("This snippet assumes a binary target.")

def node_summary(node_id, tree_=None):
    """Return ``(total, pos, p_pos)`` for one node of a fitted binary tree.

    Args:
        node_id: Index of the node inside the tree arrays.
        tree_: Optional tree structure exposing ``value`` and
            ``weighted_n_node_samples``. Defaults to the notebook-level ``T``.

    Returns:
        A tuple ``(total, pos, p_pos)``: the (weighted) number of training
        samples in the node, the (weighted) number of positive samples, and
        the positive-class share. ``p_pos`` is NaN when the node carries no
        class mass.
    """
    t = T if tree_ is None else tree_
    class_mass = t.value[node_id, 0]  # [neg, pos]
    mass = class_mass.sum()

    # Node size is read from weighted_n_node_samples instead of summing
    # `value`: newer scikit-learn releases store per-class fractions there
    # (NOTE(review): changed around 1.4 — confirm against the release notes),
    # in which case the sum is 1.0 rather than the sample count.
    total = float(t.weighted_n_node_samples[node_id])
    if mass <= 0:
        return total, 0.0, np.nan

    # Normalising by the mass works for both layouts (counts or fractions).
    p_pos = float(class_mass[1] / mass)
    # Rounded to strip float noise so callers that truncate with int() do not
    # turn 6.999999 into 6.
    pos = float(np.round(p_pos * total, 6))
    return total, pos, p_pos

def print_subtree(node_id=0, depth=0, max_depth=3, prefix=""):
    """Print an indented text view of the tree rooted at ``node_id``.

    Each split node is printed twice: once as the ``IF`` condition ahead of
    its left subtree and once as the ``ELSE`` condition ahead of its right
    subtree, both carrying that node's own statistics. Recursion stops at
    ``max_depth``; any node reached at that depth is printed with the
    ``Leaf`` label even if it has children in the fitted tree.

    Args:
        node_id: Node to start from.
        depth: Current depth, used for indentation and the depth cut-off.
        max_depth: Depth at which nodes are no longer expanded.
        prefix: Text placed before the node label (branch marker).
    """
    total, pos, p_pos = node_summary(node_id)
    pad = "  " * depth
    stats = f"(n={int(total)}, pos={int(pos)}, p1={p_pos:.3f})"

    has_split = T.feature[node_id] != _tree.TREE_UNDEFINED
    if not (has_split and depth < max_depth):
        print(f"{pad}{prefix}Leaf {node_id}: {stats}")
        return

    feat = fn[T.feature[node_id]]
    thresh = T.threshold[node_id]

    print(f"{pad}{prefix}Node {node_id}: IF {feat} ≤ {thresh:.3f} {stats}")
    print_subtree(T.children_left[node_id], depth + 1, max_depth, prefix="Left:  ")

    print(f"{pad}{prefix}ELSE {feat} > {thresh:.3f} {stats}")
    print_subtree(T.children_right[node_id], depth + 1, max_depth, prefix="Right: ")


print_subtree(node_id=0, max_depth=3)

#### PR‑AUC / ROC‑AUC + confusion matrix at a proposed threshold (F1‑optimal)
- Picks F1‑optimal threshold on test for illustration. In production, pick thresholds on a validation set and lock them before final test.

In [None]:
# PR‑AUC / ROC‑AUC and confusion matrix at proposed threshold (F1-opt)
import numpy as np, matplotlib.pyplot as plt
from sklearn.metrics import (roc_auc_score, average_precision_score, roc_curve,
                             precision_recall_curve, confusion_matrix, f1_score)

# Threshold-free ranking metrics on the test split.
roc_auc = roc_auc_score(y_test, probs)
pr_auc = average_precision_score(y_test, probs)
print(f"ROC‑AUC: {roc_auc:.3f}  |  PR‑AUC: {pr_auc:.3f}")

# Curve coordinates (the returned threshold arrays are not needed here).
fpr, tpr, _ = roc_curve(y_test, probs)
prec, rec, _ = precision_recall_curve(y_test, probs)

# ROC curve with the chance diagonal for reference.
plt.figure(figsize=(5, 4))
plt.plot(fpr, tpr, label=f"AUC={roc_auc:.3f}")
plt.plot([0, 1], [0, 1], linestyle="--", linewidth=1)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC")
plt.legend()
plt.tight_layout()
plt.show()

# Precision-recall curve.
plt.figure(figsize=(5, 4))
plt.plot(rec, prec, label=f"AP={pr_auc:.3f}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision‑Recall")
plt.legend()
plt.tight_layout()
plt.show()

# Proposed threshold: maximize F1 on test (swap to validation if you prefer).
# Scans 99 evenly spaced cut-offs from 0.01 to 0.99 and keeps the first best one.
grid = np.linspace(0.01, 0.99, 99)
f1s = [f1_score(y_test, (probs >= t).astype(int)) for t in grid]
t_star = float(grid[int(np.argmax(f1s))])
print(f"Proposed threshold (F1‑optimal): {t_star:.2f}")

# Confusion matrix at t_star.
yhat = (probs >= t_star).astype(int)
# labels=[0, 1] pins the matrix to 2x2 so the four-way unpack below still works
# when only one class shows up in y_test / yhat (assumes a 0/1 target).
# The false-negative count is bound to `fn_count`, not `fn`: `fn` holds the
# feature-name array that print_subtree reads, and overwriting it with an
# integer would break any later call to print_subtree.
tn, fp, fn_count, tp = confusion_matrix(y_test, yhat, labels=[0, 1]).ravel()
print(f"TP={tp} FP={fp} FN={fn_count} TN={tn}")

#### Calibration curves pre vs post calibration (+ Brier score)
- Calibrates with isotonic via CalibratedClassifierCV (3‑fold on train), then compares calibration curves and Brier scores.

- The Brier score is the mean squared difference between a model's predicted probabilities and the actual outcomes, giving a single number for probabilistic accuracy, while ROC AUC measures how well the model ranks positives above negatives, i.e., its ability to distinguish between classes. A lower Brier score indicates more accurate probabilities (0 is perfect), whereas a higher ROC AUC (closer to 1) indicates better discrimination between classes.

In [None]:
# Calibration curves pre/post calibration (isotonic) and Brier scores
import numpy as np, matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.metrics import brier_score_loss
from sklearn.base import clone

# Wrap an unfitted copy of the pipeline in a 3-fold isotonic calibrator and
# fit it on the training split only.
base = clone(tree)
cal = CalibratedClassifierCV(estimator=base, method="isotonic", cv=3).fit(X_train, y_train)
probs_cal = cal.predict_proba(X_test)[:, 1]

# Brier scores of raw vs. calibrated probabilities on the test split.
brier_raw = brier_score_loss(y_test, probs)
brier_cal = brier_score_loss(y_test, probs_cal)
print(f"Brier (raw): {brier_raw:.4f}  |  Brier (calibrated): {brier_cal:.4f}")

# Reliability curves: up to 10 quantile bins for each set of probabilities.
curve_kwargs = {"n_bins": 10, "strategy": "quantile"}
pt_raw, pp_raw = calibration_curve(y_test, probs, **curve_kwargs)
pt_cal, pp_cal = calibration_curve(y_test, probs_cal, **curve_kwargs)

plt.figure(figsize=(6, 5))
for mean_pred, frac_pos, curve_label in (
    (pp_raw, pt_raw, "raw"),
    (pp_cal, pt_cal, "calibrated"),
):
    plt.plot(mean_pred, frac_pos, marker="o", label=curve_label)
# Diagonal = predicted probability equals observed frequency.
plt.plot([0, 1], [0, 1], linestyle="--", linewidth=1, label="perfect")
plt.xlabel("Predicted probability (bin avg)")
plt.ylabel("Observed frequency")
plt.title("Calibration (test)")
plt.legend()
plt.tight_layout()
plt.show()

#### Stability of top splits across folds/quarters
- Uses quarters from the ts column on train to test split‑stability of the root split feature.

In [None]:
# Stability of top splits across quarters (time-aware)
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone

def root_feature_name(pipe, X, y):
    """Fit a fresh clone of ``pipe`` on ``(X, y)`` and name its root split feature.

    The pipeline is expected to have a ``"pre"`` step providing
    ``get_feature_names_out()`` and a ``"clf"`` step exposing ``tree_``.
    The passed-in ``pipe`` itself is not fitted; only its clone is.

    Returns:
        The transformed feature name used by the root node, or ``"LEAF"``
        when the root's feature index is negative (no split was made).
    """
    fitted = clone(pipe).fit(X, y)
    names = fitted.named_steps["pre"].get_feature_names_out()
    root_idx = fitted.named_steps["clf"].tree_.feature[0]
    if root_idx < 0:
        return "LEAF"
    return names[root_idx]

# Build quarter labels on TRAIN only
train_q = train.copy()
train_q["quarter"] = train_q["ts"].dt.to_period("Q").astype(str)

# Refit the pipeline once per quarter and record which feature ends up at the root.
root_by_q = []
for q, dfq in train_q.groupby("quarter"):
    # A quarter containing a single class cannot produce a meaningful split.
    if dfq[target].nunique() < 2:
        continue
    Xq, yq = dfq[features], dfq[target]
    root = root_feature_name(tree, Xq, yq)
    root_by_q.append({"quarter": q, "root_split": root})

# Explicit columns keep the frame well-formed when every quarter was skipped;
# without them the "root_split" lookup below raises KeyError on an empty frame.
stab_df = pd.DataFrame(root_by_q, columns=["quarter", "root_split"])
display(stab_df)

# Frequency of root features
freq = stab_df["root_split"].value_counts().reset_index()
freq.columns = ["feature", "count"]
display(freq)

# Quick bar plot
plt.figure(figsize=(6, 4))
plt.bar(freq["feature"], freq["count"])
plt.xticks(rotation=45, ha="right")
plt.ylabel("#quarters as root split")
plt.title("Stability of root split across quarters (train)")
plt.tight_layout()
plt.show()

#### Fairness check across predefined segments (e.g., device / region) at t_star
- Reports equal opportunity (TPR), FPR, precision, and selection rate per segment + disparity ranges; adjust segments to your policy.

In [None]:
# Fairness metrics across segments at threshold t_star
import numpy as np, pandas as pd

def _safe_rate(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


def seg_report(df_eval, seg_col, y_true, y_prob, t):
    """Compute per-group confusion-based rates for one segmentation column.

    Args:
        df_eval: Frame holding ``seg_col``; its index labels are used to
            select rows from ``y_true`` and ``y_prob``.
        seg_col: Name of the column to group by.
        y_true: Series of 0/1 labels sharing ``df_eval``'s index.
        y_prob: Series of predicted probabilities sharing ``df_eval``'s index.
        t: Decision threshold; rows with ``y_prob >= t`` are predicted positive.

    Returns:
        DataFrame with one row per group value and columns ``segment``,
        ``value``, ``n``, ``TPR``, ``FPR``, ``Precision``, ``SelectionRate``.
        A rate whose denominator is zero (e.g. TPR for a group without any
        positives) is NaN rather than 0, so it does not masquerade as a real
        zero in max-min disparity summaries.
    """
    rows = []
    for val, idx in df_eval.groupby(seg_col).groups.items():
        yt = y_true.loc[idx]
        pred_pos = y_prob.loc[idx] >= t
        pred_neg = ~pred_pos
        actual_pos = yt == 1
        actual_neg = yt == 0

        tp = int((pred_pos & actual_pos).sum())
        fp = int((pred_pos & actual_neg).sum())
        fn = int((pred_neg & actual_pos).sum())
        tn = int((pred_neg & actual_neg).sum())

        rows.append({
            "segment": seg_col,
            "value": val,
            "n": len(idx),
            "TPR": _safe_rate(tp, tp + fn),        # recall / equal opportunity
            "FPR": _safe_rate(fp, fp + tn),
            "Precision": _safe_rate(tp, tp + fp),
            "SelectionRate": pred_pos.mean(),      # selection rate / demographic parity
        })
    return pd.DataFrame(rows)

# Build aligned frames for the test split: after reset_index the frame, the
# labels and the probabilities all share one fresh positional index.
test_eval = test.reset_index(drop=False).rename(columns={"index": "_idx"})
y_true_s = pd.Series(y_test.values, index=test_eval.index, name="y_true")
y_prob_s = pd.Series(probs, index=test_eval.index, name="y_prob")

# Only report on candidate segment columns that exist in the data.
candidate_segments = ["device", "region", "ad_channel", "campaign"]
segments = [c for c in candidate_segments if c in test_eval.columns]

# One metrics table per segment column, stacked into a single frame.
fair_tables = [
    seg_report(test_eval, seg, y_true_s, y_prob_s, t_star) for seg in segments
]
fair_df = pd.concat(fair_tables, ignore_index=True)
display(fair_df.sort_values(["segment", "value"]))

# Disparity summary (max - min across groups per metric)
metric_cols = ["TPR", "FPR", "SelectionRate", "Precision"]
disp = (
    fair_df.groupby("segment")[metric_cols]
    .agg(lambda s: s.max() - s.min())
    .reset_index()
)
disp.columns = ["segment", "ΔTPR", "ΔFPR", "ΔSelection", "ΔPrecision"]
display(disp)

# Optional: visualize SelectionRate disparity per segment
for seg in segments:
    sub = fair_df[fair_df["segment"] == seg]
    plt.figure(figsize=(5, 3))
    plt.bar(sub["value"].astype(str), sub["SelectionRate"])
    plt.title(f"Selection rate by {seg} (t={t_star:.2f})")
    plt.ylim(0, 1)
    plt.xticks(rotation=30, ha="right")
    plt.ylabel("Selection rate")
    plt.tight_layout()
    plt.show()

### (Advanced) Rules for Interpreting Segment Diagnostics Tables

1. Recall (TPR) across groups
    - Rule: High TPR = the model is catching most positives; low TPR = missing many true converters.
    - How to use: Compare TPR across segments; a gap >5–10% signals disparity in model sensitivity.

2. False Positive Rate (FPR) differences
    - Rule: High FPR = model incorrectly flags too many negatives as positives.
    - How to use: If one group has consistently higher FPR, that segment gets “over-scored” → wasted spend.

3. Balance TPR vs FPR
    - Rule: High recall + high FPR means “catch everything but wasteful.”
    - Rule: Low recall + low FPR means “conservative but misses opportunities.”
    - How to use: Identify which strategy each segment is implicitly following.

4. Interpret Precision as ROI efficiency
    - Rule: Precision = of those we target, how many truly convert.
    - How to use: Higher precision → better ROI; lower precision → wasted actions.
    - Tip: Compare precision across segments to see where marketing dollars go furthest.

5. Use Selection Rate to check bias/exposure
    - Rule: Selection Rate = % of group predicted positive.
    - How to use: Very different selection rates between groups → model treats groups unequally, could reflect bias.

6. Consistency across metrics
    - Rule: Don’t just pick one metric; e.g., high TPR but low precision tells a different story than balanced performance.
    - How to use: Triangulate:
        - High TPR + High FPR = overpredictive
        - High Precision + Low TPR = underpredictive
        - Balanced = efficient

7. Disparity ranges (ΔTPR, ΔFPR, etc.)
    - Rule: Disparity = max – min across groups.
    - How to use: Large disparities mean fairness/consistency concerns. Small disparities → stable across groups.

8. Tie metrics back to business
    - Rule: Precision links to ROI efficiency, Recall links to market coverage, FPR links to wasted spend / risk.
    - How to use: Interpret differences not just as stats, but as “where we overspend, underserve, or misallocate.”

9. Volume matters (the n column)
    - Rule: Large groups dominate business impact even if gaps are small. Small groups may be noisy but less critical.
    - How to use: Always weigh metrics against group size before prioritizing fixes.

10. Systematic patterns
    - Rule: If multiple segments (e.g., certain channels or regions) show the same skew (e.g., lower precision), that’s a structural model bias.
    - How to use: Flag as “model design issue” vs. “random noise.”