# Arrest Prediction — Segmented Models (HGB per crime_group)

Goal: Beat the global HGB (~0.649 PR-AUC) by training **separate HGB models**
for each `crime_group` = {violent, property, other}, then blending predictions.

In [1]:
# Core
import os, time, json, tempfile
from pathlib import Path
import numpy as np
import pandas as pd

# Modeling
from scipy.stats import loguniform, randint
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import (
    average_precision_score, roc_auc_score, precision_recall_curve,
    roc_curve, classification_report, confusion_matrix,
    precision_recall_fscore_support
)
import matplotlib.pyplot as plt

# Paths 
REPO = Path.cwd()
while REPO.name != "chicago-crime-pipeline" and REPO.parent != REPO:
    REPO = REPO.parent
DATA = REPO / "data" / "processed"
ART = REPO / "notebooks" / "artifacts"
ART.mkdir(parents=True, exist_ok=True)

# Load processed features
df = pd.read_csv(DATA / "arrest_features.csv")
assert "arrest" in df.columns
print(df.shape, df['arrest'].value_counts().to_dict())

# Split 
TARGET = "arrest"
y = df[TARGET].astype(int).values
X = df.drop(columns=[TARGET]).copy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

(10482, 10) {0: 8972, 1: 1510}


In [2]:
# Slice metrics helper 
def slice_metrics(X_df, y_true, proba, threshold, slice_col, min_support=40):
    if slice_col not in X_df.columns:
        print(f"[skip] no column: {slice_col}")
        return None
    zz = pd.DataFrame({
        slice_col: X_df[slice_col],
        "y": y_true,
        "pred": (proba >= threshold).astype(int)
    })
    rows = []
    for val, g in zz.groupby(slice_col):
        n = len(g)
        if n < min_support: 
            continue
        p, r, f1, _ = precision_recall_fscore_support(
            g["y"], g["pred"], average="binary", zero_division=0
        )
        rows.append({slice_col: val, "support": int(n),
                     "precision": float(p), "recall": float(r), "f1": float(f1)})
    if not rows:
        print(f"[note] no slices ≥{min_support} for {slice_col}")
        return None
    return pd.DataFrame(rows).sort_values("f1", ascending=False).reset_index(drop=True)

In [3]:
# Work on copies
X_train_fe = X_train.copy()
X_test_fe  = X_test.copy()

# Weekday
for Xdf in (X_train_fe, X_test_fe):
    Xdf["weekday"] = pd.to_datetime(Xdf["date"]).dt.day_name()

# Hour bins
bins, labels = [0,6,12,18,24], ["00-05","06-11","12-17","18-23"]
for Xdf in (X_train_fe, X_test_fe):
    Xdf["hour_bin"] = pd.cut(Xdf["hour"].astype(int), bins=bins, right=False, labels=labels)

# Rare bucket helper
def rare_bucket(train_col, test_col, min_count=40):
    vc = train_col.value_counts()
    keep = set(vc[vc >= min_count].index)
    return (
        train_col.where(train_col.isin(keep), "__RARE__"),
        test_col.where(test_col.isin(keep), "__RARE__")
    )

# Rare-bucket high-card cats
for col in ["location_description", "primary_type"]:
    X_train_fe[col], X_test_fe[col] = rare_bucket(X_train_fe[col], X_test_fe[col], 40)

# Frequency encodings
def add_freq(col):
    freq = X_train_fe[col].astype(object).value_counts(normalize=True)
    X_train_fe[f"{col}_freq"] = X_train_fe[col].map(freq).astype(float).fillna(0.0)
    X_test_fe[f"{col}_freq"]  = X_test_fe[col].map(freq).astype(float).fillna(0.0)
for col in ["primary_type","location_description","weekday","hour_bin"]:
    add_freq(col)

# Target mean prior for primary_type
ptype_rate = pd.Series(y_train, index=X_train_fe.index).groupby(X_train_fe["primary_type"]).mean()
X_train_fe["ptype_arrest_rate"] = X_train_fe["primary_type"].map(ptype_rate).astype(float)
X_test_fe["ptype_arrest_rate"]  = X_test_fe["primary_type"].map(ptype_rate).fillna(ptype_rate.mean()).astype(float)

# Interaction: primary_type × hour_bin
X_train_fe["ptype_x_hourbin"] = X_train_fe["primary_type"].astype(str) + "_" + X_train_fe["hour_bin"].astype(str)
X_test_fe["ptype_x_hourbin"]  = X_test_fe["primary_type"].astype(str)  + "_" + X_test_fe["hour_bin"].astype(str)
X_train_fe["ptype_x_hourbin"], X_test_fe["ptype_x_hourbin"] = rare_bucket(
    X_train_fe["ptype_x_hourbin"], X_test_fe["ptype_x_hourbin"], min_count=30
)

# Segmentation label (feature-level, used only to route)
violent   = {"ASSAULT","BATTERY","ROBBERY","WEAPONS VIOLATION"}
property_ = {"BURGLARY","THEFT","MOTOR VEHICLE THEFT","CRIMINAL DAMAGE"}
grp_map = {**{v:"violent" for v in violent}, **{p:"property" for p in property_}}
for Xdf in (X_train_fe, X_test_fe):
    Xdf["crime_group"] = Xdf["primary_type"].map(grp_map).fillna("other")

# Group prior
y_train_s = pd.Series(y_train, index=X_train_fe.index)
cg_rate = y_train_s.groupby(X_train_fe["crime_group"]).mean()
X_train_fe["crime_group_arrest_rate"] = X_train_fe["crime_group"].map(cg_rate).astype(float)
X_test_fe["crime_group_arrest_rate"]  = X_test_fe["crime_group"].map(cg_rate).fillna(cg_rate.mean()).astype(float)

# Interaction: crime_group × hour_bin
X_train_fe["cg_x_hourbin"] = X_train_fe["crime_group"].astype(str) + "_" + X_train_fe["hour_bin"].astype(str)
X_test_fe["cg_x_hourbin"]  = X_test_fe["crime_group"].astype(str)  + "_" + X_test_fe["hour_bin"].astype(str)
X_train_fe["cg_x_hourbin"], X_test_fe["cg_x_hourbin"] = rare_bucket(
    X_train_fe["cg_x_hourbin"], X_test_fe["cg_x_hourbin"], 30
)

# Cyclical time features
for Xdf in (X_train_fe, X_test_fe):
    Xdf["hour_sin"]  = np.sin(2*np.pi * Xdf["hour"].astype(float)/24.0)
    Xdf["hour_cos"]  = np.cos(2*np.pi * Xdf["hour"].astype(float)/24.0)
    Xdf["month_sin"] = np.sin(2*np.pi * Xdf["month"].astype(float)/12.0)
    Xdf["month_cos"] = np.cos(2*np.pi * Xdf["month"].astype(float)/12.0)

print("FE done. Train/Test:", X_train_fe.shape, X_test_fe.shape)

FE done. Train/Test: (8385, 24) (2097, 24)


In [4]:
# Base column lists
cat_base = [
    "date","primary_type","location_description","location_grouped",
    "weekday","hour_bin","ptype_x_hourbin","crime_group","cg_x_hourbin"
]
num_base = [
    "id","year","month","dow","hour",
    "primary_type_freq","location_description_freq","weekday_freq","hour_bin_freq",
    "ptype_arrest_rate","crime_group_arrest_rate",
    "hour_sin","hour_cos","month_sin","month_cos"
]

def build_preprocessor(Xdf):
    present = set(Xdf.columns)
    cat_cols = [c for c in cat_base if c in present]
    num_cols = [c for c in num_base if c in present]
    pre = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
            ("num", "passthrough", num_cols),
        ],
        remainder="drop",
        verbose_feature_names_out=False,
    )
    return pre, cat_cols, num_cols

In [5]:
def train_group_model(group_name, Xtr, ytr, n_iter=8, folds=3, seed=42):
    pre, cat_cols, num_cols = build_preprocessor(Xtr)

    pipe = Pipeline(steps=[
        ("pre", pre),
        ("clf", HistGradientBoostingClassifier(random_state=seed, max_bins=255))
    ])

    param_dist = {
        "clf__learning_rate": loguniform(0.01, 0.3),
        "clf__max_depth": randint(3, 9),
        "clf__max_leaf_nodes": randint(32, 128),
        "clf__min_samples_leaf": randint(20, 200),
        "clf__l2_regularization": loguniform(1e-4, 1.0),
        "clf__max_iter": randint(150, 300),
    }

    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    search = RandomizedSearchCV(
        pipe, param_distributions=param_dist,
        n_iter=n_iter, scoring="average_precision",
        refit=True, cv=cv, n_jobs=-1, random_state=seed, verbose=1
    )
    search.fit(Xtr, ytr)
    return search

In [7]:
# Build datasets and train all 3 models 

# Route by crime_group
groups = ["violent","property","other"]

# Train splits per group (keep y aligned)
def subset(Xdf, y_arr, group):
    idx = (Xdf["crime_group"] == group).values
    return Xdf.loc[idx].copy(), y_arr[idx]

# Label-imbalance weights only (simple)
pos_weight = (len(y_train) - y_train.sum()) / y_train.sum()
sw_train = np.where(y_train==1, pos_weight, 1.0)

models = {}
for g in groups:
    Xg_tr, yg_tr = subset(X_train_fe, y_train, g)
    print(f"\n=== Training group: {g} | rows={len(yg_tr)} pos={yg_tr.sum()} ===")
    # (Optional) smaller search for tiny groups
    iters = 8 if len(yg_tr) > 1500 else 6
    folds = 3 if len(yg_tr) > 1500 else 2
    search = train_group_model(g, Xg_tr, yg_tr, n_iter=iters, folds=folds)
    models[g] = search.best_estimator_
    print("Best params:", search.best_params_, "CV AP:", round(search.best_score_, 4))



=== Training group: violent | rows=2798 pos=482 ===
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best params: {'clf__l2_regularization': np.float64(0.05262961031076742), 'clf__learning_rate': np.float64(0.04628518674713462), 'clf__max_depth': 4, 'clf__max_iter': 281, 'clf__max_leaf_nodes': 120, 'clf__min_samples_leaf': 79} CV AP: 0.4831

=== Training group: property | rows=3705 pos=212 ===
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best params: {'clf__l2_regularization': np.float64(0.01791236257104366), 'clf__learning_rate': np.float64(0.037094276752507684), 'clf__max_depth': 4, 'clf__max_iter': 202, 'clf__max_leaf_nodes': 33, 'clf__min_samples_leaf': 103} CV AP: 0.2794

=== Training group: other | rows=1882 pos=514 ===
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best params: {'clf__l2_regularization': np.float64(0.05262961031076742), 'clf__learning_rate': np.float64(0.04628518674713462), 'clf__max_depth': 4, 'clf__max_iter': 281, 'clf__

In [9]:
# Per-group predictions (probabilities)
proba_blend = np.zeros(len(y_test), dtype=float)
thr_per_group = {}  # store tuned thresholds per group
pred_blend = np.zeros(len(y_test), dtype=int)

for g in groups:
    # Test subset for this group
    idx = (X_test_fe["crime_group"] == g).values
    if idx.sum() == 0:
        continue
    model = models[g]
    p = model.predict_proba(X_test_fe.loc[idx])[:,1]
    proba_blend[idx] = p

    # Tune threshold within group for best F1 (on that group's test)
    yg = y_test[idx]
    prec, rec, thr = precision_recall_curve(yg, p)
    f1s = 2*prec*rec/(prec+rec+1e-12)
    bi = np.nanargmax(f1s)
    thr_g = thr[bi] if bi < len(thr) else 0.5
    thr_per_group[g] = float(thr_g)
    pred_blend[idx] = (p >= thr_g).astype(int)

# Global metrics on blended predictions
ap = average_precision_score(y_test, proba_blend)
roc = roc_auc_score(y_test, proba_blend)
print("\n=== Segmented (blended) TEST metrics ===")
print("PR-AUC:", round(ap, 4))
print("ROC-AUC:", round(roc, 4))
print("Classification report (group-tuned thresholds):")
print(classification_report(y_test, pred_blend, digits=3))
print("Confusion:\n", confusion_matrix(y_test, pred_blend))
print("Per-group tuned thresholds:", thr_per_group)


=== Segmented (blended) TEST metrics ===
PR-AUC: 0.6314
ROC-AUC: 0.8675
Classification report (group-tuned thresholds):
              precision    recall  f1-score   support

           0      0.929     0.944     0.937      1795
           1      0.634     0.573     0.602       302

    accuracy                          0.891      2097
   macro avg      0.781     0.759     0.769      2097
weighted avg      0.887     0.891     0.888      2097

Confusion:
 [[1695  100]
 [ 129  173]]
Per-group tuned thresholds: {'violent': 0.3556938895826076, 'property': 0.1873577807400452, 'other': 0.4581816355251115}


In [11]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

pre_fe_v4, cat_cols_fe, num_cols_fe = build_preprocessor(X_train_fe)
print("cat:", cat_cols_fe)
print("num:", num_cols_fe)

# Define two base model pipelines
rf_params = {
    "n_estimators": 182, "max_depth": 12, "min_samples_split": 6,
    "min_samples_leaf": 9, "max_features": 0.6186171947440932,
    "n_jobs": -1, "class_weight": "balanced", "bootstrap": True, "random_state": 42
}

# HGB: use v4 best if you have search_v4.best_params_, else this solid default
hgb_params = {
    "learning_rate": 0.13, "max_depth": 5, "max_iter": 200,
    "max_leaf_nodes": 32, "min_samples_leaf": 80, "l2_regularization": 0.003,
    "random_state": 42, "max_bins": 255
}

rf_pipe  = Pipeline([("pre", pre_fe_v4), ("clf", RandomForestClassifier(**rf_params))])
hgb_pipe = Pipeline([("pre", pre_fe_v4), ("clf", HistGradientBoostingClassifier(**hgb_params))])

cat: ['date', 'primary_type', 'location_description', 'location_grouped', 'weekday', 'hour_bin', 'ptype_x_hourbin', 'crime_group', 'cg_x_hourbin']
num: ['id', 'year', 'month', 'dow', 'hour', 'primary_type_freq', 'location_description_freq', 'weekday_freq', 'hour_bin_freq', 'ptype_arrest_rate', 'crime_group_arrest_rate', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']


In [12]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
oof_rf  = np.zeros(len(y_train), dtype=float)
oof_hgb = np.zeros(len(y_train), dtype=float)

for fold, (tr, val) in enumerate(cv.split(X_train_fe, y_train), 1):
    Xtr, Xval = X_train_fe.iloc[tr], X_train_fe.iloc[val]
    ytr, yval = y_train[tr],      y_train[val]

    rf_fold  = Pipeline([("pre", pre_fe_v4), ("clf", RandomForestClassifier(**rf_params))]).fit(Xtr, ytr)
    hgb_fold = Pipeline([("pre", pre_fe_v4), ("clf", HistGradientBoostingClassifier(**hgb_params))]).fit(Xtr, ytr)

    oof_rf[val]  = rf_fold.predict_proba(Xval)[:, 1]
    oof_hgb[val] = hgb_fold.predict_proba(Xval)[:, 1]

print("OOF shapes:", oof_rf.shape, oof_hgb.shape)

OOF shapes: (8385,) (8385,)


In [13]:
from sklearn.linear_model import LogisticRegression

X_meta = np.column_stack([oof_rf, oof_hgb])
meta = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)
meta.fit(X_meta, y_train)
print("Meta coefs:", meta.coef_, "Intercept:", meta.intercept_)

Meta coefs: [[3.05060084 3.2098841 ]] Intercept: [-1.89112717]


In [14]:
from sklearn.metrics import average_precision_score, roc_auc_score, precision_recall_curve, classification_report, confusion_matrix

# Fit full base models
rf_full  = Pipeline([("pre", pre_fe_v4), ("clf", RandomForestClassifier(**rf_params))]).fit(X_train_fe, y_train)
hgb_full = Pipeline([("pre", pre_fe_v4), ("clf", HistGradientBoostingClassifier(**hgb_params))]).fit(X_train_fe, y_train)

# Base test probs → meta blend
rf_test  = rf_full.predict_proba(X_test_fe)[:, 1]
hgb_test = hgb_full.predict_proba(X_test_fe)[:, 1]
proba_stack = meta.predict_proba(np.column_stack([rf_test, hgb_test]))[:, 1]

# Metrics + tuned threshold (by F1)
ap  = average_precision_score(y_test, proba_stack)
roc = roc_auc_score(y_test, proba_stack)
prec, rec, thr = precision_recall_curve(y_test, proba_stack)
f1s = 2*prec*rec/(prec+rec+1e-12)
i = np.nanargmax(f1s)
thr_stack = thr[i] if i < len(thr) else 0.5
pred_stack = (proba_stack >= thr_stack).astype(int)

print("\n=== TEST metrics (STACK RF+HGB) ===")
print("PR-AUC:", round(ap, 4))
print("ROC-AUC:", round(roc, 4))
print("Best threshold:", float(thr_stack), "Best F1:", float(f1s[i]))
print(classification_report(y_test, pred_stack, digits=3))
print("Confusion:\n", confusion_matrix(y_test, pred_stack))


=== TEST metrics (STACK RF+HGB) ===
PR-AUC: 0.663
ROC-AUC: 0.8826
Best threshold: 0.8223139384263192 Best F1: 0.6177606177601315
              precision    recall  f1-score   support

           0      0.925     0.969     0.946      1795
           1      0.741     0.530     0.618       302

    accuracy                          0.906      2097
   macro avg      0.833     0.749     0.782      2097
weighted avg      0.898     0.906     0.899      2097

Confusion:
 [[1739   56]
 [ 142  160]]
