# Arrest Prediction — v1 (HistGradientBoosting + Engineered Features)
**Goal:** Beat RF v0 (PR-AUC ≈ 0.623) by adding time features, rare bucketing, and frequency encodings, then training a HistGradientBoosting baseline.

**Dataset:** data/processed/arrest_features.csv  
**Target:** arrest (0/1)  
**Artifacts:** saved to notebooks/artifacts/

In [None]:
# Core imports 
import os, time, json, numpy as np, pandas as pd 
from pathlib import Path 

# Modeling + metrics 
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    average_precision_score, roc_auc_score, classification_report, 
    confusion_matrix, precision_recall_curve, roc_curve
)
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingClassifier

import matplotlib.pyplot as plt
from scipy.stats import loguniform, randint
import tempfile

# Paths: locate the repository root by walking up from the working directory,
# so the notebook works whether it is launched from the repo root or notebooks/.
REPO_NAME = "chicago-crime-pipeline"
REPO = next(
    (p for p in (Path.cwd(), *Path.cwd().parents) if p.name == REPO_NAME),
    None,
)
if REPO is None:
    # Fail fast: otherwise the search would silently end at the filesystem
    # root and the mkdir/read_csv below would target the wrong locations.
    raise FileNotFoundError(
        f"Could not locate a '{REPO_NAME}' directory at or above {Path.cwd()}; "
        "run this notebook from inside the repository."
    )
DATA = REPO / "data" / "processed"
ART = REPO / "notebooks" / "artifacts"
ART.mkdir(parents=True, exist_ok=True)

# Load the processed feature table and sanity-check that the label is present.
TARGET = "arrest"
df = pd.read_csv(DATA / "arrest_features.csv")
assert TARGET in df.columns, f"expected a '{TARGET}' column in arrest_features.csv"
print(df.shape, df[TARGET].value_counts(dropna=False).to_dict())

# Split (same seed/stratify as v0) so v0 and v1 are scored on the same rows.
y = df[TARGET].astype(int).values
X = df.drop(columns=[TARGET]).copy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train.shape, X_test.shape




(10482, 10) {0: 8972, 1: 1510}


((8385, 9), (2097, 9))

In [6]:
# Work on copies so the original train/test splits stay untouched.
X_train_fe, X_test_fe = X_train.copy(), X_test.copy()

# Coarse time-of-day buckets; with right=False each bin is half-open,
# i.e. [0,6), [6,12), [12,18), [18,24).
bins   = [0, 6, 12, 18, 24]
labels = ["00-05", "06-11", "12-17", "18-23"]

# Derive both time features for each split in a single pass.
for Xdf in (X_train_fe, X_test_fe):
    # Weekday name parsed from 'date' (assumes ISO-ish strings).
    parsed_dates = pd.to_datetime(Xdf["date"])
    Xdf["weekday"] = parsed_dates.dt.day_name()

    # Hour-of-day mapped onto the labelled bins above.
    hours = Xdf["hour"].astype(int)
    Xdf["hour_bin"] = pd.cut(hours, bins=bins, right=False, labels=labels)

# Rare bucket helper
def rare_bucket(train_col, test_col, min_count=40):
    """Replace infrequent levels with the literal "__RARE__".

    The set of levels to keep is learned from ``train_col`` only: a level is
    kept when it occurs at least ``min_count`` times there. Every other value
    in either column (including values never seen in training) is replaced.

    Parameters
    ----------
    train_col, test_col : pandas.Series
        Categorical columns from the training and test splits.
    min_count : int, default 40
        Minimum number of training occurrences for a level to be kept.

    Returns
    -------
    tuple of pandas.Series
        ``(bucketed_train, bucketed_test)``; the inputs are not modified.
    """
    train_counts = train_col.value_counts()
    frequent = {level for level, n in train_counts.items() if n >= min_count}

    def _bucket(column):
        # Keep values whose level is frequent in training; replace the rest.
        return column.where(column.isin(frequent), other="__RARE__")

    return _bucket(train_col), _bucket(test_col)

# Collapse infrequent levels of the high-cardinality categoricals; the levels
# to keep are decided from the training split and reused for the test split.
for col in ("location_description", "primary_type"):
    bucketed_train, bucketed_test = rare_bucket(
        X_train_fe[col], X_test_fe[col], min_count=40
    )
    X_train_fe[col] = bucketed_train
    X_test_fe[col] = bucketed_test

# Frequency encodings (priors) for chosen categoricals
def add_freq_encode(col, train_df=None, test_df=None):
    """Add a ``{col}_freq`` column holding each level's training frequency.

    Relative frequencies are computed on the training frame only and then
    mapped onto both frames. Levels absent from training (and missing
    values) receive 0.0.

    Parameters
    ----------
    col : str
        Name of the categorical column to encode.
    train_df, test_df : pandas.DataFrame, optional
        Frames to modify in place. They default to the notebook-level
        ``X_train_fe`` / ``X_test_fe`` so existing ``add_freq_encode(col)``
        calls behave exactly as before.

    Returns
    -------
    None
        The new column is written into both frames.
    """
    if train_df is None:
        train_df = X_train_fe
    if test_df is None:
        test_df = X_test_fe

    # ensure value_counts sees plain labels (avoids CategoryIndex quirks)
    freq = train_df[col].astype(object).value_counts(normalize=True)

    tr = train_df[col].map(freq).astype("float64")
    te = test_df[col].map(freq).astype("float64")

    # to_numpy() assigns positionally, sidestepping any index alignment.
    train_df[f"{col}_freq"] = tr.fillna(0.0).to_numpy()
    test_df[f"{col}_freq"] = te.fillna(0.0).to_numpy()
# Columns that receive a train-derived "<col>_freq" frequency encoding.
freq_encoded_cols = ["primary_type", "location_description", "weekday", "hour_bin"]
for col in freq_encoded_cols:
    add_freq_encode(col)

# Final column lists for preprocessing
# NOTE(review): 'date' is one-hot encoded as a raw string and 'id' is fed in as
# a numeric feature; both look like high-cardinality / identifier columns —
# confirm they are intentional before relying on the reported scores.
cat_cols_fe = [
    "date",
    "primary_type",
    "location_description",
    "location_grouped",
    "weekday",
    "hour_bin",
]
num_cols_fe = ["id", "year", "month", "dow", "hour"] + [
    f"{name}_freq" for name in freq_encoded_cols
]

print("Categorical:", cat_cols_fe)
print("Numeric:", num_cols_fe)

Categorical: ['date', 'primary_type', 'location_description', 'location_grouped', 'weekday', 'hour_bin']
Numeric: ['id', 'year', 'month', 'dow', 'hour', 'primary_type_freq', 'location_description_freq', 'weekday_freq', 'hour_bin_freq']


In [9]:
# Preprocessor for engineered features:
#   - categoricals -> dense one-hot; levels unseen at fit time encode as all zeros
#   - numerics     -> passed through unchanged
# Columns in neither list are dropped.
cat_encoder_fe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
pre_fe = ColumnTransformer(
    transformers=[
        ("cat", cat_encoder_fe, cat_cols_fe),
        ("num", "passthrough", num_cols_fe),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

In [12]:
from sklearn.model_selection import train_test_split

# Subsample ~5k rows for faster search
SUB_N = 5000
# assumes sw_train (defined in an earlier cell) is positionally aligned with
# y_train — TODO confirm against the cell that builds it
sw_train_arr = np.asarray(sw_train)
if len(y_train) > SUB_N:
    # Split the sample weights in the same call as X/y so that every weight
    # stays attached to its own row. Slicing the first SUB_N weights instead
    # would pair the shuffled, stratified subsample with unrelated weights.
    X_sub, _, y_sub, _, sw_sub, _ = train_test_split(
        X_train_fe, y_train, sw_train_arr, train_size=SUB_N,
        stratify=y_train, random_state=42
    )
else:
    X_sub, y_sub, sw_sub = X_train_fe, y_train, sw_train_arr

# Small randomized search scored on PR-AUC (average precision).
hgb_search = RandomizedSearchCV(
    hgb_pipe, param_distributions=param_dist,
    n_iter=6,              # 6 candidates
    scoring="average_precision",
    refit=True, cv=2,      # 2 folds
    n_jobs=-1, random_state=42, verbose=2
)

# The "clf__" prefix routes the weights to the pipeline's classifier step.
hgb_search.fit(X_sub, y_sub, clf__sample_weight=sw_sub)
print("Best HGB params:", hgb_search.best_params_)
print("Best CV PR-AUC:", round(hgb_search.best_score_, 4))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] END clf__l2_regularization=0.00010062545641808922, clf__learning_rate=0.1978522015446167, clf__max_depth=3, clf__max_iter=91, clf__max_leaf_nodes=41, clf__min_samples_leaf=81; total time=   5.2s
[CV] END clf__l2_regularization=0.00010062545641808922, clf__learning_rate=0.1978522015446167, clf__max_depth=3, clf__max_iter=91, clf__max_leaf_nodes=41, clf__min_samples_leaf=81; total time=   5.5s
[CV] END clf__l2_regularization=0.0003487351559952693, clf__learning_rate=0.06207090305742937, clf__max_depth=5, clf__max_iter=90, clf__max_leaf_nodes=39, clf__min_samples_leaf=176; total time=   6.9s
[CV] END clf__l2_regularization=0.0003487351559952693, clf__learning_rate=0.06207090305742937, clf__max_depth=5, clf__max_iter=90, clf__max_leaf_nodes=39, clf__min_samples_leaf=176; total time=   7.1s
[CV] END clf__l2_regularization=0.0020059560245279666, clf__learning_rate=0.18679147494991152, clf__max_depth=5, clf__max_iter=87, clf__ma

In [13]:
# Take the best pipeline found by the search (fitted on the subsample only)
# and refit it on the full engineered training split with the full weights.
hgb_final = hgb_search.best_estimator_
hgb_final.fit(
    X_train_fe,
    y_train,
    clf__sample_weight=sw_train,
)

0,1,2
,steps,"[('pre', ...), ('clf', ...)]"
,transform_input,
,memory,'/var/folders/6z/l9wv3c...q8m0000gn/T/tmptrqagad5'
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,loss,'log_loss'
,learning_rate,np.float64(0....9147494991152)
,max_iter,87
,max_leaf_nodes,44
,max_depth,5
,min_samples_leaf,80
,l2_regularization,np.float64(0....9560245279666)
,max_features,1.0
,max_bins,255
,categorical_features,'from_dtype'


In [14]:
# Positive-class probabilities on the held-out test split.
proba_hgb = hgb_final.predict_proba(X_test_fe)[:, 1]

test_pr_auc = average_precision_score(y_test, proba_hgb)
test_roc_auc = roc_auc_score(y_test, proba_hgb)
print("HGB TEST PR-AUC:", round(test_pr_auc, 4))
print("HGB TEST ROC-AUC:", round(test_roc_auc, 4))

# Threshold tuning: choose the cut-off that maximises F1 along the PR curve.
# NOTE(review): the threshold is selected on the same test split it is then
# scored on, so the tuned F1 / confusion matrix below are likely optimistic —
# consider tuning on a validation split instead.
prec, rec, thr = precision_recall_curve(y_test, proba_hgb)
f1s = 2*prec*rec/(prec+rec+1e-12)   # epsilon avoids 0/0 when prec == rec == 0
best_idx = np.nanargmax(f1s)
# prec/rec carry one more entry than thr; fall back to 0.5 if that final
# point (which has no associated threshold) is the one selected.
if best_idx < len(thr):
    thr_hgb = thr[best_idx]
else:
    thr_hgb = 0.5
pred_hgb = (proba_hgb >= thr_hgb).astype(int)

print("Best threshold:", float(thr_hgb), "Best F1:", float(f1s[best_idx]))
print(classification_report(y_test, pred_hgb, digits=3))
print("Confusion:\n", confusion_matrix(y_test, pred_hgb))

HGB TEST PR-AUC: 0.6569
HGB TEST ROC-AUC: 0.8878
Best threshold: 0.6953415078096913 Best F1: 0.6265060240958864
              precision    recall  f1-score   support

           0      0.934     0.946     0.940      1795
           1      0.652     0.603     0.627       302

    accuracy                          0.897      2097
   macro avg      0.793     0.774     0.783      2097
weighted avg      0.893     0.897     0.895      2097

Confusion:
 [[1698   97]
 [ 120  182]]


In [15]:
# Timestamp used to version this run's artifacts (YYYYMMDD-HHMMSS).
stamp = time.strftime("%Y%m%d-%H%M%S")

metrics = {
    "timestamp": stamp,
    "model": "HGB + FE v1",
    "test_pr_auc": float(average_precision_score(y_test, proba_hgb)),
    "test_roc_auc": float(roc_auc_score(y_test, proba_hgb)),
    "threshold_tuned": float(thr_hgb),
    "confusion_tuned": confusion_matrix(y_test, pred_hgb).tolist(),
    "class_report_tuned": classification_report(y_test, pred_hgb, output_dict=True),
    # .item() turns NumPy scalars into the matching Python type, so integer
    # hyper-parameters stay integers in the JSON instead of becoming floats.
    "best_params": {
        k: (v.item() if hasattr(v, "item") else v)
        for k, v in hgb_search.best_params_.items()
    },
}

# Full metrics, one file per run.
with open(ART / f"metrics_hgb_v1_{stamp}.json", "w") as f:
    json.dump(metrics, f, indent=2)

# Latest tuned decision threshold (fixed file name, overwritten on every run).
with open(ART / "decision_threshold_hgb_v1.txt", "w") as f:
    f.write(str(metrics["threshold_tuned"]))

# PR/ROC plots
prec, rec, _ = precision_recall_curve(y_test, proba_hgb)
fpr, tpr, _ = roc_curve(y_test, proba_hgb)

fig, ax = plt.subplots()
ax.plot(rec, prec)
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title(f"HGB PR curve (AP={metrics['test_pr_auc']:.3f})")
ax.grid(True, alpha=0.3)
fig.savefig(ART / f"pr_curve_hgb_v1_{stamp}.png", bbox_inches="tight")
plt.close(fig)  # saved to disk only; do not render inline

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], '--')  # chance-level diagonal for reference
ax.set_xlabel("FPR")
ax.set_ylabel("TPR")
ax.set_title(f"HGB ROC curve (AUC={metrics['test_roc_auc']:.3f})")
ax.grid(True, alpha=0.3)
fig.savefig(ART / f"roc_curve_hgb_v1_{stamp}.png", bbox_inches="tight")
plt.close(fig)

print("Saved HGB v1 artifacts:", ART)

Saved HGB v1 artifacts: /Volumes/easystore/Projects/chicago-crime-pipeline/notebooks/artifacts
