In [1]:
import os, time, json, tempfile
from pathlib import Path 
import numpy as np, pandas as pd, matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import (
    average_precision_score, roc_auc_score, precision_recall_curve, roc_curve,
    classification_report, confusion_matrix, precision_recall_fscore_support
)
from scipy.stats import loguniform, randint


# Leep CPU threads sane during CV
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"

# Paths 
REPO = Path.cwd()
while REPO.name != "chicago-crime-pipeline" and REPO.parent != REPO:
    REPO = REPO.parent
DATA = REPO / "data" / "processed"
ART  = REPO / "notebooks" / "artifacts"
ART.mkdir(parents=True, exist_ok=True)

# Load & split
df = pd.read_csv(DATA / "arrest_features.csv")
assert "arrest" in df.columns

TARGET = "arrest"
y = df[TARGET].astype(int).values
X = df.drop(columns=[TARGET]).copy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)
print("Shapes:", X_train.shape, X_test.shape, "| target counts:", df[TARGET].value_counts().to_dict())



Shapes: (8385, 9) (2097, 9) | target counts: {0: 8972, 1: 1510}


In [2]:
# Helper slice metrics to compare slices cleanly 
def slice_metrics(X_df, y_true, proba, threshold, slice_col, min_support=40):
    """Compute precision/recall/F1 per value of a categorical slice column."""
    if slice_col not in X_df.columns:
        print(f"[skip] slice column not found: {slice_col}")
        return None
    df = pd.DataFrame({
        slice_col: X_df[slice_col],
        "y": y_true,
        "pred": (proba >= threshold).astype(int)
    })
    rows = []
    for val, g in df.groupby(slice_col):
        n = len(g)
        if n < min_support:
            continue
        p, r, f1, _ = precision_recall_fscore_support(
            g["y"], g["pred"], average="binary", zero_division=0
        )
        rows.append({slice_col: val, "support": int(n),
                     "precision": float(p), "recall": float(r), "f1": float(f1)})
    if not rows:
        print(f"[note] no slices with support ≥ {min_support} for {slice_col}")
        return None
    return pd.DataFrame(rows).sort_values("f1", ascending=False).reset_index(drop=True)

In [3]:
# Base feature engineering

# fresh copies
X_train_fe = X_train.copy()
X_test_fe  = X_test.copy()

# weekday from date
for Xdf in (X_train_fe, X_test_fe):
    Xdf["weekday"] = pd.to_datetime(Xdf["date"]).dt.day_name()

# hour bins (object dtype so OneHotEncoder treats them as categories)
bins   = [0,6,12,18,24]
labels = ["00-05","06-11","12-17","18-23"]
for Xdf in (X_train_fe, X_test_fe):
    Xdf["hour_bin"] = pd.cut(Xdf["hour"].astype(int), bins=bins, right=False, labels=labels).astype(object)

# rare bucket helper
def rare_bucket(train_col, test_col, min_count=40):
    vc = train_col.value_counts()
    keep = set(vc[vc >= min_count].index)
    return (train_col.where(train_col.isin(keep), "__RARE__"),
            test_col.where(test_col.isin(keep), "__RARE__"))

# rare-bucket two high-card categoricals (if present)
for col in ["location_description", "primary_type"]:
    if col in X_train_fe.columns:
        X_train_fe[col], X_test_fe[col] = rare_bucket(X_train_fe[col], X_test_fe[col], min_count=40)

# frequency encodings (priors)
def add_freq_encode(col):
    freq = X_train_fe[col].astype(object).value_counts(normalize=True)
    X_train_fe[f"{col}_freq"] = X_train_fe[col].map(freq).astype("float64").fillna(0.0).to_numpy()
    X_test_fe[f"{col}_freq"]  = X_test_fe[col].map(freq).astype("float64").fillna(0.0).to_numpy()

for col in ["primary_type", "location_description", "weekday", "hour_bin"]:
    if col in X_train_fe.columns:
        add_freq_encode(col)

# target mean for primary_type (arrest propensity)
if "primary_type" in X_train_fe.columns:
    arrest_rate = pd.Series(y_train).groupby(X_train_fe["primary_type"]).mean()
    X_train_fe["ptype_arrest_rate"] = X_train_fe["primary_type"].map(arrest_rate)
    X_test_fe["ptype_arrest_rate"]  = X_test_fe["primary_type"].map(arrest_rate).fillna(float(arrest_rate.mean()))
else:
    X_train_fe["ptype_arrest_rate"] = 0.0
    X_test_fe["ptype_arrest_rate"]  = 0.0

# interaction: primary_type × hour_bin (then rare-bucket)
if set(["primary_type","hour_bin"]).issubset(X_train_fe.columns):
    X_train_fe["ptype_x_hourbin"] = X_train_fe["primary_type"].astype(str) + "_" + X_train_fe["hour_bin"].astype(str)
    X_test_fe["ptype_x_hourbin"]  = X_test_fe["primary_type"].astype(str)  + "_" + X_test_fe["hour_bin"].astype(str)
    X_train_fe["ptype_x_hourbin"], X_test_fe["ptype_x_hourbin"] = rare_bucket(
        X_train_fe["ptype_x_hourbin"], X_test_fe["ptype_x_hourbin"], min_count=30
    )
else:
    X_train_fe["ptype_x_hourbin"] = "__MISSING__"
    X_test_fe["ptype_x_hourbin"]  = "__MISSING__"

In [6]:
# New segmentation features 

# Map primary_type -> crime_group
violent   = {"ASSAULT", "BATTERY", "ROBBERY", "WEAPONS VIOLATION"}
property_ = {"BURGLARY", "THEFT", "MOTOR VEHICLE THEFT", "CRIMINAL DAMAGE"}
grp_map   = {**{v: "violent" for v in violent}, **{p: "property" for p in property_}}

for Xdf in (X_train_fe, X_test_fe):
    Xdf["crime_group"] = Xdf["primary_type"].map(grp_map).fillna("other")

# Group-level target mean prior (arrest propensity per crime_group)
# Align target index to X_train_fe to avoid any misalignment
y_train_s = pd.Series(y_train, index=X_train_fe.index)
cg_rate   = y_train_s.groupby(X_train_fe["crime_group"]).mean()

X_train_fe["crime_group_arrest_rate"] = (
    X_train_fe["crime_group"].map(cg_rate).astype("float64")
)
X_test_fe["crime_group_arrest_rate"] = (
    X_test_fe["crime_group"].map(cg_rate)
            .fillna(float(cg_rate.mean()))
            .astype("float64")
)

# Interaction: crime_group × hour_bin (ensure object dtype for OHE), then rare-bucket
X_train_fe["cg_x_hourbin"] = (
    X_train_fe["crime_group"].astype(str) + "_" + X_train_fe["hour_bin"].astype(str)
).astype(object)

X_test_fe["cg_x_hourbin"] = (
    X_test_fe["crime_group"].astype(str)  + "_" + X_test_fe["hour_bin"].astype(str)
).astype(object)

# rare_bucket must be defined earlier; min_count=30 keeps the OHE width in check
X_train_fe["cg_x_hourbin"], X_test_fe["cg_x_hourbin"] = rare_bucket(
    X_train_fe["cg_x_hourbin"], X_test_fe["cg_x_hourbin"], min_count=30
)

In [None]:
# Column lists+ existence-safe preprocessor

cat_cols_v2 = [
    "date", "primary_type", "location_description", "location_grouped", 
    "weekday", "hour_bin", "ptype_x_hourbin", "crime_group", "cg_x_hourbin"
]
num_cols_v2 = [
    "id", "year", "month", "dow", "hour",
    "primary_type_freq", "location_description_freq", "weekday_freq", "hour_bin_freq",
    "ptype_arrest_rate", "crime_group_arrest_rate"
    ]

present = set(X_train_fe.columns)
cat_used = [c for c in cat_cols_v2 if c in present]
num_used = [c for c in num_cols_v2 if c in present]
print("Using categorical:", cat_used)
print("Using numerical:", num_used)

pre_fe_v2 = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_used), 
        ("num", "passthrough", num_used),
    ],
    remainder='drop',
    verbose_feature_names_out=False,
)

Using categorical: ['date', 'primary_type', 'location_description', 'location_grouped', 'weekday', 'hour_bin', 'ptype_x_hourbin', 'crime_group', 'cg_x_hourbin']
Using numerical: ['id', 'year', 'month', 'dow', 'hour', 'primary_type_freq', 'location_description_freq', 'weekday_freq', 'hour_bin_freq', 'ptype_arrest_rate', 'crime_group_arrest_rate']


In [8]:
from scipy.stats import loguniform, randint

# label imbalance → positive class gets higher weight
pos_weight = (len(y_train) - y_train.sum()) / y_train.sum()
sw_train = np.where(y_train==1, pos_weight, 1.0)

hgb_pipe_v2 = Pipeline(steps=[
    ("pre", pre_fe_v2),
    ("clf", HistGradientBoostingClassifier(
        random_state=42, max_bins=255,
        early_stopping=True, validation_fraction=0.1, n_iter_no_change=10
    ))
], memory=tempfile.mkdtemp())

param_dist_v2 = {
    "clf__learning_rate": loguniform(0.03, 0.2),
    "clf__max_depth": randint(3, 8),
    "clf__max_leaf_nodes": randint(24, 64),
    "clf__min_samples_leaf": randint(60, 240),
    "clf__l2_regularization": loguniform(1e-4, 0.3),
    "clf__max_iter": randint(120, 240),
}

In [9]:
# Bounded search 
# subsample ~5k for search
SUB_N = 5000
if len(y_train) > SUB_N:
    X_sub, _, y_sub, _ = train_test_split(
        X_train_fe, y_train, train_size=SUB_N, stratify=y_train, random_state=42
    )
    sw_sub = sw_train[:len(y_sub)]
else:
    X_sub, y_sub, sw_sub = X_train_fe, y_train, sw_train

cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
search_v2 = RandomizedSearchCV(
    hgb_pipe_v2, param_distributions=param_dist_v2, n_iter=6,
    scoring="average_precision", refit=True, cv=cv, n_jobs=-1,
    random_state=42, verbose=2
)
search_v2.fit(X_sub, y_sub, clf__sample_weight=sw_sub)
print("Best params (v2):", search_v2.best_params_)
print("CV AP (v2):", round(search_v2.best_score_, 4))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] END clf__l2_regularization=0.00010062545641808922, clf__learning_rate=0.1970666034205786, clf__max_depth=3, clf__max_iter=195, clf__max_leaf_nodes=45, clf__min_samples_leaf=148; total time=   2.5s
[CV] END clf__l2_regularization=0.0020059560245279666, clf__learning_rate=0.18214744423753768, clf__max_depth=5, clf__max_iter=191, clf__max_leaf_nodes=44, clf__min_samples_leaf=162; total time=   2.6s
[CV] END clf__l2_regularization=0.00010062545641808922, clf__learning_rate=0.1970666034205786, clf__max_depth=3, clf__max_iter=195, clf__max_leaf_nodes=45, clf__min_samples_leaf=148; total time=   3.5s
[CV] END clf__l2_regularization=0.0035498870995898887, clf__learning_rate=0.03626531563860245, clf__max_depth=5, clf__max_iter=207, clf__max_leaf_nodes=59, clf__min_samples_leaf=163; total time=   4.2s
[CV] END clf__l2_regularization=0.001029530064265006, clf__learning_rate=0.0957705988053993, clf__max_depth=4, clf__max_iter=211, cl

In [10]:
# Refit on all training rows 
hgb_final_v2 = search_v2.best_estimator_
hgb_final_v2.fit(X_train_fe, y_train, clf__sample_weight=sw_train)

0,1,2
,steps,"[('pre', ...), ('clf', ...)]"
,transform_input,
,memory,'/var/folders/6z/l9wv3c...q8m0000gn/T/tmp0k3_ixb6'
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,loss,'log_loss'
,learning_rate,np.float64(0.0957705988053993)
,max_iter,211
,max_leaf_nodes,39
,max_depth,4
,min_samples_leaf,74
,l2_regularization,np.float64(0....9530064265006)
,max_features,1.0
,max_bins,255
,categorical_features,'from_dtype'


In [11]:
# Evaluate on v2 test set
proba_v2 = hgb_final_v2.predict_proba(X_test_fe)[:, 1]

print("\n=== TEST metrics (v2) ===")
print("PR-AUC:", round(average_precision_score(y_test, proba_v2), 4))
print("ROC-AUC:", round(roc_auc_score(y_test, proba_v2), 4))

prec, rec, thr = precision_recall_curve(y_test, proba_v2)
f1s = 2*prec*rec/(prec+rec+1e-12)
i = np.nanargmax(f1s)
thr_v2 = thr[i] if i < len(thr) else 0.5
pred_v2 = (proba_v2 >= thr_v2).astype(int)

print("Best threshold:", float(thr_v2), "Best F1:", float(f1s[i]))
print(classification_report(y_test, pred_v2, digits=3))
print("Confusion:\n", confusion_matrix(y_test, pred_v2))


=== TEST metrics (v2) ===
PR-AUC: 0.6516
ROC-AUC: 0.8831
Best threshold: 0.6380708268666978 Best F1: 0.6115992970118042
              precision    recall  f1-score   support

           0      0.930     0.948     0.939      1795
           1      0.652     0.576     0.612       302

    accuracy                          0.895      2097
   macro avg      0.791     0.762     0.775      2097
weighted avg      0.890     0.895     0.892      2097

Confusion:
 [[1702   93]
 [ 128  174]]


In [12]:
# Reweighting (mild): build per-type weights and combine
# Add per-primary_type weights and multiply.


# Inverse-frequency^alpha, alpha=0.5 is mild; try 0.8 if mild is flat
alpha = 0.5

ptype_freq = X_train_fe["primary_type"].value_counts(normalize=True)
ptype_w = (ptype_freq.mean() / ptype_freq) ** alpha 
ptype_w = ptype_w / ptype_w.mean()  # normalize to mean ~1

# Map to rows and combine with label-imbalance weights 
w_ptype = X_train_fe['primary_type'].map(ptype_w).fillna(1.0).values
sw_combo = sw_train * w_ptype

print("Weight summary:",
      f"\n - sw_train mean={sw_train.mean():.3f}",
      f"\n - w_ptype mean={np.mean(w_ptype):.3f}",
      f"\n - sw_combo mean={np.mean(sw_combo):.3f}")


Weight summary: 
 - sw_train mean=1.712 
 - w_ptype mean=0.553 
 - sw_combo mean=1.058


In [13]:
#  Bounded search (6×2) with combined weights

search_v3 = RandomizedSearchCV(
    estimator=hgb_pipe_v2,                 
    param_distributions=param_dist_v2,     
    n_iter=6, scoring="average_precision",
    refit=True, cv=cv, n_jobs=-1, random_state=42, verbose=2
)

# Use the same bounded subset you used earlier
# X_sub, y_sub were created before; align weights to the same length
if 'X_sub' not in globals():
    from sklearn.model_selection import train_test_split
    SUB_N = 5000
    if len(y_train) > SUB_N:
        X_sub, _, y_sub, _ = train_test_split(
            X_train_fe, y_train, train_size=SUB_N,
            stratify=y_train, random_state=42
        )
        sw_sub = sw_combo[:len(y_sub)]
    else:
        X_sub, y_sub, sw_sub = X_train_fe, y_train, sw_combo
else:
    sw_sub = sw_combo[:len(y_sub)]

search_v3.fit(X_sub, y_sub, clf__sample_weight=sw_sub)
print("Best params (v3-reweight):", search_v3.best_params_)
print("CV AP (v3-reweight):", round(search_v3.best_score_, 4))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] END clf__l2_regularization=0.00010062545641808922, clf__learning_rate=0.1970666034205786, clf__max_depth=3, clf__max_iter=195, clf__max_leaf_nodes=45, clf__min_samples_leaf=148; total time=   1.8s
[CV] END clf__l2_regularization=0.0020059560245279666, clf__learning_rate=0.18214744423753768, clf__max_depth=5, clf__max_iter=191, clf__max_leaf_nodes=44, clf__min_samples_leaf=162; total time=   2.0s
[CV] END clf__l2_regularization=0.00010062545641808922, clf__learning_rate=0.1970666034205786, clf__max_depth=3, clf__max_iter=195, clf__max_leaf_nodes=45, clf__min_samples_leaf=148; total time=   2.3s
[CV] END clf__l2_regularization=0.0020059560245279666, clf__learning_rate=0.18214744423753768, clf__max_depth=5, clf__max_iter=191, clf__max_leaf_nodes=44, clf__min_samples_leaf=162; total time=   2.9s
[CV] END clf__l2_regularization=0.0035498870995898887, clf__learning_rate=0.03626531563860245, clf__max_depth=5, clf__max_iter=207, 

In [14]:
# Full fit on all training rows + evaluation 
hgb_final_v3 = search_v3.best_estimator_
hgb_final_v3.fit(X_train_fe, y_train, clf__sample_weight=sw_combo)

from sklearn.metrics import average_precision_score, roc_auc_score, precision_recall_curve, classification_report, confusion_matrix

proba_v3 = hgb_final_v3.predict_proba(X_test_fe)[:, 1]
ap_v3  = average_precision_score(y_test, proba_v3)
roc_v3 = roc_auc_score(y_test, proba_v3)

prec, rec, thr = precision_recall_curve(y_test, proba_v3)
f1s = 2*prec*rec/(prec+rec+1e-12)
i = np.nanargmax(f1s)
thr_v3 = thr[i] if i < len(thr) else 0.5
pred_v3 = (proba_v3 >= thr_v3).astype(int)

print("\n=== TEST metrics (v3 reweight) ===")
print("PR-AUC:", round(ap_v3, 4))
print("ROC-AUC:", round(roc_v3, 4))
print("Best threshold:", float(thr_v3), "Best F1:", float(f1s[i]))
print(classification_report(y_test, pred_v3, digits=3))
print("Confusion:\n", confusion_matrix(y_test, pred_v3))


=== TEST metrics (v3 reweight) ===
PR-AUC: 0.6456
ROC-AUC: 0.8797
Best threshold: 0.6302952878988638 Best F1: 0.6010733452588949
              precision    recall  f1-score   support

           0      0.927     0.950     0.939      1795
           1      0.654     0.556     0.601       302

    accuracy                          0.894      2097
   macro avg      0.790     0.753     0.770      2097
weighted avg      0.888     0.894     0.890      2097

Confusion:
 [[1706   89]
 [ 134  168]]


In [15]:
# v3: slice metrics check 
def _slice(X, y, proba, thr, col, k=8):
    tbl = slice_metrics(X, y, proba, thr, col, min_support=40)
    if tbl is None or tbl.empty:
        print(f"[note] no slices for {col}")
        return
    print(f"\n=== {col} worst by PRECISION ==="); display(tbl.sort_values("precision").head(k))
    print(f"=== {col} worst by RECALL ===");    display(tbl.sort_values("recall").head(k))

for col in ["primary_type","crime_group","weekday","hour_bin"]:
    _slice(X_test_fe, y_test, proba_v3, thr_v3, col)


=== primary_type worst by PRECISION ===


Unnamed: 0,primary_type,support,precision,recall,f1
11,BURGLARY,96,0.0,0.0,0.0
10,THEFT,473,0.333333,0.025,0.046512
4,CRIMINAL TRESPASS,43,0.363636,1.0,0.533333
9,ASSAULT,187,0.5,0.045455,0.083333
3,OTHER OFFENSE,134,0.548387,0.68,0.607143
7,BATTERY,379,0.566667,0.283333,0.377778
2,ROBBERY,56,0.625,0.714286,0.666667
1,WEAPONS VIOLATION,53,0.716981,1.0,0.835165


=== primary_type worst by RECALL ===


Unnamed: 0,primary_type,support,precision,recall,f1
11,BURGLARY,96,0.0,0.0,0.0
10,THEFT,473,0.333333,0.025,0.046512
9,ASSAULT,187,0.5,0.045455,0.083333
8,CRIMINAL DAMAGE,232,1.0,0.111111,0.2
6,DECEPTIVE PRACTICE,136,1.0,0.25,0.4
7,BATTERY,379,0.566667,0.283333,0.377778
5,MOTOR VEHICLE THEFT,160,1.0,0.333333,0.5
3,OTHER OFFENSE,134,0.548387,0.68,0.607143



=== crime_group worst by PRECISION ===


Unnamed: 0,crime_group,support,precision,recall,f1
2,property,961,0.6,0.054545,0.1
0,other,461,0.654088,0.866667,0.74552
1,violent,675,0.655914,0.480315,0.554545


=== crime_group worst by RECALL ===


Unnamed: 0,crime_group,support,precision,recall,f1
2,property,961,0.6,0.054545,0.1
1,violent,675,0.655914,0.480315,0.554545
0,other,461,0.654088,0.866667,0.74552



=== weekday worst by PRECISION ===


Unnamed: 0,weekday,support,precision,recall,f1
6,Monday,288,0.527778,0.542857,0.535211
5,Tuesday,276,0.581395,0.581395,0.581395
4,Wednesday,286,0.625,0.555556,0.588235
2,Thursday,379,0.666667,0.571429,0.615385
0,Saturday,259,0.72,0.6,0.654545
3,Sunday,298,0.733333,0.511628,0.60274
1,Friday,311,0.793103,0.534884,0.638889


=== weekday worst by RECALL ===


Unnamed: 0,weekday,support,precision,recall,f1
3,Sunday,298,0.733333,0.511628,0.60274
1,Friday,311,0.793103,0.534884,0.638889
6,Monday,288,0.527778,0.542857,0.535211
4,Wednesday,286,0.625,0.555556,0.588235
2,Thursday,379,0.666667,0.571429,0.615385
5,Tuesday,276,0.581395,0.581395,0.581395
0,Saturday,259,0.72,0.6,0.654545



=== hour_bin worst by PRECISION ===


Unnamed: 0,hour_bin,support,precision,recall,f1
1,18-23,622,0.617647,0.636364,0.626866
3,12-17,638,0.626667,0.505376,0.559524
2,06-11,426,0.666667,0.528302,0.589474
0,00-05,411,0.789474,0.526316,0.631579


=== hour_bin worst by RECALL ===


Unnamed: 0,hour_bin,support,precision,recall,f1
3,12-17,638,0.626667,0.505376,0.559524
0,00-05,411,0.789474,0.526316,0.631579
2,06-11,426,0.666667,0.528302,0.589474
1,18-23,622,0.617647,0.636364,0.626866


In [16]:
# Try stronger weights (α=0.8)
alpha = 0.8
ptype_w2 = (ptype_freq.mean() / ptype_freq) ** alpha
ptype_w2 = ptype_w2 / ptype_w2.mean()
sw_combo2 = sw_train * X_train_fe["primary_type"].map(ptype_w2).fillna(1.0).values

search_v3b = RandomizedSearchCV(
    estimator=hgb_pipe_v2, param_distributions=param_dist_v2,
    n_iter=6, scoring="average_precision", refit=True, cv=cv,
    n_jobs=-1, random_state=42, verbose=2
)
search_v3b.fit(X_sub, y_sub, clf__sample_weight=sw_combo2[:len(y_sub)])
hgb_v3b = search_v3b.best_estimator_
hgb_v3b.fit(X_train_fe, y_train, clf__sample_weight=sw_combo2)

proba_v3b = hgb_v3b.predict_proba(X_test_fe)[:,1]
ap_v3b  = average_precision_score(y_test, proba_v3b)
roc_v3b = roc_auc_score(y_test, proba_v3b)
prec, rec, thr = precision_recall_curve(y_test, proba_v3b)
f1s = 2*prec*rec/(prec+rec+1e-12); i = np.nanargmax(f1s)
thr_v3b = thr[i] if i < len(thr) else 0.5

print("\n=== TEST metrics (v3b stronger weights) ===")
print("PR-AUC:", round(ap_v3b, 4))
print("ROC-AUC:", round(roc_v3b, 4))
print("Best threshold:", float(thr_v3b), "Best F1:", float(f1s[i]))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] END clf__l2_regularization=0.00010062545641808922, clf__learning_rate=0.1970666034205786, clf__max_depth=3, clf__max_iter=195, clf__max_leaf_nodes=45, clf__min_samples_leaf=148; total time=   2.0s
[CV] END clf__l2_regularization=0.0020059560245279666, clf__learning_rate=0.18214744423753768, clf__max_depth=5, clf__max_iter=191, clf__max_leaf_nodes=44, clf__min_samples_leaf=162; total time=   2.3s
[CV] END clf__l2_regularization=0.00010062545641808922, clf__learning_rate=0.1970666034205786, clf__max_depth=3, clf__max_iter=195, clf__max_leaf_nodes=45, clf__min_samples_leaf=148; total time=   2.4s
[CV] END clf__l2_regularization=0.0020059560245279666, clf__learning_rate=0.18214744423753768, clf__max_depth=5, clf__max_iter=191, clf__max_leaf_nodes=44, clf__min_samples_leaf=162; total time=   3.4s
[CV] END clf__l2_regularization=0.001029530064265006, clf__learning_rate=0.0957705988053993, clf__max_depth=4, clf__max_iter=211, cl

In [17]:
# --- v3: slice metrics check ---
def _slice(X, y, proba, thr, col, k=8):
    tbl = slice_metrics(X, y, proba, thr, col, min_support=40)
    if tbl is None or tbl.empty:
        print(f"[note] no slices for {col}")
        return
    print(f"\n=== {col} worst by PRECISION ==="); display(tbl.sort_values("precision").head(k))
    print(f"=== {col} worst by RECALL ===");    display(tbl.sort_values("recall").head(k))

for col in ["primary_type","crime_group","weekday","hour_bin"]:
    _slice(X_test_fe, y_test, proba_v3, thr_v3, col)


=== primary_type worst by PRECISION ===


Unnamed: 0,primary_type,support,precision,recall,f1
11,BURGLARY,96,0.0,0.0,0.0
10,THEFT,473,0.333333,0.025,0.046512
4,CRIMINAL TRESPASS,43,0.363636,1.0,0.533333
9,ASSAULT,187,0.5,0.045455,0.083333
3,OTHER OFFENSE,134,0.548387,0.68,0.607143
7,BATTERY,379,0.566667,0.283333,0.377778
2,ROBBERY,56,0.625,0.714286,0.666667
1,WEAPONS VIOLATION,53,0.716981,1.0,0.835165


=== primary_type worst by RECALL ===


Unnamed: 0,primary_type,support,precision,recall,f1
11,BURGLARY,96,0.0,0.0,0.0
10,THEFT,473,0.333333,0.025,0.046512
9,ASSAULT,187,0.5,0.045455,0.083333
8,CRIMINAL DAMAGE,232,1.0,0.111111,0.2
6,DECEPTIVE PRACTICE,136,1.0,0.25,0.4
7,BATTERY,379,0.566667,0.283333,0.377778
5,MOTOR VEHICLE THEFT,160,1.0,0.333333,0.5
3,OTHER OFFENSE,134,0.548387,0.68,0.607143



=== crime_group worst by PRECISION ===


Unnamed: 0,crime_group,support,precision,recall,f1
2,property,961,0.6,0.054545,0.1
0,other,461,0.654088,0.866667,0.74552
1,violent,675,0.655914,0.480315,0.554545


=== crime_group worst by RECALL ===


Unnamed: 0,crime_group,support,precision,recall,f1
2,property,961,0.6,0.054545,0.1
1,violent,675,0.655914,0.480315,0.554545
0,other,461,0.654088,0.866667,0.74552



=== weekday worst by PRECISION ===


Unnamed: 0,weekday,support,precision,recall,f1
6,Monday,288,0.527778,0.542857,0.535211
5,Tuesday,276,0.581395,0.581395,0.581395
4,Wednesday,286,0.625,0.555556,0.588235
2,Thursday,379,0.666667,0.571429,0.615385
0,Saturday,259,0.72,0.6,0.654545
3,Sunday,298,0.733333,0.511628,0.60274
1,Friday,311,0.793103,0.534884,0.638889


=== weekday worst by RECALL ===


Unnamed: 0,weekday,support,precision,recall,f1
3,Sunday,298,0.733333,0.511628,0.60274
1,Friday,311,0.793103,0.534884,0.638889
6,Monday,288,0.527778,0.542857,0.535211
4,Wednesday,286,0.625,0.555556,0.588235
2,Thursday,379,0.666667,0.571429,0.615385
5,Tuesday,276,0.581395,0.581395,0.581395
0,Saturday,259,0.72,0.6,0.654545



=== hour_bin worst by PRECISION ===


Unnamed: 0,hour_bin,support,precision,recall,f1
1,18-23,622,0.617647,0.636364,0.626866
3,12-17,638,0.626667,0.505376,0.559524
2,06-11,426,0.666667,0.528302,0.589474
0,00-05,411,0.789474,0.526316,0.631579


=== hour_bin worst by RECALL ===


Unnamed: 0,hour_bin,support,precision,recall,f1
3,12-17,638,0.626667,0.505376,0.559524
0,00-05,411,0.789474,0.526316,0.631579
2,06-11,426,0.666667,0.528302,0.589474
1,18-23,622,0.617647,0.636364,0.626866


In [18]:
# label-imbalance only (baseline weights)
pos_weight = (len(y_train) - y_train.sum()) / y_train.sum()
sw_train = np.where(y_train==1, pos_weight, 1.0)

In [19]:
# Add richer time features
# hour → sin/cos
X_train_fe["hour_sin"] = np.sin(2*np.pi * X_train_fe["hour"].astype(float)/24.0)
X_train_fe["hour_cos"] = np.cos(2*np.pi * X_train_fe["hour"].astype(float)/24.0)
X_test_fe["hour_sin"]  = np.sin(2*np.pi * X_test_fe["hour"].astype(float)/24.0)
X_test_fe["hour_cos"]  = np.cos(2*np.pi * X_test_fe["hour"].astype(float)/24.0)

# month → sin/cos
X_train_fe["month_sin"] = np.sin(2*np.pi * X_train_fe["month"].astype(float)/12.0)
X_train_fe["month_cos"] = np.cos(2*np.pi * X_train_fe["month"].astype(float)/12.0)
X_test_fe["month_sin"]  = np.sin(2*np.pi * X_test_fe["month"].astype(float)/12.0)
X_test_fe["month_cos"]  = np.cos(2*np.pi * X_test_fe["month"].astype(float)/12.0)

In [21]:
# Define/restore v2 lists if needed
if "cat_cols_v2" not in globals():
    cat_cols_v2 = [
        "date","primary_type","location_description","location_grouped",
        "weekday","hour_bin","ptype_x_hourbin","crime_group","cg_x_hourbin"
    ]
if "num_cols_v2" not in globals():
    num_cols_v2 = [
        "id","year","month","dow","hour",
        "primary_type_freq","location_description_freq","weekday_freq","hour_bin_freq",
        "ptype_arrest_rate","crime_group_arrest_rate"
    ]

# Start v4 lists from v2
cat_cols_fe = list(cat_cols_v2)
num_cols_fe = list(num_cols_v2)

# Add the 4 new time features
for c in ["hour_sin","hour_cos","month_sin","month_cos"]:
    if c not in num_cols_fe and c in X_train_fe.columns:
        num_cols_fe.append(c)

# Existence-safe filter
present = set(X_train_fe.columns)
cat_cols_fe = [c for c in cat_cols_fe if c in present]
num_cols_fe = [c for c in num_cols_fe if c in present]

print("Using categorical:", cat_cols_fe)
print("Using numeric    :", num_cols_fe)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
pre_fe_v4 = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols_fe),
        ("num", "passthrough", num_cols_fe),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

Using categorical: ['date', 'primary_type', 'location_description', 'location_grouped', 'weekday', 'hour_bin', 'ptype_x_hourbin', 'crime_group', 'cg_x_hourbin']
Using numeric    : ['id', 'year', 'month', 'dow', 'hour', 'primary_type_freq', 'location_description_freq', 'weekday_freq', 'hour_bin_freq', 'ptype_arrest_rate', 'crime_group_arrest_rate', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']


In [22]:
param_dist_v4 = {
    "clf__learning_rate": loguniform(0.01, 0.3),
    "clf__max_depth": randint(3, 9),
    "clf__max_leaf_nodes": randint(32, 128),
    "clf__min_samples_leaf": randint(20, 200),
    "clf__l2_regularization": loguniform(1e-4, 1.0),
    "clf__max_iter": randint(150, 300),
}

hgb_pipe_v4 = Pipeline(steps=[
    ("pre", pre_fe_v4),
    ("clf", HistGradientBoostingClassifier(random_state=42, max_bins=255))
])

In [23]:
SUB_N = 5000
if len(y_train) > SUB_N:
    X_sub, _, y_sub, _ = train_test_split(
        X_train_fe, y_train, train_size=SUB_N,
        stratify=y_train, random_state=42
    )
else:
    X_sub, y_sub = X_train_fe, y_train

cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

search_v4 = RandomizedSearchCV(
    hgb_pipe_v4, param_distributions=param_dist_v4,
    n_iter=6, scoring="average_precision", refit=True,
    cv=cv, n_jobs=-1, random_state=42, verbose=2
)
search_v4.fit(X_sub, y_sub)
print("Best params (v4 scout):", search_v4.best_params_, "| CV AP:", round(search_v4.best_score_, 4))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] END clf__l2_regularization=0.003148911647956862, clf__learning_rate=0.2536999076681772, clf__max_depth=5, clf__max_iter=221, clf__max_leaf_nodes=92, clf__min_samples_leaf=40; total time=  19.3s
[CV] END clf__l2_regularization=0.003148911647956862, clf__learning_rate=0.2536999076681772, clf__max_depth=5, clf__max_iter=221, clf__max_leaf_nodes=92, clf__min_samples_leaf=40; total time=  19.5s
[CV] END clf__l2_regularization=0.00012087541473056971, clf__learning_rate=0.2708160864249968, clf__max_depth=6, clf__max_iter=187, clf__max_leaf_nodes=33, clf__min_samples_leaf=40; total time=  19.7s
[CV] END clf__l2_regularization=0.00012087541473056971, clf__learning_rate=0.2708160864249968, clf__max_depth=6, clf__max_iter=187, clf__max_leaf_nodes=33, clf__min_samples_leaf=40; total time=  19.7s
[CV] END clf__l2_regularization=0.00042079886696066364, clf__learning_rate=0.01699897838270077, clf__max_depth=5, clf__max_iter=237, clf__ma

In [24]:
hgb_final_v4 = search_v4.best_estimator_
hgb_final_v4.fit(X_train_fe, y_train)

proba_v4 = hgb_final_v4.predict_proba(X_test_fe)[:, 1]

print("\n=== TEST metrics (v4) ===")
print("PR-AUC:", round(average_precision_score(y_test, proba_v4), 4))
print("ROC-AUC:", round(roc_auc_score(y_test, proba_v4), 4))

prec, rec, thr = precision_recall_curve(y_test, proba_v4)
f1s = 2*prec*rec/(prec+rec+1e-12)
i = np.nanargmax(f1s)
thr_v4 = thr[i] if i < len(thr) else 0.5
pred_v4 = (proba_v4 >= thr_v4).astype(int)

print("Best threshold:", float(thr_v4), "Best F1:", float(f1s[i]))
print(classification_report(y_test, pred_v4, digits=3))
print("Confusion:\n", confusion_matrix(y_test, pred_v4))


=== TEST metrics (v4) ===
PR-AUC: 0.6489
ROC-AUC: 0.8839
Best threshold: 0.27504349529259814 Best F1: 0.612244897958684
              precision    recall  f1-score   support

           0      0.933     0.941     0.937      1795
           1      0.629     0.596     0.612       302

    accuracy                          0.891      2097
   macro avg      0.781     0.768     0.775      2097
weighted avg      0.889     0.891     0.890      2097

Confusion:
 [[1689  106]
 [ 122  180]]


In [25]:
for col in ["primary_type","weekday","hour_bin"]:
    tbl = slice_metrics(X_test_fe, y_test, proba_v4, thr_v4, col, min_support=40)
    if tbl is not None:
        print(f"\n=== {col} worst by PRECISION ==="); display(tbl.sort_values("precision").head(8))
        print(f"=== {col} worst by RECALL ===");    display(tbl.sort_values("recall").head(8))


=== primary_type worst by PRECISION ===


Unnamed: 0,primary_type,support,precision,recall,f1
11,MOTOR VEHICLE THEFT,160,0.0,0.0,0.0
10,ASSAULT,187,0.166667,0.045455,0.071429
3,CRIMINAL TRESPASS,43,0.37931,0.916667,0.536585
8,THEFT,473,0.387097,0.3,0.338028
4,OTHER OFFENSE,134,0.483871,0.6,0.535714
7,BATTERY,379,0.586207,0.283333,0.382022
2,ROBBERY,56,0.714286,0.714286,0.714286
1,WEAPONS VIOLATION,53,0.716981,1.0,0.835165


=== primary_type worst by RECALL ===


Unnamed: 0,primary_type,support,precision,recall,f1
11,MOTOR VEHICLE THEFT,160,0.0,0.0,0.0
10,ASSAULT,187,0.166667,0.045455,0.071429
9,CRIMINAL DAMAGE,232,1.0,0.111111,0.2
6,DECEPTIVE PRACTICE,136,1.0,0.25,0.4
7,BATTERY,379,0.586207,0.283333,0.382022
8,THEFT,473,0.387097,0.3,0.338028
5,BURGLARY,96,1.0,0.333333,0.5
4,OTHER OFFENSE,134,0.483871,0.6,0.535714



=== weekday worst by PRECISION ===


Unnamed: 0,weekday,support,precision,recall,f1
5,Tuesday,276,0.5,0.581395,0.537634
4,Monday,288,0.555556,0.571429,0.56338
6,Wednesday,286,0.594595,0.488889,0.536585
2,Thursday,379,0.650794,0.650794,0.650794
1,Friday,311,0.692308,0.627907,0.658537
3,Sunday,298,0.727273,0.55814,0.631579
0,Saturday,259,0.75,0.7,0.724138


=== weekday worst by RECALL ===


Unnamed: 0,weekday,support,precision,recall,f1
6,Wednesday,286,0.594595,0.488889,0.536585
3,Sunday,298,0.727273,0.55814,0.631579
4,Monday,288,0.555556,0.571429,0.56338
5,Tuesday,276,0.5,0.581395,0.537634
1,Friday,311,0.692308,0.627907,0.658537
2,Thursday,379,0.650794,0.650794,0.650794
0,Saturday,259,0.75,0.7,0.724138



=== hour_bin worst by PRECISION ===


Unnamed: 0,hour_bin,support,precision,recall,f1
2,06-11,426,0.6,0.622642,0.611111
1,12-17,638,0.629213,0.602151,0.615385
0,18-23,622,0.635417,0.616162,0.625641
3,00-05,411,0.652174,0.526316,0.582524


=== hour_bin worst by RECALL ===


Unnamed: 0,hour_bin,support,precision,recall,f1
3,00-05,411,0.652174,0.526316,0.582524
1,12-17,638,0.629213,0.602151,0.615385
0,18-23,622,0.635417,0.616162,0.625641
2,06-11,426,0.6,0.622642,0.611111
