In [1]:
import os
import joblib
import numpy as np
import pandas as pd

# Directory holding the trained model and the features CSV. It defaults to the
# original shared project location; set the WORKZONE_MODEL_DIR environment
# variable to run this notebook from another machine or checkout without
# editing the cell.
MODEL_DIR = os.environ.get(
    "WORKZONE_MODEL_DIR", "/work/siads_699_mads_capstone/data/models"
)
MODEL_PATH = os.path.join(MODEL_DIR, "ensemble_model_dedup_final.pkl")
FEATURES_CSV = os.path.join(MODEL_DIR, "workzone_features.csv")

print("Model path:", MODEL_PATH)
print("Features CSV:", FEATURES_CSV)

# SECURITY NOTE: joblib.load is pickle-based, and unpickling can execute
# arbitrary code. Only load model files that come from a trusted source.
model = joblib.load(MODEL_PATH)
print("Loaded trained model:", type(model))


Model path: /work/siads_699_mads_capstone/data/models/ensemble_model_dedup_final.pkl
Features CSV: /work/siads_699_mads_capstone/data/models/workzone_features.csv
Loaded trained model: <class 'sklearn.pipeline.Pipeline'>


In [2]:
def _to_numeric_if_possible(column):
    """Return ``column`` parsed as numeric, or unchanged when parsing fails.

    Explicit replacement for the deprecated ``pd.to_numeric(errors="ignore")``.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Load the feature table written alongside the model.
df = pd.read_csv(FEATURES_CSV)
print(f"Loaded features CSV: {df.shape[0]} rows, {df.shape[1]} columns")

# Remove '[' and ']' characters from string cells, then convert every column
# that parses cleanly to a numeric dtype; other columns are left as they are.
# NOTE(review): presumably some values were serialised as one-element lists
# such as "[1.5]" -- confirm against the CSV.
df = df.replace(r"[\[\]]", "", regex=True)
for c in df.columns:
    df[c] = _to_numeric_if_possible(df[c])

# Drop the columns the message below calls leakage columns. Reassigning
# (rather than inplace=True) keeps each step an explicit new frame.
for leak_col in ["crash_count", "avg_severity"]:
    if leak_col in df.columns:
        df = df.drop(columns=leak_col)
        print(f"Dropped leakage column: {leak_col}")

# Fail with a descriptive message if the label column is absent.
if "high_risk" not in df.columns:
    raise KeyError(
        f"Expected a 'high_risk' column in {FEATURES_CSV}; found {list(df.columns)}"
    )
target = df["high_risk"].astype(int)

# Keep numeric columns only, then (re)attach the integer label so it is
# present regardless of the dtype it had in the CSV.
df_numeric = df.select_dtypes(include=[np.number]).copy()
df_numeric["high_risk"] = target

# Deduplicate on every numeric column. The label is part of the key here, so
# rows with identical features but different labels are both kept.
numeric_cols = df_numeric.select_dtypes(include=["number"]).columns
before = len(df_numeric)
df_numeric = df_numeric.drop_duplicates(subset=numeric_cols, keep="first")
after = len(df_numeric)
print(f"Deduped numeric: {before} and {after} rows")

# Split into the feature matrix and label vector.
X = df_numeric.drop(columns=["high_risk"])
y = df_numeric["high_risk"]

# Column order used to build single-row inference inputs in later cells.
FEATURE_COLUMNS = list(X.columns)
print("Feature columns used for inference:")
for col in FEATURE_COLUMNS:
    print("  -", col)


Loaded features CSV: 1653 rows, 14 columns
Dropped leakage column: crash_count
Dropped leakage column: avg_severity
Deduped numeric: 1653 and 219 rows
Feature columns used for inference:
  - duration_hr
  - latitude
  - longitude
  - avg_distance_km
  - avg_visibility_mi
  - avg_precip_in
  - avg_temp_F
  - avg_wind_mph


In [3]:
def prepare_input_from_dict(input_dict, feature_order=None):
    """Build a one-row DataFrame with columns in ``feature_order``.

    Parameters
    ----------
    input_dict : dict
        Mapping of feature name to value. Keys that are not listed in
        ``feature_order`` are ignored.
    feature_order : iterable of str, optional
        Column names, in the order the frame should have them. When omitted,
        the notebook-level ``FEATURE_COLUMNS`` is looked up at call time.

    Returns
    -------
    pandas.DataFrame
        A single row whose columns are exactly ``feature_order``. Features
        absent from ``input_dict`` are filled with ``0.0``.

    Warns
    -----
    UserWarning
        If any feature is missing from ``input_dict``, since a silent ``0.0``
        (for example for a coordinate) can make a prediction meaningless.
    """
    import warnings  # local import so this cell does not depend on another cell's imports

    # Resolve the default when called, not when defined: a default of
    # ``feature_order=FEATURE_COLUMNS`` would keep pointing at the old list
    # after the feature-preparation cell is re-run.
    if feature_order is None:
        feature_order = FEATURE_COLUMNS
    feature_order = list(feature_order)

    missing = [name for name in feature_order if name not in input_dict]
    if missing:
        warnings.warn(
            f"Missing feature(s) filled with 0.0: {missing}",
            stacklevel=2,
        )

    row = {name: input_dict.get(name, 0.0) for name in feature_order}
    return pd.DataFrame([row], columns=feature_order)

def predict_workzone_risk(input_dict, threshold=0.5):
    """Score one work zone with the loaded ``model``.

    Parameters
    ----------
    input_dict : dict
        Mapping of feature name to value, passed to ``prepare_input_from_dict``.
    threshold : float, default 0.5
        Probability at or above which the zone is labelled high risk.

    Returns
    -------
    tuple
        ``(proba, label)``: the value in column 1 of ``model.predict_proba``
        for the single input row, and ``1`` if ``proba >= threshold`` else ``0``.

    Raises
    ------
    ValueError
        If ``threshold`` is outside ``[0, 1]``.
    """
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(f"threshold must be between 0 and 1, got {threshold!r}")

    df_in = prepare_input_from_dict(input_dict)
    # Column 1 is taken as the high-risk class.
    # NOTE(review): assumes the model's classes are ordered [0, 1] -- confirm
    # against model.classes_.
    proba = model.predict_proba(df_in)[:, 1][0]
    label = int(proba >= threshold)
    return proba, label

In [4]:
# Smoke test: score the first row of the feature matrix with the loaded model.
sample = X.iloc[0]
sample_dict = sample.to_dict()

# Show the exact inputs, one feature per line, names padded to 20 characters.
print("Sample feature values:")
for feature_name in sample_dict:
    print(f"{feature_name:20s} = {sample_dict[feature_name]}")

# Run the prediction helper and report the probability and resulting label.
proba, label = predict_workzone_risk(sample_dict)
print("\nPredicted high-risk probability:", round(proba, 3))
print("Predicted label (1=high risk):", label)


Sample feature values:
duration_hr          = 23.99972222222222
latitude             = 31.676631
longitude            = -106.323587
avg_distance_km      = 0.5618894579277065
avg_visibility_mi    = 9.772388059701491
avg_precip_in        = 0.0735714285714285
avg_temp_F           = 72.2410447761194
avg_wind_mph         = 8.43015873015873

Predicted high-risk probability: 0.013
Predicted label (1=high risk): 0


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=609edc76-98cb-4265-b05f-b8bb14bd7d7b' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>