In [1]:
import os
import pandas as pd
import joblib
from pathlib import Path

In [2]:
DATA_DIR = "data"
RAW_DIR = os.path.join(DATA_DIR, "raw")
CLEAN_DIR = os.path.join(DATA_DIR, "clean")

os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(CLEAN_DIR, exist_ok=True)

In [3]:
MODEL_PATH = Path("C:/Users/lkneh/HealthScore-Predictor/notebooks/Model/xgboost_tuned_scaleweight.pkl")  # adjust

raw_obj = joblib.load(MODEL_PATH)
type(raw_obj)

dict

In [4]:
print(type(raw_obj))

<class 'dict'>


In [5]:
if isinstance(raw_obj, dict) and "model" in raw_obj:
    model = raw_obj["model"]
else:
    model = raw_obj


In [6]:
hasattr(model, "predict"), hasattr(model, "predict_proba")


(True, True)

In [8]:
FEATURES = [
    'latitude', 'longitude', 'avg_violations_last_3', 'fail_rate_last_3',
    'days_since_last_inspection', 'trend_last_3',
    'BusinessName_id', 'Address_id',
    'insp_year', 'insp_month', 'insp_day', 'insp_dow', 'insp_days_since_ref',
    'inspection_type_clean_change of ownership',
    'inspection_type_clean_complaint',
    'inspection_type_clean_complaint (i)',
    'inspection_type_clean_complaint (r)',
    'inspection_type_clean_complaint reinspection/follow-up',
    'inspection_type_clean_foodborne illness',
    'inspection_type_clean_foodborne illness investigation',
    'inspection_type_clean_new construction',
    'inspection_type_clean_new ownership',
    'inspection_type_clean_new ownership (i)',
    'inspection_type_clean_new ownership (r)',
    'inspection_type_clean_new ownership - followup',
    'inspection_type_clean_non-inspection site visit',
    'inspection_type_clean_plan check',
    'inspection_type_clean_plan check (i)',
    'inspection_type_clean_plan check (r)',
    'inspection_type_clean_reinspection',
    'inspection_type_clean_reinspection/followup',
    'inspection_type_clean_routine',
    'inspection_type_clean_site visit',
    'inspection_type_clean_structural',
    'inspection_type_clean_structural inspection',
    'inspection_type_clean_nan'
]


In [None]:
import numpy as np

example = {f: 0.0 for f in FEATURES}

example.update({
    'latitude': 37.77,
    'longitude': -122.42,
    'avg_violations_last_3': 1.0,
    'fail_rate_last_3': 0.2,
    'days_since_last_inspection': 90.0,
    'trend_last_3': -0.3,
    'BusinessName_id': 0.0,
    'Address_id': 0.0,
    'insp_year': 2025.0,
    'insp_month': 12.0,
    'insp_day': 23.0,
    'insp_dow': 2.0,
    'insp_days_since_ref': 365.0,
})

# one‑hot for routine inspection
example['inspection_type_clean_routine'] = 1.0

# convert to array in correct order
X = np.array([example[f] for f in FEATURES], dtype=float).reshape(1, -1)
X.shape


(1, 36)

In [10]:
proba = model.predict_proba(X)[0, 1]  # probability of positive class
label = "HIGH" if proba >= 0.7 else "MEDIUM" if proba >= 0.4 else "LOW"

proba, label


(np.float32(0.2657015), 'LOW')

In [None]:
def predict_from_raw_input(
    latitude, longitude,
    avg_violations_last_3,
    fail_rate_last_3,
    days_since_last_inspection,
    trend_last_3,
    insp_year, insp_month, insp_day, insp_dow, insp_days_since_ref,
    inspection_type_col,  # e.g. 'inspection_type_clean_routine'
):
    ex = {f: 0.0 for f in FEATURES}
    ex.update({
        'latitude': latitude,
        'longitude': longitude,
        'avg_violations_last_3': avg_violations_last_3,
        'fail_rate_last_3': fail_rate_last_3,
        'days_since_last_inspection': days_since_last_inspection,
        'trend_last_3': trend_last_3,
        'BusinessName_id': 0.0,
        'Address_id': 0.0,
        'insp_year': insp_year,
        'insp_month': insp_month,
        'insp_day': insp_day,
        'insp_dow': insp_dow,
        'insp_days_since_ref': insp_days_since_ref,
    })
    if inspection_type_col in FEATURES:
        ex[inspection_type_col] = 1.0
    else:
        ex['inspection_type_clean_nan'] = 1.0

    X = np.array([ex[f] for f in FEATURES], dtype=float).reshape(1, -1)
    p = float(model.predict_proba(X)[0, 1])
    label = "HIGH" if p >= 0.7 else "MEDIUM" if p >= 0.4 else "LOW"
    p_fail = float(probas[1])  # assuming classes_ == [0,1] and 1 == fail
y_hat = int(model.predict(X)[0])
    return p, label


In [12]:
p, label = predict_from_raw_input(
    latitude=37.77,
    longitude=-122.42,
    avg_violations_last_3=10,        # very high
    fail_rate_last_3=1.0,            # 100% recent fails
    days_since_last_inspection=2000, # very long time
    trend_last_3=5.0,                # strongly worsening
    insp_year=2010,
    insp_month=1,
    insp_day=1,
    insp_dow=1,
    insp_days_since_ref=5000,
    inspection_type_col="inspection_type_clean_complaint"
)
p, label


(0.03133456036448479, 'LOW')

In [13]:
import numpy as np

FEATURES = [
    'latitude', 'longitude', 'avg_violations_last_3', 'fail_rate_last_3',
    'days_since_last_inspection', 'trend_last_3',
    'BusinessName_id', 'Address_id',
    'insp_year', 'insp_month', 'insp_day', 'insp_dow', 'insp_days_since_ref',
    'inspection_type_clean_change of ownership',
    'inspection_type_clean_complaint',
    'inspection_type_clean_complaint (i)',
    'inspection_type_clean_complaint (r)',
    'inspection_type_clean_complaint reinspection/follow-up',
    'inspection_type_clean_foodborne illness',
    'inspection_type_clean_foodborne illness investigation',
    'inspection_type_clean_new construction',
    'inspection_type_clean_new ownership',
    'inspection_type_clean_new ownership (i)',
    'inspection_type_clean_new ownership (r)',
    'inspection_type_clean_new ownership - followup',
    'inspection_type_clean_non-inspection site visit',
    'inspection_type_clean_plan check',
    'inspection_type_clean_plan check (i)',
    'inspection_type_clean_plan check (r)',
    'inspection_type_clean_reinspection',
    'inspection_type_clean_reinspection/followup',
    'inspection_type_clean_routine',
    'inspection_type_clean_site visit',
    'inspection_type_clean_structural',
    'inspection_type_clean_structural inspection',
    'inspection_type_clean_nan'
]

def build_feature_row(
    latitude, longitude,
    avg_violations_last_3,
    fail_rate_last_3,
    days_since_last_inspection,
    trend_last_3,
    insp_year, insp_month, insp_day, insp_dow, insp_days_since_ref,
    inspection_type_col  # e.g. 'inspection_type_clean_routine'
):
    row = {f: 0.0 for f in FEATURES}
    row.update({
        'latitude': latitude,
        'longitude': longitude,
        'avg_violations_last_3': avg_violations_last_3,
        'fail_rate_last_3': fail_rate_last_3,
        'days_since_last_inspection': days_since_last_inspection,
        'trend_last_3': trend_last_3,
        'BusinessName_id': 0.0,
        'Address_id': 0.0,
        'insp_year': insp_year,
        'insp_month': insp_month,
        'insp_day': insp_day,
        'insp_dow': insp_dow,
        'insp_days_since_ref': insp_days_since_ref,
    })
    if inspection_type_col in FEATURES:
        row[inspection_type_col] = 1.0
    else:
        row['inspection_type_clean_nan'] = 1.0

    X = np.array([row[f] for f in FEATURES], dtype=float).reshape(1, -1)
    return X


In [14]:
def predict_pass_fail_and_risk(
    latitude, longitude,
    avg_violations_last_3,
    fail_rate_last_3,
    days_since_last_inspection,
    trend_last_3,
    insp_year, insp_month, insp_day, insp_dow, insp_days_since_ref,
    inspection_type_col,
    high_thr=0.7,
    med_thr=0.4,
):
    X = build_feature_row(
        latitude, longitude,
        avg_violations_last_3,
        fail_rate_last_3,
        days_since_last_inspection,
        trend_last_3,
        insp_year, insp_month, insp_day, insp_dow, insp_days_since_ref,
        inspection_type_col,
    )

    # classes_ should be [0, 1] where 1 == fail_flag
    print("classes_:", model.classes_)
    probas = model.predict_proba(X)[0]
    p_fail = float(probas[1])

    y_hat = int(model.predict(X)[0])  # 0 or 1
    outcome_text = "FAIL" if y_hat == 1 else "PASS"

    if p_fail >= high_thr:
        risk_band = "HIGH"
    elif p_fail >= med_thr:
        risk_band = "MEDIUM"
    else:
        risk_band = "LOW"

    return {
        "y_pred_flag": y_hat,        # 0 / 1, same as fail_flag
        "outcome_text": outcome_text, # PASS / FAIL
        "p_fail": p_fail,             # probability of failure
        "risk_band": risk_band        # LOW / MEDIUM / HIGH
    }


In [15]:
result = predict_pass_fail_and_risk(
    latitude=37.77,
    longitude=-122.42,
    avg_violations_last_3=1.0,
    fail_rate_last_3=0.2,
    days_since_last_inspection=90,
    trend_last_3=-0.3,
    insp_year=2025,
    insp_month=12,
    insp_day=23,
    insp_dow=2,
    insp_days_since_ref=365,
    inspection_type_col="inspection_type_clean_routine",
)

result


classes_: [0 1]


{'y_pred_flag': 0,
 'outcome_text': 'PASS',
 'p_fail': 0.2657015025615692,
 'risk_band': 'LOW'}

In [16]:
result_bad = predict_pass_fail_and_risk(
    latitude=37.77,
    longitude=-122.42,

    # very bad recent history
    avg_violations_last_3=15,          # many violations
    fail_rate_last_3=1.0,              # 100% of last 3 failed
    days_since_last_inspection=3000,   # not inspected for years
    trend_last_3=5.0,                  # strongly worsening

    # old inspection date, long ref gap
    insp_year=2010,
    insp_month=1,
    insp_day=1,
    insp_dow=1,
    insp_days_since_ref=5000,

    # high‑risk context (e.g. complaint)
    inspection_type_col="inspection_type_clean_complaint reinspection/follow-up",
)

result_bad


classes_: [0 1]


{'y_pred_flag': 0,
 'outcome_text': 'PASS',
 'p_fail': 0.03133456036448479,
 'risk_band': 'LOW'}

In [29]:
file='C:/Users/lkneh/HealthScore-Predictor/data/clean/encoded/HealthInspectionsEncoded.csv'
df = pd.read_csv(file)

In [30]:
row = df[df["failFlag"] == 1].iloc
row


<pandas.core.indexing._iLocIndexer at 0x2027024de50>

In [31]:
def build_feature_row_from_raw(
    latitude, longitude,
    avg_violations_last_3,
    fail_rate_last_3,
    days_since_last_inspection,
    trend_last_3,
    insp_year, insp_month, insp_day, insp_dow, insp_days_since_ref,
    inspection_type_col,
):
    row = {f: 0.0 for f in FEATURES}
    row.update({
        "latitude": latitude,
        "longitude": longitude,
        "avg_violations_last_3": avg_violations_last_3,
        "fail_rate_last_3": fail_rate_last_3,
        "days_since_last_inspection": days_since_last_inspection,
        "trend_last_3": trend_last_3,
        "BusinessName_id": 0.0,
        "Address_id": 0.0,
        "insp_year": insp_year,
        "insp_month": insp_month,
        "insp_day": insp_day,
        "insp_dow": insp_dow,
        "insp_days_since_ref": insp_days_since_ref,
    })
    if inspection_type_col in FEATURES:
        row[inspection_type_col] = 1.0
    else:
        row["inspection_type_clean_nan"] = 1.0

    import numpy as np
    X = np.array([row[f] for f in FEATURES], dtype=float).reshape(1, -1)
    return X


In [None]:
def predict_pass_fail_and_risk_from_raw(
    latitude, longitude,
    avg_violations_last_3,
    fail_rate_last_3,
    days_since_last_inspection,
    trend_last_3,
    insp_year, insp_month, insp_day, insp_dow, insp_days_since_ref,
    inspection_type_col
):
    X = build_feature_row_from_raw(
        latitude=latitude,
        longitude=longitude,
        avg_violations_last_3=avg_violations_last_3,
        fail_rate_last_3=fail_rate_last_3,
        days_since_last_inspection=days_since_last_inspection,
        trend_last_3=trend_last_3,
        insp_year=insp_year,
        insp_month=insp_month,
        insp_day=insp_day,
        insp_dow=insp_dow,
        insp_days_since_ref=insp_days_since_ref,
        inspection_type_col=inspection_type_col
    )
    probas = model.predict_proba(X)[0]
    p_fail = float(probas[1])        # since classes_ = [0,1]
    y_hat = int(model.predict(X)[0]) # 0 or 1

    

    outcome_text = "FAIL" if y_hat == 1 else "PASS"
    if p_fail >= 0.0 and p_fail < 0.6:
        risk_band = "HIGH"
    # elif p_fail >= 0.1* 0.5:
    #     risk_band = "MEDIUM"
    else:
        risk_band = "LOW"

    return {
        "y_pred_flag": y_hat,
        "outcome_text": outcome_text,
        "p_fail": p_fail,
        "risk_band": risk_band,
    }


In [37]:
row = df[df["failFlag"] == 1].iloc[0]
# print(row)
insp_type_col = next(
    c for c in FEATURES
    if c.startswith("inspection_type_clean_") and row[c] == 1
)

res = predict_pass_fail_and_risk_from_raw(
    latitude=row["latitude"],
    longitude=row["longitude"],
    avg_violations_last_3=row["avg_violations_last_3"],
    fail_rate_last_3=row["fail_rate_last_3"],
    days_since_last_inspection=row["days_since_last_inspection"],
    trend_last_3=row["trend_last_3"],
    insp_year=row["insp_year"],
    insp_month=row["insp_month"],
    insp_day=row["insp_day"],
    insp_dow=row["insp_dow"],
    insp_days_since_ref=row["insp_days_since_ref"],
    inspection_type_col=insp_type_col,
)
res


{'y_pred_flag': 0,
 'outcome_text': 'PASS',
 'p_fail': 0.1039000079035759,
 'risk_band': 'HIGH'}

In [38]:

result = predict_pass_fail_and_risk(
    latitude=37.77,
    longitude=-122.42,
    avg_violations_last_3=1.0,
    fail_rate_last_3=0.2,
    days_since_last_inspection=90,
    trend_last_3=-0.3,
    insp_year=2025,
    insp_month=12,
    insp_day=23,
    insp_dow=2,
    insp_days_since_ref=365,
    inspection_type_col="inspection_type_clean_routine",
)

result

classes_: [0 1]


{'y_pred_flag': 0,
 'outcome_text': 'PASS',
 'p_fail': 0.2657015025615692,
 'risk_band': 'LOW'}