# DEPRECATED - TESTING

In [35]:
import os, time, json, joblib, math, warnings, random
import numpy as np
import pandas as pd
from pathlib import Path

# Silence the two noisiest warning categories so cell output stays readable.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# --- Repo paths ---
NB_DIR = Path.cwd()
REPO_ROOT = NB_DIR.parent  # assumes notebook lives in repo/notebooks/
# ARTIFACTS_DIR can be redirected through the environment; defaults to <repo>/artifacts.
ARTIFACTS_DIR = Path(os.getenv("ARTIFACTS_DIR", str(REPO_ROOT / "artifacts")))
DATA_PARQUET = ARTIFACTS_DIR / "turn_level_dataset.parquet"
DATA_CSV     = ARTIFACTS_DIR / "turn_level_dataset.csv"
MLRUNS_DIR   = REPO_ROOT / "mlruns"

# Make src/ importable
import sys
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

# MLflow
import mlflow, mlflow.sklearn
# Path.as_uri() yields a well-formed, percent-encoded file:// URI on every platform.
# Gluing "file:" onto str(path) produces an invalid URI for Windows drive paths and
# leaves special characters unescaped. REPO_ROOT derives from Path.cwd(), so it is
# absolute, which as_uri() requires.
mlflow.set_tracking_uri(MLRUNS_DIR.as_uri())
mlflow.set_experiment("escalation-detector-semantic-ml")

print("Repo root:", REPO_ROOT)
print("Artifacts dir:", ARTIFACTS_DIR)
print("MLflow dir:", MLRUNS_DIR)


2025/09/21 23:56:35 INFO mlflow.tracking.fluent: Experiment with name 'escalation-detector-semantic-ml' does not exist. Creating a new experiment.


Repo root: /Users/mukeshsingh/Desktop/sumup
Artifacts dir: /Users/mukeshsingh/Desktop/sumup/artifacts
MLflow dir: /Users/mukeshsingh/Desktop/sumup/mlruns


In [36]:
# Columns that must be present in the turn-level dataset; load_turn_dataset()
# raises ValueError when any of them is absent.
EXPECTED_COLS = {
    "conversation_id",
    "turn_global_idx",
    "user_turn_idx",
    "user_text",
    "prev_bot_text",
    "y",
    "is_escalation_needed_convo",
}

def load_turn_dataset():
    """Load the turn-level dataset and cast its known columns to fixed dtypes.

    ``DATA_PARQUET`` is tried first and ``DATA_CSV`` second; the first file that
    exists is read and its path is printed.

    Returns:
        A new DataFrame in which the columns listed in ``EXPECTED_COLS`` carry
        normalized dtypes. Any additional columns are passed through unchanged.

    Raises:
        FileNotFoundError: Neither the parquet nor the CSV file exists.
        ValueError: One or more columns from ``EXPECTED_COLS`` are missing.
    """
    readers = ((DATA_PARQUET, pd.read_parquet), (DATA_CSV, pd.read_csv))
    for src, read in readers:
        if src.exists():
            df = read(src)
            break
    else:
        raise FileNotFoundError(
            f"Turn-level dataset not found. Expected {DATA_PARQUET} or {DATA_CSV}."
        )

    missing = EXPECTED_COLS - set(df.columns)
    if missing:
        raise ValueError(f"Dataset is missing expected columns: {missing}")

    # Normalize dtypes in one pass; astype returns a new frame, so the frame
    # returned by the reader is left untouched.
    df = df.astype({
        "conversation_id": "string",
        "turn_global_idx": "int32",
        "user_turn_idx": "int32",
        "user_text": "string",
        "prev_bot_text": "string",
        "y": "int8",
        "is_escalation_needed_convo": "int8",
    })
    print("Loaded:", src)
    return df

# Load the dataset once for the rest of the notebook and print a quick sanity
# summary: frame shape, number of distinct conversations, and the share of
# positive turn labels (mean of y, rounded to 3 decimals).
df = load_turn_dataset()
print("Shape:", df.shape, "Conversations:", df["conversation_id"].nunique(), "Pos rate:", round(float(df["y"].mean()),3))
display(df.head(8))


Loaded: /Users/mukeshsingh/Desktop/sumup/artifacts/turn_level_dataset.parquet
Shape: (61, 8) Conversations: 20 Pos rate: 0.197


Unnamed: 0,conversation_id,turn_global_idx,user_turn_idx,user_text,prev_bot_text,y,y_reason,is_escalation_needed_convo
0,c001,1,0,I have tried to verify id for another sum up a...,Hello there. If you’d like to return to your p...,0,,1
1,c001,3,1,So I resent and haven’t heard anything back,"Okay, it seems there might be an issue with th...",0,,1
2,c001,5,2,It’s been 2 weeks,"Okay, verification may take a couple of days. ...",1,fallback_last_user,1
3,c002,1,0,I’m not being sent a code to verify my device ...,Hello there! I'm here to assist you with anyth...,0,,1
4,c002,3,1,I’ve done that,**Didn't get the code?**\n\n1. Tap **Resend co...,0,,1
5,c002,5,2,Yes all of that is good,"Okay, let's try another approach. Ensure your ...",0,,1
6,c002,7,3,Where will I find that,"Got it. As a next step, try enabling **Push no...",1,fallback_last_user,1
7,c003,1,0,Hi I'm having issues with reports since the re...,Hello! If you'd like to return to our previous...,0,,1


In [37]:
# Load policy (repo root preferred) or fallback to minimal defaults.
import yaml

def load_policy():
    """Return the escalation policy as a dict.

    Lookup order: ``REPO_ROOT / "policy.yaml"``, then
    ``ARTIFACTS_DIR / "policy.yaml"``. The first existing file is parsed with
    ``yaml.safe_load``; an empty document yields ``{}``. When neither file
    exists, a minimal built-in default policy is returned.
    """
    for candidate in (REPO_ROOT / "policy.yaml", ARTIFACTS_DIR / "policy.yaml"):
        if candidate.exists():
            with open(candidate, "r", encoding="utf-8") as f:
                return yaml.safe_load(f) or {}

    # Minimal defaults (rules for risk & explicit human)
    explicit_human_request = {
        "enabled": True,
        "patterns": [r"\b(human|agent|real person|talk to (?:a )?human|speak to (?:a )?human|customer service|support agent)\b"],
    }
    risk_terms = {
        "enabled": True,
        "patterns": ["kyc", "blocked", "chargeback", "legal", "id verification"],
    }
    bot_unhelpful_templates = {
        "enabled": True,
        "patterns": [
            "could you provide more details",
            "we could not find the information",
            "check your spam folder",
            "ensure your documents are clear and valid",
        ],
    }
    return {
        "version": "policy@notebook-default",
        "guards": {"min_turn_before_model": 1},
        "rules": {
            "explicit_human_request": explicit_human_request,
            "risk_terms": risk_terms,
            "bot_unhelpful_templates": bot_unhelpful_templates,
        },
    }

policy = load_policy()
print("Policy version:", policy.get("version"))

# Best-effort warm-up of the semantic detector: any failure (missing package,
# model load error, ...) is reported and the notebook carries on.
# Training continues even if embeddings aren't available; features fall back to zeros.
try:
    from src.semantic_detection import get_semantic_detector
    _ = get_semantic_detector()
except Exception as exc:
    print("Semantic embeddings unavailable, proceeding with zeros:", exc)
else:
    print("Semantic embeddings: OK")


Policy version: policy@prod-1
Semantic embeddings: OK


In [38]:
from src.features import featurize_one

# Canonical feature list, in column order. build_design_matrix() uses it for the
# columns of X, and the export cell writes it to feature_order.json so the
# same order can be reloaded alongside the model.
FEATURE_ORDER = [
    # turn position and surface statistics of the user message
    "turn_idx",
    "user_caps_ratio",
    "exclam_count",
    "msg_len",
    # rule-style flags
    "bot_unhelpful",
    "user_requests_human",
    "risk_terms",
    # rolling conversation counters
    "no_progress_count",
    "bot_repeat_count",
    # semantic similarity scores
    "semantic_frustration_score",
    "semantic_unhelpful_score",
    "semantic_human_request_score",
]

def build_design_matrix(df_in: pd.DataFrame, policy: dict) -> "tuple[pd.DataFrame, np.ndarray, pd.DataFrame]":
    """Replay each conversation through ``featurize_one`` and assemble model inputs.

    Conversations are processed independently: the rolling ``state`` dict is
    re-initialised per ``conversation_id`` and turns are replayed in
    ``(user_turn_idx, turn_global_idx)`` order. The turn index passed to
    ``featurize_one`` is the running position within the conversation, not the
    raw ``user_turn_idx`` value stored in the file.

    Row ``i`` of ``X``, ``y[i]`` and ``meta.iloc[i]`` all describe row ``i`` of
    ``df_in``. Each feature vector is written back to the position of the row
    it was computed from, so the three outputs stay aligned even when
    ``df_in`` is not pre-sorted or its conversations are interleaved. Appending
    rows in processing order would silently pair features with the wrong
    labels in that case.

    Args:
        df_in: Turn-level frame with at least ``conversation_id``,
            ``user_turn_idx``, ``turn_global_idx``, ``user_text``,
            ``prev_bot_text`` and ``y``.
        policy: Policy dict forwarded unchanged to ``featurize_one``.

    Returns:
        ``(X, y, meta)``: the feature frame with columns ``FEATURE_ORDER``, the
        integer label array, and a frame holding ``conversation_id``,
        ``user_turn_idx`` and ``y`` on a fresh 0..n-1 index.
    """
    # A positional index lets the labels yielded by iterrows() double as row offsets.
    frame = df_in.reset_index(drop=True)
    meta = frame[["conversation_id", "user_turn_idx", "y"]].copy()

    feature_rows = [None] * len(frame)
    # dropna=False keeps rows whose conversation_id is missing, so X never ends
    # up shorter than y/meta.
    for _cid, block in frame.groupby("conversation_id", sort=False, dropna=False):
        # rolling state per conversation must be reset
        state = {"user_turn_idx": 0, "no_progress_count": 0.0, "bot_repeat_count": 0.0, "prev_bot_text": ""}
        block_sorted = block.sort_values(["user_turn_idx", "turn_global_idx"])
        for pos, r in block_sorted.iterrows():
            # user turn index must reflect position, not raw value from file (to mirror runtime counter)
            user_idx = int(state.get("user_turn_idx", 0))
            # NOTE(review): str() turns a missing text value into a literal such as "<NA>";
            # confirm whether featurize_one should receive "" for missing text instead.
            row_df, state = featurize_one(user_idx, str(r["user_text"]), str(r["prev_bot_text"]), state, policy, FEATURE_ORDER)
            feature_rows[pos] = row_df.values[0]
            # increment when we actually consumed a user message
            state["user_turn_idx"] = user_idx + 1

    X = pd.DataFrame(feature_rows, columns=FEATURE_ORDER)
    y = frame["y"].astype(int).values
    return X, y, meta

# Featurize the full dataset with the loaded policy, then print the matrix shape
# and the label mean (rounded to 3 decimals) as a sanity check.
X_all, y_all, meta_all = build_design_matrix(df, policy)
print("X_all shape:", X_all.shape, "y mean:", round(float(y_all.mean()),3))
display(X_all.head(8))


X_all shape: (61, 12) y mean: 0.197


Unnamed: 0,turn_idx,user_caps_ratio,exclam_count,msg_len,bot_unhelpful,user_requests_human,risk_terms,no_progress_count,bot_repeat_count,semantic_frustration_score,semantic_unhelpful_score,semantic_human_request_score
0,0.0,0.030303,0.0,250.0,0.0,0.0,0.0,0.0,0.0,0.260941,0.188849,0.170362
1,1.0,0.057143,0.0,43.0,0.0,0.0,0.0,0.0,0.0,0.187796,0.156374,0.150503
2,2.0,0.083333,0.0,17.0,1.0,0.0,0.0,0.0,0.0,0.440087,0.224596,0.111789
3,0.0,0.038462,0.0,99.0,0.0,0.0,0.0,0.0,0.0,0.269645,0.357625,0.175918
4,1.0,0.090909,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.487891,0.261046,0.120573
5,2.0,0.055556,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.117935,0.10455,0.075511
6,3.0,0.111111,0.0,22.0,0.0,0.0,0.0,0.0,0.0,0.26305,0.120984,0.193333
7,0.0,0.027027,0.0,89.0,0.0,0.0,0.0,0.0,0.0,0.162414,0.160482,0.084144


In [41]:
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve

def early_escalation_at_first(df_meta: pd.DataFrame, y_true: np.ndarray, proba: np.ndarray, tau: float) -> float:
    """Share of positive conversations escalated at or before their first positive turn.

    A conversation is positive when its ``y`` column is not all zero. It counts
    as a hit when any turn with ``user_turn_idx`` less than or equal to that of
    the first ``y == 1`` turn has ``proba >= tau``.

    Args:
        df_meta: Frame with ``conversation_id``, ``user_turn_idx`` and ``y``,
            row-aligned with ``proba``.
        y_true: Unused; accepted so all conversation metrics share one signature.
        proba: Predicted positive-class probabilities, one per row of ``df_meta``.
        tau: Decision threshold.

    Returns:
        hits / positive conversations, or ``np.nan`` when there are none.
    """
    scored = df_meta.assign(proba=proba, pred=(proba >= tau).astype(int))
    hits = 0
    n_positive = 0
    for _, convo in scored.groupby("conversation_id"):
        if convo["y"].max() == 0:
            continue  # negative conversation: outside this metric's scope
        n_positive += 1
        onset = int(convo.loc[convo["y"] == 1, "user_turn_idx"].min())
        window = convo.loc[convo["user_turn_idx"] <= onset, "pred"]
        if len(window) > 0 and int(window.max()) == 1:
            hits += 1
    return (hits / n_positive) if n_positive else np.nan

def premature_posconv(df_meta: pd.DataFrame, y_true: np.ndarray, proba: np.ndarray, tau: float) -> float:
    """Share of positive conversations escalated strictly before the first positive turn.

    For each conversation whose ``y`` column is not all zero, only turns with
    ``user_turn_idx`` smaller than that of the first ``y == 1`` turn are
    inspected. The conversation counts as premature when any of them has
    ``proba >= tau``.

    Args:
        df_meta: Frame with ``conversation_id``, ``user_turn_idx`` and ``y``,
            row-aligned with ``proba``.
        y_true: Unused; accepted so all conversation metrics share one signature.
        proba: Predicted positive-class probabilities, one per row of ``df_meta``.
        tau: Decision threshold.

    Returns:
        premature conversations / positive conversations, or ``np.nan`` when
        there are no positive conversations.
    """
    scored = df_meta.assign(proba=proba, pred=(proba >= tau).astype(int))
    n_premature = 0
    n_positive = 0
    for _, convo in scored.groupby("conversation_id"):
        if convo["y"].max() == 0:
            continue
        n_positive += 1
        onset = int(convo.loc[convo["y"] == 1, "user_turn_idx"].min())
        earlier = convo.loc[convo["user_turn_idx"] < onset, "pred"]
        # An empty slice means the first turn is already positive: nothing can be premature.
        if len(earlier) > 0 and int(earlier.max()) == 1:
            n_premature += 1
    return (n_premature / n_positive) if n_positive else np.nan

def false_alarm_negconv(df_meta: pd.DataFrame, y_true: np.ndarray, proba: np.ndarray, tau: float) -> float:
    """Share of negative conversations in which any turn is predicted positive.

    Conversations whose maximum ``y`` equals 1 are skipped. Each remaining
    conversation counts as a false alarm when at least one of its turns has
    ``proba >= tau``.

    Args:
        df_meta: Frame with ``conversation_id``, ``user_turn_idx`` and ``y``,
            row-aligned with ``proba``.
        y_true: Unused; accepted so all conversation metrics share one signature.
        proba: Predicted positive-class probabilities, one per row of ``df_meta``.
        tau: Decision threshold.

    Returns:
        false-alarm conversations / negative conversations, or ``np.nan`` when
        there are no negative conversations.
    """
    scored = df_meta.assign(proba=proba, pred=(proba >= tau).astype(int))
    n_negative = 0
    n_alarmed = 0
    for _, convo in scored.groupby("conversation_id"):
        if convo["y"].max() == 1:
            continue  # positive conversation: outside this metric's scope
        n_negative += 1
        if convo["pred"].max() == 1:
            n_alarmed += 1
    return (n_alarmed / n_negative) if n_negative else np.nan

def tte_mean(df_meta: pd.DataFrame, y_true: np.ndarray, proba: np.ndarray, tau: float) -> float:
    """Mean offset, in user turns, between the first escalation and the first positive turn.

    For every positive conversation that fires at least once, the offset is
    ``first_fire - first_pos``, where ``first_fire`` is the smallest
    ``user_turn_idx`` with ``proba >= tau`` and ``first_pos`` the smallest
    ``user_turn_idx`` with ``y == 1``.

    Sign convention:
        * negative -> the model fired BEFORE the first positive turn (early)
        * zero     -> the model fired exactly at the first positive turn
        * positive -> the model fired AFTER the first positive turn (late)

    Positive conversations in which the model never fires contribute nothing to
    the mean, so this value says nothing about missed escalations.

    Args:
        df_meta: Frame with ``conversation_id``, ``user_turn_idx`` and ``y``,
            row-aligned with ``proba``.
        y_true: Unused; accepted so all conversation metrics share one signature.
        proba: Predicted positive-class probabilities, one per row of ``df_meta``.
        tau: Decision threshold.

    Returns:
        The mean offset as a float, or ``np.nan`` when no positive conversation fired.
    """
    scored = df_meta.assign(proba=proba, pred=(proba >= tau).astype(int))
    offsets = []
    for _, convo in scored.groupby("conversation_id"):
        if convo["y"].max() == 0:
            continue
        first_pos = int(convo.loc[convo["y"] == 1, "user_turn_idx"].min())
        fired_idxs = convo.loc[convo["pred"] == 1, "user_turn_idx"]
        if fired_idxs.empty:
            continue  # never fired: excluded from the mean
        offsets.append(int(fired_idxs.min()) - first_pos)
    return float(np.mean(offsets)) if offsets else np.nan

def eval_at_tau(df_meta, y_true, proba, tau):
    """Collect ranking metrics and conversation-level metrics for one threshold.

    ``roc_auc`` and ``pr_auc`` are computed from ``y_true`` and ``proba`` and do
    not depend on ``tau``. The four conversation-level metrics are evaluated at
    ``tau``.

    Returns:
        dict with keys ``roc_auc``, ``pr_auc``, ``early_at_first``,
        ``premature_posconv``, ``false_alarm_negconv``, ``tte_mean`` and
        ``tau`` (in that order).
    """
    report = {
        "roc_auc": roc_auc_score(y_true, proba),
        "pr_auc": average_precision_score(y_true, proba),
    }
    conversation_metrics = (
        ("early_at_first", early_escalation_at_first),
        ("premature_posconv", premature_posconv),
        ("false_alarm_negconv", false_alarm_negconv),
        ("tte_mean", tte_mean),
    )
    for metric_name, metric_fn in conversation_metrics:
        report[metric_name] = metric_fn(df_meta, y_true, proba, tau)
    report["tau"] = float(tau)
    return report

def choose_tau_constrained(df_meta, y_true, proba, cap_premature=0.20):
    """Pick a decision threshold subject to a cap on premature escalations.

    Every threshold on a 0.01..0.99 grid (step 0.01) is scored with
    ``eval_at_tau``. Thresholds whose ``premature_posconv`` is NaN or above
    ``cap_premature`` are discarded. Among the rest, the winner is the one with
    the lexicographically largest ``(early_at_first, pr_auc, roc_auc)``; on
    exact ties the lowest threshold is kept, because only a strictly better key
    replaces the incumbent.

    If no grid point satisfies the cap, the threshold maximising F1 on the
    precision-recall curve is used instead.

    Args:
        df_meta: Frame with ``conversation_id``, ``user_turn_idx`` and ``y``,
            row-aligned with ``proba``.
        y_true: Binary turn labels.
        proba: Predicted positive-class probabilities.
        cap_premature: Largest acceptable ``premature_posconv``.

    Returns:
        ``(tau, table)``: the chosen threshold as a float, and a DataFrame with
        one ``eval_at_tau`` row per grid point, sorted by ``tau``.
    """
    def _rank_key(metrics):
        # NaN compares False against everything, so map it to -inf to keep the
        # tuple ordering total ("NaN never wins").
        return tuple(
            -math.inf if np.isnan(metrics[name]) else float(metrics[name])
            for name in ("early_at_first", "pr_auc", "roc_auc")
        )

    rows = []
    best_tau, best_key = None, None
    for tau in np.linspace(0.01, 0.99, 99):
        m = eval_at_tau(df_meta, y_true, proba, tau)
        rows.append(m)
        premature = m["premature_posconv"]
        if premature is None or np.isnan(premature) or premature > cap_premature:
            continue
        key = _rank_key(m)
        # Plain tuple comparison is lexicographic. An elementwise ">" combined
        # with .any() would accept a candidate that is worse on the primary
        # metric but better on a tie-breaker.
        if best_key is None or key > best_key:
            best_tau, best_key = tau, key
    table = pd.DataFrame(rows)

    if best_tau is None:
        # fallback: pick F1-like
        prec, rec, thr = precision_recall_curve(y_true, proba)
        if len(thr):
            f1 = (2 * prec * rec) / (prec + rec + 1e-9)
            idx = int(np.nanargmax(f1))
            # prec[i]/rec[i] belong to thr[i]; only the final (precision=1, recall=0)
            # point has no threshold, so clamp instead of shifting by one.
            best_tau = float(thr[min(idx, len(thr) - 1)])
        else:
            best_tau = 0.5
    return float(best_tau), table.sort_values("tau").reset_index(drop=True)


In [42]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingClassifier

# optional xgboost (we fail closed if not available)
try:
    import xgboost as xgb
    XGB_OK = True
except Exception:
    XGB_OK = False

def train_eval_log(name, estimator, Xtr, ytr, Xte, yte, df_te, calibrate=True, params=None):
    """Fit one candidate, choose its threshold, and record the run in MLflow.

    Steps, all inside a single MLflow run named ``<name>_<unix time>``:

    1. Log the estimator's shallow params merged with ``params``.
    2. Fit. With ``calibrate=True`` the estimator is wrapped in a 3-fold sigmoid
       ``CalibratedClassifierCV``. If that raises, the failure is printed and
       the estimator is fitted directly, then calibrated with ``cv="prefit"``
       on the same training data.
    3. Score ``Xte``, choose a threshold with ``choose_tau_constrained``
       (premature cap 0.20) and evaluate at that threshold.
    4. Log metrics and the model, and print a one-line summary.

    Args:
        name: Label used for the run name, the ``model`` param and printed output.
        estimator: Unfitted scikit-learn compatible classifier.
        Xtr, ytr: Training features and labels.
        Xte, yte: Evaluation features and labels.
        df_te: Conversation metadata aligned row-for-row with ``Xte``.
        calibrate: Whether to wrap the estimator in probability calibration.
        params: Extra key/values logged as MLflow params.

    Returns:
        ``(clf, tau_base, m, table)``: the fitted (possibly calibrated) model,
        the chosen threshold, the metrics dict at that threshold, and the full
        threshold table.
    """
    params = params or {}
    run_name = f"{name}_{int(time.time())}"
    with mlflow.start_run(run_name=run_name):
        # estimator params
        try:
            est_params = estimator.get_params(deep=False)
        except Exception:
            est_params = {}
        mlflow.log_params({"model": name, **est_params, **params})

        # fit + calibrate
        calibration_mode = "none"
        if calibrate:
            try:
                clf = CalibratedClassifierCV(estimator, cv=3, method="sigmoid")
                clf.fit(Xtr, ytr)
                calibration_mode = "cv3_sigmoid"
            except Exception as e:
                # Say why cross-validated calibration failed rather than hiding it:
                # the fallback fits the calibrator on the very data the estimator was
                # trained on, so its probabilities deserve less trust.
                print(f"[{name}] 3-fold calibration failed ({type(e).__name__}: {e}); "
                      f"falling back to prefit calibration on the training data.")
                estimator.fit(Xtr, ytr)
                # NOTE(review): cv="prefit" appears to be deprecated in recent
                # scikit-learn releases — confirm against the pinned version.
                clf = CalibratedClassifierCV(estimator, cv="prefit", method="sigmoid")
                clf.fit(Xtr, ytr)
                calibration_mode = "prefit_on_train"
        else:
            estimator.fit(Xtr, ytr)
            clf = estimator
        # Record which path produced this model so runs can be compared fairly.
        mlflow.set_tag("calibration_mode", calibration_mode)

        proba = clf.predict_proba(Xte)[:,1]
        roc = roc_auc_score(yte, proba)
        pr  = average_precision_score(yte, proba)
        tau_base, table = choose_tau_constrained(df_te, yte, proba, cap_premature=0.20)
        m = eval_at_tau(df_te, yte, proba, tau_base)

        # log metrics
        mlflow.log_metrics({
            "roc_auc": float(roc),
            "pr_auc": float(pr),
            "tau_base": float(tau_base),
            "early_at_first": float(m["early_at_first"]),
            "premature_posconv": float(m["premature_posconv"]),
            "false_alarm_negconv": float(m["false_alarm_negconv"]),
            "tte_mean": float(m["tte_mean"]),
        })

        # signature + input example
        try:
            from mlflow.models.signature import infer_signature
            sig = infer_signature(Xtr, clf.predict_proba(Xtr)[:,1])
            mlflow.sklearn.log_model(clf, artifact_path="model", signature=sig, input_example=Xtr.head(5))
        except Exception as e:
            # Still log the model, but make it visible that signature/example were dropped.
            print(f"[{name}] logging model without signature/input example ({type(e).__name__}: {e})")
            mlflow.sklearn.log_model(clf, artifact_path="model")

        print(f"[{name}] ROC-AUC={roc:.3f} PR-AUC={pr:.3f} early@first={m['early_at_first']:.3f} "
              f"premature={m['premature_posconv']:.3f} Fneg={m['false_alarm_negconv']:.3f} tau={tau_base:.3f}")

        return clf, tau_base, m, table

# Candidate models as (name, estimator, calibrate, extra MLflow params) tuples,
# in the order they will be trained.
candidates = [
    (
        "logreg_calibrated",
        LogisticRegression(max_iter=2000, class_weight="balanced"),
        True,
        {"notes":"interpretable baseline"},
    ),
    (
        "histgb",
        HistGradientBoostingClassifier(random_state=42, learning_rate=0.12, max_depth=None),
        True,
        {"notes":"tree boosting"},
    ),
    (
        "random_forest",
        RandomForestClassifier(n_estimators=400, max_depth=None, class_weight="balanced_subsample", n_jobs=-1, random_state=42),
        True,
        {"notes":"rf baseline"},
    ),
]

# XGBoost is optional: it joins the list only when its import succeeded.
if not XGB_OK:
    print("XGBoost not available; skipping.")
else:
    candidates.append((
        "xgboost",
        xgb.XGBClassifier(
            n_estimators=500, max_depth=5, learning_rate=0.06, subsample=0.9, colsample_bytree=0.9,
            reg_lambda=1.0, objective="binary:logistic", eval_metric="logloss", tree_method="hist",
            n_jobs=-1, random_state=42,
        ),
        True,
        {"notes":"xgb strong tabular"},
    ))

# Train every candidate and keep the best one.
# NOTE(review): X_train, y_train, X_test, y_test and df_test are not defined in any
# cell visible in this notebook (execution counts jump from 38 to 41) — confirm
# where the train/test split is created before relying on Restart & Run All.
results = []
best = None
best_key = None

for name, est, calibrate, params in candidates:
    clf, tau_b, metrics, table = train_eval_log(name, est, X_train, y_train, X_test, y_test, df_test, calibrate, params)
    results.append({"model": name, "tau": tau_b, **metrics})

    # prioritize early@first, break ties with PR-AUC then ROC
    key = (metrics["early_at_first"], metrics["pr_auc"], metrics["roc_auc"])
    is_better = (
        best is None
        or key > best_key
        # a NaN incumbent is always displaced by a candidate with a real score
        or (np.isnan(best_key[0]) and not np.isnan(key[0]))
    )
    if is_better:
        best, best_key = (clf, tau_b, name, metrics, table), key

print("\nBest by (early@first, PR, ROC):", best[2], "tau_base:", round(best[1],3))
df_results = pd.DataFrame(results).sort_values(
    ["early_at_first","pr_auc","roc_auc"],
    ascending=False,
)
display(df_results)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[logreg_calibrated] ROC-AUC=0.683 PR-AUC=0.594 early@first=0.500 premature=0.000 Fneg=0.500 tau=0.460


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[histgb] ROC-AUC=0.500 PR-AUC=0.211 early@first=0.000 premature=0.000 Fneg=0.000 tau=0.220


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[random_forest] ROC-AUC=0.700 PR-AUC=0.537 early@first=0.250 premature=0.000 Fneg=0.000 tau=0.510


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[xgboost] ROC-AUC=0.425 PR-AUC=0.287 early@first=0.000 premature=0.000 Fneg=0.000 tau=0.500

Best by (early@first, PR, ROC): logreg_calibrated tau_base: 0.46


Unnamed: 0,model,tau,roc_auc,pr_auc,early_at_first,premature_posconv,false_alarm_negconv,tte_mean
0,logreg_calibrated,0.46,0.683333,0.594298,0.5,0.0,0.5,0.0
2,random_forest,0.51,0.7,0.5375,0.25,0.0,0.0,0.0
3,xgboost,0.5,0.425,0.287007,0.0,0.0,0.0,
1,histgb,0.22,0.5,0.210526,0.0,0.0,0.0,


In [43]:
# Unpack the winning candidate chosen by the selection loop.
best_clf, tau_base, best_name, best_metrics, best_table = best

# Suggest tau_high: instant escalate for very confident cases
# NOTE(review): the 0.95 quantile is taken over best_table["tau"], i.e. over the
# threshold grid itself rather than over predicted probabilities — confirm this
# is the intended statistic.
tau_hi = float(
    np.clip(
        max(np.quantile(best_table["tau"].values, 0.95), tau_base + 0.20),
        0.50,
        0.99,
    )
)

ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

# Save model
model_path = ARTIFACTS_DIR / "model.joblib"
joblib.dump(best_clf, model_path)

# Save feature order
feat_path = ARTIFACTS_DIR / "feature_order.json"
feat_path.write_text(json.dumps(FEATURE_ORDER, indent=2), encoding="utf-8")

# Build version file (key=value lines, timestamp in UTC)
version_txt = ARTIFACTS_DIR / "version.txt"
now = time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime())
version_txt.write_text(
    "".join([
        f"model_type={best_name}\n",
        f"training_date={now}\n",
        f"features={len(FEATURE_ORDER)}\n",
        f"test_auc={best_metrics['roc_auc']:.3f}\n",
        f"pr_auc={best_metrics['pr_auc']:.3f}\n",
        f"threshold={tau_base}\n",
        f"tau_high={tau_hi}\n",
    ]),
    encoding="utf-8",
)

# Write policy snapshot specific to artifacts (does not overwrite repo policy.yaml)
policy_snapshot = {
    "version": f"policy@snapshot:{now}",
    "thresholds": {
        "tau_low": float(tau_base),
        "tau_high": float(tau_hi),
    },
    "guards": {
        "min_turn_before_model": int((policy.get("guards") or {}).get("min_turn_before_model", 1)),
    },
    "rules": (policy.get("rules") or {}),
}
import yaml
with open(ARTIFACTS_DIR / "policy.yaml", "w", encoding="utf-8") as f:
    yaml.safe_dump(policy_snapshot, f, sort_keys=False)

print("Exported:")
for _exported in (model_path, feat_path, version_txt, ARTIFACTS_DIR / "policy.yaml"):
    print(" -", _exported)

# Show a slice of the threshold table around tau_base
nearest = best_table.iloc[(best_table["tau"] - tau_base).abs().argmin()]
print(f"Chosen tau_base={tau_base:.3f}  early@first={nearest['early_at_first']:.3f}  "
      f"premature={nearest['premature_posconv']:.3f}  false_alarm={nearest['false_alarm_negconv']:.3f}  "
      f"tte_mean={nearest['tte_mean']:.3f}")
display(best_table.head(10))


Exported:
 - /Users/mukeshsingh/Desktop/sumup/artifacts/model.joblib
 - /Users/mukeshsingh/Desktop/sumup/artifacts/feature_order.json
 - /Users/mukeshsingh/Desktop/sumup/artifacts/version.txt
 - /Users/mukeshsingh/Desktop/sumup/artifacts/policy.yaml
Chosen tau_base=0.460  early@first=0.500  premature=0.000  false_alarm=0.500  tte_mean=0.000


Unnamed: 0,roc_auc,pr_auc,early_at_first,premature_posconv,false_alarm_negconv,tte_mean,tau
0,0.683333,0.594298,1.0,0.75,1.0,-2.5,0.01
1,0.683333,0.594298,1.0,0.75,1.0,-2.5,0.02
2,0.683333,0.594298,1.0,0.75,1.0,-2.5,0.03
3,0.683333,0.594298,1.0,0.75,1.0,-2.5,0.04
4,0.683333,0.594298,0.75,0.75,1.0,-3.333333,0.05
5,0.683333,0.594298,0.75,0.75,1.0,-3.333333,0.06
6,0.683333,0.594298,0.75,0.75,1.0,-3.333333,0.07
7,0.683333,0.594298,0.75,0.75,1.0,-3.0,0.08
8,0.683333,0.594298,0.75,0.75,1.0,-3.0,0.09
9,0.683333,0.594298,0.75,0.75,0.5,-2.0,0.1


In [45]:
# Round-trip check: load the exported artifacts through the service's own loader
# and re-score the test split with them.
from src.model import load_artifacts

mdl, feat_order_reload, tau_loaded, pol = load_artifacts(str(ARTIFACTS_DIR))
print("Reloaded tau:", tau_loaded)

# Align columns to the artifact's feature order (serving will use this order)
X_test_aligned = X_test.reindex(columns=feat_order_reload)
proba_reload = mdl.predict_proba(X_test_aligned)[:,1]
m_reload = eval_at_tau(df_test, y_test, proba_reload, tau_loaded)

print(f"RELOAD TEST — ROC-AUC={m_reload['roc_auc']:.3f} PR-AUC={m_reload['pr_auc']:.3f} "
      f"early@first={m_reload['early_at_first']:.3f} premature={m_reload['premature_posconv']:.3f} "
      f"false_alarm={m_reload['false_alarm_negconv']:.3f} tte_mean={m_reload['tte_mean']:.3f} tau={tau_loaded:.3f}")

# Preview the first ten test turns next to their reloaded scores and decisions.
display(pd.DataFrame({
    **{col: df_test[col].values[:10] for col in ("conversation_id", "user_turn_idx", "y")},
    "proba": proba_reload[:10],
    "pred": (proba_reload[:10] >= tau_loaded).astype(int),
}))


Reloaded tau: 0.46
RELOAD TEST — ROC-AUC=0.683 PR-AUC=0.594 early@first=0.500 premature=0.000 false_alarm=0.500 tte_mean=0.000 tau=0.460


Unnamed: 0,conversation_id,user_turn_idx,y,proba,pred
0,c001,0,0,0.374504,0
1,c001,1,0,0.105293,0
2,c001,2,1,0.496187,1
3,c002,0,0,0.099181,0
4,c002,1,0,0.078037,0
5,c002,2,0,0.161261,0
6,c002,3,1,0.277314,0
7,c006,0,0,0.07711,0
8,c006,1,0,0.09109,0
9,c006,2,0,0.160898,0
