In [1]:

# === Config & imports ===
import os, re, time, json, joblib, random
import numpy as np
import pandas as pd
from typing import List, Dict, Any, Tuple

# Reproducibility
SEED = int(os.getenv("SEED", "42"))
random.seed(SEED); np.random.seed(SEED)

# Paths
DATA_PATH = os.getenv("CONV_PATH", "../data/escalation_dataset.json")
ARTIFACTS_DIR = os.getenv("ARTIFACTS_DIR", "artifacts")
MLRUNS_DIR = os.getenv("MLRUNS_DIR", "./mlruns")
POLICY_PATH = os.getenv("POLICY_PATH", "policy.yaml")
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

# MLflow
import mlflow, mlflow.sklearn
mlflow.set_tracking_uri(f"file:{MLRUNS_DIR}")
mlflow.set_experiment("escalation-detector-e2e")

print("Using SEED:", SEED)
print("Data path:", DATA_PATH)
print("Artifacts dir:", ARTIFACTS_DIR)
print("MLruns dir:", MLRUNS_DIR)
print("Policy path:", POLICY_PATH)


2025/09/22 00:18:35 INFO mlflow.tracking.fluent: Experiment with name 'escalation-detector-e2e' does not exist. Creating a new experiment.


Using SEED: 42
Data path: ../data/escalation_dataset.json
Artifacts dir: artifacts
MLruns dir: ./mlruns
Policy path: policy.yaml


In [2]:

# === Load policy (regex patterns). If policy.yaml missing, use defaults and snapshot. ===
try:
    import yaml
except Exception as e:
    raise RuntimeError("pyyaml is required: pip install pyyaml") from e

DEFAULT_POLICY = {
    "version": "policy@notebook",
    "thresholds": {"tau_low": 0.45, "tau_high": 0.70},
    "rules": {
        "explicit_human_request": {
            "enabled": True,
            "patterns": [r"\b(human|agent|real person|talk to (?:a )?human|speak to (?:a )?human|customer service|support agent)\b"]
        },
        "risk_terms": {
            "enabled": True,
            "patterns": ["kyc","blocked","chargeback","legal","id verification"]
        },
        "bot_unhelpful_templates": {
            "enabled": True,
            "patterns": [
                "could you provide more details",
                "we could not find the information",
                "check your spam folder",
                "ensure your documents are clear and valid"
            ]
        }
    },
    "llm": {"enabled": False, "max_latency_ms": 400},
    "redis": {"ttl_seconds": 86400}
}

def load_policy(path: str) -> Dict[str, Any]:
    if not os.path.exists(path):
        print(f"policy.yaml not found at {path}; using defaults and writing snapshot to artifacts.")
        with open(os.path.join(ARTIFACTS_DIR, "policy.yaml"), "w", encoding="utf-8") as f:
            yaml.safe_dump(DEFAULT_POLICY, f, sort_keys=False, allow_unicode=True)
        return DEFAULT_POLICY
    with open(path, "r", encoding="utf-8") as f:
        p = yaml.safe_load(f) or {}
    pol = DEFAULT_POLICY.copy()
    pol.update(p)
    with open(os.path.join(ARTIFACTS_DIR, "policy.yaml"), "w", encoding="utf-8") as f:
        yaml.safe_dump(pol, f, sort_keys=False, allow_unicode=True)
    return pol

POLICY = load_policy(POLICY_PATH)
RULES = POLICY.get("rules", {})
print("Policy loaded. Explicit patterns:", RULES.get("explicit_human_request", {}).get("patterns", [])[:2], "...")


policy.yaml not found at policy.yaml; using defaults and writing snapshot to artifacts.
Policy loaded. Explicit patterns: ['\\b(human|agent|real person|talk to (?:a )?human|speak to (?:a )?human|customer service|support agent)\\b'] ...


In [3]:

# === Load raw conversations and validate schema ===
def load_conversations(path: str) -> List[Dict[str, Any]]:
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"Missing {path}. Put transcripts at data/escalation_dataset.json or set CONV_PATH."
        )
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        raise ValueError("Root JSON must be a list of conversation objects.")
    if len(data) == 0:
        raise ValueError("No conversations found.")
    for i, conv in enumerate(data[:5]):
        assert "conversation_id" in conv, f"conversation_id missing idx {i}"
        ch = conv.get("conversation_history")
        assert isinstance(ch, list), f"conversation_history missing/not list idx {i}"
        assert all(("role" in m and "message" in m) for m in ch), f"role/message missing idx {i}"
        assert "is_escalation_needed" in conv, f"is_escalation_needed missing idx {i}"
    print(f"Loaded {len(data)} conversations.")
    return data

raw_conversations = load_conversations(DATA_PATH)
print("Sample IDs:", [c["conversation_id"] for c in raw_conversations[:5]])


Loaded 20 conversations.
Sample IDs: ['c001', 'c002', 'c003', 'c004', 'c005']


In [4]:
# === Weak labeling: earliest escalation-worthy user turn ===
EXPL_PATTS = RULES.get("explicit_human_request", {}).get("patterns",
            [r"\b(human|agent|real person|talk to (?:a )?human|speak to (?:a )?human|customer service|support agent)\b"])
UNHELP_PATTS = RULES.get("bot_unhelpful_templates", {}).get("patterns",
              ["could you provide more details","we could not find the information","check your spam folder","ensure your documents are clear and valid"])

def _regex_any(patterns: List[str], text: str) -> bool:
    t = (text or "").lower()
    return any(re.search(p, t) for p in patterns)

def is_unhelpful_bot(msg: str) -> bool:
    return _regex_any(UNHELP_PATTS, msg)

def explicit_human_request(msg: str) -> bool:
    return _regex_any(EXPL_PATTS, msg)

def user_repeats(prev_user_msg: str, this_user_msg: str, jaccard_threshold: float = 0.5) -> bool:
    if not prev_user_msg:
        return False
    a = set(w for w in re.findall(r"[A-Za-z']+", prev_user_msg.lower()) if len(w) > 2)
    b = set(w for w in re.findall(r"[A-Za-z']+", this_user_msg.lower()) if len(w) > 2)
    if not a or not b:
        return False
    inter = len(a & b); denom = max(1, len(a | b))
    return (inter / denom) >= jaccard_threshold

def weak_label_conversation(conv: Dict[str, Any]) -> Tuple[List[int], List[str]]:
    ch = conv["conversation_history"]
    is_pos = bool(conv.get("is_escalation_needed", False))
    pos_user_turn_idx = None
    reason = None
    loop_score = 0
    prev_user_text = ""

    for idx, m in enumerate(ch):
        role = m.get("role"); text = m.get("message", "") or ""
        if role == "user" and explicit_human_request(text):
            pos_user_turn_idx = idx; reason = "explicit_request"; break
        if role == "bot" and is_unhelpful_bot(text):
            loop_score += 1
        if role == "user" and user_repeats(prev_user_text, text, 0.5):
            loop_score += 1
        if role == "user":
            prev_user_text = text
        if loop_score >= 2 and is_pos and pos_user_turn_idx is None:
            if role == "user":
                pos_user_turn_idx = idx
            else:
                for j in range(idx + 1, len(ch)):
                    if ch[j]["role"] == "user":
                        pos_user_turn_idx = j; break
            reason = "loop_unhelpful"
            if pos_user_turn_idx is not None:
                break

    if is_pos and pos_user_turn_idx is None:
        for j in range(len(ch)-1, -1, -1):
            if ch[j]["role"] == "user":
                pos_user_turn_idx = j; reason = "fallback_last_user"; break

    labels, reasons = [], []
    for idx, m in enumerate(ch):
        if m["role"] != "user":
            continue
        if not is_pos:
            labels.append(0); reasons.append("")
        else:
            y = 1 if (pos_user_turn_idx is not None and idx >= pos_user_turn_idx) else 0
            labels.append(y); reasons.append(reason if y==1 else "")
    return labels, reasons

def to_turn_rows(conv: Dict[str, Any]) -> List[Dict[str, Any]]:
    rows = []
    ch = conv["conversation_history"]
    labels, reasons = weak_label_conversation(conv)
    user_turn_ptr = 0; prev_bot_msg = ""

    for idx, m in enumerate(ch):
        role = m["role"]; msg = m.get("message", "") or ""
        if role == "user":
            rows.append({
                "conversation_id": conv["conversation_id"],
                "turn_global_idx": idx,
                "user_turn_idx": user_turn_ptr,
                "user_text": msg,
                "prev_bot_text": prev_bot_msg,
                "y": labels[user_turn_ptr],
                "y_reason": reasons[user_turn_ptr],
                "is_escalation_needed_convo": bool(conv.get("is_escalation_needed", False)),
            })
            user_turn_ptr += 1
        elif role == "bot":
            prev_bot_msg = msg
    return rows

all_rows: List[Dict[str, Any]] = []
for conv in raw_conversations:
    all_rows.extend(to_turn_rows(conv))

df = pd.DataFrame(all_rows)
print("Turn-level dataset:", df.shape)
display(df.head(8))
print("Label counts:", df["y"].value_counts().to_dict())
print("Conversations:", df["conversation_id"].nunique())

Turn-level dataset: (61, 8)


Unnamed: 0,conversation_id,turn_global_idx,user_turn_idx,user_text,prev_bot_text,y,y_reason,is_escalation_needed_convo
0,c001,1,0,I have tried to verify id for another sum up a...,Hello there. If you’d like to return to your p...,0,,True
1,c001,3,1,So I resent and haven’t heard anything back,"Okay, it seems there might be an issue with th...",0,,True
2,c001,5,2,It’s been 2 weeks,"Okay, verification may take a couple of days. ...",1,fallback_last_user,True
3,c002,1,0,I’m not being sent a code to verify my device ...,Hello there! I'm here to assist you with anyth...,0,,True
4,c002,3,1,I’ve done that,**Didn't get the code?**\n\n1. Tap **Resend co...,0,,True
5,c002,5,2,Yes all of that is good,"Okay, let's try another approach. Ensure your ...",0,,True
6,c002,7,3,Where will I find that,"Got it. As a next step, try enabling **Push no...",1,fallback_last_user,True
7,c003,1,0,Hi I'm having issues with reports since the re...,Hello! If you'd like to return to our previous...,0,,True


Label counts: {0: 49, 1: 12}
Conversations: 20


In [5]:
# === Save turn-level dataset ===
parquet_path = os.path.join(ARTIFACTS_DIR, "turn_level_dataset.parquet")
csv_path = os.path.join(ARTIFACTS_DIR, "turn_level_dataset.csv")

dtype_map = {
    "conversation_id": "string", "turn_global_idx": "int32", "user_turn_idx": "int32",
    "user_text": "string", "prev_bot_text": "string", "y": "int8",
    "y_reason": "string", "is_escalation_needed_convo": "int8"
}
df_out = df.astype(dtype_map)

def save_with_fallbacks(df_out, parquet_path, csv_path):
    try:
        import pyarrow  # noqa
        df_out.to_parquet(parquet_path, index=False, engine="pyarrow")
        print(f"Saved (pyarrow) → {parquet_path}")
        return parquet_path
    except Exception as e1:
        print("pyarrow failed:", e1)
        try:
            import fastparquet  # noqa
            df_out.to_parquet(parquet_path, index=False, engine="fastparquet")
            print(f"Saved (fastparquet) → {parquet_path}")
            return parquet_path
        except Exception as e2:
            print("fastparquet failed:", e2)
            df_out.to_csv(csv_path, index=False)
            print(f"Saved (CSV) → {csv_path}")
            return csv_path

saved_turns_path = save_with_fallbacks(df_out, parquet_path, csv_path)


Saved (pyarrow) → artifacts/turn_level_dataset.parquet


In [6]:
# === Feature engineering (aligned with serving) ===
UNHELPFUL_BOT_PATTERNS = RULES.get("bot_unhelpful_templates", {}).get("patterns",
    ["could you provide more details","we could not find the information","check your spam folder","ensure your documents are clear and valid"])
HUMAN_REQUEST_PATTERNS = RULES.get("explicit_human_request", {}).get("patterns",
    [r"\b(human|agent|real person|talk to (?:a )?human|speak to (?:a )?human|customer service|support agent)\b"])
RISK_TERMS = RULES.get("risk_terms", {}).get("patterns", ["kyc","blocked","chargeback","legal","id verification"])

def _has_any(patterns, s: str) -> int:
    s = (s or "").lower()
    return int(any(re.search(p, s) for p in patterns))

def _caps_ratio(s: str) -> float:
    if not s: return 0.0
    caps = sum(1 for c in s if c.isupper())
    letters = sum(1 for c in s if c.isalpha())
    return (caps / letters) if letters else 0.0

def _exclam_count(s: str) -> int:
    return (s or "").count("!")

def _msg_len(s: str) -> int:
    return len(s or "")

def featurize(df_in: pd.DataFrame) -> pd.DataFrame:
    X = pd.DataFrame({
        "turn_idx": df_in["user_turn_idx"].astype(float),
        "user_caps_ratio": df_in["user_text"].fillna("").apply(_caps_ratio).astype(float),
        "exclam_count": df_in["user_text"].fillna("").apply(_exclam_count).astype(float),
        "msg_len": df_in["user_text"].fillna("").apply(_msg_len).astype(float),
        "bot_unhelpful": df_in["prev_bot_text"].fillna("").apply(lambda s: _has_any(UNHELPFUL_BOT_PATTERNS, s)).astype(float),
        "user_requests_human": df_in["user_text"].fillna("").apply(lambda s: _has_any(HUMAN_REQUEST_PATTERNS, s)).astype(float),
        "risk_terms": df_in["user_text"].fillna("").apply(lambda s: _has_any(RISK_TERMS, s)).astype(float),
    }, index=df_in.index)

    X["no_progress_count"] = 0.0
    X["bot_repeat_count"] = 0.0

    for cid, idxs in df_in.groupby("conversation_id").groups.items():
        idxs = list(sorted(idxs, key=lambda i: (int(df_in.loc[i, "user_turn_idx"]), int(df_in.loc[i, "turn_global_idx"]))))
        prev_bot = None; npc = 0; brc = 0
        for i in idxs:
            this_bot = (df_in.loc[i, "prev_bot_text"] or "").strip().lower()
            if prev_bot and this_bot and (this_bot == prev_bot):
                brc += 1
            else:
                brc = max(brc - 1, 0)
            if _has_any(UNHELPFUL_BOT_PATTERNS, this_bot):
                npc += 1
            else:
                npc = max(npc - 1, 0)
            X.at[i, "bot_repeat_count"] = float(brc)
            X.at[i, "no_progress_count"] = float(npc)
            prev_bot = this_bot

    return X.astype(float)

X = featurize(df)
y = df["y"].astype(int).values
groups = df["conversation_id"].astype("string").values

FEATURE_ORDER = [
    "turn_idx","user_caps_ratio","exclam_count","msg_len",
    "bot_unhelpful","user_requests_human","risk_terms",
    "no_progress_count","bot_repeat_count"
]
with open(os.path.join(ARTIFACTS_DIR, "feature_order.json"), "w", encoding="utf-8") as f:
    json.dump(FEATURE_ORDER, f, indent=2)

print("X shape:", X.shape, "| Pos rate:", round(float(y.mean()), 3))
display(X.head(6))

X shape: (61, 9) | Pos rate: 0.197


Unnamed: 0,turn_idx,user_caps_ratio,exclam_count,msg_len,bot_unhelpful,user_requests_human,risk_terms,no_progress_count,bot_repeat_count
0,0.0,0.030303,0.0,250.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.057143,0.0,43.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.083333,0.0,17.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.038462,0.0,99.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.090909,0.0,14.0,0.0,0.0,0.0,0.0,0.0
5,2.0,0.055556,0.0,23.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# === Group-aware split ===
from sklearn.model_selection import GroupShuffleSplit
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
(train_idx, test_idx) = list(gss.split(X, y, groups))[0]

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
groups_train, groups_test = groups[train_idx], groups[test_idx]
df_test = df.iloc[test_idx][["conversation_id","user_turn_idx","y"]].copy()

print("Train:", X_train.shape, "| Test:", X_test.shape)
print("Pos rate train:", round(float(y_train.mean()), 3), "| test:", round(float(y_test.mean()), 3))


Train: (49, 9) | Test: (12, 9)
Pos rate train: 0.204 | test: 0.167


In [8]:
# === Metrics & threshold helpers ===
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve

def pick_threshold_tpr_bound(y_true: np.ndarray, proba: np.ndarray, min_tpr: float = 0.90) -> float:
    prec, rec, thr = precision_recall_curve(y_true, proba)
    idxs = np.where(rec >= min_tpr)[0]
    if len(idxs) >= 1 and len(thr) > 0:
        candidate_thr = [thr[i-1] for i in idxs if i-1 >= 0]
        if candidate_thr:
            return float(min(candidate_thr))
    f1 = (2 * prec * rec) / (prec + rec + 1e-9)
    best_idx = int(np.nanargmax(f1))
    tau = thr[max(0, best_idx - 1)] if len(thr) > 0 else 0.5
    return float(tau)

def early_escalation_at_first(df_test: pd.DataFrame, proba: np.ndarray, tau: float) -> float:
    tmp = df_test.copy()
    tmp = tmp.assign(proba=proba, pred=(proba >= tau).astype(int))
    convs = tmp["conversation_id"].unique()
    total_pos, good = 0, 0
    for cid in convs:
        seq = tmp[tmp["conversation_id"] == cid].sort_values("user_turn_idx")
        if seq["y"].max() == 0:
            continue
        total_pos += 1
        first_pos = int(seq[seq["y"] == 1]["user_turn_idx"].min())
        fired = seq[seq["user_turn_idx"] <= first_pos]["pred"].max() == 1
        if fired:
            good += 1
    return (good / total_pos) if total_pos else np.nan

def premature_escalation_rate(df_test: pd.DataFrame, proba: np.ndarray, tau: float) -> float:
    tmp = df_test.copy()
    tmp = tmp.assign(proba=proba, pred=(proba >= tau).astype(int))
    convs = tmp["conversation_id"].unique()
    total_pos, early = 0, 0
    for cid in convs:
        seq = tmp[tmp["conversation_id"] == cid].sort_values("user_turn_idx")
        if seq["y"].max() == 0:
            continue
        total_pos += 1
        first_pos = int(seq[seq["y"] == 1]["user_turn_idx"].min())
        early_fire = seq[seq["user_turn_idx"] < first_pos]["pred"].max() == 1
        if early_fire:
            early += 1
    return (early / total_pos) if total_pos else np.nan

def time_to_escalation_turns(df_test: pd.DataFrame, proba: np.ndarray, tau: float) -> float:
    tmp = df_test.copy()
    tmp = tmp.assign(proba=proba, pred=(proba >= tau).astype(int))
    convs = tmp["conversation_id"].unique()
    diffs = []
    for cid in convs:
        seq = tmp[tmp["conversation_id"] == cid].sort_values("user_turn_idx")
        if seq["y"].max() == 0:
            continue
        first_pos = int(seq[seq["y"] == 1]["user_turn_idx"].min())
        pred_idxs = seq[seq["pred"] == 1]["user_turn_idx"].values
        if len(pred_idxs) == 0:
            continue
        first_pred = int(pred_idxs[0])
        diffs.append(first_pred - first_pos)
    return float(np.mean(diffs)) if diffs else np.nan


In [9]:

# === Grouped CV + calibration + MLflow ===
from itertools import product
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

def cv_eval_config(name, estimator, params, X, y, groups, min_tpr=0.90, k=5, calibrate=True):
    gkf = GroupKFold(n_splits=min(k, len(np.unique(groups))))
    oof_proba, oof_truth, oof_meta = [], [], []

    for fold, (tr, va) in enumerate(gkf.split(X, y, groups)):
        est = estimator.set_params(**params)
        if calibrate:
            try:
                clf = CalibratedClassifierCV(est, cv=3, method="sigmoid")
                clf.fit(X.iloc[tr], y[tr])
            except Exception:
                est.fit(X.iloc[tr], y[tr])
                clf = CalibratedClassifierCV(est, cv="prefit", method="sigmoid")
                clf.fit(X.iloc[tr], y[tr])
        else:
            est.fit(X.iloc[tr], y[tr])
            clf = est
        proba = clf.predict_proba(X.iloc[va])[:, 1]
        oof_proba.append(proba); oof_truth.append(y[va])
        oof_meta.append(df.iloc[va][["conversation_id","user_turn_idx","y"]].copy())

    proba_all = np.concatenate(oof_proba)
    y_all = np.concatenate(oof_truth)
    df_meta = pd.concat(oof_meta, axis=0)

    roc = roc_auc_score(y_all, proba_all)
    pr  = average_precision_score(y_all, proba_all)
    tau = pick_threshold_tpr_bound(y_all, proba_all, min_tpr=min_tpr)
    ee  = early_escalation_at_first(df_meta, proba_all, tau)
    per = premature_escalation_rate(df_meta, proba_all, tau)
    tte = time_to_escalation_turns(df_meta, proba_all, tau)

    est_full = estimator.set_params(**params)
    if calibrate:
        try:
            clf_full = CalibratedClassifierCV(est_full, cv=3, method="sigmoid")
            clf_full.fit(X, y)
        except Exception:
            est_full.fit(X, y)
            clf_full = CalibratedClassifierCV(est_full, cv="prefit", method="sigmoid")
            clf_full.fit(X, y)
    else:
        est_full.fit(X, y)
        clf_full = est_full

    metrics = {
        "roc_auc": float(roc),
        "pr_auc": float(pr),
        "tau": float(tau),
        "ee_at_first": float(ee),
        "premature_rate": float(per),
        "tte_mean": float(tte),
    }
    return clf_full, tau, metrics

# Candidate grids
grids = [
    ("logreg", LogisticRegression(class_weight="balanced", max_iter=2000, solver="liblinear"),
     {"C": [0.5, 1.0, 2.0]}, True),
    ("rf", RandomForestClassifier(class_weight="balanced_subsample", random_state=SEED, n_jobs=-1),
     {"n_estimators": [200, 400], "max_depth": [None, 8]}, True),
]

# Optional XGBoost
try:
    import xgboost as xgb
    grids.append(("xgb",
        xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss",
                          tree_method="hist", n_jobs=-1, random_state=SEED),
        {"n_estimators":[300,500], "max_depth":[4,6], "learning_rate":[0.05,0.1]},
        True))
except Exception as e:
    print("XGBoost not available for CV:", e)

MIN_TPR = float(os.getenv("MIN_TPR", "0.90"))
K_FOLDS = int(os.getenv("K_FOLDS", "5"))

cv_results = []
best = None
best_key = (-np.inf, -np.inf, -np.inf)

for name, est, grid, calibrate in grids:
    keys, values = zip(*grid.items())
    for combo in product(*values):
        params = dict(zip(keys, combo))
        with mlflow.start_run(run_name=f"{name}_cv_{params}"):
            mlflow.log_params({"model": name, **params, "cv_k": K_FOLDS, "min_tpr": MIN_TPR, "calibrate": calibrate})
            clf_full, tau, metrics = cv_eval_config(
                name, est, params, X_train, y_train, groups_train, min_tpr=MIN_TPR, k=K_FOLDS, calibrate=calibrate
            )
            mlflow.log_metrics({
                "cv_roc_auc": metrics["roc_auc"],
                "cv_pr_auc": metrics["pr_auc"],
                "cv_tau": metrics["tau"],
                "cv_ee_first": metrics["ee_at_first"],
                "cv_premature_rate": metrics["premature_rate"],
                "cv_tte_mean": metrics["tte_mean"],
            })
            tmp_path = os.path.join(ARTIFACTS_DIR, f"{name}_{int(time.time())}.joblib")
            joblib.dump(clf_full, tmp_path)
            mlflow.sklearn.log_model(clf_full, artifact_path="model", input_example=X_test.iloc[:5])

            row = {"model": name, "params": params, **metrics}
            cv_results.append(row)

            key = (metrics["ee_at_first"], metrics["pr_auc"], metrics["roc_auc"])
            if key > best_key or (np.isnan(best_key[0]) and not np.isnan(key[0])):
                best, best_key = (clf_full, tau, name, params, metrics), key

summary = pd.DataFrame(cv_results).sort_values(
    by=["ee_at_first","pr_auc","roc_auc"], ascending=[False, False, False]
).reset_index(drop=True)
display(summary.head(10))

best_clf, best_tau, best_name, best_params, best_metrics = best
print("Best:", best_name, best_params, best_metrics)



Unnamed: 0,model,params,roc_auc,pr_auc,tau,ee_at_first,premature_rate,tte_mean
0,logreg,{'C': 0.5},0.633333,0.460922,0.08129,1.0,0.909091,-2.818182
1,logreg,{'C': 2.0},0.671795,0.443847,0.075093,1.0,0.909091,-2.818182
2,logreg,{'C': 1.0},0.641026,0.42647,0.078844,1.0,0.909091,-2.818182
3,rf,"{'n_estimators': 200, 'max_depth': None}",0.705128,0.416053,0.093678,1.0,0.909091,-2.818182
4,rf,"{'n_estimators': 200, 'max_depth': 8}",0.705128,0.416053,0.093678,1.0,0.909091,-2.818182
5,rf,"{'n_estimators': 400, 'max_depth': None}",0.694872,0.401695,0.095351,1.0,0.909091,-2.818182
6,rf,"{'n_estimators': 400, 'max_depth': 8}",0.694872,0.401695,0.095351,1.0,0.909091,-2.818182
7,xgb,"{'n_estimators': 300, 'max_depth': 4, 'learnin...",0.462821,0.258824,0.107292,1.0,0.909091,-2.818182
8,xgb,"{'n_estimators': 300, 'max_depth': 6, 'learnin...",0.462821,0.258824,0.107292,1.0,0.909091,-2.818182
9,xgb,"{'n_estimators': 300, 'max_depth': 4, 'learnin...",0.462821,0.257526,0.104014,1.0,0.909091,-2.818182


Best: logreg {'C': 0.5} {'roc_auc': 0.6333333333333333, 'pr_auc': 0.460921729156779, 'tau': 0.0812895078611868, 'ee_at_first': 1.0, 'premature_rate': 0.9090909090909091, 'tte_mean': -2.8181818181818183}


In [10]:
# === Final test-set evaluation & promotion ===
from sklearn.metrics import roc_auc_score, average_precision_score

proba_test = best_clf.predict_proba(X_test)[:, 1]
roc = roc_auc_score(y_test, proba_test)
pr  = average_precision_score(y_test, proba_test)

tau = float(best_tau)
ee  = early_escalation_at_first(df_test, proba_test, tau)
per = premature_escalation_rate(df_test, proba_test, tau)
tte = time_to_escalation_turns(df_test, proba_test, tau)

print(f"TEST — ROC-AUC={roc:.3f} PR-AUC={pr:.3f} EE@first={ee:.3f} Premature={per:.3f} TTE_mean={tte:.3f} tau={tau:.3f}")

with mlflow.start_run(run_name=f"final_test_{best_name}_{best_params}"):
    mlflow.log_params({"final_model": best_name, **best_params, "tau": tau})
    mlflow.log_metrics({
        "test_roc_auc": float(roc),
        "test_pr_auc": float(pr),
        "test_ee_first": float(ee),
        "test_premature_rate": float(per),
        "test_tte_mean": float(tte),
    })
    mlflow.sklearn.log_model(best_clf, artifact_path="model", input_example=X_test.iloc[:5])

model_path = os.path.join(ARTIFACTS_DIR, "model.joblib")
joblib.dump(best_clf, model_path)

version_txt = os.path.join(ARTIFACTS_DIR, "version.txt")
with open(version_txt, "w", encoding="utf-8") as f:
    f.write(
        f"{best_name}@{int(time.time())}\n"
        f"params={best_params}\n"
        f"threshold={tau}\n"
        f"test_roc_auc={roc:.4f}\n"
        f"test_pr_auc={pr:.4f}\n"
        f"test_ee_first={ee:.4f}\n"
        f"test_premature_rate={per:.4f}\n"
        f"test_tte_mean={tte:.4f}\n"
    )

print("Promoted model →", model_path)
print("Wrote metadata →", version_txt)
print("Also wrote feature_order.json and policy.yaml snapshot in artifacts/.")




TEST — ROC-AUC=0.950 PR-AUC=0.833 EE@first=1.000 Premature=1.000 TTE_mean=-1.500 tau=0.081




Promoted model → artifacts/model.joblib
Wrote metadata → artifacts/version.txt
Also wrote feature_order.json and policy.yaml snapshot in artifacts/.
