# S05 — Weak Supervision Label Model + LightGBM

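Объединяем правила детекторов в **Label Model** (pseudo‑labels) и обучаем **LightGBM** для
прогноза риска на уровне **edge/hex**. Целевая схема валидации — **spatial k‑fold** (H3); в текущей версии ноутбука вместо неё используется стратифицированный k‑fold со случайным разбиением и фиксированным seed.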

In [None]:
# Run the shared setup notebook before anything else.
# NOTE(review): the cells below use names that are never defined in this notebook
# (pd, np, CONFIG, REASONS, EDGE_FEATURES_PARQUET, EVENTS_EDGE_PARQUET,
# ML_EDGE_PRED_PARQUET) — presumably S00_setup provides them; confirm there.
%run ./S00_setup.ipynb

In [None]:
# 1) Load edge features and per-edge events.
# Each table is read only if its parquet file exists; otherwise the variable stays None.
edge_df = None
if EDGE_FEATURES_PARQUET.exists():
    edge_df = pd.read_parquet(EDGE_FEATURES_PARQUET)

events_edge = None
if EVENTS_EDGE_PARQUET.exists():
    events_edge = pd.read_parquet(EVENTS_EDGE_PARQUET)

# Report what was loaded (shape, or None when the file was missing).
print("edge_df:", edge_df.shape if edge_df is not None else None)
print("events_edge:", events_edge.shape if events_edge is not None else None)

# Both inputs are mandatory for the rest of the notebook; stop here if either is missing.
if not (edge_df is not None and events_edge is not None):
    raise SystemExit("Нужны EDGE_FEATURES и EVENTS_EDGE. Запустите S03 и S04.")

In [None]:
# 2) Label Model: derive pseudo-labels from the detector events.
# Pseudo score = sum_i w_i * (k_i / n_obs), then binarised against a quantile threshold.
w = CONFIG["SRI_WEIGHTS"]
ev = events_edge.copy()

# Zero observation counts become NaN so the division yields NaN instead of inf.
# Consequence: rows with n_obs == 0 end up with a NaN risk_sum, the comparison below is
# False for them, and they receive pseudo_label = 0.
obs_count = ev["n_obs"].replace(0, np.nan)

# Weighted sum of per-reason rates over the reason columns that are actually present.
# Reasons without a configured weight default to 1.0. Starting the fold at 0.0 keeps the
# column a plain 0.0 when no reason column is present.
ev["risk_sum"] = sum(
    (w.get(r, 1.0) * (ev[r] / obs_count) for r in REASONS if r in ev.columns),
    0.0,
)

# Rows strictly above the 0.7 quantile of risk_sum are treated as risk = 1
# (nominally the top 30%; ties at the threshold fall into class 0).
risk_threshold = ev["risk_sum"].quantile(0.7)
ev["pseudo_label"] = ev["risk_sum"].gt(risk_threshold).astype(int)

# Keep the edge key, the label and the supporting columns for the merge step.
ev_pl = ev[["u", "v", "key", "pseudo_label", "risk_sum", "n_obs", "n_ids"]]
print(ev_pl["pseudo_label"].value_counts(normalize=True))

In [None]:
# 3) Join edge features with the pseudo-labels.
# Inner join on the edge key (u, v, key): only edges present in both tables are kept.
edge_key = ["u", "v", "key"]
X = edge_df.merge(ev_pl, how="inner", on=edge_key)
print("Train table:", X.shape)

# Candidate model inputs; any that are absent from the merged table are dropped silently.
# NOTE(review): ev_pl also carries n_obs / n_ids. If edge_df has columns with the same
# names, merge() will suffix both copies and they will fall out of feature_cols here —
# confirm against the EDGE_FEATURES schema.
candidate_features = (
    "n_obs",
    "n_ids",
    "p50_spd",
    "p85_spd",
    "p95_spd",
    "stop_rate",
    "bearing_dev_mean",
    "dist2node_med",
    "freeflow",
    "congestion",
)
feature_cols = [name for name in candidate_features if name in X.columns]
target_col = "pseudo_label"

In [None]:
# 4) Cross-validated LightGBM on the pseudo-labels.
# The intended scheme is a spatial k-fold keyed by the H3 cell of the edge centre. When edge
# geometry is not available in EDGE_FEATURES, a surrogate based on p50_spd and dist2node
# was considered; for simplicity this cell uses a random stratified split with a fixed
# seed. Replace it with H3 folds built from edge geometry in the production version.
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss
import numpy as np

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y = X[target_col].values
scores = []                   # one (AUC, AP) tuple per fold
pred_oof = np.zeros(len(X))   # out-of-fold probabilities, aligned with X by position

# LightGBM is optional: any import failure is treated as "not installed" and the
# training step is skipped with a message instead of failing the notebook.
try:
    import lightgbm as lgb
except Exception:
    lgb = None

if lgb is None:
    print("LightGBM не установлен — установите пакет и перезапустите для обучения модели.")
else:
    X_feat = X[feature_cols]

    # Hyper-parameters are identical for every fold, so build them once.
    params = {
        "objective": "binary",
        "metric": ["auc", "binary_logloss"],
        "learning_rate": 0.05,
        "num_leaves": 31,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "seed": 42,
        "verbose": -1,
    }

    for fold, (tr, va) in enumerate(skf.split(X_feat, y)):
        # skf.split yields *positional* indices, so rows must be taken with .iloc.
        # Label-based .loc only matches while X has a default 0..n-1 index and would
        # raise or pick the wrong rows otherwise.
        X_tr = X_feat.iloc[tr]
        X_va = X_feat.iloc[va]

        train_set = lgb.Dataset(X_tr, label=y[tr])
        valid_set = lgb.Dataset(X_va, label=y[va])

        # NOTE(review): early stopping monitors the same fold that is scored below, so
        # the per-fold metrics are tuned on that fold and should be read as optimistic.
        model = lgb.train(
            params,
            train_set,
            valid_sets=[valid_set],
            num_boost_round=150,
            callbacks=[
                lgb.early_stopping(stopping_rounds=30),
                lgb.log_evaluation(period=50),
            ],
        )

        prob = model.predict(X_va, num_iteration=model.best_iteration)
        pred_oof[va] = prob

        auc = roc_auc_score(y[va], prob)
        ap = average_precision_score(y[va], prob)
        print(f"Fold {fold}: AUC={auc:.3f} AP={ap:.3f}")
        scores.append((auc, ap))

    print("CV mean AUC/AP:", np.mean([s[0] for s in scores]), np.mean([s[1] for s in scores]))

    # Persist the out-of-fold probability per edge.
    X["ml_prob"] = pred_oof
    X[["u","v","key","ml_prob"]].to_parquet(ML_EDGE_PRED_PARQUET, index=False)
    print("Saved:", ML_EDGE_PRED_PARQUET)