In [None]:
import json
import os
import random
import warnings
from collections import Counter
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any convergence or deprecation
# warnings raised by later cells — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Single seed shared by Python's `random`, NumPy's global RNG, and the
# `random_state=SEED` arguments used by the modelling cells.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# All paths are resolved relative to the parent of the current working directory.
ROOT = Path.cwd().parent

# --- Inputs -----------------------------------------------------------------
LABELS_JSON = ROOT.joinpath("data", "labels.json")
EMB_ROOT = ROOT.joinpath("embeddings")
PRED_SPATIAL_DIR = ROOT.joinpath("predictions", "spatial")
PRED_TEMPORAL_DIR = ROOT.joinpath("predictions", "temporal")

# --- Outputs ----------------------------------------------------------------
# Per-split feature caches (.npz) and the final model checkpoint.
CACHE_DIR = ROOT.joinpath("ensemble_features_final")
CHECKPOINT_DIR = ROOT.joinpath("checkpoints", "ensemble_final")

# Create the output directories up front so later cells can write into them.
CACHE_DIR.mkdir(exist_ok=True)
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
def safe_auc(y_true, y_pred):
    """Compute ROC AUC, returning NaN instead of failing on degenerate input.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted scores for the positive class.

    Returns:
        ``np.nan`` when ``y_true`` holds fewer than two distinct values or
        when ``y_pred`` contains any NaN; otherwise the value of
        ``roc_auc_score(y_true, y_pred)``.
    """
    # The label check comes first and short-circuits, so predictions are only
    # inspected when at least two classes are present.
    only_one_class = len(np.unique(y_true)) < 2
    if only_one_class or np.isnan(y_pred).any():
        return np.nan
    return roc_auc_score(y_true, y_pred)

def load_labels():
    """Read ``LABELS_JSON`` and return the parsed JSON object.

    The file is opened explicitly as UTF-8 so that parsing does not depend on
    the platform's default text encoding.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(LABELS_JSON, "r", encoding="utf-8") as f:
        return json.load(f)

# Parsed content of LABELS_JSON, loaded once when this cell runs.
# build_features_from_predictions looks labels up here by prediction-file stem.
labels_map = load_labels()

In [None]:
def build_features_from_predictions(split):
    """Build (or load from cache) the ensemble feature matrix for one split.

    For every ``*.npy`` file in ``PRED_SPATIAL_DIR / split`` that has a
    same-named file in ``PRED_TEMPORAL_DIR / split`` and an entry in
    ``labels_map``, one row ``[s, s, 0.0, s, t]`` is emitted, where ``s`` and
    ``t`` are the spatial and temporal values stored in those files.

    The result is cached in ``CACHE_DIR / f"{split}.npz"``. A cache whose
    ``X`` and ``y`` lengths differ is treated as invalid and rebuilt.

    Args:
        split: Name of the split sub-directory (e.g. ``"train"``).

    Returns:
        Tuple ``(X, y)``: ``X`` is a float32 array of shape ``(n, 5)`` and
        ``y`` an int64 array of shape ``(n,)``, with row ``i`` of ``X``
        belonging to ``y[i]``.
    """
    cache = CACHE_DIR / f"{split}.npz"
    if cache.exists():
        # The context manager closes the .npz file handle once the arrays are read.
        with np.load(cache) as d:
            X, y = d["X"], d["y"]
        # Only trust a cache whose features and labels line up row for row.
        if len(X) == len(y):
            return X, y

    sdir = PRED_SPATIAL_DIR / split
    tdir = PRED_TEMPORAL_DIR / split

    X, y = [], []
    # sorted() makes the row order deterministic; glob order is filesystem-dependent.
    for p in sorted(sdir.glob("*.npy")):
        stem = p.stem
        tpath = tdir / f"{stem}.npy"
        if not tpath.exists():
            continue

        # Resolve the label BEFORE appending the features: skipping a sample
        # after its row was already added would leave X and y misaligned.
        label = labels_map.get(stem)
        if label is None:
            continue

        s = float(np.load(p))
        t = float(np.load(tpath))

        # 5-dim unified feature format
        X.append([s, s, 0.0, s, t])
        y.append(int(label))

    # reshape keeps the matrix 2-D, i.e. (0, 5), even when no sample qualified.
    X = np.asarray(X, dtype=np.float32).reshape(-1, 5)
    y = np.asarray(y, dtype=np.int64)
    np.savez(cache, X=X, y=y)
    return X, y

In [None]:
# Build (or load from cache) the features and labels for each split.
X_train, y_train = build_features_from_predictions("train")
X_val, y_val = build_features_from_predictions("val")
X_test, y_test = build_features_from_predictions("test")

# Report the feature-matrix shape and class counts of every split.
for title, feats, targets in (
    ("Train:", X_train, y_train),
    ("Val  :", X_val, y_val),
    ("Test :", X_test, y_test),
):
    print(title, feats.shape, Counter(targets))

In [None]:
# Candidate model: standardised features feeding an L2-penalised logistic regression.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        penalty="l2",
        solver="saga",
        max_iter=3000,
        random_state=SEED,
    )),
])

# Values of C to try for the "clf" step.
param_grid = dict(clf__C=[0.01, 0.1, 1.0, 10.0])

# 5-fold stratified, shuffled cross-validation scored by ROC AUC.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="roc_auc",
    n_jobs=-1,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED),
)
gs.fit(X_train, y_train)

# Keep the best pipeline found by the search for the following cells.
best_pipe = gs.best_estimator_
print("Best C:", gs.best_params_, "CV AUC:", gs.best_score_)

In [None]:
# Out-of-fold (OOF) evaluation of the tuned pipeline on the training split:
# each sample is scored by a model that never saw it during fitting.
skf = StratifiedKFold(5, shuffle=True, random_state=SEED)
oof = np.zeros(len(y_train))
fold_aucs = []

for i, (tr, va) in enumerate(skf.split(X_train, y_train), 1):
    # clone() yields an unfitted copy with the same hyper-parameters.
    # Pipeline(best_pipe.steps) would instead share the scaler/classifier
    # objects with best_pipe, so each fold fit would refit best_pipe itself
    # on a subset of the training data.
    model = clone(best_pipe)
    model.fit(X_train[tr], y_train[tr])
    p = model.predict_proba(X_train[va])[:, 1]
    auc = safe_auc(y_train[va], p)
    fold_aucs.append(auc)
    oof[va] = p
    print(f"Fold {i} AUC: {auc:.4f}")

print("Mean OOF AUC:", np.mean(fold_aucs))

In [None]:
# Make sure the tuned pipeline is fitted on the full training split before it
# is handed to the calibrator as an already-fitted ("prefit") model.
best_pipe.fit(X_train, y_train)

# Sigmoid calibration fitted on the validation split.
# The estimator is passed positionally on purpose: its keyword was
# `base_estimator` in older scikit-learn releases and `estimator` in newer
# ones, so the positional form works with both.
# NOTE(review): recent scikit-learn releases deprecate cv="prefit" in favour
# of wrapping the model in FrozenEstimator — confirm against the installed version.
calibrator = CalibratedClassifierCV(
    best_pipe,
    method="sigmoid",
    cv="prefit"
)

calibrator.fit(X_val, y_val)

In [None]:
def eval_split(name, X, y):
    """Score one split with the calibrated model and print its metrics.

    Args:
        name: Label for the split, used as the prefix of the printed line.
        X: Feature matrix passed to ``calibrator.predict_proba``.
        y: Ground-truth labels for ``X``.

    Returns:
        Column 1 of ``calibrator.predict_proba(X)``, i.e. the probabilities
        the metrics are computed on.
    """
    proba = calibrator.predict_proba(X)[:, 1]
    # AUC goes through safe_auc (NaN on degenerate input); AP is computed directly.
    auc, ap = safe_auc(y, proba), average_precision_score(y, proba)
    print(f"{name} AUC: {auc:.4f} | AP: {ap:.4f}")
    return proba

# Evaluate the splits in order (train, val, test) and keep their calibrated probabilities.
p_train, p_val, p_test = (
    eval_split(split_name, features, targets)
    for split_name, features, targets in (
        ("Train", X_train, y_train),
        ("Val", X_val, y_val),
        ("Test", X_test, y_test),
    )
)

In [None]:
# ROC curve of the calibrated ensemble on the test split.
fpr, tpr, _ = roc_curve(y_test, p_test)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label="Ensemble ROC")
# Dashed diagonal as a visual reference line.
ax.plot([0, 1], [0, 1], "--", color="gray")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Ensemble ROC Curve (Test)")
ax.legend()
ax.grid()
plt.show()

In [None]:
# Persist the fitted pipeline and calibrator together with the OOF diagnostics
# and the names recorded for the five feature columns.
joblib.dump(
    dict(
        pipeline=best_pipe,
        calibrator=calibrator,
        oof_preds=oof,
        fold_aucs=fold_aucs,
        features=[
            "spatial_mean",
            "spatial_max",
            "spatial_std",
            "spatial_top3",
            "temporal_score",
        ],
    ),
    CHECKPOINT_DIR / "ensemble_final.joblib",
)

print("Saved ensemble model")