### En este archivo se entrena el modelo final, se guarda y se evalúan sus predicciones para el 2024, a modo de test.

Si ya hay un modelo entrenado y guardado, no se entrena nuevamente (salvo que `FORCE_RETRAIN=1`); solamente se evalúan las predicciones.

In [1]:
import os
import json
from pathlib import Path

import numpy as np
import pandas as pd

from catboost import CatBoostRegressor
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

from run_optuna import compute_past_features

In [2]:
# ------------------------------------------------------------
# Run configuration.
# Each setting is read from the environment variable of the same
# name; the literal next to it is the fallback used when unset.
# ------------------------------------------------------------

# Input dataset (parquet) and root folder for the files this notebook writes.
DATA_PATH = Path(os.environ.get("DATA_PATH", "data/final_dataset.parquet"))
ARTIFACTS_DIR = Path(os.environ.get("ARTIFACTS_DIR", "artifacts"))

# Target year the ensemble is fit on, and target year used for evaluation.
TRAIN_YEAR = int(os.environ.get("TRAIN_YEAR", "2023"))
EVAL_YEAR = int(os.environ.get("EVAL_YEAR", "2024"))

# JSON with the tuned params and user_attrs consumed by train_ensemble_models.
BEST_TRIAL_PATH = Path(os.environ.get("BEST_TRIAL_PATH", "artifacts/study_best.json"))

# Default locations of the fold models and of the ensemble metadata file;
# both are derived from ARTIFACTS_DIR and TRAIN_YEAR.
_default_ensemble_dir = ARTIFACTS_DIR / f"ensemble_train{TRAIN_YEAR}"
_default_ensemble_meta = ARTIFACTS_DIR / f"final_ensemble_train{TRAIN_YEAR}_meta.json"

ENSEMBLE_DIR = Path(os.environ.get("ENSEMBLE_DIR", str(_default_ensemble_dir)))
ENSEMBLE_META_PATH = Path(os.environ.get("ENSEMBLE_META_PATH", str(_default_ensemble_meta)))

# Only the exact string "1" turns forced retraining on.
FORCE_RETRAIN = os.environ.get("FORCE_RETRAIN", "0") == "1"

# Cross-validation layout: starting number of quantile bins used to stratify
# the regression target, and number of folds.
REG_STRAT_BINS = int(os.environ.get("REG_STRAT_BINS", "10"))
N_SPLITS = int(os.environ.get("N_SPLITS", "5"))

In [3]:
# ============================================================
# Helpers
# ============================================================
def make_regression_strat_bins(y: pd.Series, n_bins: int, n_splits: int):
    """
    Bin a continuous target into quantile classes usable for stratified CV.

    The binning is attempted with ``n_bins`` quantiles first and then with
    progressively fewer (down to 2). The first binning in which every bin
    holds at least ``n_splits`` samples is accepted.

    Args:
        y: Target values; re-indexed positionally before binning.
        n_bins: Largest number of quantile bins to try.
        n_splits: Minimum number of samples required in each bin.

    Returns:
        A NumPy array with one integer bin code per element of ``y``, or
        ``None`` when no bin count between ``n_bins`` and 2 qualifies.
    """
    target = pd.Series(y).reset_index(drop=True)

    for n_quantiles in reversed(range(2, int(n_bins) + 1)):
        try:
            # duplicates="drop" lets qcut discard repeated bin edges
            # instead of raising on them.
            binned = pd.qcut(target, q=n_quantiles, duplicates="drop")
            bin_sizes = binned.value_counts()
            if not (bin_sizes < n_splits).any():
                return binned.cat.codes.to_numpy()
        except ValueError:
            # This quantile count could not be binned; try a coarser one.
            pass

    return None


def get_cv_splitter(y: pd.Series, seed: int):
    """
    Build the shuffled cross-validation splitter for target ``y``.

    Quantile bins of ``y`` (from ``make_regression_strat_bins`` with
    ``REG_STRAT_BINS`` and ``N_SPLITS``) are used as stratification labels
    when they can be built; otherwise a plain ``KFold`` is used.

    Args:
        y: Regression target to stratify on.
        seed: ``random_state`` passed to the splitter.

    Returns:
        ``(splitter, labels)`` where ``labels`` is the array of bin codes for
        ``StratifiedKFold`` or ``None`` when the fallback ``KFold`` is returned.
    """
    bin_labels = make_regression_strat_bins(y, REG_STRAT_BINS, N_SPLITS)

    # Same constructor arguments either way; only the splitter class differs.
    splitter_cls = KFold if bin_labels is None else StratifiedKFold
    splitter = splitter_cls(n_splits=N_SPLITS, shuffle=True, random_state=seed)
    return splitter, bin_labels


# ============================================================
# Build X/y exactly like your eval pipeline
# ============================================================
def build_x_and_y(df: pd.DataFrame, target_year: int, selected_features: list[str]):
    """
    Assemble identifiers, feature matrix and log-target for one target year.

    Only rows whose ``totpobla_2022`` is not null are kept. The target is
    ``log1p`` of the ``dengue_incid_<target_year>`` column, and the features
    come from ``compute_past_features`` applied to the numeric columns
    (presumably features built from years before ``target_year`` — confirm
    against ``run_optuna``).

    Args:
        df: Full dataset.
        target_year: Year whose incidence column is the prediction target.
        selected_features: Feature columns to keep, in this order.

    Returns:
        ``(ids, X, y)``, all re-indexed from 0: the identifier columns found
        in ``df`` (an empty frame when none is present), the selected
        features, and the log1p-transformed target.

    Raises:
        ValueError: If the target column is not among the numeric columns, or
            if any selected feature is absent from the computed features.
    """
    known_id_names = ("gid", "dept_id", "departamento_id", "id", "in1")
    id_cols = [name for name in known_id_names if name in df.columns]

    keep = df["totpobla_2022"].notna()
    kept_rows = df.loc[keep]

    # Identifiers are taken before the numeric filter so non-numeric ids survive.
    if id_cols:
        ids = kept_rows[id_cols].copy()
    else:
        ids = pd.DataFrame(index=kept_rows.index)

    numeric_df = kept_rows.select_dtypes("number").copy()

    target_col = f"dengue_incid_{target_year}"
    if target_col not in numeric_df.columns:
        raise ValueError(f"Target column {target_col} not found for year={target_year}.")

    log_target = np.log1p(numeric_df[target_col].astype(float))

    all_features = compute_past_features(numeric_df, target_year)

    missing = [feat for feat in selected_features if feat not in all_features.columns]
    if missing:
        raise ValueError(f"Missing selected features in X: {missing}")

    features = all_features[selected_features].copy()

    return (
        ids.reset_index(drop=True),
        features.reset_index(drop=True),
        pd.Series(log_target).reset_index(drop=True),
    )


# ============================================================
# Train ensemble (CV-fold models) from Optuna best
# ============================================================
def train_ensemble_models(df: pd.DataFrame):
    """
    Train (or reuse) one CatBoost model per CV seed and fold, and write the
    ensemble metadata file.

    The tuned parameters, the selected features and the CV seeds are read
    from ``BEST_TRIAL_PATH``. For every seed the ``TRAIN_YEAR`` data is split
    with ``get_cv_splitter``; each fold model is fit on the training part of
    its fold with the held-out part as ``eval_set`` and saved under
    ``ENSEMBLE_DIR``. A model file that already exists is reused unless
    ``FORCE_RETRAIN`` is set.

    Args:
        df: Full dataset, passed to ``build_x_and_y``.

    Returns:
        The metadata dict that is also written to ``ENSEMBLE_META_PATH``.

    Raises:
        FileNotFoundError: If ``BEST_TRIAL_PATH`` does not exist.
        ValueError: If ``selected_features`` or ``cv_seeds`` is missing or
            empty in the best-trial ``user_attrs``.
    """
    if not BEST_TRIAL_PATH.exists():
        raise FileNotFoundError(f"Best trial JSON not found: {BEST_TRIAL_PATH}")

    best_trial = json.loads(BEST_TRIAL_PATH.read_text(encoding="utf-8"))

    tuned_params = dict(best_trial["params"])
    trial_attrs = best_trial.get("user_attrs", {})

    selected_features = trial_attrs.get("selected_features")
    if not selected_features:
        raise ValueError("selected_features not found in study_best.json user_attrs.")

    cv_seeds = trial_attrs.get("cv_seeds")
    if not cv_seeds:
        raise ValueError("cv_seeds not found in study_best.json user_attrs.")

    # top_k is dropped before the params reach CatBoost (presumably a
    # feature-selection knob of the Optuna study — confirm in run_optuna).
    tuned_params.pop("top_k", None)

    # Fixed settings first; tuned values override them on key collisions.
    base_cb_params = {
        "loss_function": "RMSE",
        "eval_metric": "RMSE",
        "verbose": False,
        "allow_writing_files": False,
        "task_type": "CPU",
        "od_type": "Iter",
        "od_wait": 50,
    }
    base_cb_params.update(tuned_params)

    _, X_train, y_train = build_x_and_y(df, TRAIN_YEAR, selected_features)

    ENSEMBLE_DIR.mkdir(parents=True, exist_ok=True)

    model_files = []
    per_model_info = []
    used_stratified_any = False

    for raw_seed in cv_seeds:
        seed = int(raw_seed)
        splitter, strat_labels = get_cv_splitter(y_train, seed=seed)

        if strat_labels is None:
            folds = splitter.split(X_train)
        else:
            used_stratified_any = True
            folds = splitter.split(X_train, strat_labels)

        for fold_idx, (tr_idx, va_idx) in enumerate(folds):
            model_path = ENSEMBLE_DIR / f"model_seed{seed}_fold{fold_idx}.cbm"
            record = {"cv_seed": seed, "fold": int(fold_idx), "path": str(model_path)}

            if not model_path.exists() or FORCE_RETRAIN:
                fold_params = {**base_cb_params, "random_seed": seed + int(fold_idx)}

                model = CatBoostRegressor(**fold_params)
                model.fit(
                    X_train.iloc[tr_idx],
                    y_train.iloc[tr_idx],
                    eval_set=(X_train.iloc[va_idx], y_train.iloc[va_idx]),
                    use_best_model=True,
                )
                model.save_model(str(model_path))

                best_iter = model.get_best_iteration()
                record["trained"] = True
                record["best_iteration"] = None if best_iter is None else int(best_iter)
            else:
                # Reused from disk: no best_iteration is recorded for it.
                record["trained"] = False

            model_files.append(str(model_path))
            per_model_info.append(record)

    meta = {
        "train_year": int(TRAIN_YEAR),
        "n_splits": int(N_SPLITS),
        "reg_strat_bins": int(REG_STRAT_BINS),
        "cv_seeds": [int(s) for s in cv_seeds],
        "used_stratified_any": bool(used_stratified_any),
        "selected_features": list(selected_features),
        "catboost_params_base": base_cb_params,
        "model_files": model_files,
        "per_model_info": per_model_info,
        "note": "Ensemble is the mean of CV-fold models (trained on K-1 folds each).",
    }

    ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
    ENSEMBLE_META_PATH.write_text(json.dumps(meta, indent=2), encoding="utf-8")

    print(f"[ensemble] saved meta → {ENSEMBLE_META_PATH}")
    print(f"[ensemble] models in   → {ENSEMBLE_DIR}  (count={len(model_files)})")

    return meta


# ============================================================
# Load ensemble and predict
# ============================================================
def load_ensemble_models(model_files: list[str]):
    """
    Load every saved CatBoost model listed in ``model_files``.

    Args:
        model_files: Paths of the saved model files, in ensemble order.

    Returns:
        A list of loaded ``CatBoostRegressor`` objects, in the same order.

    Raises:
        FileNotFoundError: As soon as one listed path does not exist.
    """
    loaded = []
    for file_name in model_files:
        model_path = Path(file_name)
        if model_path.exists():
            model = CatBoostRegressor()
            model.load_model(str(model_path))
            loaded.append(model)
        else:
            raise FileNotFoundError(f"Ensemble model missing: {model_path}")
    return loaded


def ensemble_predict(models, X: pd.DataFrame):
    """
    Predict with every model and average the predictions.

    Args:
        models: Iterable of fitted models exposing ``predict(X)``.
        X: Feature matrix handed unchanged to each model.

    Returns:
        ``(mean_pred, preds)`` where ``preds`` stacks one row of predictions
        per model and ``mean_pred`` is its mean over the model axis.

    Raises:
        ValueError: If ``models`` is empty.
    """
    # Materialise once so generators are accepted and emptiness can be checked.
    models = list(models)
    if not models:
        # Fail with a clear message instead of the opaque error np.vstack
        # raises on an empty list (same exception type as before).
        raise ValueError("ensemble_predict needs at least one model; got an empty ensemble.")

    preds = np.vstack([m.predict(X) for m in models])
    return preds.mean(axis=0), preds


# ============================================================
# Main
# ============================================================
def main():
    """
    Train or reuse the ensemble, then evaluate it on ``EVAL_YEAR``.

    The ensemble metadata at ``ENSEMBLE_META_PATH`` is reused when it exists,
    lists at least one model file and ``FORCE_RETRAIN`` is off; otherwise
    ``train_ensemble_models`` is run. Predictions and metrics are computed in
    log1p(incidence) space, written as a CSV and a JSON file under
    ``ARTIFACTS_DIR``, and the metrics are printed.
    """
    df = pd.read_parquet(DATA_PATH)

    # meta stays None whenever the ensemble has to be (re)trained.
    meta = None
    if ENSEMBLE_META_PATH.exists() and not FORCE_RETRAIN:
        meta = json.loads(ENSEMBLE_META_PATH.read_text(encoding="utf-8"))
        if not meta.get("model_files", []):
            print("[ensemble] meta exists but model_files empty → retraining.")
            meta = None
    if meta is None:
        meta = train_ensemble_models(df)

    selected_features = meta["selected_features"]
    model_files = meta["model_files"]

    ids, X_eval, y_true = build_x_and_y(df, EVAL_YEAR, selected_features)

    models = load_ensemble_models(model_files)
    y_pred, preds_matrix = ensemble_predict(models, X_eval)
    n_models = len(models)

    # Metrics are computed on the log1p scale the models were trained on.
    rmse = root_mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    prediction_cols = pd.DataFrame(
        {
            "y_true_log1p": y_true,
            "y_pred_log1p": y_pred,
            "y_true_incid": np.expm1(y_true),
            "y_pred_incid": np.expm1(y_pred),
            "residual_log1p": (y_true - y_pred),
            "n_models": n_models,
        }
    )
    out = pd.concat([ids, prediction_cols], axis=1)

    ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
    pred_path = ARTIFACTS_DIR / f"predictions_{EVAL_YEAR}_from_train{TRAIN_YEAR}_ensemble.csv"
    out.to_csv(pred_path, index=False)

    metrics = {
        "train_year": int(TRAIN_YEAR),
        "eval_year": int(EVAL_YEAR),
        "metric_space": "log1p(incidence)",
        "rmse": float(rmse),
        "mae": float(mae),
        "r2": float(r2),
        "n_rows": int(len(out)),
        "n_models": int(n_models),
        "ensemble_dir": str(ENSEMBLE_DIR),
        "ensemble_meta": str(ENSEMBLE_META_PATH),
    }
    metrics_json = json.dumps(metrics, indent=2)
    metrics_path = ARTIFACTS_DIR / f"metrics_{EVAL_YEAR}_from_train{TRAIN_YEAR}_ensemble.json"
    metrics_path.write_text(metrics_json, encoding="utf-8")

    print(f"=== {EVAL_YEAR} evaluation (ENSEMBLE) ===")
    print(metrics_json)
    print(f"Saved predictions → {pred_path}")
    print(f"Saved metrics     → {metrics_path}")

In [4]:
# Notebook entry point: train or reuse the ensemble, then evaluate it.
main()

[ensemble] saved meta → artifacts\final_ensemble_train2023_meta.json
[ensemble] models in   → artifacts\ensemble_train2023  (count=15)
=== 2024 evaluation (ENSEMBLE) ===
{
  "train_year": 2023,
  "eval_year": 2024,
  "metric_space": "log1p(incidence)",
  "rmse": 3.0782757513415113,
  "mae": 2.694206722801272,
  "r2": -1.0176729681480925,
  "n_rows": 527,
  "n_models": 15,
  "ensemble_dir": "artifacts\\ensemble_train2023",
  "ensemble_meta": "artifacts\\final_ensemble_train2023_meta.json"
}
Saved predictions → artifacts\predictions_2024_from_train2023_ensemble.csv
Saved metrics     → artifacts\metrics_2024_from_train2023_ensemble.json
