In [None]:
# -*- coding: utf-8 -*-
# 目的：以 LightGBM 預測 y_dir_6h，WF 調參，校準機率，記錄 MLflow

from pathlib import Path
from typing import List, Tuple, Dict
import numpy as np
import pandas as pd
import joblib
import mlflow
from sklearn.metrics import roc_auc_score, brier_score_loss
from sklearn.isotonic import IsotonicRegression
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb

# ========== 0) Parameters ==========
FEAT_PATH = Path("./data/feat_6h.parquet")
LABEL_PATH = Path("./data/label_6h.parquet")
ART_DIR = Path("./artifacts_lgbm")
ART_DIR.mkdir(parents=True, exist_ok=True)

TEST_DAYS = 90  # the trailing TEST_DAYS days of data form the hold-out test set
FREQ_PER_DAY = 4  # not referenced in the visible code; presumably bars per day at 6h -- TODO confirm
TRACKING_URI = "mlruns"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment("lgbm_6h")

# ========== 1) Data ==========
# Join features and labels on their shared "time" column and order the rows
# chronologically so that positional slices are also time-ordered.
feat = pd.read_parquet(FEAT_PATH)
label = pd.read_parquet(LABEL_PATH)
df = feat.merge(label, on="time", how="inner").sort_values("time").reset_index(drop=True)
times = pd.to_datetime(df["time"], utc=True)

y = df["y_dir_6h"].astype(int).values
# Every column except the timestamp and the two label columns is used as a feature.
X = df.drop(columns=["time", "y_dir_6h", "y_tail_6h"]).values

# Chronological split: rows from the last TEST_DAYS days are held out for testing.
test_start_ts = times.max() - pd.Timedelta(days=TEST_DAYS)
train_mask = times < test_start_ts
test_mask = times >= test_start_ts

X_train_raw, y_train_raw, t_train = X[train_mask], y[train_mask], times[train_mask]
X_test_raw, y_test_raw, t_test = X[test_mask], y[test_mask], times[test_mask]

# Scaling: reuse the TCN pipeline's scaler when it exists and matches this
# feature matrix; otherwise fit a new one on the training split only.
scaler_path = Path("./artifacts_tcn/scaler.pkl")
scaler = None
if scaler_path.exists():
    # NOTE(review): joblib.load unpickles arbitrary objects -- only load
    # artifacts produced by this project.
    loaded_scaler = joblib.load(scaler_path)
    fitted_width = getattr(loaded_scaler, "n_features_in_", None)
    if fitted_width is None or fitted_width == X_train_raw.shape[1]:
        scaler = loaded_scaler
    else:
        # A scaler fitted on a different number of columns would make
        # transform() fail, so fall back to a fresh fit instead.
        print(
            f"[warn] {scaler_path} expects {fitted_width} features but data has "
            f"{X_train_raw.shape[1]}; fitting a new StandardScaler instead"
        )
if scaler is None:
    scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# ========== 2) Walk-forward splits ==========
def gen_walk_forward_splits(
    times_arr: pd.Series,
    train_days: float = 270,
    val_days: float = 90,
    min_train_rows: int = 200,
    min_val_rows: int = 50,
) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Build rolling walk-forward (train, validation) index pairs.

    A training window of ``train_days`` is followed immediately by a
    validation window of ``val_days``; both windows then slide forward by
    ``val_days`` until the validation window would extend past the last
    timestamp.

    Args:
        times_arr: Timestamps of the samples, one per row.
        train_days: Length of each training window in days.
        val_days: Length of each validation window (and the slide step) in days.
        min_train_rows: A fold is kept only if its training window holds
            strictly more rows than this.
        min_val_rows: A fold is kept only if its validation window holds
            strictly more rows than this.

    Returns:
        A list of ``(train_positions, val_positions)`` tuples of positional
        integer indices into ``times_arr``. If no window qualifies, a single
        positional 70/30 split is returned instead.

    Raises:
        ValueError: If ``val_days`` is not positive or ``times_arr`` holds no
            valid timestamp (either case would otherwise loop forever).
    """
    if val_days <= 0:
        # The cursor advances by val_days; a non-positive step never terminates.
        raise ValueError("val_days must be positive")

    start_date = times_arr.min()
    end_date = times_arr.max()
    if pd.isna(start_date) or pd.isna(end_date):
        # Comparisons against NaT are always False, so the stop condition
        # below could never fire on empty / all-NaT input.
        raise ValueError("times_arr contains no valid timestamps")

    train_span = pd.Timedelta(days=train_days)
    val_span = pd.Timedelta(days=val_days)

    splits = []
    cursor = start_date
    while True:
        train_end = cursor + train_span
        val_end = train_end + val_span
        if val_end > end_date:
            break
        tr_mask = np.asarray((times_arr >= cursor) & (times_arr < train_end))
        va_mask = np.asarray((times_arr >= train_end) & (times_arr < val_end))
        # Skip windows that are too sparse to train or validate on.
        if tr_mask.sum() > min_train_rows and va_mask.sum() > min_val_rows:
            splits.append((np.flatnonzero(tr_mask), np.flatnonzero(va_mask)))
        cursor = cursor + val_span

    if not splits:
        # Not enough history for any window: fall back to one 70/30 split
        # by row position.
        n = len(times_arr)
        cut = int(n * 0.7)
        splits = [(np.arange(0, cut), np.arange(cut, n))]
    return splits

cv_splits = gen_walk_forward_splits(pd.Series(t_train))

# ========== 3) Hyper-parameter search ==========
param_grid = {
    "num_leaves": [31, 63],
    "max_depth": [-1, 7],
    "learning_rate": [0.1, 0.01],
    "colsample_bytree": [0.7, 1.0],
}
grid = list(ParameterGrid(param_grid))

best_params = None
best_cv_auc = -np.inf
oof_preds = []
oof_labels = []


def _fit_fold(params, tr_idx, va_idx):
    """Fit one LightGBM classifier on a fold and score its validation rows.

    The validation rows double as the early-stopping set.

    Args:
        params: Grid-searched keyword arguments for ``LGBMClassifier``.
        tr_idx: Positional indices of the training rows in ``X_train``.
        va_idx: Positional indices of the validation rows in ``X_train``.

    Returns:
        ``(model, va_pred)`` where ``va_pred`` is the predicted probability
        of the positive class for the validation rows.
    """
    model = lgb.LGBMClassifier(
        objective="binary",
        metric="auc",
        n_estimators=2000,
        reg_lambda=5.0,
        min_child_samples=80,
        **params
    )
    model.fit(
        X_train[tr_idx], y_train_raw[tr_idx],
        eval_set=[(X_train[va_idx], y_train_raw[va_idx])],
        eval_metric="auc",
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )
    return model, model.predict_proba(X_train[va_idx])[:, 1]


with mlflow.start_run(run_name="lgbm_cv"):
    for grid_idx, params in enumerate(grid):
        fold_aucs = []
        for tr_idx, va_idx in cv_splits:
            clf, va_pred = _fit_fold(params, tr_idx, va_idx)
            fold_aucs.append(roc_auc_score(y_train_raw[va_idx], va_pred))
        mean_auc = float(np.mean(fold_aucs))
        # MLflow rejects keys containing characters such as braces, quotes
        # and commas, so the candidate is identified by its grid index and
        # the full parameter dict is stored as the param value.
        mlflow.log_param(f"grid_{grid_idx:02d}_params", str(params))
        mlflow.log_metric(f"cv_auc_grid_{grid_idx:02d}", mean_auc)
        if mean_auc > best_cv_auc:
            best_cv_auc = mean_auc
            best_params = params

    if best_params is None:
        # Only reachable when every candidate's mean AUC was NaN (or the
        # grid was empty); fail with a clear message instead of a TypeError
        # on ``**None`` below.
        raise RuntimeError("grid search produced no candidate with a finite CV AUC")

    # Refit with the best parameters and collect out-of-fold predictions,
    # which are used later to fit the probability calibrator.
    for tr_idx, va_idx in cv_splits:
        clf, va_pred = _fit_fold(best_params, tr_idx, va_idx)
        oof_preds.append(va_pred)
        oof_labels.append(y_train_raw[va_idx])

    oof_preds = np.concatenate(oof_preds)
    oof_labels = np.concatenate(oof_labels)
    mlflow.log_metric("best_cv_auc", float(best_cv_auc))

# ========== 4) Final model ==========
# Fixed settings for the final classifier; the tuned values come from best_params.
final_model_kwargs = dict(
    objective="binary",
    metric="auc",
    n_estimators=3000,
    reg_lambda=5.0,
    min_child_samples=80,
)
final_clf = lgb.LGBMClassifier(**final_model_kwargs, **best_params)

# Internal early stopping: fit on the first 90% of the training rows and
# monitor AUC on the remaining 10%.
cut = int(len(X_train) * 0.9)
X_fit, y_fit = X_train[:cut], y_train_raw[:cut]
X_stop, y_stop = X_train[cut:], y_train_raw[cut:]
final_clf.fit(
    X_fit, y_fit,
    eval_set=[(X_stop, y_stop)],
    eval_metric="auc",
    callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
)

# Uncalibrated test-set performance.
proba_test = final_clf.predict_proba(X_test)[:, 1]
auc_test = roc_auc_score(y_test_raw, proba_test)
brier_test = brier_score_loss(y_test_raw, proba_test)

# Calibration: isotonic regression fitted on the out-of-fold predictions,
# then applied to the test probabilities.
calibrator = IsotonicRegression(out_of_bounds="clip")
calibrator.fit(oof_preds, oof_labels)
proba_test_cal = calibrator.transform(proba_test)
brier_test_cal = brier_score_loss(y_test_raw, proba_test_cal)

# Persist the model (pickle and native text format), calibrator and scaler.
model_pkl_path = ART_DIR / "lgbm_model.pkl"
model_txt_path = ART_DIR / "lgbm_model.txt"
calibrator_path = ART_DIR / "lgbm_calibrator.pkl"
scaler_out_path = ART_DIR / "scaler.pkl"

joblib.dump(final_clf, model_pkl_path)
final_clf.booster_.save_model(str(model_txt_path))
joblib.dump(calibrator, calibrator_path)
joblib.dump(scaler, scaler_out_path)

test_metrics = {
    "test_auc_raw": float(auc_test),
    "test_brier_raw": float(brier_test),
    "test_brier_cal": float(brier_test_cal),
}

# Record the chosen parameters, test metrics and saved files in a separate run.
with mlflow.start_run(run_name="lgbm_final"):
    mlflow.log_param("best_params", str(best_params))
    for metric_name, metric_value in test_metrics.items():
        mlflow.log_metric(metric_name, metric_value)
    for artifact_path in (model_pkl_path, model_txt_path, calibrator_path, scaler_out_path):
        mlflow.log_artifact(str(artifact_path))

print({"best_cv_auc": float(best_cv_auc), **test_metrics})
