# In [None]:
# -*- coding: utf-8 -*-
# 目的：以 LightGBM 預測 y_dir_6h，WF 調參，校準機率，記錄 MLflow
# 並加入 Logistic Regression baseline 檢查資料信號

from pathlib import Path
import numpy as np
import pandas as pd
import joblib
import mlflow
from sklearn.metrics import roc_auc_score, brier_score_loss, precision_recall_fscore_support
from sklearn.isotonic import IsotonicRegression
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

# ========== 0) Parameters ==========
# Input tables (features and labels) and the directory that receives the
# artifacts written by this script; the directory is created if missing.
FEAT_PATH = Path("./data/feat_6h.parquet")
LABEL_PATH = Path("./data/label_6h.parquet")
ART_DIR = Path("./artifacts_lgbm")
ART_DIR.mkdir(parents=True, exist_ok=True)

# Number of most recent days held out as the test set.
TEST_DAYS = 90
# MLflow tracking location passed to mlflow.set_tracking_uri below.
TRACKING_URI = "mlruns"

# Point MLflow at the tracking store and select the experiment for all runs.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment("lgbm_6h")

# ========== 1) Data ==========
# Load features and labels and keep only the timestamps present in both,
# ordered chronologically.
feat = pd.read_parquet(FEAT_PATH)
label = pd.read_parquet(LABEL_PATH)

df = feat.merge(label, on="ts_utc", how="inner")
df = df.sort_values("ts_utc").reset_index(drop=True)
times = pd.to_datetime(df["ts_utc"], utc=True)

# Target: y_dir_6h cast to int. Features: every numeric column that is not the
# timestamp or one of the two label columns.
y = df["y_dir_6h"].astype(int).values
feature_cols = (
    df.drop(columns=["ts_utc", "y_dir_6h", "y_tail_6h"])
    .select_dtypes(include=[np.number])
    .columns
)
X = df[feature_cols].values

# Train/test split: the final TEST_DAYS days (counted back from the latest
# timestamp) form the test set, everything earlier is training data.
test_start_ts = times.max() - pd.Timedelta(days=TEST_DAYS)
train_mask = times < test_start_ts
test_mask = times >= test_start_ts

X_train_raw, X_test_raw = X[train_mask], X[test_mask]
y_train_raw, y_test_raw = y[train_mask], y[test_mask]
t_train, t_test = times[train_mask], times[test_mask]

# Scaling: reuse a previously saved scaler when one exists, otherwise fit a
# StandardScaler on the training rows only.
# NOTE(review): the reused scaler comes from ./artifacts_tcn; presumably it was
# fit on the same feature columns and training window -- confirm.
scaler_path = Path("./artifacts_tcn/scaler.pkl")
scaler = (
    joblib.load(scaler_path)
    if scaler_path.exists()
    else StandardScaler().fit(X_train_raw)
)

X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# ========== 2) Walk-Forward 切分 ==========
def gen_walk_forward_splits(times_arr: pd.Series, train_days=270, val_days=90):
    """Build rolling walk-forward (train, validation) index splits.

    Starting at the earliest timestamp, each split trains on a window of
    ``train_days`` days and validates on the ``val_days`` days immediately
    after it. The window then slides forward by ``val_days``. Generation
    stops once a validation window would end after the latest timestamp.
    Windows with 200 or fewer training rows or 50 or fewer validation rows
    are skipped.

    Args:
        times_arr: Timestamps, one per row. Returned indices are positions
            within this series.
        train_days: Length of each training window, in days.
        val_days: Length of each validation window and the slide step, in
            days. Must be positive, otherwise the window could never advance.

    Returns:
        A list of ``(train_idx, val_idx)`` tuples of positional integer
        arrays. When no window qualifies (including empty or all-NaT input),
        a single positional split is returned instead: the first 70% of the
        rows for training and the remainder for validation.

    Raises:
        ValueError: If ``val_days`` is not positive.
    """
    if val_days <= 0:
        raise ValueError(f"val_days must be positive, got {val_days!r}")

    splits = []
    start_date = times_arr.min()
    end_date = times_arr.max()

    # Empty or all-NaT input yields NaT bounds. Every comparison against NaT
    # is False, so the sliding loop would never hit its stop condition; skip
    # it and let the fallback below handle this case.
    if not (pd.isna(start_date) or pd.isna(end_date)):
        train_span = pd.Timedelta(days=train_days)
        val_span = pd.Timedelta(days=val_days)
        cursor = start_date
        while True:
            train_end = cursor + train_span
            val_end = train_end + val_span
            if val_end > end_date:
                break
            tr_idx = np.asarray((times_arr >= cursor) & (times_arr < train_end))
            va_idx = np.asarray((times_arr >= train_end) & (times_arr < val_end))
            # Only keep windows with enough rows on both sides.
            if tr_idx.sum() > 200 and va_idx.sum() > 50:
                splits.append((np.where(tr_idx)[0], np.where(va_idx)[0]))
            cursor = cursor + val_span

    if not splits:
        # Fallback: one positional 70/30 split over all rows.
        n = len(times_arr)
        cut = int(n * 0.7)
        splits = [(np.arange(0, cut), np.arange(cut, n))]
    return splits

# Walk-forward folds built over the training period only.
cv_splits = gen_walk_forward_splits(pd.Series(t_train))

# ========== 3) Hyper-parameter tuning ==========
# Baseline search space with relaxed limits, to avoid constant predictions.
# NOTE: LightGBM documents `min_child_samples` as an alias of
# `min_data_in_leaf`. Listing both in the grid gives every combination two
# values for one setting (one of them is ignored) and multiplies the number of
# fits for no gain, so the candidate values are merged under a single key.
param_grid = {
    "num_leaves": [31, 63, 127],
    "max_depth": [-1, 7, 15],
    "learning_rate": [0.05, 0.1],
    "colsample_bytree": [0.7, 1.0],
    "min_child_samples": [5, 10, 20],
    "reg_lambda": [0, 1],
}
grid = list(ParameterGrid(param_grid))

# Best configuration seen so far, plus the out-of-fold predictions/labels that
# are collected afterwards for the best configuration.
best_params = None
best_cv_auc = -np.inf
oof_preds, oof_labels = [], []


def _fit_fold(lgbm_params, fit_rows, eval_rows):
    """Fit one LightGBM fold model and score its validation rows.

    The model is trained on ``fit_rows`` of the scaled training matrix with
    early stopping monitored on ``eval_rows``. Returns the fitted classifier
    and the positive-class probabilities for ``eval_rows``.
    """
    model = lgb.LGBMClassifier(
        objective="binary",
        metric="auc",
        n_estimators=2000,
        **lgbm_params
    )
    model.fit(
        X_train[fit_rows], y_train_raw[fit_rows],
        eval_set=[(X_train[eval_rows], y_train_raw[eval_rows])],
        eval_metric="auc",
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
    )
    return model, model.predict_proba(X_train[eval_rows])[:, 1]


with mlflow.start_run(run_name="lgbm_cv") as parent_run:
    # Grid search: one nested MLflow run per parameter set, scored by the mean
    # validation AUC across the walk-forward folds.
    for i, params in enumerate(grid):
        fold_aucs = []
        with mlflow.start_run(run_name=f"param_set_{i}", nested=True):
            for tr_idx, va_idx in cv_splits:
                clf, va_pred = _fit_fold(params, tr_idx, va_idx)
                fold_aucs.append(roc_auc_score(y_train_raw[va_idx], va_pred))

            mean_auc = float(np.mean(fold_aucs))
            for k, v in params.items():
                mlflow.log_param(k, v)
            mlflow.log_metric("cv_auc", mean_auc)

            # Strict ">" keeps the earliest parameter set on ties.
            if mean_auc > best_cv_auc:
                best_cv_auc, best_params = mean_auc, params

    # Collect out-of-fold predictions by refitting each fold with the best
    # parameters.
    for tr_idx, va_idx in cv_splits:
        clf, va_pred = _fit_fold(best_params, tr_idx, va_idx)
        oof_preds.append(va_pred)
        oof_labels.append(y_train_raw[va_idx])

    oof_preds = np.concatenate(oof_preds)
    oof_labels = np.concatenate(oof_labels)
    mlflow.log_metric("best_cv_auc", float(best_cv_auc))

# ========== 4) Final model ==========
# Refit with the best CV parameters. The last 10% of the time-ordered training
# rows are held out only to drive early stopping.
final_clf = lgb.LGBMClassifier(
    objective="binary",
    metric="auc",
    n_estimators=3000,
    **best_params
)
cut = int(len(X_train) * 0.9)
final_clf.fit(
    X_train[:cut], y_train_raw[:cut],
    eval_set=[(X_train[cut:], y_train_raw[cut:])],
    eval_metric="auc",
    callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
)

# Raw (uncalibrated) metrics on the held-out test period.
proba_test = final_clf.predict_proba(X_test)[:, 1]
auc_test = roc_auc_score(y_test_raw, proba_test)
brier_test = brier_score_loss(y_test_raw, proba_test)
# scikit-learn expects (y_true, y_pred): true labels first, thresholded
# predictions second. Passing them the other way round swaps the reported
# precision and recall.
prec, rec, f1, _ = precision_recall_fscore_support(
    y_test_raw,
    (proba_test >= 0.5).astype(int),
    average="binary",
    zero_division=0,
)

# Calibration: fit an isotonic map on the out-of-fold predictions, then apply
# it to the test probabilities (inputs outside the fitted range are clipped).
calibrator = IsotonicRegression(out_of_bounds="clip").fit(oof_preds, oof_labels)
proba_test_cal = calibrator.transform(proba_test)
brier_test_cal = brier_score_loss(y_test_raw, proba_test_cal)

# Persist the model (pickle and native LightGBM text format), the calibrator
# and the scaler into the artifact directory.
joblib.dump(final_clf, ART_DIR / "lgbm_model.pkl")
final_clf.booster_.save_model(str(ART_DIR / "lgbm_model.txt"))
for fitted_obj, file_name in (
    (calibrator, "lgbm_calibrator.pkl"),
    (scaler, "scaler.pkl"),
):
    joblib.dump(fitted_obj, ART_DIR / file_name)

# Record the chosen parameters, test metrics and saved files in a dedicated
# MLflow run.
with mlflow.start_run(run_name="lgbm_final"):
    for k, v in best_params.items():
        mlflow.log_param(k, v)

    test_metrics = {
        "test_auc_raw": float(auc_test),
        "test_brier_raw": float(brier_test),
        "test_brier_cal": float(brier_test_cal),
        "precision_thr_0_5": float(prec),
        "recall_thr_0_5": float(rec),
        "f1_thr_0_5": float(f1),
    }
    for metric_name, metric_value in test_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    for artifact_name in (
        "lgbm_model.pkl",
        "lgbm_model.txt",
        "lgbm_calibrator.pkl",
        "scaler.pkl",
    ):
        mlflow.log_artifact(str(ART_DIR / artifact_name))

# Console summary of the headline numbers.
summary = {"best_cv_auc": float(best_cv_auc)}
summary["test_auc_raw"] = float(auc_test)
summary["test_brier_raw"] = float(brier_test)
summary["test_brier_cal"] = float(brier_test_cal)
print(summary)

# ========== 5) Logistic Regression baseline ==========
# Linear baseline trained on the same scaled features, used as a check on how
# much signal the data carries.
print("\n===== Logistic Regression baseline =====")
log_clf = LogisticRegression(max_iter=200, solver="lbfgs").fit(X_train, y_train_raw)
log_proba = log_clf.predict_proba(X_test)[:, 1]

# Same test-set metrics as reported for the LightGBM model (AUC and Brier).
auc_log = roc_auc_score(y_test_raw, log_proba)
brier_log = brier_score_loss(y_test_raw, log_proba)

print({"logreg_auc": float(auc_log), "logreg_brier": float(brier_log)})
