# In [None]:  (notebook cell prompt, kept as a comment so the file parses as Python)
# -*- coding: utf-8 -*-
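# Purpose: predict y_dir_6h with a TCN, validate with walk-forward splits
# (nominally 3 folds; the actual count depends on the data span), calibrate
# the predicted probabilities, and log everything to MLflow.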

import os
from pathlib import Path
from typing import List, Tuple, Dict
import numpy as np
import pandas as pd
import joblib
import mlflow

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, brier_score_loss, precision_recall_fscore_support
from sklearn.isotonic import IsotonicRegression

import tensorflow as tf
from tensorflow import keras
from tcn import TCN

# ========== 0) Configuration ==========
# Sequence and evaluation settings.
LOOKBACK = 16         # rows of history fed to the model per sample
TEST_DAYS = 90        # trailing days held out as the test set
TRAIN_MIN_DAYS = 120  # if data is scarce, train on at least this many days
FREQ_PER_DAY = 4      # 6h bars -> four rows per day

# Input files and the output directory for artifacts (created on import).
FEAT_PATH = Path("./data/feat_6h.parquet")
LABEL_PATH = Path("./data/label_6h.parquet")
ART_DIR = Path("./artifacts_tcn")
ART_DIR.mkdir(parents=True, exist_ok=True)

# MLflow tracking location and experiment name.
TRACKING_URI = "mlruns"
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment("tcn_6h")

# ========== 1) Load and merge ==========
feat = pd.read_parquet(FEAT_PATH)
label = pd.read_parquet(LABEL_PATH)

# Inner-join features and labels on the timestamp column, then order by time.
df = pd.merge(feat, label, on="time", how="inner")
df = df.sort_values("time").reset_index(drop=True)
times = pd.to_datetime(df["time"], utc=True)

# Target vector and feature matrix (timestamp and both label columns removed).
y = df["y_dir_6h"].astype(int).values
X = df.drop(columns=["time", "y_dir_6h", "y_tail_6h"]).values

# ========== 2) Train/test split (last TEST_DAYS days are the test set) ==========
test_start_ts = times.max() - pd.Timedelta(days=TEST_DAYS)
train_mask = times < test_start_ts
test_mask = times >= test_start_ts

X_train_raw = X[train_mask]
y_train_raw = y[train_mask]
t_train = times[train_mask]

X_test_raw = X[test_mask]
y_test_raw = y[test_mask]
t_test = times[test_mask]

# Feature scaling: statistics are fitted on the training rows only and then
# applied unchanged to the test rows.
scaler = StandardScaler()
scaler.fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
joblib.dump(scaler, ART_DIR / "scaler.pkl")

# ========== 3) Build sequence data ==========
def make_sequences(X2d: np.ndarray, y1d: np.ndarray, lookback: int) -> Tuple[np.ndarray, np.ndarray]:
    """Build fixed-length look-back windows and their aligned targets.

    Sample ``k`` pairs rows ``k .. k + lookback - 1`` of ``X2d`` with the
    label ``y1d[k + lookback]``, i.e. each window ends on the row immediately
    before the labelled row.

    Args:
        X2d: 2-D array of shape ``(n_rows, n_features)``.
        y1d: 1-D array with one label per row of ``X2d``.
        lookback: Number of consecutive rows in each window (``>= 0``).

    Returns:
        ``(Xseq, yseq)`` as ``float32`` arrays of shape
        ``(n_seq, lookback, n_features)`` and ``(n_seq,)`` where
        ``n_seq = max(n_rows - lookback, 0)``. When there are too few rows to
        form a window the arrays are empty but keep their full rank, so
        callers can still read ``Xseq.shape[-1]``.

    Raises:
        ValueError: If ``lookback`` is negative or ``X2d`` and ``y1d`` differ
            in length.
    """
    if lookback < 0:
        raise ValueError(f"lookback must be non-negative, got {lookback}")

    X2d = np.asarray(X2d)
    y1d = np.asarray(y1d)
    if len(y1d) != len(X2d):
        raise ValueError(
            f"X2d and y1d must have the same length, got {len(X2d)} and {len(y1d)}"
        )

    n_seq = max(len(X2d) - lookback, 0)

    # Row-index matrix: row k holds [k, k + 1, ..., k + lookback - 1]. Fancy
    # indexing with it gathers every window in a single call and yields a
    # correctly shaped (0, lookback, n_features) array when n_seq == 0.
    window_rows = np.arange(n_seq)[:, None] + np.arange(lookback)[None, :]
    Xseq = X2d[window_rows].astype(np.float32)
    yseq = y1d[lookback:lookback + n_seq].astype(np.float32)
    return Xseq, yseq

# Window the scaled train and test matrices into model-ready sequences.
X_train_seq, y_train_seq = make_sequences(X2d=X_train, y1d=y_train_raw, lookback=LOOKBACK)
X_test_seq, y_test_seq = make_sequences(X2d=X_test, y1d=y_test_raw, lookback=LOOKBACK)

# Keep timestamps aligned with the training sequences for the walk-forward
# splits: the first LOOKBACK rows have no complete window and are dropped.
t_train_seq = t_train.iloc[LOOKBACK:]

# ========== 4) Walk-forward fold splits ==========
def gen_walk_forward_splits(times_seq: pd.Series, train_days: int = 270, val_days: int = 90) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Generate rolling walk-forward (train, validation) index splits.

    Starting at the earliest timestamp, each fold trains on a window of
    ``train_days`` days and validates on the following ``val_days`` days. The
    window then slides forward by ``val_days`` until the validation window
    would extend past the latest timestamp. A fold is kept only if it has more
    than ``2 * LOOKBACK`` training samples and more than ``LOOKBACK``
    validation samples (``LOOKBACK`` is the module-level constant).

    Args:
        times_seq: Timestamps, one per sample, in sample order.
        train_days: Length of each training window in days.
        val_days: Length of each validation window in days; also the step by
            which the window advances. Must be positive.

    Returns:
        A list of ``(train_positions, val_positions)`` integer arrays indexing
        into ``times_seq`` positionally. If no fold qualifies, a single
        chronological 70/30 split is returned instead (both arrays are empty
        when ``times_seq`` is empty).

    Raises:
        ValueError: If ``val_days`` is not positive, because the window could
            never advance.
    """
    if val_days <= 0:
        raise ValueError(f"val_days must be positive, got {val_days}")

    train_span = pd.Timedelta(days=train_days)
    val_span = pd.Timedelta(days=val_days)
    start_date = times_seq.min()
    end_date = times_seq.max()

    splits = []
    # An empty (or all-NaT) series yields NaT bounds. Every comparison against
    # NaT is False, so the exit test below would never fire; skip the loop and
    # go straight to the fallback instead of spinning forever.
    has_bounds = not (pd.isna(start_date) or pd.isna(end_date))
    cursor = start_date
    while has_bounds:
        train_end = cursor + train_span
        val_end = train_end + val_span
        if val_end > end_date:
            break
        train_mask = (times_seq >= cursor) & (times_seq < train_end)
        val_mask = (times_seq >= train_end) & (times_seq < val_end)
        if train_mask.sum() > LOOKBACK * 2 and val_mask.sum() > LOOKBACK:
            splits.append((np.where(train_mask)[0], np.where(val_mask)[0]))
        cursor = cursor + val_span

    # If no valid split could be produced, fall back to one 70/30 split.
    if not splits:
        n = len(times_seq)
        cut = int(n * 0.7)
        splits = [(np.arange(0, cut), np.arange(cut, n))]
    return splits

# Walk-forward folds over the training sequences (default window lengths).
cv_splits = gen_walk_forward_splits(times_seq=pd.Series(t_train_seq))

# ========== 5) Model-building function ==========
def build_tcn(input_shape, nb_filters=64, kernel_size=3, dilations=(1,2,4,8,16,32), dropout_rate=0.1) -> keras.Model:
    """Create and compile a TCN binary classifier.

    The network is a single ``TCN`` layer (causal padding, ReLU activation,
    last time step only) followed by a one-unit sigmoid ``Dense`` head.

    Args:
        input_shape: Shape of one sample, passed to ``keras.Input``.
        nb_filters: Forwarded to ``TCN``.
        kernel_size: Forwarded to ``TCN``.
        dilations: Forwarded to ``TCN``.
        dropout_rate: Forwarded to ``TCN``.

    Returns:
        A compiled ``keras.Model`` using Adam (learning rate 1e-3),
        binary cross-entropy loss and an AUC metric named ``"auc"``.
    """
    inputs = keras.Input(shape=input_shape)

    tcn_layer = TCN(
        nb_filters=nb_filters,
        kernel_size=kernel_size,
        dilations=dilations,
        dropout_rate=dropout_rate,
        activation='relu',
        padding='causal',
        return_sequences=False,
    )
    encoded = tcn_layer(inputs)

    head = keras.layers.Dense(1, activation="sigmoid")
    outputs = head(encoded)

    model = keras.Model(inputs, outputs)
    optimizer = keras.optimizers.Adam(learning_rate=1e-3)
    # The metric name "auc" is what EarlyStopping's "val_auc" monitor refers to.
    auc_metric = keras.metrics.AUC(name="auc")
    model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=[auc_metric])
    return model

# Candidate hyper-parameter sets; each dict is unpacked into build_tcn(**params).
param_grid = [
    {"nb_filters": 64, "kernel_size": 3, "dilations": (1, 2, 4, 8, 16, 32), "dropout_rate": 0.1},
    {"nb_filters": 64, "kernel_size": 5, "dilations": (1, 2, 4, 8, 16), "dropout_rate": 0.1},
    {"nb_filters": 96, "kernel_size": 3, "dilations": (1, 2, 4, 8, 16, 32), "dropout_rate": 0.2},
]

# ========== 6) Hyper-parameter search (walk-forward CV, nominally 3 folds) ==========
# For every candidate in param_grid a fresh TCN is trained on each fold in
# cv_splits and scored by validation ROC AUC; the candidate with the highest
# mean AUC wins.
# NOTE: early stopping monitors the same fold that is scored afterwards, so the
# reported CV AUC is optimistic.
best_params = None
best_cv_auc = -np.inf
oof_preds = []   # validation-fold predictions of the winning config (calibration input)
oof_labels = []  # matching validation-fold labels

with mlflow.start_run(run_name="tcn_cv"):
    for cfg_idx, params in enumerate(param_grid):
        fold_aucs = []
        for (tr_idx, va_idx) in cv_splits:
            model = build_tcn(
                input_shape=(LOOKBACK, X_train_seq.shape[-1]),
                **params
            )
            es = keras.callbacks.EarlyStopping(
                monitor="val_auc", mode="max", patience=10, restore_best_weights=True
            )
            model.fit(
                X_train_seq[tr_idx], y_train_seq[tr_idx],
                validation_data=(X_train_seq[va_idx], y_train_seq[va_idx]),
                epochs=100,
                batch_size=128,
                callbacks=[es],
                verbose=0
            )
            va_pred = model.predict(X_train_seq[va_idx], verbose=0).ravel()
            auc = roc_auc_score(y_train_seq[va_idx], va_pred)
            fold_aucs.append(auc)
        mean_auc = float(np.mean(fold_aucs))
        # MLflow only accepts alphanumerics, "_", "-", ".", " " and "/" in
        # param/metric names, so the dict repr cannot be part of the key.
        # Key by grid index instead and store the dict as the param value.
        mlflow.log_param(f"cfg_{cfg_idx}_params", str(params))
        mlflow.log_metric(f"cfg_{cfg_idx}_cv_auc", mean_auc)
        if mean_auc > best_cv_auc:
            best_cv_auc = mean_auc
            best_params = params

    # Re-run the folds with the best parameters and collect every validation
    # prediction; these out-of-fold predictions are used to fit the calibrator.
    for (tr_idx, va_idx) in cv_splits:
        model = build_tcn(input_shape=(LOOKBACK, X_train_seq.shape[-1]), **best_params)
        es = keras.callbacks.EarlyStopping(monitor="val_auc", mode="max", patience=10, restore_best_weights=True)
        model.fit(
            X_train_seq[tr_idx], y_train_seq[tr_idx],
            validation_data=(X_train_seq[va_idx], y_train_seq[va_idx]),
            epochs=100,
            batch_size=128,
            callbacks=[es],
            verbose=0
        )
        va_pred = model.predict(X_train_seq[va_idx], verbose=0).ravel()
        oof_preds.append(va_pred)
        oof_labels.append(y_train_seq[va_idx])

    oof_preds = np.concatenate(oof_preds)
    oof_labels = np.concatenate(oof_labels)
    mlflow.log_metric("best_cv_auc", float(best_cv_auc))

# ========== 7) Final model (trained on the training period) ==========
final_model = build_tcn(input_shape=(LOOKBACK, X_train_seq.shape[-1]), **best_params)
es = keras.callbacks.EarlyStopping(monitor="val_auc", mode="max", patience=12, restore_best_weights=True)
# Hold out the last 10% of the training sequences as an internal validation
# set so early stopping has something to monitor.
cut = int(len(X_train_seq) * 0.9)
final_model.fit(
    X_train_seq[:cut], y_train_seq[:cut],
    validation_data=(X_train_seq[cut:], y_train_seq[cut:]),
    epochs=120,
    batch_size=128,
    callbacks=[es],
    verbose=0
)

# Test-set evaluation on the raw (uncalibrated) probabilities.
proba_test = final_model.predict(X_test_seq, verbose=0).ravel()
auc_test = roc_auc_score(y_test_seq, proba_test)
brier_test = brier_score_loss(y_test_seq, proba_test)
# scikit-learn's signature is (y_true, y_pred); passing them the other way
# round silently swaps precision and recall.
prec, rec, f1, _ = precision_recall_fscore_support(y_test_seq, (proba_test >= 0.5).astype(int), average="binary", zero_division=0)

# ========== 8) Probability calibration (isotonic) ==========
# The calibrator is fitted on the out-of-fold CV predictions, then applied to
# the test probabilities; inputs outside the fitted range are clipped.
calibrator = IsotonicRegression(out_of_bounds="clip").fit(oof_preds, oof_labels)
proba_test_cal = calibrator.transform(proba_test)
brier_test_cal = brier_score_loss(y_test_seq, proba_test_cal)

# Persist the model and the calibrator.
final_model.save(ART_DIR / "tcn_model.h5")
joblib.dump(calibrator, ART_DIR / "tcn_calibrator.pkl")

with mlflow.start_run(run_name="tcn_final"):
    mlflow.log_param("lookback", LOOKBACK)
    mlflow.log_param("best_params", str(best_params))
    mlflow.log_metric("test_auc_raw", float(auc_test))
    mlflow.log_metric("test_brier_raw", float(brier_test))
    mlflow.log_metric("test_brier_cal", float(brier_test_cal))
    # "@" is not an allowed character in MLflow metric names, so the
    # threshold is spelled out with "_at_".
    mlflow.log_metric("precision_at_0.5", float(prec))
    mlflow.log_metric("recall_at_0.5", float(rec))
    mlflow.log_metric("f1_at_0.5", float(f1))
    mlflow.log_artifact(str(ART_DIR / "tcn_model.h5"))
    mlflow.log_artifact(str(ART_DIR / "tcn_calibrator.pkl"))
    mlflow.log_artifact(str(ART_DIR / "scaler.pkl"))

print({
    "best_cv_auc": float(best_cv_auc),
    "test_auc_raw": float(auc_test),
    "test_brier_raw": float(brier_test),
    "test_brier_cal": float(brier_test_cal),
})
