In [7]:
# -*- coding: utf-8 -*-
"""
TransactionAlertPro_SOTA.py
Strong Baseline for 2025 Esun AI Challenge — big F1 lift focus

Key upgrades:
1) Richer features: amount buckets (q25/q50/q75), channel entropy, burstiness, daily stats, graph reciprocity/degree
2) Robust parsing + dtype downsizing (float32) for speed/memory
3) LightGBM (GPU if available) + OOF F1 threshold tuning; optional Top-K / target rate thresholding
4) Safe column sanitization + strict train/test parity; outputs feature importance for analysis

Author: ChatGPT
"""

import os
import warnings
from typing import List, Dict, Tuple, Optional

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    f1_score, precision_recall_curve, average_precision_score,
    roc_auc_score, classification_report
)
from sklearn.inspection import permutation_importance

warnings.filterwarnings("ignore")

# Try LightGBM (GPU-friendly); fallback message if missing
try:
    import lightgbm as lgb
    HAS_LGB = True
except Exception:
    HAS_LGB = False
    raise ImportError("lightgbm not installed. Please `conda install -c conda-forge lightgbm` (GPU build preferred).")

# ==========================
# Config
# ==========================
# Pipeline-wide settings. main() takes a shallow copy of this dict and passes it
# to the training and threshold-selection steps.
CONFIG = {
    # Directory that load_csvs() reads the three input CSV files from.
    "data_dir": "data",
    # Number of StratifiedKFold splits used for out-of-fold (OOF) predictions.
    "cv_folds": 5,
    # Seed shared by the CV splitter and the LightGBM models.
    "random_state": 42,

    # Threshold strategy: "f1" (best F1 on the OOF predictions), "topk" (keep a fixed
    # top-K list), "rate" (keep a fixed proportion). Any other value makes
    # choose_threshold() return 0.5.
    "threshold_mode": "f1",
    "topk": 400,                  # if mode == "topk"
    # When left as None, choose_threshold() falls back to a rate of 0.1.
    "target_positive_rate": None, # e.g., 0.08 if mode == "rate"

    # LightGBM settings (auto GPU if possible)
    # NOTE(review): fit_lgbm_kfold() adds GPU device parameters whenever this flag
    # is true; no availability probe is visible in this file.
    "use_gpu_if_available": True,
    "n_estimators": 5000,
    "learning_rate": 0.03,
    "num_leaves": 63,
    "max_depth": -1,
    # Patience passed to the lgb.early_stopping callback.
    "early_stopping_rounds": 200,

    # Safety: cap feature importance rows
    "save_feature_importance_top": 200
}


# ==========================
# Utils
# ==========================
def _to_numeric(s):
    try:
        return pd.to_numeric(s, errors="coerce")
    except Exception:
        return np.nan

def _to_datetime(s):
    try:
        return pd.to_datetime(s, errors="coerce", infer_datetime_format=True)
    except Exception:
        return pd.NaT

def _ensure_int(x):
    try:
        return int(x)
    except Exception:
        return np.nan

def _to_hour(v):
    if pd.isna(v):
        return np.nan
    s = str(v).strip()
    s2 = ''.join(ch for ch in s if (ch.isdigit() or ch == ':'))
    if ':' in s2:
        try:
            hh = int(s2.split(':')[0])
            return hh if 0 <= hh <= 23 else np.nan
        except Exception:
            return np.nan
    try:
        s2 = s2.zfill(6)
        hh = int(s2[:2])
        return hh if 0 <= hh <= 23 else np.nan
    except Exception:
        return np.nan

def _safe_div(a, b):
    """Return aligned Series when inputs are Series; else scalar/array; zeros where invalid."""
    if isinstance(a, (pd.Series, pd.DataFrame)) or isinstance(b, (pd.Series, pd.DataFrame)):
        mask = pd.notna(b) & (b != 0)
        arr = np.where(mask, a / b, 0.0)
        if isinstance(a, pd.Series):
            return pd.Series(arr, index=a.index)
        elif isinstance(b, pd.Series):
            return pd.Series(arr, index=b.index)
        else:
            return arr
    else:
        return (a / b) if (b is not None and b != 0) else 0.0

def _map_is_esun(v):
    if pd.isna(v):
        return np.nan
    s = str(v).strip()
    return 1 if s in {"1", "01", "esun", "ESUN", "玉山"} else 0

def _entropy(probs: np.ndarray) -> float:
    p = probs[probs > 0]
    if p.size == 0:
        return 0.0
    return float(-(p * np.log(p)).sum())

def _sanitize_col(name: str) -> str:
    return (name.replace('[', '_')
                .replace(']', '_')
                .replace('<', '_')
                .replace('>', '_')
                .replace(':', '_')
                .replace('=', '_')
                .replace(' ', '_'))


# ==========================
# I/O
# ==========================
def load_csvs(dir_path: str):
    """Read the three input tables from ``dir_path``.

    Returns:
        ``(df_txn, df_alert, df_test)`` loaded from ``acct_transaction.csv``,
        ``acct_alert.csv`` and ``acct_predict.csv`` respectively.
    """
    file_names = ('acct_transaction.csv', 'acct_alert.csv', 'acct_predict.csv')
    frames = [pd.read_csv(os.path.join(dir_path, fname)) for fname in file_names]
    df_txn, df_alert, df_test = frames
    print("[OK] Loaded datasets.")
    return df_txn, df_alert, df_test


# ==========================
# Feature Engineering
# ==========================
def _map_unique(values: pd.Series, func) -> pd.Series:
    """Apply ``func`` once per distinct non-null value and broadcast the result row-wise.

    Missing values stay missing. This replaces a per-row ``Series.apply`` for
    columns that hold few distinct values.
    """
    lookup = {u: func(u) for u in values.dropna().unique()}
    return values.map(lookup)


def _self_txn_flag(x):
    """Map 'Y' / 'N' (any case, surrounding blanks ignored) to 1 / 0; anything else is NaN."""
    token = str(x).strip().upper()
    if token == 'Y':
        return 1
    if token == 'N':
        return 0
    return np.nan


def _normalize_category(col: pd.Series) -> pd.Series:
    """Stripped string version of ``col`` with missing values replaced by 'UNK'."""
    # Fill BEFORE casting to str: astype(str) would turn NaN into the literal "nan".
    return col.astype(object).where(col.notna(), 'UNK').astype(str).str.strip()


def _group_share(mask: pd.Series, keys: pd.Series, name: str) -> pd.Series:
    """Per-group fraction of rows for which the boolean ``mask`` is true."""
    return mask.groupby(keys, observed=True).mean().rename(name).astype('float32')


def _side_features(df: pd.DataFrame, acct_col: str, peer_col: str, peer_type_col: str,
                   prefix: str, esun_ratio_name: str, top_channels: list, ref_date) -> pd.DataFrame:
    """Aggregate prepared transactions into one row per account for one direction.

    ``acct_col`` is the account being described, ``peer_col`` its counterparty,
    ``prefix`` is 'send' or 'recv'. Returns a frame with an ``acct`` column.
    """
    keys = df[acct_col]
    grp = df.groupby(acct_col, observed=True)

    cnt = grp.size().rename(f'{prefix}_cnt').astype('float32')
    amt_agg = grp['txn_amt'].agg(['sum', 'mean', 'std', 'max', 'min', 'median']).rename(
        columns=lambda c: f"{prefix}_amt_{c}"
    ).astype('float32')

    unique_ctp = grp[peer_col].nunique().rename(f'{prefix}_unique_ctp').astype('float32')
    hour_mean = grp['hour'].mean().rename(f'{prefix}_hour_mean').astype('float32')
    hour_std = grp['hour'].std(ddof=0).fillna(0).rename(f'{prefix}_hour_std').astype('float32')
    night_cnt = grp['is_night'].sum().rename(f'{prefix}_night_cnt').astype('float32')
    self_cnt = grp['is_self_txn_f'].sum(min_count=1).fillna(0).rename(f'{prefix}_self_cnt').astype('float32')
    self_ratio = _safe_div(self_cnt, cnt).fillna(0).rename(f'{prefix}_self_ratio').astype('float32')
    esun_ratio = grp[peer_type_col].mean().rename(esun_ratio_name).astype('float32')

    # Currency mix
    curr_nunique = grp['currency_type_norm'].nunique().rename(f'{prefix}_curr_nunique').astype('float32')
    curr_twd_ratio = _group_share(df['currency_type_norm'].eq('TWD'), keys, f'{prefix}_curr_twd_ratio')

    # Daily activity (burstiness): transactions per active day, then spread across days.
    dated = df.loc[df['txn_date_dt'].notna(), [acct_col, 'txn_date_dt', 'txn_amt']]
    day_key = dated['txn_date_dt'].dt.normalize().rename('date')
    daily_cnt = dated.groupby([acct_col, day_key], observed=True)['txn_amt'].count()
    per_acct = daily_cnt.groupby(level=0)
    active_days = per_acct.size().rename(f'{prefix}_active_days').astype('float32')
    day_cnt_mean = per_acct.mean().rename(f'{prefix}_day_cnt_mean').astype('float32')
    day_cnt_std = per_acct.std(ddof=0).fillna(0).rename(f'{prefix}_day_cnt_std').astype('float32')
    burstiness = _safe_div(day_cnt_std, day_cnt_mean).fillna(0).rename(f'{prefix}_burstiness').astype('float32')

    # Recency: days between the account's last transaction and the latest date
    # in the data; 9999 marks accounts without any usable date.
    last_seen = grp['txn_date_dt'].max()
    if pd.isna(ref_date):
        gap = pd.Series(np.nan, index=last_seen.index)
    else:
        gap = (ref_date - last_seen).dt.days
    last_gap_days = gap.fillna(9999).rename(f'{prefix}_last_gap_days').astype('float32')

    # Channel mix (shares of the globally most frequent channels) and amount buckets.
    channel_props = [
        _group_share(df['channel_type_norm'].eq(ch), keys, f"{prefix}_ch_{ch}_prop")
        for ch in top_channels
    ]
    bucket_props = [
        _group_share(df['amt_bucket'].eq(b), keys, f"{prefix}_amt_{b}_prop")
        for b in ['S', 'SM', 'ML', 'L']
    ]

    parts = [
        cnt, amt_agg, unique_ctp,
        hour_mean, hour_std, night_cnt,
        self_cnt, self_ratio, esun_ratio,
        curr_nunique, curr_twd_ratio,
        active_days, day_cnt_mean, day_cnt_std, burstiness, last_gap_days
    ] + channel_props + bucket_props
    return pd.concat(parts, axis=1).rename_axis('acct').reset_index()


def _graph_features(df: pd.DataFrame) -> pd.DataFrame:
    """Per-account neighbour counts: total degree, bidirectional degree and their ratio."""
    out_map = df.groupby('from_acct')['to_acct'].apply(set).to_dict()
    in_map = df.groupby('to_acct')['from_acct'].apply(set).to_dict()
    no_peers = set()

    rows = []
    for acct in set(out_map) | set(in_map):
        outs = out_map.get(acct, no_peers)
        ins = in_map.get(acct, no_peers)
        deg = len(outs | ins)   # distinct counterparties in either direction
        bi = len(outs & ins)    # counterparties seen in both directions
        rows.append((acct, deg, bi, (bi / deg) if deg else 0.0))

    graph = pd.DataFrame(rows, columns=['acct', 'graph_degree', 'graph_bi_degree', 'graph_reciprocity'])
    return graph.astype({'graph_degree': 'float32', 'graph_bi_degree': 'float32', 'graph_reciprocity': 'float32'})


def _channel_entropy(df: pd.DataFrame, acct_col: str, name: str) -> pd.DataFrame:
    """Entropy (natural log) of each account's channel distribution, as an (acct, name) frame."""
    counts = df.groupby([acct_col, 'channel_type_norm'], observed=True).size()
    # transform() keeps the original index; groupby.apply would prepend the group
    # key a second time on pandas >= 2.0 and break the later reset_index().
    share = counts / counts.groupby(level=0).transform('sum')
    entropy = -(share * np.log(share)).groupby(level=0).sum()
    return entropy.rename(name).rename_axis('acct').reset_index()


def _account_type(df: pd.DataFrame, acct_col: str, type_col: str, name: str) -> pd.DataFrame:
    """One E.SUN flag per account (max over its rows), as an (acct, name) frame."""
    # Aggregating guarantees a single row per account; de-duplicating
    # (acct, type) pairs would emit several rows for accounts with mixed or
    # partly missing flags and duplicate feature rows after the merge.
    return df.groupby(acct_col)[type_col].max().rename(name).rename_axis('acct').reset_index()


def engineer_features(df_txn: pd.DataFrame) -> pd.DataFrame:
    """
    Build account-level features from row-level transactions.

    Includes: amount stats + buckets, channel mix and entropy, burstiness,
    daily activity, hour/night patterns, currency mix, and graph-like features.

    Args:
        df_txn: transaction rows. ``txn_amt``, ``from_acct`` and ``to_acct`` are
            required; ``txn_date``, ``txn_time``, ``from_acct_type``,
            ``to_acct_type``, ``is_self_txn``, ``channel_type`` and
            ``currency_type`` are used when present.

    Returns:
        One row per account (column ``acct``) seen as sender or receiver, with
        float32 features and missing values filled with 0.

    Raises:
        ValueError: if a required column is missing.
    """
    if 'txn_amt' not in df_txn.columns:
        raise ValueError("Missing txn_amt.")
    if not {'from_acct', 'to_acct'}.issubset(df_txn.columns):
        raise ValueError("from_acct / to_acct required.")

    df = df_txn.copy()

    # ---- Typing ----
    df['txn_amt'] = pd.to_numeric(df['txn_amt'], errors='coerce').astype('float32')
    df['txn_date_dt'] = _to_datetime(df['txn_date']) if 'txn_date' in df.columns else pd.NaT

    if 'txn_time' in df.columns:
        df['hour'] = _map_unique(df['txn_time'], _to_hour)
    else:
        df['hour'] = np.nan
    df['hour'] = df['hour'].astype('float32')
    # Night = 22:00-06:59; rows with an unknown hour count as "not night".
    df['is_night'] = ((df['hour'] >= 22) | (df['hour'] <= 6)).astype('float32')

    for col in ['from_acct_type', 'to_acct_type']:
        if col in df.columns:
            df[col] = _map_unique(df[col], _map_is_esun).astype('float32')
        else:
            df[col] = np.nan

    # binary self-transfer
    if 'is_self_txn' in df.columns:
        df['is_self_txn_f'] = _map_unique(df['is_self_txn'], _self_txn_flag)
    else:
        df['is_self_txn_f'] = np.nan
    df['is_self_txn_f'] = df['is_self_txn_f'].astype('float32')

    # Normalize categories
    df['channel_type_norm'] = _normalize_category(df['channel_type']) if 'channel_type' in df.columns else 'UNK'
    df['currency_type_norm'] = _normalize_category(df['currency_type']) if 'currency_type' in df.columns else 'UNK'

    # ---- Global amount buckets (q25/q50/q75) ----
    amt = df['txn_amt']
    known_amt = amt.dropna().values
    if known_amt.size > 0:
        q25, q50, q75 = np.quantile(known_amt, [0.25, 0.50, 0.75])
    else:
        q25 = q50 = q75 = 0.0
    # First matching condition wins: UNK, small, small-mid, mid-large, else large.
    df['amt_bucket'] = np.select(
        [amt.isna().to_numpy(), (amt <= q25).to_numpy(), (amt <= q50).to_numpy(), (amt <= q75).to_numpy()],
        ['UNK', 'S', 'SM', 'ML'],
        default='L',
    )

    top_channels = df['channel_type_norm'].value_counts().head(8).index.tolist()
    ref_date = df['txn_date_dt'].max()

    # ---- Sender / receiver aggregates ----
    left = _side_features(df, 'from_acct', 'to_acct', 'to_acct_type',
                          'send', 'send_to_esun_ratio', top_channels, ref_date)
    right = _side_features(df, 'to_acct', 'from_acct', 'from_acct_type',
                           'recv', 'recv_from_esun_ratio', top_channels, ref_date)

    # ---- Merge sender/receiver/graph ----
    feat = pd.merge(left, right, on='acct', how='outer')
    feat = feat.merge(_graph_features(df), on='acct', how='left')

    # Totals & normalized ratios
    feat['total_amt_sum'] = feat['send_amt_sum'].fillna(0) + feat['recv_amt_sum'].fillna(0)
    feat['net_out_amt']   = feat['send_amt_sum'].fillna(0) - feat['recv_amt_sum'].fillna(0)
    feat['total_cnt']     = feat['send_cnt'].fillna(0)     + feat['recv_cnt'].fillna(0)
    feat['send_avg_amt']  = _safe_div(feat['send_amt_sum'].fillna(0), feat['send_cnt'].replace(0, np.nan)).fillna(0)
    feat['recv_avg_amt']  = _safe_div(feat['recv_amt_sum'].fillna(0), feat['recv_cnt'].replace(0, np.nan)).fillna(0)
    feat['send_unique_rate'] = _safe_div(feat['send_unique_ctp'].fillna(0), feat['send_cnt'].replace(0, np.nan)).fillna(0)
    feat['recv_unique_rate'] = _safe_div(feat['recv_unique_ctp'].fillna(0), feat['recv_cnt'].replace(0, np.nan)).fillna(0)

    # Channel entropy (sender / receiver)
    feat = feat.merge(_channel_entropy(df, 'from_acct', 'send_ch_entropy'), on='acct', how='left')
    feat = feat.merge(_channel_entropy(df, 'to_acct', 'recv_ch_entropy'), on='acct', how='left')

    # Account type resolution (Esun flag)
    feat = feat.merge(_account_type(df, 'from_acct', 'from_acct_type', 'is_esun_from'), on='acct', how='left')
    feat = feat.merge(_account_type(df, 'to_acct', 'to_acct_type', 'is_esun_to'), on='acct', how='left')
    feat['is_esun'] = feat[['is_esun_from', 'is_esun_to']].max(axis=1).fillna(0)

    # Dtype downsizing
    for c in feat.columns:
        if c != 'acct' and pd.api.types.is_float_dtype(feat[c]):
            feat[c] = feat[c].astype('float32')

    feat = feat.fillna(0)
    print(f"[OK] Feature engineering completed. Accounts={len(feat)} Features={feat.shape[1]-1}")
    return feat


# ==========================
# Split & Labels
# ==========================
def make_splits(feat_df: pd.DataFrame, df_alert: pd.DataFrame, df_test: pd.DataFrame):
    """Label accounts and split the feature table into train and test parts.

    Accounts are matched on their string form everywhere (labels, train/test
    membership and the test merge), so differing ``acct`` dtypes between the
    three tables neither raise nor silently fail to match.

    Args:
        feat_df: account-level features with ``acct`` and ``is_esun`` columns.
        df_alert: alerted accounts (column ``acct``); these get label 1.
        df_test: accounts to predict (column ``acct``).

    Returns:
        ``(X, y, X_test)``. ``X`` holds the features of accounts that are not
        in the test list and have ``is_esun == 1``; ``y`` is the matching 0/1
        label array; ``X_test`` has exactly one row per row of ``df_test``, in
        the same order and with its original ``acct`` values; accounts without
        features are filled with 0.
    """
    feat = feat_df.copy()
    acct_key = feat['acct'].astype(str)

    alert_set = set(df_alert['acct'].astype(str))
    test_set = set(df_test['acct'].astype(str))

    feat['label'] = acct_key.isin(alert_set).astype(int)
    in_test = acct_key.isin(test_set)

    train_df = feat[(~in_test) & (feat['is_esun'] == 1)].copy()
    X = train_df.drop(columns=['label'])
    y = train_df['label'].values

    # Keep test columns aligned with train. The merge runs on a temporary string
    # key, and duplicate feature rows are collapsed so that a test account can
    # never expand into several rows.
    test_feat = feat.loc[in_test].drop(columns=['label', 'acct'])
    test_feat.insert(0, '_acct_key', acct_key[in_test])
    test_feat = test_feat.drop_duplicates(subset='_acct_key', keep='first')

    X_test = (
        df_test[['acct']]
        .assign(_acct_key=df_test['acct'].astype(str))
        .merge(test_feat, on='_acct_key', how='left')
        .drop(columns='_acct_key')
        .fillna(0)
    )

    print(f"[OK] Split -> Train={len(X)} (pos={sum(y)}, neg={len(y)-sum(y)}); Test={len(X_test)}")
    return X, y, X_test


# ==========================
# Modeling (LightGBM + OOF threshold)
# ==========================
def fit_lgbm_kfold(X: pd.DataFrame, y: np.ndarray, cfg: Dict):
    """Train one LightGBM classifier per stratified fold and collect OOF scores.

    Args:
        X: training features; ``acct`` and the ``is_esun*`` helper columns are
            dropped before fitting.
        y: 0/1 labels aligned with ``X``.
        cfg: settings dict (``cv_folds``, ``random_state``, ``n_estimators``,
            ``learning_rate``, ``num_leaves``, ``max_depth``,
            ``use_gpu_if_available`` and optionally ``early_stopping_rounds``).

    Returns:
        ``(models, safe_cols, rename_map, oof, thr_f1)``: the fitted models,
        the sanitized feature names in training order, the original->sanitized
        name map, the out-of-fold probabilities and the OOF F1-optimal
        threshold.
    """
    drop_cols = ['acct', 'is_esun', 'is_esun_from', 'is_esun_to']
    feat_cols = [c for c in X.columns if c not in drop_cols]

    # Sanitize names for LGB
    rename_map = {c: _sanitize_col(c) for c in feat_cols}
    Xs = X[feat_cols].rename(columns=rename_map).copy()
    safe_cols = list(Xs.columns)

    # class weight
    pos = y.sum()
    neg = len(y) - pos
    scale_pos_weight = max((neg / max(pos, 1)), 1.0)

    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': cfg["n_estimators"],
        'learning_rate': cfg["learning_rate"],
        'num_leaves': cfg["num_leaves"],
        'max_depth': cfg["max_depth"],
        'seed': cfg["random_state"],
        'n_jobs': -1,
        'colsample_bytree': 0.8,
        'subsample': 0.8,
        # Row subsampling is only applied when a bagging frequency is set.
        'subsample_freq': 1,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'verbose': -1,
        # `is_unbalance` is deliberately NOT set: LightGBM documents it as
        # mutually exclusive with `scale_pos_weight`.
        'scale_pos_weight': scale_pos_weight
    }

    # GPU is requested when configured; if this LightGBM build or machine cannot
    # provide it, training falls back to CPU instead of aborting.
    gpu_params = {}
    if cfg["use_gpu_if_available"]:
        gpu_params = {
            'device': 'gpu',
            'device_type': 'gpu',   # some versions use device_type
            'gpu_use_dp': False
        }

    # Prefer the caller's setting; fall back to the module default for configs
    # that do not carry the key.
    patience = cfg.get("early_stopping_rounds", CONFIG["early_stopping_rounds"])

    def _fit_fold(Xt, yt, Xv, yv, extra):
        model = lgb.LGBMClassifier(**params, **extra)
        model.fit(
            Xt, yt,
            eval_set=[(Xv, yv)],
            eval_metric='auc',
            callbacks=[lgb.early_stopping(patience, verbose=False)]
        )
        return model

    kf = StratifiedKFold(n_splits=cfg["cv_folds"], shuffle=True, random_state=cfg["random_state"])
    oof = np.zeros(len(Xs), dtype=np.float64)
    models: List[lgb.LGBMClassifier] = []

    print("--- LightGBM Stratified K-Fold ---")
    for fold, (tr_idx, va_idx) in enumerate(kf.split(Xs, y), 1):
        Xt, Xv = Xs.iloc[tr_idx], Xs.iloc[va_idx]
        yt, yv = y[tr_idx], y[va_idx]

        try:
            clf = _fit_fold(Xt, yt, Xv, yv, gpu_params)
        except lgb.basic.LightGBMError as err:
            if not gpu_params:
                raise
            print(f"[WARN] GPU training failed ({err}); falling back to CPU.")
            gpu_params = {}
            clf = _fit_fold(Xt, yt, Xv, yv, gpu_params)

        oof[va_idx] = clf.predict_proba(Xv)[:, 1]
        models.append(clf)
        print(f"Fold {fold}: AUC={roc_auc_score(yv, oof[va_idx]):.4f}")

    # OOF metrics
    ap = average_precision_score(y, oof)
    auc = roc_auc_score(y, oof)
    prec, rec, thr = precision_recall_curve(y, oof)
    f1s = 2 * prec * rec / (prec + rec + 1e-12)
    best_idx = np.nanargmax(f1s)
    f1_best = f1s[best_idx] if best_idx < len(f1s) else 0.0
    # prec/rec carry one more point than thr; guard the final (recall=0) point.
    thr_f1 = thr[best_idx] if best_idx < len(thr) else 0.5

    print(f"[OOF] AP={ap:.4f} AUC={auc:.4f} F1@best={f1_best:.4f} thr={thr_f1:.4f}")
    y_oof = (oof >= thr_f1).astype(int)
    print("[OOF] Classification report:\n", classification_report(y, y_oof, digits=4))
    return models, safe_cols, rename_map, oof, thr_f1


def choose_threshold(oof: np.ndarray, y: np.ndarray, cfg: Dict) -> float:
    """Pick the decision threshold according to ``cfg["threshold_mode"]``.

    Modes:
        * ``"f1"``   - threshold that maximises F1 on the OOF predictions.
        * ``"topk"`` - score of the ``cfg["topk"]``-th highest OOF prediction
          (minus a tiny epsilon so that this prediction itself is selected).
          ``topk`` is capped at the number of predictions.
        * ``"rate"`` - quantile of the OOF scores that flags roughly
          ``cfg["target_positive_rate"]`` of them (0.1 when unset); the rate is
          clipped to [0, 1].
        * anything else - 0.5.

    Returns 0.5 whenever no threshold can be derived (``topk <= 0`` or empty
    ``oof`` in the top-k / rate modes).
    """
    mode = cfg["threshold_mode"]

    if mode == "f1":
        prec, rec, thr = precision_recall_curve(y, oof)
        f1s = 2 * prec * rec / (prec + rec + 1e-12)
        best_idx = np.nanargmax(f1s)
        return float(thr[best_idx] if best_idx < len(thr) else 0.5)

    scores = np.asarray(oof, dtype=np.float64)

    if mode == "topk":
        # Cap K so np.partition never receives an out-of-range index.
        k = min(int(cfg["topk"]), scores.size)
        if k <= 0:
            return 0.5
        # threshold = score of the k-th ranked prediction (descending order)
        cut = np.partition(-scores, k - 1)[k - 1]
        return float(-cut - 1e-12)

    if mode == "rate":
        if scores.size == 0:
            return 0.5
        rate = float(cfg["target_positive_rate"] or 0.1)
        q = min(max(1.0 - rate, 0.0), 1.0)
        return float(np.quantile(scores, q))

    return 0.5


def predict_on_test(models: "List[lgb.LGBMClassifier]",
                    feat_cols_safe: List[str],
                    rename_map: Dict[str, str],
                    X_test: pd.DataFrame,
                    threshold: float) -> Tuple[np.ndarray, np.ndarray]:
    """Average the fold models' probabilities on the test set and threshold them.

    Args:
        models: fitted classifiers exposing ``predict_proba``.
        feat_cols_safe: sanitized feature names, in training order.
        rename_map: original -> sanitized column names used during training.
        X_test: test features; an ``acct`` column, if present, is ignored.
        threshold: probabilities >= this value are predicted as 1.

    Returns:
        ``(y_pred, proba)``: 0/1 predictions and the mean positive-class
        probability per test row.

    Raises:
        ValueError: if ``models`` is empty (the average would be undefined and
            would otherwise silently become NaN).
    """
    if not models:
        raise ValueError("predict_on_test requires at least one fitted model.")

    # Apply the same column-name sanitization as in training.
    Xt = X_test.drop(columns=['acct'], errors='ignore').rename(columns=rename_map)

    # Keep exactly the training columns, in training order; columns the test
    # set lacks are created and filled with 0.
    Xt = Xt.reindex(columns=feat_cols_safe, fill_value=0)

    proba = np.zeros(len(Xt), dtype=np.float64)
    for m in models:
        proba += m.predict_proba(Xt)[:, 1]
    proba /= len(models)

    y_pred = (proba >= threshold).astype(int)
    return y_pred, proba


# ==========================
# Save artifacts
# ==========================
def save_submission(path: str, df_test: pd.DataFrame, X_test: pd.DataFrame, y_pred: np.ndarray):
    """Write the submission CSV (columns ``acct``, ``label``) to ``path``.

    The file has one row per row of ``df_test``, in the same order. Accounts
    without a prediction get label 0, and labels are always written as
    integers.

    Args:
        path: output CSV path (written as UTF-8 with BOM).
        df_test: frame whose ``acct`` column defines the submission rows.
        X_test: frame whose ``acct`` column is aligned with ``y_pred``.
        y_pred: 0/1 predictions, one per row of ``X_test``.
    """
    out = pd.DataFrame({'acct': X_test['acct'].values, 'label': np.asarray(y_pred).astype(int)})
    # One prediction per account, so the left merge cannot multiply rows.
    out = out.drop_duplicates(subset='acct', keep='first')
    out = df_test[['acct']].merge(out, on='acct', how='left').fillna(0)
    # Unmatched accounts introduce NaN, which upcasts the column to float;
    # cast back so the file contains 0/1 rather than 0.0/1.0.
    out['label'] = out['label'].astype(int)
    out.to_csv(path, index=False, encoding='utf-8-sig')
    print(f"[OK] Saved submission => {path}")

def save_feature_importance(models: List[lgb.LGBMClassifier],
                            feat_cols_safe: List[str],
                            path: str,
                            topn: int = 200):
    """Write the ``topn`` features with the highest mean gain importance to ``path``.

    Gain importances are read from each model's booster and averaged over the
    models; the CSV has the columns ``feature`` and ``gain_importance``, sorted
    in descending order of importance.
    """
    gain_total = np.zeros(len(feat_cols_safe))
    for model in models:
        gain_total += model.booster_.feature_importance(importance_type='gain')
    # max(..., 1) keeps the division defined when no model was supplied.
    gain_total /= max(len(models), 1)

    ranking = pd.DataFrame({'feature': feat_cols_safe, 'gain_importance': gain_total})
    ranking = ranking.sort_values('gain_importance', ascending=False)
    ranking.head(topn).to_csv(path, index=False, encoding='utf-8-sig')
    print(f"[OK] Saved feature importance => {path}")


# ==========================
# Main
# ==========================
def main():
    """Run the whole pipeline: load, build features, train, threshold, predict, save.

    Writes ``enhanced_result.csv`` (submission) and ``feature_importance.csv``
    to the current working directory.
    """
    cfg = CONFIG.copy()

    # 1) Load
    df_txn, df_alert, df_test = load_csvs(cfg["data_dir"])

    # 2) Feature engineering
    feat_df = engineer_features(df_txn)

    # 3) Split
    X, y, X_test = make_splits(feat_df, df_alert, df_test)

    # 4) Train + OOF threshold
    models, feat_cols_safe, rename_map, oof, thr_f1 = fit_lgbm_kfold(X, y, cfg)

    # Report the device the models were configured with. This inspects the
    # estimator parameters only; it does not probe the hardware.
    def _configured_for_gpu(model) -> bool:
        model_params = model.get_params()
        return any(
            "gpu" in str(model_params.get(key, "")).lower()
            for key in ("device", "device_type")
        )

    if cfg["use_gpu_if_available"] and any(_configured_for_gpu(m) for m in models):
        device_used = "GPU"
    else:
        device_used = "CPU"
    print(f"[INFO] LightGBM is running on: {device_used}")

    # choose strategy (F1 / TopK / Rate)
    thr = choose_threshold(oof, y, cfg)
    print(f"[THR] Selected threshold ({cfg['threshold_mode']}): {thr:.6f}")

    # 5) Predict test
    y_pred, proba = predict_on_test(models, feat_cols_safe, rename_map, X_test, thr)

    # 6) Save
    save_submission("enhanced_result.csv", df_test, X_test, y_pred)
    save_feature_importance(
        models, feat_cols_safe, "feature_importance.csv",
        topn=cfg["save_feature_importance_top"],
    )


if __name__ == "__main__":
    main()





[OK] Loaded datasets.


KeyboardInterrupt: 