In [5]:
# -*- coding: utf-8 -*-
"""
TransactionAlertPro_Optimized.py
Optimized Solution for 2025 Esun AI Challenge (Binary Classification)
"""

import os
import math
import warnings
from typing import Tuple, List, Dict

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, precision_recall_curve, average_precision_score, roc_auc_score, classification_report
import lightgbm as lgb # 引入 LightGBM

warnings.filterwarnings("ignore")


# ======================================================================
# 1. Utilities (輔助函數 - 已修正 _safe_div)
# ======================================================================

def _to_numeric(s):
    """Best-effort numeric conversion; returns float (NaN if fail)."""
    try:
        return pd.to_numeric(s, errors="coerce")
    except Exception:
        return np.nan

def _to_datetime(s):
    """Best-effort datetime parsing."""
    try:
        return pd.to_datetime(s, errors="coerce", infer_datetime_format=True)
    except Exception:
        return pd.NaT

def _safe_div(a, b):
    """
    Safe division that works for scalars, numpy arrays, and pandas Series/DataFrames.
    Returns 0.0 where b is NaN or 0.
    (FIXED: Ensure Pandas Series output when input is Pandas Series)
    """
    if isinstance(a, (pd.Series, pd.DataFrame)) or isinstance(b, (pd.Series, pd.DataFrame)):
        mask = pd.notna(b) & (b != 0)
        
        result_array = np.where(mask, a / b, 0.0)

        if isinstance(a, pd.Series):
            return pd.Series(result_array, index=a.index)
        elif isinstance(b, pd.Series):
            return pd.Series(result_array, index=b.index)
        else:
            return result_array
    else:
        return (a / b) if (b is not None and b != 0) else 0.0

def _map_is_esun(v):
    """Map acct_type to 1 (esun) / 0 (others)."""
    if pd.isna(v):
        return np.nan
    s = str(v).strip()
    return 1 if s in {"1", "01", "esun", "ESUN", "玉山"} else 0

def _ensure_int(x):
    """Convert value to integer, return NaN on failure."""
    try:
        return int(x)
    except Exception:
        return np.nan

def _to_hour(v):
    """Best-effort extraction of hour (0-23) from time string."""
    if pd.isna(v):
        return np.nan
    s = str(v).strip()
    s2 = ''.join(ch for ch in s if (ch.isdigit() or ch == ':'))
    if ':' in s2:
        try:
            hh = int(s2.split(':')[0])
            return hh if 0 <= hh <= 23 else np.nan
        except Exception:
            return np.nan
    try:
        s2 = s2.zfill(6)
        hh = int(s2[:2])
        return hh if 0 <= hh <= 23 else np.nan
    except Exception:
        return np.nan

# ======================================================================
# 2. Data Loading
# ======================================================================
def load_csvs(dir_path: str):
    df_txn = pd.read_csv(os.path.join(dir_path, 'acct_transaction.csv'))
    df_alert = pd.read_csv(os.path.join(dir_path, 'acct_alert.csv'))
    df_test = pd.read_csv(os.path.join(dir_path, 'acct_predict.csv'))
    print("[OK] Loaded datasets.")
    return df_txn, df_alert, df_test


# ======================================================================
# 3. Feature Engineering (Enhanced)
# ======================================================================
def engineer_features_enhanced(df_txn: pd.DataFrame, df_alert: pd.DataFrame) -> pd.DataFrame:
    """
    Build robust account-level features using both sender and receiver roles.
    [ENHANCED]: Log amounts, time-of-day bins, and risk propagation features.
    """
    df = df_txn.copy()
    
    # --- Robust typing & Pre-processing ---
    if 'txn_amt' in df.columns:
        df['txn_amt'] = df['txn_amt'].apply(_to_numeric)
        df['txn_amt_log'] = np.log1p(df['txn_amt'])
    else:
        raise ValueError("Column 'txn_amt' not found.")

    if 'txn_date' in df.columns:
        df['txn_date_int'] = df['txn_date'].apply(_ensure_int) 
    else:
        df['txn_date_int'] = np.nan

    if 'txn_time' in df.columns:
        df['hour'] = df['txn_time'].apply(_to_hour)
    else:
        df['hour'] = np.nan

    for col in ['from_acct_type', 'to_acct_type']:
        if col in df.columns:
            df[col] = df[col].apply(_map_is_esun)
        else:
            df[col] = np.nan

    if 'is_self_txn' in df.columns:
        df['is_self_txn_f'] = df['is_self_txn'].map(lambda x: 1 if str(x).strip().upper() == 'Y' else (0 if str(x).strip().upper() == 'N' else np.nan))
    else:
        df['is_self_txn_f'] = np.nan

    if 'channel_type' in df.columns:
        df['channel_type_norm'] = df['channel_type'].astype(str).str.strip().fillna('UNK')
    else:
        df['channel_type_norm'] = 'UNK'

    if 'currency_type' in df.columns:
        df['currency_type_norm'] = df['currency_type'].astype(str).str.strip().fillna('UNK')
    else:
        df['currency_type_norm'] = 'UNK'

    if not set(['from_acct', 'to_acct']).issubset(df.columns):
        raise ValueError("from_acct and to_acct must exist.")

    # --- NEW: Time-of-Day Bins ---
    def _time_bin(h):
        if pd.isna(h): return 'UNK'
        if 6 <= h < 12: return 'MORN'
        elif 12 <= h < 18: return 'AFTN'
        elif 18 <= h < 22: return 'EVNG'
        else: return 'NGHT'
        
    df['time_bin'] = df['hour'].apply(_time_bin)

    # --- NEW: Risk Propagation Feature ---
    alert_accts = set(df_alert['acct'].astype(str))
    df['to_is_alert'] = df['to_acct'].astype(str).isin(alert_accts).astype(int)
    df['from_is_alert'] = df['from_acct'].astype(str).isin(alert_accts).astype(int)
    
    
    # --- per-account aggregates (sender side) ---
    g_from = df.groupby('from_acct')
    send_cnt = g_from.size().rename('send_cnt')
    
    # Amount Stats (Enhanced: Log, TWD/USD Sums)
    send_amt_agg = g_from['txn_amt'].agg(['sum', 'mean', 'std', 'max', 'min', 'median']).add_prefix('send_amt_')
    send_amt_log_agg = g_from['txn_amt_log'].agg(['mean', 'std']).add_prefix('send_amt_log_')
    send_twd_sum = g_from.apply(lambda x: x[x['currency_type_norm'] == 'TWD']['txn_amt'].sum()).rename('send_twd_sum')
    send_usd_sum = g_from.apply(lambda x: x[x['currency_type_norm'] == 'USD']['txn_amt'].sum()).rename('send_usd_sum')
    
    # Activity & Frequency
    send_active_days = g_from['txn_date_int'].nunique().rename('send_active_days')
    send_span = (g_from['txn_date_int'].max() - g_from['txn_date_int'].min()).replace(0, 1).rename('send_span')
    # ERROR LINE FIXED HERE: _safe_div now returns Series
    send_freq_per_day = _safe_div(send_cnt, send_span).rename('send_freq_per_day') 
    
    # Behavior & Category Stats
    send_unique_ctp = g_from['to_acct'].nunique().rename('send_unique_ctp')
    send_hour_agg = g_from['hour'].agg(['mean', 'std']).add_prefix('send_hour_')
    send_self_agg = g_from['is_self_txn_f'].agg(['sum', 'mean']).rename(index={'sum': 'send_self_cnt', 'mean': 'send_self_ratio'})
    send_to_esun_ratio = g_from['to_acct_type'].mean().rename('send_to_esun_ratio')
    
    # NEW: Risk Propagation Ratio
    send_to_alert_ratio = _safe_div(g_from['to_is_alert'].sum(), send_cnt).rename('send_to_alert_ratio')

    # Channel One-Hot Proportions
    top_channels = df['channel_type_norm'].value_counts().head(8).index.tolist()
    send_channel_props = [
        g_from.apply(lambda s, ch=ch: (s['channel_type_norm']==ch).mean()).rename(f'send_ch_{ch}_prop')
        for ch in top_channels
    ]
    
    # Merge sender features
    left = (
        pd.concat([
            send_cnt, send_amt_agg, send_amt_log_agg, send_twd_sum, send_usd_sum,
            send_active_days, send_span, send_freq_per_day, send_unique_ctp,
            send_hour_agg, send_self_agg, send_to_esun_ratio, send_to_alert_ratio,
            g_from['currency_type_norm'].nunique().rename('send_curr_nunique'), 
            g_from.apply(lambda s: (s['currency_type_norm']=='TWD').mean()).rename('send_curr_twd_ratio'),
        ] + send_channel_props, axis=1)
        .reset_index().rename(columns={'from_acct': 'acct'})
    )

    # --- per-account aggregates (receiver side) ---
    g_to = df.groupby('to_acct')
    recv_cnt = g_to.size().rename('recv_cnt')
    
    # Amount Stats (Enhanced: Log, TWD/USD Sums)
    recv_amt_agg = g_to['txn_amt'].agg(['sum', 'mean', 'std', 'max', 'min', 'median']).add_prefix('recv_amt_')
    recv_amt_log_agg = g_to['txn_amt_log'].agg(['mean', 'std']).add_prefix('recv_amt_log_')
    recv_twd_sum = g_to.apply(lambda x: x[x['currency_type_norm'] == 'TWD']['txn_amt'].sum()).rename('recv_twd_sum')
    recv_usd_sum = g_to.apply(lambda x: x[x['currency_type_norm'] == 'USD']['txn_amt'].sum()).rename('recv_usd_sum')
    
    # Activity & Frequency
    recv_active_days = g_to['txn_date_int'].nunique().rename('recv_active_days')
    recv_span = (g_to['txn_date_int'].max() - g_to['txn_date_int'].min()).replace(0, 1).rename('recv_span')
    # ERROR LINE FIXED HERE: _safe_div now returns Series
    recv_freq_per_day = _safe_div(recv_cnt, recv_span).rename('recv_freq_per_day')

    # Behavior & Category Stats
    recv_unique_ctp = g_to['from_acct'].nunique().rename('recv_unique_ctp')
    recv_hour_agg = g_to['hour'].agg(['mean', 'std']).add_prefix('recv_hour_')
    recv_self_agg = g_to['is_self_txn_f'].agg(['sum', 'mean']).rename(index={'sum': 'recv_self_cnt', 'mean': 'recv_self_ratio'})
    recv_from_esun_ratio = g_to['from_acct_type'].mean().rename('recv_from_esun_ratio')
    
    # NEW: Risk Propagation Ratio
    recv_from_alert_ratio = _safe_div(g_to['from_is_alert'].sum(), recv_cnt).rename('recv_from_alert_ratio')

    # Channel One-Hot Proportions
    recv_channel_props = [
        g_to.apply(lambda s, ch=ch: (s['channel_type_norm']==ch).mean()).rename(f'recv_ch_{ch}_prop')
        for ch in top_channels
    ]

    # Merge receiver features
    right = (
        pd.concat([
            recv_cnt, recv_amt_agg, recv_amt_log_agg, recv_twd_sum, recv_usd_sum,
            recv_active_days, recv_span, recv_freq_per_day, recv_unique_ctp,
            recv_hour_agg, recv_self_agg, recv_from_esun_ratio, recv_from_alert_ratio,
            g_to['currency_type_norm'].nunique().rename('recv_curr_nunique'), 
            g_to.apply(lambda s: (s['currency_type_norm']=='TWD').mean()).rename('recv_curr_twd_ratio'),
        ] + recv_channel_props, axis=1)
        .reset_index().rename(columns={'to_acct': 'acct'})
    )


    # --- graph-like reciprocity ---
    out_sets = df.groupby('from_acct')['to_acct'].apply(set)
    in_sets = df.groupby('to_acct')['from_acct'].apply(set)
    all_accts = set(out_sets.index).union(set(in_sets.index))

    reci_ratio = {}
    total_degree = {}
    bi_degree = {}

    for a in all_accts:
        outs = out_sets.get(a, set())
        ins = in_sets.get(a, set())
        deg = len(outs.union(ins))
        bi = len(outs.intersection(ins))
        total_degree[a] = deg
        bi_degree[a] = bi
        reci_ratio[a] = _safe_div(bi, deg)

    df_graph = pd.DataFrame({
        'acct': list(all_accts),
        'graph_degree': [total_degree[a] for a in all_accts],
        'graph_bi_degree': [bi_degree[a] for a in all_accts],
        'graph_reciprocity': [reci_ratio[a] for a in all_accts],
    })

    # --- merge all features to account level ---
    feat = pd.merge(left, right, on='acct', how='outer')
    feat = feat.merge(df_graph, on='acct', how='left')

    # Add simple totals / balances
    feat['total_amt_sum'] = feat['send_amt_sum'].fillna(0) + feat['recv_amt_sum'].fillna(0)
    feat['net_out_amt'] = feat['send_amt_sum'].fillna(0) - feat['recv_amt_sum'].fillna(0)
    feat['total_cnt'] = feat['send_cnt'].fillna(0) + feat['recv_cnt'].fillna(0)

    # Normalize some ratios
    feat['send_avg_amt'] = _safe_div(feat['send_amt_sum'].fillna(0), feat['send_cnt'].replace(0, np.nan))
    feat['recv_avg_amt'] = _safe_div(feat['recv_amt_sum'].fillna(0), feat['recv_cnt'].replace(0, np.nan))
    feat['send_unique_rate'] = _safe_div(feat['send_unique_ctp'].fillna(0), feat['send_cnt'].replace(0, np.nan))
    feat['recv_unique_rate'] = _safe_div(feat['recv_unique_ctp'].fillna(0), feat['recv_cnt'].replace(0, np.nan))

    # Determine account type (is_esun)
    df_from_type = df[['from_acct', 'from_acct_type']].drop_duplicates().rename(columns={'from_acct': 'acct', 'from_acct_type': 'is_esun_from'})
    df_to_type = df[['to_acct', 'to_acct_type']].drop_duplicates().rename(columns={'to_acct': 'acct', 'to_acct_type': 'is_esun_to'})
    feat = feat.merge(df_from_type, on='acct', how='left').merge(df_to_type, on='acct', how='left')
    feat['is_esun'] = feat[['is_esun_from', 'is_esun_to']].max(axis=1).fillna(0)

    # Fill NaNs with 0 for model; keep acct id
    feat = feat.fillna(0)
    print(f"[OK] Feature engineering completed. Accounts: {len(feat)}; Features: {feat.shape[1]-1}")
    return feat


# ======================================================================
# 4. Train / Test Split
# ======================================================================
def make_splits(feat_df: pd.DataFrame, df_alert: pd.DataFrame, df_test: pd.DataFrame):
    """
    Build training labels and test set.
    """
    feat = feat_df.copy()

    # Labels
    alert_set = set(df_alert['acct'].astype(str))
    feat['label'] = feat['acct'].astype(str).isin(alert_set).astype(int)

    # Test mask
    test_set = set(df_test['acct'].astype(str))

    # Keep only esun in train & exclude test set
    train_df = feat[(~feat['acct'].astype(str).isin(test_set)) & (feat['is_esun'] == 1)].copy()
    X = train_df.drop(columns=['label'])
    y = train_df['label'].values

    # Test data: exactly test acct list joined with features (missing -> 0)
    test_feat = feat[feat['acct'].astype(str).isin(test_set)].copy()
    X_test = df_test[['acct']].merge(test_feat.drop(columns=['label', 'is_esun_from', 'is_esun_to'], errors='ignore'), on='acct', how='left').fillna(0)
    X_test = X_test.merge(feat[['acct', 'is_esun']], on='acct', how='left').fillna(0)

    print(f"[OK] Split -> Train accounts: {len(X)} (pos={sum(y)}, neg={len(y)-sum(y)}); Test accounts: {len(X_test)}")
    return X, y, X_test


# ======================================================================
# 5. Modeling (LightGBM + K-Fold)
# ======================================================================

def fit_model_kfold(X: pd.DataFrame, y: np.ndarray, random_state: int = 42, n_splits: int = 5):
    """
    Train LightGBM using Stratified K-Fold and tune decision threshold on OOF predictions.
    """
    drop_cols = ['acct']
    feat_cols = [c for c in X.columns if c not in drop_cols]
    
    # LightGBM requires feature names without certain characters (<, >, :, =, [, ])
    X_features = X[feat_cols].copy() 
    
    safe_feat_cols = [c.replace('[', '_').replace(']', '_').replace('<', '_').replace('>', '_').replace(':', '_').replace('=', '_') for c in feat_cols]
    col_name_map = dict(zip(feat_cols, safe_feat_cols))
    
    # 修正: 在 DataFrame 上使用 rename
    X_safe = X_features.rename(columns=col_name_map)
    
    lgb_params = {
        'objective': 'binary',
        'metric': 'auc', 
        'boosting_type': 'gbdt',
        'n_estimators': 1000,
        'learning_rate': 0.03,
        'num_leaves': 31,
        'max_depth': 6,
        'seed': random_state,
        'n_jobs': -1,
        'colsample_bytree': 0.7,
        'subsample': 0.7,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'is_unbalance': True,
        'verbose': -1
    }

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof_preds = np.zeros(len(X_safe))
    trained_models = []

    print("--- Start LightGBM Stratified K-Fold Training ---")

    for fold, (train_index, val_index) in enumerate(kf.split(X_safe, y)):
        X_train, X_val = X_safe.iloc[train_index], X_safe.iloc[val_index]
        y_train, y_val = y[train_index], y[val_index]

        lgb_clf = lgb.LGBMClassifier(**lgb_params)
        
        lgb_clf.fit(X_train, y_train,
                    eval_set=[(X_val, y_val)],
                    eval_metric='auc',
                    callbacks=[lgb.early_stopping(100, verbose=False)])

        oof_preds[val_index] = lgb_clf.predict_proba(X_val)[:, 1]
        trained_models.append(lgb_clf)
        print(f"Fold {fold+1} finished. AUC: {roc_auc_score(y_val, oof_preds[val_index]):.4f}")

    # Threshold tuning on OOF predictions
    prec, rec, thr = precision_recall_curve(y, oof_preds)
    f1s = 2 * prec * rec / (prec + rec + 1e-12) 
    
    best_idx = np.nanargmax(f1s)
    best_thr = thr[best_idx] if best_idx < len(thr) else 0.5
    
    y_oof_pred = (oof_preds >= best_thr).astype(int)
    f1_oof = f1_score(y, y_oof_pred)

    print(f"\n[OOF] Total AUC={roc_auc_score(y, oof_preds):.4f} 🎯 F1@best={f1_oof:.4f} 🧪 Threshold={best_thr:.4f}")
    print("[OOF] Classification report:\n", classification_report(y, y_oof_pred, digits=4))
    
    return trained_models, safe_feat_cols, best_thr


def predict_test_kfold(trained_models: List, feat_cols: List[str], threshold: float, X_test: pd.DataFrame):
    """
    Predict test data using K-Fold models (Averaging probabilities).
    """
    X_test_safe = X_test.copy()
    
    original_feat_cols = [c.replace('_', '[', 1).replace('_', ']', 1).replace('_', ':') for c in feat_cols]
    col_name_map = dict(zip(original_feat_cols, feat_cols))
    
    cols_to_rename = {k: v for k, v in col_name_map.items() if k in X_test_safe.columns}
    X_test_safe = X_test_safe.rename(columns=cols_to_rename)


    test_preds = np.zeros(len(X_test_safe))
    for model in trained_models:
        test_preds += model.predict_proba(X_test_safe[feat_cols])[:, 1]
    
    avg_proba = test_preds / len(trained_models)
    y_pred = (avg_proba >= threshold).astype(int)
    return y_pred, avg_proba


# ======================================================================
# 6. Output
# ======================================================================
def save_submission(path: str, df_test: pd.DataFrame, X_test: pd.DataFrame, y_pred: np.ndarray):
    """Saves the final prediction result in the required format."""
    df_pred = pd.DataFrame({
        'acct': X_test['acct'].values,
        'label': y_pred.astype(int)
    })
    out = df_test[['acct']].merge(df_pred, on='acct', how='left').fillna(0)
    out.to_csv(path, index=False, encoding='utf-8-sig')
    print(f"[OK] Saved submission to: {path}")


# ======================================================================
# 7. Main
# ======================================================================
def main():
    # === change this to your data directory ===
    dir_path = "data" 

    try:
        # 1. 資料載入
        df_txn, df_alert, df_test = load_csvs(dir_path)
        
        # 2. 增強特徵工程
        feat_df = engineer_features_enhanced(df_txn, df_alert)
        
        # 3. 數據切分
        X, y, X_test = make_splits(feat_df, df_alert, df_test)
        
        # 4. LightGBM K-Fold 訓練與門檻調優
        # 🚨 警告: 此步驟需要 lightgbm 函式庫
        trained_models, feat_cols, thr = fit_model_kfold(X, y, random_state=42, n_splits=5)
        
        # 5. K-Fold 預測
        y_pred, _ = predict_test_kfold(trained_models, feat_cols, thr, X_test)
        
        # 6. 儲存結果
        save_submission("enhanced_result.csv", df_test, X_test, y_pred)
        
    except FileNotFoundError as e:
        print(f"[ERROR] 檔案找不到：{e}. 請確保您的 CSV 檔案存放在 '{dir_path}' 資料夾中，且檔名正確。")
    except ImportError:
        print("\n🚨 ERROR: 找不到 lightgbm 函式庫。請執行 pip install lightgbm")
    except Exception as e:
        print(f"[ERROR] 執行過程中發生錯誤: {e}")


if __name__ == "__main__":
    main()

[OK] Loaded datasets.
[OK] Feature engineering completed. Accounts: 1800106; Features: 75
[OK] Split -> Train accounts: 328988 (pos=1004, neg=327984); Test accounts: 4780
--- Start LightGBM Stratified K-Fold Training ---
Fold 1 finished. AUC: 0.9649
Fold 2 finished. AUC: 0.9715
Fold 3 finished. AUC: 0.9465
Fold 4 finished. AUC: 0.9735
Fold 5 finished. AUC: 0.9572

[OOF] Total AUC=0.9600 🎯 F1@best=0.5107 🧪 Threshold=0.9891
[OOF] Classification report:
               precision    recall  f1-score   support

           0     0.9982    0.9992    0.9987    327984
           1     0.6346    0.4273    0.5107      1004

    accuracy                         0.9975    328988
   macro avg     0.8164    0.7133    0.7547    328988
weighted avg     0.9971    0.9975    0.9973    328988

[ERROR] 執行過程中發生錯誤: "['is_esun_from', 'is_esun_to', 'is_esun'] not in index"


In [6]:
# -*- coding: utf-8 -*-
"""
TransactionAlertPro_Optimized_Fixed.py
Optimized Solution for 2025 Esun AI Challenge (Binary Classification)
- Fixes the KeyError by ensuring train/test feature parity
- Cleans LightGBM feature-name sanitization (consistent rename map)
"""

import os
import warnings
from typing import List, Dict

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, precision_recall_curve, roc_auc_score, classification_report
import lightgbm as lgb  # pip install lightgbm

warnings.filterwarnings("ignore")


# ======================================================================
# 1. Utilities
# ======================================================================

def _to_numeric(s):
    try:
        return pd.to_numeric(s, errors="coerce")
    except Exception:
        return np.nan

def _ensure_int(x):
    try:
        return int(x)
    except Exception:
        return np.nan

def _to_hour(v):
    if pd.isna(v):
        return np.nan
    s = str(v).strip()
    s2 = ''.join(ch for ch in s if (ch.isdigit() or ch == ':'))
    if ':' in s2:
        try:
            hh = int(s2.split(':')[0])
            return hh if 0 <= hh <= 23 else np.nan
        except Exception:
            return np.nan
    try:
        s2 = s2.zfill(6)
        hh = int(s2[:2])
        return hh if 0 <= hh <= 23 else np.nan
    except Exception:
        return np.nan

def _safe_div(a, b):
    """Safe division: works for scalars/Series/DataFrames; returns 0.0 where b is NaN or 0."""
    if isinstance(a, (pd.Series, pd.DataFrame)) or isinstance(b, (pd.Series, pd.DataFrame)):
        mask = pd.notna(b) & (b != 0)
        result_array = np.where(mask, a / b, 0.0)
        if isinstance(a, pd.Series):
            return pd.Series(result_array, index=a.index)
        elif isinstance(b, pd.Series):
            return pd.Series(result_array, index=b.index)
        else:
            return result_array
    else:
        return (a / b) if (b is not None and b != 0) else 0.0

def _map_is_esun(v):
    if pd.isna(v):
        return np.nan
    s = str(v).strip()
    return 1 if s in {"1", "01", "esun", "ESUN", "玉山"} else 0


# ======================================================================
# 2. Data Loading
# ======================================================================
def load_csvs(dir_path: str):
    df_txn = pd.read_csv(os.path.join(dir_path, 'acct_transaction.csv'))
    df_alert = pd.read_csv(os.path.join(dir_path, 'acct_alert.csv'))
    df_test = pd.read_csv(os.path.join(dir_path, 'acct_predict.csv'))
    print("[OK] Loaded datasets.")
    return df_txn, df_alert, df_test


# ======================================================================
# 3. Feature Engineering (Enhanced)
# ======================================================================
def engineer_features_enhanced(df_txn: pd.DataFrame, df_alert: pd.DataFrame) -> pd.DataFrame:
    """
    Build robust account-level features using both sender and receiver roles.
    Includes: log-amounts, time-of-day bins, simple risk-propagation, graph features.
    """
    df = df_txn.copy()

    # --- Robust typing & Pre-processing ---
    if 'txn_amt' not in df.columns:
        raise ValueError("Column 'txn_amt' not found.")
    df['txn_amt'] = df['txn_amt'].apply(_to_numeric)
    df['txn_amt_log'] = np.log1p(df['txn_amt'])

    df['txn_date_int'] = df['txn_date'].apply(_ensure_int) if 'txn_date' in df.columns else np.nan
    df['hour'] = df['txn_time'].apply(_to_hour) if 'txn_time' in df.columns else np.nan

    for col in ['from_acct_type', 'to_acct_type']:
        df[col] = df[col].apply(_map_is_esun) if col in df.columns else np.nan

    if 'is_self_txn' in df.columns:
        df['is_self_txn_f'] = df['is_self_txn'].map(
            lambda x: 1 if str(x).strip().upper() == 'Y' else (0 if str(x).strip().upper() == 'N' else np.nan)
        )
    else:
        df['is_self_txn_f'] = np.nan

    df['channel_type_norm'] = df['channel_type'].astype(str).str.strip().fillna('UNK') if 'channel_type' in df.columns else 'UNK'
    df['currency_type_norm'] = df['currency_type'].astype(str).str.strip().fillna('UNK') if 'currency_type' in df.columns else 'UNK'

    if not set(['from_acct', 'to_acct']).issubset(df.columns):
        raise ValueError("from_acct and to_acct must exist.")

    # --- Time-of-Day Bins ---
    def _time_bin(h):
        if pd.isna(h): return 'UNK'
        if 6 <= h < 12: return 'MORN'
        elif 12 <= h < 18: return 'AFTN'
        elif 18 <= h < 22: return 'EVNG'
        else: return 'NGHT'
    df['time_bin'] = df['hour'].apply(_time_bin)

    # --- Risk Propagation Feature ---
    alert_accts = set(df_alert['acct'].astype(str))
    df['to_is_alert'] = df['to_acct'].astype(str).isin(alert_accts).astype(int)
    df['from_is_alert'] = df['from_acct'].astype(str).isin(alert_accts).astype(int)

    # Sender side aggregates
    g_from = df.groupby('from_acct')
    send_cnt = g_from.size().rename('send_cnt')

    send_amt_agg = g_from['txn_amt'].agg(['sum', 'mean', 'std', 'max', 'min', 'median']).add_prefix('send_amt_')
    send_amt_log_agg = g_from['txn_amt_log'].agg(['mean', 'std']).add_prefix('send_amt_log_')
    send_twd_sum = g_from.apply(lambda x: x[x['currency_type_norm'] == 'TWD']['txn_amt'].sum()).rename('send_twd_sum')
    send_usd_sum = g_from.apply(lambda x: x[x['currency_type_norm'] == 'USD']['txn_amt'].sum()).rename('send_usd_sum')

    send_active_days = g_from['txn_date_int'].nunique().rename('send_active_days')
    send_span = (g_from['txn_date_int'].max() - g_from['txn_date_int'].min()).replace(0, 1).rename('send_span')
    send_freq_per_day = _safe_div(send_cnt, send_span).rename('send_freq_per_day')

    send_unique_ctp = g_from['to_acct'].nunique().rename('send_unique_ctp')
    send_hour_agg = g_from['hour'].agg(['mean', 'std']).add_prefix('send_hour_')
    send_self_agg = g_from['is_self_txn_f'].agg(['sum', 'mean']).rename(index={'sum': 'send_self_cnt', 'mean': 'send_self_ratio'})
    send_to_esun_ratio = g_from['to_acct_type'].mean().rename('send_to_esun_ratio')
    send_to_alert_ratio = _safe_div(g_from['to_is_alert'].sum(), send_cnt).rename('send_to_alert_ratio')

    top_channels = df['channel_type_norm'].value_counts().head(8).index.tolist()
    send_channel_props = [
        g_from.apply(lambda s, ch=ch: (s['channel_type_norm'] == ch).mean()).rename(f'send_ch_{ch}_prop')
        for ch in top_channels
    ]

    left = (
        pd.concat([
            send_cnt, send_amt_agg, send_amt_log_agg, send_twd_sum, send_usd_sum,
            send_active_days, send_span, send_freq_per_day, send_unique_ctp,
            send_hour_agg, send_self_agg, send_to_esun_ratio, send_to_alert_ratio,
            g_from['currency_type_norm'].nunique().rename('send_curr_nunique'),
            g_from.apply(lambda s: (s['currency_type_norm'] == 'TWD').mean()).rename('send_curr_twd_ratio'),
        ] + send_channel_props, axis=1)
        .reset_index().rename(columns={'from_acct': 'acct'})
    )

    # Receiver side aggregates
    g_to = df.groupby('to_acct')
    recv_cnt = g_to.size().rename('recv_cnt')

    recv_amt_agg = g_to['txn_amt'].agg(['sum', 'mean', 'std', 'max', 'min', 'median']).add_prefix('recv_amt_')
    recv_amt_log_agg = g_to['txn_amt_log'].agg(['mean', 'std']).add_prefix('recv_amt_log_')
    recv_twd_sum = g_to.apply(lambda x: x[x['currency_type_norm'] == 'TWD']['txn_amt'].sum()).rename('recv_twd_sum')
    recv_usd_sum = g_to.apply(lambda x: x[x['currency_type_norm'] == 'USD']['txn_amt'].sum()).rename('recv_usd_sum')

    recv_active_days = g_to['txn_date_int'].nunique().rename('recv_active_days')
    recv_span = (g_to['txn_date_int'].max() - g_to['txn_date_int'].min()).replace(0, 1).rename('recv_span')
    recv_freq_per_day = _safe_div(recv_cnt, recv_span).rename('recv_freq_per_day')

    recv_unique_ctp = g_to['from_acct'].nunique().rename('recv_unique_ctp')
    recv_hour_agg = g_to['hour'].agg(['mean', 'std']).add_prefix('recv_hour_')
    recv_self_agg = g_to['is_self_txn_f'].agg(['sum', 'mean']).rename(index={'sum': 'recv_self_cnt', 'mean': 'recv_self_ratio'})
    recv_from_esun_ratio = g_to['from_acct_type'].mean().rename('recv_from_esun_ratio')
    recv_from_alert_ratio = _safe_div(g_to['from_is_alert'].sum(), recv_cnt).rename('recv_from_alert_ratio')

    recv_channel_props = [
        g_to.apply(lambda s, ch=ch: (s['channel_type_norm'] == ch).mean()).rename(f'recv_ch_{ch}_prop')
        for ch in top_channels
    ]

    right = (
        pd.concat([
            recv_cnt, recv_amt_agg, recv_amt_log_agg, recv_twd_sum, recv_usd_sum,
            recv_active_days, recv_span, recv_freq_per_day, recv_unique_ctp,
            recv_hour_agg, recv_self_agg, recv_from_esun_ratio, recv_from_alert_ratio,
            g_to['currency_type_norm'].nunique().rename('recv_curr_nunique'),
            g_to.apply(lambda s: (s['currency_type_norm'] == 'TWD').mean()).rename('recv_curr_twd_ratio'),
        ] + recv_channel_props, axis=1)
        .reset_index().rename(columns={'to_acct': 'acct'})
    )

    # Graph-like features
    out_sets = df.groupby('from_acct')['to_acct'].apply(set)
    in_sets = df.groupby('to_acct')['from_acct'].apply(set)
    all_accts = set(out_sets.index).union(set(in_sets.index))

    reci_ratio = {}
    total_degree = {}
    bi_degree = {}
    for a in all_accts:
        outs = out_sets.get(a, set())
        ins = in_sets.get(a, set())
        deg = len(outs.union(ins))
        bi = len(outs.intersection(ins))
        total_degree[a] = deg
        bi_degree[a] = bi
        reci_ratio[a] = _safe_div(bi, deg)

    df_graph = pd.DataFrame({
        'acct': list(all_accts),
        'graph_degree': [total_degree[a] for a in all_accts],
        'graph_bi_degree': [bi_degree[a] for a in all_accts],
        'graph_reciprocity': [reci_ratio[a] for a in all_accts],
    })

    # Merge all
    feat = pd.merge(left, right, on='acct', how='outer')
    feat = feat.merge(df_graph, on='acct', how='left')

    # Totals
    feat['total_amt_sum'] = feat['send_amt_sum'].fillna(0) + feat['recv_amt_sum'].fillna(0)
    feat['net_out_amt'] = feat['send_amt_sum'].fillna(0) - feat['recv_amt_sum'].fillna(0)
    feat['total_cnt'] = feat['send_cnt'].fillna(0) + feat['recv_cnt'].fillna(0)

    # Ratios
    feat['send_avg_amt'] = _safe_div(feat['send_amt_sum'].fillna(0), feat['send_cnt'].replace(0, np.nan))
    feat['recv_avg_amt'] = _safe_div(feat['recv_amt_sum'].fillna(0), feat['recv_cnt'].replace(0, np.nan))
    feat['send_unique_rate'] = _safe_div(feat['send_unique_ctp'].fillna(0), feat['send_cnt'].replace(0, np.nan))
    feat['recv_unique_rate'] = _safe_div(feat['recv_unique_ctp'].fillna(0), feat['recv_cnt'].replace(0, np.nan))

    # Account type resolution
    df_from_type = df[['from_acct', 'from_acct_type']].drop_duplicates().rename(columns={'from_acct': 'acct', 'from_acct_type': 'is_esun_from'})
    df_to_type = df[['to_acct', 'to_acct_type']].drop_duplicates().rename(columns={'to_acct': 'acct', 'to_acct_type': 'is_esun_to'})
    feat = feat.merge(df_from_type, on='acct', how='left').merge(df_to_type, on='acct', how='left')
    feat['is_esun'] = feat[['is_esun_from', 'is_esun_to']].max(axis=1).fillna(0)

    feat = feat.fillna(0)
    print(f"[OK] Feature engineering completed. Accounts: {len(feat)}; Features: {feat.shape[1]-1}")
    return feat


# ======================================================================
# 4. Train / Test Split
# ======================================================================
def make_splits(feat_df: pd.DataFrame, df_alert: pd.DataFrame, df_test: pd.DataFrame):
    feat = feat_df.copy()

    alert_set = set(df_alert['acct'].astype(str))
    feat['label'] = feat['acct'].astype(str).isin(alert_set).astype(int)

    test_set = set(df_test['acct'].astype(str))

    # Train: exclude test accounts; Esun only (match test distribution)
    train_df = feat[(~feat['acct'].astype(str).isin(test_set)) & (feat['is_esun'] == 1)].copy()
    X = train_df.drop(columns=['label'])
    y = train_df['label'].values

    # Test: join features for the listed accounts; keep all columns
    test_feat = feat[feat['acct'].astype(str).isin(test_set)].copy()
    X_test = df_test[['acct']].merge(
        test_feat.drop(columns=['label'], errors='ignore'),
        on='acct', how='left'
    ).fillna(0)

    print(f"[OK] Split -> Train accounts: {len(X)} (pos={sum(y)}, neg={len(y)-sum(y)}); Test accounts: {len(X_test)}")
    return X, y, X_test


# ======================================================================
# 5. Modeling (LightGBM + Stratified K-Fold)
# ======================================================================
def fit_model_kfold(X: pd.DataFrame, y: np.ndarray, random_state: int = 42, n_splits: int = 5):
    """
    Train LightGBM using Stratified K-Fold; tune decision threshold on OOF predictions.
    Returns: models, safe feature names, best threshold, rename_map
    """
    # Exclude identifiers/control flags from model features
    drop_cols = ['acct', 'is_esun', 'is_esun_from', 'is_esun_to']
    feat_cols = [c for c in X.columns if c not in drop_cols]
    X_features = X[feat_cols].copy()

    # Sanitize feature names for LightGBM
    def sanitize(name: str) -> str:
        return (name.replace('[', '_')
                    .replace(']', '_')
                    .replace('<', '_')
                    .replace('>', '_')
                    .replace(':', '_')
                    .replace('=', '_'))
    safe_feat_cols = [sanitize(c) for c in feat_cols]
    rename_map = dict(zip(feat_cols, safe_feat_cols))
    X_safe = X_features.rename(columns=rename_map)

    lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': 1000,
        'learning_rate': 0.03,
        'num_leaves': 31,
        'max_depth': 6,
        'seed': random_state,
        'n_jobs': -1,
        'colsample_bytree': 0.7,
        'subsample': 0.7,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'is_unbalance': True,
        'verbose': -1
    }

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof_preds = np.zeros(len(X_safe))
    models: List[lgb.LGBMClassifier] = []

    print("--- Start LightGBM Stratified K-Fold Training ---")
    for fold, (tr_idx, va_idx) in enumerate(kf.split(X_safe, y)):
        X_tr, X_va = X_safe.iloc[tr_idx], X_safe.iloc[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric='auc',
            callbacks=[lgb.early_stopping(100, verbose=False)]
        )

        oof_preds[va_idx] = clf.predict_proba(X_va)[:, 1]
        models.append(clf)
        print(f"Fold {fold+1} finished. AUC: {roc_auc_score(y_va, oof_preds[va_idx]):.4f}")

    # Threshold tuning on OOF
    prec, rec, thr = precision_recall_curve(y, oof_preds)
    f1s = 2 * prec * rec / (prec + rec + 1e-12)
    best_idx = np.nanargmax(f1s)
    best_thr = thr[best_idx] if best_idx < len(thr) else 0.5

    y_oof_pred = (oof_preds >= best_thr).astype(int)
    f1_oof = f1_score(y, y_oof_pred)

    print(f"\n[OOF] Total AUC={roc_auc_score(y, oof_preds):.4f} 🎯 F1@best={f1_oof:.4f} 🧪 Threshold={best_thr:.4f}")
    print("[OOF] Classification report:\n", classification_report(y, y_oof_pred, digits=4))

    return models, safe_feat_cols, best_thr, rename_map


def predict_test_kfold(trained_models: List[lgb.LGBMClassifier],
                       feat_cols: List[str],
                       threshold: float,
                       X_test: pd.DataFrame,
                       rename_map: Dict[str, str]):
    """
    Predict test probabilities by averaging over K models.
    Uses the same rename_map as training to align feature names.
    """
    # Apply same sanitization to X_test columns
    X_test_safe = X_test.rename(columns=rename_map).copy()

    # Ensure all expected columns exist
    for c in feat_cols:
        if c not in X_test_safe.columns:
            X_test_safe[c] = 0

    # Keep only the feature columns for prediction
    proba_sum = np.zeros(len(X_test_safe))
    for model in trained_models:
        proba_sum += model.predict_proba(X_test_safe[feat_cols])[:, 1]

    avg_proba = proba_sum / len(trained_models)
    y_pred = (avg_proba >= threshold).astype(int)
    return y_pred, avg_proba


# ======================================================================
# 6. Output
# ======================================================================
def save_submission(path: str, df_test: pd.DataFrame, X_test: pd.DataFrame, y_pred: np.ndarray):
    df_pred = pd.DataFrame({'acct': X_test['acct'].values, 'label': y_pred.astype(int)})
    out = df_test[['acct']].merge(df_pred, on='acct', how='left').fillna(0)
    out.to_csv(path, index=False, encoding='utf-8-sig')
    print(f"[OK] Saved submission to: {path}")


# ======================================================================
# 7. Main
# ======================================================================
def main():
    dir_path = "data"  # change to your folder

    try:
        df_txn, df_alert, df_test = load_csvs(dir_path)
        feat_df = engineer_features_enhanced(df_txn, df_alert)
        X, y, X_test = make_splits(feat_df, df_alert, df_test)

        models, feat_cols, thr, rename_map = fit_model_kfold(X, y, random_state=42, n_splits=5)
        y_pred, _ = predict_test_kfold(models, feat_cols, thr, X_test, rename_map)

        save_submission("enhanced_result.csv", df_test, X_test, y_pred)

    except FileNotFoundError as e:
        print(f"[ERROR] File not found: {e}. Ensure CSVs exist in '{dir_path}' with correct names.")
    except ImportError:
        print("\n🚨 ERROR: lightgbm not installed. Please: pip install lightgbm")
    except Exception as e:
        print(f"[ERROR] Exception: {e}")


if __name__ == "__main__":
    main()


[OK] Loaded datasets.
[OK] Feature engineering completed. Accounts: 1800106; Features: 75
[OK] Split -> Train accounts: 328988 (pos=1004, neg=327984); Test accounts: 4780
--- Start LightGBM Stratified K-Fold Training ---
Fold 1 finished. AUC: 0.9645
Fold 2 finished. AUC: 0.9471
Fold 3 finished. AUC: 0.9530
Fold 4 finished. AUC: 0.9657
Fold 5 finished. AUC: 0.9713

[OOF] Total AUC=0.9571 🎯 F1@best=0.4974 🧪 Threshold=0.9804
[OOF] Classification report:
               precision    recall  f1-score   support

           0     0.9984    0.9986    0.9985    327984
           1     0.5184    0.4781    0.4974      1004

    accuracy                         0.9971    328988
   macro avg     0.7584    0.7384    0.7480    328988
weighted avg     0.9969    0.9971    0.9970    328988

[OK] Saved submission to: enhanced_result.csv
