In [2]:
"""
local_train.py
- Advanced FE
- TimeSeriesSplit CV
- ElasticNet, LightGBM, XGBoost OOF
- Blend weight optimization
- Sharpe-based strategy eval
- Final model training (retrained on the full dataset)
- >>> saved as Kaggle submission files in export_models.py
"""

import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

import lightgbm as lgb
# XGBoost is optional: the pipeline falls back to ElasticNet + LightGBM when it
# cannot be imported.
try:
    import xgboost as xgb
    HAS_XGB = True
except Exception:
    # Deliberately broader than ImportError so that a broken install still
    # degrades gracefully (NOTE(review): a failing native library may surface
    # as a non-ImportError -- confirm), but no longer a bare ``except:`` which
    # would also swallow KeyboardInterrupt / SystemExit.
    HAS_XGB = False


# Input directory (relative to the working directory) and output directory for
# the artefacts exported at the end of the script.
DATA_PATH = Path("../data")
EXPORT_PATH = Path("export_kaggle")
# Created eagerly at import time; an existing directory is not an error.
EXPORT_PATH.mkdir(exist_ok=True)

# Regression target column.
target_col = "market_forward_excess_returns"
# Columns removed before any modelling.
# NOTE(review): presumably these would leak the target -- confirm against the data description.
leak_cols = ["forward_returns", "risk_free_rate"]


# ========================
# 1) Load raw data
# ========================
# Rows are ordered by date_id so that shift/rolling features and the
# time-series split below operate in chronological order.
train_df_raw = pd.read_csv(DATA_PATH / "train.csv").sort_values("date_id")
train_df = train_df_raw.drop(columns=leak_cols).reset_index(drop=True)


# ========================
# 2) Feature Engineering
# ========================
def generate_FE(df, target_col, *, lags=(1, 2, 5, 10, 21, 63),
                windows=(5, 10, 21, 63), macro_window=63,
                shock_threshold=2, max_interactions=5):
    """Build lag, rolling, volatility-regime, macro-shock and interaction features.

    The input frame is not modified; a copy is extended and returned.

    Args:
        df: Frame containing ``target_col`` plus raw feature columns. Rows are
            expected to already be in chronological order, because every
            feature below is built with positional ``shift``/``rolling``.
        target_col: Name of the target column.
        lags: Shifts applied to the target to create ``{target_col}_lag{k}``.
        windows: Rolling window lengths for the ``roll_{mean,std,min,max}_{w}``
            statistics of the 1-step-lagged target.
        macro_window: Rolling window used to z-score columns starting with "E".
        shock_threshold: ``|z|`` above this value flags a macro shock.
        max_interactions: Number of leading "M*" and "V*" columns that are
            pairwise multiplied.

    Returns:
        A new DataFrame with the original columns followed by the generated
        ones, with every row containing a NaN in any column dropped and the
        index reset.
    """
    df = df.copy()
    y = df[target_col]
    # All target-derived statistics use the target shifted by one row so the
    # current row's own target never enters its features.
    y_lag1 = y.shift(1)

    # lag features
    for lag in lags:
        df[f"{target_col}_lag{lag}"] = y.shift(lag)

    # rolling windows (one rolling object per window, reused for all stats)
    for w in windows:
        roller = y_lag1.rolling(w)
        df[f"roll_mean_{w}"] = roller.mean()
        df[f"roll_std_{w}"] = roller.std()
        df[f"roll_min_{w}"] = roller.min()
        df[f"roll_max_{w}"] = roller.max()

    # vol regime: short (21) vs long (63) rolling std of the lagged target
    df["vol21"] = y_lag1.rolling(21).std()
    df["vol63"] = y_lag1.rolling(63).std()
    df["high_vol"] = (df["vol21"] > df["vol63"]).astype(int)
    df["vol_slope"] = df["vol21"] / (df["vol63"] + 1e-9)  # epsilon avoids /0

    # macro shock (E*): rolling z-score of the lagged series, flagged when large
    macro_cols = [c for c in df.columns if c.startswith("E")]
    for col in macro_cols:
        m_lag1 = df[col].shift(1)
        m_roll = m_lag1.rolling(macro_window)
        z = (m_lag1 - m_roll.mean()) / (m_roll.std() + 1e-9)
        df[f"{col}_z"] = z
        df[f"{col}_shock"] = (z.abs() > shock_threshold).astype(int)

    shock_cols = [c for c in df.columns if c.endswith("_shock")]
    df["macro_shock_sum"] = df[shock_cols].sum(axis=1)
    df["macro_crisis"] = (df["macro_shock_sum"] >= 3).astype(int)

    # interaction M x V: products of the current-row (unshifted) values
    m_cols = [c for c in df.columns if c.startswith("M")][:max_interactions]
    v_cols = [c for c in df.columns if c.startswith("V")][:max_interactions]
    for m in m_cols:
        for v in v_cols:
            df[f"{m}_x_{v}"] = df[m] * df[v]

    # Drops the warm-up rows of the longest window/lag as well as any row with
    # a NaN in an original column.
    df = df.dropna().reset_index(drop=True)
    return df


# Build the engineered feature table from the leak-free training frame.
print(">>> FE 시작")
fe_df = generate_FE(train_df, target_col)
print("FE shape:", fe_df.shape)


# ========================
# 3) TS-CV OOF predictions
# ========================
def _record_fold(name, y_val, pred, val_idx, oof, metrics):
    """Score one model on one validation fold and store its predictions.

    Appends the fold RMSE and Pearson correlation to ``metrics[name]``, writes
    ``pred`` into ``oof[name]`` at ``val_idx`` and prints a one-line summary.
    """
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    corr = np.corrcoef(y_val, pred)[0, 1]
    metrics[name]["rmse"].append(rmse)
    metrics[name]["corr"].append(corr)
    oof[name][val_idx] = pred
    print(f"[{name}] RMSE={rmse:.6f}, Corr={corr:.4f}")


def ts_cv(fe_df, target_col, pca_components=15, n_splits=5):
    """Run time-series cross-validation and collect out-of-fold predictions.

    For every ``TimeSeriesSplit`` fold three models are fitted on the training
    indices and evaluated on the validation indices:

    * ElasticNet on standardised, PCA-reduced features (scaler and PCA are
      fitted on the training part of the fold only),
    * LightGBM on the raw feature matrix,
    * XGBoost on the raw feature matrix (only when ``HAS_XGB`` is true).

    Args:
        fe_df: Feature frame containing ``target_col``; every other column is
            used as a feature.
        target_col: Name of the target column.
        pca_components: Number of principal components fed to ElasticNet.
        n_splits: Number of ``TimeSeriesSplit`` folds.

    Returns:
        ``(y, oof, metrics, feature_cols)`` where ``y`` is the target array,
        ``oof`` maps model name to an array of out-of-fold predictions (NaN
        for rows that were never part of a validation fold), ``metrics`` maps
        model name to ``{"rmse": [...], "corr": [...]}`` per fold, and
        ``feature_cols`` is the ordered list of feature column names.
    """
    df = fe_df.copy()
    y = df[target_col].values
    feature_cols = [c for c in df.columns if c != target_col]
    X = df[feature_cols].values
    n = len(df)

    model_names = ["ElasticNet", "LightGBM"]
    if HAS_XGB:
        model_names.append("XGBoost")
    oof = {name: np.full(n, np.nan) for name in model_names}
    metrics = {name: {"rmse": [], "corr": []} for name in model_names}

    # Loop-invariant booster settings; a fresh copy is handed to each train
    # call so a library that edits the dict cannot affect later folds.
    lgb_params = {
        "objective": "regression",
        "metric": "rmse",
        "learning_rate": 0.03,
        "num_leaves": 63,
    }
    xgb_params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "eta": 0.03,
        "max_depth": 6,
    }

    tscv = TimeSeriesSplit(n_splits=n_splits)
    for fold, (tr, val) in enumerate(tscv.split(X), 1):
        print(f"\n=== Fold {fold} ===")

        X_tr, X_val = X[tr], X[val]
        y_tr, y_val = y[tr], y[val]

        # ENet: scale -> PCA -> linear model, all fitted on the training part
        scaler = StandardScaler().fit(X_tr)
        X_tr_s = scaler.transform(X_tr)
        X_val_s = scaler.transform(X_val)

        pca = PCA(n_components=pca_components).fit(X_tr_s)
        X_tr_p = pca.transform(X_tr_s)
        X_val_p = pca.transform(X_val_s)

        enet = ElasticNet(alpha=1e-3, l1_ratio=0.1, max_iter=5000)
        enet.fit(X_tr_p, y_tr)
        _record_fold("ElasticNet", y_val, enet.predict(X_val_p), val, oof, metrics)

        # LGB: fixed number of rounds, no early stopping / validation set
        lgbm = lgb.train(dict(lgb_params), lgb.Dataset(X_tr, label=y_tr),
                         num_boost_round=800)
        _record_fold("LightGBM", y_val, lgbm.predict(X_val), val, oof, metrics)

        # XGB
        if HAS_XGB:
            dtr = xgb.DMatrix(X_tr, label=y_tr)
            dval = xgb.DMatrix(X_val, label=y_val)
            xgbm = xgb.train(dict(xgb_params), dtr, num_boost_round=800)
            _record_fold("XGBoost", y_val, xgbm.predict(dval), val, oof, metrics)

    return y, oof, metrics, feature_cols


# Run the time-series CV; `oof` holds per-model out-of-fold predictions
# aligned row-for-row with `y`.
print(">>> TS-CV 시작")
y, oof, metrics, feature_cols = ts_cv(fe_df, target_col)


# ========================
# 4) Blend weight search
# ========================
def metric_rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def optimize_blend(y, oof, step=0.05):
    """Grid-search convex blend weights that minimise out-of-fold RMSE.

    Weights are enumerated on an integer grid (``k / n_steps``) rather than
    with accumulated floats, so combinations on the simplex boundary are not
    lost to rounding and every returned weight vector sums to exactly the
    grid total. Any number of models (>= 1) is supported.

    Args:
        y: 1-D array of true targets.
        oof: Mapping ``model name -> 1-D array`` of out-of-fold predictions,
            aligned with ``y``; NaN marks rows without a prediction.
        step: Grid resolution for each weight, in ``(0, 1]``.

    Returns:
        ``(best_w, best_rmse)`` where ``best_w`` maps each model name to its
        weight (plain ``float``; weights are non-negative and sum to 1) and
        ``best_rmse`` is the RMSE of that blend on the valid rows. Ties keep
        the first combination found.

    Raises:
        ValueError: if ``oof`` is empty, ``step`` is out of range, or no row
            has a target and a prediction from every model.
    """
    from itertools import product  # local import keeps the block self-contained

    names = list(oof)
    if not names:
        raise ValueError("oof must contain at least one model")
    if not 0 < step <= 1:
        raise ValueError("step must be in (0, 1]")
    n_steps = int(round(1.0 / step))

    y = np.asarray(y, dtype=float)
    preds = np.column_stack([np.asarray(oof[name], dtype=float) for name in names])
    # A row is usable only if the target and *every* model's prediction exist.
    valid_mask = ~(np.isnan(preds).any(axis=1) | np.isnan(y))
    if not valid_mask.any():
        raise ValueError("no rows with a target and predictions from every model")
    y_valid = y[valid_mask]
    preds_valid = preds[valid_mask]

    best_rmse = np.inf
    best_w = None
    # Enumerate the first N-1 weights; the last one is whatever remains.
    for head in product(range(n_steps + 1), repeat=len(names) - 1):
        remainder = n_steps - sum(head)
        if remainder < 0:
            continue
        w = np.array(head + (remainder,), dtype=float) / n_steps
        rmse = float(np.sqrt(np.mean((y_valid - preds_valid @ w) ** 2)))
        if rmse < best_rmse:
            best_rmse = rmse
            best_w = dict(zip(names, w.tolist()))

    return best_w, best_rmse


# Search blend weights on the out-of-fold predictions and report the best.
print(">>> Blend 탐색")
best_w, best_rmse = optimize_blend(y, oof)
print("Best Blend:", best_w, "RMSE:", best_rmse)


# ========================
# 5) Retrain FINAL models (on the full dataset)
# ========================
print(">>> FINAL MODEL TRAIN")

# Every row of the engineered frame is used; there is no hold-out here.
X_all = fe_df[feature_cols].values
y_all = fe_df[target_col].values

# ElasticNet pipeline: standardise -> PCA(15) -> ElasticNet, using the same
# settings as the CV loop in ts_cv.
scaler = StandardScaler().fit(X_all)
X_scaled = scaler.transform(X_all)

pca = PCA(n_components=15).fit(X_scaled)
X_p = pca.transform(X_scaled)

enet_final = ElasticNet(alpha=1e-3, l1_ratio=0.1, max_iter=5000).fit(X_p, y_all)

# LightGBM on the raw (unscaled) feature matrix with a fixed 800 rounds.
lgb_train = lgb.Dataset(X_all, label=y_all)
lgb_final = lgb.train({
    "objective":"regression",
    "metric":"rmse",
    "learning_rate":0.03,
    "num_leaves":63
}, lgb_train, num_boost_round=800)

# XGBoost is optional; xgb_final is None when the import failed so the export
# step can skip it.
if HAS_XGB:
    dtr = xgb.DMatrix(X_all, label=y_all)
    xgb_final = xgb.train({
        "objective":"reg:squarederror",
        "eta":0.03,
        "max_depth":6
    }, dtr, num_boost_round=800)
else:
    xgb_final = None


# ========================
# 6) Export for Kaggle
# ========================

import json
import pickle

# Persist the fitted preprocessing objects and the linear model. Each file is
# written inside a ``with`` block so the handle is flushed and closed
# deterministically instead of being left to garbage collection.
for artifact, filename in (
    (scaler, "scaler.pkl"),
    (pca, "pca.pkl"),
    (enet_final, "enet.pkl"),
):
    with open(EXPORT_PATH / filename, "wb") as fh:
        pickle.dump(artifact, fh)

# Boosters are saved through their own save_model methods.
lgb_final.save_model(str(EXPORT_PATH / "lgb.txt"))

if xgb_final is not None:
    xgb_final.save_model(str(EXPORT_PATH / "xgb.json"))

# Feature order and blend weights are needed to reproduce predictions.
with open(EXPORT_PATH / "feature_list.json", "w", encoding="utf-8") as fh:
    json.dump(feature_cols, fh)
with open(EXPORT_PATH / "blend_weights.json", "w", encoding="utf-8") as fh:
    json.dump(best_w, fh)

print("\n=== ALL DONE ===")
print("Exported files:", list(EXPORT_PATH.iterdir()))


>>> FE 시작
FE shape: (1989, 189)
>>> TS-CV 시작

=== Fold 1 ===
[ElasticNet] RMSE=0.014965, Corr=0.0744
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001477 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16412
[LightGBM] [Info] Number of data points in the train set: 334, number of used features: 183
[LightGBM] [Info] Start training from score -0.000048
[LightGBM] RMSE=0.014503, Corr=0.0573
[XGBoost] RMSE=0.014951, Corr=0.0799

=== Fold 2 ===
[ElasticNet] RMSE=0.009820, Corr=0.1064
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003526 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32391
[LightGBM] [Info] Number of data points in the train set: 665, number of used features: 187
[LightGBM] [Info] Start training from score 0.000205
[LightGBM] RMSE=0.011504, Corr=0.0210
[XGBoost] RMSE=0.015962, Corr=0.0454

=== Fol

In [5]:
"""
local_train.py
- FE-based TS-CV / OOF / Blend / Sharpe analysis
- Selection and training of the final raw-feature model (e.g. LGBM) for Kaggle submission
"""

import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

import lightgbm as lgb
# XGBoost is optional: the pipeline falls back to ElasticNet + LightGBM when it
# cannot be imported.
try:
    import xgboost as xgb
    HAS_XGB = True
except Exception:
    # Deliberately broader than ImportError so that a broken install still
    # degrades gracefully (NOTE(review): a failing native library may surface
    # as a non-ImportError -- confirm), but no longer a bare ``except:`` which
    # would also swallow KeyboardInterrupt / SystemExit.
    HAS_XGB = False


# Input directory (relative to the working directory) and output directory for
# the exported model files.
DATA_PATH = Path("../data")
EXPORT_PATH = Path("export_kaggle")
# Created eagerly at import time; an existing directory is not an error.
EXPORT_PATH.mkdir(exist_ok=True)

# Regression target column.
target_col = "market_forward_excess_returns"
# Columns removed before any modelling.
# NOTE(review): presumably these would leak the target -- confirm against the data description.
leak_cols = ["forward_returns", "risk_free_rate"]


# ==========================
# Load Data
# ==========================
# Rows are ordered by date_id so that shift/rolling features and the
# time-series split below operate in chronological order.
train_df_raw = pd.read_csv(DATA_PATH / "train.csv").sort_values("date_id")
train_df = train_df_raw.drop(columns=leak_cols).reset_index(drop=True)


# ==========================
# Feature Engineering (FE)
# ==========================
def generate_FE(df, target_col, *, lags=(1, 2, 5, 10, 21, 63),
                windows=(5, 10, 21, 63), macro_window=63,
                shock_threshold=2, max_interactions=5):
    """Build lag, rolling, volatility-regime, macro-shock and interaction features.

    The input frame is not modified; a copy is extended and returned.

    Args:
        df: Frame containing ``target_col`` plus raw feature columns. Rows are
            expected to already be in chronological order, because every
            feature below is built with positional ``shift``/``rolling``.
        target_col: Name of the target column.
        lags: Shifts applied to the target to create ``{target_col}_lag{k}``.
        windows: Rolling window lengths for the ``roll_{mean,std,min,max}_{w}``
            statistics of the 1-step-lagged target.
        macro_window: Rolling window used to z-score columns starting with "E".
        shock_threshold: ``|z|`` above this value flags a macro shock.
        max_interactions: Number of leading "M*" and "V*" columns that are
            pairwise multiplied.

    Returns:
        A new DataFrame with the original columns followed by the generated
        ones, with every row containing a NaN in any column dropped and the
        index reset.
    """
    df = df.copy()
    y = df[target_col]
    # All target-derived statistics use the target shifted by one row so the
    # current row's own target never enters its features.
    y_lag1 = y.shift(1)

    # lag features
    for lag in lags:
        df[f"{target_col}_lag{lag}"] = y.shift(lag)

    # rolling stats (one rolling object per window, reused for all stats)
    for w in windows:
        roller = y_lag1.rolling(w)
        df[f"roll_mean_{w}"] = roller.mean()
        df[f"roll_std_{w}"] = roller.std()
        df[f"roll_min_{w}"] = roller.min()
        df[f"roll_max_{w}"] = roller.max()

    # volatility regime: short (21) vs long (63) rolling std of the lagged target
    df["vol21"] = y_lag1.rolling(21).std()
    df["vol63"] = y_lag1.rolling(63).std()
    df["high_vol"] = (df["vol21"] > df["vol63"]).astype(int)
    df["vol_slope"] = df["vol21"] / (df["vol63"] + 1e-9)  # epsilon avoids /0

    # macro shocks (E*): rolling z-score of the lagged series, flagged when large
    macro_cols = [c for c in df.columns if c.startswith("E")]
    for col in macro_cols:
        m_lag1 = df[col].shift(1)
        m_roll = m_lag1.rolling(macro_window)
        z = (m_lag1 - m_roll.mean()) / (m_roll.std() + 1e-9)
        df[f"{col}_z"] = z
        df[f"{col}_shock"] = (z.abs() > shock_threshold).astype(int)

    shock_cols = [c for c in df.columns if c.endswith("_shock")]
    df["macro_shock_sum"] = df[shock_cols].sum(axis=1)
    df["macro_crisis"] = (df["macro_shock_sum"] >= 3).astype(int)

    # interaction M x V: products of the current-row (unshifted) values
    m_cols = [c for c in df.columns if c.startswith("M")][:max_interactions]
    v_cols = [c for c in df.columns if c.startswith("V")][:max_interactions]
    for m in m_cols:
        for v in v_cols:
            df[f"{m}_x_{v}"] = df[m] * df[v]

    # Drops the warm-up rows of the longest window/lag as well as any row with
    # a NaN in an original column.
    return df.dropna().reset_index(drop=True)


# Build the engineered feature table from the leak-free training frame.
print(">>> Running FE …")
fe_df = generate_FE(train_df, target_col)
print(fe_df.shape)


# ==========================
# TS-CV + OOF Predictions
# ==========================
def ts_cv(fe_df, target_col):
    """Run 5-fold time-series CV and collect out-of-fold predictions.

    For every ``TimeSeriesSplit`` fold the following models are fitted on the
    training indices and used to predict the validation indices:

    * ElasticNet on standardised, PCA(15)-reduced features (scaler and PCA
      are fitted on the training part of the fold only),
    * LightGBM on the raw feature matrix,
    * XGBoost on the raw feature matrix (only when ``HAS_XGB`` is true).

    Args:
        fe_df: Feature frame containing ``target_col``; every other column is
            used as a feature.
        target_col: Name of the target column.

    Returns:
        ``(y, oof, feature_cols)`` where ``y`` is the target array, ``oof``
        maps model name to an array of out-of-fold predictions (NaN for rows
        that were never part of a validation fold) and ``feature_cols`` is
        the ordered list of feature column names.
    """
    df = fe_df.copy()
    y = df[target_col].values
    feature_cols = [c for c in df.columns if c != target_col]
    X = df[feature_cols].values
    n = len(df)

    oof = {"ElasticNet": np.full(n, np.nan), "LightGBM": np.full(n, np.nan)}
    if HAS_XGB:
        oof["XGBoost"] = np.full(n, np.nan)

    # Loop-invariant booster settings; a fresh copy is handed to each train
    # call so a library that edits the dict cannot affect later folds.
    lgb_params = {"objective": "regression", "metric": "rmse",
                  "learning_rate": 0.03, "num_leaves": 63}
    xgb_params = {"objective": "reg:squarederror", "eta": 0.03, "max_depth": 6}

    tscv = TimeSeriesSplit(n_splits=5)

    for fold, (tr, val) in enumerate(tscv.split(X), 1):
        print(f"\n=== Fold {fold} ===")

        X_tr, X_val = X[tr], X[val]
        y_tr, y_val = y[tr], y[val]

        # ElasticNet (scaled+PCA)
        scaler = StandardScaler().fit(X_tr)
        X_tr_s = scaler.transform(X_tr)
        X_val_s = scaler.transform(X_val)

        pca = PCA(n_components=15).fit(X_tr_s)
        X_tr_p = pca.transform(X_tr_s)
        X_val_p = pca.transform(X_val_s)

        enet = ElasticNet(alpha=1e-3, l1_ratio=0.1).fit(X_tr_p, y_tr)
        oof["ElasticNet"][val] = enet.predict(X_val_p)

        # LightGBM: fixed number of rounds, no early stopping
        lgbm = lgb.train(
            dict(lgb_params),
            lgb.Dataset(X_tr, label=y_tr),
            num_boost_round=800
        )
        oof["LightGBM"][val] = lgbm.predict(X_val)

        if HAS_XGB:
            dtr = xgb.DMatrix(X_tr, label=y_tr)
            # The validation label is attached but not used for training.
            dval = xgb.DMatrix(X_val, label=y_val)
            xgbm = xgb.train(dict(xgb_params), dtr, num_boost_round=800)
            oof["XGBoost"][val] = xgbm.predict(dval)

    return y, oof, feature_cols


# Run the time-series CV; `oof` holds per-model out-of-fold predictions
# aligned row-for-row with `y`.
print(">>> Running TS-CV …")
y, oof, feature_cols = ts_cv(fe_df, target_col)


# ==========================
# Blend weights based on OOF predictions
# ==========================
def best_blend(y, oof, step=0.05):
    """Grid-search convex blend weights that minimise out-of-fold RMSE.

    Weights are enumerated on an integer grid (``k / n_steps``) rather than
    with accumulated floats, so combinations on the simplex boundary are not
    lost to rounding and every returned weight vector sums to exactly the
    grid total. Any number of models (>= 1) is supported.

    Args:
        y: 1-D array of true targets.
        oof: Mapping ``model name -> 1-D array`` of out-of-fold predictions,
            aligned with ``y``; NaN marks rows without a prediction.
        step: Grid resolution for each weight, in ``(0, 1]``.

    Returns:
        ``(best_w, best_rmse)`` where ``best_w`` maps each model name to its
        weight (plain ``float``; weights are non-negative and sum to 1) and
        ``best_rmse`` is the RMSE of that blend on the valid rows. Ties keep
        the first combination found.

    Raises:
        ValueError: if ``oof`` is empty, ``step`` is out of range, or no row
            has a target and a prediction from every model.
    """
    from itertools import product  # local import keeps the block self-contained

    names = list(oof)
    if not names:
        raise ValueError("oof must contain at least one model")
    if not 0 < step <= 1:
        raise ValueError("step must be in (0, 1]")
    n_steps = int(round(1.0 / step))

    y = np.asarray(y, dtype=float)
    preds = np.column_stack([np.asarray(oof[name], dtype=float) for name in names])
    # A row is usable only if the target and *every* model's prediction exist.
    valid = ~(np.isnan(preds).any(axis=1) | np.isnan(y))
    if not valid.any():
        raise ValueError("no rows with a target and predictions from every model")
    y_valid = y[valid]
    pv = preds[valid]

    best_rmse, best_w = np.inf, None
    # Enumerate the first N-1 weights; the last one is whatever remains.
    for head in product(range(n_steps + 1), repeat=len(names) - 1):
        remainder = n_steps - sum(head)
        if remainder < 0:
            continue
        w = np.array(head + (remainder,), dtype=float) / n_steps
        rmse = float(np.sqrt(np.mean((y_valid - pv @ w) ** 2)))
        if rmse < best_rmse:
            best_rmse, best_w = rmse, dict(zip(names, w.tolist()))

    return best_w, best_rmse


# Search blend weights on the out-of-fold predictions and report the best.
print(">>> Searching best blend …")
blend_w, blend_rmse = best_blend(y, oof)
print("Best Blend:", blend_w, "RMSE:", blend_rmse)


# ==========================
# Train the final RAW-feature model for Kaggle submission
# ==========================

print(">>> Training final RAW model for Kaggle …")

# raw features: use train_df directly, without feature engineering
raw_features = [c for c in train_df.columns if c != target_col]
X_raw = train_df[raw_features].values
y_raw = train_df[target_col].values

# Single LightGBM booster on all rows; fixed 1200 rounds, no validation set.
raw_lgb = lgb.train(
    {"objective":"regression","metric":"rmse","learning_rate":0.03,"num_leaves":63},
    lgb.Dataset(X_raw, label=y_raw),
    num_boost_round=1200
)

import json
import pickle

# The feature order is stored next to the model so inference can rebuild the
# same column layout. Written inside ``with`` so the handle is flushed and
# closed deterministically.
with open(EXPORT_PATH / "raw_feature_list.pkl", "wb") as fh:
    pickle.dump(raw_features, fh)
raw_lgb.save_model(str(EXPORT_PATH / "raw_model_lgb.txt"))

print("Training complete.")
print("Saved to export_kaggle/")


>>> Running FE …
(1989, 189)
>>> Running TS-CV …

=== Fold 1 ===
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001575 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16412
[LightGBM] [Info] Number of data points in the train set: 334, number of used features: 183
[LightGBM] [Info] Start training from score -0.000048

=== Fold 2 ===
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002202 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32391
[LightGBM] [Info] Number of data points in the train set: 665, number of used features: 187
[LightGBM] [Info] Start training from score 0.000205

=== Fold 3 ===
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002847 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38071
[LightGBM] [Info] Num