In [1]:
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import polars as pl

# 모델링 라이브러리
from sklearn.linear_model import ElasticNet
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import lightgbm as lgb

import kaggle_evaluation.default_inference_server

# Data location: prefer a local "dataset" directory when it exists,
# otherwise fall back to the Kaggle competition input directory.
DATA_PATH = (
    Path("dataset")
    if Path("dataset").exists()
    else Path("/kaggle/input/hull-tactical-market-prediction")
)

print(f"Data Path: {DATA_PATH}")

Data Path: /kaggle/input/hull-tactical-market-prediction


In [2]:
# Load the training file and put it in chronological order with a clean 0..n-1 index.
train_df_raw = (
    pd.read_csv(DATA_PATH / "train.csv")
    .sort_values("date_id")
    .reset_index(drop=True)
)

# Name of the regression target.
target_col = "market_forward_excess_returns"

# Columns removed from the feature matrix because they would leak the target.
leak_cols = [
    "forward_returns",
    "risk_free_rate",
    "date_id",
]

# Additional columns removed from the feature matrix when they are present.
extra_drop_cols = [
    "is_scored",
    "lagged_forward_returns",
    "lagged_market_forward_excess_returns",
    "lagged_risk_free_rate",
]

print(f"Train data shape: {train_df_raw.shape}")

Train data shape: (9021, 98)


In [3]:
def generate_FE_interaction_regime(df, target_col="market_forward_excess_returns"):
    """
    Build lag, rolling, regime and interaction features (shared by training and inference).

    No rows are dropped; warm-up rows simply contain NaN in the float features
    and 0 in the integer flags (comparisons against NaN are False).

    All new features are collected first and attached with a single concat.
    Inserting 100+ columns one by one fragments the frame and makes pandas emit
    a PerformanceWarning on every call, which matters because this function is
    executed once per inference step.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows in chronological order. The frame is not modified.
    target_col : str
        Name of the target column. When it is absent, every target-derived
        float feature is NaN and every target-derived flag is 0.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns followed by the engineered
        features, in the order in which they are created below.
    """
    df = df.copy()
    base_cols = set(df.columns)
    new_cols = {}  # feature name -> Series, kept in creation order

    def _get(name):
        # A feature created in this call shadows an input column of the same name.
        return new_cols[name] if name in new_cols else df[name]

    def _has(name):
        return name in new_cols or name in base_cols

    def _names():
        # Column order as if every feature had been appended to df so far.
        return list(df.columns) + [c for c in new_cols if c not in base_cols]

    # ---- Target and its one-step lag ----
    if target_col in df.columns:
        y = df[target_col]
    else:
        y = pd.Series([np.nan] * len(df), index=df.index)
    y_lag1 = y.shift(1)

    # 1) Lag features
    for lag in (1, 2, 5, 10, 21, 63):
        new_cols[f"{target_col}_lag{lag}"] = y.shift(lag)

    # 2) Rolling statistics, computed on the lagged target so that the
    #    current row's own target never enters its features.
    for w in (5, 10, 21, 63):
        roll_mean = y_lag1.rolling(w).mean()
        roll_std = y_lag1.rolling(w).std()
        new_cols[f"roll_mean_{w}"] = roll_mean
        new_cols[f"roll_std_{w}"] = roll_std
        new_cols[f"roll_min_{w}"] = y_lag1.rolling(w).min()
        new_cols[f"roll_max_{w}"] = y_lag1.rolling(w).max()
        new_cols[f"zscore_{w}"] = (y_lag1 - roll_mean) / (roll_std + 1e-9)

    # 3) Volatility regime
    vol21 = y_lag1.rolling(21).std()
    vol63 = y_lag1.rolling(63).std()
    new_cols["vol21"] = vol21
    new_cols["vol63"] = vol63
    new_cols["high_vol"] = (vol21 > vol63).astype(int)

    # The rolling 90% quantile of vol21 is only a threshold; it is not emitted as a feature.
    vol21_q90 = vol21.rolling(252, min_periods=50).quantile(0.9)
    new_cols["crisis"] = (vol21 > vol21_q90).astype(int)

    new_cols["vol_slope"] = vol21 / (vol63 + 1e-9)

    # 4) Macro shock regime: z-score of each lagged "E*" column over 63 rows
    macro_cols = [c for c in _names() if c.startswith("E")]
    for col in macro_cols:
        m_lag1 = _get(col).shift(1)
        roll_mean_E = m_lag1.rolling(63).mean()
        roll_std_E = m_lag1.rolling(63).std()
        z = (m_lag1 - roll_mean_E) / (roll_std_E + 1e-9)
        new_cols[f"{col}_z"] = z
        new_cols[f"{col}_shock"] = (z.abs() > 2).astype(int)

    shock_cols = [c for c in _names() if c.endswith("_shock")]
    if len(shock_cols) > 0:
        shock_sum = pd.concat([_get(c) for c in shock_cols], axis=1).sum(axis=1)
        new_cols["macro_shock_sum"] = shock_sum
        new_cols["macro_crisis"] = (shock_sum >= 3).astype(int)
    else:
        new_cols["macro_shock_sum"] = pd.Series(0, index=df.index)
        new_cols["macro_crisis"] = pd.Series(0, index=df.index)

    # 5) Momentum x volatility interactions (first five "M*" and "V*" columns)
    names_now = _names()
    momentum_cols = [c for c in names_now if c.startswith("M")][:5]
    vol_cols = [c for c in names_now if c.startswith("V")][:5]
    for m in momentum_cols:
        for v in vol_cols:
            new_cols[f"{m}_x_{v}"] = _get(m) * _get(v)

    # 6) Macro spreads, only for pairs whose columns are both available
    for a, b in (("E2", "E11"), ("E7", "E12"), ("E3", "E5")):
        if _has(a) and _has(b):
            new_cols[f"{a}_minus_{b}"] = _get(a) - _get(b)

    # 7) Return shock: lagged target beyond two rolling standard deviations
    y_std_hist = y_lag1.rolling(252, min_periods=50).std()
    new_cols["return_shock"] = (y_lag1.abs() > 2 * (y_std_hist + 1e-9)).astype(int)

    # Features whose name already exists in the input overwrite that column in
    # place; all others are appended in one consolidated concat.
    for name, values in new_cols.items():
        if name in base_cols:
            df[name] = values
    fresh_names = [name for name in new_cols if name not in base_cols]
    if not fresh_names:
        return df
    features = pd.concat([new_cols[name] for name in fresh_names], axis=1, keys=fresh_names)
    return pd.concat([df, features], axis=1)

In [4]:
# 1. Feature engineering: build every feature on the full history, then keep
#    only the rows that have no missing value in any column.
print("Generating features...")
train_fe = (
    generate_FE_interaction_regime(train_df_raw, target_col)
    .dropna()
    .reset_index(drop=True)
)

# 2. Split into X / y (the target, leak columns and extra columns are removed from X)
drop_list = [target_col, *leak_cols, *extra_drop_cols]
y_train = train_fe[target_col]
X_train = train_fe.drop(columns=drop_list, errors='ignore')

print(f"Training Features Shape: {X_train.shape}")

# ====================================================
# Model 1: ElasticNet pipeline (with PCA)
# ====================================================
print("Training ElasticNet...")
# Author's rationale: ElasticNet is sensitive to multicollinearity, so the
# pipeline imputes, standardizes and projects onto principal components first.
en_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),  # n_components=0.95 as configured by the author
    ('regressor', ElasticNet(alpha=0.001, l1_ratio=0.5, random_state=42)),
]
model_en = Pipeline(en_steps)
model_en.fit(X_train, y_train)


# ====================================================
# Model 2: LightGBM (raw features)
# ====================================================
print("Training LightGBM...")
# Author's rationale: LightGBM is fed the features directly, without the
# imputer / scaler / PCA steps used for ElasticNet.
lgb_params = dict(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=5,
    num_leaves=31,
    random_state=42,
    verbose=-1,
)
model_lgb = lgb.LGBMRegressor(**lgb_params)
# Author's note: LightGBM may reject column names containing special characters,
# which would require renaming; the names are used unchanged here.
model_lgb.fit(X_train, y_train)

print("All models trained!")

Generating features...
Training Features Shape: (1989, 196)
Training ElasticNet...


  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  df[f"{a}_minus_{b}"] = df[a] - df[b]
  df[f"{a}_minus_{b}"] = df[a] - df[b]
  df[f"{a}_minus_{b}"] = df[a] - df[b]
  return op(a, b)
  df["return_shock"] = (y_lag1.abs() > 2 * (y_std_hist + 1e-9)).astype(int)


Training LightGBM...
All models trained!


In [9]:
from collections import deque
# ==========================================
# Blend weights
# ==========================================
# ElasticNet and LightGBM weights; note they sum to 1.45, not 1.
W_EN, W_LGB = 0.95, 0.5

# ==========================================
# Global buffers used by predict()
# ==========================================
MAX_HISTORY_LEN = 500  # number of most recent raw rows kept for feature engineering
VOL_WINDOW = 20        # number of most recent weights kept for the volatility control

if 'train_df_raw' not in globals():
    print("Warning: train_df_raw not found. Starting empty.")
    history_df = pd.DataFrame()
else:
    # Seed the buffer with the tail of the training data. The rows are kept raw
    # (leak columns included) because feature engineering reads them.
    history_df = train_df_raw.tail(MAX_HISTORY_LEN).copy()
    print(f"History initialized with {len(history_df)} rows.")

# Rolling buffer of past weights, pre-filled with the neutral weight 1.0.
past_weights = deque((1.0 for _ in range(VOL_WINDOW)), maxlen=VOL_WINDOW)

History initialized with 500 rows.


In [20]:
def _backfill_realized_targets(frame, lagged_col="lagged_market_forward_excess_returns"):
    """
    Fill missing `target_col` values of `frame` from the lagged target column.

    Test rows do not carry the target itself, so without this step every
    target lag / rolling feature turns into NaN (and then 0) after the first
    inference call, while the models were trained on real lag values.

    Assumes the value of `lagged_col` on date_id d is the realized target of
    date_id d - 1 -- TODO confirm against the competition data description.
    Only missing targets are filled; known values are never overwritten.
    The frame is modified in place and returned.
    """
    if lagged_col not in frame.columns or "date_id" not in frame.columns:
        return frame

    reported = frame.loc[
        frame[lagged_col].notna() & frame["date_id"].notna(),
        ["date_id", lagged_col],
    ]
    if reported.empty:
        return frame

    realized = pd.Series(
        reported[lagged_col].to_numpy(dtype=float),
        index=reported["date_id"].to_numpy() - 1,
    )
    # When a date is reported more than once, keep the most recent report.
    realized = realized[~realized.index.duplicated(keep="last")]

    candidates = frame["date_id"].map(realized)
    if target_col in frame.columns:
        frame[target_col] = frame[target_col].fillna(candidates)
    else:
        frame[target_col] = candidates
    return frame


def predict(test: pl.DataFrame) -> float:
    """
    ElasticNet(0.95) + LightGBM(0.5) Blending Inference
    Updates history for ALL rows, but returns ONLY the first row's prediction.

    Steps: append the batch to the global history buffer, backfill realized
    targets from the lagged target column, rebuild features on the whole
    buffer, predict with both models, turn the blended prediction into a
    weight (1 + 0.5 * prediction), apply the RMS-based scaling against
    `past_weights`, and clip to [0, 2].

    Parameters
    ----------
    test : pl.DataFrame
        One or more rows in chronological order.

    Returns
    -------
    float
        Weight for the first row of the batch. Returns 0.0 when the models
        are not defined, when the batch is empty, or when any step raises
        (the error is printed, not propagated).

    Side effects: rebinds the global `history_df` and appends to `past_weights`.
    """
    global history_df, model_en, model_lgb, past_weights

    if 'model_en' not in globals() or 'model_lgb' not in globals():
        return 0.0

    try:
        # 1. Receive data (Polars -> Pandas)
        current_batch_pd = test.to_pandas()
        batch_size = len(current_batch_pd)
        if batch_size == 0:
            # iloc[-0:] selects the WHOLE frame, so an empty batch would be
            # scored as the entire history and pollute past_weights.
            return 0.0

        # 2. Merge with history (the whole batch, to keep chronological order)
        full_df = pd.concat([history_df, current_batch_pd], axis=0, ignore_index=True)

        # 2b. Recover realized targets so lag/rolling features stay populated.
        #     A row's own target is never used for its own features (the FE
        #     function only reads shifted values), so this does not leak.
        full_df = _backfill_realized_targets(full_df)

        # 3. Feature engineering on history + batch
        fe_full_df = generate_FE_interaction_regime(full_df, target_col)

        # 4. Keep only the rows of the current batch
        X_test_fe = fe_full_df.iloc[-batch_size:].copy()

        # 5. Update the history buffer for the next call (raw rows, not features)
        history_df = full_df.iloc[-MAX_HISTORY_LEN:].copy()

        # -----------------------------------------------
        # Preprocessing
        # -----------------------------------------------
        drop_list = [target_col] + leak_cols + extra_drop_cols
        X_test_clean = X_test_fe.drop(columns=drop_list, errors='ignore')

        # Align to the columns seen at fit time (when the pipeline exposes
        # them) so that an unexpected or missing test column does not make
        # every call fail and silently fall back to 0.0.
        trained_cols = getattr(model_en, "feature_names_in_", None)
        if trained_cols is not None:
            X_test_clean = X_test_clean.reindex(columns=list(trained_cols))

        X_test_clean = X_test_clean.fillna(0)

        # -----------------------------------------------
        # Prediction (whole batch at once)
        # -----------------------------------------------
        pred_en = model_en.predict(X_test_clean)
        pred_lgb = model_lgb.predict(X_test_clean)

        final_pred_vals = (pred_en * W_EN) + (pred_lgb * W_LGB)

        k_factor = 0.5
        raw_weights = 1.0 + (final_pred_vals * k_factor)

        # -----------------------------------------------
        # Post-processing: volatility control (sequential)
        # -----------------------------------------------
        final_weights = []
        MAX_VOL_RATIO = 1.15

        for w in raw_weights:
            # 1. RMS of the recent weights including the candidate weight
            current_rms = np.sqrt(np.mean(np.square(list(past_weights) + [w])))

            final_w = w
            # 2. Scale the candidate down when the RMS exceeds the cap
            if current_rms > MAX_VOL_RATIO:
                scale_factor = MAX_VOL_RATIO / (current_rms + 1e-9)
                final_w = w * scale_factor

            # 3. Clipping
            final_w = np.clip(final_w, 0.0, 2.0)

            # 4. Store and update the weight history
            final_weights.append(final_w)
            past_weights.append(final_w)

        # -----------------------------------------------
        # Return (first row only)
        # -----------------------------------------------
        return float(final_weights[0])

    except Exception as e:
        # Deliberate best-effort: report the error and return the 0.0 fallback weight.
        print(f"Error in predict: {e}")
        return 0.0

In [11]:
# Wrap predict() in the Kaggle evaluation inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# Serve only when the competition-rerun environment variable is set;
# otherwise just report that the server object was created.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    print("Local testing mode - Inference Server ready")
else:
    inference_server.serve()

Local testing mode - Inference Server ready


In [21]:
# Smoke test: run predict() once on a multi-row batch (every test row except the first).
# The test file is loaded into `test_df`, not `train_df_raw`: overwriting the training
# frame with test data would corrupt any cell that is re-run afterwards, and `test_df`
# was previously used here before any cell had defined it.
# NOTE: predict() mutates the global history / weight buffers, so this call advances them.
test_df = pd.read_csv(DATA_PATH / "test.csv").sort_values("date_id").reset_index(drop=True)
predict(pl.DataFrame(test_df.iloc[1:, :]))

  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  df[f"{m}_x_{v}"] = df[m] * df[v]
  df[f"{a}_minus_{b}"] = df[a] - df[b]
  df[f"{a}_minus_{b}"] = df[a] - df[b]
  df[f"{a}_minus_{b}"] = df[a] - df[b]
  return op(a, b)
  df["return_shock"] = (y_lag1.abs() > 2 * (y_std_hist + 1e-9)).astype(int)


0.9990978495240869

In [22]:
# Local simulation: feed the test file to predict() one row at a time and
# collect the returned weights.
predictions = []
test_df = pd.read_csv(DATA_PATH / "test.csv")
for i in range(len(test_df)):
    try:
        # Double brackets keep the single row as a DataFrame; predict() expects Polars.
        row_pd = test_df.iloc[[i]]
        row_pl = pl.from_pandas(row_pd)
        pred_weight = predict(row_pl)
    except Exception as e:
        print(f"Error at index {i}: {e}")
        predictions.append(0.0)  # fallback weight on error (cash)
    else:
        predictions.append(pred_weight)

  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  df[f"{m}_x_{v}"] = df[m] * df[v]
  df[f"{a}_minus_{b}"] = df[a] - df[b]
  df[f"{a}_minus_{b}"] = df[a] - df[b]
  df[f"{a}_minus_{b}"] = df[a] - df[b]
  return op(a, b)
  df["return_shock"] = (y_lag1.abs() > 2 * (y_std_hist + 1e-9)).astype(int)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b