In [1]:
# ============================================================
# modeling_v12_lgbm_seed.py
# - Uses the v10 pairs (pairs_v10_best.csv, with the BayesOpt filter applied)
# - No additional EDA filtering (the best filter is already applied in preprocessing)
# - train_month (plan B) based pivot → per-pair time-series features → LGBM seed ensemble
# - sample_submission is not used: only the pairs we found are predicted
# - At submission time, only pairs with value >= 1000 are kept
# ============================================================

import pandas as pd
import numpy as np
from pathlib import Path
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

# ------------------------------------------------------------
# 0. Path configuration
# ------------------------------------------------------------
# Everything is resolved relative to the current working directory: the
# "data" folder is expected two directory levels above it.
BASE_DIR = Path.cwd().resolve()
DATA_DIR = BASE_DIR.parents[1] / "data"
PROCESSED_DIR = DATA_DIR / "processed"

# Monthly training table (the "plan B" preprocessing result).
TRAIN_MONTH_PATH = PROCESSED_DIR / "train_month.csv"
# Leader/follower pair list produced by the v10 pair search.
PAIRS_PATH = PROCESSED_DIR / "v10_pairs" / "pairs_v10_best.csv"

# Output folder for this model version; created on import if it is missing.
OUTPUT_DIR = PROCESSED_DIR / "v12_model_output"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# 0-1. Required feature list (based on a teammate's top-ranked code)
# ------------------------------------------------------------
# Features derived from the follower series.
_FOLLOWER_FEATURES = [
    "b_t",               # value at the current month
    "b_t_1",             # value one month earlier
    "b_t_2",             # value two months earlier
    "b_roll3",           # mean over the latest 3 months
    "b_diff1",           # 1-month change (b_t - b_t_1)
    "b_diff2",           # 2-month change (b_t - b_t_2)
    "b_pct1",            # relative change versus one month earlier
    "b_pct2",            # relative change versus two months earlier
    "b_std3",            # standard deviation over the latest 3 months
    "b_expanding_mean",  # cumulative mean over all months so far
    "b_zscore3",         # z-score within the 3-month window
]

# Features derived from the (lagged) leader series.
_LEADER_FEATURES = [
    "a_t_lag",
    "a_t_lag_1",
    "a_diff_lag",
    "a_pct_lag",
]

# Seasonality features.
_CALENDAR_FEATURES = [
    "month",
    "quarter",
]

REQUIRED_COLS = [
    *_FOLLOWER_FEATURES,
    *_LEADER_FEATURES,
    *_CALENDAR_FEATURES,
]


# ============================================================
# 1. Build the pivot from train_month
#    (based on train_month.csv produced by preprocessing; includes hs4)
# ============================================================
def load_monthly_data():
    """Read the monthly table and build a month-by-item value matrix.

    Returns
    -------
    tuple
        ``(monthly, wide)``. ``monthly`` is the content of
        ``TRAIN_MONTH_PATH`` with ``year``/``month`` cast to int and an added
        ``ym`` column holding the first day of each month as a timestamp.
        ``wide`` is the pivot of ``value`` summed per (``ym``, ``item_id``),
        sorted by month, with missing combinations filled with 0.0.
    """
    monthly = pd.read_csv(TRAIN_MONTH_PATH)

    for part in ("year", "month"):
        monthly[part] = monthly[part].astype(int)

    # Assemble a "YYYY-M-01" string per row and let pandas parse it.
    ym_text = (
        monthly["year"].astype(str) + "-" + monthly["month"].astype(str) + "-01"
    )
    monthly["ym"] = pd.to_datetime(ym_text)

    wide = pd.pivot_table(
        monthly,
        index="ym",
        columns="item_id",
        values="value",
        aggfunc="sum",
    )
    wide = wide.sort_index()
    # Months in which an item has no record are treated as zero value.
    wide = wide.fillna(0.0)

    return monthly, wide


# ============================================================
# 2. Time-series feature generation for a single pair
# ============================================================
def build_pair_frame(pivot, leader, follower, lag, corr):
    """Build the per-month feature/target table for one (leader, follower) pair.

    Parameters
    ----------
    pivot : pandas.DataFrame
        Month-indexed (datetime) matrix with one column per item.
    leader, follower :
        Column labels of the leading and following item in ``pivot``.
    lag : int
        Number of months by which the leader series is shifted. Integral
        floats (e.g. ``2.0``) are accepted and converted to ``int``.
    corr : float
        Correlation score of the pair; stored as metadata only.

    Returns
    -------
    pandas.DataFrame
        One row per month with follower features (``b_*``), lagged leader
        features (``a_*``), calendar features (``month``, ``quarter``), the
        next-month target (``target_value``, ``target_log``, ``target_date``)
        and pair metadata. Rows containing any NaN/inf are removed, which
        includes the warm-up months, the final month (it has no next-month
        target) and months whose percentage denominator is zero. The original
        positional index is kept (not reset).

    Raises
    ------
    ValueError
        If ``lag`` is not a whole number.
    """
    # Series.shift needs an integer period; a lag read from a float column
    # would otherwise reach it as e.g. 2.0.
    lag_steps = int(lag)
    if lag_steps != lag:
        raise ValueError(f"lag must be a whole number of months, got {lag!r}")
    lag = lag_steps

    a = pivot[leader]
    b = pivot[follower]

    df = pd.DataFrame(
        {
            "date": pivot.index,
            "b_t": b.values,
            "b_t_1": b.shift(1).values,
            "b_t_2": b.shift(2).values,
        }
    )

    # Follower rolling statistics, differences and ratios.
    df["b_roll3"] = df["b_t"].rolling(3).mean()
    df["b_diff1"] = df["b_t"] - df["b_t_1"]
    df["b_diff2"] = df["b_t"] - df["b_t_2"]

    # A zero denominator is turned into NaN first, so the ratio becomes NaN
    # and the row is removed by the final dropna; the epsilon therefore never
    # acts as a zero guard here.
    df["b_pct1"] = df["b_diff1"] / (df["b_t_1"].replace(0, np.nan) + 1e-6)
    df["b_pct2"] = df["b_diff2"] / (df["b_t_2"].replace(0, np.nan) + 1e-6)

    df["b_std3"] = df["b_t"].rolling(3).std()
    df["b_expanding_mean"] = df["b_t"].expanding().mean()
    # Here the epsilon does protect against a zero 3-month standard deviation.
    df["b_zscore3"] = (df["b_t"] - df["b_roll3"]) / (df["b_std3"] + 1e-6)

    # Leader features, shifted by the pair's lag.
    df["a_t_lag"] = a.shift(lag).values
    df["a_t_lag_1"] = a.shift(lag + 1).values
    df["a_diff_lag"] = df["a_t_lag"] - df["a_t_lag_1"]
    df["a_pct_lag"] = df["a_diff_lag"] / (df["a_t_lag_1"].replace(0, np.nan) + 1e-6)

    # Calendar features.
    df["month"] = df["date"].dt.month
    df["quarter"] = df["date"].dt.quarter

    # Target: the follower value of the following month (raw and log1p).
    df["target_value"] = b.shift(-1).values
    df["target_log"] = np.log1p(df["target_value"].clip(lower=0))
    df["target_date"] = df["date"] + pd.offsets.MonthBegin(1)

    # Pair metadata.
    df["leading_item_id"] = leader
    df["following_item_id"] = follower
    df["lag_val"] = lag
    df["corr"] = corr

    # Drop every row that still contains NaN or +/-inf.
    df = df.replace([np.inf, -np.inf], np.nan).dropna()

    return df


def build_training_data(pivot, pairs_df, min_rows=12):
    """Stack the per-pair feature tables of all pairs into one training frame.

    Parameters
    ----------
    pivot : pandas.DataFrame
        Month-by-item value matrix passed through to ``build_pair_frame``.
    pairs_df : pandas.DataFrame
        Must contain ``leading_item_id``, ``following_item_id``, ``best_lag``
        and ``max_corr``.
    min_rows : int, default 12
        A pair is kept only if its feature table has MORE than this many rows.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the kept pair tables with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no pair yields a long enough table.
    """
    # Iterate column-wise instead of with iterrows(): iterrows builds one
    # Series per row (slow) and does not preserve per-column dtypes, so an
    # integer lag could arrive as a float.
    pair_specs = zip(
        pairs_df["leading_item_id"],
        pairs_df["following_item_id"],
        pairs_df["best_lag"],
        pairs_df["max_corr"],
    )

    frames = []
    for leader, follower, lag, corr in pair_specs:
        frame = build_pair_frame(pivot, leader, follower, lag, corr)
        # Skip pairs whose usable history is too short.
        if len(frame) > min_rows:
            frames.append(frame)

    if not frames:
        raise ValueError("훈련 데이터 없음. (frames 비어 있음)")

    return pd.concat(frames, ignore_index=True)


# ============================================================
# 3. LGBM seed ensemble
# ============================================================
def train_single_lgbm(X_train, y_train, X_valid, y_valid, seed):
    """Fit one LightGBM regressor with the given seed and score it.

    Parameters
    ----------
    X_train, y_train :
        Training features and target.
    X_valid, y_valid :
        Validation features and target; passed as ``eval_set`` and used for
        the returned score.
    seed : int
        Value used as ``random_state``.

    Returns
    -------
    tuple
        ``(model, rmse)`` where ``rmse`` is the root mean squared error of the
        fitted model on the validation data.
    """
    hyperparams = dict(
        objective="regression",
        n_estimators=1400,
        learning_rate=0.045,
        num_leaves=80,
        max_depth=-1,
        min_child_samples=30,
        subsample=0.8,
        subsample_freq=2,
        colsample_bytree=0.85,
        reg_lambda=3.0,
        reg_alpha=1.0,
        random_state=seed,
        verbosity=-1,
    )
    regressor = LGBMRegressor(**hyperparams)

    # No early-stopping callback is passed here; the eval_set is only
    # supplied together with the "l2" metric.
    regressor.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric="l2",
    )

    valid_pred = regressor.predict(X_valid)
    valid_rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
    return regressor, valid_rmse


def train_lgbm_seed_ensemble(train_df, seeds=(42, 2024, 777)):
    """Train one LightGBM model per seed on a time-based split.

    The split is made on ``target_date``:

    - train: ``target_date`` <= 2024-12-01
    - valid: 2025-01-01 <= ``target_date`` <= 2025-05-01

    Rows outside both windows are not used. The regression target is
    ``target_log``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Stacked per-pair feature table containing ``target_date``,
        ``target_log`` and the feature columns.
    seeds : iterable of int, default (42, 2024, 777)
        One model is trained per seed.

    Returns
    -------
    tuple
        ``(models, feature_cols)``: the fitted models in seed order and the
        list of feature column names they were trained on.

    Raises
    ------
    ValueError
        If none of ``REQUIRED_COLS`` is usable as a feature, or if the train
        or validation window contains no rows.
    """
    # Columns that must never be used as model inputs (targets, dates, ids).
    exclude = [
        "target_value",
        "target_log",
        "date",
        "target_date",
        "leading_item_id",
        "following_item_id",
        "lag_val",
        "corr",
    ]

    # The feature set follows REQUIRED_COLS, restricted to available columns.
    base_feature_cols = [c for c in REQUIRED_COLS if c in train_df.columns]
    # Report anything that is missing so it is easy to spot in the log.
    missing = [c for c in REQUIRED_COLS if c not in train_df.columns]
    if missing:
        print("[WARN] 다음 feature는 train_df에 없음 (무시됨):", missing)

    feature_cols = [c for c in base_feature_cols if c not in exclude]
    if not feature_cols:
        raise ValueError("no usable feature columns found in train_df")

    train_cutoff = pd.Timestamp("2024-12-01")
    valid_start = pd.Timestamp("2025-01-01")
    valid_end = pd.Timestamp("2025-05-01")

    train_mask = train_df["target_date"] <= train_cutoff
    valid_mask = (train_df["target_date"] >= valid_start) & (
        train_df["target_date"] <= valid_end
    )

    X_train = train_df.loc[train_mask, feature_cols]
    y_train = train_df.loc[train_mask, "target_log"]

    X_valid = train_df.loc[valid_mask, feature_cols]
    y_valid = train_df.loc[valid_mask, "target_log"]

    if len(X_train) == 0 or len(X_valid) == 0:
        raise ValueError("train/valid 분할 후 데이터가 0입니다. (기간 확인 필요)")

    print(f"[INFO] num_features: {len(feature_cols)}")
    print("[INFO] feature_cols:", feature_cols)

    models, rmses = [], []
    for seed in seeds:
        model, rmse = train_single_lgbm(X_train, y_train, X_valid, y_valid, seed)
        print(f"[Seed {seed}] RMSE = {rmse:.5f}")
        models.append(model)
        rmses.append(rmse)

    print("Avg RMSE:", np.mean(rmses))
    return models, feature_cols


def predict_ensemble(models, X):
    """Return the element-wise mean of ``predict(X)`` over all ``models``."""
    per_model = []
    for estimator in models:
        per_model.append(estimator.predict(X))
    # Stack as (n_models, n_samples) and average across the model axis.
    return np.asarray(per_model).mean(axis=0)


# ============================================================
# 4. Inference: use the last row as the features
# ============================================================
def build_inference_features(pivot, pairs_df):
    """Build one feature row per pair from the latest observed month.

    ``build_pair_frame`` removes every row whose next-month target is missing,
    which always includes the latest month in ``pivot``. Taking the last row
    of its output directly would therefore return the features of the month
    BEFORE the latest one, i.e. a row whose target month is already observed.
    To keep the latest month, the pivot is extended by one placeholder month
    (all zeros) before the features are built. All features look backwards
    only, so the placeholder does not influence them.

    Parameters
    ----------
    pivot : pandas.DataFrame
        Month-indexed (datetime) item value matrix.
    pairs_df : pandas.DataFrame
        Must contain ``leading_item_id``, ``following_item_id``, ``best_lag``
        and ``max_corr``.

    Returns
    -------
    pandas.DataFrame
        One row per pair that has at least one usable row, with the same
        columns as ``build_pair_frame``. ``target_value``/``target_log`` are
        NaN where the target month is not observed yet. Empty DataFrame if no
        pair is usable.
    """
    if len(pivot.index) == 0:
        return pd.DataFrame()

    last_date = pivot.index.max()
    next_date = last_date + pd.offsets.MonthBegin(1)
    # Placeholder month: gives the latest real month a non-NaN "target" so it
    # is not dropped; the placeholder row itself has no target and is dropped.
    padded = pivot.reindex(
        pivot.index.append(pd.DatetimeIndex([next_date])),
        fill_value=0.0,
    )

    pair_specs = zip(
        pairs_df["leading_item_id"],
        pairs_df["following_item_id"],
        pairs_df["best_lag"],
        pairs_df["max_corr"],
    )

    rows = []
    for leader, follower, lag, corr in pair_specs:
        frame = build_pair_frame(padded, leader, follower, lag, corr)
        # Never use the placeholder month itself as a feature row.
        frame = frame[frame["date"] <= last_date]
        if frame.empty:
            continue
        # Latest usable row. If the latest month was dropped for another
        # reason (e.g. an undefined percentage feature) this falls back to
        # the most recent earlier row, as the last-row selection did before.
        rows.append(frame.iloc[[-1]])

    if not rows:
        return pd.DataFrame()

    features = pd.concat(rows, ignore_index=True)

    # The zero "target" of the placeholder month is not a real observation.
    unknown_target = features["target_date"] > last_date
    features.loc[unknown_target, ["target_value", "target_log"]] = np.nan

    return features


# ============================================================
# 5. Submission (only the pairs we found, value>=1000 filter)
# ============================================================
def create_submission(pairs_df, pred_df, models, feature_cols):
    """Predict a value for every pair and keep those with value >= 1000.

    Parameters
    ----------
    pairs_df : pandas.DataFrame
        Pair list with ``leading_item_id`` and ``following_item_id``.
    pred_df : pandas.DataFrame
        Inference feature rows; must contain the two id columns and every
        column in ``feature_cols``.
    models : list
        Fitted models, averaged by ``predict_ensemble``.
    feature_cols : list of str
        Feature columns fed to the models.

    Returns
    -------
    pandas.DataFrame
        Columns ``leading_item_id``, ``following_item_id``, ``value``.

    Raises
    ------
    ValueError
        If ``pred_df`` is empty or no pair remains after the merge.
    """
    key_cols = ["leading_item_id", "following_item_id"]

    if pred_df.empty:
        raise ValueError("pred_df 비어 있음. (inference feature 없음)")

    # Inner join: pairs without an inference row are left out.
    merged = pd.merge(
        pairs_df[key_cols],
        pred_df[key_cols + feature_cols],
        on=key_cols,
        how="inner",
    )
    if merged.empty:
        raise ValueError("merge 후 예측 가능한 pair가 없습니다.")

    model_input = merged[feature_cols].fillna(0.0)
    # The models predict on the log1p scale; invert it and floor at zero.
    predicted = np.expm1(predict_ensemble(models, model_input))
    predicted = np.maximum(0, predicted)

    merged["value"] = predicted.round().astype(int)

    # As requested by the team lead: submit only pairs with value >= 1000.
    keep = merged["value"] >= 1000
    merged = merged[keep].reset_index(drop=True)

    print(f"[INFO] submission rows after value>=1000 filter: {len(merged)}")
    return merged[["leading_item_id", "following_item_id", "value"]]


# ============================================================
# 6. MAIN  (★ no additional EDA filtering)
# ============================================================
def main():
    """Run the pipeline end to end and write the submission CSV.

    Steps: load the monthly pivot, read the pair list, build the training
    table, train the seed ensemble, build inference features, create the
    filtered submission and save it under ``OUTPUT_DIR``.
    """
    print("=== [1] Load monthly & pivot ===")
    pivot = load_monthly_data()[1]

    print("=== [2] Load pairs_v10_best (BayesOpt 필터 반영본) ===")
    pairs_df = pd.read_csv(PAIRS_PATH)
    print(f"[INFO] pairs_v10_best rows: {len(pairs_df)}")

    print("=== [3] Build training df ===")
    training_table = build_training_data(pivot, pairs_df)
    print(f"[INFO] training rows: {len(training_table)}")

    print("=== [4] Train LGBM seed ensemble ===")
    ensemble, used_features = train_lgbm_seed_ensemble(training_table)

    print("=== [5] Build inference features (마지막 시점 기준) ===")
    inference_rows = build_inference_features(pivot, pairs_df)
    print(f"[INFO] pred_df rows: {len(inference_rows)}")

    print("=== [6] Create submission (value>=1000 필터 포함) ===")
    result = create_submission(pairs_df, inference_rows, ensemble, used_features)

    target_file = OUTPUT_DIR / "submission_v12_lgbm_seed_from_best_pairs_valge1000.csv"
    result.to_csv(target_file, index=False)
    print(f"[SAVE] {target_file}")


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()


=== [1] Load monthly & pivot ===
=== [2] Load pairs_v10_best (BayesOpt 필터 반영본) ===
[INFO] pairs_v10_best rows: 2234
=== [3] Build training df ===
[INFO] training rows: 72224
=== [4] Train LGBM seed ensemble ===
[INFO] num_features: 17
[INFO] feature_cols: ['b_t', 'b_t_1', 'b_t_2', 'b_roll3', 'b_diff1', 'b_diff2', 'b_pct1', 'b_pct2', 'b_std3', 'b_expanding_mean', 'b_zscore3', 'a_t_lag', 'a_t_lag_1', 'a_diff_lag', 'a_pct_lag', 'month', 'quarter']
[Seed 42] RMSE = 1.66593
[Seed 2024] RMSE = 1.70937
[Seed 777] RMSE = 1.69033
Avg RMSE: 1.6885418729021966
=== [5] Build inference features (마지막 시점 기준) ===
[INFO] pred_df rows: 2232
=== [6] Create submission (value>=1000 필터 포함) ===
[INFO] submission rows after value>=1000 filter: 2153
[SAVE] /data/ephemeral/home/data/processed/v12_model_output/submission_v12_lgbm_seed_from_best_pairs_valge1000.csv
