In [2]:
# ============================================================
# modeling_v10_lgbm_seed.py
# - Pair-based LightGBM regression (single LGBM + seed ensemble)
# - Does not use sample_submission → submits only the pairs we found
# ============================================================

import pandas as pd
import numpy as np
from pathlib import Path
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error


# ============================================================
# 0. PATH setup
# ============================================================
# Everything is resolved relative to the current working directory; the data
# folder is looked up two directory levels above it.
BASE_DIR = Path.cwd().resolve()
DATA_DIR = BASE_DIR.parents[1].joinpath("data")

# Inputs: the monthly training table and the leader/follower pair list.
TRAIN_MONTH_PATH = DATA_DIR.joinpath("processed", "train_month.csv")
PAIRS_PATH = DATA_DIR.joinpath("processed", "v10_pairs", "pairs_v10_best.csv")

# Output folder for the submission file; created at import time if missing.
OUTPUT_DIR = DATA_DIR.joinpath("processed", "v10_model_output")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


# ============================================================
# 1. Build pivot from train_month
# ============================================================
def load_monthly_data():
    """Read the monthly training table and pivot it to a month x item matrix.

    Returns:
        tuple: ``(df, pivot)``. ``df`` is the CSV at ``TRAIN_MONTH_PATH`` with
        an added ``ym`` datetime column (first day of each year/month).
        ``pivot`` has one row per ``ym`` in ascending order, one column per
        ``item_id``, the summed ``value`` in each cell, and 0.0 where an
        item has no record for a month.
    """
    monthly = pd.read_csv(TRAIN_MONTH_PATH)

    # Build "YYYY-M-01" strings and let pandas parse them into timestamps.
    year_txt = monthly["year"].astype(str)
    month_txt = monthly["month"].astype(str)
    monthly["ym"] = pd.to_datetime(year_txt + "-" + month_txt + "-01")

    wide = monthly.pivot_table(
        index="ym", columns="item_id", values="value", aggfunc="sum"
    )
    wide = wide.sort_index().fillna(0.0)
    return monthly, wide


# ============================================================
# 2. Build time-series features for a single pair
# ============================================================
def build_pair_frame(pivot, leader, follower, lag, corr):
    """Build the per-month feature/target table for one leader/follower pair.

    Args:
        pivot: Month-indexed frame with one column per item.
        leader: Column name of the leading item.
        follower: Column name of the following item (the one being predicted).
        lag: Number of rows the leader series is shifted by.
        corr: Correlation score of the pair, stored as a constant column.

    Returns:
        DataFrame with one row per month that has no missing value in any
        column. Features are the follower's current and two previous values,
        the leader's values at ``lag`` and ``lag + 1`` rows back, and the
        follower's one-step difference and relative change. The target is the
        follower's value in the following row (raw and ``log1p``). The
        original positional index is kept (it is not reset).
    """
    lead_series = pivot[leader]
    follow_series = pivot[follower]

    frame = pd.DataFrame({"date": pivot.index})
    frame["b_t"] = follow_series.values
    for step in (1, 2):
        frame[f"b_t_{step}"] = follow_series.shift(step).values
    frame["a_t_lag"] = lead_series.shift(lag).values
    frame["a_t_lag_1"] = lead_series.shift(lag + 1).values

    frame["b_diff1"] = frame["b_t"] - frame["b_t_1"]
    # NOTE(review): a previous value of exactly 0 becomes NaN here, so the
    # whole row is removed by the dropna() below — confirm this is intended,
    # since the 1e-6 epsilon then never guards against a zero denominator.
    prev_nonzero = frame["b_t_1"].replace(0, np.nan)
    frame["b_pct1"] = frame["b_diff1"] / (prev_nonzero + 1e-6)

    # Next row's follower value is the regression target.
    next_value = follower_next = follow_series.shift(-1).values
    frame["target_value"] = next_value
    frame["target_log"] = np.log1p(frame["target_value"].clip(lower=0))
    frame["target_date"] = frame["date"] + pd.offsets.MonthBegin(1)

    constants = {
        "leading_item_id": leader,
        "following_item_id": follower,
        "lag_val": lag,
        "corr": corr,
    }
    for column, value in constants.items():
        frame[column] = value

    # Infinite values are treated like missing ones and removed with them.
    return frame.replace([np.inf, -np.inf], np.nan).dropna()


def build_training_data(pivot, pairs_df):
    """Stack the per-pair feature tables of every pair into one training set.

    Args:
        pivot: Month-indexed frame with one column per item.
        pairs_df: One row per pair with the columns ``leading_item_id``,
            ``following_item_id``, ``best_lag`` and ``max_corr``.

    Returns:
        DataFrame concatenating every pair table that has more than 12 rows,
        with a fresh 0..n-1 index.

    Raises:
        ValueError: If no pair yields more than 12 usable rows.
    """
    candidates = (
        build_pair_frame(
            pivot,
            pair["leading_item_id"],
            pair["following_item_id"],
            pair["best_lag"],
            pair["max_corr"],
        )
        for _, pair in pairs_df.iterrows()
    )
    # Pairs with 12 rows or fewer are left out of training.
    kept = [frame for frame in candidates if len(frame) > 12]

    if len(kept) == 0:
        raise ValueError("훈련 데이터 없음.")

    return pd.concat(kept, ignore_index=True)


# ============================================================
# 3. Single LGBM model + seed ensemble
# ============================================================
def train_single_lgbm(X_train, y_train, X_valid, y_valid, seed):
    """Fit one LightGBM regressor and score it on the validation split.

    Args:
        X_train, y_train: Training features and target.
        X_valid, y_valid: Validation features and target; also passed to
            ``fit`` as ``eval_set``. No early-stopping callback is supplied,
            so the model is fit with the configured number of estimators.
        seed: Value used as the model's ``random_state``.

    Returns:
        tuple: ``(model, rmse)`` — the fitted regressor and its root mean
        squared error on the validation split.
    """
    hyperparams = {
        "objective": "regression",
        "n_estimators": 1400,
        "learning_rate": 0.045,
        "num_leaves": 80,
        "max_depth": -1,
        "min_child_samples": 30,
        "subsample": 0.8,
        "subsample_freq": 2,
        "colsample_bytree": 0.85,
        "reg_lambda": 3.0,
        "reg_alpha": 1.0,
        "random_state": seed,
        "verbosity": -1,
    }
    regressor = LGBMRegressor(**hyperparams)
    regressor.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric="l2",
    )

    valid_pred = regressor.predict(X_valid)
    valid_mse = mean_squared_error(y_valid, valid_pred)
    return regressor, np.sqrt(valid_mse)


def train_lgbm_seed_ensemble(
    train_df,
    seeds=(42, 43, 44),
    *,
    train_cutoff="2024-12-01",
    valid_start="2025-01-01",
    valid_end="2025-05-01",
):
    """Train one LightGBM model per seed on a time-based train/valid split.

    Args:
        train_df: Training table containing ``target_date``, ``target_log``
            and the feature columns.
        seeds: Iterable of random seeds; one model is trained per seed.
            The default is an immutable tuple so it cannot be mutated
            between calls.
        train_cutoff: Rows with ``target_date`` on or before this date form
            the training split.
        valid_start: First ``target_date`` (inclusive) of the validation split.
        valid_end: Last ``target_date`` (inclusive) of the validation split.
            Rows with a later ``target_date`` are used by neither split.

    Returns:
        tuple: ``(models, feature_cols)`` — the fitted models in seed order
        and the list of column names used as features.

    Raises:
        ValueError: If ``seeds`` is empty, or if the training or validation
            split contains no rows.
    """
    seeds = list(seeds)
    if not seeds:
        raise ValueError("seeds must contain at least one value.")

    # Identifiers, dates and targets must never be fed to the model.
    non_features = {
        "target_value", "target_log",
        "date", "target_date",
        "leading_item_id", "following_item_id",
    }
    feature_cols = [c for c in train_df.columns if c not in non_features]

    train_cutoff = pd.Timestamp(train_cutoff)
    valid_start = pd.Timestamp(valid_start)
    valid_end = pd.Timestamp(valid_end)

    target_dates = train_df["target_date"]
    train_mask = target_dates <= train_cutoff
    valid_mask = (target_dates >= valid_start) & (target_dates <= valid_end)

    # Fail early with a clear message instead of deep inside LightGBM.
    if not train_mask.any():
        raise ValueError(
            f"No training rows with target_date <= {train_cutoff.date()}."
        )
    if not valid_mask.any():
        raise ValueError(
            "No validation rows with target_date in "
            f"[{valid_start.date()}, {valid_end.date()}]."
        )

    X_train = train_df.loc[train_mask, feature_cols]
    y_train = train_df.loc[train_mask, "target_log"]

    X_valid = train_df.loc[valid_mask, feature_cols]
    y_valid = train_df.loc[valid_mask, "target_log"]

    models, rmses = [], []

    for seed in seeds:
        model, rmse = train_single_lgbm(X_train, y_train, X_valid, y_valid, seed)
        print(f"[Seed {seed}] RMSE = {rmse:.5f}")
        models.append(model)
        rmses.append(rmse)

    print("Avg RMSE:", np.mean(rmses))
    return models, feature_cols


def predict_ensemble(models, X):
    """Return the element-wise mean of every model's prediction on ``X``.

    Args:
        models: Iterable of objects exposing ``predict(X)``.
        X: Feature matrix forwarded unchanged to each model.

    Returns:
        Array with the predictions averaged across models.
    """
    stacked = np.asarray([model.predict(X) for model in models])
    return stacked.mean(axis=0)


# ============================================================
# 4. Inference: use the last row as features
# ============================================================
def build_inference_features(pivot, pairs_df):
    """Build one feature row per pair from the most recent observed month.

    ``build_pair_frame`` drops every row with a missing value, and the most
    recent month always has a missing target (there is no following month
    yet). Taking the last surviving row therefore used to return features
    from one month *before* the latest one, so the model re-predicted an
    already observed month. To keep the latest month, the pivot is padded
    with one placeholder month before the pair frames are built; only the
    placeholder row is then dropped for lacking a target.

    Args:
        pivot: Month-indexed (datetime) frame with one column per item.
        pairs_df: One row per pair with the columns ``leading_item_id``,
            ``following_item_id``, ``best_lag`` and ``max_corr``.

    Returns:
        DataFrame with one row per pair that produced a non-empty frame, or
        an empty DataFrame when no pair did. In each row ``target_date`` is
        the month being forecast, while ``target_value`` / ``target_log``
        are placeholders (derived from the padding row) and must not be
        interpreted as real targets.
    """
    if len(pivot.index) > 0:
        # Placeholder month: gives the last real month a non-missing target
        # so it is not discarded. Its zero values never appear as features
        # because features only look at the current row and earlier ones.
        next_month = pivot.index[-1] + pd.offsets.MonthBegin(1)
        padding = pd.DataFrame(0.0, index=[next_month], columns=pivot.columns)
        padded = pd.concat([pivot, padding])
    else:
        padded = pivot

    rows = []
    for _, row in pairs_df.iterrows():
        frame = build_pair_frame(
            padded,
            row["leading_item_id"],
            row["following_item_id"],
            row["best_lag"],
            row["max_corr"],
        )
        if frame.empty:
            continue
        # Normally the latest observed month. If that row was removed by the
        # missing-value filter, this falls back to the latest surviving row.
        rows.append(frame.iloc[-1].copy())

    if not rows:
        return pd.DataFrame()

    return pd.DataFrame(rows).reset_index(drop=True)


# ============================================================
# 5. Create submission (only the pairs we found)
# ============================================================
def create_submission(pairs_df, pred_df, models, feature_cols):
    """Predict a value for every pair and return the submission table.

    Args:
        pairs_df: Pair list; only ``leading_item_id`` and
            ``following_item_id`` are read.
        pred_df: One feature row per pair, keyed by the same two id columns.
        models: Fitted models whose predictions are averaged.
        feature_cols: Names of the feature columns passed to the models.

    Returns:
        DataFrame with ``leading_item_id``, ``following_item_id`` and an
        integer ``value`` column, restricted to pairs present in both inputs.

    Raises:
        ValueError: If ``pred_df`` is empty.
    """
    if pred_df.empty:
        raise ValueError("pred_df 비어 있음.")

    key_cols = ["leading_item_id", "following_item_id"]

    pair_keys = pairs_df[key_cols].copy()
    pair_features = pred_df[key_cols + feature_cols]
    sub = pair_keys.merge(pair_features, on=key_cols, how="inner")

    # Models were trained on log1p(target): invert, then clamp at zero.
    log_pred = predict_ensemble(models, sub[feature_cols].fillna(0.0))
    value = np.maximum(0, np.expm1(log_pred))

    sub["value"] = value.round().astype(int)

    print(f"[INFO] submission rows: {len(sub)}")
    return sub[key_cols + ["value"]]


# ============================================================
# 6. MAIN
# ============================================================
def main():
    """Run the pipeline end to end and write the submission CSV.

    Steps: load the monthly pivot, read the pair list, build the training
    table, train the seed ensemble, build inference features, create the
    submission and save it under ``OUTPUT_DIR``.
    """
    print("=== Load monthly & pivot ===")
    pivot = load_monthly_data()[1]

    print("=== Load best pairs ===")
    pairs_df = pd.read_csv(PAIRS_PATH)
    print(f"[INFO] pairs: {len(pairs_df)}")

    print("=== Build training df ===")
    train_df = build_training_data(pivot, pairs_df)
    print(f"[INFO] training rows: {len(train_df)}")

    print("=== Train LGBM seed ensemble ===")
    models, feature_cols = train_lgbm_seed_ensemble(train_df)

    print("=== Build inference features ===")
    pred_df = build_inference_features(pivot, pairs_df)
    print(f"[INFO] pred_df rows: {len(pred_df)}")

    print("=== Create submission ===")
    submission = create_submission(pairs_df, pred_df, models, feature_cols)

    # Make sure the target folder exists before writing.
    OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
    out_path = OUTPUT_DIR / "submission_v10_lgbm_seed.csv"
    submission.to_csv(out_path, index=False)

    print(f"[SAVE] {out_path}")


# Only run the pipeline when executed as a script, not on import.
if __name__ == "__main__":
    main()


=== Load monthly & pivot ===
=== Load best pairs ===
[INFO] pairs: 1513
=== Build training df ===
[INFO] training rows: 52721
=== Train LGBM seed ensemble ===
[Seed 42] RMSE = 2.27621
[Seed 43] RMSE = 2.26147
[Seed 44] RMSE = 2.27165
Avg RMSE: 2.269775444343751
=== Build inference features ===
[INFO] pred_df rows: 1513
=== Create submission ===
[INFO] submission rows: 1513
[SAVE] /data/ephemeral/home/data/processed/v10_model_output/submission_v10_lgbm_seed.csv
