In [1]:
# =============================================================
# 0. 환경 설정
# =============================================================
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt



In [2]:
# =============================================================
# 1. Load dataset
# =============================================================
FE_PATH = "../../notebooks/preprocessing/fe_v1.csv"
DIR_PATH = "../../notebooks/preprocessing/direction_df_autoN.csv"   # ★ 중요 ★

fe_df = pd.read_csv(FE_PATH)
direction_df = pd.read_csv(DIR_PATH)

print("FE loaded:", fe_df.shape)
print("Direction loaded:", direction_df.shape)
print("Direction cols:", direction_df.columns.tolist())


# =============================================================
# 2. Split Train / Valid
# =============================================================
train_df = fe_df[fe_df["t"] <= 36].copy()
valid_df = fe_df[fe_df["t"] > 36].copy()

drop_cols = ["target", "leader", "follower"]

X_train = train_df.drop(columns=drop_cols)
y_train = train_df["target"]

X_valid = valid_df.drop(columns=drop_cols)
y_valid = valid_df["target"]

print("Train:", X_train.shape, "Valid:", X_valid.shape)


# =============================================================
# 3. LightGBM
# =============================================================
model = lgb.LGBMRegressor(
    objective="regression",
    metric="rmse",
    boosting_type="gbdt",
    n_estimators=5000,
    learning_rate=0.015,
    num_leaves=127,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=1.0,
    reg_lambda=1.0,
    min_child_samples=20,
    random_state=42,
    n_jobs=-1,
)

model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="rmse",
    callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=True)]
)


# =============================================================
# 4. Validation RMSE
# =============================================================
pred_valid = model.predict(X_valid, num_iteration=model.best_iteration_)
rmse = np.sqrt(mean_squared_error(y_valid, pred_valid))
print("VALID RMSE:", rmse)


# =============================================================
# 5. Inference (t == 42)
# =============================================================
test_df = fe_df[fe_df["t"] == 42].copy()
test_features = test_df.drop(columns=drop_cols)

test_pred = model.predict(test_features, num_iteration=model.best_iteration_)
print("test_pred:", test_pred.shape)


# =============================================================
# 6. FE 순서 기반 direction_df 정렬
# =============================================================
order_df = test_df[["leader", "follower"]].reset_index(drop=True)

direction_sorted = order_df.merge(
    direction_df,
    on=["leader", "follower"],
    how="left"
)

assert len(direction_sorted) == len(test_pred), \
    f"Mismatch: sorted={len(direction_sorted)}, pred={len(test_pred)}"

direction_sorted["value"] = (
    pd.Series(test_pred).clip(lower=0).round().astype(int)
)


# =============================================================
# 7. sample_submission 생성
# =============================================================
sample = pd.read_csv("../../data/raw/sample_submission.csv")

if "value" in sample.columns:
    sample = sample.drop(columns=["value"])

sub = sample.merge(
    direction_sorted[["leader", "follower", "value"]],
    left_on=["leading_item_id", "following_item_id"],
    right_on=["leader", "follower"],
    how="inner"
)

print("FINAL SUB SHAPE:", sub.shape)


# =============================================================
# 8. 최종 저장
# =============================================================
sub = sub[["leading_item_id", "following_item_id", "value"]]
sub.to_csv("submission_gbdt_final.csv", index=False)
print("Saved: submission_gbdt_final.csv")


FE loaded: (8568, 17)
Direction loaded: (204, 26)
Direction cols: ['item_i', 'item_j', 'cos_val', 'cos_wgt', 'sign_agree_val', 'cc_val', 'lag_val', 'dtw_dist', 'dtw_sim', 'hs_dist', 'hs4_i', 'hs4_j', 'hs3_i', 'hs3_j', 'hs2_i', 'hs2_j', 'cluster_i', 'cluster_j', 'same_cluster', 'dtw_norm', 'hs_dist_norm', 'cluster_sim', 'lag_dir', 'score_v1', 'leader', 'follower']
Train: (7344, 14) Valid: (1224, 14)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000339 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2157
[LightGBM] [Info] Number of data points in the train set: 7344, number of used features: 14
[LightGBM] [Info] Start training from score 5177038.987745
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[110]	valid_0's rmse: 3.41295e+06
VALID RMSE: 3412946.971898972
test_pred: (204,)
FINAL SUB SHAPE: (204, 5)
Saved: submission_gbdt_final.csv


In [3]:
sub['value'].describe()


count    2.040000e+02
mean     4.491353e+06
std      8.104013e+06
min      1.011119e+06
25%      1.053516e+06
50%      1.315238e+06
75%      4.337183e+06
max      4.548604e+07
Name: value, dtype: float64