In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from pathlib import Path
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor



In [2]:

# ============================================================
# 0. Í≤ΩÎ°ú
# ============================================================
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR.parents[1] / "data" / "processed"

PAIR_PATH = DATA_DIR / "pair_train.csv"
FE_PATH   = DATA_DIR / "features_basic.csv"
MONTHLY_PATH = DATA_DIR / "train_monthly.csv"

print("Loading...")
pair_train = pd.read_csv(PAIR_PATH)
fe = pd.read_csv(FE_PATH)
monthly = pd.read_csv(MONTHLY_PATH)

print(pair_train.shape, fe.shape)


Loading...
(343354, 8) (300, 11)


In [3]:
# ============================================================
# 1. Feature merge (pair_train + FE)
# ============================================================

# follower item feature merge
pair_train = pair_train.merge(
    fe.add_prefix("f_"), 
    left_on="following_item_id", 
    right_on="f_item_id",
    how="left"
)

# leader item feature merge
pair_train = pair_train.merge(
    fe.add_prefix("l_"), 
    left_on="leading_item_id", 
    right_on="l_item_id",
    how="left"
)

print("Merged pair_train:", pair_train.shape)


Merged pair_train: (3090186, 30)


In [4]:
# ============================================================
# 2. Train/Valid split
# ============================================================

target = pair_train["target"]
drop_cols = [
    "target", 
    "leading_item_id", "following_item_id",
    "f_item_id", "l_item_id"
]

X = pair_train.drop(columns=drop_cols, errors="ignore")

X_train, X_valid, y_train, y_valid = train_test_split(
    X, target, test_size=0.2, random_state=42
)




In [7]:

# ============================================================
# 3. LightGBM Î™®Îç∏
# ============================================================

model = LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.03,
    num_leaves=64,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,

    # ÏïàÏ†ïÏÑ± Í∞ïÌôî
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_child_samples=30,

    random_state=42,
    verbose=-1,
)

print("Training LGBM...")
model.set_params(verbose=-1)  # ÎòêÎäî 50, 200 Îì±ÏúºÎ°ú Î∞îÍøîÎèÑ Îê®

model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="l2"
)



Training LGBM...


0,1,2
,boosting_type,'gbdt'
,num_leaves,64
,max_depth,-1
,learning_rate,0.03
,n_estimators,2000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [None]:



# ============================================================
# 4. 2025-08 ÏòàÏ∏°ÏùÑ ÏúÑÌïú feature ÏÉùÏÑ±
# ============================================================

# pivot (itemÎ≥Ñ ÏãúÍ≥ÑÏó¥)
monthly["ym"] = pd.to_datetime(monthly["year"].astype(str) + "-" + monthly["month"].astype(str))
pivot_item = monthly.pivot(index="item_id", columns="ym", values="total_value").fillna(0)

months = pivot_item.columns.to_list()
t_last = len(months) - 1
t_prev = t_last - 1


# baselineÏóêÏÑú Ï∂îÏ∂úÌïú pairs ÏÇ¨Ïö©
# pair_trainÍ≥º ÎèôÏùºÌïú leading/following pair ÌïÑÏöî
pairs = pair_train[["leading_item_id","following_item_id","best_lag","max_corr"]].drop_duplicates()


rows = []
for row in tqdm(pairs.itertuples(), total=len(pairs)):
    L = row.leading_item_id
    F = row.following_item_id
    lag = int(row.best_lag)
    corr = float(row.max_corr)

    xs = pivot_item.loc[L].values
    ys = pivot_item.loc[F].values
    
    if t_last - lag < 0:
        continue
    
    feat = {
        "b_t": ys[t_last],
        "b_t_1": ys[t_prev],
        "a_t_lag": xs[t_last - lag],
        "max_corr": corr,
        "best_lag": lag
    }
    
    # follower item FE
    frow = fe[fe["item_id"] == F]
    for c in frow.columns:
        if c == "item_id": continue
        feat["f_" + c] = frow[c].values[0]
    
    # leader item FE
    lrow = fe[fe["item_id"] == L]
    for c in lrow.columns:
        if c == "item_id": continue
        feat["l_" + c] = lrow[c].values[0]
    
    rows.append({
        "leading_item_id": L,
        "following_item_id": F,
        **feat
    })

test_df = pd.DataFrame(rows)
print("Test features:", test_df.shape)



In [None]:

# ============================================================
# 5. ÏòàÏ∏°
# ============================================================

test_X = test_df.drop(columns=["leading_item_id","following_item_id"])
preds = model.predict(test_X)

preds = np.maximum(preds, 0).round().astype(int)

submission = test_df[["leading_item_id","following_item_id"]].copy()
submission["value"] = preds

print(submission.head())



In [None]:

# ============================================================
# 6. Ï†ÄÏû•
# ============================================================
OUT = BASE_DIR / "v1_submit_lgbm.csv"
submission.to_csv(OUT, index=False)

print(f"üî• Ï†úÏ∂ú ÌååÏùº Ï†ÄÏû• ÏôÑÎ£å: {OUT}")
