In [1]:
# ============================================================
# raw_ensemble_baseline_v1_weighted_gpu.py
#  - Raw baseline + LGBM/XGB/CAT ensemble (GPU for XGB/Cat)
#  - Ensemble weights (무역 특화 A안):
#       LGBM=0.60, XGB=0.25, CatBoost=0.15
# ============================================================

import pandas as pd
import numpy as np
from tqdm import tqdm
from pathlib import Path

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

from lightgbm import LGBMRegressor

# Optional Models
try:
    from xgboost import XGBRegressor
    HAS_XGB = True
except:
    HAS_XGB = False
    print("⚠ XGBRegressor not installed")

try:
    from catboost import CatBoostRegressor
    HAS_CAT = True
except:
    HAS_CAT = False
    print("⚠ CatBoost not installed")

# ============================================================
# 0. PATH & LOAD
# ============================================================
BASE_DIR = Path.cwd().resolve()
DATA_DIR = BASE_DIR.parents[1] / "data"
TRAIN_PATH = DATA_DIR / "raw" / "train.csv"

print("TRAIN_PATH:", TRAIN_PATH)
train = pd.read_csv(TRAIN_PATH)


TRAIN_PATH: /data/ephemeral/home/data/raw/train.csv


In [7]:

# ============================================================
# 1. Pivot (raw)
# ============================================================
monthly = (
    train.groupby(["item_id","year","month"], as_index=False)["value"]
         .sum()
)
monthly["ym"] = pd.to_datetime(
    monthly["year"].astype(str) + "-" + monthly["month"].astype(str).str.zfill(2)
)
pivot = (
    monthly.pivot(index="item_id", columns="ym", values="value")
           .fillna(0.0)
)
print("pivot shape:", pivot.shape)

# ============================================================
# 2. Comovement Search
# ============================================================
def safe_corr(x,y):
    if np.std(x)==0 or np.std(y)==0:
        return 0.0
    return float(np.corrcoef(x,y)[0,1])

def find_comovement_pairs(pivot_df, max_lag=6, min_nonzero=12, corr_threshold=0.4):
    items = pivot_df.index.to_list()
    months = pivot_df.columns.to_list()
    n = len(months)
    res=[]

    for leader in tqdm(items, desc="Searching pairs"):
        x = pivot_df.loc[leader].values.astype(float)
        if np.count_nonzero(x) < min_nonzero:
            continue

        for follower in items:
            if follower == leader:
                continue
            y = pivot_df.loc[follower].values.astype(float)
            if np.count_nonzero(y) < min_nonzero:
                continue

            best_corr = 0.0
            best_lag  = None
            for lag in range(1, max_lag+1):
                if n <= lag:
                    continue
                c = safe_corr(x[:-lag], y[lag:])
                if abs(c) > abs(best_corr):
                    best_corr = c
                    best_lag  = lag

            if best_lag is not None and abs(best_corr) >= corr_threshold:
                res.append({
                    "leading_item_id": leader,
                    "following_item_id": follower,
                    "best_lag": best_lag,
                    "max_corr": best_corr
                })

    return pd.DataFrame(res)

pairs = find_comovement_pairs(pivot)
print("pairs:", len(pairs))

# ============================================================
# 3. Build Training Data
# ============================================================
def build_training_data(pivot_df, pairs_df):
    months = pivot_df.columns.to_list()
    n = len(months)
    rows=[]

    for row in pairs_df.itertuples(index=False):
        A,B = row.leading_item_id, row.following_item_id
        lag  = int(row.best_lag)
        corr = float(row.max_corr)

        if A not in pivot_df.index or B not in pivot_df.index:
            continue

        sA = pivot_df.loc[A].values.astype(float)
        sB = pivot_df.loc[B].values.astype(float)

        for t in range(max(lag,1), n-1):
            rows.append({
                "b_t"     : sB[t],
                "b_t_1"   : sB[t-1],
                "a_t_lag" : sA[t-lag],
                "max_corr": corr,
                "best_lag": lag,
                "target"  : sB[t+1],
            })

    return pd.DataFrame(rows)

df_train = build_training_data(pivot, pairs)
print("train:", df_train.shape)

# ============================================================
# 4. Prepare X/y + Stratified bins
# ============================================================
FEATS = ["b_t","b_t_1","a_t_lag","max_corr","best_lag"]

X_all = df_train[FEATS].values
y_all = df_train["target"].values.astype(float)

y_log = np.log1p(np.clip(y_all,0,None))
ranked = pd.Series(y_log).rank(method="first")
bins = pd.qcut(ranked, q=5, labels=False, duplicates="drop").fillna(0).astype(int)

print("Bins:\n", bins.value_counts().sort_index())

# ============================================================
# 5. Model configurations (GPU for XGB/Cat)
# ============================================================
MODEL_CONFIGS=[]

def add_lgb(name, seed, params):
    MODEL_CONFIGS.append({"name":name,"type":"lgbm","seed":seed,"params":params})

def add_xgb(name, seed, params):
    if HAS_XGB:
        MODEL_CONFIGS.append({"name":name,"type":"xgb","seed":seed,"params":params})

def add_cat(name, seed, params):
    if HAS_CAT:
        MODEL_CONFIGS.append({"name":name,"type":"cat","seed":seed,"params":params})

# LightGBM (CPU)
lgb_base = dict(
    objective="regression",
    n_estimators=2000,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_samples=20,
    reg_alpha=0.1,
    reg_lambda=0.5,
    n_jobs=-1,
    verbosity=-1,
)

add_lgb("lgb_seed7",    7,    dict(lgb_base, num_leaves=63, max_depth=-1))
add_lgb("lgb_seed42",   42,   dict(lgb_base, num_leaves=31, max_depth=6))
add_lgb("lgb_seed2025", 2025, dict(lgb_base, num_leaves=31, max_depth=4))

# XGBoost (GPU)
xgb_base = dict(
    objective="reg:squarederror",
    n_estimators=2000,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    reg_alpha=0.0,
    tree_method="hist",   # ← GPU와 호환되는 유일한 tree_method
    device="cuda:0",       # ← GPU 사용 선언
    n_jobs=-1,
    verbosity=0,
)

add_xgb("xgb_seed7", 7, xgb_base)
add_xgb("xgb_seed42", 42, dict(xgb_base, max_depth=3))

# CatBoost (GPU)
cat_base = dict(
    loss_function="RMSE",
    depth=6,
    learning_rate=0.05,
    iterations=2000,
    task_type="GPU",   # GPU
    devices="0",       # GPU 0
    verbose=False,
)
add_cat("cat_seed7", 7, cat_base)

print("Total configs:", len(MODEL_CONFIGS))


pivot shape: (100, 43)


Searching pairs: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 22.92it/s]


pairs: 1425
train: (54743, 6)
Bins:
 0    10949
1    10948
2    10949
3    10948
4    10949
Name: count, dtype: int64
Total configs: 6


In [None]:

# ============================================================
# 6. K-Fold Training (Fold RMSE만 출력)
# ============================================================
lgb_models=[]
xgb_models=[]
cat_models=[]

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for cfg in MODEL_CONFIGS:
    name  = cfg["name"]
    mtype = cfg["type"]
    seed  = cfg["seed"]
    params= cfg["params"]

    print(f"\n▶ Training {name} ({mtype})")

    for fold,(tr_idx,va_idx) in enumerate(kf.split(X_all,bins)):
        X_tr, X_va = X_all[tr_idx], X_all[va_idx]
        y_tr, y_va = y_all[tr_idx], y_all[va_idx]

        if mtype=="lgbm":
            model = LGBMRegressor(**params, random_state=seed+fold)
        elif mtype=="xgb":
            model = XGBRegressor(**params, random_state=seed+fold)
        elif mtype=="cat":
            model = CatBoostRegressor(**params, random_seed=seed+fold)
        else:
            continue

        model.fit(X_tr, y_tr)

        pred = model.predict(X_va)
        rmse = np.sqrt(mean_squared_error(y_va, pred))
        print(f"  Fold {fold} RMSE: {rmse:.4f}")

        if mtype=="lgbm":
            lgb_models.append(model)
        elif mtype=="xgb":
            xgb_models.append(model)
        elif mtype=="cat":
            cat_models.append(model)

print(f"\nLGBM: {len(lgb_models)}, XGB: {len(xgb_models)}, CAT: {len(cat_models)}")

# ============================================================
# 7. Weighted Ensemble Prediction
# ============================================================
def predict_weighted(pivot_df, pairs_df,
                     lgb_models, xgb_models, cat_models):

    months = pivot_df.columns.to_list()
    n = len(months)
    t_last = n-1
    t_prev = n-2

    rows=[]
    for row in tqdm(pairs_df.itertuples(index=False), total=len(pairs_df), desc="Predicting"):
        A = row.leading_item_id
        B = row.following_item_id
        lag = int(row.best_lag)
        corr = float(row.max_corr)

        if A not in pivot_df.index or B not in pivot_df.index:
            continue

        sA = pivot_df.loc[A].values.astype(float)
        sB = pivot_df.loc[B].values.astype(float)

        if t_last-lag < 0:
            continue

        x = np.array([[sB[t_last], sB[t_prev], sA[t_last-lag], corr, lag]])

        preds_lgb = [m.predict(x)[0] for m in lgb_models]
        preds_xgb = [m.predict(x)[0] for m in xgb_models]
        preds_cat = [m.predict(x)[0] for m in cat_models]

        wL, wX, wC = 0.60, 0.25, 0.15

        pred = (
            (np.mean(preds_lgb) if preds_lgb else 0)*wL +
            (np.mean(preds_xgb) if preds_xgb else 0)*wX +
            (np.mean(preds_cat) if preds_cat else 0)*wC
        )
        pred = max(0.0, pred)
        pred = int(round(pred))

        rows.append({
            "leading_item_id":A,
            "following_item_id":B,
            "value":pred
        })

    return pd.DataFrame(rows)

# ============================================================
# 8. Save submission
# ============================================================
submission = predict_weighted(pivot, pairs,
                              lgb_models, xgb_models, cat_models)

save_path = "raw_weighted_ensemble_gpu_submit.csv"
submission.to_csv(save_path, index=False)
print("\nSaved:", save_path)
print(submission.head())



▶ Training lgb_seed7 (lgbm)




  Fold 0 RMSE: 1668956.7072




  Fold 1 RMSE: 1949047.9942




  Fold 2 RMSE: 1988456.6189




  Fold 3 RMSE: 2167954.3708




  Fold 4 RMSE: 2078962.4028

▶ Training lgb_seed42 (lgbm)




  Fold 0 RMSE: 1939167.5263




  Fold 1 RMSE: 2146670.3895




  Fold 2 RMSE: 2087979.8643




  Fold 3 RMSE: 2276150.4611




  Fold 4 RMSE: 2134862.7286

▶ Training lgb_seed2025 (lgbm)




  Fold 0 RMSE: 2196165.3205




  Fold 1 RMSE: 2412812.8819




  Fold 2 RMSE: 2339163.3660




  Fold 3 RMSE: 2541117.0705




  Fold 4 RMSE: 2315243.4592

▶ Training xgb_seed7 (xgb)
  Fold 0 RMSE: 2465197.2739
  Fold 1 RMSE: 2494672.2400
  Fold 2 RMSE: 2484133.7888
  Fold 3 RMSE: 2972885.7334
  Fold 4 RMSE: 2691279.9543

▶ Training xgb_seed42 (xgb)
  Fold 0 RMSE: 2743068.5052
  Fold 1 RMSE: 2760386.6827
  Fold 2 RMSE: 2702938.3551
  Fold 3 RMSE: 3083424.5063
  Fold 4 RMSE: 2878237.2606

▶ Training cat_seed7 (cat)
  Fold 0 RMSE: 3377472.5938
  Fold 1 RMSE: 3784340.9829
  Fold 2 RMSE: 3527391.3386
  Fold 3 RMSE: 3575451.7266
  Fold 4 RMSE: 3544055.4303

LGBM: 15, XGB: 10, CAT: 5


Predicting:  28%|█████████████████████████▋                                                                   | 394/1425 [00:57<02:27,  6.99it/s]