In [12]:
# ============================================================
# KMU Comovement - Í∞ïÌôî Î≤†Ïù¥Ïä§ÎùºÏù∏
# - rule-based (corr + stability)
# - LightGBM (main) + CatBoost (secondary)
# - log-target + ÏïôÏÉÅÎ∏î
# ============================================================

import math
import numpy as np
import pandas as pd
from tqdm import tqdm

# ------------------------------------------------------------
# 1. Îç∞Ïù¥ÌÑ∞ Î°úÎìú
# ------------------------------------------------------------
train = pd.read_csv("./train.csv")
sub_template = pd.read_csv("./sample_submission.csv")

print("train.shape:", train.shape)
print("submission template shape:", sub_template.shape)

# ------------------------------------------------------------
# 2. ÏõîÎ≥Ñ ÏßëÍ≥Ñ + ÌîºÎ≤ó (item_id √ó ym)
# ------------------------------------------------------------
# (1) item_id √ó year √ó month Ìï©ÏÇ∞
monthly = (
    train
    .groupby(["item_id", "year", "month"], as_index=False)["value"]
    .sum()
)

# (2) year, month ‚Üí ym (datetime Ïõî Îã®ÏúÑ)
monthly["ym"] = pd.to_datetime(
    monthly["year"].astype(str) + "-" + monthly["month"].astype(str).str.zfill(2)
)

# (3) item_id √ó ym ÌîºÎ≤ó
pivot = (
    monthly
    .pivot(index="item_id", columns="ym", values="value")
    .fillna(0.0)
)

print("pivot shape:", pivot.shape)
display(pivot.head())

# ------------------------------------------------------------
# 3. Í≥µÌñâÏÑ± Ïåç ÌÉêÏÉâ - corr + stability rule-based
# ------------------------------------------------------------
def safe_corr(x, y):
    """
    ÌëúÏ§ÄÌé∏Ï∞® 0 Îì± ÏòàÏô∏ ÏºÄÏù¥Ïä§ ÏïàÏ†Ñ Ï≤òÎ¶¨ ÏÉÅÍ¥ÄÍ≥ÑÏàò
    """
    x = np.asarray(x, float)
    y = np.asarray(y, float)
    if x.size == 0 or y.size == 0:
        return 0.0
    if np.std(x) == 0 or np.std(y) == 0:
        return 0.0
    return float(np.corrcoef(x, y)[0, 1])


def stability_score(a_seg, b_seg):
    """
    Í∞ÑÎã® stability ÏßÄÌëú:
    - ÏõîÎ≥Ñ Î≥ÄÌôîÎüâ(Œî)Ïùò Î∞©Ìñ•Ïù¥ ÏñºÎßàÎÇò ÏûêÏ£º ÏùºÏπòÌïòÎäîÏßÄ ÎπÑÏú®
    """
    a_seg = np.asarray(a_seg, float)
    b_seg = np.asarray(b_seg, float)

    if len(a_seg) <= 2:
        return 0.0

    da = np.diff(a_seg)
    db = np.diff(b_seg)

    sign_a = np.sign(da)
    sign_b = np.sign(db)

    mask = (sign_a != 0) | (sign_b != 0)
    if mask.sum() == 0:
        return 0.0

    agree = (sign_a[mask] == sign_b[mask]).mean()
    return float(agree)


def find_comovement_pairs(
    pivot,
    max_lag: int = 6,
    min_nonzero: int = 6,
    corr_threshold: float = 0.3,
    stability_threshold: float = 0.55,
):
    """
    corr + stability Îëò Îã§ ÎßåÏ°±ÌïòÎäî (A,B) ÏåçÎßå Í≥µÌñâÏÑ± ÏåçÏúºÎ°ú Ï±ÑÌÉù
    """
    items = pivot.index.to_list()
    months = pivot.columns.to_list()
    n_months = len(months)

    results = []

    for leader in tqdm(items, desc="find_comovement_pairs - leader"):
        x = pivot.loc[leader].values.astype(float)
        if np.count_nonzero(x) < min_nonzero:
            continue

        for follower in items:
            if follower == leader:
                continue

            y = pivot.loc[follower].values.astype(float)
            if np.count_nonzero(y) < min_nonzero:
                continue

            best_corr = 0.0
            best_lag = None
            best_stab = 0.0

            for lag in range(1, max_lag + 1):
                t_start = lag
                t_end = n_months

                a_seg = x[t_start - lag : t_end - lag]
                b_seg = y[t_start:t_end]

                if len(a_seg) < 3:
                    continue

                c = safe_corr(a_seg, b_seg)
                s = stability_score(a_seg, b_seg)

                # corr Ï§ëÏã¨ÏúºÎ°ú ÏÑ†ÌÉù (stabilityÎäî Í∏∞Î°ùÎßå)
                if abs(c) > abs(best_corr):
                    best_corr = c
                    best_lag = lag
                    best_stab = s

            if (
                best_lag is not None
                and abs(best_corr) >= corr_threshold
                and best_stab >= stability_threshold
            ):
                results.append(
                    {
                        "leading_item_id": leader,
                        "following_item_id": follower,
                        "best_lag": int(best_lag),
                        "max_corr": float(best_corr),
                        "stability": float(best_stab),
                    }
                )

    pairs = pd.DataFrame(results)
    return pairs


# Ïã§Ï†ú Ïã§Ìñâ
pairs = find_comovement_pairs(
    pivot,
    max_lag=6,
    min_nonzero=6,
    corr_threshold=0.3,      # ÌäúÎãù Ìè¨Ïù∏Ìä∏ 1
    stability_threshold=0.63 # ÌäúÎãù Ìè¨Ïù∏Ìä∏ 2
)

print("ÌÉêÏÉâÎêú Í≥µÌñâÏÑ± Ïåç Ïàò:", len(pairs))
display(pairs.head())

# ------------------------------------------------------------
# 4. ÌïôÏäµ Îç∞Ïù¥ÌÑ∞ ÏÉùÏÑ± (self-supervised)
# ------------------------------------------------------------
def build_training_data(pivot, pairs):
    """
    Í≥µÌñâÏÑ±Ïåç + ÏãúÍ≥ÑÏó¥ÏóêÏÑú (X, y) ÎßåÎì§Í∏∞
    - target: Îã§ÏùåÎã¨ b_{t+1} Ïùò log1p Í∞í
    """
    months = pivot.columns.to_list()
    n_months = len(months)
    rows = []

    for row in tqdm(pairs.itertuples(index=False), total=len(pairs), desc="build_training_data"):
        leader = row.leading_item_id
        follower = row.following_item_id
        lag = int(row.best_lag)
        corr = float(row.max_corr)
        stab = float(row.stability)

        a_series = pivot.loc[leader].values.astype(float)
        b_series = pivot.loc[follower].values.astype(float)

        # t-1, t, t+1, t-lag Î™®Îëê Ï°¥Ïû¨Ìï¥Ïïº Ìï®
        t_min = max(lag, 1)
        t_max = n_months - 1  # t+1ÍπåÏßÄ Ïù¥Ïö©

        for t in range(t_min, t_max):
            b_t = b_series[t]
            b_t_1 = b_series[t - 1]
            a_t_lag = a_series[t - lag]
            y = b_series[t + 1]

            rows.append(
                {
                    "leading_item_id": leader,
                    "following_item_id": follower,
                    "t_idx": t,
                    "b_t": b_t,
                    "b_t_1": b_t_1,
                    "a_t_lag": a_t_lag,
                    "max_corr": corr,
                    "best_lag": lag,
                    "stability": stab,
                    "log_b_t": math.log1p(b_t),
                    "log_b_t_1": math.log1p(b_t_1),
                    "log_a_t_lag": math.log1p(a_t_lag),
                    "delta_b": b_t - b_t_1,
                    "rel_delta_b": (b_t - b_t_1) / (b_t_1 + 1.0),
                    "target": y,
                    "log_target": math.log1p(y),
                }
            )

    df = pd.DataFrame(rows)
    return df


df_train_model = build_training_data(pivot, pairs)
print("df_train_model shape:", df_train_model.shape)
display(df_train_model.head())

# ------------------------------------------------------------
# 5. LightGBM (main) + CatBoost (secondary) ÌïôÏäµ
# ------------------------------------------------------------
feature_cols = [
    "b_t",
    "b_t_1",
    "a_t_lag",
    "max_corr",
    "best_lag",
    "stability",
    "log_b_t",
    "log_b_t_1",
    "log_a_t_lag",
    "delta_b",
    "rel_delta_b",
]

train_X = df_train_model[feature_cols]
train_y = df_train_model["log_target"].values  # log1p ÌÉÄÍπÉ

models = {}

# 5-1. LightGBM
try:
    import lightgbm as lgb

    print("Training LightGBMRegressor...")
    lgb_model = lgb.LGBMRegressor(
        n_estimators=1200,
        learning_rate=0.03,
        max_depth=-1,
        num_leaves=63,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
    )
    lgb_model.fit(train_X, train_y)
    models["lgb"] = lgb_model
except ImportError:
    print("LightGBMÏù¥ ÏÑ§ÏπòÎêòÏñ¥ ÏûàÏßÄ ÏïäÏäµÎãàÎã§. !pip install lightgbm ÌõÑ Îã§Ïãú ÏãúÎèÑÌïòÏÑ∏Ïöî.")

# 5-2. CatBoost
try:
    from catboost import CatBoostRegressor

    print("Training CatBoostRegressor...")
    cat_model = CatBoostRegressor(
        depth=8,
        learning_rate=0.05,
        n_estimators=1000,
        loss_function="RMSE",
        random_seed=42,
        verbose=False,
    )
    cat_model.fit(train_X, train_y)
    models["cat"] = cat_model
except ImportError:
    print("CatBoostÍ∞Ä ÏÑ§ÏπòÎêòÏñ¥ ÏûàÏßÄ ÏïäÏäµÎãàÎã§. !pip install catboost ÌõÑ Îã§Ïãú ÏãúÎèÑÌïòÏÑ∏Ïöî.")


# 5-3. log-space ÏïôÏÉÅÎ∏î Ìï®Ïàò
def ensemble_predict_log(X):
    """
    Îì±Î°ùÎêú Î™®Îç∏Îì§(LGB, CatBoost Îì±)Ïùò log ÏòàÏ∏°Í∞íÏùÑ ÌèâÍ∑†
    """
    preds = []
    for name, m in models.items():
        p = m.predict(X)
        preds.append(p)

    if len(preds) == 0:
        # Î∞©Ïñ¥ ÏΩîÎìú: Î™®Îç∏Ïù¥ ÌïòÎÇòÎèÑ ÏóÜÏúºÎ©¥ 0
        return np.zeros(X.shape[0])

    preds = np.vstack(preds)  # (n_models, n_samples)
    avg_log = preds.mean(axis=0)
    return avg_log

# ------------------------------------------------------------
# 6. Ï∂îÎ°†Ïö©: (A,B) pairÎ≥Ñ best_lag, corr, stability Í≥ÑÏÇ∞
# ------------------------------------------------------------
def compute_best_lag_corr_stab_for_pair(pivot, leader, follower, max_lag=6):
    """
    inferenceÏö© (A,B) ÏåçÏóê ÎåÄÌï¥ best_lag, max_corr, stability Í≥ÑÏÇ∞
    """
    x = pivot.loc[leader].values.astype(float)
    y = pivot.loc[follower].values.astype(float)
    n_months = x.shape[0]

    best_corr = 0.0
    best_lag = 1
    best_stab = 0.0

    for lag in range(1, max_lag + 1):
        t_start = lag
        t_end = n_months

        a_seg = x[t_start - lag : t_end - lag]
        b_seg = y[t_start:t_end]

        if len(a_seg) < 3:
            continue

        c = safe_corr(a_seg, b_seg)
        s = stability_score(a_seg, b_seg)

        if abs(c) > abs(best_corr):
            best_corr = c
            best_lag = lag
            best_stab = s

    return best_lag, best_corr, best_stab

# ------------------------------------------------------------
# 7. Ï†úÏ∂ú ÌååÏùº Ï†ÑÏ≤¥Ïóê ÎåÄÌï¥ ÏïôÏÉÅÎ∏î ÏòàÏ∏°
# ------------------------------------------------------------
def predict_for_submission_only_pairs(pivot, pairs, max_lag=6):
    """
    Í≥µÌñâÏÑ± ÏûàÎã§Í≥† rule-basedÎ°ú ÏÑ†Ï†ïÎêú pairs Ïóê ÎåÄÌï¥ÏÑúÎßå
    Îã§ÏùåÎã¨ Î¨¥Ïó≠ÎüâÏùÑ ÏòàÏ∏°Ìï¥ÏÑú Ï†úÏ∂úÏö© DataFrame ÏÉùÏÑ±
    """
    months = pivot.columns.to_list()
    n_months = len(months)

    t_last = n_months - 1   # ÎßàÏßÄÎßâ Îã¨ index
    t_prev = n_months - 2   # ÎßàÏßÄÎßâ-1 Îã¨ index

    preds = []

    for row in tqdm(pairs.itertuples(index=False), total=len(pairs), desc="predict_only_pairs"):
        leader = row.leading_item_id
        follower = row.following_item_id
        lag = int(row.best_lag)
        corr = float(row.max_corr)
        stab = float(row.stability)

        # ÌòπÏãúÎùºÎèÑ ÌîºÎ≤óÏóê ÏóÜÎäî ÏïÑÏù¥ÌÖúÏù¥Î©¥ Ïä§ÌÇµ (ÎòêÎäî 0 ÏòàÏ∏°)
        if leader not in pivot.index or follower not in pivot.index:
            continue

        a_series = pivot.loc[leader].values.astype(float)
        b_series = pivot.loc[follower].values.astype(float)

        if t_last - lag < 0:
            # lagÍ∞Ä ÎÑàÎ¨¥ ÌÅ¨Î©¥ Í∑∏ÎÉ• ÎßàÏßÄÎßâ Í∞íÏúºÎ°ú ÎåÄÏ≤¥
            y_pred = float(b_series[t_last])
        else:
            b_t = b_series[t_last]
            b_t_1 = b_series[t_prev]
            a_t_lag = a_series[t_last - lag]

            feat = {
                "b_t": b_t,
                "b_t_1": b_t_1,
                "a_t_lag": a_t_lag,
                "max_corr": corr,
                "best_lag": lag,
                "stability": stab,
                "log_b_t": math.log1p(b_t),
                "log_b_t_1": math.log1p(b_t_1),
                "log_a_t_lag": math.log1p(a_t_lag),
                "delta_b": b_t - b_t_1,
                "rel_delta_b": (b_t - b_t_1) / (b_t_1 + 1.0),
            }

            # üî¥ Ïó¨Í∏∞ DataFrameÏúºÎ°ú ÎßåÎì§Ïñ¥Ï§òÏïº LGBM Í≤ΩÍ≥†ÎèÑ Ïïà Îú∏
            X_test_df = pd.DataFrame([feat])[feature_cols]
            log_pred = float(ensemble_predict_log(X_test_df)[0])
            y_pred = math.expm1(log_pred)

        y_pred = max(0.0, y_pred)
        y_pred = int(round(y_pred))

        preds.append(
            {
                "leading_item_id": leader,
                "following_item_id": follower,
                "value": y_pred,
            }
        )

    df_pred = pd.DataFrame(preds)
    return df_pred


# Ïã§Ï†ú ÏòàÏ∏° & Ï†ÄÏû•
submission = predict_for_submission_only_pairs(pivot, pairs, max_lag=6)
print("Ï†úÏ∂úÏö© Ìñâ Í∞úÏàò:", len(submission))
submission.head()

submission.to_csv("submission_only_comovement_pairs.csv", index=False)



train.shape: (10836, 9)
submission template shape: (9900, 3)
pivot shape: (100, 43)


ym,2022-01-01,2022-02-01,2022-03-01,2022-04-01,2022-05-01,2022-06-01,2022-07-01,2022-08-01,2022-09-01,2022-10-01,...,2024-10-01,2024-11-01,2024-12-01,2025-01-01,2025-02-01,2025-03-01,2025-04-01,2025-05-01,2025-06-01,2025-07-01
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AANGBULD,14276.0,52347.0,53549.0,0.0,26997.0,84489.0,0.0,0.0,0.0,0.0,...,428725.0,144248.0,26507.0,25691.0,25805.0,0.0,38441.0,0.0,441275.0,533478.0
AHMDUILJ,242705.0,120847.0,197317.0,126142.0,71730.0,149138.0,186617.0,169995.0,140547.0,89292.0,...,123085.0,143451.0,78649.0,125098.0,80404.0,157401.0,115509.0,127473.0,89479.0,101317.0
ANWUJOKX,0.0,0.0,0.0,63580.0,81670.0,26424.0,8470.0,0.0,0.0,80475.0,...,0.0,0.0,0.0,27980.0,0.0,0.0,0.0,0.0,0.0,0.0
APQGTRMF,383999.0,512813.0,217064.0,470398.0,539873.0,582317.0,759980.0,216019.0,537693.0,205326.0,...,683581.0,2147.0,0.0,25013.0,77.0,20741.0,2403.0,3543.0,32430.0,40608.0
ATLDMDBO,143097177.0,103568323.0,118403737.0,121873741.0,115024617.0,65716075.0,146216818.0,97552978.0,72341427.0,87454167.0,...,60276050.0,30160198.0,42613728.0,64451013.0,38667429.0,29354408.0,42450439.0,37136720.0,32181798.0,57090235.0


find_comovement_pairs - leader: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:13<00:00,  7.33it/s]

ÌÉêÏÉâÎêú Í≥µÌñâÏÑ± Ïåç Ïàò: 394





Unnamed: 0,leading_item_id,following_item_id,best_lag,max_corr,stability
0,AANGBULD,DDEXPPXU,2,0.383169,0.65
1,AHMDUILJ,ATLDMDBO,4,0.483281,0.657895
2,AHMDUILJ,BEZYMBBT,4,0.319341,0.684211
3,AHMDUILJ,BUZIIBYG,4,0.461739,0.657895
4,AHMDUILJ,DUCMGGNW,1,0.450955,0.731707


build_training_data: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 394/394 [00:00<00:00, 6259.49it/s]

df_train_model shape: (15093, 16)





Unnamed: 0,leading_item_id,following_item_id,t_idx,b_t,b_t_1,a_t_lag,max_corr,best_lag,stability,log_b_t,log_b_t_1,log_a_t_lag,delta_b,rel_delta_b,target,log_target
0,AANGBULD,DDEXPPXU,2,38475.0,12187.0,14276.0,0.383169,2,0.65,10.55779,9.408207,9.566405,26288.0,2.156876,23209.0,10.052338
1,AANGBULD,DDEXPPXU,3,23209.0,38475.0,52347.0,0.383169,2,0.65,10.052338,10.55779,10.865669,-15266.0,-0.396767,37804.0,10.540197
2,AANGBULD,DDEXPPXU,4,37804.0,23209.0,53549.0,0.383169,2,0.65,10.540197,10.052338,10.888371,14595.0,0.628824,27145.0,10.208985
3,AANGBULD,DDEXPPXU,5,27145.0,37804.0,0.0,0.383169,2,0.65,10.208985,10.540197,0.0,-10659.0,-0.281947,1210.0,7.099202
4,AANGBULD,DDEXPPXU,6,1210.0,27145.0,26997.0,0.383169,2,0.65,7.099202,10.208985,10.203518,-25935.0,-0.955389,5943.0,8.690138


Training LightGBMRegressor...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001884 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2334
[LightGBM] [Info] Number of data points in the train set: 15093, number of used features: 11
[LightGBM] [Info] Start training from score 12.702178
Training CatBoostRegressor...


predict_only_pairs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 394/394 [00:03<00:00, 117.68it/s]

Ï†úÏ∂úÏö© Ìñâ Í∞úÏàò: 394



