In [1]:
# ================================================================
# rule_lgbm_trade_v1.py
#  - 팀원 v3 기반 + 무역 도메인 지식 반영 버전
#  - preprocess_v6_1 결과물(pivot_value_v6.csv) 사용
#  - HS2 그룹별 trade-aware comovement 탐색
# ================================================================

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pathlib import Path
from lightgbm import LGBMRegressor

# ================================================================
# 0. PATH
# ================================================================
BASE = Path.cwd().resolve()
DATA = BASE.parents[1] / "data" / "processed"

PIVOT_PATH   = DATA / "pivot_value_v6.csv"     # item_id × t pivot
MONTHLY_PATH = DATA / "train_monthly_v6.csv"   # raw monthly panel


# ================================================================
# 1. Load pivot & monthly
# ================================================================
def load_data():
    pv = pd.read_csv(PIVOT_PATH, index_col=0)
    monthly = pd.read_csv(MONTHLY_PATH)
    return pv, monthly

pivot, monthly = load_data()

# index: item_id, columns: t (0~N)
items = pivot.index.tolist()
t_cols = [col for col in pivot.columns if str(col).isdigit()]
t_cols = sorted(t_cols, key=lambda x: int(x))

# attach hs2 meta
meta = monthly.groupby("item_id")[["hs2","hs3","hs4"]].agg(lambda x: x.mode().iloc[0])
pivot = pivot.merge(meta, left_index=True, right_index=True)


# ================================================================
# 2. Utility
# ================================================================
def safe_corr(x, y):
    if np.std(x)==0 or np.std(y)==0:
        return 0.0
    return float(np.corrcoef(x, y)[0,1])


# ================================================================
# 3. 도메인 기반 lag/corr 룰
# ================================================================
def calc_lag_range(hs2):
    """HS2에 따라 lag 범위를 도메인 기반으로 설정"""
    hs2 = str(hs2)

    # 화학/정밀화학
    if hs2 in ["28","29","32","33","34","38"]:
        return range(1, 6+1)  # 넓게 탐색

    # 금속·기계
    if hs2 in ["72","73","74","75","76","79","80","81","84","85"]:
        return range(1, 6+1)

    # 광물·에너지
    if hs2 in ["26","27"]:
        return range(1, 4+1)

    # 섬유·의류(계절)
    if hs2 in ["50","51","52","53","54","55","56","57","58","59","60","61","62","63"]:
        return range(1, 3+1)

    # default
    return range(1, 4+1)


def calc_corr_threshold(hs2_lead, hs2_foll):
    """HS2 기반으로 corr threshold 결정"""
    hs2_lead = str(hs2_lead)
    hs2_foll = str(hs2_foll)

    # 원재료 ↔ 중간재 (28~38 vs 72~85): 약한 corr도 허용
    if hs2_lead in ["28","29","32","34","38"] and hs2_foll in ["72","73","75","76","79","80","81","84","85"]:
        return 0.25

    # 에너지(27)는 가격 시계열 영향 → 조금 낮게
    if hs2_lead == "27" or hs2_foll == "27":
        return 0.20

    # 보통 case
    return 0.40


# ================================================================
# 4. 공행성 탐색
# ================================================================
def find_pairs(pivot):
    pairs = []

    for A in items:
        sA = pivot.loc[A, t_cols].values.astype(float)
        hs2_A = pivot.loc[A, "hs2"]

        if np.count_nonzero(sA) < 10:
            continue

        for B in items:
            if A == B:
                continue

            sB = pivot.loc[B, t_cols].values.astype(float)
            if np.count_nonzero(sB) < 10:
                continue

            hs2_B = pivot.loc[B, "hs2"]

            lags = calc_lag_range(hs2_A)
            thr  = calc_corr_threshold(hs2_A, hs2_B)

            best_corr = 0
            best_lag  = None

            for lag in lags:
                if len(sA) <= lag:
                    continue
                x = sA[:-lag]
                y = sB[lag:]
                c = safe_corr(x, y)
                if abs(c) > abs(best_corr):
                    best_corr = c
                    best_lag = lag

            if best_lag is not None and abs(best_corr) >= thr:
                pairs.append({
                    "leading_item_id" : A,
                    "following_item_id": B,
                    "best_lag" : best_lag,
                    "max_corr" : best_corr,
                    "hs2_lead": hs2_A,
                    "hs2_foll": hs2_B
                })

    return pd.DataFrame(pairs)


pairs = find_pairs(pivot)
print("Detected pairs:", len(pairs))


# ================================================================
# 5. Training frame
# ================================================================
def build_pair_data(A, B, lag, corr):
    sA = pivot.loc[A, t_cols].values.astype(float)
    sB = pivot.loc[B, t_cols].values.astype(float)
    n  = len(t_cols)

    rows=[]
    for t in range(lag, n-1):
        rows.append({
            "b_t"     : sB[t],
            "b_t_1"   : sB[t-1],
            "b_t_2"   : sB[t-2] if t>=2 else 0,
            "b_roll3" : np.mean(sB[max(0,t-2):t+1]),
            "a_t_lag" : sA[t-lag],
            "a_lag_1" : sA[t-lag-1] if t-lag-1>=0 else sA[t-lag],
            "diff_ab" : sA[t-lag] - sB[t],
            "corr"    : corr,
            "lag"     : lag,
            "target"  : sB[t+1],
            "hs2_lead": pivot.loc[A,"hs2"],
            "hs2_foll": pivot.loc[B,"hs2"]
        })

    return rows


def build_training_data(pairs):
    rows=[]
    for r in pairs.itertuples():
        rows += build_pair_data(
            r.leading_item_id,
            r.following_item_id,
            r.best_lag,
            r.max_corr
        )
    return pd.DataFrame(rows)


train_df = build_training_data(pairs)
print("Train df:", train_df.shape)


# ================================================================
# 6. Train / Valid split (time-based)
# ================================================================
# t 값 계산
N = len(t_cols)
# train: t <= N-6
# valid: N-5 ~ N-2 (최근 4개월)
train_mask = np.array([(i <= N-6) for i in range(N)])
valid_mask = np.array([(N-5 <= i <= N-2) for i in range(N)])


# ================================================================
# 7. LightGBM
# ================================================================
features = [
    "b_t","b_t_1","b_t_2","b_roll3",
    "a_t_lag","a_lag_1","diff_ab",
    "corr","lag",
]

X = train_df[features].values
y = np.log1p(train_df["target"].values)

model = LGBMRegressor(
    objective="regression",
    n_estimators=1200,
    num_leaves=63,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
model.fit(X, y)

print("Model trained.")


# ================================================================
# 8. Predict for 2025-08 (t = N)
# ================================================================
def predict_next_month():
    rows=[]

    t_last = N-1
    t_prev = N-2

    for r in pairs.itertuples():
        A, B = r.leading_item_id, r.following_item_id
        lag, corr = r.best_lag, r.max_corr

        sA = pivot.loc[A, t_cols].values.astype(float)
        sB = pivot.loc[B, t_cols].values.astype(float)

        if t_last-lag < 0:
            continue

        x = np.array([[
            sB[t_last],
            sB[t_prev],
            sB[t_prev-1] if t_prev-1>=0 else sB[t_prev],
            np.mean(sB[max(0,t_last-2):t_last+1]),
            sA[t_last-lag],
            sA[t_last-lag-1] if t_last-lag-1>=0 else sA[t_last-lag],
            sA[t_last-lag]-sB[t_last],
            corr,
            lag
        ]])

        pred = model.predict(x)[0]
        pred = max(0, np.expm1(pred))
        pred = int(round(pred))

        rows.append({
            "leading_item_id":A,
            "following_item_id":B,
            "value":pred
        })

    return pd.DataFrame(rows)

submission = predict_next_month()
submission.to_csv("submission_rule_trade_v1.csv", index=False)
print("Saved submission_rule_trade_v1.csv")


Detected pairs: 1666
Train df: (64685, 12)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000445 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2047
[LightGBM] [Info] Number of data points in the train set: 64685, number of used features: 9
[LightGBM] [Info] Start training from score 11.964322
Model trained.
Saved submission_rule_trade_v1.csv
