# In [4]:  (notebook cell prompt kept for reference)
# ============================================================
# pair_generate_v10_best.py (EDA 포함, dtaidistance 제거 버전)
# ============================================================

from __future__ import annotations
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LinearRegression


# ============================================================
# PATH
# ============================================================
# All paths are derived from the directory the script is launched from.
BASE_DIR = Path.cwd().resolve()
# The data folder is expected two levels above the working directory.
DATA_DIR = BASE_DIR.parents[1] / "data"

# Raw input CSV and the folder/file used for the derived monthly table.
RAW_PATH          = DATA_DIR / "raw" / "train.csv"
PROCESSED_DIR     = DATA_DIR / "processed"
TRAIN_MONTH_PATH  = PROCESSED_DIR / "train_month.csv"

# Output folder for this version; created as a side effect of loading the
# module (missing parents included, no error if it already exists).
OUT_DIR           = PROCESSED_DIR / "v10_pairs"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Files written into OUT_DIR: the month x item pivot and the final pair table.
PIVOT_PATH        = OUT_DIR / "monthly_pivot_v10.csv"
BEST_PAIR_PATH    = OUT_DIR / "pairs_v10_best.csv"


# ============================================================
# 0. Pivot + train_month(B안) 생성
# ============================================================
def build_train_month_and_pivot(raw_path: Path):
    """Aggregate the raw CSV to monthly totals and build the month x item pivot.

    Side effects: writes the (item_id, hs4, year, month) totals to
    ``TRAIN_MONTH_PATH`` and the pivot to ``PIVOT_PATH``.

    Args:
        raw_path: CSV file with ``item_id``, ``hs4``, ``year``, ``month`` and
            ``value`` columns.

    Returns:
        A tuple ``(train_month, monthly, pivot)`` where ``train_month`` holds
        the summed value per (item_id, hs4, year, month), ``monthly`` holds
        ``value_sum`` per (item_id, ym), and ``pivot`` is indexed by ``ym``
        with one column per item and missing cells filled with 0.0.
    """
    raw = pd.read_csv(raw_path)

    # Normalise column types; HS4 codes become zero-padded 4-character strings.
    raw = raw.astype({"year": int, "month": int, "value": float})
    raw["hs4"] = raw["hs4"].astype(str).str.zfill(4)

    # The first day of each month is used as the time key.
    year_text = raw["year"].astype(str)
    month_text = raw["month"].astype(str)
    raw["ym"] = pd.to_datetime(year_text + "-" + month_text + "-01")

    # train_month (option B): totals per item, HS4 code, year and month.
    month_keys = ["item_id", "hs4", "year", "month"]
    train_month = raw.groupby(month_keys, as_index=False)["value"].sum()
    train_month.to_csv(TRAIN_MONTH_PATH, index=False)

    # Totals per item and month, ignoring the HS4 code.
    monthly = raw.groupby(["item_id", "ym"], as_index=False)["value"].sum()
    monthly = monthly.rename(columns={"value": "value_sum"})

    # Wide layout: one row per month, one column per item.
    wide = monthly.pivot_table(index="ym", columns="item_id", values="value_sum")
    pivot = wide.sort_index().fillna(0.0)
    pivot.to_csv(PIVOT_PATH)

    return train_month, monthly, pivot


# ============================================================
# 1. base pair mining
# ============================================================
def safe_corr(x, y):
    """Return the Pearson correlation of ``x`` and ``y``, or 0.0 if undefined.

    The correlation is treated as undefined (and 0.0 is returned) when either
    input has fewer than two samples, has zero variance, or when the computed
    coefficient is not finite (for example because an input contains NaN).

    Args:
        x: 1-D array-like of numbers.
        y: 1-D array-like of numbers with the same length as ``x``.

    Returns:
        A Python float in [-1, 1], or 0.0 when the correlation is undefined.
    """
    # Accept lists and other array-likes, not only ndarrays.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    if x.size < 2 or y.size < 2:
        return 0.0
    if x.std() == 0 or y.std() == 0:
        return 0.0

    corr = float(np.corrcoef(x, y)[0, 1])
    # NaN inputs slip past the std check above (NaN == 0 is False), so the
    # result itself is validated as well.
    return corr if np.isfinite(corr) else 0.0


def mine_all_pairs(pivot, max_lag=6, min_nonzero=8):
    """Find, for every ordered item pair, the lag with the strongest correlation.

    For each (leader, follower) pair of distinct columns the leader series is
    shifted forward by 1..``max_lag`` rows and correlated with the follower
    via ``safe_corr``; the lag with the largest absolute correlation is kept.
    Pairs whose correlation is exactly 0 at every lag are dropped.

    Args:
        pivot: DataFrame with one row per period and one column per item.
        max_lag: Largest shift (in rows) to try.
        min_nonzero: Columns with fewer non-zero entries than this are ignored.

    Returns:
        DataFrame with columns ``leading_item_id``, ``following_item_id``,
        ``best_lag`` and ``max_corr``. The columns are present even when no
        pair qualifies.
    """
    columns = ["leading_item_id", "following_item_id", "best_lag", "max_corr"]

    # Convert once and store each item as a contiguous row, instead of
    # re-slicing and re-casting the same column inside the pair loop.
    series_by_col = np.ascontiguousarray(pivot.to_numpy(dtype=float).T)
    items = pivot.columns.tolist()
    n_obs = series_by_col.shape[1] if series_by_col.ndim == 2 else 0

    # The sparsity filter does not depend on the partner, so apply it up front.
    candidates = [
        (idx, item)
        for idx, item in enumerate(items)
        if np.count_nonzero(series_by_col[idx]) >= min_nonzero
    ]

    # Only lags that leave at least one overlapping observation are usable.
    lags = range(1, min(max_lag, n_obs - 1) + 1)

    pairs = []
    for i, leader in candidates:
        lead_series = series_by_col[i]
        for j, follower in candidates:
            if i == j:
                continue
            follow_series = series_by_col[j]

            best_corr = 0.0
            best_lag = None
            for lag in lags:
                c = safe_corr(lead_series[:-lag], follow_series[lag:])
                # Strict comparison: ties keep the smallest lag.
                if abs(c) > abs(best_corr):
                    best_corr = c
                    best_lag = lag

            if best_lag is not None:
                pairs.append({
                    "leading_item_id": leader,
                    "following_item_id": follower,
                    "best_lag": best_lag,
                    "max_corr": best_corr,
                })

    # Explicit columns keep the schema stable when ``pairs`` is empty.
    return pd.DataFrame(pairs, columns=columns)


# ============================================================
# 2. Pure Python DTW (fast banded)
# ============================================================
def dtw_distance(a, b, band=20):
    """Return the dynamic-time-warping distance between two 1-D sequences.

    The local cost is the absolute difference of the aligned values, and the
    alignment is restricted to cells with ``|i - j| <= band``. If the two
    lengths differ by more than ``band`` the band is widened to that
    difference, because otherwise no warping path could reach the end cell.

    Args:
        a: 1-D array-like of numbers.
        b: 1-D array-like of numbers.
        band: Half-width of the allowed band around the diagonal.

    Returns:
        The accumulated cost as a float. It is 0.0 when both inputs are empty
        and ``inf`` when exactly one of them is empty.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    n, m = len(a), len(b)

    # A path from (0, 0) to (n, m) must cross |n - m| off-diagonal steps, so
    # a narrower band would always yield inf.
    band = max(band, abs(n - m))

    # dp[i, j] = cheapest cost of aligning a[:i] with b[:j]; cells outside
    # the band stay at inf and are therefore never chosen.
    dp = np.full((n + 1, m + 1), np.inf)
    dp[0, 0] = 0.0

    for i in range(1, n + 1):
        j_start = max(1, i - band)
        j_end = min(m, i + band)
        for j in range(j_start, j_end + 1):
            cost = abs(a[i - 1] - b[j - 1])
            dp[i, j] = cost + min(
                dp[i - 1, j],
                dp[i, j - 1],
                dp[i - 1, j - 1],
            )
    return float(dp[n, m])


# ============================================================
# 3. EDA Feature 생성
# ============================================================
def period_corr(x, y, start, end):
    """Correlate ``x`` and ``y`` over the half-open window ``[start, end)``.

    Returns NaN when the window extends past the end of ``x`` or when either
    windowed series has zero standard deviation.
    """
    if len(x) < end:
        return np.nan

    window_x = x[start:end]
    window_y = y[start:end]

    # A flat window has no defined correlation.
    for window in (window_x, window_y):
        if window.std() == 0:
            return np.nan

    corr_matrix = np.corrcoef(window_x, window_y)
    return corr_matrix[0, 1]


def trend_slope(series):
    """Return the slope of a straight line fitted to ``series`` against 0..n-1."""
    # Position in the series is the only regressor (shape: n rows, 1 column).
    positions = np.arange(len(series)).reshape(-1, 1)
    model = LinearRegression()
    model.fit(positions, series)
    return model.coef_[0]


def rolling_corr_std(a, b, lag, window=12):
    """Return the standard deviation of the rolling lagged correlation.

    ``a`` is paired with ``b`` shifted by ``lag`` positions (``a[t]`` against
    ``b[t + lag]``); the correlation is computed over a sliding window and the
    spread of those window correlations is returned.

    Args:
        a: 1-D array-like, the leading series.
        b: 1-D array-like, the following series.
        lag: Number of positions ``b`` trails ``a``; 0 compares them unshifted.
        window: Rolling window length.

    Returns:
        The standard deviation of the rolling correlations (NaN when fewer
        than two windows produce a value).
    """
    # Work on plain arrays so the two series are paired by position.
    a = np.asarray(a)
    b = np.asarray(b)

    # ``a[:-0]`` would be an empty slice, so lag 0 keeps the whole series.
    lead = a if lag == 0 else a[:-lag]
    follow = b[lag:]

    rolling = pd.Series(lead).rolling(window).corr(pd.Series(follow))
    return rolling.std()


def detrended_snr(series):
    """Return the variance of the fitted linear trend over the residual variance.

    A small constant (1e-6) is added to the denominator so a perfectly linear
    series does not divide by zero.
    """
    positions = np.arange(len(series)).reshape(-1, 1)
    fitted = LinearRegression().fit(positions, series).predict(positions)
    residual = series - fitted
    return fitted.var() / (residual.var() + 1e-6)


def series_vol(x):
    """Return the standard deviation of the step-to-step changes in ``x``."""
    step_changes = np.diff(x)
    return step_changes.std()


def attach_eda_features(pairs_df, pivot, train_raw):
    """Append exploratory features to every mined leader/follower pair.

    Args:
        pairs_df: DataFrame with ``leading_item_id``, ``following_item_id``
            and ``best_lag`` columns (``best_lag`` is expected to be >= 1).
        pivot: DataFrame with one row per period and one column per item.
        train_raw: DataFrame with ``item_id`` and ``hs4`` columns; the first
            HS4 code seen for an item is the one used.

    Returns:
        ``pairs_df`` (index reset) with these columns appended: ``corr_early``,
        ``corr_mid``, ``corr_range``, ``trend_leader``, ``trend_follower``,
        ``trend_match``, ``rollcorr_std``, ``dtw_dist``, ``snr_detrend``,
        ``f_vol``, ``hs4_leader``, ``hs4_follower``, ``hs4_equal``,
        ``hs4_prefix2_equal`` and ``hs4_dist``.

    Raises:
        KeyError: If an item id is missing from ``pivot`` or ``train_raw``.
    """
    # Keep exactly one HS4 code per item: with duplicate item_id labels the
    # lookups below would return a Series instead of a scalar.
    hs4_map = (
        train_raw[["item_id", "hs4"]]
        .drop_duplicates(subset="item_id")
        .set_index("item_id")["hs4"]
    )

    def hs4_chapter(code):
        # Codes parsed as integers lose their leading zero (0301 -> 301), so
        # pad back to four characters before taking the 2-digit prefix.
        return str(code).zfill(4)[:2]

    # Single-series features depend on one item only; compute them once per
    # item rather than once per pair.
    trend_by_item = {}
    follower_stats = {}

    def cached_trend(item, series):
        if item not in trend_by_item:
            trend_by_item[item] = trend_slope(series)
        return trend_by_item[item]

    eda_rows = []
    for pair in pairs_df.itertuples(index=False):
        leader = pair.leading_item_id
        follower = pair.following_item_id
        lag = int(pair.best_lag)

        A = pivot[leader].values
        B = pivot[follower].values
        # Align leader[t] with follower[t + lag].
        lead_part, follow_part = A[:-lag], B[lag:]

        # Correlation over rows 0-23 and 12-35 of the aligned series.
        corr_e = period_corr(lead_part, follow_part, 0, 24)
        corr_m = period_corr(lead_part, follow_part, 12, 36)
        if pd.isna(corr_e) or pd.isna(corr_m):
            corr_range = np.nan
        else:
            corr_range = abs(corr_e - corr_m)

        tA = cached_trend(leader, A)
        tB = cached_trend(follower, B)

        if follower not in follower_stats:
            follower_stats[follower] = (detrended_snr(B), series_vol(B))
        snr, fv = follower_stats[follower]

        hs4_lead = hs4_map[leader]
        hs4_follow = hs4_map[follower]

        eda_rows.append({
            "corr_early": corr_e,
            "corr_mid":   corr_m,
            "corr_range": corr_range,
            "trend_leader": tA,
            "trend_follower": tB,
            "trend_match": np.sign(tA * tB),
            "rollcorr_std": rolling_corr_std(A, B, lag),
            "dtw_dist": dtw_distance(A, B),
            "snr_detrend": snr,
            "f_vol": fv,
            "hs4_leader": hs4_lead,
            "hs4_follower": hs4_follow,
            "hs4_equal": int(hs4_lead == hs4_follow),
            "hs4_prefix2_equal": int(hs4_chapter(hs4_lead) == hs4_chapter(hs4_follow)),
            "hs4_dist": abs(int(hs4_lead) - int(hs4_follow)),
        })

    return pd.concat([pairs_df.reset_index(drop=True), pd.DataFrame(eda_rows)], axis=1)


# ============================================================
# 4. main
# ============================================================
def main():
    """Run the pipeline: build the pivot, mine pairs, add features, save CSV."""
    # The untouched raw table is only needed for the item -> HS4 lookup.
    raw_table = pd.read_csv(RAW_PATH)

    # The builder also writes its intermediate CSVs; only the pivot is used here.
    pivot = build_train_month_and_pivot(RAW_PATH)[2]

    # Candidate leader/follower pairs with their best lag.
    candidate_pairs = mine_all_pairs(pivot)

    # Enrich every pair with the EDA feature columns.
    full_pairs = attach_eda_features(candidate_pairs, pivot, raw_table)

    # Persist the result and report where it went.
    full_pairs.to_csv(BEST_PAIR_PATH, index=False)
    print(f"[SAVE] pairs_v10_best.csv → {BEST_PAIR_PATH} (rows={len(full_pairs)})")
    print("✅ DONE")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


# Output of the run above:
# [SAVE] pairs_v10_best.csv → /data/ephemeral/home/data/processed/v10_pairs/pairs_v10_best.csv (rows=8556)
# ✅ DONE
