In [11]:
# ============================================================
# fe_pairwise_v11.py : Pairwise Feature Engineering (+ segment-wise similarity)
#  - v6 전처리(smooth_value pivot) 기반
#  - 변동성 필터 (active_months, CV, std)
#  - 강화된 scale_ratio 필터 (0.1~10)
#  - lag corr (1~4개월)
#  - ⚡ segment-wise corr (6개월 창, 3개월 간격)
#  - ⚡ 정규화된 diff 기반 shape corr(norm_diff_corr) 필터
# ============================================================

import numpy as np
import pandas as pd
from itertools import combinations
import warnings
warnings.filterwarnings("ignore")

# ============================================================
# 0. Load preprocessed data (v6)
# ============================================================
# Preprocessed v6 inputs; all paths are relative to the working directory.
monthly = pd.read_csv("../../data/processed/train_monthly_v6.csv")

# The first CSV column becomes the row index (described upstream as item_id,
# with one column per time step t). Rows are then ordered by index label.
# NOTE(review): sort_index() only orders the rows; the column (time) order is
# taken as-is from the CSV. The lag/diff helpers below rely on it being
# chronological -- confirm against the v6 preprocessing step.
pivot = pd.read_csv(
    "../../data/processed/pivot_value_v6.csv",
    index_col=0,
).sort_index()

item_summary = pd.read_csv("../../data/processed/item_summary_v6.csv")

# ============================================================
# 1. 변동성 필터: 유효 거래 개수 + CV + 표준편차
# ============================================================
# Keep only items with enough activity and enough variability to correlate.
_keep_mask = (
    item_summary["active_months"].ge(6)    # traded in at least 6 months
    & item_summary["value_cv"].ge(0.25)    # coefficient-of-variation floor
    & item_summary["value_std"].gt(0)      # not completely flat
)
valid_items = item_summary.loc[_keep_mask, "item_id"].tolist()

# Row order of `pivot` now follows item_summary's order of the kept items.
pivot = pivot.loc[valid_items]

# Per-item metadata lookups keyed by item_id.
_summary_by_item = item_summary.set_index("item_id")
hs_map = _summary_by_item[["hs2", "hs3", "hs4"]].to_dict("index")
std_map = _summary_by_item["value_std"].to_dict()

items = list(pivot.index)

# ============================================================
# 2. HS distance
# ============================================================
def hs_distance(a, b):
    """Return how far apart two items are in the HS code hierarchy.

    Looks both items up in the module-level ``hs_map`` and returns the index
    of the first level, checked in the order hs4, hs3, hs2, at which their
    codes are equal: 0 for hs4, 1 for hs3, 2 for hs2. Returns 3 when none of
    the three levels match.
    """
    codes_a = hs_map[a]
    codes_b = hs_map[b]
    for dist, level in enumerate(("hs4", "hs3", "hs2")):
        if codes_a[level] == codes_b[level]:
            return dist
    return 3

# ============================================================
# 3. helper: safe_corr
# ============================================================
def safe_corr(x, y):
    """Return the Pearson correlation of ``x`` and ``y``, or 0.0 if undefined.

    Both inputs are converted to float arrays. The 0.0 sentinel is returned
    when either input has fewer than 3 points, contains NaN/inf, or is
    (near-)constant (population std below 1e-6), and also when the computed
    coefficient itself is not finite.

    Inputs of different lengths are not handled here; ``np.corrcoef`` raises
    in that case.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    if len(x) < 3 or len(y) < 3:
        return 0.0

    # A NaN std compares False against the threshold below, so without this
    # guard a single missing value would leak NaN to every caller's filter.
    if not (np.isfinite(x).all() and np.isfinite(y).all()):
        return 0.0

    if np.std(x) < 1e-6 or np.std(y) < 1e-6:
        return 0.0

    r = float(np.corrcoef(x, y)[0, 1])
    return r if np.isfinite(r) else 0.0

# ============================================================
# 4. 정규화된 diff 기반 shape corr (핵심 추가)
#    - 각 시계열을 차분 후 z-score로 정규화
#    - 모양만 비교하기 위한 corr
# ============================================================
def norm_diff_corr(a, b):
    """Correlate the z-scored first differences of two rows of ``pivot``.

    Each row is read from the module-level ``pivot``, first-differenced, and
    standardised (mean removed, divided by population std plus 1e-9) before
    the Pearson coefficient is taken. Returns 0.0 when either differenced
    series has fewer than 3 points or a std below 1e-6.
    """
    deltas = [np.diff(pivot.loc[item].values.astype(float)) for item in (a, b)]

    if any(len(d) < 3 for d in deltas):
        return 0.0

    spreads = [np.std(d) for d in deltas]
    if any(s < 1e-6 for s in spreads):
        return 0.0

    # The small epsilon keeps the division well-defined.
    z_a, z_b = (
        (d - d.mean()) / (s + 1e-9)
        for d, s in zip(deltas, spreads)
    )
    return float(np.corrcoef(z_a, z_b)[0, 1])

# ============================================================
# 5. Cross-correlation for lag (A leads B)
# ============================================================
def lag_corr(a, b, max_lag=4):
    """Find the lag at which ``a`` (leading) best correlates with ``b`` (lagging).

    For each lag from 1 to ``max_lag``, the row of ``a`` with its last ``lag``
    points dropped is correlated (via ``safe_corr``) with the row of ``b``
    with its first ``lag`` points dropped.

    Returns ``(best_corr, best_lag)``. The earliest lag wins ties because only
    a strictly greater score replaces the current best. ``best_lag`` stays
    ``None`` (with ``best_corr == -1.0``) if no score exceeds -1.0.
    """
    leader = pivot.loc[a].values.astype(float)
    follower = pivot.loc[b].values.astype(float)

    top_corr, top_lag = -1.0, None
    for shift in range(1, max_lag + 1):
        score = safe_corr(leader[:-shift], follower[shift:])
        if score > top_corr:
            top_corr, top_lag = score, shift

    return top_corr, top_lag

# ============================================================
# 6. Rolling Corr (3M, 6M)
# ============================================================
def rolling_corr(a, b, w):
    """Return the mean rolling correlation of two ``pivot`` rows.

    Both rows are read from the module-level ``pivot`` and re-wrapped as
    positionally indexed float Series; the correlation is computed over a
    rolling window of ``w`` points and then averaged.
    """
    left, right = (
        pd.Series(pivot.loc[item].values.astype(float)) for item in (a, b)
    )
    window_corrs = left.rolling(w).corr(right)
    # Series.mean() skips NaN entries (e.g. the incomplete leading windows).
    return window_corrs.mean()

# ============================================================
# 7. Segment-wise similarity
#    - window_size: 6개월
#    - step: 3개월 (겹치게)
# ============================================================
def segment_corr_stats(a, b, window=6, step=3, high_th=0.30):
    """Summarise correlations of two ``pivot`` rows over sliding segments.

    Segments of ``window`` points start every ``step`` points (so they overlap
    when ``step < window``); each segment pair is scored with ``safe_corr``.

    Returns a tuple ``(max_corr, mean_pos, high_cnt)``:
      * ``max_corr``  - largest segment correlation,
      * ``mean_pos``  - mean of the strictly positive segment correlations
                        (0.0 if there are none),
      * ``high_cnt``  - number of segments with correlation >= ``high_th``.
    Returns ``(0.0, 0.0, 0)`` when the series is too short for any segment.
    """
    series_a = pivot.loc[a].values.astype(float)
    series_b = pivot.loc[b].values.astype(float)

    starts = range(0, len(series_a) - window + 1, step)
    corrs = np.array(
        [
            safe_corr(series_a[i:i + window], series_b[i:i + window])
            for i in starts
        ],
        dtype=float,
    )

    if corrs.size == 0:
        return 0.0, 0.0, 0

    positive = corrs[corrs > 0]
    mean_pos = float(positive.mean()) if positive.size > 0 else 0.0

    return float(corrs.max()), mean_pos, int((corrs >= high_th).sum())

# ============================================================
# 8. Main Loop
# ============================================================
rows = []

# Resolve each item's std once. Doing `std_map.get(A, pivot.loc[A].std())`
# inside the pair loop evaluates the pandas fallback for every pair, because
# dict.get() computes its default argument eagerly.
_item_std = {
    item: float(std_map[item]) if item in std_map else float(pivot.loc[item].std())
    for item in items
}

# NOTE(review): combinations() yields each unordered pair once, so the
# directional "A leads B" lag test in 8-6 is only evaluated with A being the
# item that comes first in `items`; the reverse direction is never tested.
# Confirm this is intended.
for A, B in combinations(items, 2):

    # -----------------------------
    # 8-1. Scale filter (drop pairs whose volatility scales differ too much)
    # -----------------------------
    std_A = _item_std[A]
    std_B = _item_std[B]

    if std_A < 1e-3 or std_B < 1e-3:
        continue

    scale_ratio = std_A / (std_B + 1e-9)

    # Tightened bound: keep only ratios within 0.1 ~ 10. A NaN ratio fails
    # both comparisons and is therefore dropped as well.
    if not (0.1 <= scale_ratio <= 10.0):
        continue

    # -----------------------------
    # 8-2. HS code information
    # -----------------------------
    hs4_same = int(hs_map[A]["hs4"] == hs_map[B]["hs4"])
    hs3_same = int(hs_map[A]["hs3"] == hs_map[B]["hs3"])
    hs2_same = int(hs_map[A]["hs2"] == hs_map[B]["hs2"])
    hs_dist  = hs_distance(A, B)

    # -----------------------------
    # 8-3. Global rolling corr (3- and 6-point windows)
    # -----------------------------
    corr3 = rolling_corr(A, B, 3)
    corr6 = rolling_corr(A, B, 6)

    # Require at least one strictly positive rolling corr. Written as a
    # positive test so that NaN (no valid window) is rejected too; the former
    # `corr3 <= 0 and corr6 <= 0` check let NaN pairs through.
    if not (corr3 > 0.0 or corr6 > 0.0):
        continue

    # -----------------------------
    # 8-4. Segment-wise corr (core filter)
    # -----------------------------
    seg_max, seg_mean_pos, seg_high_cnt = segment_corr_stats(
        A, B, window=6, step=3, high_th=0.30
    )

    # Minimum requirements:
    #  - at least 2 segments with corr >= 0.3
    #  - best segment corr >= 0.4
    if seg_high_cnt < 2:
        continue
    if seg_max < 0.40:
        continue

    # -----------------------------
    # 8-5. Normalised-diff shape corr (additional filter)
    # -----------------------------
    nd_corr = norm_diff_corr(A, B)

    # Drop pairs whose month-to-month shape is too dissimilar.
    if nd_corr < 0.15:
        continue

    # -----------------------------
    # 8-6. Lag corr (A leads B)
    # -----------------------------
    lag_corr_val, best_lag = lag_corr(A, B, max_lag=4)
    if best_lag is None:
        continue

    # Pairs that share neither hs4 nor hs3 must show a stronger lag corr.
    if hs4_same or hs3_same:
        min_lag = 0.15
    else:
        min_lag = 0.25

    if lag_corr_val < min_lag:
        continue

    # -----------------------------
    # 8-7. Store the pair
    # -----------------------------
    rows.append([
        A, B,
        hs4_same, hs3_same, hs2_same, hs_dist,
        corr3, corr6,
        seg_max, seg_mean_pos, seg_high_cnt,
        nd_corr,
        best_lag, lag_corr_val,
        scale_ratio,
    ])

# Column names, in the same order as the values appended to each row.
_pair_columns = [
    "item_i", "item_j",
    "hs4_same", "hs3_same", "hs2_same", "hs_distance",
    "rolling_corr3", "rolling_corr6",
    "seg_max_corr", "seg_mean_pos_corr", "seg_high_cnt",
    "norm_diff_corr",
    "best_lag", "lag_corr",
    "scale_ratio",
]
pair_df = pd.DataFrame(data=rows, columns=_pair_columns)

# Written to the current working directory, without the row index.
pair_df.to_csv("pair_features_v11_segment.csv", index=False)
print("Saved v11 FE file. shape:", pair_df.shape)


Saved v11 FE file. shape: (118, 15)
