In [7]:
# ============================================================
# 📦 Feature Engineering (FE) + pair_train — v2 SAFE
# ============================================================

import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm   


In [8]:

# ------------------------------------------------------------
# 0) Path & Load
# ------------------------------------------------------------
# Inputs live under <two levels above the working directory>/data.
BASE_DIR = Path.cwd().resolve()
DATA_ROOT = BASE_DIR.parents[1] / "data"
CLEAN_PATH = DATA_ROOT / "interim" / "train_clean.csv"
MONTHLY_PATH = DATA_ROOT / "processed" / "train_monthly.csv"

print("Loading cleaned datasets...")
# Read both tables in the same order as before: cleaned rows first, then monthly.
df, monthly = (pd.read_csv(csv_path) for csv_path in (CLEAN_PATH, MONTHLY_PATH))
print("‚úî Loaded:", df.shape, monthly.shape)

# ------------------------------------------------------------
# 1) Derive HS3 code & YM timestamp (done once!)
# ------------------------------------------------------------
# hs4 is zero-padded to 4 characters before taking the 3-character prefix.
# If read_csv parsed the column as an integer, a heading such as "0301"
# arrives as 301 and the plain str[:3] would give "301" instead of "030".
# Values that already have 4+ characters are unaffected by zfill(4).
# NOTE(review): assumes hs4 holds 4-digit HS headings — confirm against the data.
monthly["hs3"] = monthly["hs4"].astype(str).str.zfill(4).str[:3]

# "YYYY-MM" string -> Timestamp at the first day of that month.
monthly["ym"] = pd.to_datetime(
    monthly["year"].astype(str) + "-" + monthly["month"].astype(str).str.zfill(2)
)

# ------------------------------------------------------------
# 2) Build pivot_item / pivot_hs3
# ------------------------------------------------------------
def _pivot_by_month(long_frame, row_key):
    """Reshape long (row_key, ym, total_value) records to a wide row_key x ym table.

    Cells with no record (or a missing value) are filled with 0.0.
    """
    wide = long_frame.pivot(index=row_key, columns="ym", values="total_value")
    return wide.fillna(0.0)


# Item level: one row per item_id, one column per month.
pivot_item = _pivot_by_month(monthly, "item_id")

# HS3 level: sum total_value over all records sharing an (hs3, ym) pair first.
monthly_hs3 = monthly.groupby(["hs3", "ym"], as_index=False)["total_value"].sum()
pivot_hs3 = _pivot_by_month(monthly_hs3, "hs3")

print("pivot_item:", pivot_item.shape)
print("pivot_hs3:", pivot_hs3.shape)

# ------------------------------------------------------------
# 3) zero_ratio / volatility / CV (one row per item)
# ------------------------------------------------------------
def compute_zero_vol(series: pd.Series):
    """Summarise one value series by its sparsity and dispersion.

    Parameters
    ----------
    series : pd.Series
        Numeric values; they are cast to float before any statistic is taken.

    Returns
    -------
    tuple
        ``(zero_ratio, vol, cv)`` where ``zero_ratio`` is the share of entries
        equal to 0, ``vol`` is the population standard deviation (numpy's
        default ``ddof=0``) and ``cv`` is ``vol / (mean + 1e-9)``; the small
        constant keeps the division defined when the mean is 0.
    """
    values = series.values.astype(float)
    sigma = values.std()
    share_of_zeros = (values == 0).mean()
    coeff_of_variation = sigma / (values.mean() + 1e-9)
    return share_of_zeros, sigma, coeff_of_variation

# One [item_id, zero_ratio, volatility, cv] record per row of pivot_item.
records = [
    [item_id, *compute_zero_vol(item_series)]
    for item_id, item_series in pivot_item.iterrows()
]

meta = pd.DataFrame(records, columns=["item_id", "zero_ratio", "volatility", "cv"])
print("‚úî meta feature done:", meta.shape)

# ------------------------------------------------------------
# 4) HS3 -> HS4 single best leader (one row per item)
# ------------------------------------------------------------
# For every (HS3 aggregate, item) pair, find the lag in 1..HS3_MAX_LAG at which
# the HS3 series shifted back by `lag` months correlates best (signed Pearson)
# with the item series.
HS3_MAX_LAG = 6

# Extract the row arrays once instead of calling .loc inside the double loop.
hs3_matrix = pivot_hs3.to_numpy(dtype=float)
item_matrix = pivot_item.to_numpy(dtype=float)

pairs_hs3 = []

for hs3, leader_s in zip(pivot_hs3.index, hs3_matrix):
    for item, follower_s in zip(pivot_item.index, item_matrix):
        best_lag = None
        # NaN (rather than a -999.0 sentinel) marks "no defined correlation",
        # so a placeholder can never end up in the feature table as a real value.
        best_corr = np.nan

        for lag in range(1, HS3_MAX_LAG + 1):
            if len(leader_s) <= lag:
                continue
            # A constant window has zero variance, so corrcoef returns NaN there.
            with np.errstate(divide="ignore", invalid="ignore"):
                corr = np.corrcoef(leader_s[:-lag], follower_s[lag:])[0, 1]
            if np.isnan(corr):
                continue
            # Strict '>' keeps the smallest lag when several lags tie.
            if best_lag is None or corr > best_corr:
                best_corr = corr
                best_lag = lag

        pairs_hs3.append([hs3, item, best_lag, best_corr])

pairs_hs3 = pd.DataFrame(
    pairs_hs3,
    columns=["lead_hs3", "item_id", "best_lag_hs3", "corr_hs3"],
)

# Keep only the highest-corr_hs3 row per item_id. Pairs with no defined
# correlation are dropped first; an item with no valid leader at all is then
# absent here and gets NaN features from the later left merge.
pairs_hs3_best = (
    pairs_hs3.dropna(subset=["corr_hs3"])
             .sort_values("corr_hs3", ascending=False)
             .groupby("item_id")
             .head(1)
)

print("‚úî single HS3 leader done:", pairs_hs3_best.shape)

# ------------------------------------------------------------
# 5) HS4 multi-leader weighted lag (top3 -> single value)
# ------------------------------------------------------------
def find_pairs_item(pivot, max_lag=6, corr_threshold=0.3, min_nonzero=6):
    """Find directed (leader, follower) row pairs whose lagged correlation is high.

    For every ordered pair of distinct rows, the leader series shifted back by
    ``lag`` periods (``x[:-lag]``) is correlated with the follower series
    (``y[lag:]``) for each lag in ``1..max_lag``. The lag with the largest
    signed Pearson correlation is kept; the smallest lag wins ties.

    Parameters
    ----------
    pivot : pd.DataFrame
        Wide table with one row per entity and one column per period.
    max_lag : int, default 6
        Largest lag to try. Lags not smaller than the number of columns are skipped.
    corr_threshold : float, default 0.3
        A pair is reported only when its best correlation is >= this value.
    min_nonzero : int, default 6
        Rows with fewer non-zero entries than this are ignored both as leader
        and as follower.

    Returns
    -------
    pd.DataFrame
        Columns ``leader``, ``follower``, ``lag``, ``corr``; one row per
        reported pair, ordered by leader then follower in ``pivot`` row order.
        Pairs whose correlation is undefined at every lag (for example a
        constant window) are never reported.
    """
    n_months = pivot.shape[1]

    # Pull each row out of the frame once and apply the sparsity filter up
    # front, instead of repeating both inside the quadratic pair loop.
    matrix = pivot.to_numpy(dtype=float)
    candidates = [
        (label, row)
        for label, row in zip(pivot.index.to_list(), matrix)
        if np.count_nonzero(row) >= min_nonzero
    ]
    usable_lags = [lag for lag in range(1, max_lag + 1) if lag < n_months]

    res = []
    for leader, x in candidates:
        for follower, y in candidates:
            if leader == follower:
                continue

            best_lag = None
            best_corr = np.nan

            for lag in usable_lags:
                # Zero-variance windows make corrcoef return NaN; skip those lags
                # explicitly rather than relying on NaN comparisons being False.
                with np.errstate(divide="ignore", invalid="ignore"):
                    corr = np.corrcoef(x[:-lag], y[lag:])[0, 1]
                if np.isnan(corr):
                    continue
                if best_lag is None or corr > best_corr:
                    best_corr = corr
                    best_lag = lag

            if best_lag is not None and best_corr >= corr_threshold:
                res.append([leader, follower, best_lag, best_corr])

    return pd.DataFrame(res, columns=["leader", "follower", "lag", "corr"])


# All item-level leader/follower pairs with a best lagged correlation >= 0.3.
pairs_hs4 = find_pairs_item(pivot_item, corr_threshold=0.3)

# Keep only the three strongest leaders for each follower.
pairs_by_strength = pairs_hs4.sort_values(by="corr", ascending=False)
top3 = pairs_by_strength.groupby("follower").head(3)

def weighted_lag(df):
    """Return the mean of ``df["lag"]`` weighted by ``df["corr"]``.

    ``df`` is any frame with ``lag`` and ``corr`` columns; the weights must
    not sum to zero (``np.average`` raises ``ZeroDivisionError`` in that case).
    """
    lags = df["lag"].to_numpy()
    strengths = df["corr"].to_numpy()
    return np.average(lags, weights=strengths)

# Collapse each follower's top leaders into one correlation-weighted lag.
if top3.empty:
    # With no qualifying pairs, groupby.apply returns an empty DataFrame and
    # reset_index(name=...) would fail; build the empty result explicitly so
    # the later left merge still works and simply yields NaN for every item.
    multi_leader_lag = pd.DataFrame(
        {
            "follower": pd.Series(dtype=top3["follower"].dtype),
            "multi_leader_lag": pd.Series(dtype=float),
        }
    )
else:
    multi_leader_lag = (
        top3.groupby("follower")
            .apply(weighted_lag)
            .reset_index(name="multi_leader_lag")
    )

print("‚úî multi-leader lag done:", multi_leader_lag.shape)

# ------------------------------------------------------------
# 6) Build & save the final FE v2 table (one row per item)
# ------------------------------------------------------------
# Key the multi-leader lag by item_id so both joins share the same key column.
lag_by_item = multi_leader_lag.rename(columns={"follower": "item_id"})

# Left joins keep every item from meta; items without a match get NaN features.
FE = meta.merge(pairs_hs3_best, on="item_id", how="left")
FE = FE.merge(lag_by_item, on="item_id", how="left")

print("üéâ Final FE v2:", FE.shape)
print(FE.head())

# Write the feature table next to the other processed datasets,
# creating the directory first if it does not exist yet.
FE_OUT_PATH = BASE_DIR.parents[1].joinpath("data", "processed", "v2_features_basic.csv")
FE_OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
FE.to_csv(FE_OUT_PATH, index=False)
print(f"üíæ Saved FE v2 to: {FE_OUT_PATH}")


Loading cleaned datasets...
‚úî Loaded: (10535, 9) (3734, 8)
pivot_item: (100, 43)
pivot_hs3: (54, 43)
‚úî meta feature done: (100, 4)
‚úî single HS3 leader done: (100, 4)
‚úî multi-leader lag done: (95, 2)
üéâ Final FE v2: (100, 8)
    item_id  zero_ratio    volatility        cv lead_hs3  best_lag_hs3  \
0  AANGBULD    0.325581  1.435857e+05  1.708994      300           3.0   
1  AHMDUILJ    0.000000  4.551154e+04  0.358842      600           6.0   
2  ANWUJOKX    0.813953  2.034624e+04  2.497477      520           3.0   
3  APQGTRMF    0.023256  2.095910e+05  0.814870      720           6.0   
4  ATLDMDBO    0.000000  3.234730e+07  0.539145      281           3.0   

   corr_hs3  multi_leader_lag  
0  0.592394          3.030900  
1  0.409002          4.000270  
2  0.636641          2.956122  
3  0.571589          1.323089  
4  0.728171          3.253572  
üíæ Saved FE v2 to: /data/ephemeral/home/data/processed/v2_features_basic.csv


In [9]:

# ============================================================
# 7) Create pair_train (baseline self-supervised)
# ============================================================

print("\nüß© Create pair_train from pivot_item...")

def safe_corr(x, y):
    """Pearson correlation of two equal-length 1-D sequences, 0.0 when undefined.

    Parameters
    ----------
    x, y : array-like of numbers
        Sequences of the same length.

    Returns
    -------
    float
        The correlation coefficient, or ``0.0`` when it cannot be computed:
        fewer than two observations, a zero-variance input, or a non-finite
        result (for example when an input contains NaN).

    Raises
    ------
    ValueError
        Propagated from ``np.corrcoef`` when the two inputs differ in length.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # A correlation needs at least two observations on each side.
    if x.size < 2 or y.size < 2:
        return 0.0
    # Zero variance would make corrcoef divide by zero.
    if np.std(x) == 0 or np.std(y) == 0:
        return 0.0

    corr = float(np.corrcoef(x, y)[0, 1])
    # NaN in the inputs propagates to the result; report it as "no correlation".
    return corr if np.isfinite(corr) else 0.0

months = pivot_item.columns.to_list()
n_months = len(months)

# Extract every item's float series once; the nested loop below reuses them.
series_by_item = {
    item_id: pivot_item.loc[item_id].values.astype(float)
    for item_id in pivot_item.index
}
# Items with fewer than 6 non-zero months are skipped as leader and as follower.
active_items = {
    item_id
    for item_id, values in series_by_item.items()
    if np.count_nonzero(values) >= 6
}
# Lags 1..6, restricted to those shorter than the series itself.
candidate_lags = [step for step in range(1, 7) if step < n_months]

pairs = []
for leader in tqdm(pivot_item.index):
    if leader not in active_items:
        continue
    x = series_by_item[leader]

    for follower in pivot_item.index:
        if follower == leader or follower not in active_items:
            continue
        y = series_by_item[follower]

        # Keep the lag with the largest |correlation|; the sign is preserved.
        # A pair whose correlation is 0.0 at every lag is not recorded.
        best_lag, best_corr = None, 0.0
        for lag in candidate_lags:
            corr = safe_corr(x[:-lag], y[lag:])
            if abs(corr) > abs(best_corr):
                best_lag, best_corr = lag, corr

        if best_lag is not None:
            pairs.append([leader, follower, best_lag, best_corr])

pair_df = pd.DataFrame(
    pairs,
    columns=["leading_item_id", "following_item_id", "best_lag", "max_corr"],
)

print("‚úî pair candidates:", pair_df.shape)

# ------------------------------------------------------------
# 8) Create pair_train targets
# ------------------------------------------------------------
# Each candidate pair is expanded into one supervised row per month t:
# follower value at t and t-1, leader value at t-lag, and the follower value
# at t+1 as the target.
# NOTE(review): best_lag was selected by correlating x[:-lag] with y[lag:]
# (leader at t-lag vs follower at t), while here the leader value at t-lag is
# paired with the follower target at t+1. Confirm this one-step offset is
# intended and matches how features are built at inference time.
pair_columns = ["leading_item_id", "following_item_id", "best_lag", "max_corr"]

rows = []
for L, F, raw_lag, raw_corr in pair_df[pair_columns].itertuples(index=False, name=None):
    lag = int(raw_lag)
    corr = float(raw_corr)

    xs = pivot_item.loc[L].values.astype(float)
    ys = pivot_item.loc[F].values.astype(float)

    # Start at t = max(lag, 1) so t-lag and t-1 are valid indices;
    # stop before the last month so t+1 exists.
    first_t = max(lag, 1)
    rows.extend(
        {
            "leading_item_id": L,
            "following_item_id": F,
            "b_t": ys[t],
            "b_t_1": ys[t - 1],
            "a_t_lag": xs[t - lag],
            "max_corr": corr,
            "best_lag": lag,
            "target": ys[t + 1],
        }
        for t in range(first_t, n_months - 1)
    )

pair_train = pd.DataFrame(rows)
print("‚úî pair_train:", pair_train.shape)

# ------------------------------------------------------------
# 9) pair_train x FE merge
# ------------------------------------------------------------
# Attach the follower item's FE features to every pair_train row. Renaming the
# FE key first lets the join use one shared key column, so no helper key
# column has to be dropped afterwards.
follower_features = FE.rename(columns={"item_id": "following_item_id"})
pair_train = pair_train.merge(follower_features, on="following_item_id", how="left")

PAIR_OUT_PATH = BASE_DIR.parents[1].joinpath("data", "processed", "pair_train_v2.csv")
PAIR_OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
pair_train.to_csv(PAIR_OUT_PATH, index=False)
print(f"üíæ Saved pair_train v2 to: {PAIR_OUT_PATH}")



üß© Create pair_train from pivot_item...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:04<00:00, 21.24it/s]


‚úî pair candidates: (8930, 4)
‚úî pair_train: (343354, 8)
üíæ Saved pair_train v2 to: /data/ephemeral/home/data/processed/pair_train_v2.csv
