In [1]:
import numpy as np
import pandas as pd
from itertools import combinations
import warnings
warnings.filterwarnings("ignore")


In [2]:


# ============================================================
# 0. Load monthly_full (preprocessing output)
# ============================================================
# Monthly per-item table; later sections read it through columns such as
# item_id, t, total_value and total_weight.
monthly = pd.read_csv("../../data/processed/train_monthly_v3_eda.csv")

# Distinct item ids, in order of first appearance in the file.
items = pd.unique(monthly["item_id"])
print(f"üü¢ total items: {len(items)}")


# ============================================================
# 1. Pivot WITHOUT fillna(0)
#    -> NaN is kept on purpose for the corr / lag-corr computation
# ============================================================
# Wide tables: one row per t, one column per item_id. (t, item_id)
# combinations that are absent from `monthly` stay NaN.
pivot_value = monthly.pivot(index="t", columns="item_id", values="total_value")
pivot_weight = monthly.pivot(index="t", columns="item_id", values="total_weight")

# ratio = value / weight; where weight == 0 the ratio must stay NaN.
# Dividing by (weight + 1e-9) does NOT do that: it yields value / 1e-9, an
# enormous outlier that dominates the Pearson correlation computed on this
# table. Zero weights are therefore masked to NaN before dividing.
pivot_ratio = pivot_value / pivot_weight.replace(0, np.nan)

# Per-item z-score used by the lag correlation (NaN is kept). The small
# epsilon avoids a division by zero for constant series.
pivot_norm = (pivot_value - pivot_value.mean()) / (pivot_value.std() + 1e-6)


# ============================================================
# 2. meta table (HS / cluster / season / rare)
# ============================================================
# One row per item_id (the groupby key becomes the index): first value of
# the HS codes and cluster label, mean of the seasonality indices, and the
# max of the rare-item flag.
by_item = monthly.groupby("item_id")
item_meta = by_item.agg({
    "hs2": "first",
    "hs3": "first",
    "hs4": "first",
    "cluster_wv": "first",
    "item_season_idx": "mean",
    "hs4_season_idx": "mean",
    "rare_item_flag": "max",
})

# Coefficient of variation (std / mean) of total_value per item. A zero mean
# is masked to NaN first so the division cannot blow up; undefined CVs are
# then stored as 0.
value_by_item = by_item["total_value"]
item_cv = value_by_item.std() / value_by_item.mean().replace(0, np.nan)
item_meta["value_cv"] = item_cv.fillna(0)


# ============================================================
# 3. B item time-series features (exact approach)
# ============================================================
# For item B the rolling features are taken from its most recent month row;
# the original author's intent is that this avoids leaking future data.
# NOTE(review): that only holds if the value_roll_* / lag columns in the CSV
# were themselves computed without look-ahead -- confirm in preprocessing.
monthly_sorted = monthly.sort_values(["item_id", "t"])
b_last = (
    monthly_sorted
    .groupby("item_id")
    .tail(1)  # last row per item after sorting by t = most recent month
    .set_index("item_id")
)

# Keep only the columns that are needed downstream.
b_feature_columns = [
    "value_roll_mean_3",
    "value_roll_mean_6",
    "value_roll_std_3",
    "total_value_lag1",
]
b_feats = b_last[b_feature_columns]

# Index-on-index left join: every item in item_meta is kept.
item_meta = item_meta.merge(b_feats, how="left", left_index=True, right_index=True)


# ============================================================
# 4. Lag correlation
# ============================================================
def lag_corr(a, b, max_lag=6, min_periods=2):
    """Return the lag in ``0..max_lag`` with the highest Pearson correlation.

    Both series are read from the module-level ``pivot_norm`` table. For a
    given ``lag`` the values ``a[t + lag]`` are paired with ``b[t]``.
    NOTE(review): this alignment means ``b`` leads ``a`` by ``lag`` rows --
    confirm this is the intended direction for the pair features.

    ``pivot_norm`` keeps NaN for missing months, and ``np.corrcoef`` returns
    NaN as soon as a single NaN is present. Rows where either value is
    non-finite are therefore dropped pairwise before the correlation is
    computed, instead of discarding the whole lag.

    Parameters
    ----------
    a, b : hashable
        Column labels of ``pivot_norm``.
    max_lag : int, default 6
        Largest lag (in rows of ``pivot_norm``) to evaluate.
    min_periods : int, default 2
        Minimum number of overlapping finite observations required for a lag
        to be considered. Values below 2 are treated as 2 because a
        correlation needs at least two points.

    Returns
    -------
    tuple
        ``(best_lag, best_corr)``. The signed (not absolute) correlation is
        maximised; ties keep the smallest lag. When no lag has a defined
        correlation the result is ``(0, nan)`` rather than a numeric sentinel,
        so the value cannot be mistaken for a real correlation.
    """
    x = np.asarray(pivot_norm[a], dtype=float)
    y = np.asarray(pivot_norm[b], dtype=float)
    n_obs = len(x)
    required = max(int(min_periods), 2)

    best_corr = np.nan
    best_lag = 0

    for lag in range(max_lag + 1):
        if lag >= n_obs:
            # No overlap is left for this or any larger lag.
            break

        xs = x[lag:]
        ys = y[:n_obs - lag]

        # Pairwise-complete observations only.
        valid = np.isfinite(xs) & np.isfinite(ys)
        if valid.sum() < required:
            continue
        xs = xs[valid]
        ys = ys[valid]

        # A constant series has no defined correlation (and would make
        # np.corrcoef emit a divide warning), so skip it explicitly.
        if xs.min() == xs.max() or ys.min() == ys.max():
            continue

        corr = np.corrcoef(xs, ys)[0, 1]
        if np.isnan(corr):
            continue

        if np.isnan(best_corr) or corr > best_corr:
            best_corr = corr
            best_lag = lag

    return best_lag, best_corr


# ============================================================
# 5. Pair loop
# ============================================================
# Same-month Pearson correlation matrices between items, for raw value and
# for the value/weight ratio.
corr_value = pivot_value.corr()
corr_ratio = pivot_ratio.corr()


def hs_dist(a, b):
    """HS-code distance of two items: 0 = same hs4, 1 = same hs3, 2 = same hs2, else 3."""
    for distance, level in enumerate(("hs4", "hs3", "hs2")):
        if item_meta.loc[a, level] == item_meta.loc[b, level]:
            return distance
    return 3


# item_meta fields copied from item B into each pair row (suffixed with "_B").
b_meta_fields = [
    "value_roll_mean_3",
    "value_roll_mean_6",
    "value_roll_std_3",
    "total_value_lag1",
    "value_cv",
    "hs4_season_idx",
    "item_season_idx",
    "rare_item_flag",
]

pair_columns = [
    "A", "B",
    "hs4_same", "hs3_same", "hs_distance",
    "cluster_sim",
    "corr_same_month", "corr_ratio",
    "best_lag", "best_lagcorr",
] + [f"{field}_B" for field in b_meta_fields]

# NOTE(review): combinations() yields each unordered pair once, so only one
# (A, B) orientation per pair is emitted -- confirm that is sufficient for
# the downstream use of these features.
pairs = []
for A, B in combinations(items, 2):
    # HS-code agreement flags and graded distance.
    hs4_same = int(item_meta.loc[A, "hs4"] == item_meta.loc[B, "hs4"])
    hs3_same = int(item_meta.loc[A, "hs3"] == item_meta.loc[B, "hs3"])
    hs_distance = hs_dist(A, B)

    # 1 when both items carry the same cluster label.
    cluster_sim = int(item_meta.loc[A, "cluster_wv"] == item_meta.loc[B, "cluster_wv"])

    # Best lag and its correlation on the normalised value series.
    lag, lagcorr = lag_corr(A, B)

    # Full meta row of item B; individual fields are read from this row.
    Bmeta = item_meta.loc[B]

    row = [
        A, B,
        hs4_same, hs3_same, hs_distance,
        cluster_sim,
        corr_value.loc[A, B], corr_ratio.loc[A, B],
        lag, lagcorr,
    ]
    row.extend(Bmeta[field] for field in b_meta_fields)
    pairs.append(row)


pair_df = pd.DataFrame(pairs, columns=pair_columns)

pair_df.to_csv("pair_features_v2.csv", index=False)
print("üçì pair_features_v2.csv ÏÉùÏÑ± ÏôÑÎ£å!")


üü¢ total items: 100
üçì pair_features_v2.csv ÏÉùÏÑ± ÏôÑÎ£å!
