# In [None]:
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
import warnings
warnings.filterwarnings("ignore")



# ============================================================
# 0. Load monthly_full (preprocessing result)
# ============================================================
# Long-format monthly table written by the preprocessing step; the code below
# reads its item_id, t, total_value, total_weight, hs2/hs3/hs4 and cluster_wv
# columns.
monthly = pd.read_csv("../../data/processed/train_monthly_v3_eda.csv")

# Distinct item ids, in order of first appearance.
items = monthly.loc[:, "item_id"].unique()
N = items.shape[0]
print(f"üü¢ total items: {N}")

# Wide tables for the correlation features: one row per `t`, one column per
# `item_id`. (t, item) combinations missing from `monthly` are filled with 0.
pivot_value = monthly.pivot(index="t", columns="item_id", values="total_value").fillna(0)
pivot_weight = monthly.pivot(index="t", columns="item_id", values="total_weight").fillna(0)

# Value per unit of weight. The ratio is undefined where the weight is zero, so
# mask those cells to NaN instead of dividing by (weight + epsilon): with an
# epsilon, any non-zero value over a zero weight becomes a ~1e9-scale outlier
# that dominates the ratio correlations computed later.
ratio = pivot_value / pivot_weight.where(pivot_weight != 0)
# Undefined ratios are encoded as 0.
pivot_ratio = ratio.fillna(0)

# Column-wise standardisation of the value table (input of lag_corr): subtract
# each column's mean and divide by its standard deviation. The 1e-6 added to
# the standard deviation avoids a division by zero for constant columns.
pivot_norm = pivot_value.sub(pivot_value.mean(), axis="columns").div(
    pivot_value.std() + 1e-6, axis="columns"
)

# Per-item lookup table indexed by item_id: the first non-null hs2 / hs3 / hs4
# code and cluster_wv label recorded for each item. groupby already returns the
# group key as the index, so no reset/set round trip is needed.
hs_info = (
    monthly.groupby("item_id")[["hs2", "hs3", "hs4", "cluster_wv"]]
    .first()
)


def hs_distance(a, b):
    """Return how far apart items *a* and *b* are in the HS code hierarchy.

    The levels are checked from most to least specific using the module-level
    ``hs_info`` table: 0 if the hs4 codes match, 1 if the hs3 codes match,
    2 if the hs2 codes match, and 3 if none of them match.
    """
    for distance, level in enumerate(("hs4", "hs3", "hs2")):
        if hs_info.loc[a, level] == hs_info.loc[b, level]:
            return distance
    return 3


# Item-by-item Pearson correlation matrices over the rows of the wide tables:
# one for the raw values, one for the value/weight ratios.
corr_mat = pivot_value.corr(method="pearson")
ratio_corr_mat = pivot_ratio.corr(method="pearson")


def lag_corr(a, b, max_lag=6):
    """Find the lag at which item *a* correlates best with an earlier *b*.

    For each lag ``k`` in ``0..max_lag`` the series of *a* shifted ``k`` rows
    forward is compared with the series of *b* (``a[t + k]`` is paired with
    ``b[t]``), using the columns of the module-level ``pivot_norm`` table.
    Only this direction is tested; call with the arguments swapped for the
    opposite one. The largest signed (not absolute) correlation wins, and ties
    keep the smallest lag.

    Args:
        a: Column label of the lagging item in ``pivot_norm``.
        b: Column label of the leading item in ``pivot_norm``.
        max_lag: Largest lag, in rows, to try.

    Returns:
        ``(best_lag, best_corr)``. When no lag yields a finite correlation
        (for example because one series is constant) the result is
        ``(0, nan)``, matching the NaN that ``DataFrame.corr`` reports for
        such pairs instead of leaking a sentinel value into the features.
    """
    x = pivot_norm[a].to_numpy()
    y = pivot_norm[b].to_numpy()

    best_lag = 0
    best_corr = np.nan

    # A correlation needs at least two paired observations, so never shift
    # further than the series length allows.
    last_lag = min(max_lag, len(x) - 2)

    for lag in range(0, last_lag + 1):
        shifted_x = x[lag:]
        shifted_y = y[: len(y) - lag]

        # Constant slices give 0/0 inside corrcoef; the NaN result is handled
        # below, so silence the accompanying floating-point warnings here.
        with np.errstate(divide="ignore", invalid="ignore"):
            corr = np.corrcoef(shifted_x, shifted_y)[0, 1]

        if np.isnan(corr):
            continue
        if np.isnan(best_corr) or corr > best_corr:
            best_corr = corr
            best_lag = lag

    return best_lag, best_corr


# Row-to-row change of every item's value. The first row has no predecessor,
# so its difference is set to 0.
# NOTE(review): that zero-filled first row compares equal for every pair in
# sign_agree, so it always counts as one agreement -- confirm this is intended.
pivot_diff = pivot_value.diff(periods=1).fillna(value=0)

def sign_agree(a, b):
    """Return the share of rows where items *a* and *b* move the same way.

    The direction of each row is the sign (-1, 0 or +1) of the item's column in
    the module-level ``pivot_diff`` table. Rows where both items are flat
    (sign 0 for both) count as agreement.
    """
    direction_a = np.sign(pivot_diff[a].to_numpy())
    direction_b = np.sign(pivot_diff[b].to_numpy())
    return np.equal(direction_a, direction_b).mean()


def cluster_sim(a, b):
    """Return 1 if items *a* and *b* share a ``cluster_wv`` label, else 0.

    Labels are read from the module-level ``hs_info`` table.
    """
    cluster_of = hs_info["cluster_wv"]
    same_cluster = cluster_of.loc[a] == cluster_of.loc[b]
    return int(same_cluster)


def _pair_feature_row(item_a, item_b):
    """Build the feature row for one unordered item pair.

    The values are returned in the column order used for ``pair_df``.
    """
    # HS hierarchy: one 0/1 flag per level, most specific level first.
    hs_flags = [
        int(hs_info.loc[item_a, level] == hs_info.loc[item_b, level])
        for level in ("hs4", "hs3", "hs2")
    ]

    # Cluster labels, whether they match, and their absolute difference.
    cluster_a = hs_info.loc[item_a, "cluster_wv"]
    cluster_b = hs_info.loc[item_b, "cluster_wv"]

    # Best lagged correlation (item_a following item_b).
    best_lag, best_lagcorr = lag_corr(item_a, item_b)

    return [
        item_a, item_b,
        *hs_flags, hs_distance(item_a, item_b),
        cluster_a, cluster_b,
        int(cluster_a == cluster_b), abs(cluster_a - cluster_b),
        corr_mat.loc[item_a, item_b], ratio_corr_mat.loc[item_a, item_b],
        best_lag, best_lagcorr,
        sign_agree(item_a, item_b),
    ]


# One row per unordered pair of distinct items.
pairs = [_pair_feature_row(A, B) for A, B in combinations(items, 2)]

pair_df = pd.DataFrame(
    pairs,
    columns=[
        "A", "B",
        "hs4_same", "hs3_same", "hs2_same", "hs_distance",
        "cluster_A", "cluster_B", "cluster_sim", "cluster_distance",
        "corr_same_month", "corr_ratio",
        "best_lag", "best_lagcorr",
        "sign_agreement",
    ],
)

# Write the pair feature table to the working directory, without the row
# index, then print a completion message.
output_path = "pair_features_final.csv"
pair_df.to_csv(output_path, index=False)
print("üçì pair_features_final.csv ÏÉùÏÑ± ÏôÑÎ£å!")


