In [1]:
import numpy as np
import pandas as pd
from itertools import combinations
from scipy.spatial.distance import cosine
from scipy.signal import correlate
from fastdtw import fastdtw


In [2]:
def safe_cosine(a, b):
    if np.all(a == 0) or np.all(b == 0):
        return 0.0
    return 1 - cosine(a, b)


In [3]:
def compute_cross_corr(x, y):
    x = (x - x.mean()) / (x.std() + 1e-9)
    y = (y - y.mean()) / (y.std() + 1e-9)

    c = correlate(x, y, mode='full')
    lag = c.argmax() - (len(x) - 1)
    return c.max(), lag


In [4]:
def compute_sign_agree(x, y):
    sx = np.sign(np.diff(x))
    sy = np.sign(np.diff(y))
    return (sx == sy).mean()


In [5]:
def compute_dtw(a, b):
    dist, _ = fastdtw(a, b)
    return dist


In [6]:
def hs_distance(h1, h2):
    if h1 == h2: return 0
    if h1[:3] == h2[:3]: return 1
    if h1[:2] == h2[:2]: return 2
    return 3


In [17]:
def build_pair_df(monthly):

    print("üìå Fixing time index (year, month ‚Üí t=1..43)")

    # Ïó∞ÏõîÏùÑ ÌïòÎÇòÏùò ÏãúÍ≥ÑÏó¥ Ïù∏Îç±Ïä§Î°ú Î≥ÄÌôò
    monthly = monthly.copy()
    monthly["t"] = (monthly["year"] - monthly["year"].min()) * 12 + monthly["month"]

    # Î®ºÏ†Ä Ï§ëÎ≥µÏùÑ Ï†úÍ±∞ÌïòÍ∏∞ ÏúÑÌïú groupby sum
    print("üì¶ Aggregating duplicated item_id-year-month ...")
    monthly_agg = (monthly
                   .groupby(["item_id", "t"], as_index=False)
                   .agg(total_value=("total_value", "sum"),
                        total_weight=("total_weight", "sum"))
                  )

    print("üìä Pivoting... (item_id √ó 43 time points)")

    ts_val = monthly_agg.pivot(index="item_id", columns="t", values="total_value").fillna(0)
    ts_wgt = monthly_agg.pivot(index="item_id", columns="t", values="total_weight").fillna(0)

    items = ts_val.index.tolist()
    meta = monthly.drop_duplicates("item_id").copy()
    meta["hs4"] = meta["hs4"].astype(str).str.zfill(4)
    meta["hs3"] = meta["hs3"].astype(str)
    meta["hs2"] = meta["hs2"].astype(str)
    meta = meta.set_index("item_id")

    pair_list = []
    total_pairs = len(items)*(len(items)-1)//2
    print(f"üì¶ Total pairs to process: {total_pairs}")

    cnt = 0
    for item_i, item_j in combinations(items, 2):
        v_i, v_j = ts_val.loc[item_i].values, ts_val.loc[item_j].values
        w_i, w_j = ts_wgt.loc[item_i].values, ts_wgt.loc[item_j].values

        cos_val = safe_cosine(v_i, v_j)
        cos_wgt = safe_cosine(w_i, w_j)
        cc_val, lag_val = compute_cross_corr(v_i, v_j)
        sign_agree = compute_sign_agree(v_i, v_j)
        dtw_dist = compute_dtw(v_i, v_j)
        dtw_sim = 1 / (1 + dtw_dist)
        hs_dist = hs_distance(meta.loc[item_i, "hs4"], meta.loc[item_j, "hs4"])
        same_cluster = int(meta.loc[item_i, "cluster_wv"] == meta.loc[item_j, "cluster_wv"])

        pair_list.append([
            item_i, item_j,
            cos_val, cos_wgt, sign_agree,
            cc_val, lag_val,
            dtw_dist, dtw_sim,
            hs_dist,
            meta.loc[item_i, "hs4"], meta.loc[item_j, "hs4"],
            meta.loc[item_i, "hs3"], meta.loc[item_j, "hs3"],
            meta.loc[item_i, "hs2"], meta.loc[item_j, "hs2"],
            meta.loc[item_i, "cluster_wv"], meta.loc[item_j, "cluster_wv"],
            same_cluster
        ])

        cnt += 1
        if cnt % 500 == 0:
            print(f"‚è≥ processed pairs: {cnt}/{total_pairs}")

    cols = [
        "item_i", "item_j",
        "cos_val", "cos_wgt", "sign_agree_val",
        "cc_val", "lag_val",
        "dtw_dist", "dtw_sim",
        "hs_dist",
        "hs4_i", "hs4_j",
        "hs3_i", "hs3_j",
        "hs2_i", "hs2_j",
        "cluster_i", "cluster_j",
        "same_cluster"
    ]

    pair_df = pd.DataFrame(pair_list, columns=cols)
    print("‚úÖ pair_df ÏÉùÏÑ± ÏôÑÎ£å:", pair_df.shape)

    return pair_df


In [8]:
def minmax_norm(x):
    return (x - x.min()) / (x.max() - x.min() + 1e-9)

def normalize_features(pair_df):
    pair_df = pair_df.copy()
    pair_df["dtw_norm"] = minmax_norm(pair_df["dtw_dist"])
    pair_df["hs_dist_norm"] = pair_df["hs_dist"].astype(float)
    pair_df["cluster_sim"] = pair_df["same_cluster"].astype(float)
    return pair_df


In [9]:
def compute_lag_direction_score(lag):
    if lag in [1, 2]:
        return 1.0
    elif lag == 0:
        return 0.0
    elif lag < 0:
        return -1.0
    else:
        return 0.5


In [10]:
def compute_pair_score_v1(df):
    df = df.copy()
    df["lag_dir"] = df["lag_val"].apply(compute_lag_direction_score)

    df["score_v1"] = (
          0.30 * df["cos_val"]
        + 0.10 * df["cos_wgt"]
        + 0.20 * df["cc_val"]
        + 0.15 * df["sign_agree_val"]
        - 0.10 * df["dtw_norm"]
        - 0.10 * df["hs_dist_norm"]
        + 0.05 * df["cluster_sim"]
        + 0.10 * df["lag_dir"]
    )
    return df


In [11]:
def apply_pair_filters(df):
    df = df.copy()

    mask = (
        (df["cos_val"] > -0.15) &
        (df["cc_val"] > 0) &
        (df["sign_agree_val"] >= 0.30) &
        (df["dtw_norm"] <= df["dtw_norm"].quantile(0.90))
    )

    filtered = df[mask].copy()
    filtered.loc[filtered["hs_dist"] == 3, "score_v1"] *= 0.90
    return filtered


In [12]:
def run_pair_ranking_pipeline(pair_df):
    print("üîß Normalizing features...")
    pair_df = normalize_features(pair_df)

    print("üìä Computing rule-based score_v1...")
    pair_df = compute_pair_score_v1(pair_df)

    print("üßπ Applying noise filtering...")
    filtered_df = apply_pair_filters(pair_df)

    print(f"‚≠ê Filtering ÌõÑ pair Ïàò: {len(filtered_df)} / {len(pair_df)}")

    print("\nüéØ score_v1 ÏÉÅÏúÑ 10Í∞ú pair:")
    print(filtered_df.sort_values("score_v1", ascending=False).head(10))

    return filtered_df


In [18]:
monthly = pd.read_csv("/data/ephemeral/home/data/processed/train_monthly_v2.csv")
pair_df = build_pair_df(monthly)
pair_df.to_csv("pair_df_v1.csv", index=False)

filtered_pairs = run_pair_ranking_pipeline(pair_df)


üìå Fixing time index (year, month ‚Üí t=1..43)
üì¶ Aggregating duplicated item_id-year-month ...
üìä Pivoting... (item_id √ó 43 time points)
üì¶ Total pairs to process: 4950
‚è≥ processed pairs: 500/4950
‚è≥ processed pairs: 1000/4950
‚è≥ processed pairs: 1500/4950
‚è≥ processed pairs: 2000/4950
‚è≥ processed pairs: 2500/4950
‚è≥ processed pairs: 3000/4950
‚è≥ processed pairs: 3500/4950
‚è≥ processed pairs: 4000/4950
‚è≥ processed pairs: 4500/4950
‚úÖ pair_df ÏÉùÏÑ± ÏôÑÎ£å: (4950, 19)
üîß Normalizing features...
üìä Computing rule-based score_v1...
üßπ Applying noise filtering...
‚≠ê Filtering ÌõÑ pair Ïàò: 3642 / 4950

üéØ score_v1 ÏÉÅÏúÑ 10Í∞ú pair:
        item_i    item_j   cos_val   cos_wgt  sign_agree_val     cc_val  \
3263  KFQSHBNH  RJCAXSGH  0.920558  0.916607        0.952381  39.030389   
3813  NAQIHUKZ  XIIEJNEE  0.004889  0.022170        0.380952  40.112320   
2861  IGDVVKUD  XIIEJNEE  0.811912  0.789449        0.428571  35.240837   
1490  DJBLNPNC  LLHREMKS  0.273