In [1]:
"""
FE Notebook A ‚Äî Pair Ranking Pipeline
1) pair_df ÏÉùÏÑ±
2) score Í≥ÑÏÇ∞ + refine
3) TOP N ÏÑ†ÌÉù
4) leader-follower direction Í≤∞Ï†ï
5) direction_df_topN.csv Ï†ÄÏû•
"""

import numpy as np
import pandas as pd
from itertools import combinations
from scipy.spatial.distance import cosine
from scipy.signal import correlate
from fastdtw import fastdtw


In [2]:
# =========================================================
# Utility Ìï®ÏàòÎì§
# =========================================================

def safe_cosine(a, b):
    if np.all(a == 0) or np.all(b == 0):
        return 0.0
    return 1 - cosine(a, b)


def compute_cross_corr(x, y):
    x = (x - x.mean()) / (x.std() + 1e-9)
    y = (y - y.mean()) / (y.std() + 1e-9)
    c = correlate(x, y, mode="full")
    lag = c.argmax() - (len(x) - 1)
    return c.max(), lag


def compute_sign_agree(x, y):
    sx = np.sign(np.diff(x))
    sy = np.sign(np.diff(y))
    return (sx == sy).mean()


def compute_dtw(a, b):
    dist, _ = fastdtw(a, b)
    return dist


def hs_distance(h1, h2):
    if h1 == h2: return 0
    if h1[:3] == h2[:3]: return 1
    if h1[:2] == h2[:2]: return 2
    return 3


# =========================================================
# 1) pair_df ÏÉùÏÑ±
# =========================================================

def build_pair_df(monthly):

    monthly = monthly.copy()
    monthly["t"] = (monthly["year"] - monthly["year"].min()) * 12 + monthly["month"]

    monthly_agg = monthly.groupby(["item_id", "t"], as_index=False).agg(
        total_value=("total_value", "sum"),
        total_weight=("total_weight", "sum")
    )

    ts_val = monthly_agg.pivot(index="item_id", columns="t", values="total_value").fillna(0)
    ts_wgt = monthly_agg.pivot(index="item_id", columns="t", values="total_weight").fillna(0)

    items = ts_val.index.tolist()

    meta = monthly.drop_duplicates("item_id").copy()
    meta["hs4"] = meta["hs4"].astype(str).str.zfill(4)
    meta["hs3"] = meta["hs3"].astype(str)
    meta["hs2"] = meta["hs2"].astype(str)
    meta = meta.set_index("item_id")

    pair_list = []

    for item_i, item_j in combinations(items, 2):
        v_i, v_j = ts_val.loc[item_i].values, ts_val.loc[item_j].values
        w_i, w_j = ts_wgt.loc[item_i].values, ts_wgt.loc[item_j].values

        cos_val = safe_cosine(v_i, v_j)
        cos_wgt = safe_cosine(w_i, w_j)
        cc_val, lag_val = compute_cross_corr(v_i, v_j)
        sign_agree = compute_sign_agree(v_i, v_j)
        dtw_dist = compute_dtw(v_i, v_j)
        dtw_sim = 1 / (1 + dtw_dist)
        hs_dist = hs_distance(meta.loc[item_i, "hs4"], meta.loc[item_j, "hs4"])
        same_cluster = int(meta.loc[item_i, "cluster_wv"] == meta.loc[item_j, "cluster_wv"])

        pair_list.append([
            item_i, item_j,
            cos_val, cos_wgt, sign_agree,
            cc_val, lag_val,
            dtw_dist, dtw_sim,
            hs_dist,
            meta.loc[item_i, "hs4"], meta.loc[item_j, "hs4"],
            meta.loc[item_i, "hs3"], meta.loc[item_j, "hs3"],
            meta.loc[item_i, "hs2"], meta.loc[item_j, "hs2"],
            meta.loc[item_i, "cluster_wv"], meta.loc[item_j, "cluster_wv"],
            same_cluster
        ])

    cols = [
        "item_i", "item_j",
        "cos_val", "cos_wgt", "sign_agree_val",
        "cc_val", "lag_val",
        "dtw_dist", "dtw_sim",
        "hs_dist",
        "hs4_i", "hs4_j",
        "hs3_i", "hs3_j",
        "hs2_i", "hs2_j",
        "cluster_i", "cluster_j",
        "same_cluster"
    ]

    return pd.DataFrame(pair_list, columns=cols)


# =========================================================
# 2) Normalize + score + refine
# =========================================================

def normalize_features(df):
    df = df.copy()
    df["dtw_norm"] = (df["dtw_dist"] - df["dtw_dist"].min()) / (df["dtw_dist"].max() - df["dtw_dist"].min() + 1e-9)
    df["hs_dist_norm"] = df["hs_dist"].astype(float)
    df["cluster_sim"] = df["same_cluster"].astype(float)
    return df


def compute_lag_direction_score(lag):
    if lag in [1, 2]: return 1.0
    if lag == 0: return 0.0
    if lag < 0: return -1.0
    return 0.5


def compute_pair_score_v1(df):
    df = df.copy()
    df["lag_dir"] = df["lag_val"].apply(compute_lag_direction_score)
    df["score_v1"] = (
          0.30 * df["cos_val"]
        + 0.10 * df["cos_wgt"]
        + 0.20 * df["cc_val"]
        + 0.15 * df["sign_agree_val"]
        - 0.10 * df["dtw_norm"]
        - 0.10 * df["hs_dist_norm"]
        + 0.05 * df["cluster_sim"]
        + 0.10 * df["lag_dir"]
    )
    return df


def refine_pairs(df):
    df = df.copy()
    # 1Ï∞® Í±∞Î•∏ ÌïÑÌÑ∞ (ÎÖ∏Ïù¥Ï¶à Ï†úÍ±∞Ïö©)
    df = df[df["cos_val"] > -0.05]
    df = df[df["sign_agree_val"] >= 0.30]

    # cc_val Ïä§ÌååÏù¥ÌÅ¨ + cos_val ÎÇÆÏùÄ Í≤ΩÏö∞ Ï†úÍ±∞ (Í∏âÎì±/Í∏âÎùΩ Ìïú Î≤àÎßå ÎßûÏùÄ Í≤É)
    spike_mask = (df["cc_val"] > df["cc_val"].quantile(0.98)) & (df["cos_val"] < 0.05)
    df = df[~spike_mask]

    return df


# =========================================================
# 2-1) TOP_N ÏûêÎèô ÏÑ†ÌÉù (ÌïµÏã¨)
# =========================================================

def auto_select_top_pairs(df,
                          min_pairs=200,
                          max_pairs=1500,
                          fallback_top=500):
    """
    score_v1 Í∏∞Ï§ÄÏúºÎ°ú ÏÉÅÏúÑ Íµ¨Í∞ÑÏóêÏÑú ÏûêÎèôÏúºÎ°ú N Í≤∞Ï†ï.
    - Ïö∞ÏÑ† high-quantile(0.99 ~ 0.90) Íµ¨Í∞ÑÏóêÏÑú
      min_pairs ~ max_pairs ÏÇ¨Ïù¥Í∞Ä ÎêòÎèÑÎ°ù threshold ÏÑ†ÌÉù
    - Î™ª Ï∞æÏúºÎ©¥ fallback_topÍ∞ú ÏÇ¨Ïö©
    """
    df = df.copy()

    # score_v1 high ‚Üí Ï¢ãÏùå
    scores = df["score_v1"]
    quantiles = [0.99, 0.985, 0.98, 0.975, 0.97,
                 0.965, 0.96, 0.955, 0.95, 0.94, 0.93, 0.92, 0.90]

    chosen = None
    chosen_q = None

    for q in quantiles:
        thr = scores.quantile(q)
        cand = df[df["score_v1"] >= thr]
        n = len(cand)
        print(f"[auto-N] q={q:.3f}, thr={thr:.4f}, pairs={n}")
        if min_pairs <= n <= max_pairs:
            chosen = cand
            chosen_q = q
            break

    if chosen is None:
        # fallback: Îã®ÏàúÌûà ÏÉÅÏúÑ fallback_topÍ∞ú
        chosen = df.sort_values("score_v1", ascending=False).head(fallback_top)
        chosen_q = None
        print(f"[auto-N] Ï†ÅÎãπÌïú qÎ•º Î™ª Ï∞æÏïÑÏÑú ÏÉÅÏúÑ {fallback_top}Í∞ú ÏÇ¨Ïö©")

    chosen = chosen.sort_values("score_v1", ascending=False).reset_index(drop=True)

    print("===================================")
    if chosen_q is not None:
        print(f"[auto-N] ÏµúÏ¢Ö ÏÑ†ÌÉù: q={chosen_q:.3f}, N={len(chosen)}")
    else:
        print(f"[auto-N] ÏµúÏ¢Ö ÏÑ†ÌÉù: fallback, N={len(chosen)}")
    print("score_v1 Î≤îÏúÑ:", chosen["score_v1"].min(), "~", chosen["score_v1"].max())
    print("===================================")

    return chosen


# =========================================================
# 3) Direction Í≤∞Ï†ï
# =========================================================

def compute_trend_slope(ts):
    x = np.arange(len(ts))
    if np.std(ts) < 1e-9:
        return 0.0
    return np.polyfit(x, ts, 1)[0]


def recent_growth(ts, w=6):
    if len(ts) < w+1: return 0.0
    return ts[-1] - ts[-w]


def determine_direction(row, ts_val):

    i, j = row["item_i"], row["item_j"]
    lag = row["lag_val"]
    v_i = ts_val.loc[i].values
    v_j = ts_val.loc[j].values

    # 1) lag Í∏∞Î∞ò: ÏñëÏàòÎ©¥ i‚Üíj, ÏùåÏàòÎ©¥ j‚Üíi
    if lag > 0: return i, j
    if lag < 0: return j, i

    # 2) lag==0Ïùº Îïå Î≥¥Ï°∞ Í∏∞Ï§Ä

    # (1) ÏµúÍ∑º 6Í∞úÏõî growth
    g_i, g_j = recent_growth(v_i), recent_growth(v_j)
    if g_i != g_j:
        return (i, j) if g_i > g_j else (j, i)

    # (2) Ï†ÑÍ∏∞Í∞Ñ Í∏∞Ïö∏Í∏∞
    s_i, s_j = compute_trend_slope(v_i), compute_trend_slope(v_j)
    if s_i != s_j:
        return (i, j) if s_i > s_j else (j, i)

    # (3) ÎßàÏßÄÎßâ Í∞í(Í∑úÎ™®)
    return (i, j) if v_i[-1] > v_j[-1] else (j, i)


def assign_directions(top_pairs, monthly):

    monthly = monthly.copy()
    monthly["t"] = (monthly["year"] - monthly["year"].min())*12 + monthly["month"]

    monthly = monthly.groupby(["item_id", "t"], as_index=False).agg(
        total_value=("total_value", "sum")
    )
    ts_val = monthly.pivot(index="item_id", columns="t", values="total_value").fillna(0)

    leaders, followers = [], []

    for _, row in top_pairs.iterrows():
        L, F = determine_direction(row, ts_val)
        leaders.append(L)
        followers.append(F)

    out = top_pairs.copy()
    out["leader"] = leaders
    out["follower"] = followers

    return out


# =========================================================
# Ïã§Ìñâ
# =========================================================

monthly = pd.read_csv("/data/ephemeral/home/data/processed/train_monthly_v2.csv")

print("üìå Building pair_df...")
pair_df = build_pair_df(monthly)
print("pair_df shape:", pair_df.shape)

pair_df = normalize_features(pair_df)
pair_df = compute_pair_score_v1(pair_df)
pair_df = refine_pairs(pair_df)

print("üìå After refine:", pair_df.shape)

# üî• Ïó¨Í∏∞ÏÑú N ÏûêÎèô Í≤∞Ï†ï
top_pairs = auto_select_top_pairs(
    pair_df,
    min_pairs=200,
    max_pairs=600,
    fallback_top=500
)

direction_df = assign_directions(top_pairs, monthly)

pair_df.to_csv("pair_df_v1.csv", index=False)
direction_df.to_csv("direction_df_autoN.csv", index=False)

print("üéâ Saved:")
print(" - pair_df_v1.csv")
print(" - direction_df_autoN.csv")
print("ÏµúÏ¢Ö ÏÑ†ÌÉùÎêú pair Ïàò:", len(direction_df))


üìå Building pair_df...
pair_df shape: (4950, 19)
üìå After refine: (4073, 24)
[auto-N] q=0.990, thr=6.2989, pairs=41
[auto-N] q=0.985, thr=6.0485, pairs=62
[auto-N] q=0.980, thr=5.8315, pairs=82
[auto-N] q=0.975, thr=5.6631, pairs=102
[auto-N] q=0.970, thr=5.5361, pairs=123
[auto-N] q=0.965, thr=5.4189, pairs=143
[auto-N] q=0.960, thr=5.3494, pairs=163
[auto-N] q=0.955, thr=5.2126, pairs=184
[auto-N] q=0.950, thr=5.1511, pairs=204
[auto-N] ÏµúÏ¢Ö ÏÑ†ÌÉù: q=0.950, N=204
score_v1 Î≤îÏúÑ: 5.153577862711564 ~ 8.316253836905712
üéâ Saved:
 - pair_df_v1.csv
 - direction_df_autoN.csv
ÏµúÏ¢Ö ÏÑ†ÌÉùÎêú pair Ïàò: 204
