In [1]:
import numpy as np
import pandas as pd


In [2]:
# ============================================================
# 0) Helper functions
# ============================================================
def minmax_norm(x):
    """Rescale ``x`` by its own minimum and range.

    ``x`` must expose ``.min()`` / ``.max()`` and support element-wise
    arithmetic (the notebook passes a pandas Series). A small constant
    (1e-9) is added to the range so a constant input yields zeros instead
    of dividing by zero; as a consequence the maximum maps to a value just
    below 1 rather than exactly 1.
    """
    lowest, highest = x.min(), x.max()
    return (x - lowest) / (highest - lowest + 1e-9)


# ============================================================
# 1) Normalization functions
# ============================================================
def normalize_features(pair_df):
    """Return a copy of ``pair_df`` with three derived feature columns.

    Columns added (or overwritten if already present):
      * ``dtw_norm``     - ``dtw_dist`` passed through ``minmax_norm``.
      * ``hs_dist_norm`` - ``hs_dist`` cast to float; despite the name the
        value is used as-is and is not rescaled.
      * ``cluster_sim``  - ``same_cluster`` cast to float.

    The input frame is not modified.
    """
    return pair_df.assign(
        dtw_norm=minmax_norm(pair_df["dtw_dist"]),      # min-max scaled DTW distance
        hs_dist_norm=pair_df["hs_dist"].astype(float),  # used as-is, only cast
        cluster_sim=pair_df["same_cluster"].astype(float),
    )


# ============================================================
# 2) Lag direction score
# ============================================================
def compute_lag_direction_score(lag):
    """Map a lag value to a fixed direction score.

    Returns:
        1.0  when ``lag`` is 1 or 2,
        0.0  when ``lag`` is 0,
        -1.0 when ``lag`` is negative,
        0.5  for anything else (lags above 2, and values such as NaN that
             fail every comparison).
    """
    if lag in (1, 2):
        return 1.0
    if lag == 0:
        return 0.0
    return -1.0 if lag < 0 else 0.5


# ============================================================
# 3) Pair Score (Version 1)
# ============================================================
def compute_pair_score_v1(df):
    """Return a copy of ``df`` with ``lag_dir`` and ``score_v1`` columns.

    ``lag_dir`` is ``compute_lag_direction_score`` applied to ``lag_val``.
    ``score_v1`` is a fixed-weight linear combination of the similarity
    columns; ``dtw_norm`` and ``hs_dist_norm`` carry negative weights, all
    other terms are added. None of the inputs is rescaled here, so a column
    with a large numeric range contributes proportionally more.

    Requires the columns ``lag_val``, ``cos_val``, ``cos_wgt``, ``cc_val``,
    ``sign_agree_val``, ``dtw_norm``, ``hs_dist_norm`` and ``cluster_sim``.
    """
    scored = df.assign(lag_dir=df["lag_val"].apply(compute_lag_direction_score))

    # (column, weight) in the order the terms are accumulated.
    weighted_terms = (
        ("cos_val", 0.30),
        ("cos_wgt", 0.10),
        ("cc_val", 0.20),
        ("sign_agree_val", 0.15),
        ("dtw_norm", -0.10),
        ("hs_dist_norm", -0.10),
        ("cluster_sim", 0.05),
        ("lag_dir", 0.10),
    )

    head_col, head_weight = weighted_terms[0]
    total = head_weight * scored[head_col]
    for column, weight in weighted_terms[1:]:
        total = total + weight * scored[column]

    scored["score_v1"] = total
    return scored


# ============================================================
# 4) Apply filtering rules
# ============================================================
def refine_pairs(df):
    """Drop candidate pairs that fail the rule-based quality filters.

    A row is kept only if all of the following hold:
      * ``cos_val`` is greater than -0.05;
      * it is not a cross-correlation "spike", i.e. not
        (``cc_val`` > 20 and ``cos_val`` < 0.05);
      * ``sign_agree_val`` is at least 0.30.

    If a ``score_v1`` column exists, rows with ``hs_dist == 3`` whose score
    is below the median score of the rows that survived the filters above
    are removed as well.

    Prints the number of remaining rows and returns the filtered frame; the
    input frame is left untouched.
    """
    cosine = df["cos_val"]
    is_spike = (df["cc_val"] > 20) & (cosine < 0.05)
    passes_basic_rules = (
        (cosine > -0.05)
        & ~is_spike
        & (df["sign_agree_val"] >= 0.30)
    )
    df = df.loc[passes_basic_rules]

    if "score_v1" in df.columns:
        # The median is taken over the already-filtered rows.
        median_score = df["score_v1"].quantile(0.50)
        weak_and_distant = (df["hs_dist"] == 3) & (df["score_v1"] < median_score)
        df = df.loc[~weak_and_distant]

    print(f"After refine: {len(df)} pairs left")
    return df


In [3]:
TOP_N = 1000   # number of pairs to keep; adjustable

# Load the candidate pairs, add the normalized features, then score them.
pair_df = (
    pd.read_csv("pair_df_v1.csv")
    .pipe(normalize_features)
    .pipe(compute_pair_score_v1)
)

# Rule-based filtering, then keep the TOP_N highest-scoring pairs.
refined = refine_pairs(pair_df)
top_pairs = refined.sort_values(by="score_v1", ascending=False).iloc[:TOP_N]
top_pairs.head()


After refine: 2156 pairs left


Unnamed: 0,item_i,item_j,cos_val,cos_wgt,sign_agree_val,cc_val,lag_val,dtw_dist,dtw_sim,hs_dist,...,hs2_i,hs2_j,cluster_i,cluster_j,same_cluster,dtw_norm,hs_dist_norm,cluster_sim,lag_dir,score_v1
3263,KFQSHBNH,RJCAXSGH,0.920558,0.916607,0.952381,39.030389,0,23790669.0,4.203328e-08,0,...,85,85,1,0,0,0.005092,0.0,0.0,0.0,8.316254
2313,FTSVTTSR,XIIEJNEE,0.130645,0.12366,0.47619,40.119895,32,8621955.0,1.15983e-07,3,...,25,38,1,-1,0,0.001845,3.0,0.0,0.5,7.896783
2273,FTSVTTSR,NAQIHUKZ,0.123248,0.188101,0.309524,38.726479,1,7282466.0,1.373161e-07,3,...,25,38,1,2,0,0.001559,3.0,0.0,1.0,7.647353
3798,NAQIHUKZ,TANNMIMB,0.154295,0.04059,0.452381,37.705716,16,187432.0,5.33524e-06,3,...,38,43,2,2,1,4e-05,3.0,1.0,0.5,7.459344
3401,LLHREMKS,XIIEJNEE,0.102084,0.000757,0.47619,37.975287,34,4874266.0,2.051591e-07,3,...,28,38,3,-1,0,0.001043,3.0,0.0,0.5,7.447083


In [4]:
# =================================================================
#  Helper: compute trend slope (entire period or most recent 12 months)
# =================================================================
def compute_trend_slope(ts):
    """Return the slope of a degree-1 least-squares fit of ``ts`` vs. position.

    Positions are ``0 .. len(ts) - 1``. When the standard deviation of
    ``ts`` is below 1e-9 the series is treated as flat and 0.0 is returned
    without fitting.
    """
    positions = np.arange(len(ts))

    # A (near-)constant series has no trend; skip the fit.
    if np.std(ts) < 1e-9:
        return 0.0

    slope, _intercept = np.polyfit(positions, ts, 1)
    return slope


# =================================================================
#  Helper: Recent growth (last 6 months)
# =================================================================
def recent_growth(ts, window=6):
    """Difference between the last value and the value at ``ts[-window]``.

    Returns 0.0 when ``ts`` has fewer than ``window + 1`` elements.

    NOTE(review): the length guard asks for ``window + 1`` points while the
    subtraction only reaches back to ``ts[-window]`` (``window - 1`` steps
    before the last element). One of the two may be off by one - confirm
    the intended span before relying on the exact horizon.
    """
    has_enough_history = len(ts) >= window + 1
    if not has_enough_history:
        return 0.0
    latest, reference = ts[-1], ts[-window]
    return latest - reference


# =================================================================
#  Direction decision function
# =================================================================
def determine_direction(row, ts_val):
    """Decide which item of a pair is the leader and which the follower.

    Args:
        row: mapping-like record with ``item_i``, ``item_j`` and ``lag_val``.
        ts_val: table indexed by item id; ``ts_val.loc[item].values`` must
            give that item's value series.

    Returns:
        ``(leader, follower)``.

    Decision order:
      1. Non-zero lag: positive means ``item_i`` leads, negative means
         ``item_j`` leads.
      2. Otherwise, the larger ``recent_growth(..., 6)`` leads.
      3. If those are equal, the larger ``compute_trend_slope`` leads.
      4. If still equal, ``item_i`` leads only when its last value is
         strictly greater than ``item_j``'s; otherwise ``item_j`` leads.

    Both series are looked up before the lag is inspected, so an item that
    is absent from ``ts_val`` fails the lookup even when the lag alone
    would have decided the direction.
    """
    item_i, item_j = row["item_i"], row["item_j"]
    lag = row["lag_val"]

    series_i = ts_val.loc[item_i].values
    series_j = ts_val.loc[item_j].values

    # Primary rule: the sign of the lag.
    if lag > 0:
        return item_i, item_j   # i leads j
    if lag < 0:
        return item_j, item_i   # j leads i

    # lag == 0 -> fall back to secondary criteria, evaluated lazily in order:
    # recent 6-month growth first, then overall trend slope.
    tie_breakers = (
        lambda values: recent_growth(values, 6),
        compute_trend_slope,
    )
    for metric in tie_breakers:
        metric_i = metric(series_i)
        metric_j = metric(series_j)
        if abs(metric_i - metric_j) > 0:  # they differ: the larger one leads
            return (item_i, item_j) if metric_i > metric_j else (item_j, item_i)

    # Last resort: the item with the larger most-recent value leads.
    return (item_i, item_j) if series_i[-1] > series_j[-1] else (item_j, item_i)


# =================================================================
#  Overall direction-assignment pipeline
# =================================================================
def assign_directions(top_pairs, monthly):
    """Label every pair in ``top_pairs`` with a ``leader`` and a ``follower``.

    Args:
        top_pairs: one row per pair; each row is handed to
            ``determine_direction``.
        monthly: long-format table with ``item_id``, ``year``, ``month``,
            ``total_value``, ``total_weight`` and ``total_quantity``.

    Returns:
        A copy of ``top_pairs`` with ``leader`` and ``follower`` columns.

    ``monthly`` is not modified. Rows sharing the same (item_id, t) are
    summed before pivoting so the pivot cannot hit duplicate entries.
    """
    print("üìå Fixing monthly duplicates for direction assignment...")

    # 1) Linear month index: months elapsed since the earliest year, plus month.
    years_elapsed = monthly["year"] - monthly["year"].min()
    indexed = monthly.assign(t=years_elapsed * 12 + monthly["month"])

    # 2) Collapse duplicate (item_id, t) rows by summing.
    summed = indexed.groupby(["item_id", "t"], as_index=False).agg(
        {"total_value": "sum", "total_weight": "sum", "total_quantity": "sum"}
    )

    # 3) Wide item x time matrix of values; missing months become 0.
    ts_val = summed.pivot(index="item_id", columns="t", values="total_value").fillna(0)

    decisions = [determine_direction(pair, ts_val) for _, pair in top_pairs.iterrows()]

    out_df = top_pairs.copy()
    out_df["leader"] = [lead for lead, _ in decisions]
    out_df["follower"] = [follow for _, follow in decisions]
    return out_df



In [5]:
# Load the monthly table from a hard-coded absolute path.
monthly = pd.read_csv("/data/ephemeral/home/data/processed/train_monthly_v2.csv")

# Attach leader/follower labels to the selected pairs and preview the result.
direction_df = assign_directions(top_pairs, monthly)
direction_df.head()


üìå Fixing monthly duplicates for direction assignment...


Unnamed: 0,item_i,item_j,cos_val,cos_wgt,sign_agree_val,cc_val,lag_val,dtw_dist,dtw_sim,hs_dist,...,cluster_i,cluster_j,same_cluster,dtw_norm,hs_dist_norm,cluster_sim,lag_dir,score_v1,leader,follower
3263,KFQSHBNH,RJCAXSGH,0.920558,0.916607,0.952381,39.030389,0,23790669.0,4.203328e-08,0,...,1,0,0,0.005092,0.0,0.0,0.0,8.316254,KFQSHBNH,RJCAXSGH
2313,FTSVTTSR,XIIEJNEE,0.130645,0.12366,0.47619,40.119895,32,8621955.0,1.15983e-07,3,...,1,-1,0,0.001845,3.0,0.0,0.5,7.896783,FTSVTTSR,XIIEJNEE
2273,FTSVTTSR,NAQIHUKZ,0.123248,0.188101,0.309524,38.726479,1,7282466.0,1.373161e-07,3,...,1,2,0,0.001559,3.0,0.0,1.0,7.647353,FTSVTTSR,NAQIHUKZ
3798,NAQIHUKZ,TANNMIMB,0.154295,0.04059,0.452381,37.705716,16,187432.0,5.33524e-06,3,...,2,2,1,4e-05,3.0,1.0,0.5,7.459344,NAQIHUKZ,TANNMIMB
3401,LLHREMKS,XIIEJNEE,0.102084,0.000757,0.47619,37.975287,34,4874266.0,2.051591e-07,3,...,3,-1,0,0.001043,3.0,0.0,0.5,7.447083,LLHREMKS,XIIEJNEE


In [6]:
# ------------------------------------------------------------
# PHASE 3-1: Build FE base table
# ------------------------------------------------------------

def build_fe_base(direction_df, monthly, n_periods=43):
    """Build the per-(pair, time step) feature table for follower prediction.

    For every directed pair and every step ``t`` in ``1 .. n_periods - 1``
    one row is produced holding leader features observed up to position
    ``t - 1`` of the leader series, the follower value at position ``t`` as
    ``target``, and the pair-level static similarity features.

    Args:
        direction_df: one row per pair with ``leader``, ``follower``,
            ``lag_val``, ``cos_val``, ``cos_wgt``, ``sign_agree_val``,
            ``cc_val``, ``dtw_sim``, ``hs_dist``, ``same_cluster`` and
            ``score_v1``.
        monthly: long-format table with ``item_id``, ``year``, ``month``
            and ``total_value``.
        n_periods: number of positions of each series to use. The default
            of 43 reproduces the previously hard-coded horizon. Each series
            must have at least this many columns after pivoting, otherwise
            indexing the follower series raises ``IndexError``.

    Returns:
        DataFrame with columns ``leader``, ``follower``, ``t``, ``L_1``,
        ``L_diff``, ``L_roll3_mean``, ``L_roll3_std``, the static pair
        features, and ``target``.

    Notes:
        * Duplicate (item_id, t) rows are summed before pivoting, matching
          ``assign_directions``; a bare ``pivot`` raises on duplicates.
        * Series are indexed by position, not by the ``t`` label; this
          assumes the pivoted columns are contiguous months starting at
          t = 1 - TODO confirm against the data.
        * ``lag_val`` is stored as a feature only; the leader window is
          always the one ending immediately before the target step.
    """
    year_offset = monthly["year"] - monthly["year"].min()
    monthly = monthly.assign(t=year_offset * 12 + monthly["month"])

    # Collapse duplicate (item_id, t) rows so the pivot below cannot fail.
    value_long = monthly.groupby(["item_id", "t"], as_index=False)["total_value"].sum()
    ts_val = value_long.pivot(index="item_id", columns="t", values="total_value").fillna(0)

    rows = []

    for _, row in direction_df.iterrows():
        leader = row["leader"]
        follower = row["follower"]
        lag = int(row["lag_val"])

        leader_ts = ts_val.loc[leader].values
        follower_ts = ts_val.loc[follower].values

        # Pair-level static features: identical for every t, so built once.
        pair_static = {
            "cos_val": row["cos_val"],
            "cos_wgt": row["cos_wgt"],
            "sign_agree_val": row["sign_agree_val"],
            "cc_val": row["cc_val"],
            "lag_val": lag,
            "dtw_sim": row["dtw_sim"],
            "hs_dist": row["hs_dist"],
            "cluster_sim": row["same_cluster"],
            "score_v1": row["score_v1"],
        }

        # t counts how many leader observations are available; the target is
        # the follower value one position later.
        for t in range(1, n_periods):
            recent = leader_ts[max(0, t - 3):t]  # up to the last 3 leader values

            rows.append({
                "leader": leader,
                "follower": follower,
                "t": t,
                "L_1": leader_ts[t - 1],
                # No previous observation exists at t == 1, so the diff is 0.
                "L_diff": leader_ts[t - 1] - leader_ts[t - 2] if t >= 2 else 0,
                "L_roll3_mean": recent.mean(),
                "L_roll3_std": recent.std(),
                **pair_static,
                "target": follower_ts[t],
            })

    fe_df = pd.DataFrame(rows)
    print("FE base shape:", fe_df.shape)
    return fe_df


In [7]:
# Reload the monthly table from the same hard-coded path used earlier in the notebook.
monthly = pd.read_csv("/data/ephemeral/home/data/processed/train_monthly_v2.csv")

# Build the per-(pair, t) feature table and write it to fe_v1.csv (no index column).
fe_df = build_fe_base(direction_df, monthly)
fe_df.to_csv("fe_v1.csv", index=False)


FE base shape: (42000, 17)


In [10]:
# Persist the three result frames (without the index column), then list them.
# pair_df_v1.csv is also the file the pairs were loaded from, so it is
# overwritten with the feature-augmented, scored frame.
_outputs = {
    "pair_df_v1.csv": pair_df,
    "direction_df_top1000.csv": direction_df,
    "fe_v1.csv": fe_df,
}
for _filename, _frame in _outputs.items():
    _frame.to_csv(_filename, index=False)

print("Saved files:")
for _filename in _outputs:
    print(f" - {_filename}")


Saved files:
 - pair_df_v1.csv
 - direction_df_top1000.csv
 - fe_v1.csv
