In [1]:
# ============================================================
# EDA Core Script v2 (for trade competition)
# Purpose: HS4 corr, lead-lag corr, DTW, ratio corr
# ============================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from scipy.signal import correlate
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw



In [4]:

# ------------------------------------------------------------
# Load processed monthly file
# ------------------------------------------------------------
df = pd.read_csv("../../data/processed/train_monthly_v3_eda.csv")

# pivot by item × month index
pivot = df.pivot(index="item_id", columns="t", values="total_value").fillna(0)
pivot_norm = pivot.apply(lambda x: (x - x.min()) / (x.max() - x.min() + 1e-9), axis=1)

item_ids = pivot.index.tolist()
hs4_map = df.groupby("item_id")["hs4"].first().to_dict()

print("Loaded pivot:", pivot.shape)

# ============================================================
# 1. HS4 Internal Correlation Matrix
# ============================================================
hs4_corr_results = {}

for hs4, sub in df.groupby("hs4"):
    items = sub["item_id"].unique()
    if len(items) < 2:
        continue
    
    p = pivot_norm.loc[items]

    corr_mat = p.T.corr()
    hs4_corr_results[hs4] = corr_mat

    plt.figure(figsize=(8,6))
    sns.heatmap(corr_mat, annot=False, cmap="coolwarm", vmin=-1, vmax=1)
    plt.title(f"HS4={hs4} Internal Correlation ({len(items)} items)")
    plt.tight_layout()
    plt.savefig(f"eda_hs4_corr_{hs4}.png")
    plt.close()

print("HS4 corr 완료")


# ============================================================
# 2. Lead-Lag Cross Correlation (A leads B)
# ============================================================
def best_leadlag(x, y, max_lag=6):
    best_corr = 0
    best_lag = None

    if np.std(x) == 0 or np.std(y) == 0:
        return None, 0

    for lag in range(1, max_lag+1):
        if len(x) <= lag: 
            continue

        # A(t) vs B(t+lag)
        c = np.corrcoef(x[:-lag], y[lag:])[0,1]
        if abs(c) > abs(best_corr):
            best_corr = c
            best_lag = lag

    return best_lag, best_corr


results = []

for A in tqdm(item_ids):
    x = pivot_norm.loc[A].values
    for B in item_ids:
        if A == B:
            continue
        y = pivot_norm.loc[B].values

        lag, corr = best_leadlag(x, y)
        if lag is not None:
            results.append([A, B, lag, corr])

lag_df = pd.DataFrame(results, columns=["A", "B", "best_lag", "best_corr"])
lag_df.to_csv("eda_lag_leadlag.csv", index=False)
print("Lead-Lag corr 완료")


# ============================================================
# 3. DTW Distance Matrix
# ============================================================
# ============================================================
# 3. DTW Distance Matrix (robust version)
# ============================================================

dtw_list = []

for A in tqdm(item_ids):
    x = pivot_norm.loc[A].values.astype(float).reshape(-1)
    
    # 길이 부족/변동성 없음 → skip
    if len(x) < 3 or np.std(x) < 1e-8:
        continue
    
    for B in item_ids:
        if A == B:
            continue
        
        y = pivot_norm.loc[B].values.astype(float).reshape(-1)
        
        if len(y) < 3 or np.std(y) < 1e-8:
            continue

        # DTW 계산
        try:
            dist, _ = fastdtw(x, y, dist=euclidean)
        except Exception as e:
            print(f"DTW fail: {A}, {B}, reason={e}")
            continue

        dtw_list.append([A, B, dist])

dtw_df = pd.DataFrame(dtw_list, columns=["A", "B", "dtw_dist"])
dtw_df.to_csv("eda_dtw_matrix.csv", index=False)
print("DTW 완료 (안 터짐)")


# ============================================================
# 4. Ratio Corr (weight/value)
# ============================================================
pivot_ratio = df.pivot(index="item_id", columns="t", values="wv_ratio").fillna(0)
ratio_corr = pivot_ratio.T.corr()
ratio_corr.to_csv("eda_ratio_corr.csv")

plt.figure(figsize=(9,7))
sns.heatmap(ratio_corr, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Ratio (weight/value) Corr")
plt.tight_layout()
plt.savefig("eda_ratio_corr.png")
plt.close()

print("Ratio corr 완료")

print("\n🎉 EDA Core v2 완성! 의미 없는 그래프 없이, 실제로 FE에 필요한 기능만 넣음.")


Loaded pivot: (100, 43)
HS4 corr 완료


  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 31.24it/s]


Lead-Lag corr 완료


 23%|████████████████████████▍                                                                                 | 23/100 [00:00<00:00, 109.17it/s]

DTW fail: AANGBULD, AHMDUILJ, reason=Input vector should be 1-D.
DTW fail: AANGBULD, ANWUJOKX, reason=Input vector should be 1-D.
DTW fail: AANGBULD, APQGTRMF, reason=Input vector should be 1-D.
DTW fail: AANGBULD, ATLDMDBO, reason=Input vector should be 1-D.
DTW fail: AANGBULD, AXULOHBQ, reason=Input vector should be 1-D.
DTW fail: AANGBULD, BEZYMBBT, reason=Input vector should be 1-D.
DTW fail: AANGBULD, BJALXPFS, reason=Input vector should be 1-D.
DTW fail: AANGBULD, BLANHGYY, reason=Input vector should be 1-D.
DTW fail: AANGBULD, BSRMSVTC, reason=Input vector should be 1-D.
DTW fail: AANGBULD, BTMOEMEP, reason=Input vector should be 1-D.
DTW fail: AANGBULD, BUZIIBYG, reason=Input vector should be 1-D.
DTW fail: AANGBULD, CCLHWFWF, reason=Input vector should be 1-D.
DTW fail: AANGBULD, DBWLZWNK, reason=Input vector should be 1-D.
DTW fail: AANGBULD, DDEXPPXU, reason=Input vector should be 1-D.
DTW fail: AANGBULD, DEWLVASR, reason=Input vector should be 1-D.
DTW fail: AANGBULD, DJBLN

 34%|████████████████████████████████████                                                                      | 34/100 [00:00<00:00, 108.50it/s]

DTW fail: FITUEHWN, AANGBULD, reason=Input vector should be 1-D.
DTW fail: FITUEHWN, AHMDUILJ, reason=Input vector should be 1-D.
DTW fail: FITUEHWN, ANWUJOKX, reason=Input vector should be 1-D.
DTW fail: FITUEHWN, APQGTRMF, reason=Input vector should be 1-D.
DTW fail: FITUEHWN, ATLDMDBO, reason=Input vector should be 1-D.
DTW fail: FITUEHWN, AXULOHBQ, reason=Input vector should be 1-D.
DTW fail: FITUEHWN, BEZYMBBT, reason=Input vector should be 1-D.
DTW fail: FITUEHWN, BJALXPFS, reason=Input vector should be 1-D.
DTW fail: FITUEHWN, BLANHGYY, reason=Input vector should be 1-D.
DTW fail: FITUEHWN, BSRMSVTC, reason=Input vector should be 1-D.
DTW fail: FITUEHWN, BTMOEMEP, reason=Input vector should be 1-D.
DTW fail: FITUEHWN, BUZIIBYG, reason=Input vector should be 1-D.
DTW fail: FITUEHWN, CCLHWFWF, reason=Input vector should be 1-D.
DTW fail: FITUEHWN, DBWLZWNK, reason=Input vector should be 1-D.
DTW fail: FITUEHWN, DDEXPPXU, reason=Input vector should be 1-D.
DTW fail: FITUEHWN, DEWLV

 58%|█████████████████████████████████████████████████████████████▍                                            | 58/100 [00:00<00:00, 111.03it/s]

DTW fail: LSOIUSXD, AANGBULD, reason=Input vector should be 1-D.
DTW fail: LSOIUSXD, AHMDUILJ, reason=Input vector should be 1-D.
DTW fail: LSOIUSXD, ANWUJOKX, reason=Input vector should be 1-D.
DTW fail: LSOIUSXD, APQGTRMF, reason=Input vector should be 1-D.
DTW fail: LSOIUSXD, ATLDMDBO, reason=Input vector should be 1-D.
DTW fail: LSOIUSXD, AXULOHBQ, reason=Input vector should be 1-D.
DTW fail: LSOIUSXD, BEZYMBBT, reason=Input vector should be 1-D.
DTW fail: LSOIUSXD, BJALXPFS, reason=Input vector should be 1-D.
DTW fail: LSOIUSXD, BLANHGYY, reason=Input vector should be 1-D.
DTW fail: LSOIUSXD, BSRMSVTC, reason=Input vector should be 1-D.
DTW fail: LSOIUSXD, BTMOEMEP, reason=Input vector should be 1-D.
DTW fail: LSOIUSXD, BUZIIBYG, reason=Input vector should be 1-D.
DTW fail: LSOIUSXD, CCLHWFWF, reason=Input vector should be 1-D.
DTW fail: LSOIUSXD, DBWLZWNK, reason=Input vector should be 1-D.
DTW fail: LSOIUSXD, DDEXPPXU, reason=Input vector should be 1-D.
DTW fail: LSOIUSXD, DEWLV

 82%|██████████████████████████████████████████████████████████████████████████████████████▉                   | 82/100 [00:00<00:00, 111.76it/s]

DTW fail: SAAYMURU, AANGBULD, reason=Input vector should be 1-D.
DTW fail: SAAYMURU, AHMDUILJ, reason=Input vector should be 1-D.
DTW fail: SAAYMURU, ANWUJOKX, reason=Input vector should be 1-D.
DTW fail: SAAYMURU, APQGTRMF, reason=Input vector should be 1-D.
DTW fail: SAAYMURU, ATLDMDBO, reason=Input vector should be 1-D.
DTW fail: SAAYMURU, AXULOHBQ, reason=Input vector should be 1-D.
DTW fail: SAAYMURU, BEZYMBBT, reason=Input vector should be 1-D.
DTW fail: SAAYMURU, BJALXPFS, reason=Input vector should be 1-D.
DTW fail: SAAYMURU, BLANHGYY, reason=Input vector should be 1-D.
DTW fail: SAAYMURU, BSRMSVTC, reason=Input vector should be 1-D.
DTW fail: SAAYMURU, BTMOEMEP, reason=Input vector should be 1-D.
DTW fail: SAAYMURU, BUZIIBYG, reason=Input vector should be 1-D.
DTW fail: SAAYMURU, CCLHWFWF, reason=Input vector should be 1-D.
DTW fail: SAAYMURU, DBWLZWNK, reason=Input vector should be 1-D.
DTW fail: SAAYMURU, DDEXPPXU, reason=Input vector should be 1-D.
DTW fail: SAAYMURU, DEWLV

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 111.14it/s]

DTW fail: XUOIQPFL, AANGBULD, reason=Input vector should be 1-D.
DTW fail: XUOIQPFL, AHMDUILJ, reason=Input vector should be 1-D.
DTW fail: XUOIQPFL, ANWUJOKX, reason=Input vector should be 1-D.
DTW fail: XUOIQPFL, APQGTRMF, reason=Input vector should be 1-D.
DTW fail: XUOIQPFL, ATLDMDBO, reason=Input vector should be 1-D.
DTW fail: XUOIQPFL, AXULOHBQ, reason=Input vector should be 1-D.
DTW fail: XUOIQPFL, BEZYMBBT, reason=Input vector should be 1-D.
DTW fail: XUOIQPFL, BJALXPFS, reason=Input vector should be 1-D.
DTW fail: XUOIQPFL, BLANHGYY, reason=Input vector should be 1-D.
DTW fail: XUOIQPFL, BSRMSVTC, reason=Input vector should be 1-D.
DTW fail: XUOIQPFL, BTMOEMEP, reason=Input vector should be 1-D.
DTW fail: XUOIQPFL, BUZIIBYG, reason=Input vector should be 1-D.
DTW fail: XUOIQPFL, CCLHWFWF, reason=Input vector should be 1-D.
DTW fail: XUOIQPFL, DBWLZWNK, reason=Input vector should be 1-D.
DTW fail: XUOIQPFL, DDEXPPXU, reason=Input vector should be 1-D.
DTW fail: XUOIQPFL, DEWLV




Ratio corr 완료

🎉 EDA Core v2 완성! 의미 없는 그래프 없이, 실제로 FE에 필요한 기능만 넣음.
