In [4]:
# ============================================================
# train_preprocess_v6.py
# - raw → clean → monthly_full → smoothed series → volatility filter
# - minimal structure optimized for pairwise FE (3M/6M windows, lag <= 4)
# ============================================================

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LinearRegression

# ============================================================
# 0. PATH
# ============================================================
# All locations are derived from the current working directory: the data
# folder is expected two levels above it.
BASE_DIR = Path.cwd().resolve()
DATA_DIR = BASE_DIR.parents[1].joinpath("data")

_INTERIM_DIR = DATA_DIR.joinpath("interim")
_PROCESSED_DIR = DATA_DIR.joinpath("processed")

# Input
RAW_PATH = DATA_DIR.joinpath("raw", "train.csv")

# Outputs
CLEAN_PATH = _INTERIM_DIR.joinpath("train_clean_v6.csv")
MONTHLY_PATH = _PROCESSED_DIR.joinpath("train_monthly_v6.csv")
PIVOT_PATH = _PROCESSED_DIR.joinpath("pivot_value_v6.csv")
SUMMARY_PATH = _PROCESSED_DIR.joinpath("item_summary_v6.csv")
TS_CORE_PATH = _PROCESSED_DIR.joinpath("train_ts_core_v6.csv")

# ============================================================
# 1. LOAD RAW
# ============================================================
# Read the raw records and coerce the integer / string columns in one pass.
df = pd.read_csv(RAW_PATH)
df = df.astype({"year": int, "month": int, "seq": int, "type": str})

# HS codes are handled as zero-padded strings of at least 4 characters.
df["hs4"] = df["hs4"].astype(str).str.zfill(4)

# Measures are stored as floats.
for col in ("weight", "quantity", "value"):
    df[col] = df[col].astype(float)

# Discard rows that are exact duplicates across every column.
df = df.drop_duplicates()

# ============================================================
# 2. HS + DATE
# ============================================================
# Coarser HS groupings are the leading 3 and 2 characters of the hs4 code.
df["hs3"] = df["hs4"].str.slice(stop=3)
df["hs2"] = df["hs4"].str.slice(stop=2)

# Month-start timestamp parsed from a "<year>-<month>-01" string.
year_month_text = df["year"].astype(str).str.cat(df["month"].astype(str), sep="-")
df["ym"] = pd.to_datetime(year_month_text + "-01")

# ============================================================
# 3. Monthly aggregation
# ============================================================
group_cols = ["item_id", "hs4", "hs3", "hs2", "type", "year", "month", "ym"]

# Sum value and weight over all records sharing the same grouping keys,
# then expose the keys as ordinary columns again.
monthly = (
    df.groupby(group_cols)[["value", "weight"]]
      .sum()
      .rename(columns={"value": "total_value", "weight": "total_weight"})
      .reset_index()
      .sort_values(group_cols)
)

# ============================================================
# 4. Build the complete panel (every item x every month)
# ============================================================
items = monthly["item_id"].unique()
all_ym = pd.date_range(monthly["ym"].min(), monthly["ym"].max(), freq="MS")

full_index = pd.MultiIndex.from_product([items, all_ym], names=["item_id", "ym"])

# Reindexing inserts one row for each (item, month) pair that has no
# aggregated record; every non-key column on those rows is NaN.
monthly_full = (
    monthly.set_index(["item_id", "ym"])
           .reindex(full_index)
           .reset_index()
)

# year / month are NaN on the inserted rows, so rebuild them from ym.
monthly_full["year"]  = monthly_full["ym"].dt.year
monthly_full["month"] = monthly_full["ym"].dt.month

# Restore the categorical metadata on the inserted rows.
# Both fill directions are applied per item: calling .bfill() directly on the
# result of a grouped .ffill() would fill across the whole frame and could
# copy one item's codes into another item's rows.
cat_cols = ["hs4", "hs3", "hs2", "type"]
forward_filled = monthly_full.groupby("item_id")[cat_cols].ffill()
monthly_full[cat_cols] = forward_filled.groupby(monthly_full["item_id"]).bfill()

# ============================================================
# 5. t index
# ============================================================
# Number the distinct months 0, 1, 2, ... in chronological order and attach
# that position to every row.
unique_ym = sorted(monthly_full["ym"].unique())
t_map = dict(zip(unique_ym, range(len(unique_ym))))
monthly_full["t"] = monthly_full["ym"].map(t_map).astype(int)

# ============================================================
# 6. log1p + 3M smoothing
# ============================================================
monthly_full["log_value"] = np.log1p(monthly_full["total_value"])


def _trailing_3m_mean(series):
    """Return the trailing 3-row mean, using however many rows are available."""
    return series.rolling(3, min_periods=1).mean()


# Smoothing (the key step): trailing mean of log_value within each item.
log_value_by_item = monthly_full.groupby("item_id")["log_value"]
monthly_full["smooth_value"] = log_value_by_item.transform(_trailing_3m_mean)

# ============================================================
# 7. Rolling 3M / 6M mean & std
# ============================================================
def add_roll(df, group, col, wins):
    """Add per-group trailing mean and std columns for each window size.

    For every window ``w`` in ``wins`` two columns are written to ``df``:
    ``{col}_mean_{w}`` and ``{col}_std_{w}``. Windows use ``min_periods=1``,
    so the std of a single observation is NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    group : label or list of labels
        Column(s) passed to ``groupby``.
    col : str
        Column the rolling statistics are computed on.
    wins : iterable of int
        Rolling window sizes.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    for window in wins:
        # mean first, then std, so the column order stays mean/std per window
        for stat in ("mean", "std"):
            df[f"{col}_{stat}_{window}"] = df.groupby(group)[col].transform(
                lambda s, _w=window, _stat=stat: getattr(
                    s.rolling(_w, min_periods=1), _stat
                )()
            )
    return df

# Trailing 3- and 6-row mean/std of the smoothed series, per item.
monthly_full = add_roll(
    df=monthly_full,
    group="item_id",
    col="smooth_value",
    wins=[3, 6],
)

# ============================================================
# 8. Rolling slope (3M, 6M)
# ============================================================
def rolling_slope(arr):
    """Return the least-squares slope of a window against its time positions.

    NaN entries are treated as missing observations. The remaining points
    keep their original positions inside the window, so the result is a
    slope per time step even when the window has gaps.

    Parameters
    ----------
    arr : array-like of float
        Window values in time order; NaN marks a missing observation.

    Returns
    -------
    float
        Slope of the best-fit line through the non-NaN points, or ``0.0``
        when fewer than two points are valid.
    """
    arr = np.asarray(arr, dtype=float)
    mask = ~np.isnan(arr)
    # With fewer than 2 valid (non-NaN) values the slope is defined as 0
    if mask.sum() < 2:
        return 0.0
    # Use the real positions of the valid points; renumbering them 0..k-1
    # would squeeze out the gaps and overstate the slope.
    x = np.flatnonzero(mask).astype(float)
    y = arr[mask]
    # Closed-form simple OLS slope: cov(x, y) / var(x). The positions are
    # distinct, so the denominator is strictly positive.
    x_centered = x - x.mean()
    return float(np.dot(x_centered, y - y.mean()) / np.dot(x_centered, x_centered))

def _windowed_slope(series, window):
    """Apply rolling_slope over trailing windows needing at least 2 valid rows."""
    roller = series.rolling(window, min_periods=2)
    # raw=True (recommended): windows are passed as plain numpy arrays
    return roller.apply(rolling_slope, raw=True)


# One slope column per window size, computed within each item.
for w in (3, 6):
    smooth_by_item = monthly_full.groupby("item_id")["smooth_value"]
    monthly_full[f"slope_{w}"] = smooth_by_item.transform(_windowed_slope, w)


# ============================================================
# 9. lag 1~4 (limited to 4 months in total)
# ============================================================
# Shift the smoothed series within each item so no value crosses into
# another item's rows.
smooth_by_item = monthly_full.groupby("item_id")["smooth_value"]
for lag in range(1, 5):
    monthly_full[f"smooth_lag{lag}"] = smooth_by_item.shift(lag)

# ============================================================
# 10. Item-level summary (for the volatility filter)
# ============================================================
def _count_positive(values):
    """Count the entries strictly greater than zero."""
    return (values > 0).sum()


def _most_common(values):
    """Return the first mode, or the first element when no mode exists."""
    modes = values.mode()
    return modes.iloc[0] if len(modes) else values.iloc[0]


g = monthly_full.groupby("item_id")
value_by_item = g["total_value"]

total_sum = value_by_item.sum()
nonzero = value_by_item.apply(_count_positive)
std_val = value_by_item.std().fillna(0)
mean_val = value_by_item.mean()
# Coefficient of variation; a zero mean would divide by zero, so it is
# masked to NaN first and the resulting NaN is reported as 0.
cv = std_val.div(mean_val.where(mean_val != 0)).fillna(0)

# All series share the same item_id index, so they line up row by row.
item_summary = (
    pd.DataFrame({
        "total_sum": total_sum,
        "active_months": nonzero,
        "value_std": std_val,
        "value_mean": mean_val,
        "value_cv": cv,
    })
    .rename_axis("item_id")
    .reset_index()
)

# One representative category value per item.
meta = g[["hs2", "hs3", "hs4", "type"]].agg(_most_common).reset_index()

item_summary = pd.merge(item_summary, meta, on="item_id")

# ============================================================
# 11. pivot_value (for pairwise FE -> uses the smoothed values!)
# ============================================================
# Wide table: one row per item, one column per time index t.
smooth_by_item_and_t = monthly_full.set_index(["item_id", "t"])["smooth_value"]
pivot_value = smooth_by_item_and_t.unstack("t").sort_index()

# ============================================================
# 12. ts_core
# ============================================================
# Keys, the smoothed series with its four lags, and the HS hierarchy.
ts_core_columns = (
    ["item_id", "ym", "t", "smooth_value"]
    + [f"smooth_lag{k}" for k in range(1, 5)]
    + ["hs2", "hs3", "hs4"]
)
ts_core = monthly_full.loc[:, ts_core_columns].copy()

# ============================================================
# SAVE
# ============================================================
# to_csv does not create missing folders, so make sure every output
# directory exists before writing.
for out_path in (CLEAN_PATH, MONTHLY_PATH, PIVOT_PATH, SUMMARY_PATH, TS_CORE_PATH):
    out_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(CLEAN_PATH, index=False)
monthly_full.to_csv(MONTHLY_PATH, index=False)
# pivot_value keeps its index: item_id is the row label of the wide table
pivot_value.to_csv(PIVOT_PATH)
item_summary.to_csv(SUMMARY_PATH, index=False)
ts_core.to_csv(TS_CORE_PATH, index=False)

print("Preprocess v6 DONE!")


Preprocess v6 DONE!
