In [1]:
# ============================================================
# train_preprocess_v3.py
# - raw → clean → monthly (includes the complete panel + event-based derived features)
# - a version that can be used consistently through the pairwise EDA & FE stages
# ============================================================
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans





In [2]:
# ============================================================
# 0. Path configuration
# ============================================================
# The data folder is resolved relative to the current working directory:
# two levels up from it, then into "data".
BASE_DIR = Path.cwd().resolve()
DATA_DIR = BASE_DIR.parents[1].joinpath("data")

# Input CSV read by the load step.
RAW_PATH = DATA_DIR.joinpath("raw", "train.csv")

# Row-level table written after cleaning.
CLEAN_PATH = DATA_DIR.joinpath("interim", "train_clean_v3.csv")

# Processed artefacts: monthly panel, item x month value matrix, per-item summary.
MONTHLY_PATH = DATA_DIR.joinpath("processed", "train_monthly_v3_eda.csv")
PIVOT_VALUE_PATH = DATA_DIR.joinpath("processed", "pivot_value_v3_eda.csv")
ITEM_SUMMARY_PATH = DATA_DIR.joinpath("processed", "item_summary_v3_eda.csv")

print("üìÇ RAW_PATH:", RAW_PATH)

# ============================================================
# 1. Load RAW data & normalise basic dtypes
# ============================================================
df = pd.read_csv(RAW_PATH)

# Cast every typed column in one pass: calendar/sequence fields become
# integers, the categorical codes become strings, the measures become floats.
# NOTE(review): these casts run before the NA fill below, so a missing
# year/month/seq raises here instead of being filled — confirm the raw file
# never has NA in those columns.
df = df.astype({
    "year": int,
    "month": int,
    "seq": int,
    "type": str,
    "hs4": str,
    "weight": float,
    "quantity": float,
    "value": float,
})

# HS4 codes are left-padded with zeros to a fixed width of 4 characters.
df["hs4"] = df["hs4"].str.zfill(4)

# NA -> 0, then drop exact duplicate rows.
df = df.fillna(0).drop_duplicates()

print("üìå Loaded:", df.shape)

# ============================================================
# 2. HS hierarchy columns + logical-inconsistency flags
# ============================================================
# Coarser HS levels are the leading 2 / 3 characters of the 4-character code.
df["hs2"] = df["hs4"].str.slice(stop=2)
df["hs3"] = df["hs4"].str.slice(stop=3)

# Rows where exactly one of value / weight is zero while the other is positive.
df["flag_v0_wpos"] = (df["value"].eq(0) & df["weight"].gt(0)).astype(int)
df["flag_w0_vpos"] = (df["weight"].eq(0) & df["value"].gt(0)).astype(int)

# ym: month-level datetime, pinned to the first day of the month.
df["ym"] = pd.to_datetime(
    df["year"].astype(str).str.cat(df["month"].astype(str), sep="-") + "-01"
)

# ============================================================
# 3. Monthly aggregation (item x year x month)
# ============================================================
group_cols = ["item_id", "type", "hs4", "hs3", "hs2", "year", "month", "ym"]

# Measures are summed per group; a flag is raised for the month if any
# contributing row raised it (max of 0/1 values).
monthly = df.groupby(group_cols, as_index=False).agg(
    total_value=pd.NamedAgg(column="value", aggfunc="sum"),
    total_weight=pd.NamedAgg(column="weight", aggfunc="sum"),
    total_quantity=pd.NamedAgg(column="quantity", aggfunc="sum"),
    flag_v0_wpos=pd.NamedAgg(column="flag_v0_wpos", aggfunc="max"),
    flag_w0_vpos=pd.NamedAgg(column="flag_w0_vpos", aggfunc="max"),
)
monthly = monthly.sort_values(group_cols)

print("üì¶ monthly raw shape:", monthly.shape)

# Log / ratio derived columns.
# log1p keeps zero totals at exactly 0 instead of -inf.
monthly["log_value"] = monthly["total_value"].pipe(np.log1p)
monthly["log_weight"] = monthly["total_weight"].pipe(np.log1p)
monthly["log_quantity"] = monthly["total_quantity"].pipe(np.log1p)
# The denominator is floored at 1 so a zero value cannot cause a division by zero.
monthly["wv_ratio"] = monthly["total_weight"].div(monthly["total_value"].clip(lower=1.0))

# ============================================================
# 4. Complete panel expansion (item x every ym)
# ============================================================
items = monthly["item_id"].unique()
# Every month start between the earliest and latest observed month.
all_ym = pd.date_range(monthly["ym"].min(), monthly["ym"].max(), freq="MS")

full_index = pd.MultiIndex.from_product([items, all_ym], names=["item_id", "ym"])

# Reindexing on the full (item, month) grid inserts all-NaN rows for
# item-months that had no record.
monthly_full = monthly.set_index(["item_id", "ym"])
monthly_full = monthly_full.reindex(full_index).reset_index()

# Restore year / month from ym (they are NaN on the inserted rows).
monthly_full["year"] = monthly_full["ym"].dt.year
monthly_full["month"] = monthly_full["ym"].dt.month

# Zero-fill the numeric columns: a missing item-month is treated as no trade.
monthly_full = monthly_full.fillna(
    dict.fromkeys(
        [
            "total_value", "total_weight", "total_quantity",
            "log_value", "log_weight", "log_quantity",
            "wv_ratio", "flag_v0_wpos", "flag_w0_vpos",
        ],
        0,
    )
)

# hs/type information is forward/backward filled (treated as fixed per item).
# NOTE(review): only the forward fill is grouped by item; the backward fill
# runs on the whole frame and relies on the (item_id, ym) sort order so that
# an item's leading gaps are followed by that item's own rows.
monthly_full = monthly_full.sort_values(["item_id", "ym"])
meta_cols = ["hs2", "hs3", "hs4", "type"]
filled_meta = monthly_full.groupby("item_id")[meta_cols].ffill()
monthly_full[meta_cols] = filled_meta.bfill()

print("üìå monthly FULL shape:", monthly_full.shape)

# t index (integer time index for EDA / modelling)
# NOTE: from here on the rows are ordered by ym, not item by item, so any
# positional assignment afterwards must not assume item-major order.
monthly_full = monthly_full.sort_values(["ym"])
sorted_ym = sorted(monthly_full["ym"].unique())
t_map = dict(zip(sorted_ym, range(len(sorted_ym))))
monthly_full["t"] = monthly_full["ym"].map(t_map)

# ============================================================
# 5. Event flag
# ============================================================
# 1 when the item has a strictly positive total value in that month, else 0.
monthly_full["event"] = monthly_full["total_value"].gt(0).astype(int)

# ============================================================
# 6. Item-level rolling / diff / sign / lag features
# ============================================================
def add_group_rolling(df, group_col, target_col, win_list, prefix):
    """Add per-group rolling mean and standard deviation columns.

    For each window size ``w`` in ``win_list`` two columns are written to
    ``df``: ``{prefix}_mean_{w}`` followed by ``{prefix}_std_{w}``. The
    statistics are computed inside each ``group_col`` group over
    ``target_col``, following the frame's current row order.

    Windows use ``min_periods=1``, so the mean is defined from the first row
    of a group; the standard deviation is NaN while the window holds a single
    observation.

    Args:
        df: Frame to extend. It is modified in place.
        group_col: Column (or columns) to group by.
        target_col: Column the rolling statistics are computed on.
        win_list: Iterable of window sizes.
        prefix: Prefix of the generated column names.

    Returns:
        The same ``df`` object, with the new columns attached.
    """
    for window in win_list:
        # mean first, then std, to keep the column order stable per window.
        for stat in ("mean", "std"):
            df[f"{prefix}_{stat}_{window}"] = df.groupby(group_col)[target_col].transform(
                # Bind the loop values as defaults so each lambda keeps its own.
                lambda s, _w=window, _stat=stat: getattr(
                    s.rolling(_w, min_periods=1), _stat
                )()
            )
    return df

# Rolling windows of 3 and 6 months (12 is deliberately left out).
monthly_full = add_group_rolling(
    monthly_full,
    group_col="item_id",
    target_col="total_value",
    win_list=[3, 6],
    prefix="value_roll",
)

# diff: change versus the previous row of the same item.
# The first row of each item has no predecessor and stays NaN.
by_item = monthly_full.groupby("item_id")
monthly_full["diff_value"] = by_item["total_value"].diff()
monthly_full["diff_weight"] = by_item["total_weight"].diff()

# Direction of the change (-1 / 0 / +1); a missing diff counts as 0.
monthly_full["sign_value"] = np.sign(monthly_full["diff_value"].fillna(0))
monthly_full["sign_weight"] = np.sign(monthly_full["diff_weight"].fillna(0))

# Lag features (1 to 3 rows back within the same item); the first `lag`
# rows of each item are NaN.
for col in ("total_value", "total_weight", "total_quantity", "log_value", "log_weight"):
    for lag in (1, 2, 3):
        monthly_full[f"{col}_lag{lag}"] = monthly_full.groupby("item_id")[col].shift(lag)
# ============================================================
# 7. Trend helper (item-level global slope)
# ============================================================
def compute_trend(values: np.ndarray) -> float:
    """Return the least-squares slope of ``values`` against their position.

    The values are regressed on ``0, 1, ..., len(values) - 1`` with a linear
    model and the fitted coefficient is returned.

    Args:
        values: One-dimensional sequence of numbers, in the order they
            should be regressed.

    Returns:
        The slope as a float. ``0.0`` is returned without fitting when there
        are fewer than three values or when every value equals the first one.
    """
    series = np.asarray(values, dtype=float)
    n_obs = len(series)

    # Too short or perfectly flat: skip the fit and report no trend.
    # The length check comes first so series[0] is never read on empty input.
    if n_obs < 3 or np.all(series == series[0]):
        return 0.0

    time_axis = np.arange(n_obs).reshape(-1, 1)
    model = LinearRegression().fit(time_axis, series)
    return float(model.coef_[0])

# One slope per item, computed on the item's series in chronological order.
# The explicit sort makes the regression independent of the frame's current
# row order.
_trend_groups = monthly_full.sort_values(["item_id", "ym"]).groupby("item_id")
_slope_log_by_item = _trend_groups["log_value"].apply(
    lambda s: compute_trend(s.to_numpy())
)
_slope_raw_by_item = _trend_groups["total_value"].apply(
    lambda s: compute_trend(s.to_numpy())
)

# Broadcast each item's slope to its rows by item_id lookup. Collecting the
# slopes group by group and assigning the resulting list positionally is only
# correct when the frame is ordered item by item; here it has been sorted by
# ym, so the lookup is what keeps every row paired with its own item's slope.
trend_log = monthly_full["item_id"].map(_slope_log_by_item).tolist()
trend_raw = monthly_full["item_id"].map(_slope_raw_by_item).tolist()

monthly_full["trend_log_value"] = trend_log
monthly_full["trend_value"] = trend_raw

# ============================================================
# 8. Seasonality (global / per hs4 / per item)
#   - based on month-of-year
# ============================================================
# Global seasonality: mean value of each calendar month / overall mean.
global_mean = monthly_full["total_value"].mean()
month_mean = monthly_full.groupby("month")["total_value"].mean()
season_global = month_mean.div(global_mean).to_dict()

monthly_full["seasonality_global"] = monthly_full["month"].map(season_global)

# HS4-based seasonality: the same ratio, computed inside each hs4 code.
hs4_month = (
    monthly_full.groupby(["hs4", "month"])["total_value"]
                .mean()
                .rename("hs4_month_avg")
                .reset_index()
)
hs4_global = (
    monthly_full.groupby("hs4")["total_value"]
                .mean()
                .rename("hs4_global_mean")
                .reset_index()
)

hs4_season = hs4_month.merge(hs4_global, on="hs4", how="left")
# A zero overall mean is turned into NaN so the index is NaN, not inf.
hs4_season["hs4_season_idx"] = hs4_season["hs4_month_avg"].div(
    hs4_season["hs4_global_mean"].replace(0, np.nan)
)

monthly_full = monthly_full.merge(
    hs4_season[["hs4", "month", "hs4_season_idx"]],
    how="left",
    on=["hs4", "month"],
)

# Item-level seasonality (each item's own month-of-year pattern).
item_month = (
    monthly_full.groupby(["item_id", "month"])["total_value"]
                .mean()
                .rename("item_month_avg")
                .reset_index()
)
item_global = (
    monthly_full.groupby("item_id")["total_value"]
                .mean()
                .rename("item_global_mean")
                .reset_index()
)

item_season = item_month.merge(item_global, on="item_id", how="left")
# Same zero guard as for the hs4 index.
item_season["item_season_idx"] = item_season["item_month_avg"].div(
    item_season["item_global_mean"].replace(0, np.nan)
)

monthly_full = monthly_full.merge(
    item_season[["item_id", "month", "item_season_idx"]],
    how="left",
    on=["item_id", "month"],
)

# ============================================================
# 9. value-weight cluster (KMeans, on log features)
# ============================================================
# Only rows with both a positive value and a positive weight are clustered.
mask_pos = monthly_full["total_value"].gt(0) & monthly_full["total_weight"].gt(0)
X = monthly_full.loc[mask_pos, ["log_weight", "log_value"]].values

# Rows outside the clustering sample keep the label -1.
monthly_full["cluster_wv"] = -1

if len(X) > 0:
    # Standardise both features before clustering.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Fixed seed so the labels are reproducible between runs.
    kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(X_scaled)

    monthly_full.loc[mask_pos, "cluster_wv"] = cluster_labels

monthly_full["cluster_wv"] = monthly_full["cluster_wv"].astype(int)

# ============================================================
# 10. Item-level static summary (CV, period, active months, rare flag)
# ============================================================
item_group = monthly_full.groupby("item_id")

total_sum = item_group["total_value"].sum()
total_mean = item_group["total_value"].mean()
# std is NaN for a single-row group; treat that as no dispersion.
total_std = item_group["total_value"].std().fillna(0)

# Coefficient of variation; a zero mean yields 0 instead of inf/NaN.
value_cv = (total_std / total_mean.replace(0, np.nan)).fillna(0)

active_months = item_group["event"].sum()

# First / last month in which the item actually had an event. These must be
# taken from the event rows only: monthly_full is the complete panel, so the
# min/max of ym over all of an item's rows is the same panel start/end for
# every item. Items with no event at all get NaT.
active_span = (
    monthly_full.loc[monthly_full["event"] == 1]
                .groupby("item_id")["ym"]
                .agg(["min", "max"])
                .reindex(total_sum.index)
)
first_date = active_span["min"]
last_date = active_span["max"]

item_static = pd.DataFrame({
    "item_id": total_sum.index,
    "total_value_sum": total_sum.values,
    "total_value_mean": total_mean.values,
    "total_value_std": total_std.values,
    "value_cv": value_cv.values,
    "active_months": active_months.values,
    "first_date": first_date.values,
    "last_date": last_date.values,
})

# Rare item flag: items whose total value is strictly below the 5% quantile.
threshold = total_sum.quantile(0.05)
rare_items = total_sum[total_sum < threshold].index
monthly_full["rare_item_flag"] = monthly_full["item_id"].isin(rare_items).astype(int)

# Attach type / hs information to the summary as well: the most frequent
# value per item, falling back to the first value when no mode exists.
item_meta = (
    monthly_full.groupby("item_id")[["hs2", "hs3", "hs4", "type"]]
                .agg(lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else x.iloc[0])
                .reset_index()
)

item_summary = item_static.merge(item_meta, on="item_id", how="left")

# ============================================================
# 11. Pivot for EDA (item x ym, value matrix)
# ============================================================
# One row per item, one column per month, cells hold total_value.
pivot_value = monthly_full.pivot(values="total_value", columns="ym", index="item_id")
pivot_value = pivot_value.sort_index()

print("üìå pivot_value shape:", pivot_value.shape)

# ============================================================
# SAVE
# ============================================================
# Create the destination folders up front. Without this, a missing
# "interim" or "processed" directory makes to_csv fail only after the whole
# pipeline has already run.
for _out_path in (CLEAN_PATH, MONTHLY_PATH, PIVOT_VALUE_PATH, ITEM_SUMMARY_PATH):
    _out_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(CLEAN_PATH, index=False)
monthly_full.to_csv(MONTHLY_PATH, index=False)
# The pivot keeps its index on disk: item_id is the row label.
pivot_value.to_csv(PIVOT_VALUE_PATH)
item_summary.to_csv(ITEM_SUMMARY_PATH, index=False)

print("üéâ Saved:")
print(" -", CLEAN_PATH)
print(" -", MONTHLY_PATH)
print(" -", PIVOT_VALUE_PATH)
print(" -", ITEM_SUMMARY_PATH)
print("üî• preprocess_v3_eda ÏôÑÏÑ±!")


üìÇ RAW_PATH: /data/ephemeral/home/data/raw/train.csv
üìå Loaded: (10836, 9)
üì¶ monthly raw shape: (3776, 13)
üìå monthly FULL shape: (4300, 17)
üìå pivot_value shape: (100, 43)
üéâ Saved:
 - /data/ephemeral/home/data/interim/train_clean_v3.csv
 - /data/ephemeral/home/data/processed/train_monthly_v3_eda.csv
 - /data/ephemeral/home/data/processed/pivot_value_v3_eda.csv
 - /data/ephemeral/home/data/processed/item_summary_v3_eda.csv
üî• preprocess_v3_eda ÏôÑÏÑ±!
