In [1]:
# ===============================================
# Toss CTR - Preprocessing v3 (Leakage-Free + ID/Target Preserved)
# ===============================================
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans

pd.options.io.parquet.engine = "pyarrow"



In [2]:
# -----------------------------------------------
# 1. Load
# -----------------------------------------------
train = pd.read_parquet("train.parquet")
test  = pd.read_parquet("test.parquet")
print(f"Train: {train.shape} | Test: {test.shape}")


Train: (10704179, 119) | Test: (1527298, 119)


In [3]:
# -----------------------------------------------
# 2. ID / Target 보존
# -----------------------------------------------
if "ID" in test.columns:
    test.rename(columns={"ID": "id"}, inplace=True)

if "id" not in train.columns:
    train["row_id"] = np.arange(len(train))
    id_col = "row_id"
else:
    id_col = "id"

target_col = "clicked" if "clicked" in train.columns else None
print(f"[info] Using '{id_col}' as ID column, target = {target_col}")

# -----------------------------------------------
# 3. 완전 중복 제거
# -----------------------------------------------
dup_count = train.duplicated().sum()
if dup_count > 0:
    print(f"[info] 완전 중복 {dup_count}건 제거")
    train = train.drop_duplicates().reset_index(drop=True)

# -----------------------------------------------
# 4. feat_e_3 결측 처리
# -----------------------------------------------
train["feat_e_3_isna"] = train["feat_e_3"].isnull().astype(int)
test["feat_e_3_isna"]  = test["feat_e_3"].isnull().astype(int)

age_median = train.groupby("age_group")["feat_e_3"].median()
train["feat_e_3"] = train["feat_e_3"].fillna(train["age_group"].map(age_median))
test["feat_e_3"]  = test["feat_e_3"].fillna(test["age_group"].map(age_median))

global_median = train["feat_e_3"].median()
train["feat_e_3"].fillna(global_median, inplace=True)
test["feat_e_3"].fillna(global_median, inplace=True)

[info] Using 'row_id' as ID column, target = clicked


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["feat_e_3"].fillna(global_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test["feat_e_3"].fillna(global_median, inplace=True)


In [4]:
# -----------------------------------------------
# 5. Sequence Feature
# -----------------------------------------------
def count_unique_items(seq_str):
    try:
        items = str(seq_str).split(',')
        return len(set(items))
    except:
        return 0

train["seq_length"] = train["seq"].astype(str).apply(lambda x: len(x.split(",")))
test["seq_length"]  = test["seq"].astype(str).apply(lambda x: len(x.split(",")))
train["seq_length_log"] = np.log1p(train["seq_length"])
test["seq_length_log"]  = np.log1p(test["seq_length"])

train["unique_items"] = train["seq"].apply(count_unique_items)
test["unique_items"]  = test["seq"].apply(count_unique_items)
train["diversity_ratio"] = train["unique_items"] / train["seq_length"].replace(0, np.nan)
test["diversity_ratio"]  = test["unique_items"] / test["seq_length"].replace(0, np.nan)
train["diversity_ratio"].fillna(0, inplace=True)
test["diversity_ratio"].fillna(0, inplace=True)

train.drop(columns=["seq"], errors="ignore", inplace=True)
test.drop(columns=["seq"], errors="ignore", inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["diversity_ratio"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test["diversity_ratio"].fillna(0, inplace=True)


In [5]:
# -----------------------------------------------
# 6. user_cluster (lightweight KMeans)
# -----------------------------------------------
cluster_feats = ["history_mean", "history_var", "seq_length"]
cluster_feats = [c for c in cluster_feats if c in train.columns]
if len(cluster_feats) >= 2:
    km = KMeans(n_clusters=10, random_state=42)
    train["user_cluster"] = km.fit_predict(train[cluster_feats])
    test["user_cluster"]  = km.predict(test[cluster_feats])
else:
    train["user_cluster"] = 0
    test["user_cluster"]  = 0

In [6]:
# -----------------------------------------------
# 7. history_mean / var (Leakage-free OOF version)
# -----------------------------------------------
print("[info] Generating leakage-free history features...")
if "user_id" in train.columns:
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    train["history_mean"] = np.nan
    train["history_var"]  = np.nan

    for fold, (tr_idx, val_idx) in enumerate(kf.split(train)):
        tr_sub = train.iloc[tr_idx]
        val_sub = train.iloc[val_idx]
        stats = (
            tr_sub.groupby("user_id")["clicked"]
            .agg(history_mean="mean", history_var="var")
        )
        train.loc[val_idx, "history_mean"] = val_sub["user_id"].map(stats["history_mean"])
        train.loc[val_idx, "history_var"]  = val_sub["user_id"].map(stats["history_var"])

    # test는 전체 train 통계로 계산
    full_stats = (
        train.groupby("user_id")["clicked"]
        .agg(history_mean="mean", history_var="var")
    )
    test["history_mean"] = test["user_id"].map(full_stats["history_mean"])
    test["history_var"]  = test["user_id"].map(full_stats["history_var"])

    # 결측치 (신규 유저)
    global_mean = train["clicked"].mean()
    global_var  = train["clicked"].var()
    train["history_mean"].fillna(global_mean, inplace=True)
    train["history_var"].fillna(global_var, inplace=True)
    test["history_mean"].fillna(global_mean, inplace=True)
    test["history_var"].fillna(global_var, inplace=True)
else:
    train["history_mean"] = 0
    train["history_var"]  = 0
    test["history_mean"]  = 0
    test["history_var"]   = 0


[info] Generating leakage-free history features...


In [7]:
# -----------------------------------------------
# 8. history_a/b mean
# -----------------------------------------------
hist_a_cols = [c for c in train.columns if "history_a_" in c]
hist_b_cols = [c for c in train.columns if "history_b_" in c]
train["history_a_mean"] = train[hist_a_cols].mean(axis=1) if len(hist_a_cols) > 0 else 0
test["history_a_mean"]  = test[hist_a_cols].mean(axis=1)  if len(hist_a_cols) > 0 else 0
train["history_b_mean"] = train[hist_b_cols].mean(axis=1) if len(hist_b_cols) > 0 else 0
test["history_b_mean"]  = test[hist_b_cols].mean(axis=1)  if len(hist_b_cols) > 0 else 0
for col in ["history_a_1", "history_b_2"]:
    if col not in train.columns:
        train[col] = 0
        test[col]  = 0
train["history_clicked_corr"] = 0
test["history_clicked_corr"]  = 0

In [8]:

# -----------------------------------------------
# 9. Flags
# -----------------------------------------------
if "hour" in train.columns:
    train["hour"] = pd.to_numeric(train["hour"], errors="coerce")
    test["hour"]  = pd.to_numeric(test["hour"], errors="coerce")
    train["night_flag"] = train["hour"].between(0, 6, inclusive="both").astype(int)
    test["night_flag"]  = test["hour"].between(0, 6, inclusive="both").astype(int)

if "day_of_week" in train.columns:
    train["day_of_week"] = pd.to_numeric(train["day_of_week"], errors="coerce")
    test["day_of_week"]  = pd.to_numeric(test["day_of_week"], errors="coerce")
    train["is_weekend"]  = train["day_of_week"].isin([5, 6]).astype(int)
    test["is_weekend"]   = test["day_of_week"].isin([5, 6]).astype(int)
    train["tuesday_flag"] = (train["day_of_week"] == 1).astype(int)
    test["tuesday_flag"]  = (test["day_of_week"] == 1).astype(int)

# 신규 유저 flag
if "user_id" in train.columns:
    seen_users = set(train["user_id"].astype(str))
    test["new_user_flag"] = (~test["user_id"].astype(str).isin(seen_users)).astype(int)
    train["new_user_flag"] = 0
else:
    train["new_user_flag"] = 0
    test["new_user_flag"]  = 0


In [9]:
# -----------------------------------------------
# 10. Category dtype
# -----------------------------------------------
for col in ["gender", "age_group", "inventory_id"]:
    if col in train.columns:
        train[col] = train[col].astype("category")
        test[col]  = test[col].astype("category")


In [10]:
# -----------------------------------------------
# 11. 저장 (ID + clicked 항상 보존)
# -----------------------------------------------
cols_to_keep = [id_col]
if target_col is not None:
    cols_to_keep.append(target_col)

train = pd.concat([train[cols_to_keep], train.drop(columns=cols_to_keep)], axis=1)
if id_col in test.columns:
    test = pd.concat([test[[id_col]], test.drop(columns=[id_col])], axis=1)

train.to_parquet("train_basic_3.parquet", index=False, compression="snappy")
test.to_parquet("test_basic_3.parquet", index=False, compression="snappy")

print("✅ Saved train_basic_3.parquet / test_basic_3.parquet (ID + clicked preserved)")


✅ Saved train_basic_3.parquet / test_basic_3.parquet (ID + clicked preserved)
