In [1]:
# ===============================================
# Toss CTR - Preprocessing v2_final_light
# (커널 안정화 버전)
# ===============================================
import pandas as pd
import numpy as np
import os
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [2]:

# Pin the parquet engine to pyarrow (avoids engine/version conflicts).
pd.set_option("io.parquet.engine", "pyarrow")

# Read both raw splits from the working directory, train first.
train, test = map(pd.read_parquet, ("train.parquet", "test.parquet"))

print("Train:", train.shape, "| Test:", test.shape)


Train: (10704179, 119) | Test: (1527298, 119)


In [3]:
# -----------------------------------------------
# 2. Remove exact duplicates (only rows identical in every column)
# -----------------------------------------------
dup_count = train.duplicated().sum()
if dup_count == 0:
    print("[info] 완전 중복 없음")
else:
    print(f"[info] 완전 중복 {dup_count}건 제거 중...")
    # ignore_index=True relabels the surviving rows 0..n-1 in the same step.
    train = train.drop_duplicates(ignore_index=True)
    print(f"[done] 중복 제거 후: {train.shape}")


[info] 완전 중복 11건 제거 중...
[done] 중복 제거 후: (10704168, 119)


In [4]:
# -----------------------------------------------
# 3. feat_e_3 missing-value handling (per-age_group median → global median)
# -----------------------------------------------
# Record which rows were missing before the imputation cells overwrite them.
for frame in (train, test):
    frame["feat_e_3_isna"] = frame["feat_e_3"].isna().astype(int)


In [5]:
# Median mapping (vectorised). The per-age_group medians come from train only
# and are applied to both splits.
age_median = train.groupby("age_group")["feat_e_3"].median()
for frame in (train, test):
    group_fill = frame["age_group"].map(age_median)
    frame["feat_e_3"] = frame["feat_e_3"].fillna(group_fill)


In [6]:
# Final fallback: whatever is still missing after the group-wise fill gets the
# train-wide median (taken from the column as it stands at this point).
global_median = train["feat_e_3"].median()
for frame in (train, test):
    frame["feat_e_3"] = frame["feat_e_3"].fillna(global_median)

In [8]:
# -----------------------------------------------
# 4. Sequence feature generation
# -----------------------------------------------
def _split_seq(seq_value):
    """Split one raw comma-separated `seq` value into its tokens.

    Missing values (None, NaN, pd.NA) and the empty string yield an empty
    list. Stringifying them first would turn them into "None"/"nan"/"" and
    count them as a single item, so a missing sequence would get length 1.
    """
    if seq_value is None or seq_value is pd.NA:
        return []
    if isinstance(seq_value, float) and np.isnan(seq_value):
        return []
    text = str(seq_value)
    return text.split(",") if text else []


def count_unique_items(seq_str):
    """Return the number of distinct tokens in a comma-separated sequence (0 if missing/empty)."""
    return len(set(_split_seq(seq_str)))


def _add_seq_features(frame):
    """Add seq_length, seq_length_log, unique_items and diversity_ratio to `frame` in place.

    Each sequence is split once and both counts are taken from the same token
    list. A missing `seq` column raises KeyError.
    """
    n_rows = len(frame)
    lengths = np.zeros(n_rows, dtype=np.int64)
    uniques = np.zeros(n_rows, dtype=np.int64)
    for pos, raw in enumerate(frame["seq"].to_numpy()):
        tokens = _split_seq(raw)
        lengths[pos] = len(tokens)
        uniques[pos] = len(set(tokens))

    frame["seq_length"] = lengths
    frame["seq_length_log"] = np.log1p(frame["seq_length"])
    frame["unique_items"] = uniques
    # Empty sequences have length 0: turn the divisor into NaN to avoid a
    # division by zero, then report their diversity as 0.
    frame["diversity_ratio"] = (
        frame["unique_items"] / frame["seq_length"].replace(0, np.nan)
    ).fillna(0)


for frame in (train, test):
    _add_seq_features(frame)

# Drop the raw seq column to save memory.
train = train.drop(columns=["seq"], errors="ignore")
test  = test.drop(columns=["seq"], errors="ignore")


In [11]:
# Fail fast if the sequence-feature cell did not produce diversity_ratio; train is checked first.
for frame, missing_msg in (
    (train, "❌ [ERROR] diversity_ratio 생성 실패: train에 컬럼이 없습니다."),
    (test, "❌ [ERROR] diversity_ratio 생성 실패: test에 컬럼이 없습니다."),
):
    if "diversity_ratio" not in frame.columns:
        raise RuntimeError(missing_msg)

# Value check (summary statistics): shapes, null share, and the first three values per split.
train_ratio = train["diversity_ratio"]
test_ratio = test["diversity_ratio"]

print("\n🌈 [CHECK] diversity_ratio 생성 완료")
print(f"  train shape: {train.shape}, test shape: {test.shape}")
print(f"  train diversity_ratio null 비율: {train_ratio.isna().mean():.4f}")
print(f"  test  diversity_ratio null 비율: {test_ratio.isna().mean():.4f}")
print(f"  train diversity_ratio 샘플: {train_ratio.head(3).tolist()}")
print(f"  test  diversity_ratio 샘플: {test_ratio.head(3).tolist()}")


🌈 [CHECK] diversity_ratio 생성 완료
  train shape: (10704168, 123), test shape: (1527298, 123)
  train diversity_ratio null 비율: 0.0000
  test  diversity_ratio null 비율: 0.0000
  train diversity_ratio 샘플: [0.07269155206286837, 0.1128747795414462, 0.07445255474452554]
  test  diversity_ratio 샘플: [0.1282952548330404, 0.15083798882681565, 0.29245283018867924]


In [12]:
# Assign each row a KMeans cluster id from whichever candidate features exist.
# NOTE(review): history_mean / history_var are created by a later cell in this
# notebook, so on a top-to-bottom run they are only available here if the input
# parquet already ships them; otherwise the constant fallback branch runs.
cluster_feats = ["history_mean", "history_var", "seq_length"]
# A feature must exist in BOTH splits, otherwise km.predict on test would fail.
cluster_feats = [c for c in cluster_feats if c in train.columns and c in test.columns]
if len(cluster_feats) >= 2:
    # KMeans rejects NaN input. The history statistics can be NaN (variance of
    # a single-row group, users absent from train), so impute with train
    # medians; a column that is entirely NaN in train falls back to 0.
    cluster_fill = train[cluster_feats].median().fillna(0)
    km = KMeans(n_clusters=10, random_state=42)
    train["user_cluster"] = km.fit_predict(train[cluster_feats].fillna(cluster_fill))
    test["user_cluster"]  = km.predict(test[cluster_feats].fillna(cluster_fill))
else:
    # Too few features to cluster on: emit a constant placeholder column.
    train["user_cluster"] = 0
    test["user_cluster"]  = 0

In [13]:
# -----------------------------------------------
# 5. History feature generation (history_a_1 / b_2 kept)
# -----------------------------------------------
if "user_id" in train.columns:
    # Cast user_id to category dtype.
    train["user_id"] = train["user_id"].astype("category")
    test["user_id"]  = test["user_id"].astype("category")

    # Per-user click mean/variance, learned on train only.
    # observed=True groups only ids that actually occur and avoids the pandas
    # FutureWarning for grouping on a categorical key.
    # NOTE(review): these statistics are computed from `clicked` over all train
    # rows and merged back onto those same rows, so each train row's value
    # includes its own label. Consider an out-of-fold scheme.
    user_stats = (
        train.groupby("user_id", observed=True)["clicked"]
        .agg(history_mean="mean", history_var="var")
        .reset_index()
    )
    # Drop any stale copies first: merging onto a frame that already has these
    # columns (e.g. when this cell is re-run) would produce history_mean_x/_y
    # instead of history_mean.
    stat_cols = ["history_mean", "history_var"]
    train = train.drop(columns=stat_cols, errors="ignore").merge(user_stats, on="user_id", how="left")
    # Left merge: test users that never appear in train get NaN statistics.
    test  = test.drop(columns=stat_cols, errors="ignore").merge(user_stats, on="user_id", how="left")
else:
    # No user key available: emit constant placeholders.
    train["history_mean"] = 0
    train["history_var"]  = 0
    test["history_mean"]  = 0
    test["history_var"]   = 0

# Exclude the aggregates created below so that re-running this cell does not
# fold a previous history_a_mean / history_b_mean into the new row mean.
derived_hist_cols = {"history_a_mean", "history_b_mean"}
hist_a_cols = [c for c in train.columns if "history_a_" in c and c not in derived_hist_cols]
hist_b_cols = [c for c in train.columns if "history_b_" in c and c not in derived_hist_cols]

# Row-wise mean over each history family; 0 when the family has no columns.
train["history_a_mean"] = train[hist_a_cols].mean(axis=1) if len(hist_a_cols) > 0 else 0
test["history_a_mean"]  = test[hist_a_cols].mean(axis=1)  if len(hist_a_cols) > 0 else 0

train["history_b_mean"] = train[hist_b_cols].mean(axis=1) if len(hist_b_cols) > 0 else 0
test["history_b_mean"]  = test[hist_b_cols].mean(axis=1)  if len(hist_b_cols) > 0 else 0

# Guarantee the two individually-kept history columns exist in both splits.
for col in ["history_a_1", "history_b_2"]:
    if col not in train.columns:
        train[col] = 0
        test[col]  = 0

# corr() computation removed (avoids a heavy operation); constant placeholder kept.
train["history_clicked_corr"] = 0
test["history_clicked_corr"]  = 0


In [14]:

# -----------------------------------------------
# 6. Flag features (dtype stabilisation + safe mode)
# -----------------------------------------------

# hour → forced to numeric (unparseable values become NaN, which never sets the flag)
if "hour" in train.columns:
    for frame in (train, test):
        frame["hour"] = pd.to_numeric(frame["hour"], errors="coerce")
        hour_num = frame["hour"]
        # Night = hours 0 through 6, both ends included.
        frame["night_flag"] = (hour_num.ge(0) & hour_num.le(6)).astype(int)

# day_of_week → forced to numeric
if "day_of_week" in train.columns:
    for frame in (train, test):
        frame["day_of_week"] = pd.to_numeric(frame["day_of_week"], errors="coerce")
        dow_num = frame["day_of_week"]
        frame["is_weekend"] = dow_num.isin([5, 6]).astype(int)
        frame["tuesday_flag"] = dow_num.eq(1).astype(int)

# New-user flag based on user_id
if "user_id" not in train.columns:
    train["new_user_flag"] = 0
    test["new_user_flag"]  = 0
else:
    # Ids are compared as strings on both sides.
    seen_users = set(train["user_id"].astype(str))
    is_known_user = test["user_id"].astype(str).isin(seen_users)
    test["new_user_flag"] = (~is_known_user).astype(int)
    # Every train user is, by construction, already seen.
    train["new_user_flag"] = 0



In [15]:

# -----------------------------------------------
# 7. Minimal categorical casting
# -----------------------------------------------
for col in ("gender", "age_group", "inventory_id"):
    if col not in train.columns:
        continue
    train[col] = train[col].astype("category")
    test[col]  = test[col].astype("category")

# -----------------------------------------------
# 8. Key feature verification (minimal printing)
# -----------------------------------------------
must_have = [
    "seq_length", "seq_length_log",
    "history_mean", "history_var",
    "history_a_mean", "history_b_mean",
    "history_a_1", "history_b_2",
    "history_clicked_corr",
    "feat_e_3", "feat_e_3_isna",
]

# One line per required feature, checked against train only.
print("\n📋 필수 피처 존재 여부:")
for col in must_have:
    status_mark = "✅" if col in train.columns else "❌"
    print(f"{col}: {status_mark}")



📋 필수 피처 존재 여부:
seq_length: ✅
seq_length_log: ✅
history_mean: ✅
history_var: ✅
history_a_mean: ✅
history_b_mean: ✅
history_a_1: ✅
history_b_2: ✅
history_clicked_corr: ✅
feat_e_3: ✅
feat_e_3_isna: ✅


In [16]:

# -----------------------------------------------
# 9. Save (snappy compression)
# -----------------------------------------------
for frame, out_path in (
    (train, "train_basic_2.parquet"),
    (test, "test_basic_2.parquet"),
):
    # index=False: the row index is not written to the file.
    frame.to_parquet(out_path, index=False, compression="snappy")

print("✅ 저장 완료: train_basic_2.parquet / test_basic_2.parquet")


✅ 저장 완료: train_basic_2.parquet / test_basic_2.parquet
