In [1]:
# ===============================================
# Toss CTR - Feature Engineering & Selection (Phase 2)
# ===============================================
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm
import warnings

warnings.filterwarnings("ignore")


In [2]:

# ===============================================
# 1. Load data
# ===============================================
print("📦 Loading data...")
train = pd.read_parquet("train_basic_2.parquet")
test = pd.read_parquet("test_basic_2.parquet")

# Normalise the test identifier column name to lowercase "id".
if "ID" in test.columns:
    test = test.rename(columns={"ID": "id"})

# Reuse train's "id" column when it exists; otherwise add a positional "row_id".
if "id" in train.columns:
    id_col = "id"
else:
    id_col = "row_id"
    train[id_col] = np.arange(len(train))

# Name of the target column, or None when train does not carry it.
if "clicked" in train.columns:
    target_col = "clicked"
else:
    target_col = None

print(f"train shape: {train.shape}, test shape: {test.shape}")

y_col = "clicked"

# Identifier-like columns that appear in at least one of the two frames.
id_cols = []
for candidate in ("row_id", "id", "index"):
    if candidate in train.columns or candidate in test.columns:
        id_cols.append(candidate)


📦 Loading data...
train shape: (10704168, 134), test shape: (1527298, 133)


In [3]:
# Base frames that carry the ID columns (and, for train, the target) through
# the rest of the feature-engineering steps.
out_train_df = pd.DataFrame(index=train.index)
out_test_df = pd.DataFrame(index=test.index)

# Train keeps every available ID column followed by the target;
# test keeps only the available ID columns.
for source, sink, wanted in (
    (train, out_train_df, id_cols + [y_col]),
    (test, out_test_df, id_cols),
):
    for c in wanted:
        if c in source.columns:
            sink[c] = source[c]


In [4]:
# ===============================================
# 2. 기본 인코딩 + 사이클릭 인코딩
# ===============================================
def cyclic_encode(series, period, prefix):
    """Encode a periodic quantity as a sine/cosine pair.

    Values are coerced to numbers (anything unparsable or missing becomes 0),
    wrapped into ``[0, period)`` and projected onto the unit circle.

    Args:
        series: pandas Series holding the periodic values.
        period: Length of one full cycle (e.g. 24 for hours).
        prefix: Prefix for the two output column names.

    Returns:
        DataFrame indexed like ``series`` with columns ``{prefix}_sin`` and
        ``{prefix}_cos``.
    """
    numeric = pd.to_numeric(series, errors="coerce").fillna(0)
    wrapped = numeric % period
    angle = 2 * np.pi * wrapped / period
    columns = {
        f"{prefix}_sin": np.sin(angle),
        f"{prefix}_cos": np.cos(angle),
    }
    return pd.DataFrame(columns, index=series.index)

# Label encoding: each encoder is fitted on the string form of train and test
# together, so every category seen in either frame receives a code.
label_cols = ["gender", "age_group", "user_cluster"]
for col in label_cols:
    if not (col in train.columns and col in test.columns):
        continue
    train_text = train[col].astype(str)
    test_text = test[col].astype(str)
    le = LabelEncoder()
    le.fit(pd.concat([train_text, test_text], axis=0).fillna("nan"))
    out_train_df[col] = le.transform(train_text.fillna("nan"))
    out_test_df[col] = le.transform(test_text.fillna("nan"))

# Cyclic features: (source column, period, output prefix).
# Presence is only checked on train, as in the rest of this cell's copy logic.
cyclic_specs = (
    ("hour", 24, "hour"),
    ("day_of_week", 7, "dow"),
)
for source_col, period, prefix in cyclic_specs:
    if source_col in train.columns:
        out_train_df = pd.concat(
            [out_train_df, cyclic_encode(train[source_col], period, prefix)], axis=1
        )
        out_test_df = pd.concat(
            [out_test_df, cyclic_encode(test[source_col], period, prefix)], axis=1
        )



In [5]:
# ===============================================
# 3. Sequence Features (already created during preprocessing)
# ===============================================
seq_cols = ["seq_length", "seq_length_log"]
for col in seq_cols:
    # Presence is checked on train only; test is read unconditionally.
    if col in train.columns:
        out_train_df[col] = train[col]
        out_test_df[col] = test[col]

# ===============================================
# 4. History Features
# ===============================================
hist_cols = [
    "history_mean", "history_var", "history_a_mean", "history_b_mean",
    "history_clicked_corr", "history_a_1", "history_b_2"
]
for col in hist_cols:
    in_train = col in train.columns
    in_test = col in test.columns
    if not (in_train or in_test):
        # Absent from both frames: nothing to copy.
        continue
    # A column present on only one side is filled with the constant 0 on the
    # other side, so both output frames end up with the same feature.
    out_train_df[col] = train[col] if in_train else 0
    out_test_df[col] = test[col] if in_test else 0


In [6]:

# ===============================================
# 5. CTR & Cross Feature
# ===============================================
def has_cols(df, cols):
    """Return True when every name in ``cols`` is a column of ``df``."""
    for name in cols:
        if name not in df.columns:
            return False
    return True

def frequency_encode(train_s, test_s):
    """Replace each category by its relative frequency in the training data.

    Frequencies are computed on ``train_s`` only (missing values count as
    their own category) and then looked up for both series.

    Args:
        train_s: Training categorical Series; defines the frequency table.
        test_s: Test categorical Series encoded with the training table.

    Returns:
        Tuple ``(train_encoded, test_encoded)`` of float Series. A category
        that never occurs in ``train_s`` is encoded as 0.0, i.e. its actual
        training frequency, rather than being left as NaN.
    """
    freq = train_s.value_counts(dropna=False) / len(train_s)
    # Series.map leaves unmapped keys as NaN; an unseen category has
    # frequency zero in train, so make that explicit.
    train_enc = train_s.map(freq).astype(float).fillna(0.0)
    test_enc = test_s.map(freq).astype(float).fillna(0.0)
    return train_enc, test_enc

def mean_target_encode(trn_cat, trn_y, tst_cat, global_mean=None, min_samples_leaf=50, smoothing=10.0):
    """Smoothed mean-target encoding of a categorical Series.

    Each category's encoding is a blend of the overall target mean and the
    category's own target mean. The blend weight is a sigmoid of
    ``(count - min_samples_leaf) / smoothing``, so rare categories stay close
    to the overall mean.

    Note: training rows are encoded with statistics computed on the same
    rows, including their own target value (no out-of-fold split).

    Args:
        trn_cat: Training categories.
        trn_y: Training target, aligned with ``trn_cat``.
        tst_cat: Test categories to encode with the training statistics.
        global_mean: Prior mean; defaults to ``trn_y.mean()`` (0.0 when
            ``trn_y`` is empty).
        min_samples_leaf: Category count at which the blend weight is 0.5.
        smoothing: Scale of the sigmoid transition.

    Returns:
        Tuple ``(train_encoded, test_encoded)``; categories without training
        statistics are filled with the prior mean.
    """
    prior = global_mean
    if prior is None:
        prior = trn_y.mean() if len(trn_y) else 0.0

    pairs = pd.DataFrame({"cat": trn_cat, "y": trn_y})
    per_cat = pairs.groupby("cat")["y"].agg(["mean", "count"])

    # Sigmoid weight in (0, 1): grows with the number of rows in the category.
    weight = 1 / (1 + np.exp(-(per_cat["count"] - min_samples_leaf) / smoothing))
    encoding = prior * (1 - weight) + per_cat["mean"] * weight

    encoded_train = trn_cat.map(encoding).fillna(prior)
    encoded_test = tst_cat.map(encoding).fillna(prior)
    return encoded_train, encoded_test

def safe_te_or_fe(train_df, test_df, cat_col, y_col="clicked"):
    """Target-encode ``cat_col``, falling back to frequency encoding.

    Frequency encoding is used when the target column is missing from
    ``train_df``, when the target has fewer than two distinct values, when the
    target encoding of the training rows is (numerically) constant, or when
    target encoding raises an error.

    Args:
        train_df: Training frame containing ``cat_col`` and, ideally, ``y_col``.
        test_df: Test frame containing ``cat_col``.
        cat_col: Name of the categorical column to encode.
        y_col: Name of the target column in ``train_df``.

    Returns:
        Tuple ``(train_encoded, test_encoded)`` of Series.
    """
    if y_col not in train_df or train_df[y_col].nunique() < 2:
        return frequency_encode(train_df[cat_col], test_df[cat_col])
    try:
        trn, tst = mean_target_encode(train_df[cat_col], train_df[y_col], test_df[cat_col])
    except Exception:
        # Best-effort fallback for ordinary failures only. A bare ``except``
        # would also swallow KeyboardInterrupt/SystemExit, so interrupting a
        # long run would silently continue with frequency encoding.
        return frequency_encode(train_df[cat_col], test_df[cat_col])
    # A constant encoding carries no signal; use frequencies instead.
    if trn.std() < 1e-10:
        return frequency_encode(train_df[cat_col], test_df[cat_col])
    return trn, tst

# Base CTR columns: copied unchanged from the loaded frames.
# A column is copied only when both frames have it, so the two outputs keep
# the same feature set and a column missing from test cannot raise KeyError.
ctr_cols = ["click_ratio_per_age", "click_ratio_per_hour", "click_ratio_per_dow", "inventory_ctr"]
for col in ctr_cols:
    if col in train.columns and col in test.columns:
        out_train_df[col] = train[col]
        out_test_df[col]  = test[col]


def _cross_key(df, cols):
    """Build a '|'-joined string key from ``cols`` of ``df``, one key per row.

    Uses vectorised string concatenation; it yields the same keys as a
    row-wise ``agg("|".join, axis=1)`` without calling Python once per row.
    """
    key = df[cols[0]].astype(str).fillna("nan")
    for c in cols[1:]:
        key = key + "|" + df[c].astype(str).fillna("nan")
    return key


# Main cross features: (output feature name, source columns to combine).
crosses = [
    ("hour_dow_cross", ["hour", "day_of_week"]),
    ("age_gender_cross", ["age_group", "gender"]),
    ("inventory_id_hour_cross", ["inventory_id", "hour"])
]
for name, cols in crosses:
    # Both frames must carry every source column; otherwise the feature is
    # skipped and left for the feature-audit step to report.
    if has_cols(train, cols) and has_cols(test, cols):
        tr_cat = _cross_key(train, cols)
        ts_cat = _cross_key(test, cols)
        # Name the target column with y_col so it matches what is passed to
        # safe_te_or_fe below.
        tmp_train = pd.DataFrame({"tmp": tr_cat, y_col: train[y_col]})
        tmp_test = pd.DataFrame({"tmp": ts_cat})
        tr_enc, ts_enc = safe_te_or_fe(tmp_train, tmp_test, "tmp", y_col)
        out_train_df[name] = tr_enc
        out_test_df[name]  = ts_enc

# inventory_id encodings: target encoding (with fallback) and frequency encoding.
if "inventory_id" in train.columns and "inventory_id" in test.columns:
    tr_te, ts_te = safe_te_or_fe(train, test, "inventory_id", y_col)
    tr_fe, ts_fe = frequency_encode(train["inventory_id"], test["inventory_id"])
    out_train_df["inventory_id_te"] = tr_te
    out_test_df["inventory_id_te"]  = ts_te
    out_train_df["inventory_id_fe"] = tr_fe
    out_test_df["inventory_id_fe"]  = ts_fe


In [7]:

# ===============================================
# 6. Embedding / Flags / 추가 Feature
# ===============================================
add_cols = [
    "diversity_ratio", "new_user_flag", "is_weekend", "tuesday_flag",
    "night_flag", "pca_component_1", "user_cluster"
]
for col in add_cols:
    # Do not overwrite a feature an earlier step already produced:
    # "user_cluster" is label-encoded earlier in the notebook, and copying the
    # raw column here would silently discard that encoding.
    if col in out_train_df.columns:
        continue
    # Copy only when both frames have the column, so train and test stay in
    # sync and a column missing from test cannot raise KeyError.
    if col in train.columns and col in test.columns:
        out_train_df[col] = train[col]
        out_test_df[col]  = test[col]

# ===============================================
# 7. Scaling (수치형만)
# ===============================================
class StdScalerCols(BaseEstimator, TransformerMixin):
    """Apply a ``StandardScaler`` to a chosen subset of DataFrame columns.

    ``fit`` narrows ``cols`` to the names actually present in the fitted
    frame; ``transform`` returns a copy with those columns replaced by their
    scaled values and all other columns untouched.
    """

    def __init__(self, cols):
        self.cols = cols
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        """Fit the scaler on the requested columns that exist in ``X``."""
        present = [name for name in self.cols if name in X.columns]
        self.scaler.fit(X[present])
        # Remember only the columns that were fitted, for use in transform().
        self.cols = present
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns standardised."""
        scaled = X.copy()
        scaled[self.cols] = self.scaler.transform(scaled[self.cols])
        return scaled

# Scale every non-object column except the ID columns and the target.
# The scaler is fitted on train only and then applied to both frames.
excluded_cols = set(id_cols) | {y_col}
num_cols = [
    c for c in out_train_df.columns
    if c not in excluded_cols and out_train_df[c].dtype != "object"
]
scaler = StdScalerCols(num_cols).fit(out_train_df)
out_train_df, out_test_df = scaler.transform(out_train_df), scaler.transform(out_test_df)



In [8]:
# ===============================================
# 8. Feature Audit (no autocompletion)
# ===============================================

# Priority features (fixed by requirements). The header printed below reports
# the actual length of this list rather than a hard-coded count.
required = [
    "inventory_id_te", "history_a_1", "history_b_2", "seq_length",
    "diversity_ratio", "hour_dow_cross", "age_gender_cross",
    "user_cluster", "inventory_id_hour_cross"
]

# Column presence check
train_cols = set(out_train_df.columns)
test_cols  = set(out_test_df.columns)

missing_train = [c for c in required if c not in train_cols]
missing_test  = [c for c in required if c not in test_cols]

print(f"\n[Feature Audit] Required {len(required)} features presence")
for c in required:
    print(f" - {c:24s} | train: {'OK' if c in train_cols else 'MISS'} | test: {'OK' if c in test_cols else 'MISS'}")

# Report only; missing features are not created automatically.
if missing_train or missing_test:
    print("\n[WARN] Missing required features detected (no autocompletion applied).")
    if missing_train: print(" - Missing in train:", missing_train)
    if missing_test:  print(" - Missing in test :", missing_test)




[Feature Audit] Required 10 features presence
 - inventory_id_te          | train: OK | test: OK
 - history_a_1              | train: OK | test: OK
 - history_b_2              | train: OK | test: OK
 - seq_length               | train: OK | test: OK
 - diversity_ratio          | train: OK | test: OK
 - hour_dow_cross           | train: OK | test: OK
 - age_gender_cross         | train: OK | test: OK
 - user_cluster             | train: OK | test: OK
 - inventory_id_hour_cross  | train: OK | test: OK


In [11]:
# Build per-column audit metrics: constant columns, share of zeros,
# share of non-missing values, number of unique values, etc.
# For large data enable sampling, e.g. AUDIT_SAMPLE = 300000 (None = use all rows).
AUDIT_SAMPLE = 30000

def build_audit(df, name="train", sample=AUDIT_SAMPLE):
    """Compute per-column audit statistics and write them to a CSV file.

    The dtype and the numeric check come from the full frame; the ratios and
    the unique count come from a random sample of ``sample`` rows (fixed seed
    42) when the frame has more rows than ``sample``.

    Args:
        df: Frame to audit.
        name: Tag used in the output file name ``feature_audit_{name}.csv``.
        sample: Maximum number of rows used for the statistics; None uses
            every row.

    Returns:
        DataFrame with one row per column of ``df`` and the columns
        ``column, dtype, non_null_ratio, nunique, zero_ratio, is_constant``,
        sorted so constant and zero-heavy columns come first.
    """
    fields = ["column", "dtype", "non_null_ratio", "nunique", "zero_ratio", "is_constant"]

    use_all_rows = sample is None or len(df) <= sample
    sampled = df if use_all_rows else df.sample(n=sample, random_state=42)

    records = []
    for col in df.columns:
        full_col = df[col]
        part = sampled[col]
        distinct = int(part.nunique(dropna=True))
        if pd.api.types.is_numeric_dtype(full_col):
            zero_share = float((part == 0).mean())
        else:
            # A zero share is not defined for non-numeric columns.
            zero_share = float("nan")
        records.append({
            "column": col,
            "dtype": str(full_col.dtype),
            "non_null_ratio": float(part.notna().mean()),
            "nunique": distinct,
            "zero_ratio": zero_share,
            "is_constant": distinct <= 1,
        })

    audit_df = pd.DataFrame(records, columns=fields).sort_values(
        by=["is_constant", "zero_ratio", "non_null_ratio", "column"],
        ascending=[False, False, True, True],
    )
    audit_df.to_csv(f"feature_audit_{name}.csv", index=False)
    print(f"[Feature Audit] Saved feature_audit_{name}.csv  (rows={len(audit_df)})")
    return audit_df

# Audit the train frame first, then the test frame, with the same sample size.
audit_train, audit_test = (
    build_audit(frame, tag, AUDIT_SAMPLE)
    for frame, tag in ((out_train_df, "train"), (out_test_df, "test"))
)



[Feature Audit] Saved feature_audit_train.csv  (rows=28)
[Feature Audit] Saved feature_audit_test.csv  (rows=27)


In [12]:
# Save the list of columns that feature engineering added on top of the
# loaded (preprocessed) input columns.
# (`train` still holds the preprocessed input loaded earlier in the notebook.)
base_cols = set(train.columns)  # columns of the loaded input frame
# ID / target names count as base columns whenever the output frame has them;
# names already in base_cols need no further handling.
for key in ("row_id", "id", "index", y_col):
    if key in out_train_df.columns:
        base_cols.add(key)

fe_only_cols = [c for c in out_train_df.columns if c not in base_cols]
fe_only_frame = pd.DataFrame({"feature": fe_only_cols})
fe_only_frame.to_csv("fe_only_columns.csv", index=False)
print(f"[Feature Audit] Saved fe_only_columns.csv  (count={len(fe_only_cols)})")

# Final check that the ID and target columns needed downstream exist
# (presence check only; nothing is generated here).
id_present = [c for c in ["row_id", "id"] if c in out_train_df.columns]
if len(id_present) == 0:
    raise ValueError("ID column (row_id or id) is missing in out_train_df.")
if y_col not in out_train_df.columns:
    raise ValueError("Target column 'clicked' is missing in out_train_df.")

print("\n[OK] ID and clicked are present. No autocompletion applied for features.")


[Feature Audit] Saved fe_only_columns.csv  (count=9)

[OK] ID and clicked are present. No autocompletion applied for features.


In [15]:

# ===============================================
# 9. 저장
# ===============================================
# Write both engineered frames to parquet without the index, train first.
for frame, path in (
    (out_train_df, "train_input_2.parquet"),
    (out_test_df, "test_input_2.parquet"),
):
    frame.to_parquet(path, index=False)

print("Saved train_input_2.parquet / test_input_2.parquet")
print(f"train_input cols: {len(out_train_df.columns)}, test_input cols: {len(out_test_df.columns)}")


Saved train_input_2.parquet / test_input_2.parquet
train_input cols: 28, test_input cols: 27
