In [1]:
# ===============================================
# Toss CTR - LGBM + Feature Selection (Exclude Support)
# ===============================================
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, average_precision_score
import matplotlib.pyplot as plt


In [2]:

# -----------------------------------------------
# 1. Load data
# -----------------------------------------------
# Reads the train/test parquet files and splits the train frame into the
# feature matrix `X` and the label vector `y`.
train = pd.read_parquet("train_input_2.parquet")
test  = pd.read_parquet("test_input_2.parquet")

target_col = "clicked"
if target_col not in train.columns:
    raise KeyError(f"[error] target column '{target_col}' not found in train data")

# Resolve the identifier column explicitly instead of assuming "ID" exists.
# "row_id" keeps priority, then "ID"; lowercase "id" is accepted last because
# that is the spelling the submission step reads from `test`.
id_candidates = ("row_id", "ID", "id")
id_col = next((name for name in id_candidates if name in train.columns), None)
if id_col is None:
    raise KeyError(
        f"[error] none of the id columns {list(id_candidates)} found in train data"
    )

# The identifier and the label must never be used as model features.
X = train.drop(columns=[target_col, id_col])
y = train[target_col]


In [3]:

# -----------------------------------------------
# 2. Features to exclude (just add names here when needed)
# -----------------------------------------------
exclude_features = [
    # e.g. "diversity_ratio", "feat_e_3", "user_cluster"
]
if len(exclude_features) > 0:
    print(f"[info] Excluding {len(exclude_features)} features: {exclude_features}")
    # Only names that actually exist in X are passed to drop(); unknown names
    # are skipped silently rather than raising.
    _present_excluded = [name for name in exclude_features if name in X.columns]
    X = X.drop(columns=_present_excluded, errors="ignore")



In [4]:
# -----------------------------------------------
# 3. Train/validation split
# -----------------------------------------------
# 80/20 split, stratified on the label so both parts keep the same class ratio.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# -----------------------------------------------
# 4. scale_pos_weight (Weighted LogLoss)
# -----------------------------------------------
# Ratio (rows - label sum) / label sum on the training part; with 0/1 labels
# this is negatives per positive.
n_pos = y_train.sum()
n_neg = len(y_train) - n_pos
pos_weight = n_neg / n_pos
print(f"[info] scale_pos_weight = {pos_weight:.2f}")

# Default-binned datasets; the training cell rebuilds both with max_bin=512.
train_data = lgb.Dataset(X_train, label=y_train)
val_data   = lgb.Dataset(X_val, label=y_val)


[info] scale_pos_weight = 51.43


In [13]:
# -----------------------------------------------
# 5. LightGBM parameters
# -----------------------------------------------
# Datasets are rebuilt here with a finer histogram (max_bin=512).
train_data = lgb.Dataset(X_train, label=y_train, params=dict(max_bin=512))
val_data   = lgb.Dataset(X_val, label=y_val, params=dict(max_bin=512))

params = dict(
    objective="binary",
    metric="binary_logloss",
    learning_rate=0.05,           # gentle learning rate
    num_leaves=64,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    scale_pos_weight=pos_weight,  # kept from the class-ratio computation
    min_data_in_leaf=20,
    min_gain_to_split=0.0,
    n_jobs=-1,
    seed=42,
    verbose=-1,
)

# -----------------------------------------------
# 6. Train (early stopping removed)
# -----------------------------------------------
# Only a periodic logger is registered, so all 1000 rounds are always run.
callbacks = [lgb.log_evaluation(period=100)]

model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, val_data],
    valid_names=["train", "valid"],
    callbacks=callbacks,
)



[100]	train's binary_logloss: 0.601335	valid's binary_logloss: 0.601849
[200]	train's binary_logloss: 0.602734	valid's binary_logloss: 0.603505
[300]	train's binary_logloss: 0.600408	valid's binary_logloss: 0.601407
[400]	train's binary_logloss: 0.597291	valid's binary_logloss: 0.598513
[500]	train's binary_logloss: 0.594644	valid's binary_logloss: 0.596094
[600]	train's binary_logloss: 0.591818	valid's binary_logloss: 0.593499
[700]	train's binary_logloss: 0.589164	valid's binary_logloss: 0.591067
[800]	train's binary_logloss: 0.586858	valid's binary_logloss: 0.588999
[900]	train's binary_logloss: 0.58451	valid's binary_logloss: 0.586864
[1000]	train's binary_logloss: 0.582025	valid's binary_logloss: 0.584586


In [14]:
# -----------------------------------------------
# 7. Feature-importance-based selection (improved version)
# -----------------------------------------------
# Step A: rank features by gain and keep those whose cumulative gain share
#         stays within CUM_GAIN_THRESHOLD.
# Step B: among the kept features, drop any whose absolute correlation
#         with a higher-ranked kept feature exceeds CORR_THRESHOLD.
CUM_GAIN_THRESHOLD = 0.95
CORR_THRESHOLD = 0.9

imp = pd.DataFrame({
    "feature": model.feature_name(),
    "importance": model.feature_importance(importance_type="gain"),
}).sort_values("importance", ascending=False)

# A zero (or NaN) total would turn every ratio into NaN and silently select
# nothing, so fail here with a clear message instead.
total_gain = imp["importance"].sum()
if not total_gain > 0:
    raise ValueError("[error] total gain importance is not positive; cannot rank features.")

imp["norm_imp"] = imp["importance"] / total_gain
imp["cum_ratio"] = imp["norm_imp"].cumsum()

selected_features = imp.loc[imp["cum_ratio"] <= CUM_GAIN_THRESHOLD, "feature"].tolist()
if not selected_features:
    # The top feature alone exceeds the threshold: keep it so the retraining
    # step never receives an empty feature list.
    selected_features = imp["feature"].head(1).tolist()
print(f"[info] Initially selected {len(selected_features)} features out of {X.shape[1]} total.")

# `selected_features` is in descending-importance order, so in the strict upper
# triangle each column only sees correlations with higher-ranked features.
# The first (most important) feature therefore can never be dropped.
corr_matrix = X[selected_features].corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
_is_high_corr = (upper_tri > CORR_THRESHOLD).any(axis=0)  # NaN compares as False
high_corr_features = _is_high_corr[_is_high_corr].index.tolist()

_dropped = set(high_corr_features)
selected_features = [f for f in selected_features if f not in _dropped]
print(f"[info] After removing highly correlated ones (>{CORR_THRESHOLD}): {len(selected_features)} remain.")

# Persist the full ranking and the final feature list.
imp.to_csv("feature_importance_full.csv", index=False)
pd.Series(selected_features, name="selected_features").to_csv("selected_features_v2.csv", index=False)



[info] Initially selected 11 features out of 26 total.
[info] After removing highly correlated ones (>0.9): 10 remain.


In [16]:
# -----------------------------------------------
# 8. Retrain with selected features
# -----------------------------------------------
# Strip label/id columns from the test frame if they are present.
_non_feature_cols = [c for c in ["clicked", "ID", "row_id"] if c in test.columns]
X_test = test.drop(columns=_non_feature_cols, errors="ignore")

# Restrict every split to the selected feature subset (same column order).
X_train_sel, X_val_sel, X_test_sel = (
    frame[selected_features] for frame in (X_train, X_val, X_test)
)

train_data_sel = lgb.Dataset(X_train_sel, label=y_train, params=dict(max_bin=512))
val_data_sel   = lgb.Dataset(X_val_sel, label=y_val, params=dict(max_bin=512))

# Same hyper-parameter values as the first-phase `params` dict.
params_sel = dict(
    objective="binary",
    metric="binary_logloss",
    learning_rate=0.05,
    num_leaves=64,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    scale_pos_weight=pos_weight,
    min_data_in_leaf=20,
    min_gain_to_split=0.0,
    n_jobs=-1,
    seed=42,
    verbose=-1,
)

print("\n[phase 2] Training model with selected features...")
callbacks = [lgb.log_evaluation(period=100)]

# Fixed 900 rounds, no early stopping (the first phase ran 1000).
model_sel = lgb.train(
    params_sel,
    train_data_sel,
    num_boost_round=900,
    valid_sets=[train_data_sel, val_data_sel],
    valid_names=["train", "valid"],
    callbacks=callbacks,
)



[phase 2] Training model with selected features...
[100]	train's binary_logloss: 0.602629	valid's binary_logloss: 0.603079
[200]	train's binary_logloss: 0.604077	valid's binary_logloss: 0.604794
[300]	train's binary_logloss: 0.601922	valid's binary_logloss: 0.602876
[400]	train's binary_logloss: 0.598955	valid's binary_logloss: 0.600122
[500]	train's binary_logloss: 0.596442	valid's binary_logloss: 0.597819
[600]	train's binary_logloss: 0.593793	valid's binary_logloss: 0.595392
[700]	train's binary_logloss: 0.591272	valid's binary_logloss: 0.593092
[800]	train's binary_logloss: 0.589121	valid's binary_logloss: 0.591154
[900]	train's binary_logloss: 0.586906	valid's binary_logloss: 0.589121


In [17]:
# -----------------------------------------------
# 9. Validation evaluation
# -----------------------------------------------
# Scores the retrained model on the hold-out split.
# NOTE(review): log_loss is called without sample weights while the model is
# trained with scale_pos_weight — confirm the unweighted loss is the intended
# metric here.
y_val_pred = model_sel.predict(X_val_sel)

val_logloss = log_loss(y_val, y_val_pred)
val_ap = average_precision_score(y_val, y_val_pred)

# Equal-weight blend of (1 - AP) and log loss; lower is better.
_ap_penalty = 1 - val_ap
weighted_score = 0.5 * _ap_penalty + 0.5 * val_logloss

for _line in (
    "\n=== Validation Results ===",
    f"LogLoss: {val_logloss:.5f}",
    f"AP: {val_ap:.5f}",
    f"Weighted Score (lower is better): {weighted_score:.5f}",
):
    print(_line)



=== Validation Results ===
LogLoss: 0.58912
AP: 0.06871
Weighted Score (lower is better): 0.76021


In [23]:
# -----------------------------------------------
# 10. Submission CSV generation (final, for lowercase 'id')
# -----------------------------------------------

# The test frame must carry a lowercase 'id' column.
assert "id" in test.columns, "[error] test 데이터에 'id' 컬럼이 없습니다. 확인해주세요."

# Predicted click probabilities for the test rows.
# NOTE(review): training used no early-stopping callback, so best_iteration is
# presumably unset and LightGBM should fall back to all trees — verify.
y_test_pred = model_sel.predict(X_test_sel, num_iteration=model_sel.best_iteration)

# Build the submission frame (ID + clicked); test uses lowercase 'id'.
submission = pd.DataFrame({"ID": test["id"], "clicked": y_test_pred})
submission = submission.sort_values("ID")
submission = submission.reset_index(drop=True)

# Save to disk.
submission_path = "toss_lgbm_v2_submit.csv"
submission.to_csv(submission_path, index=False)

print(f"[info] Saved submission file → {submission_path}")
print(submission.head())

# -----------------------------------------------
# Sanity check (recommended)
# -----------------------------------------------
# Report duplicate IDs, missing predictions and the prediction range.
_n_dup_ids = submission["ID"].duplicated().sum()
_n_missing = submission["clicked"].isna().sum()
_pred_min = submission["clicked"].min()
_pred_max = submission["clicked"].max()

print("\n[check] submission sanity check")
print(f"- ID 중복 여부: {_n_dup_ids} duplicates")
print(f"- clicked 결측치 여부: {_n_missing} missing")
print(f"- clicked 값 범위: {_pred_min:.4f} ~ {_pred_max:.4f}")


[info] Saved submission file → toss_lgbm_v2_submit.csv
             ID   clicked
0  TEST_0000000  0.433195
1  TEST_0000001  0.369427
2  TEST_0000002  0.402421
3  TEST_0000003  0.325831
4  TEST_0000004  0.348539

[check] submission sanity check
- ID 중복 여부: 0 duplicates
- clicked 결측치 여부: 0 missing
- clicked 값 범위: 0.0091 ~ 0.9909
