In [1]:
# ===============================================
# Toss CTR - LGBM (v3 Hybrid Compatible)
# CORE + TREE_OPT 전용 Feature 사용
# ===============================================
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, average_precision_score
import matplotlib.pyplot as plt



In [2]:
# -----------------------------------------------
# 1️⃣ Load data & feature groups
# -----------------------------------------------
train = pd.read_parquet("train_input_3.parquet")
test  = pd.read_parquet("test_input_3.parquet")
feat_grp = pd.read_csv("feature_groups.csv")

# group 필터링
tree_features = feat_grp.query("group in ['CORE','TREE_OPT']")["column"].tolist()
print(f"[info] Using {len(tree_features)} CORE + TREE_OPT features")


[info] Using 26 CORE + TREE_OPT features


In [3]:
# -----------------------------------------------
# ID / target setup
# -----------------------------------------------
target_col = "clicked"
id_col = "id"

# feature subset
X = train[tree_features]
y = train[target_col]

X_test = test[tree_features]

# -----------------------------------------------
# 2️⃣ dtype 정리 (LightGBM용 category 인식)
# -----------------------------------------------
cat_cols = X.select_dtypes(include=["category", "object"]).columns.tolist()
for c in cat_cols:
    X[c] = X[c].astype("category")
    X_test[c] = X_test[c].astype("category")

print(f"[info] {len(cat_cols)} categorical features detected.")


# -----------------------------------------------
# 3️⃣ Train/Valid split
# -----------------------------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[c] = X[c].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[c] = X_test[c].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[c] = X[c].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_inde

[info] 3 categorical features detected.


In [4]:
# -----------------------------------------------
# 4️⃣ scale_pos_weight (Weighted LogLoss)
# -----------------------------------------------
pos_weight = (len(y_train) - y_train.sum()) / y_train.sum()
print(f"[info] scale_pos_weight = {pos_weight:.2f}")

train_data = lgb.Dataset(X_train, label=y_train, params={"max_bin": 512})
val_data   = lgb.Dataset(X_val, label=y_val, params={"max_bin": 512})


[info] scale_pos_weight = 51.43


In [5]:
# -----------------------------------------------
# 5️⃣ LightGBM Parameters
# -----------------------------------------------
params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "learning_rate": 0.03,
    "num_leaves": 96,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.9,
    "bagging_freq": 5,
    "scale_pos_weight": np.sqrt(pos_weight),
    "min_data_in_leaf": 20,
    "min_gain_to_split": 0.0,
    "n_jobs": -1,
    "seed": 42,
    "verbose": -1,
}

callbacks = [lgb.log_evaluation(period=100)]


In [6]:
# -----------------------------------------------
# 6️⃣ Train base model
# -----------------------------------------------
model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=[train_data, val_data],
    valid_names=["train", "valid"],
    num_boost_round=1000,
    callbacks=callbacks
)


[100]	train's binary_logloss: 0.150783	valid's binary_logloss: 0.151137
[200]	train's binary_logloss: 0.154375	valid's binary_logloss: 0.155034
[300]	train's binary_logloss: 0.154002	valid's binary_logloss: 0.154965
[400]	train's binary_logloss: 0.15353	valid's binary_logloss: 0.15479
[500]	train's binary_logloss: 0.153014	valid's binary_logloss: 0.154539
[600]	train's binary_logloss: 0.152538	valid's binary_logloss: 0.154347
[700]	train's binary_logloss: 0.15206	valid's binary_logloss: 0.154138
[800]	train's binary_logloss: 0.151603	valid's binary_logloss: 0.153943
[900]	train's binary_logloss: 0.151181	valid's binary_logloss: 0.153783
[1000]	train's binary_logloss: 0.150762	valid's binary_logloss: 0.153621


In [7]:
# -----------------------------------------------
# 7️⃣ Feature Importance 기반 Selection
# -----------------------------------------------
imp = pd.DataFrame({
    "feature": model.feature_name(),
    "importance": model.feature_importance(importance_type="gain")
}).sort_values("importance", ascending=False)

imp["norm_imp"] = imp["importance"] / imp["importance"].sum()
imp["cum_ratio"] = imp["norm_imp"].cumsum()

selected_features = imp.loc[imp["cum_ratio"] <= 0.95, "feature"].tolist()
print(f"[info] Initially selected {len(selected_features)} features out of {X.shape[1]} total.")

# 상관관계 제거
corr_matrix = X[selected_features].corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
high_corr_features = [column for column in upper_tri.columns if any(upper_tri[column] > 0.9)]
selected_features = [f for f in selected_features if f not in high_corr_features]
print(f"[info] After removing highly correlated ones (>0.9): {len(selected_features)} remain.")

imp.to_csv("feature_importance_v3_full.csv", index=False)
pd.Series(selected_features, name="selected_features_v3").to_csv("selected_features_v3.csv", index=False)



[info] Initially selected 13 features out of 26 total.
[info] After removing highly correlated ones (>0.9): 12 remain.


In [8]:
# -----------------------------------------------
# 8️⃣ Retrain with selected features
# -----------------------------------------------
X_train_sel = X_train[selected_features]
X_val_sel   = X_val[selected_features]
X_test_sel  = X_test[selected_features]

# dtype 유지 (category형 그대로)
for c in cat_cols:
    if c in X_train_sel.columns:
        X_train_sel[c] = X_train_sel[c].astype("category")
        X_val_sel[c]   = X_val_sel[c].astype("category")
        X_test_sel[c]  = X_test_sel[c].astype("category")

train_data_sel = lgb.Dataset(X_train_sel, label=y_train, params={"max_bin": 512})
val_data_sel   = lgb.Dataset(X_val_sel, label=y_val, params={"max_bin": 512})

print("\n[phase 2] Training model with selected features...")

model_sel = lgb.train(
    params=params,
    train_set=train_data_sel,
    valid_sets=[train_data_sel, val_data_sel],
    valid_names=["train", "valid"],
    num_boost_round=900,
    callbacks=callbacks
)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_sel[c] = X_train_sel[c].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val_sel[c]   = X_val_sel[c].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_sel[c]  = X_test_sel[c].astype("category")
A value is trying to be set on a copy of a slice from a


[phase 2] Training model with selected features...
[100]	train's binary_logloss: 0.150865	valid's binary_logloss: 0.151233
[200]	train's binary_logloss: 0.154504	valid's binary_logloss: 0.155176
[300]	train's binary_logloss: 0.154152	valid's binary_logloss: 0.15512
[400]	train's binary_logloss: 0.153692	valid's binary_logloss: 0.154947
[500]	train's binary_logloss: 0.153188	valid's binary_logloss: 0.154708
[600]	train's binary_logloss: 0.15273	valid's binary_logloss: 0.154511
[700]	train's binary_logloss: 0.152277	valid's binary_logloss: 0.154313
[800]	train's binary_logloss: 0.151839	valid's binary_logloss: 0.154128
[900]	train's binary_logloss: 0.15144	valid's binary_logloss: 0.15397


In [9]:
# -----------------------------------------------
# 9️⃣ Validation Evaluation
# -----------------------------------------------
y_val_pred = model_sel.predict(X_val_sel)
val_logloss = log_loss(y_val, y_val_pred)
val_ap = average_precision_score(y_val, y_val_pred)
weighted_score = 0.5 * (1 - val_ap) + 0.5 * val_logloss

print(f"\n=== Validation Results ===")
print(f"LogLoss: {val_logloss:.5f}")
print(f"AP: {val_ap:.5f}")
print(f"Weighted Score (lower is better): {weighted_score:.5f}")





=== Validation Results ===
LogLoss: 0.15397
AP: 0.06940
Weighted Score (lower is better): 0.54228


In [None]:
y_val_pred = model_sel.predict(X_val_sel)
from sklearn.metrics import log_loss, average_precision_score
val_logloss = log_loss(y_val, y_val_pred)
val_ap = average_precision_score(y_val, y_val_pred)
weighted_score = 0.5 * (1 - val_ap) + 0.5 * val_logloss

print(f"\n[Validation]")
print(f"LogLoss: {val_logloss:.5f}")
print(f"AP: {val_ap:.5f}")
print(f"Weighted Score (lower is better): {weighted_score:.5f}")


In [None]:
print(y.value_counts(normalize=True))
import matplotlib.pyplot as plt
plt.hist(y_val_pred, bins=100)

In [None]:
# -----------------------------------------------
# 🔟 Submission CSV Generation
# -----------------------------------------------
y_test_pred = model_sel.predict(X_test_sel, num_iteration=model_sel.best_iteration)

submission = pd.DataFrame({
    "ID": test["id"],      # test에는 소문자 id 존재
    "clicked": y_test_pred
}).sort_values("ID").reset_index(drop=True)

submission_path = "toss_lgbm_v6_submit.csv"
submission.to_csv(submission_path, index=False)

print(f"[info] Saved submission file → {submission_path}")
print(submission.head())

# -----------------------------------------------
# ✅ Sanity Check
# -----------------------------------------------
print("\n[check] submission sanity check")
print(f"- ID duplicates: {submission['ID'].duplicated().sum()}")
print(f"- clicked missing: {submission['clicked'].isna().sum()}")
print(f"- clicked range: {submission['clicked'].min():.4f} ~ {submission['clicked'].max():.4f}")