In [11]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score, log_loss
from lightgbm import early_stopping, log_evaluation


In [4]:

# 데이터 로드
train = pd.read_csv('train_input.csv')
test = pd.read_csv('test_input.csv')

In [5]:
# Feature와 Target 분리
X = train.drop(columns=['clicked'])
y = train['clicked']


In [6]:
# Stratified K-fold 설정 
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [8]:
# LGBM 파라미터 설정

params = {
    'objective': 'binary',
    'metric': ['binary_logloss', 'average_precision'],
    'learning_rate': 0.05,
    'num_leaves': 64,
    'max_depth': -1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 42,
    'n_jobs': -1,
    'verbose': -1
}


In [12]:
# 교차 검증 학습
oof_pred = np.zeros(len(train))
test_pred = np.zeros(len(test))

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f"===== Fold {fold+1} =====")
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    model = LGBMClassifier(**params, n_estimators=1000)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric=['binary_logloss', 'average_precision'],
        callbacks=[early_stopping(100), log_evaluation(100)]
    )

    oof_pred[valid_idx] = model.predict_proba(X_valid)[:, 1]
    test_pred += model.predict_proba(test)[:, 1] / skf.n_splits


===== Fold 1 =====
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.0895931	valid_0's average_precision: 0.051165
[200]	valid_0's binary_logloss: 0.0893697	valid_0's average_precision: 0.0526104
[300]	valid_0's binary_logloss: 0.0893224	valid_0's average_precision: 0.0529837
[400]	valid_0's binary_logloss: 0.0892984	valid_0's average_precision: 0.0531325
[500]	valid_0's binary_logloss: 0.0892849	valid_0's average_precision: 0.0532237
[600]	valid_0's binary_logloss: 0.0892777	valid_0's average_precision: 0.0532568
Early stopping, best iteration is:
[515]	valid_0's binary_logloss: 0.0892792	valid_0's average_precision: 0.0532771
===== Fold 2 =====
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.0895801	valid_0's average_precision: 0.0517593
[200]	valid_0's binary_logloss: 0.0893843	valid_0's average_precision: 0.0529361
[300]	valid_0's binary_logloss: 0.0893309	valid_0's average_precision: 0.05

In [13]:
# 토스대회 기준 산식 반영
def weighted_logloss(y_true, y_pred):
    w0 = 0.5 / (y_true == 0).mean()
    w1 = 0.5 / (y_true == 1).mean()
    weights = np.where(y_true == 1, w1, w0)
    return log_loss(y_true, y_pred, sample_weight=weights)

ap = average_precision_score(y, oof_pred)
wll = weighted_logloss(y, oof_pred)
final_score = 0.5 * ap + 0.5 * (1 - wll)

print(f"AP: {ap:.5f}, Weighted LogLoss: {wll:.5f}, Final Score: {final_score:.5f}")


AP: 0.05339, Weighted LogLoss: 1.85859, Final Score: -0.40260


In [23]:
# 원본 test.parquet에서 ID 불러오기
test_raw = pd.read_parquet('test.parquet')
ID = test_raw['ID']  # ✅ 제출 형식 그대로 'ID' 컬럼명 유지

# 이미 학습 완료된 test_pred 사용
submission = pd.DataFrame({
    'ID': ID,               # ✅ session_id → ID 로 변경
    'clicked': test_pred
})

# 안전하게 저장 (float 형식 + UTF-8)
submission.to_csv(
    'toss_lgbm_v1_submit.csv',
    index=False,
    encoding='utf-8-sig',
    float_format='%.6f'
)

print("✅ 제출 파일 생성 완료 → toss_lgbm_v1_submit.csv (Dacon 형식 완벽 일치)")

✅ 제출 파일 생성 완료 → toss_lgbm_v1_submit.csv (Dacon 형식 완벽 일치)


In [24]:
submission = pd.read_csv("toss_lgbm_v1_submit.csv")
print(submission.head())   # 상위 5행 미리보기

             ID   clicked
0  TEST_0000000  0.007330
1  TEST_0000001  0.019933
2  TEST_0000002  0.022561
3  TEST_0000003  0.020123
4  TEST_0000004  0.017509
