In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold
from category_encoders import TargetEncoder
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
from lightgbm import LGBMClassifier
from tqdm import tqdm

# Read the data
train_origin = pd.read_csv('/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/train.csv')



In [2]:
# 데이터 샘플링
train = train_origin.set_index('id').astype(str)

# 예측변수 분리 및 train, valid set 분리
X = train.drop(['Response'], axis=1)
y = train['Response'].astype(float)

# Train/Test split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# 타겟 인코딩
enc = TargetEncoder()
X_train = pd.DataFrame(enc.fit_transform(X_train, y_train), index=X_train.index, columns=X_train.columns)
X_valid = pd.DataFrame(enc.transform(X_valid), index=X_valid.index, columns=X_valid.columns)

# 숫자형 데이터로 변환
X_train = X_train.astype(float)
X_valid = X_valid.astype(float)

In [None]:
import optuna
from sklearn.metrics import roc_auc_score

# Optuna 로깅 레벨 설정
optuna.logging.set_verbosity(optuna.logging.WARNING)

# 목적 함수 정의
def objective(trial):
    param = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'max_iter': trial.suggest_int('max_iter', 100, 5000),
        'max_depth': trial.suggest_int('max_depth', 2, 12),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 100),
        'max_bins': trial.suggest_int('max_bins', 32, 255),
        'verbose': 1
    }
    model = HistGradientBoostingClassifier(**param)
    model.fit(X_train, y_train)
    
    y_pred = model.predict_proba(X_valid)[:, 1]
    score = roc_auc_score(y_valid, y_pred)
    return score

# Optuna 스터디 생성 및 최적화 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

# 최적의 하이퍼파라미터 출력
print("Best parameters found by Optuna:", study.best_params)
print("Best ROC AUC score found by Optuna:", study.best_value)


--------------------------------------------------------------------------------------------------------------------------------

In [17]:
def modeling(model, X, y):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state =42)
    train_scores = []
    valid_scores = []

    for fold, (train_index, valid_index) in enumerate(tqdm(skf.split(X_train, y_train), total=skf.get_n_splits(), desc="Folds"), 1):
        X_skf_train, X_skf_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_skf_train, y_skf_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

        model.fit(X_skf_train, y_skf_train)

        train_preds = model.predict_proba(X_skf_train)[:, 1]
        train_auc = roc_auc_score(y_skf_train, train_preds)
        train_scores.append(train_auc)

        valid_preds = model.predict_proba(X_skf_valid)[:, 1]
        valid_auc = roc_auc_score(y_skf_valid, valid_preds)
        valid_scores.append(valid_auc)

        print(f'Fold {fold}: Train ROC AUC: {train_auc:.4f}, Validation ROC AUC: {valid_auc:.4f}')

    print(f'Average Train ROC AUC: {sum(train_scores)/len(train_scores):.4f}')
    print(f'Average Validation ROC AUC: {sum(valid_scores)/len(valid_scores):.4f}')

    test_preds = model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, test_preds)
    print(f'Test ROC AUC: {test_auc:.4f}')


    return train_scores, valid_scores, test_auc

In [19]:
#HistGBR: 0.8768
print("\nHistGradientBoosting:")
hist_param = {'learning_rate': 0.12004570044073418, 'max_iter': 833, 'max_depth': 2, 'min_samples_leaf': 35, 'max_bins': 222}
hist_model = HistGradientBoostingClassifier(**hist_param, random_state=42)
lgbm_train_scores, lgbm_valid_scores, lgbm_test_auc = modeling(hist_model, X, y)


HistGradientBoosting:


Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:  20%|██        | 1/5 [00:06<00:25,  6.46s/it]

Fold 1: Train ROC AUC: 0.8814, Validation ROC AUC: 0.8639


Folds:  40%|████      | 2/5 [00:12<00:19,  6.43s/it]

Fold 2: Train ROC AUC: 0.8828, Validation ROC AUC: 0.8605


Folds:  60%|██████    | 3/5 [00:22<00:15,  7.79s/it]

Fold 3: Train ROC AUC: 0.8799, Validation ROC AUC: 0.8637


Folds:  80%|████████  | 4/5 [00:28<00:07,  7.08s/it]

Fold 4: Train ROC AUC: 0.8775, Validation ROC AUC: 0.8611


Folds: 100%|██████████| 5/5 [00:36<00:00,  7.23s/it]

Fold 5: Train ROC AUC: 0.8820, Validation ROC AUC: 0.8631
Average Train ROC AUC: 0.8807
Average Validation ROC AUC: 0.8625





Test ROC AUC: 0.8607


In [None]:
# LightGBM : 0.8776
print("\nLightGBM:")
best_param = {'lambda_l1': 0.1, 'lambda_l2': 0.1, 'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 100, 'num_leaves': 20, 'verbose':1}
lgbm_model = LGBMClassifier(**best_param, random_state=42)
lgbm_train_scores, lgbm_valid_scores, lgbm_test_auc = modeling(lgbm_model, X, y)