In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import TargetEncoder
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import joblib

# Load the raw training table from the local CSV export.
train_origin = pd.read_csv(
    '/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/train.csv'
)

In [None]:
# Data sampling: swap in the commented line below to iterate on a 0.1% sample.
#train = train_origin.sample(frac=0.001, random_state = 42).set_index('id').astype(str)
# Every column is cast to str so that the target encoder treats all features
# (including numeric ones) as categorical.
train = train_origin.set_index('id').astype(str)

# Separate the predictors from the target; the target is cast back to float.
X = train.drop(['Response'], axis=1)
y = train['Response'].astype(float)

# Hold out 20% of the rows as a validation set.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Target encoding.
# `fit_transform` uses shuffled internal cross-fitting, so the encoder is seeded
# like every other stochastic step in this notebook; without `random_state`
# the encoded training features differ from run to run.
enc = TargetEncoder(random_state=42)
X_train = enc.fit_transform(X_train, y_train)
X_valid = enc.transform(X_valid)  # apply the already-fitted encoder to the held-out rows
X_train = X_train.astype(float)
X_valid = X_valid.astype(float)


In [None]:
# One record per fitted model: its validation ROC AUC and the validation-set
# probabilities, kept so they can be averaged into an ensemble later.
results = []


def modeling(model):
    """Fit `model` on the training split and score it on the validation split.

    Reads the notebook-level `X_train`, `y_train`, `X_valid` and `y_valid`,
    prints the validation ROC AUC, and appends a record with the keys
    "ROC AUC" and "valid_preds" to the notebook-level `results` list.

    Args:
        model: an estimator exposing `fit(X, y)` and `predict_proba(X)`.

    Returns:
        A `(auc, probabilities)` tuple, where `probabilities` is the second
        column of `predict_proba` on the validation features.
    """
    model.fit(X_train, y_train)

    # Second predict_proba column = probability of the second class.
    proba = model.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, proba)
    print("ROC AUC:", auc)

    record = {"ROC AUC": auc, "valid_preds": proba}
    results.append(record)
    return auc, proba

LightGBM   ROC-AUC: 0.8854   Time: 2m 32.1s
{'lambda_l1': 0.1, 'lambda_l2': 0.1, 'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 100, 'num_leaves': 20, 'verbose':-1}



In [10]:
# LightGBM, configuration 1 (L1/L2 regularisation, 20 leaves, learning rate 0.1)
print("\nLightGBM:")
best_param = {
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'learning_rate': 0.1,
    'max_depth': -1,
    'n_estimators': 100,
    'num_leaves': 20,
    'verbose': -1,
}
# Alternative GPU configuration, kept for reference only:
#{'objective': 'binary', 'boosting_type': 'gbdt', 'num_leaves': 31, 'learning_rate': 0.05, 'feature_fraction': 0.9, 'n_estimators': 100, 'device': 'gpu', 'gpu_platform_id': 0, 'gpu_device_id': 0, 'metric': 'auc'}
lgbm_model1 = LGBMClassifier(**best_param, random_state=42)

# Fit, score on the validation split, and record the result.
modeling(lgbm_model1)


LightGBM:
ROC AUC: 0.8854378058218718


(0.8854378058218718,
 array([2.69057586e-01, 1.41895106e-01, 3.62949571e-01, ...,
        2.42836373e-04, 3.91265241e-04, 9.96636466e-02]))

LightGBM   ROC-AUC: 0.8848  Time: 2m 20.4s
{'objective': 'binary', 'boosting_type': 'gbdt', 'num_leaves': 31, 'learning_rate': 0.05, 'feature_fraction': 0.9, 'n_estimators': 100, 'device': 'gpu', 'gpu_platform_id': 0, 'gpu_device_id': 0, 'metric': 'auc'}

In [11]:
# LightGBM, configuration 2 (31 leaves, learning rate 0.05, 90% feature sampling)
print("\nLightGBM:")
best_param = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'n_estimators': 100,
    'metric': 'auc',
}
lgbm_model2 = LGBMClassifier(**best_param, random_state=42)

# Fit, score on the validation split, and record the result.
modeling(lgbm_model2)


LightGBM:
[LightGBM] [Info] Number of positive: 1131875, number of negative: 8071963
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.197368 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1188
[LightGBM] [Info] Number of data points in the train set: 9203838, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122979 -> initscore=-1.964521
[LightGBM] [Info] Start training from score -1.964521
ROC AUC: 0.884788781984587


(0.884788781984587,
 array([0.26687232, 0.13061131, 0.32398892, ..., 0.00115292, 0.00118716,
        0.1151039 ]))

XGBoost   ROC-AUC: 0.8857  Time: 1m 34.6s
{'n_estimators': 50, 'max_depth': 12, 'eval_metric':'logloss'}
'use_label_encoder': False, 'tree_method':'gpu_hist'

In [12]:
# XGBoost: 50 trees of depth 12, evaluated with log-loss
print("\nXGBoost:")
xgb_param = {
    'n_estimators': 50,
    'max_depth': 12,
    'eval_metric': 'logloss',
}
xgb_model = XGBClassifier(**xgb_param, random_state=42)

# Fit, score on the validation split, and record the result.
modeling(xgb_model)


XGBoost:


Parameters: { "use_label_encoder" } are not used.



ROC AUC: 0.8856846852175086


(0.8856846852175086,
 array([3.1889963e-01, 1.0619615e-01, 2.8649637e-01, ..., 1.3015227e-04,
        1.6171826e-04, 9.8949939e-02], dtype=float32))

CatBoost   ROC-AUC: 0.8869   Time: 168m 30.2s
iterations:1000 -> 41m 9.1s
iterations:3000 -> 168m 30.2s
{ 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'class_names': [0, 1], 'learning_rate': 0.075, 'iterations': 3000, 'depth': 9, 'random_strength': 0, 'l2_leaf_reg': 0.5, 'max_leaves': 512, 'fold_permutation_block': 64, 'verbose': False, 'allow_writing_files': False }

'task_type': 'GPU'

In [13]:
# CatBoost: 3000 iterations at depth 9 (the slowest model in this notebook)
print("\nCatBoost:")
cat_param = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'class_names': [0, 1],
    'learning_rate': 0.075,
    'iterations': 3000,
    'depth': 9,
    'random_strength': 0,
    'l2_leaf_reg': 0.5,
    'max_leaves': 512,
    'fold_permutation_block': 64,
    'allow_writing_files': False,
    'verbose': 0,
}
cat_model = CatBoostClassifier(**cat_param, random_state=42)

# Fit, score on the validation split, and record the result.
modeling(cat_model)


CatBoost:
ROC AUC: 0.8869308470546298


(0.8869308470546298,
 array([3.19141099e-01, 1.17099170e-01, 3.33643198e-01, ...,
        1.41015313e-04, 3.43219978e-04, 1.05149323e-01]))

In [None]:
# Ensemble (soft voting) validation ROC AUC: 0.8869

In [26]:
# Tabulate the per-model validation records (one row per call to `modeling`).
results_df = pd.DataFrame(results)
print(results_df)

# Soft voting: stack the per-model probability vectors into a
# (n_models, n_valid_rows) matrix and average over the model axis.
# `results` itself is a plain list, so the predictions have to be read from
# the frame's column; indexing the list with a string raises TypeError.
ensemble_preds = np.mean(np.stack(results_df['valid_preds'].tolist()), axis=0)
ensemble_auc = roc_auc_score(y_valid, ensemble_preds)
print("ensemble ROC AUC:", ensemble_auc)

    ROC AUC                                        valid_preds
0  0.885438  [0.269057585848406, 0.14189510575872583, 0.362...
1  0.884789  [0.26687232020494717, 0.130611305217624, 0.323...
2  0.885685  [0.31889963, 0.10619615, 0.28649637, 5.4846005...
3  0.886931  [0.31914109888247305, 0.11709916978790279, 0.3...
ensemble ROC AUC: 0.8869234514462901


In [27]:
# Load the test set
test_origin = pd.read_csv(
    '/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/test.csv'
)

# Move `id` into the index and cast every feature to str, mirroring the
# preparation applied to the training frame.
X_test = test_origin.set_index('id').astype(str)

# Target-encode the features with the encoder fitted on the training split,
# then re-wrap the result so the ids and column names are preserved.
encoded_test = enc.transform(X_test)
X_test = pd.DataFrame(encoded_test, index=X_test.index, columns=X_test.columns)

In [29]:
# Fitted models, paired with short labels used for the output file names.
# The label list replaces the original use of the estimator's repr, which is
# long, can span several lines, and for some estimators embeds a memory address.
classifiers = [lgbm_model1, lgbm_model2, xgb_model, cat_model]
model_names = ['lgbm_model1', 'lgbm_model2', 'xgb_model', 'cat_model']
sub_results = []

for model_name, model in zip(model_names, classifiers):
    # Predict test-set probabilities (second predict_proba column).
    y_test_pred = model.predict_proba(X_test)[:, 1]
    # The "valid_preds" key is kept for symmetry with `results`; the values
    # stored here are TEST-set predictions.
    sub_results.append({"model": model_name, "valid_preds": y_test_pred})

    # Build a frame with the 'id' and 'Response' columns.
    submission = pd.DataFrame({'id': X_test.index, 'Response': y_test_pred})

    # Save this model's predictions to a CSV file.
    submission.to_csv(f'{model_name}_predictions.csv', index=False)
    print(f"Predictions saved to '{model_name}_predictions.csv'")


# Soft-voting ensemble over the TEST predictions collected above (not the
# validation predictions held in `results`).
sub_results = pd.DataFrame(sub_results)
ensemble_pred = np.mean(np.stack(sub_results['valid_preds'].tolist()), axis=0)
# The averaged predictions are a NumPy array with no index; ids come from X_test.
submission = pd.DataFrame({'id': X_test.index, 'Response': ensemble_pred})

# Save the ensemble predictions to a CSV file.
submission.to_csv('ensemble_predictions.csv', index=False)
print("Predictions saved to 'ensemble_predictions.csv'")

In [None]:
# Preview the ensemble submission frame (the original misspelled the
# variable name, which raised NameError).
submission.head()