In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold
from category_encoders import TargetEncoder
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from tqdm import tqdm

# Load the raw training set into `train_origin`.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
train_csv_path = '/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/train.csv'
train_origin = pd.read_csv(train_csv_path)



In [2]:
# Index rows by `id` and cast every column to str (no row sampling happens here)
train = train_origin.set_index('id').astype(str)

# Separate the target (cast back to float) from the predictor columns
y = train['Response'].astype(float)
X = train.drop(columns=['Response'])

# Stratified train/validation split; 0.2% of the rows are held out for validation
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.002,
    stratify=y,
    random_state=42,
)

# Target encoding: fit the encoder on the training split only, then apply it to both splits
enc = TargetEncoder()
enc.fit(X_train, y_train)


def _encode_as_float(frame):
    """Apply the fitted `enc` to `frame`, keep its index and column labels, and cast to float."""
    encoded = pd.DataFrame(enc.transform(frame), index=frame.index, columns=frame.columns)
    return encoded.astype(float)


X_train = _encode_as_float(X_train)
X_valid = _encode_as_float(X_valid)


In [3]:
# One record per modeling() call: the validation ROC AUC and the validation predictions
results = []


def modeling(model):
    """Fit `model` on the training split and score it on the validation split.

    Uses the notebook-level `X_train`, `y_train`, `X_valid` and `y_valid`.
    Prints the validation ROC AUC and appends a
    {"ROC AUC": ..., "valid_preds": ...} record to the notebook-level `results`.

    Args:
        model: an estimator exposing `fit(X, y)` and `predict_proba(X)`.

    Returns:
        A `(valid_auc, valid_preds)` tuple, where `valid_preds` is column 1
        of `model.predict_proba(X_valid)`.
    """
    model.fit(X_train, y_train)

    # Keep only column 1 of the predicted probabilities
    positive_proba = model.predict_proba(X_valid)[:, 1]
    score = roc_auc_score(y_valid, positive_proba)
    print("ROC AUC:", score)

    record = {"ROC AUC": score, "valid_preds": positive_proba}
    results.append(record)
    return score, positive_proba

In [4]:
# HistGradientBoosting: validation ROC AUC 0.8900
print("\nHistGradientBoosting:")
hist_param = dict(
    learning_rate=0.12,
    max_iter=3000,
    max_depth=12,
    min_samples_leaf=30,
    max_bins=220,
)
hist_model = HistGradientBoostingClassifier(random_state=42, **hist_param)
modeling(hist_model)


HistGradientBoosting:
ROC AUC: 0.8900060585472794


(0.8900060585472794,
 array([1.38756266e-01, 4.49229271e-05, 5.62528411e-05, ...,
        1.83003008e-01, 6.08986839e-04, 4.21196436e-05]))

In [5]:
# LightGBM: validation ROC AUC 0.8914 in the recorded run
print("\nLightGBM:")
best_param = dict(
    lambda_l1=0.1,
    lambda_l2=0.1,
    learning_rate=0.1,
    max_depth=-1,
    n_estimators=3000,
    num_leaves=20,
    verbose=1,
)
lgbm_model = LGBMClassifier(random_state=42, **best_param)
modeling(lgbm_model)


LightGBM:
[LightGBM] [Info] Number of positive: 1412229, number of negative: 10069559
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.314070 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 737
[LightGBM] [Info] Number of data points in the train set: 11481788, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122997 -> initscore=-1.964348
[LightGBM] [Info] Start training from score -1.964348
ROC AUC: 0.8914351402746308


(0.8914351402746308,
 array([1.32680182e-01, 3.38994506e-05, 1.14102427e-04, ...,
        1.86653755e-01, 6.34865037e-04, 3.06604577e-05]))

In [8]:
# XGBoost: validation ROC AUC 0.8906 in the recorded run
print("\nXGBoost:")
xgb_param = dict(
    n_estimators=50,
    max_depth=12,
    eval_metric='logloss',
)
xgb_model = XGBClassifier(random_state=42, **xgb_param)
modeling(xgb_model)


XGBoost:
ROC AUC: 0.8906290995878086


(0.8906290995878086,
 array([1.4129114e-01, 4.1033691e-05, 4.7287918e-05, ..., 2.1139917e-01,
        1.5866993e-03, 3.3784359e-05], dtype=float32))

In [9]:
# CatBoost: validation ROC AUC 0.8918 in the recorded run
print("\nCatBoost:")
cat_param = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    class_names=[0, 1],
    learning_rate=0.075,
    iterations=3000,
    depth=9,
    random_strength=0,
    l2_leaf_reg=0.5,
    max_leaves=512,
    fold_permutation_block=64,
    allow_writing_files=False,
    verbose=0,
)
cat_model = CatBoostClassifier(random_state=42, **cat_param)
modeling(cat_model)


CatBoost:
ROC AUC: 0.8917800397132521


(0.8917800397132521,
 array([1.26874079e-01, 2.07202762e-05, 1.57144489e-04, ...,
        1.85613321e-01, 1.12444493e-03, 3.87741203e-05]))

In [10]:
# Load the test set.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
test_csv_path = '/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/test.csv'
test_origin = pd.read_csv(test_csv_path)

# Same preprocessing as the training features: `id` index, str cast, then the
# already-fitted encoder `enc`, and finally a float cast.
X_test = test_origin.set_index('id').astype(str)
X_test = pd.DataFrame(
    enc.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
).astype(float)

In [11]:
# Generate test-set predictions with the HistGradientBoosting model.
# Pass the DataFrame itself rather than `X_test.values`: the model was fitted on a
# DataFrame, so stripping the column names can make scikit-learn warn that X has no
# valid feature names, and the per-model loop later in this notebook already
# predicts on `X_test` directly.
y_test_pred = hist_model.predict_proba(X_test)[:, 1]

# Build the submission frame with an 'id' column (the test index) and a 'Response' column
submission = pd.DataFrame({'id': X_test.index, 'Response': y_test_pred})

# Save the predictions to a CSV file
submission.to_csv('hist_predictions.csv', index=False)
print("Predictions saved to 'hist_predictions.csv'")



Predictions saved to 'hist_predictions.csv'


In [None]:
def _as_submission(preds):
    """Pair `preds` with the ids in `X_test.index` as an ('id', 'Response') DataFrame."""
    return pd.DataFrame({'id': X_test.index, 'Response': preds})


# Models from the earlier cells, each tagged with the name used for its output file
classifiers = [
    {'model': fitted, 'name': label}
    for fitted, label in (
        (hist_model, 'hist_model'),
        (lgbm_model, 'lgbm_model'),
        (xgb_model, 'xgb_model'),
        (cat_model, 'cat_model'),
    )
]

sub_results = []

for clf in classifiers:
    model, name = clf['model'], clf['name']

    # Column 1 of predict_proba; also stored on the entry for the ensemble step
    y_test_pred = model.predict_proba(X_test)[:, 1]
    clf['preds'] = y_test_pred

    # Write this model's predictions to its own CSV file
    submission = _as_submission(y_test_pred)
    submission.to_csv(f'{name}_predictions.csv', index=False)
    print(f"Predictions saved to '{name}_predictions.csv'")


# Ensemble prediction: element-wise mean of the per-model predictions
member_preds = [entry['preds'] for entry in classifiers]
ensemble_pred = np.mean(member_preds, axis=0)
submission = _as_submission(ensemble_pred)

# Save the ensemble predictions to a CSV file
submission.to_csv('ensemble_predictions.csv', index=False)
print("Predictions saved to 'ensemble_predictions.csv'")