In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

In [7]:
train = pd.read_csv('/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/train.csv')
train = train.sample(frac=0.01, random_state = 42)

In [3]:
train.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
2899125,2899125,Female,41,1,28.0,0,1-2 Year,Yes,29590.0,157.0,46,0
3854901,3854901,Female,40,1,28.0,0,1-2 Year,Yes,42910.0,13.0,245,0
7377384,7377384,Male,31,1,18.0,0,< 1 Year,Yes,30375.0,152.0,31,0
10901782,10901782,Female,24,1,37.0,1,< 1 Year,No,2630.0,152.0,217,0
8099641,8099641,Male,33,1,28.0,1,1-2 Year,No,45224.0,124.0,286,0


In [8]:
# 범주형 변수를 팩터로 변환 (카테고리형)
train.iloc[:,[1,3,4,5,6,7,9]] = train.iloc[:,[1,3,4,5,6,7,9]].astype('category')

# 최소-최대 정규화 (Min-Max 스케일링)
scaler = StandardScaler()
train.iloc[:,[2,8,10]] = scaler.fit_transform(train.iloc[:,[2,8,10]])

# 이분변수 생성: "Annual_Premium" == 2630.0 인 경우
train['Annual_Premium_Binary'] = (train['Annual_Premium'] == 2630.0).astype('category')

# 로그 변환된 "Annual_Premium" 변수 생성
train['Annual_Premium_Log'] = np.where(train['Annual_Premium'] > 0, np.log1p(train['Annual_Premium']), 0)

# 예측에 필요 없는 'id'와 'Annual_Premium' 변수를 드롭
train = train.drop(columns=['id', 'Annual_Premium'])

# 원-핫 인코딩 (One-Hot Encoding)
category_columns = ['Gender', 'Driving_License', 'Region_Code', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Policy_Sales_Channel', 'Annual_Premium_Binary']
train = pd.get_dummies(train, columns=category_columns, drop_first=True, dtype=int)

# XGBoost에서 발생하는 문제 해결
train.columns = train.columns.str.replace('[', '').str.replace(']', '').str.replace('<', '')

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [5]:
train.head() 

Unnamed: 0,Age,Vintage,Response,Annual_Premium_Log,Gender_Male,Driving_License_1,Region_Code_1.0,Region_Code_2.0,Region_Code_3.0,Region_Code_4.0,...,Policy_Sales_Channel_152.0,Policy_Sales_Channel_153.0,Policy_Sales_Channel_154.0,Policy_Sales_Channel_155.0,Policy_Sales_Channel_156.0,Policy_Sales_Channel_157.0,Policy_Sales_Channel_158.0,Policy_Sales_Channel_159.0,Policy_Sales_Channel_160.0,Policy_Sales_Channel_163.0
2899125,0.173456,-1.4752,0,0.0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3854901,0.106775,1.012496,0,0.563943,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7377384,-0.493359,-1.662715,0,0.0,1,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
10901782,-0.960129,0.662468,0,0.0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8099641,-0.359996,1.525037,0,0.641059,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
X = train.drop(['Response'], axis=1)
y = train['Response']

In [10]:
print(X.shape)
print(y.shape)

(115048, 11)
(115048,)


In [11]:
def modeling(model, X, y):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state =42)
    train_scores = []
    valid_scores = []

    for fold, (train_index, valid_index) in enumerate(tqdm(skf.split(X_train, y_train), total=skf.get_n_splits(), desc="Folds"), 1):
        X_skf_train, X_skf_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_skf_train, y_skf_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

        scaler = MinMaxScaler()
        X_skf_train = scaler.fit_transform(X_skf_train)
        X_skf_valid = scaler.transform(X_skf_valid)

        model.fit(X_skf_train, y_skf_train)

        train_preds = model.predict_proba(X_skf_train)[:, 1]
        train_auc = roc_auc_score(y_skf_train, train_preds)
        train_scores.append(train_auc)

        valid_preds = model.predict_proba(X_skf_valid)[:, 1]
        valid_auc = roc_auc_score(y_skf_valid, valid_preds)
        valid_scores.append(valid_auc)

        print(f'Fold {fold}: Train ROC AUC: {train_auc:.4f}, Validation ROC AUC: {valid_auc:.4f}')

    print(f'Average Train ROC AUC: {sum(train_scores)/len(train_scores):.4f}')
    print(f'Average Validation ROC AUC: {sum(valid_scores)/len(valid_scores):.4f}')

    X_test_scaled = scaler.transform(X_test)
    test_preds = model.predict_proba(X_test_scaled)[:, 1]
    test_auc = roc_auc_score(y_test, test_preds)
    print(f'Test ROC AUC: {test_auc:.4f}')


    return train_scores, valid_scores, test_auc

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


In [13]:
# Logistic Regression
print("Logistic Regression:")
lr_model = LogisticRegression(random_state=42)
lr_train_scores, lr_valid_scores, lr_test_auc = modeling(lr_model, X, y)

print("Logistic Regression with balancing:")
lr_model_b = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
lr_train_scores, lr_valid_scores, lr_test_auc = modeling(lr_model_b, X, y)

Logistic Regression:


Folds:   0%|          | 0/5 [00:00<?, ?it/s]


ValueError: could not convert string to float: 'Male'

In [11]:
# Random Forest
print("\nRandom Forest:")
rf_model = RandomForestClassifier(random_state=42)
rf_train_scores, rf_valid_scores, rf_test_auc = modeling(rf_model, X, y)

print("\nRandom Forest with Balancing :")
rf_model_b = RandomForestClassifier(class_weight='balanced', n_estimators=1000, random_state=42)
rf_train_scores, rf_valid_scores, rf_test_auc = modeling(rf_model_b, X, y)


Random Forest:


Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:  20%|██        | 1/5 [00:39<02:36, 39.24s/it]

Fold 1: Train ROC AUC: 1.0000, Validation ROC AUC: 0.8351


Folds:  40%|████      | 2/5 [01:20<02:01, 40.58s/it]

Fold 2: Train ROC AUC: 1.0000, Validation ROC AUC: 0.8381


Folds:  60%|██████    | 3/5 [02:09<01:28, 44.33s/it]

Fold 3: Train ROC AUC: 1.0000, Validation ROC AUC: 0.8406


Folds:  80%|████████  | 4/5 [02:56<00:45, 45.18s/it]

Fold 4: Train ROC AUC: 1.0000, Validation ROC AUC: 0.8327


Folds: 100%|██████████| 5/5 [03:28<00:00, 41.62s/it]

Fold 5: Train ROC AUC: 1.0000, Validation ROC AUC: 0.8317
Average Train ROC AUC: 1.0000
Average Validation ROC AUC: 0.8356





Test ROC AUC: 0.8365

Random Forest with Balancing :


Folds:  20%|██        | 1/5 [07:50<31:21, 470.37s/it]

Fold 1: Train ROC AUC: 1.0000, Validation ROC AUC: 0.8413


Folds:  40%|████      | 2/5 [14:56<22:13, 444.45s/it]

Fold 2: Train ROC AUC: 1.0000, Validation ROC AUC: 0.8408


Folds:  60%|██████    | 3/5 [22:00<14:30, 435.05s/it]

Fold 3: Train ROC AUC: 1.0000, Validation ROC AUC: 0.8454


Folds:  80%|████████  | 4/5 [28:50<07:05, 425.00s/it]

Fold 4: Train ROC AUC: 1.0000, Validation ROC AUC: 0.8363


Folds: 100%|██████████| 5/5 [35:26<00:00, 425.37s/it]

Fold 5: Train ROC AUC: 1.0000, Validation ROC AUC: 0.8367
Average Train ROC AUC: 1.0000
Average Validation ROC AUC: 0.8401





Test ROC AUC: 0.8405


In [12]:
# XGBoost
print("\nXGBoost:")
xgb_model = XGBClassifier(random_state=42)
xgb_train_scores, xgb_valid_scores, xgb_test_auc = modeling(xgb_model, X, y)

print("\nXGBoost with balancing:")
class_weights = [1, (y == 0).sum() / (y == 1).sum()]
xgb_model_b = CatBoostClassifier(class_weights=class_weights, iterations=1000, random_state=42, verbose=0)
xgb_train_scores, xgb_valid_scores, xgb_test_auc = modeling(xgb_model_b, X, y)



XGBoost:


Folds:  20%|██        | 1/5 [00:06<00:25,  6.42s/it]

Fold 1: Train ROC AUC: 0.9078, Validation ROC AUC: 0.8633


Folds:  40%|████      | 2/5 [00:11<00:16,  5.53s/it]

Fold 2: Train ROC AUC: 0.9090, Validation ROC AUC: 0.8612


Folds:  60%|██████    | 3/5 [00:17<00:11,  5.67s/it]

Fold 3: Train ROC AUC: 0.9099, Validation ROC AUC: 0.8676


Folds:  80%|████████  | 4/5 [00:22<00:05,  5.56s/it]

Fold 4: Train ROC AUC: 0.9086, Validation ROC AUC: 0.8580


Folds: 100%|██████████| 5/5 [00:27<00:00,  5.46s/it]

Fold 5: Train ROC AUC: 0.9093, Validation ROC AUC: 0.8601
Average Train ROC AUC: 0.9089
Average Validation ROC AUC: 0.8620





Test ROC AUC: 0.8617

XGBoost with balancing:


Folds:  20%|██        | 1/5 [00:59<03:58, 59.51s/it]

Fold 1: Train ROC AUC: 0.9050, Validation ROC AUC: 0.8644


Folds:  40%|████      | 2/5 [02:02<03:03, 61.27s/it]

Fold 2: Train ROC AUC: 0.9056, Validation ROC AUC: 0.8629


Folds:  60%|██████    | 3/5 [03:39<02:35, 77.95s/it]

Fold 3: Train ROC AUC: 0.9049, Validation ROC AUC: 0.8684


Folds:  80%|████████  | 4/5 [04:18<01:02, 62.26s/it]

Fold 4: Train ROC AUC: 0.9069, Validation ROC AUC: 0.8587


Folds: 100%|██████████| 5/5 [05:17<00:00, 63.59s/it]

Fold 5: Train ROC AUC: 0.9073, Validation ROC AUC: 0.8595
Average Train ROC AUC: 0.9059
Average Validation ROC AUC: 0.8628





Test ROC AUC: 0.8648


In [13]:
# CatBoost
print("\nCatBoost:")
cat_model = CatBoostClassifier(verbose=0, random_state=42)
cat_train_scores, cat_valid_scores, cat_test_auc = modeling(cat_model, X, y)

print("\nCatBoost with balancing:")
class_weights = [1, (y == 0).sum() / (y == 1).sum()]
cat_model_b = CatBoostClassifier(class_weights=class_weights, verbose=0, random_state=42)
cat_train_scores, cat_valid_scores, cat_test_auc = modeling(cat_model_b, X, y)


CatBoost:


Folds:  20%|██        | 1/5 [01:01<04:05, 61.39s/it]

Fold 1: Train ROC AUC: 0.9026, Validation ROC AUC: 0.8661


Folds:  40%|████      | 2/5 [02:22<03:39, 73.18s/it]

Fold 2: Train ROC AUC: 0.9023, Validation ROC AUC: 0.8627


Folds:  60%|██████    | 3/5 [03:14<02:06, 63.21s/it]

Fold 3: Train ROC AUC: 0.9014, Validation ROC AUC: 0.8698


Folds:  80%|████████  | 4/5 [03:41<00:48, 48.87s/it]

Fold 4: Train ROC AUC: 0.9029, Validation ROC AUC: 0.8620


Folds: 100%|██████████| 5/5 [04:08<00:00, 49.64s/it]

Fold 5: Train ROC AUC: 0.9022, Validation ROC AUC: 0.8607
Average Train ROC AUC: 0.9023
Average Validation ROC AUC: 0.8643





Test ROC AUC: 0.8657

CatBoost with balancing:


Folds:  20%|██        | 1/5 [00:30<02:01, 30.44s/it]

Fold 1: Train ROC AUC: 0.9050, Validation ROC AUC: 0.8644


Folds:  40%|████      | 2/5 [01:10<01:48, 36.16s/it]

Fold 2: Train ROC AUC: 0.9056, Validation ROC AUC: 0.8629


Folds:  60%|██████    | 3/5 [01:52<01:17, 38.59s/it]

Fold 3: Train ROC AUC: 0.9049, Validation ROC AUC: 0.8684


Folds:  80%|████████  | 4/5 [02:28<00:37, 37.84s/it]

Fold 4: Train ROC AUC: 0.9069, Validation ROC AUC: 0.8587


Folds: 100%|██████████| 5/5 [03:08<00:00, 37.73s/it]

Fold 5: Train ROC AUC: 0.9073, Validation ROC AUC: 0.8595
Average Train ROC AUC: 0.9059
Average Validation ROC AUC: 0.8628





Test ROC AUC: 0.8648


In [14]:
# LightGBM
print("\nLightGBM:")
lgbm_model = LGBMClassifier(random_state=42)
lgbm_train_scores, lgbm_valid_scores, lgbm_test_auc = modeling(lgbm_model, X, y)

print("\nLightGBM with balancing:")
scale_pos_weights = (y == 0).sum() / (y == 1).sum()
lgbm_model_b = LGBMClassifier(scale_pos_weight=scale_pos_weights, n_estimators=1000, random_state=42)
lgbm_train_scores, lgbm_valid_scores, lgbm_test_auc = modeling(lgbm_model_b, X, y)


LightGBM:


Folds:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019325 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 778
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 112
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  20%|██        | 1/5 [00:04<00:17,  4.43s/it]

Fold 1: Train ROC AUC: 0.8896, Validation ROC AUC: 0.8656
[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001124 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 781
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 113
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  40%|████      | 2/5 [00:08<00:12,  4.05s/it]

Fold 2: Train ROC AUC: 0.8907, Validation ROC AUC: 0.8621
[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004687 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 780
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 113
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  60%|██████    | 3/5 [00:11<00:07,  3.80s/it]

Fold 3: Train ROC AUC: 0.8888, Validation ROC AUC: 0.8686
[LightGBM] [Info] Number of positive: 9055, number of negative: 64576
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022554 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 777
[LightGBM] [Info] Number of data points in the train set: 73631, number of used features: 112
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122978 -> initscore=-1.964526
[LightGBM] [Info] Start training from score -1.964526


Folds:  80%|████████  | 4/5 [00:16<00:04,  4.07s/it]

Fold 4: Train ROC AUC: 0.8905, Validation ROC AUC: 0.8586
[LightGBM] [Info] Number of positive: 9055, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004715 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 778
[LightGBM] [Info] Number of data points in the train set: 73631, number of used features: 112
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122978 -> initscore=-1.964526
[LightGBM] [Info] Start training from score -1.964526


Folds: 100%|██████████| 5/5 [00:19<00:00,  3.99s/it]

Fold 5: Train ROC AUC: 0.8904, Validation ROC AUC: 0.8599
Average Train ROC AUC: 0.8900
Average Validation ROC AUC: 0.8630





Test ROC AUC: 0.8667

LightGBM with balancing:


Folds:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019663 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 778
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 112
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  20%|██        | 1/5 [00:21<01:27, 21.82s/it]

Fold 1: Train ROC AUC: 0.9679, Validation ROC AUC: 0.8563
[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001345 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 781
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 113
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  40%|████      | 2/5 [00:40<00:59, 19.95s/it]

Fold 2: Train ROC AUC: 0.9671, Validation ROC AUC: 0.8538
[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005681 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 780
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 113
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  60%|██████    | 3/5 [01:00<00:39, 19.88s/it]

Fold 3: Train ROC AUC: 0.9670, Validation ROC AUC: 0.8582
[LightGBM] [Info] Number of positive: 9055, number of negative: 64576
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063461 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 777
[LightGBM] [Info] Number of data points in the train set: 73631, number of used features: 112
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122978 -> initscore=-1.964526
[LightGBM] [Info] Start training from score -1.964526


Folds:  80%|████████  | 4/5 [01:22<00:20, 20.86s/it]

Fold 4: Train ROC AUC: 0.9689, Validation ROC AUC: 0.8494
[LightGBM] [Info] Number of positive: 9055, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005495 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 778
[LightGBM] [Info] Number of data points in the train set: 73631, number of used features: 112
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122978 -> initscore=-1.964526
[LightGBM] [Info] Start training from score -1.964526


Folds: 100%|██████████| 5/5 [01:40<00:00, 20.15s/it]

Fold 5: Train ROC AUC: 0.9653, Validation ROC AUC: 0.8492
Average Train ROC AUC: 0.9672
Average Validation ROC AUC: 0.8534





Test ROC AUC: 0.8547


추가적으로 KNN에 대해 검토해보았다.
그냥 KNN을 돌리면 시간이 너무 오래걸려서 PCA로 5개 피쳐로 축소한 뒤 진행하였다. 
성능은 그닥 좋지 못하다.

In [15]:
from sklearn.neighbors import KNeighborsClassifier

# KNN
print("\nKNN:")

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# PCA로 차원 축소와 KNN을 결합한 파이프라인
pipeline = Pipeline([
    ('pca', PCA(n_components=5)),  # 피쳐 수를 줄임
    ('knn', KNeighborsClassifier(n_neighbors=18, p=1, weights='uniform'))
])

knn_train_scores, knn_valid_scores, knn_test_auc = modeling(pipeline , X, y)


KNN:


Folds:  20%|██        | 1/5 [00:07<00:28,  7.19s/it]

Fold 1: Train ROC AUC: 0.8816, Validation ROC AUC: 0.8380


Folds:  40%|████      | 2/5 [00:13<00:19,  6.51s/it]

Fold 2: Train ROC AUC: 0.8820, Validation ROC AUC: 0.8326


Folds:  60%|██████    | 3/5 [00:19<00:13,  6.59s/it]

Fold 3: Train ROC AUC: 0.8820, Validation ROC AUC: 0.8375


Folds:  80%|████████  | 4/5 [00:26<00:06,  6.52s/it]

Fold 4: Train ROC AUC: 0.8826, Validation ROC AUC: 0.8301


Folds: 100%|██████████| 5/5 [00:32<00:00,  6.51s/it]

Fold 5: Train ROC AUC: 0.8829, Validation ROC AUC: 0.8322
Average Train ROC AUC: 0.8822
Average Validation ROC AUC: 0.8341





Test ROC AUC: 0.8327


In [19]:
import time
start_time

1721493948.555815

In [29]:
start_time = time.time()

# LightGBM
print("\nLightGBM:")
lgbm_model = LGBMClassifier(random_state=42)
lgbm_train_scores, lgbm_valid_scores, lgbm_test_auc = modeling(lgbm_model, X, y)

end_time = time.time()
print(end_time - start_time)


LightGBM:


Folds:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021687 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 778
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 112
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  20%|██        | 1/5 [00:03<00:15,  3.86s/it]

Fold 1: Train ROC AUC: 0.8896, Validation ROC AUC: 0.8656
[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 781
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 113
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  40%|████      | 2/5 [00:07<00:10,  3.65s/it]

Fold 2: Train ROC AUC: 0.8907, Validation ROC AUC: 0.8621
[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004846 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 780
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 113
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  60%|██████    | 3/5 [00:10<00:06,  3.48s/it]

Fold 3: Train ROC AUC: 0.8888, Validation ROC AUC: 0.8686
[LightGBM] [Info] Number of positive: 9055, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001186 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 777
[LightGBM] [Info] Number of data points in the train set: 73631, number of used features: 112
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122978 -> initscore=-1.964526
[LightGBM] [Info] Start training from score -1.964526


Folds:  80%|████████  | 4/5 [00:14<00:03,  3.64s/it]

Fold 4: Train ROC AUC: 0.8905, Validation ROC AUC: 0.8586
[LightGBM] [Info] Number of positive: 9055, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001386 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 778
[LightGBM] [Info] Number of data points in the train set: 73631, number of used features: 112
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122978 -> initscore=-1.964526
[LightGBM] [Info] Start training from score -1.964526


Folds: 100%|██████████| 5/5 [00:18<00:00,  3.62s/it]

Fold 5: Train ROC AUC: 0.8904, Validation ROC AUC: 0.8599
Average Train ROC AUC: 0.8900
Average Validation ROC AUC: 0.8630





Test ROC AUC: 0.8667
19.180898189544678


In [30]:
start_time = time.time()

# PCA + LightGBMKNN
print("\nPCA + LightGBMKNN:")

pca_lgbm = Pipeline([
    ('pca', PCA(n_components=5)),  # 피쳐 수를 줄임
    ('lgbm', LGBMClassifier(random_state=42))
])

lgbm_train_scores, lgbm_valid_scores, lgbm_test_auc = modeling(pca_lgbm, X, y)

end_time = time.time()
print(end_time - start_time)


PCA + LightGBMKNN:


Folds:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002214 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  20%|██        | 1/5 [00:05<00:21,  5.47s/it]

Fold 1: Train ROC AUC: 0.8702, Validation ROC AUC: 0.8545
[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000493 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  40%|████      | 2/5 [00:09<00:13,  4.63s/it]

Fold 2: Train ROC AUC: 0.8710, Validation ROC AUC: 0.8500
[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002345 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  60%|██████    | 3/5 [00:13<00:08,  4.42s/it]

Fold 3: Train ROC AUC: 0.8695, Validation ROC AUC: 0.8552
[LightGBM] [Info] Number of positive: 9055, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000506 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 73631, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122978 -> initscore=-1.964526
[LightGBM] [Info] Start training from score -1.964526


Folds:  80%|████████  | 4/5 [00:17<00:04,  4.27s/it]

Fold 4: Train ROC AUC: 0.8713, Validation ROC AUC: 0.8479
[LightGBM] [Info] Number of positive: 9055, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000491 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 73631, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122978 -> initscore=-1.964526
[LightGBM] [Info] Start training from score -1.964526


Folds: 100%|██████████| 5/5 [00:22<00:00,  4.46s/it]

Fold 5: Train ROC AUC: 0.8723, Validation ROC AUC: 0.8488
Average Train ROC AUC: 0.8709
Average Validation ROC AUC: 0.8513





Test ROC AUC: 0.8512
23.30184006690979


In [27]:
start_time = time.time()

# XGBoost
print("\nXGBoost:")
xgb_model = XGBClassifier(random_state=42)
xgb_train_scores, xgb_valid_scores, xgb_test_auc = modeling(xgb_model, X, y)

end_time = time.time()
print(end_time - start_time)



XGBoost:


Folds:  20%|██        | 1/5 [00:06<00:26,  6.53s/it]

Fold 1: Train ROC AUC: 0.9078, Validation ROC AUC: 0.8633


Folds:  40%|████      | 2/5 [00:12<00:19,  6.40s/it]

Fold 2: Train ROC AUC: 0.9090, Validation ROC AUC: 0.8612


Folds:  60%|██████    | 3/5 [00:18<00:12,  6.08s/it]

Fold 3: Train ROC AUC: 0.9099, Validation ROC AUC: 0.8676


Folds:  80%|████████  | 4/5 [00:25<00:06,  6.41s/it]

Fold 4: Train ROC AUC: 0.9086, Validation ROC AUC: 0.8580


Folds: 100%|██████████| 5/5 [00:33<00:00,  6.61s/it]

Fold 5: Train ROC AUC: 0.9093, Validation ROC AUC: 0.8601
Average Train ROC AUC: 0.9089
Average Validation ROC AUC: 0.8620





Test ROC AUC: 0.8617
34.07271909713745


In [28]:
start_time = time.time()

# XGBoost
print("\nXGBoost:")

pca_xgb = Pipeline([
    ('pca', PCA(n_components=5)),  # 피쳐 수를 줄임
    ('xgb', XGBClassifier(random_state=42))
])

lgbm_train_scores, lgbm_valid_scores, lgbm_test_auc = modeling(pca_xgb, X, y)

end_time = time.time()
print(end_time - start_time)


XGBoost:


Folds:  20%|██        | 1/5 [00:04<00:16,  4.06s/it]

Fold 1: Train ROC AUC: 0.8863, Validation ROC AUC: 0.8496


Folds:  40%|████      | 2/5 [00:07<00:11,  3.97s/it]

Fold 2: Train ROC AUC: 0.8888, Validation ROC AUC: 0.8474


Folds:  60%|██████    | 3/5 [00:12<00:08,  4.23s/it]

Fold 3: Train ROC AUC: 0.8850, Validation ROC AUC: 0.8528


Folds:  80%|████████  | 4/5 [00:16<00:04,  4.05s/it]

Fold 4: Train ROC AUC: 0.8881, Validation ROC AUC: 0.8453


Folds: 100%|██████████| 5/5 [00:19<00:00,  3.98s/it]

Fold 5: Train ROC AUC: 0.8890, Validation ROC AUC: 0.8453
Average Train ROC AUC: 0.8875
Average Validation ROC AUC: 0.8481





Test ROC AUC: 0.8465
21.036637783050537


In [31]:
start_time = time.time()

# CatBoost
print("\nCatBoost:")
cat_model = CatBoostClassifier(verbose=0, random_state=42)
cat_train_scores, cat_valid_scores, cat_test_auc = modeling(cat_model, X, y)

end_time = time.time()
print(end_time - start_time)


CatBoost:


Folds:  20%|██        | 1/5 [00:56<03:44, 56.10s/it]

Fold 1: Train ROC AUC: 0.9026, Validation ROC AUC: 0.8661


Folds:  40%|████      | 2/5 [02:12<03:24, 68.14s/it]

Fold 2: Train ROC AUC: 0.9023, Validation ROC AUC: 0.8627


Folds:  60%|██████    | 3/5 [02:54<01:52, 56.05s/it]

Fold 3: Train ROC AUC: 0.9014, Validation ROC AUC: 0.8698


Folds:  80%|████████  | 4/5 [03:40<00:52, 52.17s/it]

Fold 4: Train ROC AUC: 0.9029, Validation ROC AUC: 0.8620


Folds: 100%|██████████| 5/5 [04:07<00:00, 49.51s/it]

Fold 5: Train ROC AUC: 0.9022, Validation ROC AUC: 0.8607
Average Train ROC AUC: 0.9023
Average Validation ROC AUC: 0.8643





Test ROC AUC: 0.8657
248.76237201690674


In [34]:
start_time = time.time()

# CatBoost
print("\nCatBoost:")

pca_cat = Pipeline([
    ('pca', PCA(n_components=5)),  # 피쳐 수를 줄임
    ('xgb', XGBClassifier(random_state=42))
])

cat_train_scores, cat_valid_scores, cat_test_auc = modeling(pca_cat, X, y)

end_time = time.time()
print(end_time - start_time)


CatBoost:


Folds:  20%|██        | 1/5 [00:09<00:39,  9.99s/it]

Fold 1: Train ROC AUC: 0.8883, Validation ROC AUC: 0.8491


Folds:  40%|████      | 2/5 [00:17<00:25,  8.63s/it]

Fold 2: Train ROC AUC: 0.8897, Validation ROC AUC: 0.8461


Folds:  60%|██████    | 3/5 [00:22<00:13,  6.95s/it]

Fold 3: Train ROC AUC: 0.8859, Validation ROC AUC: 0.8522


Folds:  80%|████████  | 4/5 [00:27<00:06,  6.03s/it]

Fold 4: Train ROC AUC: 0.8876, Validation ROC AUC: 0.8465


Folds: 100%|██████████| 5/5 [00:31<00:00,  6.36s/it]

Fold 5: Train ROC AUC: 0.8889, Validation ROC AUC: 0.8464
Average Train ROC AUC: 0.8881
Average Validation ROC AUC: 0.8481





Test ROC AUC: 0.8478
33.40449595451355
