In [16]:
!pip install lightgbm catboost



In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [2]:
file_path = 'monunmon.csv'
data = pd.read_csv(file_path)

data['binary_label'] = data['Label'].apply(lambda x: 0 if x == -1 else 1)  # 이진 레이블 생성
X = data.drop(['Label', 'binary_label'], axis=1)  # 특성 데이터
y = data['binary_label']  # 이진 레이블

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [4]:
# Gradient Boosting Classifier 사용
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)
y_pred_gbc = gbc.predict(X_test)
gbc_acc = accuracy_score(y_test, y_pred_gbc)
gbc_f1 = f1_score(y_test, y_pred_gbc)

print("Gradient Boosting Classifier")
print(f"Accuracy: {gbc_acc:.4f}")
print(f"F1-Score: {gbc_f1:.4f}")

Gradient Boosting Classifier
Accuracy: 0.8974
F1-Score: 0.9436


In [9]:
# XGBoost Classifier
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
xgb_acc = accuracy_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)

# 결과 출력
print("XGBoost Classifier")
print(f"Accuracy: {xgb_acc:.4f}")
print(f"F1-Score: {xgb_f1:.4f}")

Parameters: { "use_label_encoder" } are not used.



XGBoost Classifier
Accuracy: 0.9100
F1-Score: 0.9497


In [19]:
# LightGBM Classifier
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train, y_train)
y_pred_lgbm = lgbm.predict(X_test)
lgbm_acc = accuracy_score(y_test, y_pred_lgbm)
lgbm_f1 = f1_score(y_test, y_pred_lgbm)

# 결과 출력
print("LightGBM Classifier")
print(f"Accuracy: {lgbm_acc:.4f}")
print(f"F1-Score: {lgbm_f1:.4f}")

[LightGBM] [Info] Number of positive: 13282, number of negative: 2118
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004584 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3345
[LightGBM] [Info] Number of data points in the train set: 15400, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.862468 -> initscore=1.835937
[LightGBM] [Info] Start training from score 1.835937
LightGBM Classifier
Accuracy: 0.9105
F1-Score: 0.9504


In [18]:
# CatBoost Classifier
catboost = CatBoostClassifier(random_state=42, verbose=0)  # verbose=0으로 출력 제한
catboost.fit(X_train, y_train)
y_pred_catboost = catboost.predict(X_test)
catboost_acc = accuracy_score(y_test, y_pred_catboost)
catboost_f1 = f1_score(y_test, y_pred_catboost)

# 결과 출력
print("CatBoost Classifier")
print(f"Accuracy: {catboost_acc:.4f}")
print(f"F1-Score: {catboost_f1:.4f}")

CatBoost Classifier
Accuracy: 0.9114
F1-Score: 0.9508


In [6]:
# AdaBoost Classifier 사용
abc = AdaBoostClassifier(random_state=42)
abc.fit(X_train, y_train)
y_pred_abc = abc.predict(X_test)
abc_acc = accuracy_score(y_test, y_pred_abc)
abc_f1 = f1_score(y_test, y_pred_abc)

print("\nAdaBoost Classifier")
print(f"Accuracy: {abc_acc:.4f}")
print(f"F1-Score: {abc_f1:.4f}")




AdaBoost Classifier
Accuracy: 0.8941
F1-Score: 0.9417


성능 개선
1. 데이터 전처리

  a. 데이터 불균형: mon 과 unmon 19000:3000 으로 불균형

        (1)  SMOTE 기법(소수 클래스 늘리기) 사용
      
  b. feature selection -> 보류

  c. 새로운 특성 생성

In [20]:
!pip install imbalanced-learn



In [22]:
from imblearn.over_sampling import SMOTE

data['binary_label'] = data['Label'].apply(lambda x: 0 if x == -1 else 1)  # 이진 레이블 생성
X = data.drop(['Label', 'binary_label'], axis=1)  # 특성 데이터
y = data['binary_label']  # 이진 레이블

# SMOTE 적용
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("Original dataset size:", X.shape[0])
print("Resampled dataset size:", X_resampled.shape[0])

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

Original dataset size: 22000
Resampled dataset size: 38000


In [23]:
# Gradient Boosting Classifier 사용
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)
y_pred_gbc = gbc.predict(X_test)
gbc_acc = accuracy_score(y_test, y_pred_gbc)
gbc_f1 = f1_score(y_test, y_pred_gbc)

print("Gradient Boosting Classifier")
print(f"Accuracy: {gbc_acc:.4f}")
print(f"F1-Score: {gbc_f1:.4f}")

Gradient Boosting Classifier
Accuracy: 0.8896
F1-Score: 0.8957


In [24]:
# AdaBoost Classifier 사용
abc = AdaBoostClassifier(random_state=42)
abc.fit(X_train, y_train)
y_pred_abc = abc.predict(X_test)
abc_acc = accuracy_score(y_test, y_pred_abc)
abc_f1 = f1_score(y_test, y_pred_abc)

print("\nAdaBoost Classifier")
print(f"Accuracy: {abc_acc:.4f}")
print(f"F1-Score: {abc_f1:.4f}")




AdaBoost Classifier
Accuracy: 0.8678
F1-Score: 0.8687


In [25]:
# CatBoost Classifier
catboost = CatBoostClassifier(random_state=42, verbose=0)  # verbose=0으로 출력 제한
catboost.fit(X_train, y_train)
y_pred_catboost = catboost.predict(X_test)
catboost_acc = accuracy_score(y_test, y_pred_catboost)
catboost_f1 = f1_score(y_test, y_pred_catboost)

# 결과 출력
print("CatBoost Classifier")
print(f"Accuracy: {catboost_acc:.4f}")
print(f"F1-Score: {catboost_f1:.4f}")

CatBoost Classifier
Accuracy: 0.9267
F1-Score: 0.9295


In [27]:
# LightGBM Classifier
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train, y_train)
y_pred_lgbm = lgbm.predict(X_test)
lgbm_acc = accuracy_score(y_test, y_pred_lgbm)
lgbm_f1 = f1_score(y_test, y_pred_lgbm)

# 결과 출력
print("LightGBM Classifier")
print(f"Accuracy: {lgbm_acc:.4f}")
print(f"F1-Score: {lgbm_f1:.4f}")

[LightGBM] [Info] Number of positive: 13310, number of negative: 13290
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012266 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 26600, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500376 -> initscore=0.001504
[LightGBM] [Info] Start training from score 0.001504
LightGBM Classifier
Accuracy: 0.9181
F1-Score: 0.9217


In [26]:
# XGBoost Classifier
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
xgb_acc = accuracy_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)

# 결과 출력
print("XGBoost Classifier")
print(f"Accuracy: {xgb_acc:.4f}")
print(f"F1-Score: {xgb_f1:.4f}")

Parameters: { "use_label_encoder" } are not used.



XGBoost Classifier
Accuracy: 0.9306
F1-Score: 0.9327


**GBC와 AdaBoost:**

SMOTE로 생성된 합성 데이터가 원본 데이터와 다소 다른 분포를 가질 경우, 모델 성능이 저하될 가능성이 높음.
특히 AdaBoost는 잘못된 데이터에 민감함.


**CatBoost, LightGBM, XGBoost:**

이러한 알고리즘들은 데이터 노이즈와 불균형에 더 잘 대처하는 구조를 가짐.

#하이퍼파라미터 튜닝

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [29]:
#Adaboost

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
}

ada = AdaBoostClassifier(random_state=42)
grid_search_ada = GridSearchCV(estimator=ada, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2)
grid_search_ada.fit(X_train, y_train)

best_ada = grid_search_ada.best_estimator_
y_pred_ada = best_ada.predict(X_test)
ada_accuracy = accuracy_score(y_test, y_pred_ada)
ada_f1 = f1_score(y_test, y_pred_ada)

# 출력
print("Adaboost Best Parameters:", grid_search_ada.best_params_)
print(f"Adaboost Accuracy: {ada_accuracy:.4f}")
print(f"Adaboost F1-Score: {ada_f1:.4f}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits




[CV] END ................learning_rate=0.01, n_estimators=50; total time=   6.7s




[CV] END ................learning_rate=0.01, n_estimators=50; total time=   3.1s




[CV] END ................learning_rate=0.01, n_estimators=50; total time=   4.2s




[CV] END ................learning_rate=0.01, n_estimators=50; total time=   5.2s




[CV] END ................learning_rate=0.01, n_estimators=50; total time=   3.5s




[CV] END ...............learning_rate=0.01, n_estimators=100; total time=   7.3s




[CV] END ...............learning_rate=0.01, n_estimators=100; total time=   7.0s




[CV] END ...............learning_rate=0.01, n_estimators=100; total time=  11.8s




[CV] END ...............learning_rate=0.01, n_estimators=100; total time=  10.6s




[CV] END ...............learning_rate=0.01, n_estimators=100; total time=  10.6s




[CV] END ...............learning_rate=0.01, n_estimators=200; total time=  22.6s




[CV] END ...............learning_rate=0.01, n_estimators=200; total time=  23.3s




[CV] END ...............learning_rate=0.01, n_estimators=200; total time=  25.5s




[CV] END ...............learning_rate=0.01, n_estimators=200; total time=  14.2s




[CV] END ...............learning_rate=0.01, n_estimators=200; total time=  13.6s




[CV] END .................learning_rate=0.1, n_estimators=50; total time=   3.1s




[CV] END .................learning_rate=0.1, n_estimators=50; total time=   3.1s




[CV] END .................learning_rate=0.1, n_estimators=50; total time=   3.8s




[CV] END .................learning_rate=0.1, n_estimators=50; total time=   3.5s




[CV] END .................learning_rate=0.1, n_estimators=50; total time=   3.1s




[CV] END ................learning_rate=0.1, n_estimators=100; total time=   6.9s




[CV] END ................learning_rate=0.1, n_estimators=100; total time=   6.7s




[CV] END ................learning_rate=0.1, n_estimators=100; total time=   6.9s




[CV] END ................learning_rate=0.1, n_estimators=100; total time=   6.8s




[CV] END ................learning_rate=0.1, n_estimators=100; total time=   6.8s




[CV] END ................learning_rate=0.1, n_estimators=200; total time=  13.4s




[CV] END ................learning_rate=0.1, n_estimators=200; total time=  13.1s




[CV] END ................learning_rate=0.1, n_estimators=200; total time=  13.9s




[CV] END ................learning_rate=0.1, n_estimators=200; total time=  17.9s




[CV] END ................learning_rate=0.1, n_estimators=200; total time=  18.1s




[CV] END .................learning_rate=0.5, n_estimators=50; total time=   3.1s




[CV] END .................learning_rate=0.5, n_estimators=50; total time=   4.1s




[CV] END .................learning_rate=0.5, n_estimators=50; total time=   3.1s




[CV] END .................learning_rate=0.5, n_estimators=50; total time=   3.1s




[CV] END .................learning_rate=0.5, n_estimators=50; total time=   3.7s




[CV] END ................learning_rate=0.5, n_estimators=100; total time=  11.8s




[CV] END ................learning_rate=0.5, n_estimators=100; total time=   7.2s




[CV] END ................learning_rate=0.5, n_estimators=100; total time=   6.1s




[CV] END ................learning_rate=0.5, n_estimators=100; total time=   7.1s




[CV] END ................learning_rate=0.5, n_estimators=100; total time=   6.2s




[CV] END ................learning_rate=0.5, n_estimators=200; total time=  13.3s




[CV] END ................learning_rate=0.5, n_estimators=200; total time=  13.3s




[CV] END ................learning_rate=0.5, n_estimators=200; total time=  13.3s




[CV] END ................learning_rate=0.5, n_estimators=200; total time=  13.4s




[CV] END ................learning_rate=0.5, n_estimators=200; total time=  13.4s




[CV] END .................learning_rate=1.0, n_estimators=50; total time=   3.2s




[CV] END .................learning_rate=1.0, n_estimators=50; total time=   4.0s




[CV] END .................learning_rate=1.0, n_estimators=50; total time=   3.1s




[CV] END .................learning_rate=1.0, n_estimators=50; total time=   3.1s




[CV] END .................learning_rate=1.0, n_estimators=50; total time=   3.2s




[CV] END ................learning_rate=1.0, n_estimators=100; total time=   7.2s




[CV] END ................learning_rate=1.0, n_estimators=100; total time=   6.3s




[CV] END ................learning_rate=1.0, n_estimators=100; total time=   7.2s




[CV] END ................learning_rate=1.0, n_estimators=100; total time=   6.2s




[CV] END ................learning_rate=1.0, n_estimators=100; total time=   7.3s




[CV] END ................learning_rate=1.0, n_estimators=200; total time=  13.3s




[CV] END ................learning_rate=1.0, n_estimators=200; total time=  13.5s




[CV] END ................learning_rate=1.0, n_estimators=200; total time=  13.6s




[CV] END ................learning_rate=1.0, n_estimators=200; total time=  13.5s




[CV] END ................learning_rate=1.0, n_estimators=200; total time=  13.5s




Adaboost Best Parameters: {'learning_rate': 1.0, 'n_estimators': 200}
Adaboost Accuracy: 0.8897
Adaboost F1-Score: 0.8956


In [31]:
#LGBM

param_dist = {
    'num_leaves': [20, 31, 40, 50],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 500],
    'max_depth': [-1, 10, 20],
    'min_child_samples': [10, 20, 30],
}

# RandomizedSearchCV 수행
lgbm = LGBMClassifier(random_state=42)
random_search_lgbm = RandomizedSearchCV(estimator=lgbm, param_distributions=param_dist, n_iter=20, cv=5, scoring='accuracy', verbose=2, random_state=42)
random_search_lgbm.fit(X_train, y_train)

# 최적 모델로 테스트 데이터 평가
best_lgbm = random_search_lgbm.best_estimator_
y_pred_lgbm = best_lgbm.predict(X_test)
lgbm_accuracy = accuracy_score(y_test, y_pred_lgbm)
lgbm_f1 = f1_score(y_test, y_pred_lgbm)

print("LightGBM Best Parameters:", random_search_lgbm.best_params_)
print(f"LightGBM Accuracy: {lgbm_accuracy:.4f}")
print(f"LightGBM F1-Score: {lgbm_f1:.4f}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[LightGBM] [Info] Number of positive: 10648, number of negative: 10632
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001869 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 21280, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500376 -> initscore=0.001504
[LightGBM] [Info] Start training from score 0.001504
[CV] END learning_rate=0.2, max_depth=20, min_child_samples=30, n_estimators=200, num_leaves=20; total time=   0.7s
[LightGBM] [Info] Number of positive: 10648, number of negative: 10632
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003618 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825

In [32]:
#Catboost
param_grid = {
    'iterations': [100, 200, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8, 10],
    'l2_leaf_reg': [1, 3, 5, 7]
}

catboost = CatBoostClassifier(random_state=42, verbose=0)
grid_search_result = catboost.grid_search(param_grid, X_train, y_train, cv=5, stratified=True)

# 최적 모델로 테스트 데이터 평가
best_catboost = CatBoostClassifier(**grid_search_result['params'], random_state=42, verbose=0)
best_catboost.fit(X_train, y_train)
y_pred_catboost = best_catboost.predict(X_test)
catboost_accuracy = accuracy_score(y_test, y_pred_catboost)
catboost_f1 = f1_score(y_test, y_pred_catboost)

print("CatBoost Best Parameters:", grid_search_result['params'])
print(f"CatBoost Accuracy: {catboost_accuracy:.4f}")
print(f"CatBoost F1-Score: {catboost_f1:.4f}")



bestTest = 0.4574837366
bestIteration = 99

0:	loss: 0.4574837	best: 0.4574837 (0)	total: 903ms	remaining: 2m 9s

bestTest = 0.2934673244
bestIteration = 99

1:	loss: 0.2934673	best: 0.2934673 (1)	total: 1.72s	remaining: 2m 2s

bestTest = 0.2590088178
bestIteration = 99

2:	loss: 0.2590088	best: 0.2590088 (2)	total: 2.53s	remaining: 1m 58s

bestTest = 0.4620257781
bestIteration = 99

3:	loss: 0.4620258	best: 0.2590088 (2)	total: 4.11s	remaining: 2m 23s

bestTest = 0.3015910996
bestIteration = 99

4:	loss: 0.3015911	best: 0.2590088 (2)	total: 5.74s	remaining: 2m 39s

bestTest = 0.2638180221
bestIteration = 99

5:	loss: 0.2638180	best: 0.2590088 (2)	total: 6.8s	remaining: 2m 36s

bestTest = 0.4662033519
bestIteration = 99

6:	loss: 0.4662034	best: 0.2590088 (2)	total: 7.6s	remaining: 2m 28s

bestTest = 0.3018752537
bestIteration = 99

7:	loss: 0.3018753	best: 0.2590088 (2)	total: 8.43s	remaining: 2m 23s

bestTest = 0.2665907447
bestIteration = 99

8:	loss: 0.2665907	best: 0.2590088 (2)	

In [33]:
#XGB

param_dist = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
random_search_xgb = RandomizedSearchCV(estimator=xgb, param_distributions=param_dist, n_iter=20, cv=5, scoring='accuracy', verbose=2, random_state=42)
random_search_xgb.fit(X_train, y_train)

# 최적 모델로 테스트 데이터 평가
best_xgb = random_search_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)

print("XGBoost Best Parameters:", random_search_xgb.best_params_)
print(f"XGBoost Accuracy: {xgb_accuracy:.4f}")
print(f"XGBoost F1-Score: {xgb_f1:.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=200, subsample=0.8; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=200, subsample=0.8; total time=   1.3s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=200, subsample=0.8; total time=   1.3s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=200, subsample=0.8; total time=   1.3s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=200, subsample=0.8; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=3, min_child_weight=5, n_estimators=200, subsample=0.8; total time=   0.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=3, min_child_weight=5, n_estimators=200, subsample=0.8; total time=   3.1s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=3, min_child_weight=5, n_estimators=200, subsample=0.8; total time=   0.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=3, min_child_weight=5, n_estimators=200, subsample=0.8; total time=   0.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=3, min_child_weight=5, n_estimators=200, subsample=0.8; total time=   0.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, min_child_weight=3, n_estimators=500, subsample=0.8; total time=   3.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, min_child_weight=3, n_estimators=500, subsample=0.8; total time=   3.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, min_child_weight=3, n_estimators=500, subsample=0.8; total time=   6.1s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, min_child_weight=3, n_estimators=500, subsample=0.8; total time=   3.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, min_child_weight=3, n_estimators=500, subsample=0.8; total time=  13.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=200, subsample=0.6; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=200, subsample=0.6; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=200, subsample=0.6; total time=   1.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=200, subsample=0.6; total time=   4.0s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=200, subsample=0.6; total time=   1.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=500, subsample=0.6; total time=   2.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=500, subsample=0.6; total time=   2.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=500, subsample=0.6; total time=   2.8s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=500, subsample=0.6; total time=   4.9s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=500, subsample=0.6; total time=   2.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.05, max_depth=3, min_child_weight=3, n_estimators=200, subsample=1.0; total time=   0.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.05, max_depth=3, min_child_weight=3, n_estimators=200, subsample=1.0; total time=   0.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.05, max_depth=3, min_child_weight=3, n_estimators=200, subsample=1.0; total time=   0.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.05, max_depth=3, min_child_weight=3, n_estimators=200, subsample=1.0; total time=   0.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.05, max_depth=3, min_child_weight=3, n_estimators=200, subsample=1.0; total time=   0.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=100, subsample=0.8; total time=   0.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=100, subsample=0.8; total time=   0.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=100, subsample=0.8; total time=   0.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=100, subsample=0.8; total time=   0.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=100, subsample=0.8; total time=   0.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=10, min_child_weight=1, n_estimators=500, subsample=0.8; total time=   7.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=10, min_child_weight=1, n_estimators=500, subsample=0.8; total time=   4.8s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=10, min_child_weight=1, n_estimators=500, subsample=0.8; total time=   7.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=10, min_child_weight=1, n_estimators=500, subsample=0.8; total time=   4.9s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=10, min_child_weight=1, n_estimators=500, subsample=0.8; total time=   7.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=500, subsample=1.0; total time=   1.2s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=500, subsample=1.0; total time=   1.2s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=500, subsample=1.0; total time=   1.2s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=500, subsample=1.0; total time=   1.2s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=500, subsample=1.0; total time=   1.2s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=7, min_child_weight=5, n_estimators=200, subsample=0.8; total time=  12.3s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=7, min_child_weight=5, n_estimators=200, subsample=0.8; total time=   1.9s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=7, min_child_weight=5, n_estimators=200, subsample=0.8; total time=   4.1s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=7, min_child_weight=5, n_estimators=200, subsample=0.8; total time=   1.3s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=7, min_child_weight=5, n_estimators=200, subsample=0.8; total time=   1.3s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, min_child_weight=1, n_estimators=500, subsample=0.8; total time=  19.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, min_child_weight=1, n_estimators=500, subsample=0.8; total time=   6.9s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, min_child_weight=1, n_estimators=500, subsample=0.8; total time=   4.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, min_child_weight=1, n_estimators=500, subsample=0.8; total time=  20.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, min_child_weight=1, n_estimators=500, subsample=0.8; total time=   4.2s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=500, subsample=0.6; total time=   1.8s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=500, subsample=0.6; total time=   5.7s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=500, subsample=0.6; total time=   9.7s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=500, subsample=0.6; total time=   7.3s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=500, subsample=0.6; total time=  14.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=200, subsample=0.8; total time=   1.0s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=200, subsample=0.8; total time=   1.0s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=200, subsample=0.8; total time=   1.0s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=200, subsample=0.8; total time=   1.0s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=7, min_child_weight=5, n_estimators=200, subsample=0.8; total time=   1.0s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=200, subsample=0.6; total time=   0.9s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=200, subsample=0.6; total time=   3.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=200, subsample=0.6; total time=   0.9s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=200, subsample=0.6; total time=   1.0s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=200, subsample=0.6; total time=   1.0s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=10, min_child_weight=5, n_estimators=100, subsample=0.8; total time=   1.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=10, min_child_weight=5, n_estimators=100, subsample=0.8; total time=   1.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=10, min_child_weight=5, n_estimators=100, subsample=0.8; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=10, min_child_weight=5, n_estimators=100, subsample=0.8; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=10, min_child_weight=5, n_estimators=100, subsample=0.8; total time=   4.2s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=500, subsample=1.0; total time=   1.8s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=500, subsample=1.0; total time=   1.8s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=500, subsample=1.0; total time=   1.8s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=500, subsample=1.0; total time=   1.8s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=500, subsample=1.0; total time=   1.8s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=100, subsample=1.0; total time=   0.7s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=100, subsample=1.0; total time=   3.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=100, subsample=1.0; total time=   0.7s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=100, subsample=1.0; total time=   0.7s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=100, subsample=1.0; total time=   0.7s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, min_child_weight=1, n_estimators=100, subsample=1.0; total time=   1.2s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, min_child_weight=1, n_estimators=100, subsample=1.0; total time=   1.2s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, min_child_weight=1, n_estimators=100, subsample=1.0; total time=   1.3s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, min_child_weight=1, n_estimators=100, subsample=1.0; total time=   1.3s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, min_child_weight=1, n_estimators=100, subsample=1.0; total time=   1.2s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=500, subsample=0.8; total time=   1.1s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=500, subsample=0.8; total time=   3.8s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=500, subsample=0.8; total time=   1.1s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=500, subsample=0.8; total time=   1.1s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=500, subsample=0.8; total time=   1.1s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=5, min_child_weight=3, n_estimators=200, subsample=0.6; total time=   0.7s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=5, min_child_weight=3, n_estimators=200, subsample=0.6; total time=   0.7s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=5, min_child_weight=3, n_estimators=200, subsample=0.6; total time=   0.7s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=5, min_child_weight=3, n_estimators=200, subsample=0.6; total time=   0.7s


Parameters: { "use_label_encoder" } are not used.



[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=5, min_child_weight=3, n_estimators=200, subsample=0.6; total time=   0.7s


Parameters: { "use_label_encoder" } are not used.



XGBoost Best Parameters: {'subsample': 0.8, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
XGBoost Accuracy: 0.9435
XGBoost F1-Score: 0.9446


#앙상블 기법

In [45]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier

In [46]:
best_params = {
    "LightGBM": {"learning_rate": 0.2, "n_estimators": 500, "num_leaves": 40, "max_depth": -1},
    "CatBoost": {"iterations": 200, "learning_rate": 0.1, "depth": 10, 'l2_leaf_reg': 1},
    "XGBoost": {"learning_rate": 0.1, "n_estimators": 500, "max_depth": 10, "min_child_weight": 1, "subsample": 0.8, "colsample_bytree": 0.8},
}

In [47]:
models = {
    "lgbm": LGBMClassifier(**best_params["LightGBM"], random_state=42),
    "catboost": CatBoostClassifier(**best_params["CatBoost"], random_state=42, verbose=0),
    "xgb": XGBClassifier(**best_params["XGBoost"], random_state=42, use_label_encoder=False, eval_metric="logloss"),
}

- Votingclassifier 사용

VotingClassifier는 여러 모델의 예측 결과를 결합하여 최종 예측을 수행합니다. Hard Voting(다수결) 또는 Soft Voting(확률 기반 결합)을 선택할 수 있습니다.

In [48]:
# VotingClassifier 정의 (Soft Voting)
voting_clf = VotingClassifier(
    estimators=[
        ('lgbm', lgbm),
        ('catboost', catboost),
        ('xgb', xgb)
    ],
    voting='soft'  # 'soft': 확률 기반, 'hard': 다수결
)

In [49]:
voting_clf.fit(X_train, y_train)

# 예측 및 평가
y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"VotingClassifier Accuracy: {accuracy:.4f}")
print(f"VotingClassifier F1-Score: {f1:.4f}")

[LightGBM] [Info] Number of positive: 13310, number of negative: 13290
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003279 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 26600, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500376 -> initscore=0.001504
[LightGBM] [Info] Start training from score 0.001504


Parameters: { "use_label_encoder" } are not used.



VotingClassifier Accuracy: 0.9368
VotingClassifier F1-Score: 0.9388


- StackingClassifier 사용

StackingClassifier는 여러 모델의 예측 결과를 새로운 모델의 입력으로 사용하여 최종 예측을 수행합니다.

In [58]:
from sklearn.linear_model import LogisticRegression
# StackingClassifier 정의
stacking_clf = StackingClassifier(
    estimators=[
        ('lgbm', lgbm),
        ('catboost', catboost),
        ('xgb', xgb),
        ('ada', ada)
    ],
    final_estimator=LogisticRegression(),  # 최종 조합 모델
    cv=5  # 내부 교차 검증
)

# 앙상블 모델 학습
stacking_clf.fit(X_train, y_train)

# 예측 및 평가
y_pred = stacking_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"StackingClassifier Accuracy: {accuracy:.4f}")
print(f"StackingClassifier F1-Score: {f1:.4f}")

[LightGBM] [Info] Number of positive: 13310, number of negative: 13290
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001551 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 26600, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500376 -> initscore=0.001504
[LightGBM] [Info] Start training from score 0.001504


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 10648, number of negative: 10632
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003660 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 21280, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500376 -> initscore=0.001504
[LightGBM] [Info] Start training from score 0.001504
[LightGBM] [Info] Number of positive: 10648, number of negative: 10632
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003694 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 21280, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500376 -> initscore=0.001504
[LightGBM] [Info] Start training from score 0.001504
[LightGBM] [Info

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



StackingClassifier Accuracy: 0.9427
StackingClassifier F1-Score: 0.9436


In [59]:
#최종 모델로 gradient Boosting 사
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier

# StackingClassifier 정의
stacking_clf = StackingClassifier(
    estimators=[
        ('lgbm', lgbm),
        ('catboost', catboost),
        ('xgb', xgb),
        ('ada', ada)
    ],
    final_estimator= GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, max_depth=3, random_state=42),
    cv=5
)

# 앙상블 모델 학습
stacking_clf.fit(X_train, y_train)

# 예측 및 평가
y_pred = stacking_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"StackingClassifier Accuracy: {accuracy:.4f}")
print(f"StackingClassifier F1-Score: {f1:.4f}")

[LightGBM] [Info] Number of positive: 13310, number of negative: 13290
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004524 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 26600, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500376 -> initscore=0.001504
[LightGBM] [Info] Start training from score 0.001504


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 10648, number of negative: 10632
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005180 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 21280, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500376 -> initscore=0.001504
[LightGBM] [Info] Start training from score 0.001504
[LightGBM] [Info] Number of positive: 10648, number of negative: 10632
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006031 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 21280, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500376 -> initscore=0.001504
[LightGBM] [Info] Start training from score 0.001504
[LightGBM] [Info

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



StackingClassifier Accuracy: 0.9447
StackingClassifier F1-Score: 0.9455


#교차 검증

In [34]:
from sklearn.metrics import make_scorer

In [56]:
best_params = {
    "AdaBoost": {"n_estimators": 200, "learning_rate": 0.1},
    "LightGBM": {"learning_rate": 0.2, "n_estimators": 500, "num_leaves": 40, "max_depth": -1},
    "CatBoost": {"iterations": 200, "learning_rate": 0.1, "depth": 10, 'l2_leaf_reg': 1},
    "XGBoost": {"learning_rate": 0.1, "n_estimators": 500, "max_depth": 10, "min_child_weight": 1, "subsample": 0.8, "colsample_bytree": 0.8},
}

In [57]:
models = {
    "AdaBoost": AdaBoostClassifier(**best_params["AdaBoost"], random_state=42),
    "LightGBM": LGBMClassifier(**best_params["LightGBM"], random_state=42),
    "CatBoost": CatBoostClassifier(**best_params["CatBoost"], random_state=42, verbose=0),
    "XGBoost": XGBClassifier(**best_params["XGBoost"], random_state=42, use_label_encoder=False, eval_metric="logloss"),
}

In [43]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [55]:

# K-Fold 설정
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Scorer 설정
f1_scorer = make_scorer(f1_score)

# 교차 검증 실행
results = {}
for model_name, model in models.items():
    print(f"\nEvaluating {model_name}...")
    accuracy_scores = cross_val_score(model, X_resampled, y_resampled, cv=kfold, scoring='accuracy')
    f1_scores = cross_val_score(model, X_resampled, y_resampled, cv=kfold, scoring=f1_scorer)

    results[model_name] = {
        "Accuracy (mean)": accuracy_scores.mean(),
        "Accuracy (std)": accuracy_scores.std(),
        "F1-Score (mean)": f1_scores.mean(),
        "F1-Score (std)": f1_scores.std(),
    }

for model_name, scores in results.items():
    print(f"\nModel: {model_name}")
    print(f"  Mean Accuracy: {scores['Accuracy (mean)']:.4f} (±{scores['Accuracy (std)']:.4f})")
    print(f"  Mean F1-Score: {scores['F1-Score (mean)']:.4f} (±{scores['F1-Score (std)']:.4f})")


Evaluating lgbm...
[LightGBM] [Info] Number of positive: 15200, number of negative: 15200
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016101 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 30400, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 15200, number of negative: 15200
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008656 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 30400, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 15200, number of negative: 15200
[LightGBM] [Info] Auto-choosing

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




Model: lgbm
  Mean Accuracy: 0.9479 (±0.0019)
  Mean F1-Score: 0.9490 (±0.0018)

Model: catboost
  Mean Accuracy: 0.9333 (±0.0028)
  Mean F1-Score: 0.9354 (±0.0025)

Model: xgb
  Mean Accuracy: 0.9486 (±0.0022)
  Mean F1-Score: 0.9496 (±0.0021)
