# ML - LightGBM & XGBoost

##### 라이브러리 설치

In [None]:
import pandas as pd

In [None]:
!pip install lightgbm==4.1.0
!pip install xgboost==2.0.0
!pip install sklearn
!pip install scikit-optimize

Collecting lightgbm==4.1.0
  Using cached lightgbm-4.1.0-py3-none-manylinux_2_28_x86_64.whl (3.1 MB)
Collecting scipy
  Using cached scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
Collecting numpy
  Using cached numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
Installing collected packages: numpy, scipy, lightgbm
Successfully installed lightgbm-4.1.0 numpy-1.24.4 scipy-1.10.1
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mCollecting xgboost==2.0.0
  Using cached xgboost-2.0.0-py3-none-manylinux2014_x86_64.whl (297.1 MB)
Collecting scipy
  Using cached scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
Collecting numpy
  Using cached numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
Installing collected packages: numpy, scipy, xgboost
Successfully installed numpy-1.24.4 scipy-1.10.1 xgboost-2.0.0
You 

##### 데이터 불러오기

In [None]:
# Load the already-preprocessed dataset; the first CSV column becomes the row index
diabate = pd.read_csv(
    'diabetes_generators.csv',
    index_col=0,
)
diabate

Unnamed: 0,Glucose,BMI,Age,Outcome
0,148.0,33.6,50,1
1,85.0,26.6,31,0
2,183.0,23.3,32,1
3,89.0,28.1,21,0
4,137.0,43.1,33,1
...,...,...,...,...
763,101.0,32.9,63,0
764,122.0,36.8,27,0
765,121.0,26.2,30,0
766,126.0,30.1,47,1


In [None]:
# Work on an independent (deep) copy so the loaded frame itself is never modified
diabate_ = diabate.copy(deep=True)

#### Data Splitting

In [None]:
# Separate the label column from the feature columns
y = diabate_[['Outcome']]                       # label, kept as a one-column DataFrame
X = diabate_.drop('Outcome', axis='columns')    # every column except the label

In [None]:
# Split the data: 30% of the rows are held out for testing, with a fixed seed
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

#### Standardization - Robust Scaler

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

# Columns to scale. The transformed matrix carries them in this listed order
# (Age, BMI, Glucose), not in the order they appear in the source frame.
numeric_columns = ['Age', 'BMI', 'Glucose']

# Apply RobustScaler to the numeric columns
robust_step = ('numeric', RobustScaler(), numeric_columns)
preprocessor = ColumnTransformer(transformers=[robust_step])

#### 전처리 Pipeline

In [None]:

# Preprocessing-only pipeline; its scaling statistics are learned from the training split alone
preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])
preprocessor_pipe.fit(X_train)

# Apply the fitted scaling to the training and test splits
x_train_transformed, x_test_transformed = (
    preprocessor_pipe.transform(split) for split in (X_train, X_test)
)

# Preview the first rows of the scaled training matrix
pd.DataFrame(x_train_transformed).head(3)

Unnamed: 0,0,1,2
0,-0.470588,1.077778,0.761905
1,-0.117647,-1.577778,-0.857143
2,-0.117647,0.488889,0.071429


In [None]:
# 모델 학습 및 하이퍼파라미터 최적화를 위한 필요한 라이브러리
import xgboost as xgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from skopt import BayesSearchCV
from collections import OrderedDict


### LightGBM<br>
#### HPO - Bayesian Search

In [None]:
# NOTE: this search is kept commented out; its recorded result is hard-coded
# as `best_params` in the LightGBM training cell.
# %%time
# # LightGBM
# # HPO - Bayesian Search

# from skopt.space import Real, Integer # real- and integer-valued search dimensions

# # Define the hyper-parameter search ranges
# param_grid = {'n_estimators': Integer(1000, 2000),
#               'learning_rate': Real(0.01, 0.1),
#               'num_leaves': Integer(1000, 3000),
#               'max_depth': Integer(-1, 5),
#               'scale_pos_weight': Real(1.2, 1.5)}

# # Search for the best hyper-parameters
# hpo_lgbm = BayesSearchCV(LGBMClassifier(random_state=42), param_grid,
#                     scoring='recall', refit=True,
#                     n_jobs=-1,
#                     n_iter=72,
#                     cv=5) # recall is the scoring metric

# hpo_lgbm.fit(x_train_transformed, y_train)


# print('The best parameters are ', hpo_lgbm.best_params_)

**최적의 hyper-parameter 값**<br>The best parameters are  OrderedDict([('learning_rate', 0.01), ('max_depth', 1), ('n_estimators', 1099), ('num_leaves', 2969), ('scale_pos_weight', 1.5)])
Wall time: 2min 19s

In [None]:
# Best hyper-parameters reported by the Bayesian search, hard-coded so the
# search does not have to be re-run.
best_params = OrderedDict([('learning_rate', 0.01), ('max_depth', 1), ('n_estimators', 1099), ('num_leaves', 2969), ('scale_pos_weight', 1.5)])

# NOTE(review): class_weight='balanced' and boost_from_average=False were not
# set on the estimator used in the search, and class_weight is combined with
# scale_pos_weight here — confirm this is intended.
model_lgbm = LGBMClassifier(n_estimators=best_params['n_estimators'],
                            learning_rate=best_params['learning_rate'],
                            max_depth=best_params['max_depth'],
                            num_leaves=best_params['num_leaves'],
                            scale_pos_weight=best_params['scale_pos_weight'],
                            class_weight='balanced',
                            boost_from_average=False,
                            random_state=42,  # same seed as the estimator used in the search
                            verbose=-1)

# Fit on the scaled training matrix; ravel() turns the one-column label frame into a 1-D array
model_lgbm.fit(x_train_transformed, y_train.values.ravel())

# Hard class predictions and positive-class probabilities on the scaled test matrix
pred = model_lgbm.predict(x_test_transformed)
pred_proba = model_lgbm.predict_proba(x_test_transformed)[:, 1]

# Evaluation metrics; ROC AUC is computed from the probabilities, the rest from the hard labels
confusion_without = confusion_matrix(y_test, pred)
accuracy_without = accuracy_score(y_test, pred)
precision_without = precision_score(y_test, pred)
recall_without = recall_score(y_test, pred)
f1_without = f1_score(y_test, pred)
roc_auc_without = roc_auc_score(y_test, pred_proba)

print('Confusion matrix:')
print(confusion_without)
print()

print('Accuracy :', accuracy_without)
print('Precision :', precision_without)
print('Recall :', recall_without)
print('F1 :', f1_without)
print('ROC_AUC :', roc_auc_without)


Confusion matrix:
[[113  44]
 [ 15  59]]

Accuracy : 0.7445887445887446
Precision : 0.5728155339805825
Recall : 0.7972972972972973
F1 : 0.6666666666666667
ROC_AUC : 0.8376226545016354


### XGBoost
#### HPO - Bayesian Search

In [None]:
# NOTE: this search is kept commented out; its recorded result is hard-coded
# as `best_params` in the XGBoost training cell.
# %%time

# # XGBoost
# # HPO - Bayesian Search
# from skopt.space import Real, Integer

# param_grid_xgb = {'n_estimators': Integer(1000, 2000),
#                   'learning_rate': Real(0.01, 0.1),
#                   'min_child_weight': Integer(2, 10),
#                   'max_depth': Integer(2, 4),
#                   'scale_pos_weight': Real(1.2, 1.5),
#                   }

# hpo_xgb = BayesSearchCV(xgb.XGBClassifier(random_state=42), param_grid_xgb, scoring='recall',
#                         refit=True, n_jobs=-1, n_iter=72, cv=5)

# hpo_xgb.fit(x_train_transformed, y_train)

# # Print the best hyper-parameters
# print('The best parameters are ', hpo_xgb.best_params_)

**최적의 하이퍼파라미터 값**<br>The best parameters are OrderedDict([('learning_rate', 0.01), ('max_depth', 2), ('min_child_weight', 7), ('n_estimators', 1000), ('scale_pos_weight', 1.5)])

In [None]:
# Best hyper-parameters reported by the Bayesian search, hard-coded so the
# search does not have to be re-run.
best_params = OrderedDict([('learning_rate', 0.01), ('max_depth', 2), ('min_child_weight', 7), ('n_estimators', 1000), ('scale_pos_weight', 1.5)])

# `use_label_encoder` is intentionally not passed: it is an obsolete argument
# that the pinned xgboost 2.0.0 no longer uses.
model_xgb = xgb.XGBClassifier(n_estimators=best_params['n_estimators'],
                              learning_rate=best_params['learning_rate'],
                              max_depth=best_params['max_depth'],
                              min_child_weight=best_params['min_child_weight'],
                              scale_pos_weight=best_params['scale_pos_weight'],
                              eval_metric='logloss',
                              random_state=42,  # same seed as the estimator used in the search
                              verbosity=1)

# Fit on the scaled training matrix; ravel() turns the one-column label frame into a 1-D array
model_xgb.fit(x_train_transformed, y_train.values.ravel())

# Hard class predictions and positive-class probabilities on the scaled test matrix
pred = model_xgb.predict(x_test_transformed)
pred_proba = model_xgb.predict_proba(x_test_transformed)[:, 1]

# Evaluation metrics; ROC AUC is computed from the probabilities, the rest from the hard labels
confusion_without = confusion_matrix(y_test, pred)
accuracy_without = accuracy_score(y_test, pred)
precision_without = precision_score(y_test, pred)
recall_without = recall_score(y_test, pred)
f1_without = f1_score(y_test, pred)
roc_auc_without = roc_auc_score(y_test, pred_proba)

print('Confusion matrix:')
print(confusion_without)
print()

print('Accuracy :', accuracy_without)
print('Precision :', precision_without)
print('Recall :', recall_without)
print('F1 :', f1_without)
print('ROC_AUC :', roc_auc_without)

Confusion matrix:
[[133  24]
 [ 31  43]]

Accuracy : 0.7619047619047619
Precision : 0.6417910447761194
Recall : 0.581081081081081
F1 : 0.6099290780141844
ROC_AUC : 0.833706317782751


##### 각 모델 예측 테스트

In [None]:
# Raw (unscaled) sample in the original feature units
sample_data = pd.DataFrame([[171, 45.4, 54]], columns=['Glucose', 'BMI', 'Age'])

# The model was fitted on the output of preprocessor_pipe (RobustScaler-scaled,
# columns in numeric_columns order), so the sample must go through the same
# fitted pipeline before prediction; the transformer picks columns by name.
sample_transformed = preprocessor_pipe.transform(sample_data)
model_lgbm.predict(sample_transformed)

array([1])

In [None]:
# The model was fitted on the output of preprocessor_pipe, so the raw sample
# is scaled with the same fitted pipeline before prediction.
model_xgb.predict(preprocessor_pipe.transform(sample_data))

array([1])

### 모델 저장

In [None]:
# LightGBM
import joblib

# Save the fitted LightGBM model to disk
joblib.dump(model_lgbm, 'ml_lgbm.pkl')

# Load the saved model (use the same file name that was written above)
# loaded_model_lgbm = joblib.load('ml_lgbm.pkl')


['ml_lgbm.pkl']

In [None]:
# XGBoost
import pickle

# Serialize the fitted XGBoost model to disk
with open('ml_xgb.pkl', mode='wb') as model_file:
    pickle.dump(model_xgb, model_file)


In [None]:
# # Export x_test_transformed and y_test as CSV files for the PyCaret model comparison
# # NOTE(review): the transformed columns follow numeric_columns (Age, BMI, Glucose),
# # so they are labelled with numeric_columns rather than ['Glucose', 'BMI', 'Age'].
# x_test_df = pd.DataFrame(x_test_transformed, columns=numeric_columns)
# x_test_df.to_csv('x_test_df.csv')
# y_test_df = pd.DataFrame(y_test)
# y_test_df.to_csv('y_test_df.csv')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=645823d6-5f8a-49f6-b0db-26e2163dbcc0' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>