In [63]:
import xgboost as xgb
from xgboost import plot_importance
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the imported libraries.
warnings.filterwarnings('ignore')


### 1. Metrics(Accuracy, Precision, Recall, F1_score, AUC)

In [64]:
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics of a binary classifier.

    Args:
        y_test: True labels.
        pred: Predicted class labels. Required, even though the signature
            keeps a ``None`` default for backward compatibility.
        pred_proba: Optional scores for the positive class, forwarded to
            ``roc_auc_score``. When omitted, AUC is neither computed nor printed.

    Returns:
        None. The results are written to standard output.

    Raises:
        ValueError: If ``pred`` is ``None``.
    """
    if pred is None:
        # Fail with an explicit message instead of an obscure error from sklearn.
        raise ValueError('pred (predicted labels) is required')

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # AUC needs scores, not hard labels; compute it before printing so that a
    # failure here leaves no partial report behind.
    roc_auc = None if pred_proba is None else roc_auc_score(y_test, pred_proba)

    print('Confusion Matrix')
    print(confusion)
    summary = 'Accuracy: {0:.4f}, Precision: {1:.4f}, Recall: {2:.4f}, F1: {3:.4f}'.format(
        accuracy, precision, recall, f1)
    if roc_auc is not None:
        summary += ', AUC: {0:.4f}'.format(roc_auc)
    print(summary)

### 2. Dataset Load

In [65]:
# Load the feature table: every column except 'label' is a model input,
# and 'label' is the prediction target.
dataset_name = "feature_permission.csv"
dataset = pd.read_csv(dataset_name)

X_features = dataset.drop(columns='label')
y_label = dataset['label']

print('Feature shape:{0}'.format(X_features.shape))

Feature shape:(10276, 206)


### 3. Dataset Split(Train, Eval, Test)

In [66]:
# Split with a validation set: hold out 40% of the rows, then halve that
# hold-out into validation (eval) and test parts -> 60/20/20 overall.
# The fixed random_state makes both splits reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_label, test_size=0.4, random_state=0)
X_eval, X_test, y_eval, y_test = train_test_split(
    X_test, y_test, test_size=0.5, random_state=0)

# Non-null label counts per split.
train_cnt = y_train.count()
eval_cnt = y_eval.count()
test_cnt = y_test.count()

print('Train Shape:{0}, Eval Shape:{1} Test Shape:{2}'.format(X_train.shape, X_eval.shape, X_test.shape))

# Class distribution of each split (divide by the matching *_cnt to get ratios).
for split_title, split_labels in (('\nTrain', y_train), ('\nEval', y_eval), ('\nTest', y_test)):
    print(split_title)
    print(split_labels.value_counts())

Train Shape:(6165, 206), Eval Shape:(2055, 206) Test Shape:(2056, 206)

Train
1.0    5448
0.0     717
Name: label, dtype: int64

Eval
1.0    1819
0.0     236
Name: label, dtype: int64

Test
1.0    1830
0.0     226
Name: label, dtype: int64


### 4. Train with GridSearchCV

Hyperparameter setting

In [67]:
import time

# Wider search space, currently disabled:
#   max_depth     : 3, 5, 7, 9, 11
#   n_estimators  : 600, 700, 800, 900, 1000
#   learning_rate : 0.01, 0.05, 0.1, 0.15, 0.2
#   gamma         : 0, 0.5, 1, 2
# objective, eval_metric and random_state were single-valued there as well.

# Reduced grid actually searched: only n_estimators varies.
grid_param = dict(
    max_depth=[3],
    n_estimators=[600, 1000],
    learning_rate=[0.01],
    objective=['binary:logistic'],
    eval_metric=['logloss'],
    gamma=[0],
    random_state=[51],
)

# Base estimator handed to the grid search, configured with early_stopping_rounds=50.
xgb_clf = XGBClassifier(early_stopping_rounds=50)

# Validation split in the list-of-(X, y) form expected by the eval_set argument of fit.
evals = [(X_eval, y_eval)]

Train

In [69]:
# perf_counter() is a monotonic, high-resolution clock, so the measured duration
# cannot be distorted by system clock adjustments while the search runs.
start_time = time.perf_counter()

# 5-fold grid search over grid_param; refit=True retrains the best combination
# on the whole of X_train. verbose=0 keeps the search itself silent.
grid_xgb_clf = GridSearchCV(xgb_clf, param_grid=grid_param, cv=5, refit=True, verbose=0)

# eval_set and verbose are extra fit parameters forwarded to the underlying
# XGBClassifier: the validation split for evaluation, and no per-round logging.
grid_xgb_clf.fit(X_train, y_train, eval_set=evals, verbose=False)

print("Total learning time: ", time.perf_counter() - start_time)

Total learning time:  59.33447575569153


### 5. Best parameter and estimator

In [70]:
# Model retrained with the best hyper-parameters (available because the
# search was created with refit=True).
best_estimator = grid_xgb_clf.best_estimator_

# The best hyper-parameter combination itself.
best_params = grid_xgb_clf.best_params_

In [71]:
# Show the hyper-parameter combination selected by the grid search.
print(best_params)

{'eval_metric': 'logloss', 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 51}


In [72]:
# Classification result as hard labels.
w_preds = grid_xgb_clf.predict(X_test)
# Classification result as probabilities: column index 1 of predict_proba
# (presumably the class labelled 1.0 - verify against classes_).
w_pred_proba = grid_xgb_clf.predict_proba(X_test)[:, 1]

print(w_preds, w_pred_proba, sep='\n')

[1 1 1 ... 1 1 1]
[0.99840754 0.9978066  0.99516356 ... 0.9989027  0.9989027  0.99809164]


In [73]:
# Report confusion matrix, accuracy, precision, recall, F1 and AUC on the test split.
get_clf_eval(y_test, w_preds, w_pred_proba)

Confusion Matrix
[[ 179   47]
 [  28 1802]]
Accuracy: 0.9635, Precision: 0.9746, Recall: 0.9847, F1: 0.9796, AUC: 0.9838


### 6. Model Save

In [74]:
# Save the model to a file in JSON format. Author's note: the JSON file is about
# twice as large as the same model saved in text format.
best_estimator.save_model('Binary_grid_xgb.json')

---

## Model Load
저장된 모델 재사용 하는 방법

In [75]:
import pandas as pd
import numpy as np

import xgboost as xgb
from xgboost import plot_importance
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score

import matplotlib.pyplot as plt
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the imported libraries.
warnings.filterwarnings('ignore')



In [76]:
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics of a binary classifier.

    Args:
        y_test: True labels.
        pred: Predicted class labels. Required, even though the signature
            keeps a ``None`` default for backward compatibility.
        pred_proba: Optional scores for the positive class, forwarded to
            ``roc_auc_score``. When omitted, AUC is neither computed nor printed.

    Returns:
        None. The results are written to standard output.

    Raises:
        ValueError: If ``pred`` is ``None``.
    """
    if pred is None:
        # Fail with an explicit message instead of an obscure error from sklearn.
        raise ValueError('pred (predicted labels) is required')

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # AUC needs scores, not hard labels; compute it before printing so that a
    # failure here leaves no partial report behind.
    roc_auc = None if pred_proba is None else roc_auc_score(y_test, pred_proba)

    print('Confusion Matrix')
    print(confusion)
    summary = 'Accuracy: {0:.4f}, Precision: {1:.4f}, Recall: {2:.4f}, F1: {3:.4f}'.format(
        accuracy, precision, recall, f1)
    if roc_auc is not None:
        summary += ', AUC: {0:.4f}'.format(roc_auc)
    print(summary)

### 1. Dataset Load

In [77]:
# Load the feature table: every column except 'label' is a model input,
# and 'label' is the prediction target.
dataset = pd.read_csv("feature_permission.csv")

X_features = dataset.drop(columns='label')
y_label = dataset['label']

print('Feature shape:{0}'.format(X_features.shape))

Feature shape:(10276, 206)


In [78]:
# Split with a validation set: hold out 40% of the rows, then halve that
# hold-out into validation (eval) and test parts -> 60/20/20 overall.
# Same test_size and random_state values as the split in the training section.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_label, test_size=0.4, random_state=0)
X_eval, X_test, y_eval, y_test = train_test_split(
    X_test, y_test, test_size=0.5, random_state=0)

# Non-null label counts per split.
train_cnt = y_train.count()
eval_cnt = y_eval.count()
test_cnt = y_test.count()

print('Train Shape:{0}, Eval Shape:{1} Test Shape:{2}'.format(X_train.shape, X_eval.shape, X_test.shape))

# Class distribution of each split (divide by the matching *_cnt to get ratios).
for split_title, split_labels in (('\nTrain', y_train), ('\nEval', y_eval), ('\nTest', y_test)):
    print(split_title)
    print(split_labels.value_counts())

Train Shape:(6165, 206), Eval Shape:(2055, 206) Test Shape:(2056, 206)

Train
1.0    5448
0.0     717
Name: label, dtype: int64

Eval
1.0    1819
0.0     236
Name: label, dtype: int64

Test
1.0    1830
0.0     226
Name: label, dtype: int64


### 2. Model Load

In [79]:
# Create an empty classifier, then restore the trained model from the JSON file
# written in the "Model Save" step.
load_model = XGBClassifier()
load_model.load_model('Binary_grid_xgb.json')

### 3. Model Test

In [80]:
# Classification result as hard labels, from the reloaded model.
w_preds = load_model.predict(X_test)
# Classification result as probabilities: column index 1 of predict_proba
# (presumably the class labelled 1.0 - verify against classes_).
w_pred_proba = load_model.predict_proba(X_test)[:, 1]

print(w_preds, w_pred_proba, sep='\n')

[1 1 1 ... 1 1 1]
[0.99840754 0.9978066  0.99516356 ... 0.9989027  0.9989027  0.99809164]


In [81]:
# Report the same metrics for the reloaded model on the test split.
get_clf_eval(y_test, w_preds, w_pred_proba)

Confusion Matrix
[[ 179   47]
 [  28 1802]]
Accuracy: 0.9635, Precision: 0.9746, Recall: 0.9847, F1: 0.9796, AUC: 0.9838
