### Binary Classfication과 다른 점
1. 1.Metrics의(get_clf_eval)의 함수
2. 모델 저장 이름
3. 다른 점은 거의 없다.

In [9]:
import xgboost as xgb
from xgboost import plot_importance
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')


### 1. Metrics(Accuracy, Precision, Recall, F1_score, AUC)

In [10]:
def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred, average = 'micro')
    recall = recall_score(y_test, pred, average = 'micro')
    f1 = f1_score(y_test, pred ,average = 'micro')
    
    print('Confusion Matrix')
    print(confusion)
    print('Accuracy: {0:.4f}, Precision: {1:.4f}, Recall: {2:.4f}, F1: {3:.4f}'
          .format(accuracy, precision, recall, f1))

### 2. Dataset Load## Dataset Load

In [11]:
import pandas as pd
dataset = pd.read_csv("cic_feature_permission.csv")
X_features = dataset.loc[:,dataset.columns != 'label']

y_label = dataset['label']

dataset.loc[:,dataset.columns != 'label']
print('Feature shape:{0}'.format(X_features.shape))

Feature shape:(9000, 111)


### 3. Dataset Split(Train, Eval, Test)

In [12]:
# with validation date
X_train, X_test, y_train, y_test = train_test_split(X_features, y_label,
                                                    test_size=0.4, random_state=0)
X_eval, X_test, y_eval, y_test = train_test_split(X_test, y_test, 
                                                  test_size=0.5, random_state=0)

train_cnt = y_train.count() # train 데이터 수
eval_cnt = y_eval.count() # validation 데이터 수
test_cnt = y_test.count() # test 데이터 수


print('Train Shape:{0}, Eval Shape:{1} Test Shape:{2}'.format(X_train.shape, X_eval.shape, X_test.shape))

print('\nTrain')
# print(y_train.value_counts()/train_cnt)
print(y_train.value_counts())

print('\nEval')
# print(y_eval.value_counts()/eval_cnt)
print(y_eval.value_counts())

print('\nTest')
# print(y_test.value_counts()/test_cnt)
print(y_test.value_counts())

Train Shape:(5400, 111), Eval Shape:(1800, 111) Test Shape:(1800, 111)

Train
2.0    2891
1.0    1514
0.0     995
Name: label, dtype: int64

Eval
2.0    981
1.0    493
0.0    326
Name: label, dtype: int64

Test
2.0    923
1.0    529
0.0    348
Name: label, dtype: int64


### 4. Train with GridSearchCV## Train without GridSearchCV

HyperParamter setting

In [13]:
grid_param ={'max_depth' : [3, 5, 7, 9],
             'n_estimators': [800, 900, 1000],
             'learning_rate': [0.01, 0.1, 0.2],
             'objective':['multi:softproba'],
             'eval_metric': ['mlogloss'],
             'gamma': [0, 0.5, 1],
             'random_state':[51],
            }

xgb_clf = XGBClassifier(early_stopping_rounds=50)
evals = [(X_eval, y_eval)]

Train

In [15]:
import time
start_time = time.time()

#verbose: 학습 과정 출력
grid_xgb_clf = GridSearchCV(xgb_clf, param_grid=grid_param, cv=5, refit=True, verbose=100)
grid_xgb_clf.fit(X_train, y_train, eval_set=evals, verbose=True)

print("Total learning time: ", time.time() - start_time)

Total learning time:  45.75236797332764


### 5. Best parameter and estimator

In [16]:
# help(grid_xgb_clf)
# 최적의 하이퍼 파라미터 출력
best_params=grid_xgb_clf.best_params_

# 최적의 하이퍼 파라미터로 학습된 모델
best_estimator = grid_xgb_clf.best_estimator_

In [17]:
print(best_params)

{'eval_metric': 'mlogloss', 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500, 'objective': 'multi:softproba', 'random_state': 51}


In [18]:
w_preds = grid_xgb_clf.predict(X_test) # 분류 결과(label)
print(w_preds)

w_pred_proba = grid_xgb_clf.predict_proba(X_test)[:,1] # 분류 결과(확률)
print(w_pred_proba)


[1 2 2 ... 2 1 1]
[0.992081   0.00349206 0.00618605 ... 0.01087432 0.9371764  0.9754439 ]


### 6. Model Save

In [19]:
# model 파일로 저장, 저장형식은 json, 하지만 json은 txt로 저장했을 때 보다 용량이 2배크다...
best_estimator.save_model('Multi_grid_xgb.json')

In [20]:
get_clf_eval(y_test, w_preds, w_pred_proba)

Confusion Matrix
[[338  10   0]
 [ 21 497  11]
 [  2   5 916]]
Accuracy: 0.9728, Precision: 0.9728, Recall: 0.9728, F1: 0.9728


---

## Model Load
저장된 모델 재사용 하는 방법

In [21]:
import pandas as pd
import numpy as np

import xgboost as xgb
from xgboost import plot_importance
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score

import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')



In [22]:
def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred, average = 'micro')
    recall = recall_score(y_test, pred, average = 'micro')
    f1 = f1_score(y_test, pred ,average = 'micro')
    
    print('Confusion Matrix')
    print(confusion)
    print('Accuracy: {0:.4f}, Precision: {1:.4f}, Recall: {2:.4f}, F1: {3:.4f}'
          .format(accuracy, precision, recall, f1))

### 1. Dataset Load

In [23]:
import pandas as pd
dataset = pd.read_csv("cic_feature_permission.csv")
X_features = dataset.loc[:,dataset.columns != 'label']

y_label = dataset['label']

dataset.loc[:,dataset.columns != 'label']
print('Feature shape:{0}'.format(X_features.shape))

Feature shape:(9000, 111)


In [24]:
# with validation date
X_train, X_test, y_train, y_test = train_test_split(X_features, y_label,
                                                    test_size=0.4, random_state=0)
X_eval, X_test, y_eval, y_test = train_test_split(X_test, y_test, 
                                                  test_size=0.5, random_state=0)

train_cnt = y_train.count() # train 데이터 수
eval_cnt = y_eval.count() # validation 데이터 수
test_cnt = y_test.count() # test 데이터 수


print('Train Shape:{0}, Eval Shape:{1} Test Shape:{2}'.format(X_train.shape, X_eval.shape, X_test.shape))

print('\nTrain')
# print(y_train.value_counts()/train_cnt)
print(y_train.value_counts())

print('\nEval')
# print(y_eval.value_counts()/eval_cnt)
print(y_eval.value_counts())

print('\nTest')
# print(y_test.value_counts()/test_cnt)
print(y_test.value_counts())

Train Shape:(5400, 111), Eval Shape:(1800, 111) Test Shape:(1800, 111)

Train
2.0    2891
1.0    1514
0.0     995
Name: label, dtype: int64

Eval
2.0    981
1.0    493
0.0    326
Name: label, dtype: int64

Test
2.0    923
1.0    529
0.0    348
Name: label, dtype: int64


### 2. Model Load

In [25]:
load_model = XGBClassifier()
load_model.load_model('Multi_grid_xgb.json')

### 3. Model Test

In [26]:
w_preds = load_model.predict(X_test) # 분류 결과(label)
print(w_preds)

w_pred_proba = load_model.predict_proba(X_test)[:,1] # 분류 결과(확률)
print(w_pred_proba)

[1 2 2 ... 2 1 1]
[0.992081   0.00349206 0.00618605 ... 0.01087432 0.9371764  0.9754439 ]


In [27]:
get_clf_eval(y_test, w_preds, w_pred_proba)

Confusion Matrix
[[338  10   0]
 [ 21 497  11]
 [  2   5 916]]
Accuracy: 0.9728, Precision: 0.9728, Recall: 0.9728, F1: 0.9728
