In [1]:
import pickle
from collections import Counter
from itertools import product

import numpy as np
import pandas as pd

In [2]:
import xgboost
from xgboost.sklearn import XGBClassifier

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import *

In [4]:
from imblearn.under_sampling import *
from imblearn.over_sampling import *
from imblearn.combine import *

### 1. Over-sampling

##### - load raw data

In [6]:
# Read the raw spreadsheet; its first row supplies the column names.
df = pd.read_excel('D:/연구/Morphea/all.xlsx', header=0)

# 'cls' is the target column. It is removed from the feature matrix together
# with 'number' (presumably a record identifier -- TODO confirm).
y = df['cls']
X = df.drop(columns=['cls', 'number'])

In [7]:
# Hold out 30% of the rows as the test set; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified, so the per-class counts in each
# part depend on the seed -- consider stratify=y for such imbalanced labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, '\n')

# Counter reports how many samples of each class a split contains.
train_counts = Counter(y_train)
test_counts = Counter(y_test)
print('y_train :', train_counts, '\ny_test :', test_counts)

(126, 39) (55, 39) (126,) (55,) 

y_train : Counter({0: 104, 1: 12, 2: 10}) 
y_test : Counter({0: 46, 2: 5, 1: 4})


In [9]:
# Give the held-out features and labels a fresh 0..n-1 index, discarding the
# original row labels left over from the split.
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

##### - SMOTE-Tomek

In [8]:
# Combined resampler: SMOTE over-sampling followed by Tomek-link cleaning.
# The Tomek step is configured with sampling_strategy='majority'.
smote_step = SMOTE(k_neighbors=6, random_state=42)
tomek_step = TomekLinks(sampling_strategy='majority')
sampler = SMOTETomek(smote=smote_step, tomek=tomek_step, random_state=42)

# Only the training portion is resampled; the test split is left untouched.
X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)

print(X_train_resampled.shape, y_train_resampled.shape)
print(Counter(y_train_resampled))

(302, 39) (302,)
Counter({1: 104, 2: 104, 0: 94})


### 2. Training

##### - CV & hyperparameter tuning

In [10]:
# Number of cross-validation folds.
k = 3

# Collects one history dict per fold, keyed 'fold1', 'fold2', ...
foldperf = {}

# Shuffled, seeded stratified splitter used by the grid search.
splits = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

In [12]:
# Hyperparameter grid; every combination of these values is tried in each fold.
params = dict(
    n_estimators=[50, 70, 100, 200, 300, 400, 500],
    learning_rate=[0.1, 0.05, 0.01, 0.005, 0.001, 0.0005],
    colsample_bytree=[0.3, 0.5, 0.7, 1.0],
    gamma=[0, 0.5, 1, 1.5],
    subsample=[0.3, 0.5, 0.7, 1.0],
    reg_lambda=[0, 1, 3, 5],
)

In [13]:
def one_hot(y_data, classes=None):
    """One-hot encode a one-dimensional collection of class labels.

    Parameters
    ----------
    y_data : iterable of labels (e.g. a pandas Series or a list)
        The labels to encode, one per sample.
    classes : iterable of labels, optional
        The complete, ordered label set that defines the output columns.
        When omitted, the sorted unique labels found in ``y_data`` are used,
        so the column count then depends on which labels happen to be present.
        Pass the full label set when a subset (a CV fold, a small test set)
        may lack a class but the matrix must still line up with a
        classifier's ``predict_proba`` columns.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_samples, n_classes)`` with a single 1.0 per row.

    Raises
    ------
    ValueError
        If ``classes`` is given and ``y_data`` contains a label outside it.
    """
    labels = np.asarray(list(y_data))

    if classes is None:
        # np.unique returns the distinct labels in sorted order.
        columns = np.unique(labels)
    else:
        columns = np.asarray(list(classes))
        unknown = np.setdiff1d(labels, columns)
        if unknown.size:
            raise ValueError(
                'labels not present in classes: {}'.format(unknown.tolist())
            )

    # Broadcast (n, 1) against (1, c): True exactly where a sample's label
    # matches the column's class.
    return (labels.reshape(-1, 1) == columns.reshape(1, -1)).astype(float)

In [14]:
# Exhaustive grid search, run separately inside each stratified fold.
# NOTE(review): the folds are drawn from the already-resampled training set,
# so a validation row may be a synthetic sample generated from rows that sit
# in the training part of the same fold -- validation scores are likely
# optimistic. Resampling inside each fold would avoid this.
for fold, (train_idx, val_idx) in enumerate(splits.split(X_train_resampled, y_train_resampled)):
    fold_no = fold + 1
    print('*** Fold {} ***'.format(fold_no))

    # The index arrays are stored once per fold (at position 0); the metric
    # lists receive one entry per parameter combination.
    history = {'train_idx': [train_idx], 'val_idx': [val_idx],
               'val_acc': [], 'val_auc': [], 'para': []}

    # split() yields positional indices, so rows are selected by position.
    train_X = X_train_resampled.iloc[train_idx].reset_index(drop=True)
    train_Y = y_train_resampled.iloc[train_idx].reset_index(drop=True)

    valid_X = X_train_resampled.iloc[val_idx].reset_index(drop=True)
    valid_Y = y_train_resampled.iloc[val_idx].reset_index(drop=True)

    # Indicator-matrix form of the validation labels, used for the AUC.
    one_hot_valid_Y = one_hot(valid_Y)

    # product() iterates with the last factor varying fastest, i.e. in the
    # same order as six nested loops written in this factor order.
    grid = product(params['n_estimators'], params['learning_rate'],
                   params['colsample_bytree'], params['gamma'],
                   params['subsample'], params['reg_lambda'])

    for est, lr, col, gam, sub, lam in grid:
        clf = XGBClassifier(learning_rate=lr, n_estimators=est, colsample_bytree=col, gamma=gam,
                            subsample=sub, reg_lambda=lam, random_state=42)

        clf.fit(train_X, train_Y)
        pred = clf.predict(valid_X)
        prob = clf.predict_proba(valid_X)

        acc = accuracy_score(valid_Y, pred)
        auc = roc_auc_score(one_hot_valid_Y, prob)
        # Parameters are recorded as a comma-separated string, in grid order.
        para = ', '.join(str(value) for value in (est, lr, col, gam, sub, lam))

        history['val_acc'].append(acc)
        history['val_auc'].append(auc)
        history['para'].append(para)

    foldperf['fold{}'.format(fold_no)] = history

*** Fold 1 ***
*** Fold 2 ***
*** Fold 3 ***


##### - Save (and reload) the per-fold history

In [14]:
# Persist the per-fold search history so the grid search does not have to be re-run.
with open("XGB_over_foldperf.pkl", "wb") as out_file:
    pickle.dump(foldperf, out_file)

In [15]:
# Optional: restore a previously saved search history instead of re-running
# the grid search. Only unpickle files you created yourself -- pickle.load
# can execute arbitrary code from the file.
# with open("./XGB_over_foldperf.pkl", "rb") as f:
#     foldperf = pickle.load(f)

In [20]:
# Show the layout of foldperf: the fold names and the fields kept for one fold.
fold_names = foldperf.keys()
fold_fields = foldperf['fold1'].keys()
print('* foldperf 구조\n', fold_names, '\n', fold_fields)

* foldperf 구조
 dict_keys(['fold1', 'fold2', 'fold3']) 
 dict_keys(['train_idx', 'val_idx', 'val_acc', 'val_auc', 'para'])


##### - best parameters

In [23]:
# Best validation accuracy and best validation AUC reached in each fold
# (each maximum is taken independently over all parameter combinations).
fold_keys = ['fold' + str(i + 1) for i in range(k)]

best_accs = [max(foldperf[key]['val_acc']) for key in fold_keys]
best_aucs = [max(foldperf[key]['val_auc']) for key in fold_keys]

print('Fold 1-3 best acc :', best_accs, '\nFold 1-3 best auc :', best_aucs)

Fold 1-3 best acc : [1.0, 1.0, 0.98] 
Fold 1-3 best auc : [1.0, 1.0, 0.9990929313790464]


In [28]:
# Folds 1 and 2 reach the highest accuracy.
# For fold 1, list the AUC and parameters of every configuration whose
# accuracy equals that fold's best accuracy.

fold_history = foldperf['fold1']
best_acc = best_accs[0]

for i, fold_acc in enumerate(fold_history['val_acc']):
    if fold_acc != best_acc:
        continue
    best_idx = i
    auc = fold_history['val_auc'][best_idx]
    para = fold_history['para'][best_idx]
    print('best_idx:',best_idx, '\nacc:',best_acc, '\nauc:',auc, '\n',para, '\n')

best_idx: 0 
acc: 1.0 
auc: 1.0 
 50, 0.1, 0.3, 0, 0.3, 0 

best_idx: 4 
acc: 1.0 
auc: 1.0 
 50, 0.1, 0.3, 0, 0.5, 0 

best_idx: 8 
acc: 1.0 
auc: 1.0 
 50, 0.1, 0.3, 0, 0.7, 0 

best_idx: 9 
acc: 1.0 
auc: 1.0 
 50, 0.1, 0.3, 0, 0.7, 1 

best_idx: 12 
acc: 1.0 
auc: 1.0 
 50, 0.1, 0.3, 0, 1.0, 0 

best_idx: 13 
acc: 1.0 
auc: 1.0 
 50, 0.1, 0.3, 0, 1.0, 1 

best_idx: 14 
acc: 1.0 
auc: 1.0 
 50, 0.1, 0.3, 0, 1.0, 3 

best_idx: 16 
acc: 1.0 
auc: 1.0 
 50, 0.1, 0.3, 0.5, 0.3, 0 

best_idx: 20 
acc: 1.0 
auc: 1.0 
 50, 0.1, 0.3, 0.5, 0.5, 0 

best_idx: 21 
acc: 1.0 
auc: 1.0 
 50, 0.1, 0.3, 0.5, 0.5, 1 

best_idx: 30 
acc: 1.0 
auc: 1.0 
 50, 0.1, 0.3, 0.5, 1.0, 3 

best_idx: 36 
acc: 1.0 
auc: 1.0 
 50, 0.1, 0.3, 1, 0.5, 0 

best_idx: 61 
acc: 1.0 
auc: 1.0 
 50, 0.1, 0.3, 1.5, 1.0, 1 

best_idx: 69 
acc: 1.0 
auc: 1.0 
 50, 0.1, 0.5, 0, 0.5, 1 

best_idx: 70 
acc: 1.0 
auc: 1.0 
 50, 0.1, 0.5, 0, 0.5, 3 

best_idx: 72 
acc: 1.0 
auc: 1.0 
 50, 0.1, 0.5, 0, 0.7, 0 

best_idx: 73 
acc:

### 3. EVAL using the best para

In [36]:
# fold1에서 best_idx=0일 때, parameters를 사용하여 evaluation

fold = 'fold1'
best_idx = 0

# Scores and parameter string of the chosen configuration (position best_idx
# in the per-combination lists).
acc = foldperf[fold]['val_acc'][best_idx]
auc = foldperf[fold]['val_auc'][best_idx]
para = foldperf[fold]['para'][best_idx]
# est, lr, col, gam, sub, lam = para    # possible if para were stored as a list above

# Each fold stores its train/validation index arrays exactly once, so they
# live at position 0 whatever best_idx is; indexing them with best_idx would
# raise IndexError for any best_idx other than 0.
fold_t_idx = foldperf[fold]['train_idx'][0]
fold_v_idx = foldperf[fold]['val_idx'][0]

print(acc, '\n', auc, '\n', para)

1.0 
 1.0 
 50, 0.1, 0.3, 0, 0.3, 0


In [37]:
# Rebuild the chosen fold's train/validation partitions from the resampled
# training set. The stored arrays are positional indices produced by the CV
# splitter, so rows are selected by position (.iloc) rather than by label.
r_train_X = X_train_resampled.iloc[fold_t_idx].reset_index(drop=True)
r_train_Y = y_train_resampled.iloc[fold_t_idx].reset_index(drop=True)

r_valid_X = X_train_resampled.iloc[fold_v_idx].reset_index(drop=True)
r_valid_Y = y_train_resampled.iloc[fold_v_idx].reset_index(drop=True)

In [38]:
# Hyperparameters of the selected configuration, written out explicitly.
best_params = dict(
    n_estimators=50,
    learning_rate=0.1,
    colsample_bytree=0.3,
    gamma=0,
    subsample=0.3,
    reg_lambda=0,
)
xgb_clf = XGBClassifier(random_state=42, **best_params)

# Fit on the training part of the chosen fold.
xgb_clf.fit(r_train_X, r_train_Y)

In [39]:
# Score the fitted model on the held-out test split.
prob = xgb_clf.predict_proba(X_test)
pred = xgb_clf.predict(X_test)

# Indicator-matrix form of the test labels, needed for the AUC computation.
one_hot_y_test = one_hot(y_test)

acc = accuracy_score(y_test, pred)
auc = roc_auc_score(one_hot_y_test, prob)
print(acc, auc)

0.9272727272727272 0.9560718007009568


In [81]:
# Micro-averaged AUC on the test split. The one-hot test labels are used as
# the ground truth; the name `y_true` is not defined at notebook level, so
# referring to it fails on a freshly restarted kernel.
micro_avg_auc = roc_auc_score(one_hot_y_test, prob, average='micro')
micro_avg_auc

0.9895867768595041

In [78]:
# Confusion matrix of test labels vs. predictions
# (scikit-learn convention: rows are true classes, columns are predicted classes).
cm = confusion_matrix(y_test, pred)
cm

array([[45,  1,  0],
       [ 2,  1,  1],
       [ 0,  0,  5]], dtype=int64)

In [79]:
# Per-class precision / recall / F1 on the test split, printed with 5 decimals.
print(classification_report(y_test, pred, target_names=['0','1','2'], digits=5))

              precision    recall  f1-score   support

           0    0.95745   0.97826   0.96774        46
           1    0.50000   0.25000   0.33333         4
           2    0.83333   1.00000   0.90909         5

    accuracy                        0.92727        55
   macro avg    0.76359   0.74275   0.73672        55
weighted avg    0.91289   0.92727   0.91627        55

