In [25]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold, train_test_split
import xgboost
from sklearn.impute import SimpleImputer
from imblearn.under_sampling import *
from imblearn.over_sampling import *
from imblearn.combine import *
from sklearn.metrics import classification_report, explained_variance_score
from sklearn.metrics import plot_confusion_matrix
from xgboost import XGBClassifier
from datetime import datetime, date

In [5]:
dataset = pd.read_csv('./data/pre_data.csv')
dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,338,339,340,341,342,343,344,345,346,0.1
0,0.214297,0.236965,4,2,3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4
1,2.449873,2.551062,6,0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4
2,0.248730,0.272536,4,2,3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4
3,-0.093402,-0.081406,0,1,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
4,0.298192,0.324030,3,1,1,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37209,-0.289216,-0.283953,2,2,2,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12
37210,-0.301264,-0.296447,1,2,1,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12
37211,-0.298253,-0.293311,0,2,1,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12
37212,-0.301264,-0.296447,0,2,1,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12


In [22]:
X = dataset.iloc[:,:-1]
y = dataset.iloc[:,-1:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 0)

In [26]:
xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='multi:softmax',
                    silent=True, nthread=1)

def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
        
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [27]:
folds = 3
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='f1_micro', n_jobs=4, cv=skf.split(X_train,y_train), verbose=3, random_state=1001, return_train_score=bool )

start_time = timer(None)
random_search.fit(X_train,y_train)
timer(start_time)

Fitting 3 folds for each of 5 candidates, totalling 15 fits

 Time taken: 2 hours 13 minutes and 58.39 seconds.


In [28]:
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)


 All results:
{'mean_fit_time': array([1285.3579487 , 1829.08356524, 1477.58274937, 1000.55258155,
       1437.14624103]), 'std_fit_time': array([  9.8153772 , 154.16060519,  31.12361144,   4.1713108 ,
        14.33242706]), 'mean_score_time': array([6.56061037, 7.13333702, 7.17541337, 5.69769979, 5.13933317]), 'std_score_time': array([0.13288017, 0.64135533, 0.34474525, 0.34586884, 0.39874882]), 'param_subsample': masked_array(data=[1.0, 0.6, 0.8, 1.0, 0.8],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_min_child_weight': masked_array(data=[5, 1, 5, 5, 1],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_max_depth': masked_array(data=[3, 5, 5, 5, 4],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_gamma': masked_array(data=[5, 1.5, 1, 5, 1],
             mask=[False, False, False, False, False]

In [29]:
xgb = XGBClassifier(colsample_bytree=0.8, gamma=1.5, learning_rate=0.02, max_depth=5,
              n_estimators=600, nthread=1, objective='multi:softprob',
              silent=True, subsample=0.6)

In [30]:
# basic

pred = xgb.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.59      0.80      0.68      2957
           1       0.77      0.54      0.63      2351
           2       0.78      0.17      0.27        42
           3       0.57      0.66      0.61        32
           4       0.74      0.45      0.56       340
           5       0.70      0.60      0.65       500
           6       0.88      0.28      0.42       104
           7       0.94      0.84      0.89       100
           8       0.59      0.48      0.53       947
           9       0.77      0.87      0.81      1235
          10       0.56      0.63      0.59       314
          11       0.60      0.36      0.45       211
          12       0.96      0.87      0.91       171

    accuracy                           0.66      9304
   macro avg       0.73      0.58      0.62      9304
weighted avg       0.68      0.66      0.66      9304



In [33]:
#under sample
X_under, y_under = RandomUnderSampler(random_state=0).fit_resample(X_train, y_train)
y_under.value_counts()

0.1
12     100
11     100
10     100
9      100
8      100
7      100
6      100
5      100
4      100
3      100
2      100
1      100
0      100
dtype: int64

In [38]:
pred_unsam = xgb.fit(X_under, y_under).predict(X_test)
print(classification_report(y_test, pred_unsam))

              precision    recall  f1-score   support

           0       0.62      0.28      0.39      2957
           1       0.63      0.42      0.50      2351
           2       0.06      0.64      0.10        42
           3       0.13      0.84      0.22        32
           4       0.27      0.55      0.37       340
           5       0.46      0.59      0.51       500
           6       0.11      0.47      0.18       104
           7       0.48      0.86      0.61       100
           8       0.37      0.46      0.41       947
           9       0.79      0.55      0.65      1235
          10       0.34      0.70      0.46       314
          11       0.19      0.74      0.31       211
          12       0.55      0.84      0.67       171

    accuracy                           0.44      9304
   macro avg       0.38      0.61      0.41      9304
weighted avg       0.57      0.44      0.46      9304



In [43]:
# over sample
X_samp_smote, y_samp_smote = SMOTE(random_state=4).fit_resample(X_train, y_train)
y_samp_smote.value_counts()

0.1
12     8807
11     8807
10     8807
9      8807
8      8807
7      8807
6      8807
5      8807
4      8807
3      8807
2      8807
1      8807
0      8807
dtype: int64

In [44]:
pred_over = xgb.fit(X_samp_smote, y_samp_smote).predict(X_test)
print(classification_report(y_test, pred_over))

              precision    recall  f1-score   support

           0       0.70      0.50      0.59      2957
           1       0.72      0.55      0.62      2351
           2       0.08      0.60      0.14        42
           3       0.31      0.88      0.46        32
           4       0.52      0.59      0.56       340
           5       0.56      0.69      0.62       500
           6       0.24      0.58      0.34       104
           7       0.75      0.92      0.83       100
           8       0.50      0.60      0.54       947
           9       0.82      0.77      0.79      1235
          10       0.39      0.82      0.53       314
          11       0.32      0.75      0.45       211
          12       0.90      0.88      0.89       171

    accuracy                           0.60      9304
   macro avg       0.53      0.70      0.57      9304
weighted avg       0.66      0.60      0.62      9304

