### Import modules, functions and data

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import validation_curve
from pprint import pprint
import matplotlib.pyplot as plt
from hyperopt.pyll.stochastic import sample
from hyperopt import fmin, tpe, hp, space_eval, STATUS_OK, Trials

from sklearn.ensemble import RandomForestClassifier # random forest classifier

In [2]:
def F1(y_pred, dtrain):
    """Custom evaluation metric: harmonic mean of all per-class precisions and recalls.

    Parameters
    ----------
    y_pred : array-like
        Predicted class labels (one per row of ``dtrain``).
    dtrain : object exposing ``get_label()``
        Data container holding the true labels (e.g. an ``xgb.DMatrix``).

    Returns
    -------
    tuple
        ``('f1', value)`` -- the metric name and its value.
    """
    labels = dtrain.get_label()

    # average=None yields one score per class that occurs in labels or y_pred.
    pre = precision_score(y_true = labels, y_pred = y_pred, average=None)
    rec = recall_score(y_true = labels, y_pred = y_pred, average=None)

    # Harmonic mean over every precision and recall term. The numerator is the
    # number of terms (8 for the usual 4 classes) rather than a hard-coded 8,
    # so the value stays a true mean when a class is absent from a fold.
    n_terms = len(pre) + len(rec)
    f1_score = n_terms/(sum(1/pre) + sum(1/rec))

    return 'f1', f1_score

In [3]:
#### load class
# hasher = pd.read_csv('test_id.csv')

# Label name -> integer class id, plus the reverse lookup for decoding predictions.
label_map = {'retained':0,'2month':1,'month':2,'week':3}
inv_map = {class_id: name for name, class_id in label_map.items()}

# Read the training labels and encode them as integer class ids.
train_label = pd.read_csv('temp_data/train_label_lite.csv')
y_train = pd.Series([label_map[name] for name in train_label.label])

In [4]:
# Load the training feature matrix.
# NOTE(review): rows are presumably aligned with train_label / y_train -- confirm.
X_train = pd.read_csv('X_train_final.csv')

---

In [None]:
{'colsample_bytree': 0.9, 'gamma': 0.9500000000000001, 'learning_rate': 0.1, 'max_depth': 14, 'min_child_weight': 8, 'n_estimators': 1000, 'n_gpus': -1, 'num_class': 4, 'objective': 'multi:softmax', 'reg_alpha': 0.097, 'reg_lambda': 0.069, 'seed': 7, 'silent': 0, 'subsample': 0.9500000000000001, 'tree_method': 'gpu_hist'}
===============================================
Find the n_estimators
Optimal n_estimators : 227
5-fold of Xgboost F1: 0.72345 +/- 0.00165

In [7]:
#### xgb
grid_result = []

#### XGB parameters
param = {
    ## General Parameters
    'n_gpus': -1,
    'tree_method': 'gpu_hist',
    'silent': 0,

    ## Booster Parameters
    'n_estimators': 224,  # hand-tuned value; adjust here
    'learning_rate': 0.1,
    'min_child_weight': 10,
    'max_depth': 20,
    'gamma': 0.1,
    'reg_alpha': 0.01,
    'reg_lambda': 0.05,
    'subsample': 0.85,
    'colsample_bytree': 0.72,
    'scale_pos_weight': 1,

    ## Learning task parameters
    'num_class': 4,
    'objective': 'multi:softmax',
    'seed': 7,
}

## update? (disabled experiment: refresh an existing model instead of training a new one)
#param['process_type'] = 'update'
#param['updater'] = 'refresh'
#param['refresh_leaf'] = True

# Build the classifier from the parameter dictionary above.
model = xgb.XGBClassifier(**param)

In [None]:
#### cross validation(for NA)
# 5-fold stratified CV of `model`; collects a per-fold F1 and the out-of-fold
# predictions as (row position, label name) pairs in `predict_set`.
kfold = StratifiedKFold(n_splits = 5 ,random_state = 7,shuffle=True).split(X_train, y_train)
scores = []
predict_set = []
for k, (train, test) in enumerate(kfold):
    # No eval_set is passed to fit(), so a custom eval_metric would never be
    # evaluated during training; it is therefore omitted here.
    model.fit(X_train.iloc[train,:], y_train.iloc[train])

    ### predict (once per fold; reused for both the score and predict_set)
    y_pred = model.predict(X_train.iloc[test,:])

    # model.score() returns mean accuracy, not F1. Compute the same metric as
    # the F1 evaluation function: harmonic mean of per-class precision/recall.
    pre = precision_score(y_true = y_train.iloc[test], y_pred = y_pred, average=None)
    rec = recall_score(y_true = y_train.iloc[test], y_pred = y_pred, average=None)
    score = (len(pre) + len(rec))/(sum(1/pre) + sum(1/rec))
    scores.append(score)
    print('Fold: %s, Class dist.: %s, F1: %.3f' % (k+1,np.bincount(y_train.iloc[train]), score))

    predict_set += [(row, inv_map[label]) for row, label in zip(test, y_pred)]
print('\nCV F1: %.6f +/- %.6f' % (np.mean(scores), np.std(scores)))

In [None]:
0.7417,,,,

### Hyperopt tuning of XGBoost

In [5]:
# Progress trackers for the hyperopt objective: number of calls so far and the
# best mean F1 (with its std) seen so far.
obj_call_count = cur_best_score = cur_best_std = 0

In [6]:
# Hyperopt Trials container; passed to fmin via `trials=` in the search cell.
trials = Trials()

In [7]:
# Hyperopt search space for the XGBoost objective.
# Plain values are fixed; hp.* entries are searched.
# hp.quniform(label, low, high, q) draws uniformly and rounds to a multiple of q.
param_space = {
    # Upper bound on boosting rounds (used as num_boost_round in xgb.cv).
    'n_estimators': 1000,
    'learning_rate': 0.1,
    'min_child_weight': hp.quniform('min_child_weight',1,4,0.05),
    # Label fixed from the typo 'max_deph' so the key reported by fmin matches
    # the parameter name. Candidate depths are 7..17.
    'max_depth':hp.choice('max_depth',range(7,18)),
    
    'gamma': hp.quniform('gamma',0.0001,1,0.05),
    'reg_alpha': hp.quniform('reg_alpha',0.0001,0.1,0.001),
    'reg_lambda': hp.quniform('reg_lambda',0.0001,0.1,0.001),
    'subsample': hp.quniform('subsample',0.6,1,0.05),
    'colsample_bytree': hp.quniform('colsample_bytree',0.6,1,0.05),
    
    # Learning task parameters
    'num_class':4,
    'objective': 'multi:softmax',
    'seed': 7,
    
    # General parameters
    'n_gpus' : -1,
    'tree_method' : 'gpu_hist',
    'silent' : 0
    }

In [8]:
def xgb_classifier(params): # hyperopt calls the objective function with a params argument
    """Hyperopt objective: 5-fold cross-validate XGBoost and return 1 - best mean F1.

    Parameters
    ----------
    params : dict
        Parameter set for this evaluation. Must contain 'max_depth',
        'min_child_weight' and 'n_estimators'; the whole dict is forwarded
        to ``xgb.cv``.

    Returns
    -------
    dict
        ``loss`` (1 - best mean test F1), ``loss_variance`` (squared std of
        the test F1 at that round), ``status`` and an ``attachments`` entry
        holding the full ``xgb.cv`` result frame.

    Side effects
    ------------
    Increments ``obj_call_count``, updates ``cur_best_score`` /
    ``cur_best_std`` when the score improves, and prints progress.
    """
    # Only the progress trackers are rebound here; X_train, y_train and F1 are
    # read as ordinary module-level names.
    global obj_call_count, cur_best_score, cur_best_std
    
    obj_call_count += 1
    print('\nXgboost objective call #{} cur_best_score={:7.5f} cur_best_std={:7.5f}'.format(obj_call_count,cur_best_score,cur_best_std) )
    
    #### sampling parameters from the hyperparameter params
    # NOTE(review): fmin presumably already passes concrete values, which would
    # make this effectively a copy of `params` -- confirm before removing.
    xgb_params = sample(params)
    
    # For deep trees, tie min_child_weight to the depth instead of the sampled value.
    if xgb_params['max_depth'] >= 10:
        xgb_params['min_child_weight'] = xgb_params['max_depth'] - 6
    
    print(xgb_params)
    
    #### step 1 : tuning n_estimators with cross validation
    print("===============================================")
    print("Find the n_estimators")
    xgtrain = xgb.DMatrix(X_train.values, label= y_train.values.reshape(-1,1))
    # NOTE(review): which metric drives early stopping (a built-in one or the
    # custom f1) may depend on the installed xgboost version -- confirm.
    cvresult = xgb.cv(xgb_params, xgtrain, num_boost_round = xgb_params['n_estimators'], nfold = 5, feval=F1, 
                      early_stopping_rounds = 50 ,stratified=True, shuffle=True)
    print("Optimal n_estimators : %d"%(cvresult.shape[0]))
    
    # Row with the highest mean test F1 (first one on ties); read mean and std
    # from that same row instead of matching floats by equality.
    best_round = cvresult['test-f1-mean'].idxmax()
    f1_mean = cvresult.loc[best_round, 'test-f1-mean']
    f1_std = cvresult.loc[best_round, 'test-f1-std']
    
    print('5-fold of Xgboost F1: %.5f +/- %.5f' % (f1_mean,f1_std))
    
    if f1_mean > cur_best_score:
        cur_best_score = f1_mean
        cur_best_std = f1_std
        
    #### minimize metric
    loss = 1 - f1_mean
    # hyperopt expects a variance here, so square the standard deviation.
    loss_var = f1_std ** 2
    
    return {'loss': loss , 'loss_variance': loss_var ,'status':STATUS_OK ,'attachments':{'cvresult':cvresult}}

In [None]:
# Run the TPE search over param_space, recording every evaluation in `trials`.
best = fmin(xgb_classifier, param_space, algo = tpe.suggest, max_evals=100,trials=trials)
print ('best:')
print (best)
# fmin reports hp.choice parameters as the index of the chosen option, not the
# option itself; space_eval maps the result back onto the search space.
# The objective may still override min_child_weight for deep trees, so that
# entry is the sampled value, not necessarily the one used.
print ('best (resolved values):')
print (space_eval(param_space, best))


Xgboost objective call #1 cur_best_score=0.00000 cur_best_std=0.00000
{'colsample_bytree': 0.6000000000000001, 'gamma': 0.30000000000000004, 'learning_rate': 0.1, 'max_depth': 8, 'min_child_weight': 3.6500000000000004, 'n_estimators': 1000, 'n_gpus': -1, 'num_class': 4, 'objective': 'multi:softmax', 'reg_alpha': 0.029, 'reg_lambda': 0.027, 'seed': 7, 'silent': 0, 'subsample': 0.9, 'tree_method': 'gpu_hist'}
Find the n_estimators
Optimal n_estimators : 615
5-fold of Xgboost F1: 0.72199 +/- 0.00167

Xgboost objective call #2 cur_best_score=0.72199 cur_best_std=0.00167
{'colsample_bytree': 0.9, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 12, 'min_child_weight': 6, 'n_estimators': 1000, 'n_gpus': -1, 'num_class': 4, 'objective': 'multi:softmax', 'reg_alpha': 0.081, 'reg_lambda': 0.011, 'seed': 7, 'silent': 0, 'subsample': 0.9, 'tree_method': 'gpu_hist'}
Find the n_estimators
Optimal n_estimators : 184
5-fold of Xgboost F1: 0.72166 +/- 0.00331

Xgboost objective call #3 cur_best_score

Optimal n_estimators : 427
5-fold of Xgboost F1: 0.72028 +/- 0.00248

Xgboost objective call #17 cur_best_score=0.72298 cur_best_std=0.00180
{'colsample_bytree': 0.8, 'gamma': 0.55, 'learning_rate': 0.1, 'max_depth': 14, 'min_child_weight': 8, 'n_estimators': 1000, 'n_gpus': -1, 'num_class': 4, 'objective': 'multi:softmax', 'reg_alpha': 0.023, 'reg_lambda': 0.044, 'seed': 7, 'silent': 0, 'subsample': 0.8, 'tree_method': 'gpu_hist'}
Find the n_estimators
Optimal n_estimators : 179
5-fold of Xgboost F1: 0.72173 +/- 0.00301

Xgboost objective call #18 cur_best_score=0.72298 cur_best_std=0.00180
{'colsample_bytree': 0.8500000000000001, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 12, 'min_child_weight': 6, 'n_estimators': 1000, 'n_gpus': -1, 'num_class': 4, 'objective': 'multi:softmax', 'reg_alpha': 0.014, 'reg_lambda': 0.07200000000000001, 'seed': 7, 'silent': 0, 'subsample': 0.75, 'tree_method': 'gpu_hist'}
Find the n_estimators
Optimal n_estimators : 205
5-fold of Xgboost F1: 0.7212

Optimal n_estimators : 182
5-fold of Xgboost F1: 0.72215 +/- 0.00177

Xgboost objective call #33 cur_best_score=0.72340 cur_best_std=0.00193
{'colsample_bytree': 1.0, 'gamma': 0.6000000000000001, 'learning_rate': 0.1, 'max_depth': 16, 'min_child_weight': 10, 'n_estimators': 1000, 'n_gpus': -1, 'num_class': 4, 'objective': 'multi:softmax', 'reg_alpha': 0.07, 'reg_lambda': 0.049, 'seed': 7, 'silent': 0, 'subsample': 1.0, 'tree_method': 'gpu_hist'}
Find the n_estimators
Optimal n_estimators : 226
5-fold of Xgboost F1: 0.72241 +/- 0.00300

Xgboost objective call #34 cur_best_score=0.72340 cur_best_std=0.00193
{'colsample_bytree': 0.9, 'gamma': 0.8, 'learning_rate': 0.1, 'max_depth': 16, 'min_child_weight': 10, 'n_estimators': 1000, 'n_gpus': -1, 'num_class': 4, 'objective': 'multi:softmax', 'reg_alpha': 0.088, 'reg_lambda': 0.067, 'seed': 7, 'silent': 0, 'subsample': 0.9500000000000001, 'tree_method': 'gpu_hist'}
Find the n_estimators
Optimal n_estimators : 201
5-fold of Xgboost F1: 0.7229

Optimal n_estimators : 357
5-fold of Xgboost F1: 0.71962 +/- 0.00338

Xgboost objective call #49 cur_best_score=0.72345 cur_best_std=0.00165
{'colsample_bytree': 0.8500000000000001, 'gamma': 0.9, 'learning_rate': 0.1, 'max_depth': 17, 'min_child_weight': 11, 'n_estimators': 1000, 'n_gpus': -1, 'num_class': 4, 'objective': 'multi:softmax', 'reg_alpha': 0.08, 'reg_lambda': 0.076, 'seed': 7, 'silent': 0, 'subsample': 0.8, 'tree_method': 'gpu_hist'}
Find the n_estimators
Optimal n_estimators : 155
5-fold of Xgboost F1: 0.72253 +/- 0.00256

Xgboost objective call #50 cur_best_score=0.72345 cur_best_std=0.00165
{'colsample_bytree': 0.9, 'gamma': 0.8, 'learning_rate': 0.1, 'max_depth': 9, 'min_child_weight': 2.4000000000000004, 'n_estimators': 1000, 'n_gpus': -1, 'num_class': 4, 'objective': 'multi:softmax', 'reg_alpha': 0.061, 'reg_lambda': 0.092, 'seed': 7, 'silent': 0, 'subsample': 0.6000000000000001, 'tree_method': 'gpu_hist'}
Find the n_estimators
Optimal n_estimators : 283
5-fold of Xgb