In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score

labelencoder = LabelEncoder()

import xgboost as xgb

import hyperopt
from hyperopt import tpe, hp, fmin, Trials, STATUS_OK, STATUS_FAIL, space_eval
import numpy as np



In [2]:
higgs_boson = pd.read_csv("dataset/training.csv")
higgs_boson = higgs_boson.drop(['Weight'], axis=1)
higgs_boson = higgs_boson[:100000]

In [3]:
higgs_boson['Label'] = labelencoder.fit_transform(higgs_boson['Label'])

X, y= higgs_boson[higgs_boson.columns[1:31]], higgs_boson[higgs_boson.columns[-1]]

In [4]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1)

dtrain = xgb.DMatrix(x_train,label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

In [5]:
xgb_cla_params = {
    'tree_method':      'gpu_hist',
    'booster':          hp.choice('booster_classification', ['gbtree', 'dart']),
    'learning_rate':    hp.choice('learning_rate_classification',    np.arange(0.05, 0.31, 0.05)),
    'max_depth':        hp.choice('max_depth_classification',        np.arange(5, 16, 1, dtype=int)),
    'min_child_weight': hp.choice('min_child_weight_classification', np.arange(1, 8, 1, dtype=int)),
    'colsample_bytree': hp.choice('colsample_bytree_classification', np.arange(0.3, 0.8, 0.1)),
    'colsample_bylevel':hp.choice('colsample_bylevel_classification', np.arange(0.3, 0.8, 0.1)),
    'colsample_bynode': hp.choice('colsample_bynode_classification', np.arange(0.3, 0.8, 0.1)),
    'subsample':        hp.uniform('subsample_classification', 0.5, 1),
    'sampling_method':  hp.choice('sampling_method_classification', ['uniform', 'gradient_based']),
    'n_estimators':     30,
    'objective':        'binary:logistic',
}

xgb_cv_params = {
    'tree_method':      'gpu_hist',
    'booster':          hp.choice('booster_cv', ['gbtree', 'dart']),
    'learning_rate':    hp.choice('learning_rate_cv',    np.arange(0.05, 0.31, 0.05)),
    'max_depth':        hp.choice('max_depth_cv',        np.arange(5, 16, 1, dtype=int)),
    'min_child_weight': hp.choice('min_child_weight_cv', np.arange(1, 8, 1, dtype=int)),
    'colsample_bytree': hp.choice('colsample_bytree_cv', np.arange(0.3, 0.8, 0.1)),
    'colsample_bylevel':hp.choice('colsample_bylevel_cv', np.arange(0.3, 0.8, 0.1)),
    'colsample_bynode': hp.choice('colsample_bynode_cv', np.arange(0.3, 0.8, 0.1)),
    'subsample':        hp.uniform('subsample_cv', 0.5, 1),
    'sampling_method':  hp.choice('sampling_method_cv', ['uniform', 'gradient_based']),
    'objective':        'binary:logistic',
}

xgb_fit_params = { 
    'eval_metric': 'auc',
    'early_stopping_rounds': 10,
    'verbose': False 
}

xgb_para = dict()
xgb_para['cla_params'] = xgb_cla_params
xgb_para['cv_params'] = xgb_cv_params
xgb_para['fit_params'] = xgb_fit_params
xgb_para['loss_func'] =  lambda y, pred: roc_auc_score(y, pred)

In [6]:
class HPOpt(object):
    def __init__(self, x_train, x_test, y_train, y_test):
        self.x_train = x_train
        self.x_test  = x_test
        self.y_train = y_train
        self.y_test  = y_test
        
        self.dtrain = xgb.DMatrix(x_train,label=y_train)
        self.dtest = xgb.DMatrix(x_test, label=y_test)

    def process(self, fn_name, space, trials, algo, max_evals):
        fn = getattr(self, fn_name)
        try:
            result = fmin(fn=fn, space=space, algo=algo, max_evals=max_evals, trials=trials)
        except Exception as e:
            print(e)
            return {'status': STATUS_FAIL,
                    'exception': str(e)}
        return result, trials
    
    def xgb_test(self, optimal, para):
        cla = xgb.XGBClassifier(**optimal)
        cla.fit(self.x_train, self.y_train, eval_set=[(self.x_train, self.y_train), (self.x_test, self.y_test)],**para['fit_params'])

        pred = cla.predict(self.x_test)
        score = para['loss_func'](self.y_test, pred)
    
        return score

    def xgb_cla(self, para):
        cla = xgb.XGBClassifier(**para['cla_params'])
        return self.train_class(cla, para)
    
    def xgb_cv(self, para):
        cla_cv = xgb.cv(dtrain=self.dtrain, params=para['cv_params'], nfold=5, num_boost_round=30,
                       early_stopping_rounds = 10, metrics='auc')
        return self.train_cla_cv(cla_cv, para)

    def train_class(self, cla, para):
        cla.fit(self.x_train, self.y_train,
                eval_set=[(self.x_train, self.y_train), (self.x_test, self.y_test)],
                **para['fit_params'])
        pred = cla.predict(self.x_test)
        score = para['loss_func'](self.y_test, pred)
        loss = 1 - score
        return {'loss': loss, 'status': STATUS_OK}
    
    def train_cla_cv(self, cross_v, para):
        num_rounds = len(cross_v["test-auc-mean"])
        cv_result = cross_v["test-auc-mean"][num_rounds - 1]
        loss = 1- cv_result
        return {'loss': loss, 'status': STATUS_OK}

In [7]:
obj = HPOpt(x_train, x_test, y_train, y_test)


xgb_opt_class = obj.process(
    fn_name="xgb_cla", space=xgb_para, trials=Trials(), algo=tpe.suggest, max_evals=100
)
xgb_opt_cv = obj.process(
    fn_name="xgb_cv", space=xgb_para, trials=Trials(), algo=tpe.suggest, max_evals=100
)

100%|██████████| 100/100 [02:50<00:00,  1.70s/trial, best loss: 0.19195183395039705]
100%|██████████| 100/100 [11:47<00:00,  7.07s/trial, best loss: 0.09611559999999986]


# Test Score

In [11]:
optimal_hp, trials = xgb_opt_class
optim_class = space_eval(xgb_para['cla_params'], optimal_hp)

obj.xgb_test(optim_class, xgb_para)

0.808048166049603

In [13]:
optimal_hp, trials = xgb_opt_cv
optim_cv = space_eval(xgb_para['cv_params'], optimal_hp)

obj.xgb_test(optim_cv, xgb_para)

0.806816353813445

# Grid Search

In [16]:
from sklearn.model_selection import GridSearchCV

xgb_clf = xgb.XGBClassifier()

xgb_param_grid = {
    'tree_method':      ['gpu_hist'],
    'booster':          ['gbtree', 'dart'],
    'learning_rate':    np.arange(0.05, 0.31, 0.05),
    'max_depth':        np.arange(5, 16, 1, dtype=int),
    'min_child_weight': np.arange(1, 8, 1, dtype=int),
    'colsample_bytree': np.arange(0.3, 0.8, 0.1),
    'colsample_bylevel':np.arange(0.3, 0.8, 0.1),
    'colsample_bynode': np.arange(0.3, 0.8, 0.1),
    'subsample':        np.arange(0.5, 1, 0.1),
    'sampling_method':  ['uniform', 'gradient_based'],
    'n_estimators':     [30],
    'objective':        ['binary:logistic'],
}

In [17]:
hr_grid = GridSearchCV(estimator=xgb_clf,
                       param_grid=xgb_param_grid,
                       scoring='roc_auc',
                       n_jobs=8,
                       cv=5,
                       refit=True, 
                       return_train_score=True)

In [19]:
hr_grid.fit(x_train, y_train)


KeyboardInterrupt: 

In [None]:
## 최고성능
best_score = hr_grid.best_score_
# 최고성능을 내는 행을 찾아냄
best_row = hr_grid.best_index_

# Random Search

In [None]:
xgb_clf = xgb.XGBClassifier()

xgb_param_grid = {
    'tree_method':      ['gpu_hist'],
    'booster':          ['gbtree', 'dart'],
    'learning_rate':    np.arange(0.05, 0.31, 0.05),
    'max_depth':        np.arange(5, 16, 1, dtype=int),
    'min_child_weight': np.arange(1, 8, 1, dtype=int),
    'colsample_bytree': np.arange(0.3, 0.8, 0.1),
    'colsample_bylevel':np.arange(0.3, 0.8, 0.1),
    'colsample_bynode': np.arange(0.3, 0.8, 0.1),
    'subsample':        np.arange(0.5, 1, 0.1),
    'sampling_method':  ['uniform', 'gradient_based'],
    'n_estimators':     [30],
    'objective':        ['binary:logistic'],
}

# Create a random search object
xgb_random = RandomizedSearchCV(estimator = xgb_clf,
                                param_distributions = xgb_param_grid,
                                n_iter = 10,
                                scoring='roc_auc', 
                                n_jobs=8, 
                                cv = 3, 
                                refit=True, 
                                return_train_score = True)

In [None]:
xgb_random.fit(X_train, y_train)