# XGBOOST - RANDOM SEARCH

In [1]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn import datasets
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
from datetime import datetime
from time import time

### GLOBAL VARIABLES

In [20]:
DATAPATH = 'data/train_test/'
SEED = 47
NITER = 100
CV = 5
SCORE = 'balanced_accuracy'
handlingnull = False
NJOBS = 5
USEGPU = True
NCLASS = 3 # number class to predict (if bivar set 0)

### LOAD DATASET

In [3]:
train_features = np.load(DATAPATH+'X_features_clusters_001_train.npy')

In [4]:
train_labels = np.load(DATAPATH+'y_features_clusters_001_train.npy')

In [5]:
train_features.shape

(1225, 1275)

In [6]:
train_labels.shape

(1225,)

In [7]:
### create a DMatrix and handling Null values
if handlingnull:
    #train_features[np.isnan(train_features)] = -9999
    xgtrain = xgb.DMatrix(train_features, train_labels, missing=-9999)
else:
    xgtrain = xgb.DMatrix(train_features, train_labels)

### TRAIN MODEL

#### Set hyperparameters

In [8]:
# ======== General Parameters ======= #

# Select the type of model to run at each iteration. gbtree or gblinear.
booster = 'gbtree'


# ======== Booster Parameters ======== # 

# Analogous to learning rate in GBM. 
# Typical final values to be used: 0.01-0.2
eta = [0.01] 

# Defines the minimum sum of weights of all observations required in a child.
min_child_weight = [i for i in range(1,10,2)]

# The maximum depth of a tree
max_depth = [i for i in range(3,10,2)] 

# A node is split only when the resulting split gives a positive reduction in the loss function. 
# Gamma specifies the minimum loss reduction required to make a split.
gamma = [i/10.0 for i in range(0,5)]

# Denotes the fraction of observations to be randomly samples for each tree.
subsample = [i/10.0 for i in range(6,10)]

# Denotes the fraction of columns to be randomly samples for each tree.
colsample_bytree = [i/10.0 for i in range(6,10)]

# L2 regularization term on weights (analogous to Ridge regression)
reg_lambda = [i/10.0 for i in range(4,10)]

# L1 regularization term on weight (analogous to Lasso regression)
reg_alpha = [0, 0.001, 0.005, 0.01, 0.05]

# Control the balance of positive and negative weights, useful for unbalanced classes. 
# A typical value to consider: sum(negative instances) / sum(positive instances)scale_pos_weight = 1
scale_pos_weight = [5, 1, 2] #int((len(train_labels) - np.sum(train_labels))/np.sum(train_labels))


# Learning Task Parameters

# This defines the loss function to be minimized. 
# - binary:logistic –logistic regression for binary classification, returns predicted probability (not class)
# - multi:softmax –multiclass classification using the softmax objective, returns predicted class (not probabilities)
#   you also need to set an additional num_class (number of classes) parameter defining the number of unique classes
# - multi:softprob –same as softmax, but returns predicted probability of each data point belonging to each class.
objective  = 'multi:softprob'


# The metric to be used for validation data.
# - rmse – root mean square error
# - mae – mean absolute error
# - logloss – negative log-likelihood
# - error – Binary classification error rate (0.5 threshold)
# - merror – Multiclass classification error rate
# - mlogloss – Multiclass logloss
# - auc: Area under the curve
eval_metric = 'mlogloss'

[xgboost params](https://xgboost.readthedocs.io/en/latest/python/python_api.html)

In [9]:
# Create the random grid
random_grid = {
    'learning_rate' : eta,
    'min_child_weight' : min_child_weight,
    'max_depth' : max_depth,
    'gamma': gamma,
    'subsample': subsample,
    'colsample_bytree' : colsample_bytree,
    'reg_lambda' : reg_lambda,
    'reg_alpha' : reg_alpha,
}

In [10]:
random_grid

{'learning_rate': [0.01],
 'min_child_weight': [1, 3, 5, 7, 9],
 'max_depth': [3, 5, 7, 9],
 'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
 'subsample': [0.6, 0.7, 0.8, 0.9],
 'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
 'reg_lambda': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05]}

#### Find num boost

In [11]:
early_stopping_rounds = 50

In [12]:
model=XGBClassifier(seed=SEED, booster=booster, objective=objective,  scale_pos_weight = scale_pos_weight, nthread=NJOBS)
xgb_param = model.get_xgb_params()
xgb_param['num_class'] = NCLASS

if USEGPU:
    xgb_param['tree_method'] = 'gpu_hist'
    xgb_param['gpu_id'] = 0


In [13]:
cvresult = xgb.cv(xgb_param, xgtrain, 
                  num_boost_round = 1000, 
                  nfold = CV, 
                  metrics = eval_metric, 
                  early_stopping_rounds = early_stopping_rounds,
                  #num_class= NCLASS,
                  seed = SEED)

In [14]:
n_estimators = cvresult.shape[0]

In [15]:
print("Best number of boosters: ", n_estimators)

Best number of boosters:  58


In [21]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
model = XGBClassifier(n_estimators=n_estimators, scale_pos_weight=scale_pos_weight,  objective=objective)

In [22]:
if USEGPU:
    model.set_params(gpu_id = 0)
    model.set_params(tree_method='gpu_hist')

In [23]:
# Random search of parameters, using CV fold cross validation, 
# search across NITER different combinations, and use all available cores
xgboost_rsearch = RandomizedSearchCV(estimator = model, param_distributions = random_grid, scoring=SCORE, n_iter = NITER, cv = CV, verbose=2, random_state=SEED, n_jobs = NJOBS)# Fit the random search model


#### Training

In [24]:
start = time()
xgboost_rsearch.fit(train_features, train_labels)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), NITER))

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  4.4min
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed: 14.0min
[Parallel(n_jobs=5)]: Done 355 tasks      | elapsed: 30.1min
[Parallel(n_jobs=5)]: Done 500 out of 500 | elapsed: 39.8min finished


RandomizedSearchCV took 2399.15 seconds for 100 candidates parameter settings.


#### Saving results

In [25]:
cv_results = pd.DataFrame(xgboost_rsearch.cv_results_)



In [26]:
cv_results.to_csv('output/results/rsearch_xgboost_classifier_d' + str(datetime.now().date()) + '.csv',sep=';',index=False)

#### Best estimator

In [27]:
xgboost_rsearch.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.9, gamma=0.1, gpu_id=0,
       learning_rate=0.01, max_delta_step=0, max_depth=9,
       min_child_weight=5, missing=None, n_estimators=58, n_jobs=1,
       nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0.05, reg_lambda=0.4, scale_pos_weight=[5, 1, 2],
       seed=None, silent=None, subsample=0.7, tree_method='gpu_hist',
       verbosity=1)

#### Best parameter

In [28]:
xgboost_rsearch.best_params_

{'subsample': 0.7,
 'reg_lambda': 0.4,
 'reg_alpha': 0.05,
 'min_child_weight': 5,
 'max_depth': 9,
 'learning_rate': 0.01,
 'gamma': 0.1,
 'colsample_bytree': 0.9}

#### Best Score

In [29]:
print(SCORE,' : ', xgboost_rsearch.best_score_)

balanced_accuracy  :  0.7563362902305817


#### Saving best hyperparameters

In [30]:
np.save('output/hyperparameters/rseach_xgboost_classifier_bestparams_d' + str(datetime.now().date()) + '.npy', xgboost_rsearch.best_params_)

In [31]:
np.save('output/results/rseach_xgboost_classifier_best_estimator_d' + str(datetime.now().date()) + '.npy', xgboost_rsearch.best_estimator_)