# BUILD XGBOOST

In [1]:
import numpy as np
from sklearn import datasets
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import pandas as pd
from datetime import datetime
from time import time
from sklearn.metrics import balanced_accuracy_score

### GLOBAL VARIABLES

In [2]:
DATAPATH = 'data/features/'
SEED = 47
NITER = 100
CV = 3
SCORE = 'balanced_accuracy'
handlingnull = False
NJOBS = 7
USEGPU = True
NCLASS = 3 # number class to predict (if bivar set 0)

### LOAD DATASET

In [3]:
train_features = np.load(DATAPATH+'X_features_002.npy')

In [4]:
train_labels = np.load(DATAPATH+'y.npy')

### TRAIN MODEL

#### Load and set hyperparameters

In [5]:
# ======== General Parameters ======= #

# Select the type of model to run at each iteration. gbtree or gblinear.
booster = 'gbtree'


# ======== Booster Parameters ======== # 

# Analogous to learning rate in GBM. 
# Typical final values to be used: 0.01-0.2
eta = [0.01] 


# Control the balance of positive and negative weights, useful for unbalanced classes. 
# A typical value to consider: sum(negative instances) / sum(positive instances)scale_pos_weight = 1
scale_pos_weight = [5,1,2]


# Learning Task Parameters

# This defines the loss function to be minimized. 
# - binary:logistic –logistic regression for binary classification, returns predicted probability (not class)
# - multi:softmax –multiclass classification using the softmax objective, returns predicted class (not probabilities)
#   you also need to set an additional num_class (number of classes) parameter defining the number of unique classes
# - multi:softprob –same as softmax, but returns predicted probability of each data point belonging to each class.
objective  = 'multi:softprob'


# The metric to be used for validation data.
# - rmse – root mean square error
# - mae – mean absolute error
# - logloss – negative log-likelihood
# - error – Binary classification error rate (0.5 threshold)
# - merror – Multiclass classification error rate
# - mlogloss – Multiclass logloss
# - auc: Area under the curve
eval_metric = 'mlogloss'

In [6]:
xgb_params = np.load('output/hyperparameters/gseach_xgboost_classifier_bestparams_d2019-11-09.npy', allow_pickle=True).tolist()

In [7]:
xgb_params['seed'] = SEED
xgb_params['booster'] = booster
xgb_params['objective'] = objective
xgb_params['eval_metric'] = eval_metric
xgb_params['num_threads'] = NJOBS
xgb_params['num_class'] = NCLASS
xgb_params['verbose'] = 0
xgb_params['scale_pos_weight'] = scale_pos_weight

In [8]:
xgb_params['n_estimators'] = 402

In [9]:
if USEGPU:
    xgb_params['tree_method'] = 'gpu_hist'
    xgb_params['gpu_id'] = 0

In [10]:
xgb_params

{'colsample_bytree': 0.72,
 'gamma': 0.1,
 'learning_rate': 0.01,
 'max_depth': 9,
 'min_child_weight': 3,
 'reg_alpha': 0.001,
 'reg_lambda': 0.9,
 'subsample': 0.85,
 'seed': 47,
 'booster': 'gbtree',
 'objective': 'multi:softprob',
 'eval_metric': 'mlogloss',
 'num_threads': 7,
 'num_class': 3,
 'verbose': 0,
 'scale_pos_weight': [5, 1, 2],
 'n_estimators': 402,
 'tree_method': 'gpu_hist',
 'gpu_id': 0}

In [11]:
### create a DMatrix and handling Null values
if handlingnull:
    #train_features[np.isnan(train_features)] = -9999
    xgtrain = xgb.DMatrix(train_features, train_labels, missing=-9999)
else:
    xgtrain = xgb.DMatrix(train_features, train_labels)

In [12]:
model = xgb.train(xgb_params, xgtrain, verbose_eval=False)

In [13]:
model.save_model('models/xgb_002.model')