In [1]:
"""
Created on Mon Jun 14 00:23:44 2021

@author: G. Cao
"""
from sklearn import datasets
import xgboost as xgb

In [2]:
# Load the iris dataset and separate the feature matrix from the class labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [3]:
# Let’s get all of our data set up. We’ll start off by creating a train-test split so we can see just how well XGBoost performs.
# We’ll go with an 80%-20% split this time.

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. A fixed random_state makes the split
# (and every metric computed from it) reproducible across kernel restarts;
# stratify=y keeps the class proportions of y in both partitions.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [5]:
# In order for XGBoost to be able to use our data, we’ll need to transform it into 
# a specific format that XGBoost can handle. That format is called DMatrix. It’s a 
# very simple one-liner to transform a numpy array of data to DMatrix format:

In [6]:
# Wrap each split in XGBoost's DMatrix container, attaching its labels.
D_train, D_test = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, Y_train), (X_test, Y_test))
)

# Defining an XGBoost model

In [7]:
# The simplest parameters are the max_depth (maximum depth of the decision trees being trained), 
# objective (the loss function being used), and num_class (the number of classes in the dataset). 
# The eta parameter (the learning rate) requires special attention.

In [8]:
# Booster hyper-parameters handed to xgb.train.
param = dict(
    eta=0.3,                     # learning rate
    max_depth=3,                 # maximum depth of each decision tree
    objective='multi:softprob',  # loss function being used
    num_class=3,                 # number of classes in the dataset
)

# Number of training iterations.
steps = 20

In [9]:
# It is common to set eta to small values in the range of 0.1 to 0.3.
# eta scales down the contribution of each newly added tree, which is fit to the current residuals.
# This smaller weighting of the residuals will still help us train a powerful model,
# but won’t let that model run away into deep complexity where overfitting is more likely to happen.

In [10]:
# Fit the booster on the training DMatrix for `steps` boosting rounds.
model = xgb.train(params=param, dtrain=D_train, num_boost_round=steps)



In [11]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score

In [12]:
# The original code takes an argmax over each row of `preds`, i.e. it treats
# the prediction as one row of per-class scores per test sample (the model was
# trained with the 'multi:softprob' objective). The predicted label is the
# index of the largest score in each row; a single vectorised argmax along
# axis 1 replaces the Python-level loop over rows.
preds = model.predict(D_test)
best_preds = np.argmax(preds, axis=1)

In [13]:
# Score the hard label predictions against the held-out labels.
# Precision and recall are macro-averaged (average='macro').
macro_precision = precision_score(Y_test, best_preds, average='macro')
macro_recall = recall_score(Y_test, best_preds, average='macro')
overall_accuracy = accuracy_score(Y_test, best_preds)

print("Precision = {}".format(macro_precision))
print("Recall = {}".format(macro_recall))
print("Accuracy = {}".format(overall_accuracy))

Precision = 1.0
Recall = 1.0
Accuracy = 1.0


In [14]:
#     The gamma parameter can also help with controlling overfitting. It specifies the minimum reduction in the loss required to make a further partition on a leaf node of the tree, i.e., if creating a new node doesn’t reduce the loss by a certain amount, then we won’t create it at all.
#     The booster parameter allows you to set the type of model you will use when building the ensemble. The default is gbtree, which builds an ensemble of decision trees. If your data isn’t too complicated, you can go with the faster and simpler gblinear option, which builds an ensemble of linear models.
#     Setting the optimal hyperparameters of any ML model can be a challenge. So why not let scikit-learn do it for you? We can combine scikit-learn’s grid search with an XGBoost classifier quite easily:

In [15]:
from sklearn.model_selection import GridSearchCV

In [16]:
# Estimator whose hyper-parameters the grid search will tune.
clf = xgb.XGBClassifier()

# Candidate values to try for each hyper-parameter.
parameters = dict(
    eta=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth=[3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
)

In [17]:
# Grid search over `parameters` with 3-fold cross-validation, scored by
# negative log loss and run with 4 parallel jobs.
grid = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    scoring="neg_log_loss",
    cv=3,
    n_jobs=4,
)

In [18]:
# Run the search on the training split. The grid holds
# 6 * 8 * 4 * 5 * 4 = 3840 combinations and cv=3, so this performs
# 11,520 cross-validation fits and can take a while.
grid.fit(X=X_train, y=Y_train)





GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs...
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, va

In [19]:
# A trained XGBoost model can be dumped as a human-readable description into a
# text file. Note that `model` is the booster returned by xgb.train earlier,
# not the estimator tuned by the grid search.
dump_path = 'dump.raw.txt'
model.dump_model(dump_path)