In [None]:
## XGBoost ## 
# eXtreme Gradient Boosted trees
# Boosting (ensemble): each new tree focuses on correcting the errors made by the previous trees
# Frequently wins Kaggle competitions; fast, easy to use and a good starting choice

In [None]:
## Features ##
# Regularised boosting prevents overfitting
# Auto handle missing values
# Parallel processing across multiple GPUs/clusters
# Cross-validate at each iteration (eval performance at each step)
# Incremental training (can halt/resume training when needed)
# Supports custom optimisation objectives
# Tree pruning (Gives deeper but optimised trees)
# Uses DMatrix and params passed as dictionary

In [1]:
# Load the Iris dataset, then report its dimensions and class names
from sklearn.datasets import load_iris

iris = load_iris()

# The shape of iris.data unpacks into (number of samples, number of features)
samples, features = iris.data.shape
print(samples, features, sep="\n")
print(list(iris.target_names))

150
4
['setosa', 'versicolor', 'virginica']


In [2]:
# Train/test split: hold out 20% of the samples for testing.
# A fixed random_state keeps the split identical between runs; if it is
# not set, the train/test data will differ each time the cell is run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=0,
)

In [3]:
# Wrap the train and test splits (features plus labels) in XGBoost's DMatrix format
import xgboost as xgb

train = xgb.DMatrix(data=X_train, label=y_train)
test = xgb.DMatrix(data=X_test, label=y_test)

In [4]:
# Hyperparameters for multiclass classification.
# The values are fixed here; normally they would be tuned by experiment.
params = dict(
    max_depth=4,                # maximum depth of each tree
    eta=0.3,                    # learning rate
    objective='multi:softmax',  # multiclass objective; predictions are class labels, not probabilities
    num_class=3,                # number of target classes (the three iris species)
)
epochs = 10  # number of boosting rounds

In [8]:
# Train the booster for `epochs` rounds, then predict on the held-out test set.
# Each predicted number identifies a specific iris species.
model = xgb.train(params, dtrain=train, num_boost_round=epochs)
predict = model.predict(test)
print(predict)

[2. 1. 0. 2. 0. 2. 0. 1. 1. 1. 2. 1. 1. 1. 1. 0. 1. 1. 0. 0. 2. 1. 0. 0.
 2. 0. 0. 1. 1. 0.]


In [9]:
# Measure accuracy: compare the predicted classes against the true test labels
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=predict)

1.0