This follows the example at https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/

But modified to use GBM (Gradient Boosting Machines)

In [11]:
# Load the library with the iris dataset
from sklearn.datasets import load_iris

# Load xgboost
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Load pandas
import pandas as pd

# Load numpy
import numpy as np

import time

# Create an object called iris with the iris data
iris = load_iris()

# Create a dataframe with the four feature variables
df = pd.DataFrame(iris.data, columns=iris.feature_names)

print(len(df))

# View the top 5 rows
df.head()

150


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [12]:
# Add a new column with the species names, this is what we are going to try to predict
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

# View the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [13]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [14]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'],
      dtype='<U10')

## Create training and test data

In [15]:
# Create a new column that for each row, generates a random number between 0 and 1, and
# if that value is less than or equal to .75, then sets the value of that cell as True
# and false otherwise. This is a quick and dirty way of randomly assigning some rows to
# be used as the training data and some as the test data.
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75

# View the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,False
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [16]:
# Create two new dataframes, one with the training rows, one with the test rows
train, test = df[df['is_train']==True], df[df['is_train']==False]

In [17]:
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 112
Number of observations in the test data: 38


## Preprocess Data

In [18]:
# Create a list of the feature column's names

# print df.columns
# Index([u'sepal length (cm)', u'sepal width (cm)', u'petal length (cm)',
#        u'petal width (cm)', u'species', u'is_train'],
#       dtype='object')

features = df.columns[:4]

features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [19]:
# train['species'] contains the actual species names. Before we can use it,
# we need to convert each species name into a digit. So, in this case there
# are three species, which have been coded as 0, 1, or 2.

# print pd.factorize(train['species'])
# (array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
#        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
#        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
#        2, 2, 2, 2]), Index([u'setosa', u'versicolor', u'virginica'], dtype='object'))

y = pd.factorize(train['species'])[0]

y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Train the Random Forest Classifier

In [20]:
# Create the classifier
clf = GradientBoostingClassifier(n_estimators=5000, learning_rate=0.01, max_depth=3, random_state=0)

# Train the classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(train[features], y)


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=5000,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False)

## Apply Classifier to Test Data

In [21]:
# Apply the classifier we trained to the test data (which, remember, it has never seen before)
clf.predict(test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2])

In [22]:
# View the predicted probabilities of the first 10 observations
clf.predict_proba(test[features])[0:10]
# There are three species of plant, thus [ 1. , 0. , 0. ] tells us that the classifier is certain that 
# the plant is the first class. Taking another example, [ 0.9, 0.1, 0. ] tells us that the classifier gives 
# a 90% probability the plant belongs to the first class and a 10% probability the plant belongs to the 
# second class. Because 90 is greater than 10, the classifier predicts the plant is the first class.

array([[  9.99890742e-01,   5.92935526e-05,   4.99647745e-05],
       [  9.99890742e-01,   5.92935526e-05,   4.99647745e-05],
       [  9.99890742e-01,   5.92935526e-05,   4.99647745e-05],
       [  9.99890742e-01,   5.92935526e-05,   4.99647745e-05],
       [  9.99890742e-01,   5.92935526e-05,   4.99647745e-05],
       [  9.99890742e-01,   5.92935526e-05,   4.99647745e-05],
       [  9.99890742e-01,   5.92935526e-05,   4.99647745e-05],
       [  9.99890742e-01,   5.92935526e-05,   4.99647745e-05],
       [  9.99890742e-01,   5.92935526e-05,   4.99647745e-05],
       [  9.99890742e-01,   5.92935526e-05,   4.99647745e-05]])

## Evaluate Classifier

In [23]:
# Create actual english names for the plants for each predicted plant class
preds = iris.target_names[clf.predict(test[features])]
preds

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'virginica', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'virginica', 'virginica',
       'virginica', 'virginica', 'virginica', 'virginica', 'virginica',
       'virginica', 'virginica', 'versicolor', 'virginica', 'virginica',
       'virginica', 'virginica'],
      dtype='<U10')

In [24]:
# View the ACTUAL species for the first five observations
test['species'].head()

0     setosa
5     setosa
6     setosa
7     setosa
10    setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

In [25]:
# Create confusion matrix
pd.crosstab(test['species'], preds, rownames=['Actual Species'], colnames=['Predicted Species'])

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,12,0,0
versicolor,0,11,1
virginica,0,1,13


In [26]:
# View a list of the features and their importance scores

# print clf.feature_importances_
# [ 0.02412595  0.01061858  0.47888931  0.48636617]

list(zip(train[features], clf.feature_importances_))


[('sepal length (cm)', 0.0048889356079267329),
 ('sepal width (cm)', 0.030486161408710483),
 ('petal length (cm)', 0.067511922637997615),
 ('petal width (cm)', 0.062979647012031847)]

## Compute Accuracy, Precision, Recall

In [27]:
print(classification_report(test['species'], preds))
print(accuracy_score(test['species'], preds))

             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00        12
 versicolor       0.92      0.92      0.92        12
  virginica       0.93      0.93      0.93        14

avg / total       0.95      0.95      0.95        38

0.947368421053
