In [1]:
# organize imports
from __future__ import print_function

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import numpy as np
import h5py
import os
import json
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import datetime
import time
import xgboost as xgb

In [2]:
# load the user configs
with open('room_recognition/conf/conf_xgboost.json') as f:    
  config = json.load(f)

# config variables

seed      = config["seed"]
train_features_path   = config["train_features_path"]
train_labels_path   = config["train_labels_path"]
test_features_path   = config["test_features_path"]
test_labels_path   = config["test_labels_path"]

results     = config["results"]
classifier_path = config["classifier_path"]

num_classes   = config["num_classes"]

model_path=config["model_path"]

In [3]:
# import train features and labels
h5f_data_train  = h5py.File(train_features_path, 'r')
h5f_label_train = h5py.File(train_labels_path, 'r')

train_features_string = h5f_data_train['dataset_1']
train_labels_string   = h5f_label_train['dataset_1']

trainfeatures = np.array(train_features_string)
trainlabels   = np.array(train_labels_string)

h5f_data_train.close()
h5f_label_train.close()

In [4]:
# verify the shape of features and labels
print ("[INFO] features shape: {}".format(trainfeatures.shape))
print ("[INFO] labels shape: {}".format(trainlabels.shape))



[INFO] features shape: (1040, 2048)
[INFO] labels shape: (1040,)


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(trainfeatures, trainlabels, test_size=0.1, random_state=123)

In [6]:
dtrain = xgb.DMatrix(X_train, label=y_train) 
dtest = xgb.DMatrix(X_test, label=y_test)

In [7]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective': 'multi:softmax'
}

Parameters num_boost_round and early_stopping_rounds
From https://cambridgespark.com/content/tutorials/hyperparameter-tuning-in-xgboost/index.html
Since trees are built sequentially, instead of fixing the number of rounds at the beginning, we can test our model at each step and see if adding a new tree/round improves performance.

To do so, we define a test dataset and a metric that is used to assess performance at each round. If performance haven't improved for N rounds (N is defined by the variable early_stopping_round), we stop the training and keep the best number of boosting rounds.


Evaluation metrics for validation data is specified using 'eval_metric'
merror: Multiclass classification error rate. It is calculated as #(wrong cases)/#(all cases)

In [8]:
params['eval_metric'] = "merror"  #multiclass Classification error rate. (#wrong cases/ #all cases)

In [9]:
params['num_class'] = 8
params['silent'] = 1

In [10]:
num_boost_round = 999

In [11]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-merror:0.230769
Will train until Test-merror hasn't improved in 10 rounds.
[1]	Test-merror:0.182692
[2]	Test-merror:0.192308
[3]	Test-merror:0.153846
[4]	Test-merror:0.134615
[5]	Test-merror:0.105769
[6]	Test-merror:0.115385
[7]	Test-merror:0.105769
[8]	Test-merror:0.105769
[9]	Test-merror:0.096154
[10]	Test-merror:0.086538
[11]	Test-merror:0.096154
[12]	Test-merror:0.096154
[13]	Test-merror:0.096154
[14]	Test-merror:0.096154
[15]	Test-merror:0.096154
[16]	Test-merror:0.105769
[17]	Test-merror:0.096154
[18]	Test-merror:0.096154
[19]	Test-merror:0.096154
[20]	Test-merror:0.096154
Stopping. Best iteration:
[10]	Test-merror:0.086538



In [12]:
print("Best MERROR: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

Best MERROR: 0.09 with 11 rounds


To tune the other hyperparameters, next we will use the cv function from XGBoost. It does cross-validation on the training dataset and returns a mean merror score.

 Here we will use a large number again fro num_boost_round and count on early_stopping_rounds to find the optimal number of rounds before reaching the maximum.

In [14]:
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=seed,
    nfold=5,
    metrics={'merror'},
    early_stopping_rounds=10
)

cv_results

Unnamed: 0,test-merror-mean,test-merror-std,train-merror-mean,train-merror-std
0,0.310906,0.035761,0.026441,0.00628
1,0.264939,0.042871,0.005075,0.002591
2,0.241438,0.038071,0.000801,0.000654
3,0.233957,0.042327,0.0,0.0
4,0.227552,0.036747,0.0,0.0
5,0.212584,0.034398,0.0,0.0
6,0.212595,0.036728,0.0,0.0
7,0.202981,0.039528,0.0,0.0
8,0.200842,0.039747,0.0,0.0
9,0.206172,0.043787,0.0,0.0


cv returns a table where the rows correspond to the number of boosting trees used, Here we stopped after 45 trees.
The 4 columns correspond to the mean and standard deviation of "merror" on the test dataset and on the train dataset. We will focus on improving the merror for test.

In [15]:
cv_results['test-merror-mean'].min()


0.16769240000000002

Parameters max_depth and min_child_weight:
These parameters add constraints on the architecture of the trees.

max_depth is the maximum number of nodes allowed from the root to the farthest leaf of a tree. Deeper trees can model more complex relationships by adding more nodes, but as we go deeper, splits become less relevant and are sometimes only due to noise, causing the model to overfit.
min_child_weight is the minimum weight (or number of samples if all samples have a weight of 1) required in order to create a new node in the tree. A smaller min_child_weight allows the algorithm to create children that correspond to fewer samples, thus allowing for more complex trees, but again, more likely to overfit.
Thus, those parameters can be used to control the complexity of the trees. It is important to tune them together in order to find a good trade-off between model bias and variance

In [18]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(4,9)
    for min_child_weight in range(1,4)
]

comment:

float("Inf") It acts as an unbounded upper value for comparison. This is useful for finding lowest values for something. 

In [19]:
# Define initial best params and MAE
min_merror = float("Inf")  
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))

    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=seed,
        nfold=5,
        metrics={'merror'},
        early_stopping_rounds=10
    )

    # Update best MAE
    mean_merror = cv_results['test-merror-mean'].min()
    boost_rounds = cv_results['test-merror-mean'].idxmin()
    print("\tMERROR {} for {} rounds".format(mean_merror, boost_rounds))
    if mean_merror < min_merror:
        min_merror = mean_merror
        best_params = (max_depth,min_child_weight)

print("Best params: {}, {}, MERROR: {}".format(best_params[0], best_params[1], min_merror))

CV with max_depth=4, min_child_weight=1
	MERROR 0.1634146 for 48 rounds
CV with max_depth=4, min_child_weight=2
	MERROR 0.1570314 for 39 rounds
CV with max_depth=4, min_child_weight=3
	MERROR 0.16450099999999998 for 50 rounds
CV with max_depth=5, min_child_weight=1
	MERROR 0.1698204 for 25 rounds
CV with max_depth=5, min_child_weight=2
	MERROR 0.163426 for 32 rounds
CV with max_depth=5, min_child_weight=3
	MERROR 0.166629 for 35 rounds
CV with max_depth=6, min_child_weight=1
	MERROR 0.16769240000000002 for 45 rounds
CV with max_depth=6, min_child_weight=2
	MERROR 0.161281 for 38 rounds
CV with max_depth=6, min_child_weight=3
	MERROR 0.1612982 for 32 rounds
CV with max_depth=7, min_child_weight=1
	MERROR 0.15915339999999997 for 56 rounds
CV with max_depth=7, min_child_weight=2
	MERROR 0.16128699999999999 for 26 rounds
CV with max_depth=7, min_child_weight=3
	MERROR 0.1527476 for 45 rounds
CV with max_depth=8, min_child_weight=1
	MERROR 0.1687622 for 47 rounds
CV with max_depth=8, min_ch

#### Best score with a max_depth of 7 and min_child_weight of 3 ,  MERROR=0.152

In [20]:
params['max_depth'] = 7
params['min_child_weight'] = 3

Parameters subsample and colsample_bytree
Those parameters control the sampling of the dataset that is done at each boosting round.

Instead of using the whole training set every time, we can build a tree on slightly different data at each step, which makes it less likely to overfit to a single sample or feature.

subsample corresponds to the fraction of observations (the rows) to subsample at each step. By default it is set to 1 meaning that we use all rows.
colsample_bytree corresponds to the fraction of features (the columns) to use. By default it is set to 1 meaning that we will use all features.

In [21]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(1,10)]
    for colsample in [i/10. for i in range(1,10)]
]

In [22]:
min_merror = float("Inf")  
best_params = None

# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))

    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample

    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=seed,
        nfold=5,
        metrics={'merror'},
        early_stopping_rounds=10
    )
    # Update best MERROR
    mean_merror = cv_results['test-merror-mean'].min()
    boost_rounds = cv_results['test-merror-mean'].idxmin()
    print("\tMERROR {} for {} rounds".format(mean_merror, boost_rounds))
    if mean_merror < min_merror:
        min_merror = mean_merror
        best_params = (subsample,colsample)

print("Best params: {}, {}, MERROR: {}".format(best_params[0], best_params[1], min_merror))

CV with subsample=0.9, colsample=0.9
	MERROR 0.1644898 for 26 rounds
CV with subsample=0.9, colsample=0.8
	MERROR 0.17088399999999998 for 32 rounds
CV with subsample=0.9, colsample=0.7
	MERROR 0.16449560000000002 for 36 rounds
CV with subsample=0.9, colsample=0.6
	MERROR 0.1623618 for 41 rounds
CV with subsample=0.9, colsample=0.5
	MERROR 0.1794458 for 22 rounds
CV with subsample=0.9, colsample=0.4
	MERROR 0.15807279999999996 for 71 rounds
CV with subsample=0.9, colsample=0.3
	MERROR 0.1559448 for 35 rounds
CV with subsample=0.9, colsample=0.2
	MERROR 0.1602228 for 25 rounds
CV with subsample=0.9, colsample=0.1
	MERROR 0.16982599999999998 for 40 rounds
CV with subsample=0.8, colsample=0.9
	MERROR 0.1516782 for 39 rounds
CV with subsample=0.8, colsample=0.8
	MERROR 0.1580952 for 49 rounds
CV with subsample=0.8, colsample=0.7
	MERROR 0.17838759999999998 for 21 rounds
CV with subsample=0.8, colsample=0.6
	MERROR 0.17626579999999997 for 37 rounds
CV with subsample=0.8, colsample=0.5
	MERRO

In [23]:
print("Best params: {}, {}, MERROR: {}".format(best_params[0], best_params[1], min_merror))

Best params: 0.6, 0.2, MERROR: 0.14952200000000002


In [24]:
params['subsample'] = .6
params['colsample_bytree'] = 0.2

Parameter ETA
The ETA parameter controls the learning rate. It corresponds to the shrinkage of the weights associated to features after each round, in other words it defines the amount of "correction" we make at each step.
In practice, having a lower eta makes our model more robust to overfitting thus, usually, the lower the learning rate, the best. But with a lower eta, we need more boosting rounds, which takes more time to train, sometimes for only marginal improvements. Let's try a couple of values here, and time them with the notebook command:

In [25]:
min_merror = float("Inf")  
best_params = None

for eta in [.5, .4 ,.3, .2, .1]:
    print("CV with eta={}".format(eta))

    # We update our parameters
    params['eta'] = eta

    # Run and time CV
    
    %time cv_results = xgb.cv(params, dtrain, num_boost_round=num_boost_round, seed=seed, nfold=5, metrics=['merror'], early_stopping_rounds=10)
    
    

    # Update best score
    mean_merror = cv_results['test-merror-mean'].min()
    boost_rounds = cv_results['test-merror-mean'].idxmin()
    print("\tMERROR {} for {} rounds\n".format(mean_merror, boost_rounds))
    if mean_merror < min_merror:
        min_merror = mean_merror
        best_params = eta

print("Best params: {}, MERROR: {}".format(best_params, min_merror))

CV with eta=0.5
CPU times: user 29.9 s, sys: 140 ms, total: 30.1 s
Wall time: 7.6 s
	MERROR 0.17837660000000002 for 24 rounds

CV with eta=0.4
CPU times: user 49.6 s, sys: 212 ms, total: 49.9 s
Wall time: 12.5 s
	MERROR 0.15702 for 62 rounds

CV with eta=0.3
CPU times: user 44.2 s, sys: 176 ms, total: 44.4 s
Wall time: 11.1 s
	MERROR 0.14952200000000002 for 36 rounds

CV with eta=0.2
CPU times: user 43.4 s, sys: 176 ms, total: 43.6 s
Wall time: 10.9 s
	MERROR 0.1634486 for 24 rounds

CV with eta=0.1
CPU times: user 1min 4s, sys: 204 ms, total: 1min 4s
Wall time: 16.1 s
	MERROR 0.15808379999999997 for 36 rounds

Best params: 0.3, MERROR: 0.14952200000000002


In [26]:
params["eta"] = 0.3

final dictionary of parameters

In [27]:
params

{'max_depth': 7,
 'min_child_weight': 3,
 'eta': 0.3,
 'subsample': 0.6,
 'colsample_bytree': 0.2,
 'objective': 'multi:softmax',
 'eval_metric': 'merror',
 'num_class': 8,
 'silent': 1}

Now lets use these parameters to train a model and evaluate.

In [28]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-merror:0.317308
Will train until Test-merror hasn't improved in 10 rounds.
[1]	Test-merror:0.25
[2]	Test-merror:0.182692
[3]	Test-merror:0.192308
[4]	Test-merror:0.153846
[5]	Test-merror:0.134615
[6]	Test-merror:0.125
[7]	Test-merror:0.125
[8]	Test-merror:0.125
[9]	Test-merror:0.115385
[10]	Test-merror:0.105769
[11]	Test-merror:0.115385
[12]	Test-merror:0.105769
[13]	Test-merror:0.105769
[14]	Test-merror:0.115385
[15]	Test-merror:0.125
[16]	Test-merror:0.105769
[17]	Test-merror:0.096154
[18]	Test-merror:0.115385
[19]	Test-merror:0.096154
[20]	Test-merror:0.096154
[21]	Test-merror:0.086538
[22]	Test-merror:0.086538
[23]	Test-merror:0.096154
[24]	Test-merror:0.105769
[25]	Test-merror:0.096154
[26]	Test-merror:0.096154
[27]	Test-merror:0.086538
[28]	Test-merror:0.086538
[29]	Test-merror:0.096154
[30]	Test-merror:0.086538
[31]	Test-merror:0.086538
Stopping. Best iteration:
[21]	Test-merror:0.086538



In [30]:
print("Best MERROR: {:.3f} in {} rounds".format(model.best_score, model.best_iteration+1))

Best MERROR: 0.087 in 22 rounds


Out Merror has improved marginally from the original 0.09 on the validation set to 0.087b on the validation set after parameter
tuning,
Saving the model:
Although we found the best number of rounds, our model has been trained with more rounds than optimal, thus before using it for predictions, we should retrain it with the good number of rounds. Since we now the exact best num_boost_round, we don't need the early_stopping_round anymore

In [31]:

params
    

{'max_depth': 7,
 'min_child_weight': 3,
 'eta': 0.3,
 'subsample': 0.6,
 'colsample_bytree': 0.2,
 'objective': 'multi:softmax',
 'eval_metric': 'merror',
 'num_class': 8,
 'silent': 1}

In [32]:
num_boost_round = model.best_iteration + 1

best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)

[0]	Test-merror:0.317308
[1]	Test-merror:0.25
[2]	Test-merror:0.182692
[3]	Test-merror:0.192308
[4]	Test-merror:0.153846
[5]	Test-merror:0.134615
[6]	Test-merror:0.125
[7]	Test-merror:0.125
[8]	Test-merror:0.125
[9]	Test-merror:0.115385
[10]	Test-merror:0.105769
[11]	Test-merror:0.115385
[12]	Test-merror:0.105769
[13]	Test-merror:0.105769
[14]	Test-merror:0.115385
[15]	Test-merror:0.125
[16]	Test-merror:0.105769
[17]	Test-merror:0.096154
[18]	Test-merror:0.115385
[19]	Test-merror:0.096154
[20]	Test-merror:0.096154
[21]	Test-merror:0.086538


Look at evaluation metrics with the scikit-learn function on the validation data. 

In [33]:
y_pred=best_model.predict(dtest)


In [35]:
metrics.precision_recall_fscore_support(y_pred, y_test)

(array([0.9       , 0.92307692, 1.        , 0.55555556, 0.91666667,
        0.81818182, 1.        , 1.        ]),
 array([1.        , 0.85714286, 0.9375    , 1.        , 0.91666667,
        0.64285714, 1.        , 1.        ]),
 array([0.94736842, 0.88888889, 0.96774194, 0.71428571, 0.91666667,
        0.72      , 1.        , 1.        ]),
 array([ 9, 14, 16,  5, 12, 14, 15, 19]))

In [36]:
 conf_matrix = pd.crosstab(y_test, y_pred, rownames=['True'], colnames= ['Predicted'], margins=False)

In [37]:
conf_matrix

Predicted,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,9,1,0,0,0,0,0,0
1,0,12,0,0,0,1,0,0
2,0,0,15,0,0,0,0,0
3,0,0,0,5,1,3,0,0
4,0,0,0,0,11,1,0,0
5,0,1,1,0,0,9,0,0
6,0,0,0,0,0,0,15,0
7,0,0,0,0,0,0,0,19


To Save Model: 

In [38]:
best_model.save_model(model_path)

Load Model

In [None]:
#loaded_model = xgb.Booster()
#loaded_model.load_model(model_path)