In [1]:
# organize imports
from __future__ import print_function

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import numpy as np
import h5py
import os
import json
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import datetime
import time
import xgboost as xgb

In [2]:
# load the user configs
with open('room_recognition/conf/conf_xgboost.json') as f:    
  config = json.load(f)

# config variables

seed      = config["seed"]
train_features_path   = config["train_features_path"]
train_labels_path   = config["train_labels_path"]
test_features_path   = config["test_features_path"]
test_labels_path   = config["test_labels_path"]

results     = config["results"]
classifier_path = config["classifier_path"]

num_classes   = config["num_classes"]

model_path=config["model_path"]

In [3]:
# import train features and labels
h5f_data_train  = h5py.File(train_features_path, 'r')
h5f_label_train = h5py.File(train_labels_path, 'r')

train_features_string = h5f_data_train['dataset_1']
train_labels_string   = h5f_label_train['dataset_1']

trainfeatures = np.array(train_features_string)
trainlabels   = np.array(train_labels_string)

h5f_data_train.close()
h5f_label_train.close()

In [4]:
# verify the shape of features and labels
print ("[INFO] features shape: {}".format(trainfeatures.shape))
print ("[INFO] labels shape: {}".format(trainlabels.shape))



[INFO] features shape: (1040, 2048)
[INFO] labels shape: (1040,)


In [5]:
h5f_data_test  = h5py.File(test_features_path, 'r')
h5f_label_test = h5py.File(test_labels_path, 'r')

test_features_string = h5f_data_test['dataset_1']
test_labels_string   = h5f_label_test['dataset_1']

testfeatures = np.array(test_features_string)
testlabels   = np.array(test_labels_string)

h5f_data_test.close()
h5f_label_test.close()

In [6]:
# verify the shape of features and labels
print ("[INFO] features shape: {}".format(testfeatures.shape))
print ("[INFO] labels shape: {}".format(testlabels.shape))


[INFO] features shape: (160, 2048)
[INFO] labels shape: (160,)


In [7]:
dtrain = xgb.DMatrix(trainfeatures, label=trainlabels) 
dtest = xgb.DMatrix(testfeatures, label=testlabels)

In [8]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective': 'multi:softmax'
}

Parameters num_boost_round and early_stopping_rounds
From https://cambridgespark.com/content/tutorials/hyperparameter-tuning-in-xgboost/index.html
Since trees are built sequentially, instead of fixing the number of rounds at the beginning, we can test our model at each step and see if adding a new tree/round improves performance.

To do so, we define a test dataset and a metric that is used to assess performance at each round. If performance haven't improved for N rounds (N is defined by the variable early_stopping_round), we stop the training and keep the best number of boosting rounds. 

In [9]:
params['eval_metric'] = "merror"

In [10]:
params['num_class'] = 8
params['silent'] = 1

In [11]:
num_boost_round = 999

In [12]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-merror:0.29375
Will train until Test-merror hasn't improved in 10 rounds.
[1]	Test-merror:0.25
[2]	Test-merror:0.25625
[3]	Test-merror:0.23125
[4]	Test-merror:0.21875
[5]	Test-merror:0.1875
[6]	Test-merror:0.18125
[7]	Test-merror:0.1875
[8]	Test-merror:0.18125
[9]	Test-merror:0.1875
[10]	Test-merror:0.175
[11]	Test-merror:0.175
[12]	Test-merror:0.1625
[13]	Test-merror:0.1625
[14]	Test-merror:0.15625
[15]	Test-merror:0.15625
[16]	Test-merror:0.15625
[17]	Test-merror:0.15625
[18]	Test-merror:0.15625
[19]	Test-merror:0.15
[20]	Test-merror:0.15
[21]	Test-merror:0.15
[22]	Test-merror:0.15
[23]	Test-merror:0.15
[24]	Test-merror:0.14375
[25]	Test-merror:0.15
[26]	Test-merror:0.14375
[27]	Test-merror:0.14375
[28]	Test-merror:0.14375
[29]	Test-merror:0.14375
[30]	Test-merror:0.1375
[31]	Test-merror:0.14375
[32]	Test-merror:0.14375
[33]	Test-merror:0.1375
[34]	Test-merror:0.1375
[35]	Test-merror:0.1375
[36]	Test-merror:0.13125
[37]	Test-merror:0.13125
[38]	Test-merror:0.13125
[39]	Test-

In [13]:
print("Best MERROR: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

Best MERROR: 0.13 with 37 rounds


In [14]:
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=seed,
    nfold=5,
    metrics={'merror'},
    early_stopping_rounds=10
)

cv_results

Unnamed: 0,test-merror-mean,test-merror-std,train-merror-mean,train-merror-std
0,0.301923,0.036665,0.026923,0.007624
1,0.254808,0.025258,0.006971,0.001923
2,0.243269,0.020757,0.002404,0.001317
3,0.222115,0.015916,0.001442,0.000899
4,0.218269,0.022876,0.000721,0.000589
5,0.218269,0.022261,0.000481,0.000589
6,0.214423,0.022052,0.0,0.0
7,0.203846,0.022468,0.0,0.0
8,0.200962,0.022219,0.0,0.0
9,0.202885,0.024963,0.0,0.0


In [15]:
cv_results['test-merror-mean'].min()


0.168269

Parameters max_depth and min_child_weight:
Those parameters add constraints on the architecture of the trees.

max_depth is the maximum number of nodes allowed from the root to the farthest leaf of a tree. Deeper trees can model more complex relationships by adding more nodes, but as we go deeper, splits become less relevant and are sometimes only due to noise, causing the model to overfit.
min_child_weight is the minimum weight (or number of samples if all samples have a weight of 1) required in order to create a new node in the tree. A smaller min_child_weight allows the algorithm to create children that correspond to fewer samples, thus allowing for more complex trees, but again, more likely to overfit.
Thus, those parameters can be used to control the complexity of the trees. It is important to tune them together in order to find a good trade-off between model bias and variance

In [16]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

comment:

float("Inf") It acts as an unbounded upper value for comparison. This is useful for finding lowest values for something. 

In [17]:
# Define initial best params and MAE
min_merror = float("Inf")  
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))

    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=seed,
        nfold=5,
        metrics={'merror'},
        early_stopping_rounds=10
    )

    # Update best MAE
    mean_merror = cv_results['test-merror-mean'].min()
    boost_rounds = cv_results['test-merror-mean'].idxmin()
    print("\tMERROR {} for {} rounds".format(mean_merror, boost_rounds))
    if mean_merror < min_merror:
        min_merror = mean_merror
        best_params = (max_depth,min_child_weight)

print("Best params: {}, {}, MERROR: {}".format(best_params[0], best_params[1], min_merror))

CV with max_depth=9, min_child_weight=5
	MERROR 0.1586538 for 32 rounds
CV with max_depth=9, min_child_weight=6
	MERROR 0.15961540000000002 for 42 rounds
CV with max_depth=9, min_child_weight=7
	MERROR 0.160577 for 47 rounds
CV with max_depth=10, min_child_weight=5
	MERROR 0.1586538 for 32 rounds
CV with max_depth=10, min_child_weight=6
	MERROR 0.15961540000000002 for 42 rounds
CV with max_depth=10, min_child_weight=7
	MERROR 0.160577 for 47 rounds
CV with max_depth=11, min_child_weight=5
	MERROR 0.1586538 for 32 rounds
CV with max_depth=11, min_child_weight=6
	MERROR 0.15961540000000002 for 42 rounds
CV with max_depth=11, min_child_weight=7
	MERROR 0.160577 for 47 rounds
Best params: 9, 5, MERROR: 0.1586538


#### Best score with a max_depth of 9 and min_child_weight of 5 ,  MERROR=0.158

In [18]:
params['max_depth'] = 9
params['min_child_weight'] = 5

Parameters subsample and colsample_bytree
Those parameters control the sampling of the dataset that is done at each boosting round.

Instead of using the whole training set every time, we can build a tree on slightly different data at each step, which makes it less likely to overfit to a single sample or feature.

subsample corresponds to the fraction of observations (the rows) to subsample at each step. By default it is set to 1 meaning that we use all rows.
colsample_bytree corresponds to the fraction of features (the columns) to use. By default it is set to 1 meaning that we will use all features.

In [19]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [20]:
min_merror = float("Inf")  
best_params = None

# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))

    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample

    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=seed,
        nfold=5,
        metrics={'merror'},
        early_stopping_rounds=10
    )
    # Update best MERROR
    mean_merror = cv_results['test-merror-mean'].min()
    boost_rounds = cv_results['test-merror-mean'].idxmin()
    print("\tMERROR {} for {} rounds".format(mean_merror, boost_rounds))
    if mean_merror < min_merror:
        min_merror = mean_merror
        best_params = (subsample,colsample)

print("Best params: {}, {}, MERROR: {}".format(best_params[0], best_params[1], min_merror))

CV with subsample=1.0, colsample=1.0
	MERROR 0.1586538 for 32 rounds
CV with subsample=1.0, colsample=0.9
	MERROR 0.1567306 for 30 rounds
CV with subsample=1.0, colsample=0.8
	MERROR 0.1625 for 22 rounds
CV with subsample=1.0, colsample=0.7
	MERROR 0.14807679999999998 for 51 rounds
CV with subsample=0.9, colsample=1.0
	MERROR 0.1615384 for 17 rounds
CV with subsample=0.9, colsample=0.9
	MERROR 0.1740384 for 27 rounds
CV with subsample=0.9, colsample=0.8
	MERROR 0.1596152 for 26 rounds
CV with subsample=0.9, colsample=0.7
	MERROR 0.1634614 for 56 rounds
CV with subsample=0.8, colsample=1.0
	MERROR 0.1471156 for 27 rounds
CV with subsample=0.8, colsample=0.9
	MERROR 0.1557694 for 49 rounds
CV with subsample=0.8, colsample=0.8
	MERROR 0.1567308 for 19 rounds
CV with subsample=0.8, colsample=0.7
	MERROR 0.1490384 for 25 rounds
CV with subsample=0.7, colsample=1.0
	MERROR 0.1624998 for 18 rounds
CV with subsample=0.7, colsample=0.9
	MERROR 0.1509616 for 39 rounds
CV with subsample=0.7, cols

In [21]:
print("Best params: {}, {}, MERROR: {}".format(best_params[0], best_params[1], min_merror))

Best params: 0.8, 1.0, MERROR: 0.1471156


In [22]:
params['subsample'] = .8
params['colsample_bytree'] = 1.0

Parameter ETA
The ETA parameter controls the learning rate. It corresponds to the shrinkage of the weights associated to features after each round, in other words it defines the amount of "correction" we make at each step.
In practice, having a lower eta makes our model more robust to overfitting thus, usually, the lower the learning rate, the best. But with a lower eta, we need more boosting rounds, which takes more time to train, sometimes for only marginal improvements. Let's try a couple of values here, and time them with the notebook command:

In [23]:
min_merror = float("Inf")  
best_params = None

for eta in [.5, .4 ,.3, .2, .1]:
    print("CV with eta={}".format(eta))

    # We update our parameters
    params['eta'] = eta

    # Run and time CV
    
    %time cv_results = xgb.cv(params, dtrain, num_boost_round=num_boost_round, seed=seed, nfold=5, metrics=['merror'], early_stopping_rounds=10)
    
    

    # Update best score
    mean_merror = cv_results['test-merror-mean'].min()
    boost_rounds = cv_results['test-merror-mean'].idxmin()
    print("\tMERROR {} for {} rounds\n".format(mean_merror, boost_rounds))
    if mean_merror < min_merror:
        min_merror = mean_merror
        best_params = eta

print("Best params: {}, MERROR: {}".format(best_params, min_merror))

CV with eta=0.5
CPU times: user 1min 53s, sys: 156 ms, total: 1min 53s
Wall time: 28.4 s
	MERROR 0.153846 for 22 rounds

CV with eta=0.4
CPU times: user 2min 40s, sys: 200 ms, total: 2min 40s
Wall time: 40.3 s
	MERROR 0.15000000000000002 for 45 rounds

CV with eta=0.3
CPU times: user 2min 43s, sys: 224 ms, total: 2min 43s
Wall time: 40.9 s
	MERROR 0.1471156 for 27 rounds

CV with eta=0.2
CPU times: user 4min 15s, sys: 224 ms, total: 4min 15s
Wall time: 1min 4s
	MERROR 0.1509616 for 53 rounds

CV with eta=0.1
CPU times: user 6min 34s, sys: 352 ms, total: 6min 34s
Wall time: 1min 38s
	MERROR 0.1442308 for 60 rounds

Best params: 0.1, MERROR: 0.1442308


Note the best eta of 0.1 takes 1 min 38 s for Merror of 0.144 where as with eta=0.3 we have 40.9s and merror of 0.147. SO we will choose eta=0.3

In [24]:
params["eta"] = 0.3

final dictionary of parameters

In [25]:
params

{'max_depth': 9,
 'min_child_weight': 5,
 'eta': 0.3,
 'subsample': 0.8,
 'colsample_bytree': 1.0,
 'objective': 'multi:softmax',
 'eval_metric': 'merror',
 'num_class': 8,
 'silent': 1}

In [26]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-merror:0.3
Will train until Test-merror hasn't improved in 10 rounds.
[1]	Test-merror:0.21875
[2]	Test-merror:0.21875
[3]	Test-merror:0.21875
[4]	Test-merror:0.21875
[5]	Test-merror:0.2125
[6]	Test-merror:0.2
[7]	Test-merror:0.18125
[8]	Test-merror:0.1875
[9]	Test-merror:0.18125
[10]	Test-merror:0.175
[11]	Test-merror:0.175
[12]	Test-merror:0.175
[13]	Test-merror:0.175
[14]	Test-merror:0.16875
[15]	Test-merror:0.1625
[16]	Test-merror:0.15
[17]	Test-merror:0.15625
[18]	Test-merror:0.15625
[19]	Test-merror:0.15625
[20]	Test-merror:0.1625
[21]	Test-merror:0.15625
[22]	Test-merror:0.15625
[23]	Test-merror:0.15625
[24]	Test-merror:0.15625
[25]	Test-merror:0.15625
[26]	Test-merror:0.15625
Stopping. Best iteration:
[16]	Test-merror:0.15



In [27]:
print("Best MERROR: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))

Best MERROR: 0.15 in 17 rounds


Saving the model:
Although we found the best number of rounds, our model has been trained with more rounds than optimal, thus before using it for predictions, we should retrain it with the good number of rounds. Since we now the exact best num_boost_round, we don't need the early_stopping_round anymore

In [28]:

params
    

{'max_depth': 9,
 'min_child_weight': 5,
 'eta': 0.3,
 'subsample': 0.8,
 'colsample_bytree': 1.0,
 'objective': 'multi:softmax',
 'eval_metric': 'merror',
 'num_class': 8,
 'silent': 1}

In [29]:
num_boost_round = model.best_iteration + 1

best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)

[0]	Test-merror:0.3
[1]	Test-merror:0.21875
[2]	Test-merror:0.21875
[3]	Test-merror:0.21875
[4]	Test-merror:0.21875
[5]	Test-merror:0.2125
[6]	Test-merror:0.2
[7]	Test-merror:0.18125
[8]	Test-merror:0.1875
[9]	Test-merror:0.18125
[10]	Test-merror:0.175
[11]	Test-merror:0.175
[12]	Test-merror:0.175
[13]	Test-merror:0.175
[14]	Test-merror:0.16875
[15]	Test-merror:0.1625
[16]	Test-merror:0.15


use the test dataset and compute evaluation metrics with the scikit-learn function.

In [30]:
y_pred=best_model.predict(dtest)


In [31]:
metrics.precision_recall_fscore_support(y_pred, testlabels)

(array([0.8 , 0.8 , 0.95, 0.95, 0.7 , 0.65, 1.  , 0.95]),
 array([0.88888889, 0.76190476, 0.95      , 0.73076923, 0.875     ,
        0.72222222, 0.95238095, 0.95      ]),
 array([0.84210526, 0.7804878 , 0.95      , 0.82608696, 0.77777778,
        0.68421053, 0.97560976, 0.95      ]),
 array([18, 21, 20, 26, 16, 18, 21, 20]))

In [32]:
 conf_matrix = pd.crosstab(testlabels, y_pred, rownames=['True'], colnames= ['Predicted'], margins=False)

In [33]:
conf_matrix

Predicted,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,16,2,0,0,1,1,0,0
1,1,16,0,0,0,3,0,0
2,0,0,19,1,0,0,0,0
3,0,0,0,19,0,0,1,0
4,1,0,0,3,14,1,0,1
5,0,3,0,3,1,13,0,0
6,0,0,0,0,0,0,20,0
7,0,0,1,0,0,0,0,19


To Save Model: 

In [34]:
best_model.save_model(model_path)

Load Model

In [None]:
#loaded_model = xgb.Booster()
#loaded_model.load_model(model_path)