## Imports

In [76]:
import os
import sys

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold, train_test_split

dir_parts = os.getcwd().split(os.path.sep)
root_index = dir_parts.index('ML-B')
root_path = os.path.sep.join(dir_parts[:root_index + 1])
sys.path.append(root_path + '/code/')
from data.data_config import Dataset
from data.data_utils import load_monk, load_cup, store_monk_result, store_cup_result
from hyperparameter_tuning import grid_search_top_configs
from training.metrics import mean_euclidean_error

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Random Forest
In this notebook we test a **RandomForestClassifier** and a **RandomForestRegressor** w.r.t. the tasks at hand, i.e. the three MONK's problems and the CUP dataset respectively.

In [77]:
MODEL_NAME = 'RandomForest'

VAL_SPLIT = 0.2

RANDOM_STATE = 128

## Path

In [78]:
# Directories
results_dir = root_path + '/results/' + MODEL_NAME

# Filepaths (MONK)
m1_dev_path, m1_test_path = Dataset.MONK_1.dev_path, Dataset.MONK_1.test_path # MONK 1
m2_dev_path, m2_test_path = Dataset.MONK_2.dev_path, Dataset.MONK_2.test_path # MONK 2
m3_dev_path, m3_test_path = Dataset.MONK_3.dev_path, Dataset.MONK_3.test_path # MONK 3

# Filepaths (CUP)
cup_dev_path, cup_test_path = Dataset.CUP.dev_path, Dataset.CUP.test_path

# MONK-1

In [79]:
# Load MONK-1
x_dev_m1, y_dev_m1, x_test_m1, y_test_m1 = load_monk(m1_dev_path, m1_test_path)

In [80]:
train_test_split(x_dev_m1, y_dev_m1, test_size=VAL_SPLIT, random_state=RANDOM_STATE)

[array([[ True, False, False, ..., False, False,  True],
        [ True, False, False, ..., False, False,  True],
        [False, False,  True, ..., False, False,  True],
        ...,
        [False, False,  True, ..., False,  True, False],
        [False,  True, False, ..., False,  True, False],
        [False,  True, False, ..., False, False,  True]]),
 array([[ True, False, False, False,  True, False,  True, False,  True,
         False, False, False, False,  True, False,  True, False],
        [False, False,  True, False, False,  True, False,  True, False,
         False,  True,  True, False, False, False, False,  True],
        [False,  True, False, False, False,  True,  True, False, False,
          True, False, False, False,  True, False,  True, False],
        [False, False,  True,  True, False, False, False,  True, False,
         False,  True, False,  True, False, False, False,  True],
        [ True, False, False, False,  True, False,  True, False, False,
         False,  Tr

First of all, we define the grid search spaces for the RandomForestClassifier that we're going to use for all three MONK's problems.

## Grid search
Grid search is a simple hyper-parameter tuning techniques useful for finding the best configuration for a specific ML model. It involves:
- defining a grid of hparams values
- systematically test all possible combinations

Other approaches include Random search or Bayesian optimization.

In [81]:
# hparams grid for grid search
rfc_hparams_spaces = {
    'n_estimators': [25, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt'],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False],
}

In [82]:
# perform grid search with KFold
grid_search_m1 = GridSearchCV(
    RandomForestClassifier(random_state=128),
    param_grid=rfc_hparams_spaces,
    cv=StratifiedKFold(n_splits=2, shuffle=True, random_state=128),
    scoring='accuracy',
    verbose=1
)

grid_search_m1.fit(x_dev_m1, y_dev_m1)

Fitting 2 folds for each of 324 candidates, totalling 648 fits


In [83]:
best_configs_m1 = grid_search_top_configs(grid_search_m1.cv_results_) # top k config

Model rank 1 - Config: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Mean score 0.8790 - Std score: 0.0081

Model rank 2 - Config: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Mean score 0.8790 - Std score: 0.0081

Model rank 3 - Config: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Mean score 0.8790 - Std score: 0.0081

Model rank 4 - Config: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Mean score 0.8790 - Std score: 0.0081

Model rank 5 - Config: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_spli

In [84]:
# Create a RandomForest with the best hparams
rfc = RandomForestClassifier(**grid_search_m1.best_params_)

# Train the model
rfc.fit(x_dev_m1, y_dev_m1)

In [85]:
print('-- DEVELOPMENT --')
print(classification_report(y_dev_m1, rfc.predict(x_dev_m1)))

-- DEVELOPMENT --
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        62
           1       1.00      1.00      1.00        62

    accuracy                           1.00       124
   macro avg       1.00      1.00      1.00       124
weighted avg       1.00      1.00      1.00       124



In [86]:
print('-- TEST --')
report_test_m1 = classification_report(y_test_m1, rfc.predict(x_test_m1))
print(report_test_m1)

-- TEST --
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       216
           1       0.98      0.94      0.96       216

    accuracy                           0.96       432
   macro avg       0.96      0.96      0.96       432
weighted avg       0.96      0.96      0.96       432



## Store results

In [87]:
store_monk_result(results_dir + '/MONK1/', best_configs_m1, report_test_m1)

# MONK-2

In [88]:
# Load MONK-2
x_dev_m2, y_dev_m2, x_test_m2, y_test_m2 = load_monk(m2_dev_path, m2_test_path)

In [89]:
train_test_split(x_dev_m2, y_dev_m2, test_size=VAL_SPLIT, random_state=RANDOM_STATE)

[array([[False, False,  True, ..., False,  True, False],
        [False,  True, False, ..., False, False,  True],
        [False, False,  True, ..., False,  True, False],
        ...,
        [False, False,  True, ...,  True, False,  True],
        [False,  True, False, ..., False,  True, False],
        [False,  True, False, ..., False, False,  True]]),
 array([[ True, False, False, False,  True, False,  True, False, False,
          True, False, False, False, False,  True,  True, False],
        [False, False,  True,  True, False, False,  True, False, False,
          True, False, False, False, False,  True,  True, False],
        [False, False,  True,  True, False, False, False,  True,  True,
         False, False, False,  True, False, False, False,  True],
        [ True, False, False,  True, False, False,  True, False, False,
         False,  True, False, False, False,  True,  True, False],
        [False,  True, False, False,  True, False, False,  True,  True,
         False, Fal

### grid search, monk 2

In [90]:
# perform grid search with KFold
grid_search_m2 = GridSearchCV(
    RandomForestClassifier(random_state=128),
    param_grid=rfc_hparams_spaces,
    cv=StratifiedKFold(n_splits=2, shuffle=True, random_state=128),
    scoring='accuracy',
    verbose=1
)

grid_search_m2.fit(x_dev_m2, y_dev_m2)

Fitting 2 folds for each of 324 candidates, totalling 648 fits


In [91]:
best_configs_m2 = grid_search_top_configs(grid_search_m2.cv_results_) # top k config

Model rank 1 - Config: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Mean score 0.6274 - Std score: 0.0274

Model rank 2 - Config: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 25}
Mean score 0.6274 - Std score: 0.0274

Model rank 3 - Config: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Mean score 0.6274 - Std score: 0.0274

Model rank 4 - Config: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 25}
Mean score 0.6274 - Std score: 0.0274

Model rank 5 - Config: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_s

In [92]:
# Create a RandomForest with the best hparams
rfc = RandomForestClassifier(**grid_search_m2.best_params_)

# Train the model
rfc.fit(x_dev_m2, y_dev_m2)

In [93]:
print('-- DEVELOPMENT --')
print(classification_report(y_dev_m2, rfc.predict(x_dev_m2)))

-- DEVELOPMENT --
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       105
           1       1.00      0.97      0.98        64

    accuracy                           0.99       169
   macro avg       0.99      0.98      0.99       169
weighted avg       0.99      0.99      0.99       169



In [94]:
print('-- TEST --')
report_test_m2 = classification_report(y_test_m1, rfc.predict(x_test_m2))
print(report_test_m2)

-- TEST --
              precision    recall  f1-score   support

           0       0.48      0.64      0.55       216
           1       0.45      0.30      0.36       216

    accuracy                           0.47       432
   macro avg       0.46      0.47      0.45       432
weighted avg       0.46      0.47      0.45       432



In [95]:
# RISULTATI MONK2

store_monk_result(results_dir + '/MONK2/', best_configs_m2, report_test_m2)

# MONK-3

In [96]:
# Load MONK-3
x_dev_m3, y_dev_m3, x_test_m3, y_test_m3 = load_monk(m3_dev_path, m3_test_path)

In [97]:
train_test_split(x_dev_m3, y_dev_m3, test_size=VAL_SPLIT, random_state=RANDOM_STATE)

[array([[False, False,  True, ..., False,  True, False],
        [ True, False, False, ..., False,  True, False],
        [False, False,  True, ...,  True,  True, False],
        ...,
        [False, False,  True, ..., False, False,  True],
        [False,  True, False, ..., False,  True, False],
        [False,  True, False, ...,  True,  True, False]]),
 array([[ True, False, False, False,  True, False, False,  True, False,
          True, False, False,  True, False, False, False,  True],
        [False, False,  True, False, False,  True,  True, False,  True,
         False, False, False, False, False,  True,  True, False],
        [False, False,  True, False, False,  True, False,  True,  True,
         False, False,  True, False, False, False,  True, False],
        [ True, False, False, False,  True, False, False,  True,  True,
         False, False, False, False, False,  True, False,  True],
        [ True, False, False, False,  True, False,  True, False, False,
          True, Fal

In [98]:

# perform grid search with KFold
grid_search_m3 = GridSearchCV(
    RandomForestClassifier(random_state=128),
    param_grid=rfc_hparams_spaces,
    cv=StratifiedKFold(n_splits=2, shuffle=True, random_state=128),
    scoring='accuracy',
    verbose=1
)

grid_search_m3.fit(x_dev_m3, y_dev_m3)

Fitting 2 folds for each of 324 candidates, totalling 648 fits


In [99]:
best_configs_m3 = grid_search_top_configs(grid_search_m3.cv_results_) # top k config

Model rank 1 - Config: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Mean score 0.9344 - Std score: 0.0492

Model rank 2 - Config: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 25}
Mean score 0.9344 - Std score: 0.0492

Model rank 3 - Config: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Mean score 0.9344 - Std score: 0.0492

Model rank 4 - Config: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
Mean score 0.9344 - Std score: 0.0492

Model rank 5 - Config: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_sp

In [100]:
# Create a RandomForest with the best hparams
rfc = RandomForestClassifier(**grid_search_m3.best_params_)

# Train the model
rfc.fit(x_dev_m3, y_dev_m3)

In [101]:
print('-- DEVELOPMENT --')
print(classification_report(y_dev_m3, rfc.predict(x_dev_m3)))

-- DEVELOPMENT --
              precision    recall  f1-score   support

           0       0.95      0.92      0.93        62
           1       0.92      0.95      0.93        60

    accuracy                           0.93       122
   macro avg       0.93      0.93      0.93       122
weighted avg       0.93      0.93      0.93       122



In [102]:
print('-- TEST --')
report_test_m3 = classification_report(y_test_m3, rfc.predict(x_test_m3))
print(report_test_m3)

-- TEST --
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       204
           1       1.00      0.95      0.97       228

    accuracy                           0.97       432
   macro avg       0.97      0.97      0.97       432
weighted avg       0.97      0.97      0.97       432



In [103]:
# RISULTATI MONK3

store_monk_result(results_dir + '/MONK3/', best_configs_m3, report_test_m3)

# CUP

In [104]:
# Load CUP
x_dev_cup, y_dev_cup, x_test_cup = load_cup(cup_dev_path, cup_test_path)

Here, we define the grid search spaces for the RandomForestRegression for the CUP dataset.

In [105]:
rfr_hparams_spaces = {
    'n_estimators': [25, 50, 100],
    'max_depth': [5, 10, 20, 40],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt'],
    'criterion': ['squared_error'],
    'bootstrap': [True, False],
}

## Grid search

In [106]:
train_test_split(x_dev_cup, y_dev_cup, test_size=VAL_SPLIT, random_state=RANDOM_STATE)

[array([[ 0.61149246,  0.9951097 , -0.9565687 , ...,  0.99912935,
         -0.6267533 , -0.9778796 ],
        [-0.9590803 ,  0.7285206 , -0.9971069 , ...,  0.8586828 ,
         -0.8859362 ,  0.7572728 ],
        [-0.786939  ,  0.963124  , -0.99197435, ...,  0.9884821 ,
         -0.89324707, -0.45198295],
        ...,
        [-0.8153907 ,  0.24062237, -0.9951211 , ...,  0.9472486 ,
         -0.86275923, -0.10523301],
        [ 0.9704041 , -0.9608475 ,  0.95187455, ...,  0.7523884 ,
          0.72169656, -0.9888743 ],
        [ 0.9919152 ,  0.7415747 ,  0.78942555, ...,  0.9980771 ,
         -0.9616746 , -0.9735993 ]]),
 array([[-0.5298587 ,  0.46753138, -0.99022156, ...,  0.9769632 ,
         -0.9473218 , -0.16472916],
        [-0.99163544,  0.9788987 , -0.99908805, ...,  0.16020666,
         -0.00505242,  0.9638261 ],
        [-0.798371  , -0.90621626, -0.964229  , ..., -0.10577794,
         -0.7499545 ,  0.7000535 ],
        ...,
        [-0.90590656,  0.3944241 , -0.99603516, ...,  

In [107]:

# perform grid search with KFold
grid_search_cup = GridSearchCV(
    RandomForestRegressor(random_state=128),
    param_grid=rfr_hparams_spaces,
    cv=KFold(n_splits=5, shuffle=True, random_state=128),
    scoring=make_scorer(mean_euclidean_error, greater_is_better=False),
    verbose=1
)

grid_search_cup.fit(x_dev_cup, y_dev_cup)


Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [108]:
best_configs_cup = grid_search_top_configs(grid_search_cup.cv_results_) # top k config

Model rank 1 - Config: {'bootstrap': False, 'criterion': 'squared_error', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 100}
Mean score -2.2206 - Std score: 0.1443

Model rank 2 - Config: {'bootstrap': False, 'criterion': 'squared_error', 'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 100}
Mean score -2.2206 - Std score: 0.1443

Model rank 3 - Config: {'bootstrap': False, 'criterion': 'squared_error', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 50}
Mean score -2.2753 - Std score: 0.1805

Model rank 4 - Config: {'bootstrap': False, 'criterion': 'squared_error', 'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 50}
Mean score -2.2753 - Std score: 0.1805

Model rank 5 - Config: {'bootstrap': False, 'criterion': 'squared_error', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 100}
Mean score -2.4006 - Std score: 0.1223



In [109]:
# Create a RandomForest with the best hparams
rfr = RandomForestRegressor(**grid_search_cup.best_params_)

# Train the model
rfr.fit(x_dev_cup, y_dev_cup)

In [110]:
print('-- DEVELOPMENT --')
dev_mee_cup = mean_euclidean_error(y_dev_cup, rfr.predict(x_dev_cup))
print(dev_mee_cup)

-- DEVELOPMENT --
5.515234493593812e-14


In [111]:
# Blind test set predictions
test_preds_cup = rfr.predict(x_test_cup)

## Store Result

In [112]:
store_cup_result(results_dir + '/CUP/', best_configs_cup, dev_mee_cup, test_preds_cup)