## Imports

In [2]:
import os
import sys

import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold, KFold

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.initializers import HeNormal, GlorotUniform
from keras.optimizers import SGD, Adam
from tensorflow.keras.regularizers import l2


dir_parts = os.getcwd().split(os.path.sep)
root_index = dir_parts.index('ML-B')
root_path = os.path.sep.join(dir_parts[:root_index + 1])
sys.path.append(root_path + '/code/')
from data.data_config import Dataset
from data.data_utils import load_monk, load_cup, store_monk_result, store_cup_result
from hyperparameter_tuning import grid_search_top_configs
from training.metrics import mean_euclidean_error
from training.solver import Solver

%load_ext autoreload
%autoreload 2

# NN
In this notebook we implement and test a custom (feed-forward) Neural Network w.r.t. the tasks at hand, i.e. the three MONK's problems and the CUP dataset.

Specifically:
- **get_nn_classifier(...)**: defines the NN classifier for the MONK's problems;
- **get_nn_regressor(...)**: defines the NN regressor for the CUP dataset.

## Settings

In [3]:
MODEL_NAME = 'NN'
VAL_SPLIT = 0.2 # fraction of the development set held out for validation
RANDOM_STATE = 128

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation and batch shuffling are repeatable across kernel restarts.
tf.keras.utils.set_random_seed(RANDOM_STATE)

## Model

In [4]:
# Search space: every key maps to the list of candidate values to try.
# get_nn_classifier reads a single scalar per key, so it must be given one
# concrete configuration rather than this dict of lists.
hparams = dict(
    lr=[0.1, 0.01],
    h_dim=[16, 32],
    activation=['relu', 'tanh'],
    reg=[0.1, 0.01],
)

def get_nn_classifier(hparams, input_dim=17):
    """Build and compile a one-hidden-layer binary classifier.

    Args:
        hparams: mapping holding one scalar value per key:
            'h_dim'      - number of units in the hidden layer,
            'activation' - activation of the hidden layer,
            'reg'        - L2 factor applied to the output layer's kernel,
            'lr'         - learning rate of the SGD optimizer.
        input_dim: number of input features. Defaults to 17, the width that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        A compiled Sequential model with a single sigmoid output unit,
        binary cross-entropy loss and the accuracy metric.

    Raises:
        KeyError: if one of the keys listed above is missing from `hparams`.
    """
    activation = hparams['activation']

    # He (Kaiming) initialisation for ReLU, Glorot (Xavier) for anything else.
    initializer = HeNormal() if activation == 'relu' else GlorotUniform()

    model = Sequential([
        # Explicit Input layer rather than passing `input_shape` to Dense.
        Input(shape=(input_dim,)),
        Dense(hparams['h_dim'], activation=activation, kernel_initializer=initializer),
        # NOTE(review): the L2 penalty is attached to the output layer only;
        # the hidden layer's kernel is not regularised — confirm this is intended.
        Dense(1, activation='sigmoid', kernel_regularizer=l2(hparams['reg']))
    ])

    optimizer = SGD(learning_rate=hparams['lr'])
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

## Path

In [5]:
# Directory that receives this model's results
results_dir = root_path + '/results/' + MODEL_NAME

# MONK file paths: one development/test pair per problem
m1_dev_path = Dataset.MONK_1.dev_path
m1_test_path = Dataset.MONK_1.test_path

m2_dev_path = Dataset.MONK_2.dev_path
m2_test_path = Dataset.MONK_2.test_path

m3_dev_path = Dataset.MONK_3.dev_path
m3_test_path = Dataset.MONK_3.test_path

# CUP file paths
cup_dev_path = Dataset.CUP.dev_path
cup_test_path = Dataset.CUP.test_path

# MONK-1

In [6]:
# Load MONK-1
# The result of load_monk is unpacked as development inputs/targets followed
# by test inputs/targets.
x_dev_m1, y_dev_m1, x_test_m1, y_test_m1 = load_monk(m1_dev_path, m1_test_path)

First of all, we define the grid search spaces for the RandomForestClassifier that we're going to use for all three MONK's problems.

In [None]:
# hparams grid for grid search
rfc_hparams_spaces = {
    'n_estimators': [25, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    # 'auto' was removed in scikit-learn 1.3; for classifiers it was an alias
    # of 'sqrt', so listing both only duplicated configurations. 'log2' is
    # used as the second, genuinely different option.
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False],
}

## Grid search
Grid search is a simple hyper-parameter tuning technique for finding the best configuration of a specific ML model. It involves:
- defining a grid of hparams values
- systematically testing all possible combinations

Other approaches include Random search or Bayesian optimization.

## Split Train-Val

In [7]:
# Hold out VAL_SPLIT of the MONK-1 development data for validation; the fixed
# random_state keeps the split identical between runs.
x_train_m1, x_val_m1, y_train_m1, y_val_m1 = train_test_split(
    x_dev_m1,
    y_dev_m1,
    test_size=VAL_SPLIT,
    random_state=RANDOM_STATE,
)

In [27]:
# One concrete MONK-1 configuration (scalar values, L2 factor set to 0)
hparams = dict(lr=0.1, h_dim=16, activation='relu', reg=0)

# Build the classifier and train it on the train split, validating on the
# held-out split.
model_m1 = get_nn_classifier(hparams)
solver = Solver(
    model_m1,
    x_train_m1, y_train_m1,
    x_val_m1, y_val_m1,
)
solver.train(
    epochs=20,
    patience=5,
    batch_size=32,
)


Epoch 1/15 - loss: 0.7956 - accuracy: 0.5657 - val_loss: 1.0366 - val_accuracy: 0.3200


Epoch 2/15 - loss: 0.7457 - accuracy: 0.5859 - val_loss: 0.9038 - val_accuracy: 0.3200


Epoch 3/15 - loss: 0.6932 - accuracy: 0.5960 - val_loss: 0.8525 - val_accuracy: 0.4000


Epoch 4/15 - loss: 0.6758 - accuracy: 0.6061 - val_loss: 0.7801 - val_accuracy: 0.4800


Epoch 5/15 - loss: 0.6515 - accuracy: 0.6061 - val_loss: 0.7216 - val_accuracy: 0.4400


Epoch 6/15 - loss: 0.6396 - accuracy: 0.6364 - val_loss: 0.7158 - val_accuracy: 0.4400


Epoch 7/15 - loss: 0.6305 - accuracy: 0.6465 - val_loss: 0.7228 - val_accuracy: 0.4400


Epoch 8/15 - loss: 0.6236 - accuracy: 0.6667 - val_loss: 0.7349 - val_accuracy: 0.4000


Epoch 9/15 - loss: 0.6195 - accuracy: 0.6465 - val_loss: 0.6803 - val_accuracy: 0.5600


Epoch 10/15 - loss: 0.6117 - accuracy: 0.6768 - val_loss: 0.7258 - val_accuracy: 0.4000


Epoch 11/15 - loss: 0.6059 - accuracy: 0.6768 - val_loss: 0.7435 - val_accuracy: 0.4000


Epoch 12/15 - loss

In [22]:
print('-- DEVELOPMENT --')
# Threshold the model outputs at 0.5 to obtain hard 0/1 labels
dev_scores_m1 = model_m1.predict(x_dev_m1)
y_dev_pred_m1 = (dev_scores_m1 > 0.5).astype(int)
print(classification_report(y_dev_m1, y_dev_pred_m1, digits=4))

-- DEVELOPMENT --
              precision    recall  f1-score   support

           0     0.5046    0.8871    0.6433        62
           1     0.5333    0.1290    0.2078        62

    accuracy                         0.5081       124
   macro avg     0.5190    0.5081    0.4255       124
weighted avg     0.5190    0.5081    0.4255       124



In [23]:
print('-- TEST --')
# Threshold the model outputs at 0.5 to obtain hard 0/1 labels.
y_test_pred_m1 = np.where(model_m1.predict(x_test_m1) > 0.5, 1, 0)
# Keep the report in a variable: the "Store results" cell passes
# `report_test_m1` to store_monk_result and no cell shown here defines it.
# NOTE(review): assumes store_monk_result accepts the text report — confirm.
report_test_m1 = classification_report(y_test_m1, y_test_pred_m1, digits=4)
print(report_test_m1)

-- TEST --
              precision    recall  f1-score   support

           0     0.5046    0.8871    0.6433        62
           1     0.5333    0.1290    0.2078        62

    accuracy                         0.5081       124
   macro avg     0.5190    0.5081    0.4255       124
weighted avg     0.5190    0.5081    0.4255       124



## Store results

In [None]:
# Hand the MONK-1 configurations and test report to store_monk_result,
# targeting <results_dir>/MONK1/.
# NOTE(review): `best_configs_m1` is not assigned by any cell shown in this
# notebook — make sure both arguments are defined before running this cell.
store_monk_result(results_dir + '/MONK1/', best_configs_m1, report_test_m1)

# MONK-2

# MONK-3

# CUP

In [None]:
# Load CUP
# Only inputs are unpacked for the test split: no test targets are returned
# here, so the CUP test set is used for blind predictions.
x_dev_cup, y_dev_cup, x_test_cup = load_cup(cup_dev_path, cup_test_path)

Here, we define the grid search space for the RandomForestRegressor used on the CUP dataset.

In [None]:
# hparams grid for the CUP regressor grid search
rfr_hparams_spaces = {
    'n_estimators': [25, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    # 'auto' was removed in scikit-learn 1.3; for regressors it meant
    # "use all features", which is spelled 1.0 (a fraction of n_features).
    'max_features': [1.0, 'sqrt'],
    'criterion': ['squared_error'],
    'bootstrap': [True, False],
}

## Grid search

In [None]:
# perform grid search with KFold
# RANDOM_STATE (Settings cell) seeds both the forest and the fold shuffling,
# replacing the ad-hoc literal seed so the whole notebook shares one seed.
grid_search_cup = GridSearchCV(
    RandomForestRegressor(random_state=RANDOM_STATE),
    param_grid=rfr_hparams_spaces,
    cv=KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    # greater_is_better=False makes scikit-learn negate the metric, so the
    # configuration with the lowest mean Euclidean error ranks first.
    scoring=make_scorer(mean_euclidean_error, greater_is_better=False),
    verbose=1
)

grid_search_cup.fit(x_dev_cup, y_dev_cup)

In [None]:
# Summarise the cross-validation results of the CUP grid search.
# NOTE(review): presumably returns the k best-ranked configurations — confirm
# against grid_search_top_configs in hyperparameter_tuning.
best_configs_cup = grid_search_top_configs(grid_search_cup.cv_results_) # top k config

In [None]:
# Create a RandomForest with the best hparams
# best_params_ only carries the searched hyperparameters, so the seed is copied
# from the grid search's base estimator; without it the final forest would be
# unseeded and differ from run to run.
rfr = RandomForestRegressor(
    random_state=grid_search_cup.estimator.random_state,
    **grid_search_cup.best_params_
)

# Train the model
rfr.fit(x_dev_cup, y_dev_cup)

In [None]:
print('-- DEVELOPMENT --')
# Mean Euclidean error of the fitted forest on the same development data it
# was trained on
dev_preds_cup = rfr.predict(x_dev_cup)
dev_mee_cup = mean_euclidean_error(y_dev_cup, dev_preds_cup)
print(dev_mee_cup)

In [None]:
# Blind test set predictions
# No test targets are loaded for CUP in this notebook, so these predictions
# are produced without being scored here.
test_preds_cup = rfr.predict(x_test_cup)

## Store Result

In [None]:
# Hand the top configurations, the development MEE and the blind test
# predictions to store_cup_result, targeting <results_dir>/CUP/.
store_cup_result(results_dir + '/CUP/', best_configs_cup, dev_mee_cup, test_preds_cup)