## Imports

In [None]:
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import randint

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, KFold, train_test_split, ValidationCurveDisplay, validation_curve
from sklearn.preprocessing import PolynomialFeatures

# Locate the project root: the part of the current working directory up to
# (and including) the 'ML-B' folder. `index` raises ValueError when the
# notebook is started from outside the 'ML-B' tree.
dir_parts = os.getcwd().split(os.sep)
root_index = dir_parts.index('ML-B')
root_path = os.sep.join(dir_parts[:root_index + 1])

# Make the project's code folder importable
sys.path.append(f'{root_path}/code/')
from data.data_config import Dataset
from data.data_utils import load_monk, load_cup, store_monk_result, store_cup_result
from hyperparameter_tuning import tuning_search_top_configs
from training.metrics import mean_euclidean_error

%load_ext autoreload
%autoreload 2

# K-NN
In this notebook we test a **KNeighborsClassifier** and a **KNeighborsRegressor** w.r.t. the tasks at hand, i.e. the three MONK's problems and the CUP dataset respectively.

## Settings

In [None]:
# Model name, used to build the results directory
MODEL_NAME = 'K-NN'

# Fraction of the development set held out as internal test set
INTERNAL_TEST_SPLIT = 0.1

# Seed used for reproducibility
RANDOM_STATE = 128

# Number of cross-validation folds
N_SPLITS = 5

# Degree of the polynomial features pre-processing
POLY_DEGREE = 3

## Path

In [None]:
# Directory where the results of this model are stored
results_dir = f'{root_path}/results/{MODEL_NAME}'

# MONK file paths: one development / test pair per problem
m1_dev_path = Dataset.MONK_1.dev_path
m1_test_path = Dataset.MONK_1.test_path
m2_dev_path = Dataset.MONK_2.dev_path
m2_test_path = Dataset.MONK_2.test_path
m3_dev_path = Dataset.MONK_3.dev_path
m3_test_path = Dataset.MONK_3.test_path

# CUP file paths
cup_dev_path = Dataset.CUP.dev_path
cup_test_path = Dataset.CUP.test_path

# MONK-1

In [None]:
# Load MONK-1: development and test data read from the two file paths.
# Judging by the variable names, x_* are the inputs and y_* the targets;
# any encoding of the features is decided inside load_monk (not shown here).
x_dev_m1, y_dev_m1, x_test_m1, y_test_m1 = load_monk(m1_dev_path, m1_test_path)

Let's perform a grid-search to identify promising hyper-parameters for the task.

In [None]:
# Grid-search spaces for the K-NN classifier.
# k ranges from 2 up to (excluded) half the size of the MONK-1 development set;
# the same dictionary is reused by the MONK-2 and MONK-3 searches.
# p selects the Minkowski distance: 1 = Manhattan, 2 = Euclidean.
clf_hparams_spaces = {
    'n_neighbors': list(range(2, len(x_dev_m1) // 2)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

In [None]:
# Exhaustive search over the K-NN grid, scored by accuracy on
# shuffled, stratified folds
grid_search_m1 = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=clf_hparams_spaces,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE),
    verbose=1,
)

grid_search_m1.fit(x_dev_m1, y_dev_m1)

In [None]:
# Inspect the cross-validation results of the MONK-1 grid-search; the project
# helper presumably reports the top-k configurations (see hyperparameter_tuning)
tuning_search_top_configs(grid_search_m1.cv_results_) # top k config

## Training - Testing

In [None]:
# Alternative: create the K-NN classifier from the best grid-search hparams
#knn_m1 = KNeighborsClassifier(**grid_search_m1.best_params_)

# Train the model with a manually fixed configuration on the development set
knn_m1 = KNeighborsClassifier(n_neighbors=22, p=1, weights='distance')
knn_m1.fit(x_dev_m1, y_dev_m1)

In [None]:
print('-- DEVELOPMENT --')
# Predict once and reuse the predictions for both metrics
y_pred_dev_m1 = knn_m1.predict(x_dev_m1)
acc_dev_m1 = accuracy_score(y_dev_m1, y_pred_dev_m1)
mse_dev_m1 = mean_squared_error(y_dev_m1, y_pred_dev_m1)
print(f'Loss (MSE): {mse_dev_m1:.4f} - Accuracy: {acc_dev_m1:.4f}')

In [None]:
print('-- TEST --')
# Predict once and reuse the predictions for both metrics
y_pred_test_m1 = knn_m1.predict(x_test_m1)
acc_test_m1 = accuracy_score(y_test_m1, y_pred_test_m1)
mse_test_m1 = mean_squared_error(y_test_m1, y_pred_test_m1)
print(f'Loss (MSE): {mse_test_m1:.4f} - Accuracy: {acc_test_m1:.4f}')

## Store results

In [None]:
# Metrics measured on the development and test sets of MONK-1
report_m1 = {
    'dev': {
        'mse': mse_dev_m1,
        'accuracy': acc_dev_m1,
    },
    'test': {
        'mse': mse_test_m1,
        'accuracy': acc_test_m1,
    },
}

# Store the model hyper-parameters together with the report
store_monk_result(
    results_dir + '/MONK1/',
    knn_m1.get_params(),
    report_m1,
)

# MONK-2

In [None]:
# Load MONK-2: development and test data read from the two file paths.
# Judging by the variable names, x_* are the inputs and y_* the targets;
# any encoding of the features is decided inside load_monk (not shown here).
x_dev_m2, y_dev_m2, x_test_m2, y_test_m2 = load_monk(m2_dev_path, m2_test_path)

Let's perform a grid-search to identify promising hyper-parameters for the task.

In [None]:
# Exhaustive search over the K-NN grid, scored by accuracy on
# shuffled, stratified folds
grid_search_m2 = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=clf_hparams_spaces,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE),
    verbose=1,
)

grid_search_m2.fit(x_dev_m2, y_dev_m2)

In [None]:
# Inspect the cross-validation results of the MONK-2 grid-search; the project
# helper presumably returns the top-k configurations (see hyperparameter_tuning)
best_configs_m2 = tuning_search_top_configs(grid_search_m2.cv_results_) # top k config

In [None]:
# Alternative: create the K-NN classifier from the best grid-search hparams
#knn_m2 = KNeighborsClassifier(**grid_search_m2.best_params_)

# Train the model with a manually fixed configuration on the development set
knn_m2 = KNeighborsClassifier(n_neighbors=56, p=1, weights='distance')
knn_m2.fit(x_dev_m2, y_dev_m2)

In [None]:
print('-- DEVELOPMENT --')
# Predict once and reuse the predictions for both metrics
y_pred_dev_m2 = knn_m2.predict(x_dev_m2)
acc_dev_m2 = accuracy_score(y_dev_m2, y_pred_dev_m2)
mse_dev_m2 = mean_squared_error(y_dev_m2, y_pred_dev_m2)
print(f'MSE: {mse_dev_m2:.4f} - Accuracy: {acc_dev_m2:.4f}')

In [None]:
print('-- TEST --')
# Predict once and reuse the predictions for both metrics
y_pred_test_m2 = knn_m2.predict(x_test_m2)
acc_test_m2 = accuracy_score(y_test_m2, y_pred_test_m2)
mse_test_m2 = mean_squared_error(y_test_m2, y_pred_test_m2)
print(f'MSE: {mse_test_m2:.4f} - Accuracy: {acc_test_m2:.4f}')

## Store results

In [None]:
# Metrics measured on the development and test sets of MONK-2
report_m2 = {
    'dev': {
        'mse': mse_dev_m2,
        'accuracy': acc_dev_m2,
    },
    'test': {
        'mse': mse_test_m2,
        'accuracy': acc_test_m2,
    },
}

# Store the model hyper-parameters together with the report
store_monk_result(
    results_dir + '/MONK2/',
    knn_m2.get_params(),
    report_m2,
)

# MONK-3

In [None]:
# Load MONK-3: development and test data read from the two file paths.
# Judging by the variable names, x_* are the inputs and y_* the targets;
# any encoding of the features is decided inside load_monk (not shown here).
x_dev_m3, y_dev_m3, x_test_m3, y_test_m3 = load_monk(m3_dev_path, m3_test_path)

Let's perform a grid-search to identify promising hyper-parameters for the task.

In [None]:
# Grid-search with stratified K-fold cross-validation (shuffled folds),
# scored by accuracy
grid_search_m3 = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=clf_hparams_spaces,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE),
    verbose=1,
)

grid_search_m3.fit(x_dev_m3, y_dev_m3)

In [None]:
# Inspect the cross-validation results of the MONK-3 grid-search; the project
# helper presumably returns the top-k configurations (see hyperparameter_tuning)
best_configs_m3 = tuning_search_top_configs(grid_search_m3.cv_results_) # top k config

## Training - Testing

In [None]:
# Alternative: create the K-NN classifier from the best grid-search hparams
#knn_m3 = KNeighborsClassifier(**grid_search_m3.best_params_)

# Train the model with a manually fixed configuration on the development set
knn_m3 = KNeighborsClassifier(n_neighbors=43, p=1, weights='distance')
knn_m3.fit(x_dev_m3, y_dev_m3)

In [None]:
print('-- DEVELOPMENT --')
# Predict once and reuse the predictions for both metrics
y_pred_dev_m3 = knn_m3.predict(x_dev_m3)
acc_dev_m3 = accuracy_score(y_dev_m3, y_pred_dev_m3)
mse_dev_m3 = mean_squared_error(y_dev_m3, y_pred_dev_m3)
print(f'MSE: {mse_dev_m3:.4f} - Accuracy: {acc_dev_m3:.4f}')

In [None]:
print('-- TEST --')
# Predict once and reuse the predictions for both metrics
y_pred_test_m3 = knn_m3.predict(x_test_m3)
acc_test_m3 = accuracy_score(y_test_m3, y_pred_test_m3)
mse_test_m3 = mean_squared_error(y_test_m3, y_pred_test_m3)
print(f'MSE: {mse_test_m3:.4f} - Accuracy: {acc_test_m3:.4f}')

## Store results

In [None]:
# Metrics measured on the development and test sets of MONK-3
report_m3 = {
    'dev': {
        'mse': mse_dev_m3,
        'accuracy': acc_dev_m3,
    },
    'test': {
        'mse': mse_test_m3,
        'accuracy': acc_test_m3,
    },
}

# Store the model hyper-parameters together with the report
store_monk_result(
    results_dir + '/MONK3/',
    knn_m3.get_params(),
    report_m3,
)

# CUP

In [None]:
# Load CUP: development inputs/targets plus the inputs of the blind test set
# (no targets are returned for the blind test set)
x_dev_cup, y_dev_cup, x_test_cup = load_cup(cup_dev_path, cup_test_path)

## Dev - Internal Test Split 
The development dataset is split between training and internal test ($90-10$).

In [None]:
# Hold out INTERNAL_TEST_SPLIT of the development set as internal test set.
# The shared RANDOM_STATE seed replaces the hard-coded literal so that this
# split follows the same reproducibility setting as the other seeded steps.
x_train_cup, x_internal_test_cup, y_train_cup, y_internal_test_cup = train_test_split(
    x_dev_cup, y_dev_cup, test_size=INTERNAL_TEST_SPLIT, random_state=RANDOM_STATE
)

## Polynomial Features pre-processing

In [None]:
# --- COMMENT OUT THIS CELL TO USE THE NON-ENHANCED DATASET ---
# Polynomial features pre-processing: the expansion is fitted on the training
# split only and then applied unchanged to the internal and blind test sets.
# NOTE: the inputs are overwritten in place, so re-running this cell expands
# the already expanded features a second time.
poly = PolynomialFeatures(degree=POLY_DEGREE)
x_train_cup = poly.fit_transform(x_train_cup)
x_internal_test_cup = poly.transform(x_internal_test_cup)
x_test_cup = poly.transform(x_test_cup)

## Hyper-parameters Tuning
A common approach is to start with a coarse search across a wide range of values to find promising sub-ranges of our parameter space. Then, you would zoom into these ranges and perform another search to fine-tune the configurations.

Here, we proceed as follows:
1. (coarse) Grid-search across a wide range of hyper-parameters and values;
2. (fine-tune) Random-search into zoomed intervals w.r.t. best configuration found by grid-search.

Then, we perform a single run of grid-search and a single run of random-search on the dataset pre-processed with PolynomialFeatures of fixed degree, each yielding its own best configuration. The configuration used for the final re-training and for the evaluation on the internal test set is the one with the best mean validation MEE across the cross-validation folds.

Note that, tuning of the polynomial degree wasn't performed because it would be very expensive. Thus, we simply decided to use a fixed degree value.

### Grid search

In [None]:
# Grid-search spaces for the K-NN regressor.
# k ranges from 2 up to (excluded) half the size of the training split.
# p selects the Minkowski distance: 1 = Manhattan, 2 = Euclidean.
grid_search_spaces_cup = {
    'n_neighbors': list(range(2, len(x_train_cup) // 2)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

In [None]:
# Coarse search: exhaustive grid-search with shuffled K-fold CV.
# The scorer wraps the MEE with greater_is_better=False, so the reported
# scores are negated errors (higher is better).
grid_search_cup = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=grid_search_spaces_cup,
    scoring=make_scorer(mean_euclidean_error, greater_is_better=False),
    cv=KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE),
    verbose=1,
)

grid_search_cup.fit(x_train_cup, y_train_cup)

In [None]:
# Inspect the cross-validation results of the CUP grid-search; the project
# helper presumably reports the top-k configurations (see hyperparameter_tuning)
tuning_search_top_configs(grid_search_cup.cv_results_) # top k config

### Random Search

In [None]:
best_params = grid_search_cup.best_params_
best_k = best_params['n_neighbors']

# Random-search spaces: zoom into [k/2, 2k) around the best k found by the
# grid-search (never below 2), keeping the best weighting scheme fixed.
# scipy's randint expects integer bounds: floor division is used because
# `k * 0.5` yields a float, which is non-integral whenever k is odd.
random_search_spaces_cup = {
    'n_neighbors': randint(max(2, best_k // 2), best_k * 2),
    'weights': [best_params['weights']],
    'p': [1, 2],
}

In [None]:
# Fine-tuning: 20 random draws from the zoomed spaces, evaluated with the same
# shuffled K-fold CV and (negated) MEE scorer used by the grid-search
random_search_cup = RandomizedSearchCV(
    estimator=KNeighborsRegressor(),
    param_distributions=random_search_spaces_cup,
    n_iter=20,
    scoring=make_scorer(mean_euclidean_error, greater_is_better=False),
    cv=KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE),
    random_state=RANDOM_STATE,
    verbose=1,
)

random_search_cup.fit(x_train_cup, y_train_cup)

In [None]:
# Inspect the cross-validation results of the CUP random-search; the project
# helper presumably reports the top-k configurations (see hyperparameter_tuning)
tuning_search_top_configs(random_search_cup.cv_results_) # top k config

### Save tuning results

In [None]:
best_score_grid = grid_search_cup.best_score_
best_score_random = random_search_cup.best_score_

# Pick the configuration from GridSearch or RandomSearch, whichever has the
# higher best CV score; the grid-search configuration is kept on ties
if not best_score_random > best_score_grid:
    best_params = grid_search_cup.best_params_
    print("GridSearchCV resulted in the best configuration.")
else:
    print("Best configuration from RandomizedSearch:\n")
    best_params = random_search_cup.best_params_
    print(best_params)

## Training and Internal test assessment
Let's re-train our model with the best configuration on the whole training split (i.e. the development set without the internal test set), so that all the data used during cross-validation contributes to the final model. Finally, we predict on the (untouched) internal test set to perform model assessment and estimate our performance on the blind test set.

In [None]:
# --- UNCOMMENT TO TEST ---
# Best (tuning) configuration.
# The triple-quoted string below is an inert literal: remove the quotes to
# override best_params with this fixed configuration.
"""
# Best configuration
best_params = {'n_neighbors': 4, 'p': 1, 'weights': 'distance'}
"""

# Create a KNN regressor with the best hparams and fit it on the training split
knn = KNeighborsRegressor(**best_params)
knn.fit(x_train_cup, y_train_cup)

In [None]:
print('-- TRAINING --')
# Predict once and reuse the predictions for both metrics
y_pred_train_cup = knn.predict(x_train_cup)
mee_train_cup = mean_euclidean_error(y_train_cup, y_pred_train_cup)
mse_train_cup = mean_squared_error(y_train_cup, y_pred_train_cup)
print(f'Loss (MSE): {mse_train_cup:.4f} - MEE: {mee_train_cup:.4f}')

In [None]:
print('-- INTERNAL TEST --')
# Predict once and reuse the predictions for both metrics
y_pred_internal_test_cup = knn.predict(x_internal_test_cup)
mee_internal_test_cup = mean_euclidean_error(y_internal_test_cup, y_pred_internal_test_cup)
mse_internal_test_cup = mean_squared_error(y_internal_test_cup, y_pred_internal_test_cup)
print(f'Loss (MSE): {mse_internal_test_cup:.4f} - MEE: {mee_internal_test_cup:.4f}')

In [None]:
# Blind test set predictions (no targets are loaded for this set, so the
# predictions are only stored, not scored)
blind_test_preds_cup = knn.predict(x_test_cup)

### Store Result

In [None]:
# Metrics measured on the training split and on the internal test set
report_cup = {
    'train': {
        'mse': mse_train_cup,
        'mee': mee_train_cup,
    },
    'internal_test': {
        'mse': mse_internal_test_cup,
        'mee': mee_internal_test_cup,
    },
}

# Store the configuration, the report and the blind test predictions.
# NOTE(review): is_poly is fixed to True here; presumably it must be switched
# by hand when the polynomial pre-processing cell is disabled - confirm.
store_cup_result(
    results_dir + '/CUP/',
    best_params,
    report_cup,
    blind_test_preds_cup,
    is_poly=True,
)