# Imports

In [None]:
# Imports
import numpy as np
import pandas as pd

import scikit_posthocs as sp
import plotly.express as px
import plotly.graph_objects as go

from sklearn.datasets import fetch_openml
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import wilcoxon, friedmanchisquare
import scikit_posthocs as sp

import warnings
# Suppress every warning for the rest of the session to keep cell output compact
warnings.filterwarnings('ignore')

# Random seed: passed as `random_state` to the KFold splitters and to the
# Random Forest model so that runs are reproducible
seed = 1


# Data split approaches

| Method                  | Stability     | Risk of Overfitting | Computational Cost | Recommended Use Case                          |
|-------------------------|---------------|----------------------|--------------------|-----------------------------------------------|
| **Random Split**        | Low           | High                 | Low                | Fast prototyping, quick checks                |
| **Cross-Validation**    | Medium        | Medium               | Medium             | Reliable model evaluation                     |
| **Nested Cross-Validation** | High     | Low                  | High               | Final evaluation, rigorous comparisons        |

<br />

# Load data

In [None]:
# Fetch the Boston dataset (version 1) from OpenML as pandas objects
boston = fetch_openml(name='boston', version=1, as_frame=True)
# Features stay as delivered; the target is cast to float for regression
X, y = boston.data, boston.target.astype(float)


# Nested Cross-validation scheme

The [KFold method](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html) from `scikit-learn` will be used for k-fold cross-validation splitting.

However, it will not always be possible to use this method. In such cases, you should implement your own outer and inner cross-validation loops.

<br />

In [None]:
# Number of folds of the outer (test) loop and of the inner (validation) loop
k_outer = 10
k_inner = 5

# Shuffled splitters sharing the same seed
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
inner_cv = KFold(n_splits=k_inner, shuffle=True, random_state=seed)
outer_cv = KFold(n_splits=k_outer, shuffle=True, random_state=seed)

# Approximate partition sizes: floor division drops any remainder
total_instances = len(X)
outer_test_size = total_instances // k_outer
outer_train_size = total_instances - outer_test_size
inner_val_size = outer_train_size // k_inner
inner_train_size = outer_train_size - inner_val_size

# Size report, one line per partition
for report_line in (
    f'Total Instances:\t{total_instances}\n--',
    f'Outer Train set:\t{outer_train_size}',
    f'Test set:\t\t{outer_test_size}\n--',
    f'Inner Train set:\t{inner_train_size}',
    f'Validation set:\t\t{inner_val_size}\n',
):
    print(report_line)


In [None]:
# CV Evaluation
def negative_rmse(y_true, y_pred):
    """Return the Root Mean Squared Error (RMSE) with its sign flipped.

        Args:
            y_true: True values
            y_pred: Predicted values
        Returns:
            float: -RMSE. The sign is flipped because Scikit-learn scoring
                   functions are maximized, and maximizing -RMSE is the same
                   as minimizing RMSE.
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    return -rmse


# Scorer object accepted by the `scoring` argument of the CV utilities
scoring_neg = make_scorer(negative_rmse)


# Models and search spaces

The **Support Vector Regressor (SVR)** and **Random Forest Regressor (RF)** models will be used in this class to explore cross-validation and nested cross-validation approaches.

- The **SVR** model requires **data scaling**, so it will be wrapped in a **Pipeline**.
- For **SVR**, the hyperparameters to be tuned are:
    - **C** (regularization strength)
    - **gamma** (kernel coefficient for some kernels)
    - **kernel** (e.g., 'linear', 'rbf')
- For **RF**, the hyperparameters to be tuned are:
    - **n_estimators** (number of trees in the forest)
    - **max_depth** (maximum depth of each tree)
    - **max_features** (number of features to consider when looking for the best split)


In [None]:
# SVM pipeline: features are standardized before reaching the SVR step
svm_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svr', SVR()),
])
# Grid keys follow the '<step>__<parameter>' form to address the 'svr' step
svm_param_grid = dict(
    svr__C=[0.1, 1, 10],
    svr__gamma=['scale', 'auto'],
    svr__kernel=['rbf', 'linear'],
)

# Random Forest: no pipeline, so the grid names the estimator parameters directly
rf_model = RandomForestRegressor(random_state=seed)
rf_param_grid = dict(
    n_estimators=[2, 3],
    max_depth=[2, 3],
    max_features=[None, 'sqrt'],
)

# Significance level used by the statistical tests
alpha_sig = 0.05


<br />
<hr />

# Nested CV loop

The cross-validation grid search will be done with the [GridSearchCV method](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) from `scikit-learn`, with **fixed grid search** and a **fixed number of iterations**.

For more detailed instructions on how to use the `GridSearchCV` method, check its [user manual](https://scikit-learn.org/stable/modules/grid_search.html#grid-search).


<br />

## SVM and RF deterministic hyperparameters choice

In [None]:
# SVM Nested CV
#
# Inner loop: GridSearchCV evaluates every combination of the parameter grid
# on the inner folds
svm_grid = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=svm_param_grid,
    scoring=scoring_neg,
    cv=inner_cv,
)
#
# Outer loop: cross_val_score runs the grid search on each outer training
# split and returns one score per outer fold
svm_scores = cross_val_score(svm_grid, X, y, scoring=scoring_neg, cv=outer_cv)

# Random Forest Nested CV (same scheme, different estimator and grid)
rf_grid = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_param_grid,
    scoring=scoring_neg,
    cv=inner_cv,
)
rf_scores = cross_val_score(rf_grid, X, y, scoring=scoring_neg, cv=outer_cv)


## Comparing models

In [None]:
# Test-set results in long format: one RMSE per (model, outer fold).
# The CV scores are -RMSE, so they are negated back to RMSE here.
svm_rmse, rf_rmse = -svm_scores, -rf_scores
df_test_set = pd.DataFrame({
    'Model': ['SVM'] * len(svm_rmse) + ['RF'] * len(rf_rmse),
    'RMSE': np.concatenate((svm_rmse, rf_rmse)),
})

# Global significance test: Wilcoxon test on the per-fold RMSE of both models
stat, p_value = wilcoxon(svm_rmse, rf_rmse)
p = p_value
print(f'\nWilcoxon test statistic: {stat:.4f}, p-value: {p:.4f}')

# Boxplots (one box per model, individual fold results overlaid)
box_color = 'rgba(108, 140, 200, 1)'
box_trace = go.Box(
    x=df_test_set['Model'],
    y=df_test_set['RMSE'],
    fillcolor='rgba(108, 140, 200, 0.3)',
    line=dict(color=box_color),
    boxpoints='all',
    jitter=0,
    pointpos=0,
    marker=dict(color=box_color),
)
fig = go.Figure()
fig.add_trace(box_trace)
# y-axis starts at zero and leaves 5% headroom above the largest RMSE
y_upper = df_test_set['RMSE'].max() * 1.05
fig.update_yaxes(range=[0, y_upper])
fig.update_layout(
    title='SVM vs. RF',
    yaxis_title='Test RMSE',
    width=500,
    height=300,
    plot_bgcolor='#f1f1f1',
    margin=dict(l=50, r=50, t=75, b=20),
    showlegend=False,
)
fig.show()


<hr />
<br />

## SVM statistical hyperparameter analysis

In [None]:
# Best SVM hyperparameters selected by the grid search in each outer fold
best_grid_models = []
# Per outer fold: inner-fold validation scores (-RMSE, as returned by the
# scorer) of the hyperparameter combination ranked first by the grid search
validation_results = {'svm': []}
for i_outer, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
    print('\n\n----------\nOuter CV {}'.format(i_outer))

    # Outer CV (test set) -------------------------------------------------
    # X_test / y_test are not used further in this cell
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Inner CV (validation set for hyperparameters tuning) ----------------
    # SVM
    grid_search = GridSearchCV(estimator=svm_pipeline,
                               param_grid=svm_param_grid,
                               cv=inner_cv,
                               scoring=scoring_neg)
    grid_search.fit(X_train, y_train)

    # Grid CV automated model selection
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_grid_models.append({
        'C': best_params['svr__C'],
        'gamma': best_params['svr__gamma'],
        'kernel': best_params['svr__kernel']
    })
    print('Best model hyperparameters:\n\tC\t{}\n\tgamma\t{}\n\tkernel\t{}'.format(
        best_params['svr__C'], best_params['svr__gamma'], best_params['svr__kernel']
    ))

    # Statistical tests to analyse best hyperparameter combination --------
    # Retrieve results: rows are inner folds, columns are hyperparameter
    # combinations ('par_<i>' follows the order of cv_results_['params'])
    cv_results = grid_search.cv_results_
    fold_scores = np.vstack([
        cv_results[f'split{i_cv}_test_score'] for i_cv in range(k_inner)
    ])
    df = pd.DataFrame(
        fold_scores,
        columns=[f'par_{i_par}' for i_par in range(fold_scores.shape[1])]
    )
    # Keep the per-fold validation scores of the selected combination
    validation_results['svm'].append(df.iloc[:, grid_search.best_index_])

    # Global significance test
    stat, p = friedmanchisquare(*[df[col] for col in df.columns])
    print(f'\nFriedman test statistic: {stat:.4f}, p-value: {p:.4f}')
    if p < alpha_sig:
        # Pairwise significance test
        posthoc_result = sp.posthoc_nemenyi_friedman(df.to_numpy())
        # Same threshold as the global test (alpha_sig), not a separate literal
        significant_columns = posthoc_result.columns[posthoc_result.lt(alpha_sig).any(axis=0)]
        df_significant = df.iloc[:, list(significant_columns)]
        # P-values: keep only the significant pairs, blank out the rest
        filtered_result = posthoc_result[posthoc_result < alpha_sig]
        filtered_result = filtered_result.dropna(how='all').dropna(axis=1, how='all')
        filtered_result = filtered_result.fillna('-')
        print('P-values of significant differences:')
        display(filtered_result)
    else:
        print('No significant differences found.')

    # Boxplots
    df_long = df.melt(var_name='group', value_name='score')
    # Outer double quotes: reusing the enclosing quote type inside an
    # f-string expression is a SyntaxError before Python 3.12
    label_map = {
        f'par_{i}': f"[par_{i}] C {param['svr__C']}<br>gamma: {param['svr__gamma']}<br>kernel: {param['svr__kernel']}"
        for i, param in enumerate(cv_results['params'])
    }
    df_long['label'] = df_long['group'].map(label_map)
    fig = go.Figure()
    fig.add_trace(go.Box(
        x=df_long['label'],
        # Scores are -RMSE; negate to plot RMSE
        y=-df_long['score'],
        fillcolor='rgba(108, 140, 200, 0.3)',
        line=dict(color='rgba(108, 140, 200, 1)'),
        boxpoints='all',
        jitter=0,
        pointpos=0,
        marker=dict(color='rgba(108, 140, 200, 1)')
    ))
    fig.update_layout(
        title='CV '+str(i_outer), yaxis_title='Validation Fitness',
        width=1000, height=400,
        plot_bgcolor='#f1f1f1',
        xaxis_tickangle=-90,
        margin=dict(l=50, r=50, t=50, b=20),
        showlegend=False
    )
    fig.show()


In [None]:
# Show the SVM hyperparameters chosen by the grid search in each outer fold
best_grid_models

## RF with statistical hyperparameter analysis

In [None]:
# Best RF hyperparameters selected by the grid search in each outer fold
best_grid_models = []
# Per outer fold: inner-fold validation scores (-RMSE, as returned by the
# scorer) of the hyperparameter combination ranked first by the grid search
validation_results['rf'] = []
for i_outer, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
    print('\n\n----------\nOuter CV {}'.format(i_outer))

    # Outer CV (test set) -------------------------------------------------
    # X_test / y_test are not used further in this cell
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Inner CV (validation set for hyperparameters tuning) ----------------
    # Random Forest
    grid_search = GridSearchCV(estimator=rf_model,
                               param_grid=rf_param_grid,
                               cv=inner_cv,
                               scoring=scoring_neg)
    grid_search.fit(X_train, y_train)

    # Grid CV automated model selection
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_grid_models.append({
        'max_depth': best_params['max_depth'],
        'max_features': best_params['max_features'],
        'n_estimators': best_params['n_estimators']
    })
    print('Best model hyperparameters:\n\tmax_depth\t{}\n\tmax_features\t{}\n\tn_estimators\t{}'.format(
        best_params['max_depth'], best_params['max_features'], best_params['n_estimators']
    ))

    # Statistical tests to analyse best hyperparameter combination --------
    # Retrieve results: rows are inner folds, columns are hyperparameter
    # combinations ('par_<i>' follows the order of cv_results_['params'])
    cv_results = grid_search.cv_results_
    fold_scores = np.vstack([
        cv_results[f'split{i_cv}_test_score'] for i_cv in range(k_inner)
    ])
    df = pd.DataFrame(
        fold_scores,
        columns=[f'par_{i_par}' for i_par in range(fold_scores.shape[1])]
    )
    # Keep the per-fold validation scores of the selected combination
    validation_results['rf'].append(df.iloc[:, grid_search.best_index_])

    # Global significance test
    stat, p = friedmanchisquare(*[df[col] for col in df.columns])
    print(f'\nFriedman test statistic: {stat:.4f}, p-value: {p:.4f}')
    if p < alpha_sig:
        # Pairwise significance test
        posthoc_result = sp.posthoc_nemenyi_friedman(df.to_numpy())
        # Same threshold as the global test (alpha_sig), not a separate literal
        significant_columns = posthoc_result.columns[posthoc_result.lt(alpha_sig).any(axis=0)]
        df_significant = df.iloc[:, list(significant_columns)]
        # P-values: keep only the significant pairs, blank out the rest
        filtered_result = posthoc_result[posthoc_result < alpha_sig]
        filtered_result = filtered_result.dropna(how='all').dropna(axis=1, how='all')
        filtered_result = filtered_result.fillna('-')
        print('P-values of significant differences:')
        display(filtered_result)
    else:
        print('No significant differences found.')

    # Boxplots
    df_long = df.melt(var_name='group', value_name='score')
    # Outer double quotes: reusing the enclosing quote type inside an
    # f-string expression is a SyntaxError before Python 3.12
    label_map = {
        f'par_{i}': f"mx_depth: {param['max_depth']}<br>mx_feat: {param['max_features']}<br>n_est: {param['n_estimators']}"
        for i, param in enumerate(cv_results['params'])
    }
    df_long['label'] = df_long['group'].map(label_map)

    fig = go.Figure()
    fig.add_trace(go.Box(
        x=df_long['label'],
        # Scores are -RMSE; negate to plot RMSE
        y=-df_long['score'],
        fillcolor='rgba(108, 140, 200, 0.3)',
        line=dict(color='rgba(108, 140, 200, 1)'),
        boxpoints='all',
        jitter=0,
        pointpos=0,
        marker=dict(color='rgba(108, 140, 200, 1)')
    ))
    # y-axis starts at zero and leaves 5% headroom above the largest value
    fig.update_yaxes(range=[0, max(-df_long['score'])*1.05])
    fig.update_layout(
        title='CV '+str(i_outer), yaxis_title='Validation Fitness',
        width=1000, height=400,
        plot_bgcolor='#f1f1f1',
        xaxis_tickangle=-90,
        margin=dict(l=50, r=50, t=50, b=20),
        showlegend=False
    )
    fig.show()


In [None]:
# Show the RF hyperparameters chosen by the grid search in each outer fold
best_grid_models

<br />
<hr />

# Exercises

- Try different ranges for the algorithms' hyperparameters.
- Compare validation and test results. Tip: look at `df_test_set` and `validation_results` objects.
- **Implement the nested cross-validation loop for SVM or RF without using the KFold method from** `scikit-learn`.

<br />