In [45]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from get_root import PROJECT_ROOT

from src.utils_performance import (
    _all_algorithms_all_datasets_performance,
)

from src.utils_visualization import NotebookFigureSaver
from src.utils_data_centric import _get_all_data_set_characteristics

# Where to save the figures
CHAPTER_ID = "g_uncertainty_estimation"
fig_saver = NotebookFigureSaver(CHAPTER_ID)

In [46]:
def _get_available_algorithm_names():
    # get the mean performance of all algorithms on all data sets
    algorithm_data_set_performance = _all_algorithms_all_datasets_performance(
        performance_of_interest="$\\hat{\\mu}$", multivariate=False
    )
    Y = pd.DataFrame(algorithm_data_set_performance).T

    return Y.columns

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
import matplotlib.pyplot as plt
import numpy as np

# Hyperparameters
test_size = 0.25
validation_size = 0.25  # This means 25% of the training set, or 20% of the total dataset
random_state = 42
length_scale = 1.
alpha = 0.1
n_restarts_optimizer = 3
lower_scale_choice = 10
upper_scale_choice = 10
applied_algorithm= _get_available_algorithm_names()[0]

def get_data(applied_algorithm, test_size, validation_size, random_state):
    # get the characteristic statistics on all data sets
    normalized_data_set_characteristics = _get_all_data_set_characteristics(
        multivariate=False, number_data_sets=None, normalize_each_characteristic=True
    )
    X = pd.DataFrame(normalized_data_set_characteristics)

    # get the mean performance of all algorithms on all data sets
    algorithm_data_set_performance = _all_algorithms_all_datasets_performance(
        performance_of_interest="$\\hat{\\mu}$", multivariate=False
    )
    Y = pd.DataFrame(algorithm_data_set_performance).T

    # inner join x and y on index
    matched_data_sets = X.join(Y, how="inner")
    X = matched_data_sets[X.columns]
    y = matched_data_sets[applied_algorithm]
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=validation_size, random_state=random_state)
    
    return X_train, X_val, X_test, y_train, y_val, y_test

def define_model(length_scale, alpha, n_restarts_optimizer, lower_scale_choice, upper_scale_choice, **kwargs):
    length_scale_lower_bound = 10 ** -upper_scale_choice
    length_scale_upper_bound = 10 ** lower_scale_choice
    length_scale_bounds = (length_scale_lower_bound, length_scale_upper_bound)
    kernel = 1 * RBF(length_scale=length_scale, length_scale_bounds=length_scale_bounds)
    model = GaussianProcessRegressor(kernel=kernel, alpha=alpha, n_restarts_optimizer=n_restarts_optimizer)
    return model

def evaluate_model(model, X_train, X_val, X_test, y_train, y_val, y_test):

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Predict on the validation set
    y_pred_val = model.predict(X_val)
    mae_val = mean_absolute_error(y_val, y_pred_val)
    mse_val = mean_squared_error(y_val, y_pred_val)

    # Predict on the test set
    y_pred_test = model.predict(X_test)
    mae_test = mean_absolute_error(y_test, y_pred_test)
    mse_test = mean_squared_error(y_test, y_pred_test)

    # Naive baseline performance for validation and test
    naive_baseline_val = np.repeat(y_train.mean(), len(y_val))
    naive_baseline_test = np.repeat(y_train.mean(), len(y_test))

    mae_naive_val = mean_absolute_error(naive_baseline_val, y_val)
    mse_naive_val = mean_squared_error(naive_baseline_val, y_val)
    mae_naive_test = mean_absolute_error(naive_baseline_test, y_test)
    mse_naive_test = mean_squared_error(naive_baseline_test, y_test)

    # Calculate relative improvement for validation and test for both MAE and MSE
    relative_improvement_mae_val = (mae_val - mae_naive_val) / mae_naive_val
    relative_improvement_mse_val = (mse_val - mse_naive_val) / mse_naive_val
    relative_improvement_mae_test = (mae_test - mae_naive_test) / mae_naive_test
    relative_improvement_mse_test = (mse_test - mse_naive_test) / mse_naive_test

    # Extending results_dict with MSE improvements
    results_dict = {
        "Validation": {
            "raw_predictions": {
                "GT Performance": y_val,
                "Estimated Performance": y_pred_val,
                "Naive Baseline": naive_baseline_val,
            },
            "metrics":{
                "Naive MAE": mae_naive_val,
                "Naive MSE": mse_naive_val,
                "Model MAE": mae_val,
                "Model MSE": mse_val,
                "Improvement MAE": relative_improvement_mae_val,
                "Improvement MSE": relative_improvement_mse_val,
            },
        },
        "Test": {
            "raw_predictions": {
                "GT Performance": y_test,
                "Estimated Performance": y_pred_test,
                "Naive Baseline": naive_baseline_test,
            },
            "metrics":{
                "Naive MAE": mae_naive_test,
                "Naive MSE": mse_naive_test,
                "Model MAE": mae_test,
                "Model MSE": mse_test,
                "Improvement MAE": relative_improvement_mae_test,
                "Improvement MSE": relative_improvement_mse_test,
            }
        }
    }
    
    return results_dict

X_train, X_val, X_test, y_train, y_val, y_test = get_data(applied_algorithm=applied_algorithm, test_size=test_size, validation_size=validation_size, random_state=random_state)
model = define_model(length_scale, alpha, n_restarts_optimizer, lower_scale_choice, upper_scale_choice)
results_dict = evaluate_model(model, X_train, X_val, X_test, y_train, y_val, y_test)


In [49]:
import itertools
from tqdm import tqdm  
import pandas as pd


def grid_search(
hyperparameter_grid, X_train, X_val, X_test, y_train, y_val, y_test, target_metric,
test_size = 0.25,
validation_size = 0.25,
random_state = 0,
applied_algorithm= _get_available_algorithm_names()[0],
):
    
    best_score = float('inf') 
    best_params = None
    all_results = []

    # Generate all combinations of hyperparameters
    keys, values = zip(*hyperparameter_grid.items())
    total = np.prod([len(v) for v in values])  # Calculate total iterations for tqdm
    X_train, X_val, X_test, y_train, y_val, y_test = get_data(applied_algorithm=applied_algorithm,test_size=test_size, validation_size=validation_size, random_state=random_state)
    
    # Wrap the product in tqdm for a progress bar
    for v in tqdm(itertools.product(*values), total=total, desc="Grid Search Progress"):
        params = dict(zip(keys, v))
        
        # Update the model with current set of hyperparameters
        model = define_model(**params)
        
        # Evaluate the model
        results_dict = evaluate_model(model, X_train, X_val, X_test, y_train, y_val, y_test)
        validation_score = results_dict['Validation']["metrics"]['Model ' + target_metric]
        all_results.append((params, results_dict))
        
        # Check if the current model is better
        if (validation_score < best_score):
            best_score = validation_score
            best_params = params

    return best_params, best_score, all_results

hyperparameter_grid = {
    'length_scale': [0.01, 0.1, 1.],
    'alpha': [1e-3, 1e-2, 1e-1],
    'n_restarts_optimizer': [0, 1, 3],
    'lower_scale_choice': [1, 3, 10],
    'upper_scale_choice': [1, 3, 10]
}
target_metric = "MSE"

# Perform the grid search
best_params, best_score, all_results = grid_search(hyperparameter_grid, X_train, X_val, X_test, y_train, y_val, y_test, target_metric)

# Initialize lists to store the flattened data
data = []

# Iterate over all results to flatten the structure
for params, results in all_results:
    entry = params.copy()  # Start with hyperparameter values
    # Add validation metrics
    entry.update({
        'Validation MAE': results['Validation']['metrics']['Model MAE'],
        'Validation MSE': results['Validation']['metrics']['Model MSE'],
        'Validation Improvement MAE': results['Validation']['metrics']['Improvement MAE'],
        'Validation Improvement MSE': results['Validation']['metrics']['Improvement MSE'],
    })
    # Add test metrics
    entry.update({
        'Test MAE': results['Test']['metrics']['Model MAE'],
        'Test MSE': results['Test']['metrics']['Model MSE'],
        'Test Improvement MAE': results['Test']['metrics']['Improvement MAE'],
        'Test Improvement MSE': results['Test']['metrics']['Improvement MSE'],
    })
    data.append(entry)

# Convert the list of dictionaries into a DataFrame
results_df = pd.DataFrame(data)

sorted_results_df = results_df.sort_values('Validation Improvement MAE')


Grid Search Progress: 100%|██████████| 243/243 [00:07<00:00, 31.47it/s]


In [51]:
sorted_results_df.sort_values('Validation Improvement MAE')

Unnamed: 0,test_size,validation_size,length_scale,alpha,n_restarts_optimizer,lower_scale_choice,upper_scale_choice,Validation MAE,Validation MSE,Validation Improvement MAE,Validation Improvement MSE,Test MAE,Test MSE,Test Improvement MAE,Test Improvement MSE
227,0.25,0.2,1.00,0.1,1,1,10,0.126776,0.023024,0.000792,-0.066634,0.129182,0.023780,-0.043385,-0.075650
74,0.25,0.2,0.01,0.1,3,1,10,0.126776,0.023024,0.000792,-0.066634,0.129182,0.023780,-0.043385,-0.075650
73,0.25,0.2,0.01,0.1,3,1,3,0.126776,0.023024,0.000792,-0.066634,0.129182,0.023780,-0.043385,-0.075650
234,0.25,0.2,1.00,0.1,3,1,1,0.126776,0.023024,0.000792,-0.066634,0.129182,0.023780,-0.043385,-0.075650
54,0.25,0.2,0.01,0.1,0,1,1,0.126776,0.023024,0.000792,-0.066634,0.129182,0.023780,-0.043385,-0.075650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,0.25,0.2,0.01,0.1,1,3,10,0.872689,0.781715,5.889143,30.689536,0.864387,0.769402,5.400938,28.907476
58,0.25,0.2,0.01,0.1,0,3,3,0.872689,0.781715,5.889143,30.689536,0.864387,0.769402,5.400938,28.907476
56,0.25,0.2,0.01,0.1,0,1,10,0.872689,0.781715,5.889143,30.689536,0.864387,0.769402,5.400938,28.907476
59,0.25,0.2,0.01,0.1,0,3,10,0.872689,0.781715,5.889143,30.689536,0.864387,0.769402,5.400938,28.907476
