In [9]:
import joblib
import pandas as pd
import numpy as np 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import os
from joblib import Parallel, delayed

In [10]:
# Two functions to load the baseline RandomForest model and the train/test data

def load_model(best_model_path):
    """Load the persisted baseline model from disk.

    Parameters
    ----------
    best_model_path : str or path-like
        Location of the serialized model file; passed unchanged to
        ``joblib.load``.

    Returns
    -------
    object
        Whatever object ``joblib.load`` deserializes from the file.
    """
    # NOTE(review): joblib.load is pickle-based, so only point this at
    # model files that come from a trusted source.
    return joblib.load(best_model_path)

def load_data(X_train_file_path, X_test_file_path, y_train_file_path, y_test_file_path):
    """Read the train/test feature and target splits from CSV files.

    Parameters
    ----------
    X_train_file_path, X_test_file_path : str or path-like
        CSV files holding the training and test features.
    y_train_file_path, y_test_file_path : str or path-like
        CSV files holding the training and test targets.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The feature sets are
        DataFrames. A single-column target file is returned as a Series;
        a target file with several columns stays a DataFrame.
    """
    X_train = pd.read_csv(X_train_file_path)
    X_test = pd.read_csv(X_test_file_path)

    # Squeeze the column axis only: a bare .squeeze() would also collapse a
    # one-row target file into a scalar instead of a length-1 Series.
    y_train = pd.read_csv(y_train_file_path).squeeze("columns")
    y_test = pd.read_csv(y_test_file_path).squeeze("columns")

    return X_train, X_test, y_train, y_test
    

In [11]:
# Function to perform the hyperparameter tuning

def tune_hyperparameters(X_train, y_train, param_grid=None, cv=5):
    """Run an exhaustive grid search over RandomForestRegressor settings.

    Candidates are scored with negative mean absolute error under ``cv``-fold
    cross-validation, and training scores are recorded as well.

    Parameters
    ----------
    X_train : array-like
        Training features passed to ``GridSearchCV.fit``.
    y_train : array-like
        Training targets passed to ``GridSearchCV.fit``.
    param_grid : dict, optional
        Hyperparameter grid to search. When omitted, the default grid below
        is used (5 * 4 * 4 * 5 * 2 * 2 = 1600 combinations).
    cv : int, optional
        Number of cross-validation folds; defaults to 5.

    Returns
    -------
    dict
        The ``cv_results_`` attribute of the fitted search.
    """
    if param_grid is None:
        param_grid = {
            "n_estimators": [50, 100, 200, 300, 500],
            "max_depth": [10, 20, 30, None],
            "min_samples_split": [2, 5, 10, 15],
            "min_samples_leaf": [1, 2, 4, 6, 8],
            "bootstrap": [True, False],
            "max_features": ['sqrt', 'log2']
        }

    # The grid search already spreads its fits over every core (n_jobs=-1).
    # Each forest is therefore fit single-threaded, so the two levels of
    # parallelism do not oversubscribe the CPUs.
    base_model = RandomForestRegressor(random_state=42, n_jobs=1)
    search = GridSearchCV(
        base_model,
        param_grid,
        cv=cv,
        scoring="neg_mean_absolute_error",
        n_jobs=-1,
        verbose=2,
        return_train_score=True,  # needed later to report the training MAE
    )

    # Performing hyperparameter tuning
    search.fit(X_train, y_train)
    print('Hyperparameter tuning has now finished')

    return search.cv_results_

In [1]:
# For every set of hyperparameters (from cv_results_), re-train the model on the training data
# and evaluate it on the test data (computing MAE). The results are returned as a table so that
# the candidate with the lowest test MAE can be identified afterwards.

def best_model_evaluation(cv_results, X_train, X_test, y_train, y_test):
    """Refit every grid-search candidate and score it on the test split.

    Parameters
    ----------
    cv_results : mapping
        Grid-search results; the ``'params'`` and ``'mean_train_score'``
        entries are read. The training scores are negated to turn them into
        positive MAE values.
    X_train, y_train : array-like
        Data each candidate is refit on.
    X_test, y_test : array-like
        Data each refit candidate is scored on.

    Returns
    -------
    pandas.DataFrame
        One row per candidate, in the order of ``cv_results['params']``, with
        the columns ``params``, ``train_mae`` and ``test_mae``.
    """
    print('evaluating grid search results for best test MAE')
    params_list = cv_results['params']
    # Convert the (negative) mean training scores to positive MAE values
    train_mae_list = [-score for score in cv_results['mean_train_score']]

    def evaluate_candidate(params):
        # Single-threaded fit: the outer Parallel call already uses every
        # core, so n_jobs=-1 here would oversubscribe the CPUs.
        model = RandomForestRegressor(**params, random_state=42, n_jobs=1)
        model.fit(X_train, y_train)
        # Return only the score. Sending each fitted forest back to the
        # parent would serialize and keep every model in memory even though
        # nothing downstream uses them.
        return mean_absolute_error(y_test, model.predict(X_test))

    # Evaluate all candidate hyperparameter combinations in parallel.
    test_mae_list = Parallel(n_jobs=-1)(
        delayed(evaluate_candidate)(params) for params in params_list
    )

    # Assemble all results into a DataFrame for inspection or saving.
    all_results = [
        {"params": p, "train_mae": t, "test_mae": te}
        for p, t, te in zip(params_list, train_mae_list, test_mae_list)
    ]
    # Fixing the columns keeps the schema stable even with zero candidates.
    results_df = pd.DataFrame(all_results, columns=["params", "train_mae", "test_mae"])

    return results_df
    
    

In [13]:
if __name__ == '__main__':
    # Locations of the persisted baseline model and of the train/test CSV splits.
    best_model_path = '../Models/RandomForest_best.pkl'
    X_train_file_path = '../Data/X_train_dataset.csv'
    X_test_file_path = '../Data/X_test_dataset.csv'
    y_train_file_path = '../Data/y_train_dataset.csv'
    y_test_file_path = '../Data/y_test_dataset.csv'

    # Step 1: load the baseline RandomForest model.
    # NOTE(review): best_model is not used again in this cell; it is kept in
    # case later cells rely on it.
    best_model = load_model(best_model_path)

    # Step 2: read the four CSV splits.
    X_train, X_test, y_train, y_test = load_data(
        X_train_file_path=X_train_file_path,
        X_test_file_path=X_test_file_path,
        y_train_file_path=y_train_file_path,
        y_test_file_path=y_test_file_path,
    )

    # Step 3: grid search on the training split only.
    cv_results = tune_hyperparameters(X_train=X_train, y_train=y_train)

    # Step 4: refit every candidate and score it on the held-out test split.
    results_df = best_model_evaluation(
        cv_results=cv_results,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )

    # Step 5: persist the per-candidate train/test MAE table.
    results_df.to_csv("../Data/hyperparameter_results.csv", index=False)
    print("💾 Hyperparameter tuning results saved to 'hyperparameter_results.csv'")
    

    

Fitting 5 folds for each of 1600 candidates, totalling 8000 fits
Hyperparameter tuning has now finished
evaluating grid search results for best test MAE
Best hyperparameters (based on test MAE): {'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
Test MAE for the best model: 0.11448929596690186
✅ Best Tuned Model saved as 'best_tuned_model.pkl'
✅ Hyperparameter tuning results saved to 'hyperparameter_results.csv'
