In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import ast

In [2]:
# Two functions to load the train/test data and the hyperparameter tuning results

def load_data(X_train_file_path, X_test_file_path, y_train_file_path, y_test_file_path):
    """Read the train/test features and targets from four CSV files.

    Parameters
    ----------
    X_train_file_path, X_test_file_path : str or path-like
        CSV files read as-is into DataFrames.
    y_train_file_path, y_test_file_path : str or path-like
        CSV files holding the targets. A file with exactly one column is
        returned as a Series; any other shape stays a DataFrame.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` in that order.
    """
    X_train = pd.read_csv(X_train_file_path)
    X_test = pd.read_csv(X_test_file_path)

    # Squeeze only the column axis. A bare .squeeze() would also collapse a
    # one-row target file into a scalar, which has no len()/index for the
    # metric and fitting code that consumes these values.
    y_train = pd.read_csv(y_train_file_path).squeeze("columns")
    y_test = pd.read_csv(y_test_file_path).squeeze("columns")

    return X_train, X_test, y_train, y_test

def load_results(hyperparameter_results_file_path):
    """Load the hyperparameter tuning results CSV into a DataFrame.

    Parameters
    ----------
    hyperparameter_results_file_path : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pd.read_csv``.
    """
    return pd.read_csv(hyperparameter_results_file_path)
    

In [3]:
# This function creates a naive baseline prediction: the average of all the y_test values it is given.
# We then calculate the MAE obtained by predicting that average for every test sample

def baseline_MAE(y_test, y_train=None):
    """Return the MAE of a constant mean predictor on ``y_test``, rounded to 3 decimals.

    Parameters
    ----------
    y_test : Series-like
        True target values the baseline is scored against. Must support
        ``.mean()`` and ``len()``.
    y_train : Series-like, optional
        When given, the constant prediction is the mean of these values, so
        the baseline does not peek at the labels it is evaluated on. When
        omitted, the mean of ``y_test`` itself is used.

    Returns
    -------
    float
        Mean absolute error of the constant prediction, rounded to 3 decimals.
    """
    # Pick which labels define the constant prediction.
    reference_values = y_test if y_train is None else y_train
    constant_prediction = reference_values.mean()

    # One identical prediction per test sample.
    y_pred_naive = np.full(len(y_test), constant_prediction)

    mae_baseline = mean_absolute_error(y_test, y_pred_naive)

    return round(mae_baseline, 3)
    

In [4]:
# Now let's select the tuned model with the best test MAE among those that did not overfit (train/test MAE gap within a threshold), so it can be compared to the baseline MAE

def best_tuned_model(hyperp_results, percent_threshold):
    """Pick the hyperparameter set with the lowest test MAE among non-overfitting runs.

    A run counts as acceptable when its test MAE exceeds its train MAE by at
    most ``percent_threshold`` percent of the train MAE.

    Parameters
    ----------
    hyperp_results : pandas.DataFrame
        Tuning results with ``'params'``, ``'train_mae'`` and ``'test_mae'``
        columns. It is not modified.
    percent_threshold : float
        Largest accepted value of ``(test_mae - train_mae) / train_mae * 100``.

    Returns
    -------
    tuple
        ``(best_params, best_test_mae)``: the ``'params'`` entry of the
        selected row and its test MAE rounded to 3 decimals.

    Raises
    ------
    ValueError
        If no row satisfies the overfitting threshold.
    """
    train_mae = hyperp_results['train_mae']
    test_mae = hyperp_results['test_mae']

    # Relative train -> test gap in percent. Kept in a local Series rather than
    # written back as a column, so the caller's DataFrame stays untouched.
    mae_percentage_difference = ((test_mae - train_mae) / train_mae) * 100

    # Keep only the runs whose gap is within the accepted threshold
    acceptable = hyperp_results[mae_percentage_difference <= percent_threshold]
    if acceptable.empty:
        raise ValueError(
            f"No hyperparameter set has a train/test MAE difference within {percent_threshold} percent"
        )

    # Among the runs that showed little overfitting, take the one with the best test MAE
    best = acceptable.loc[acceptable['test_mae'].idxmin()]

    return best['params'], round(best['test_mae'], 3)
    
        

In [5]:
# Making the final best_model to be saved as our final model for implementation

def best_model(best_params, X_train, y_train):
    """Build a RandomForestRegressor from ``best_params`` and fit it on the training data.

    Parameters
    ----------
    best_params : dict
        Keyword arguments forwarded unchanged to ``RandomForestRegressor``.
    X_train, y_train
        Training features and targets passed straight to ``fit``.

    Returns
    -------
    RandomForestRegressor
        The fitted estimator.
    """
    # fit() hands back the estimator itself, so construction and training chain
    return RandomForestRegressor(**best_params).fit(X_train, y_train)
    

In [6]:
if __name__ == '__main__':
    # Driver cell: load the data and tuning results, report the baseline MAE,
    # select the best non-overfitting hyperparameters, then fit and save the model.

    # Paths to training data and hyperparameter tuning results
    hyperparameter_results_file_path = '../Data/hyperparameter_results.csv'
    X_train_file_path = '../Data/X_train_dataset.csv'
    X_test_file_path = '../Data/X_test_dataset.csv'
    y_train_file_path = '../Data/y_train_dataset.csv'
    y_test_file_path = '../Data/y_test_dataset.csv'

    # Executing function to load train data and test data
    X_train, X_test, y_train, y_test = load_data(X_train_file_path, X_test_file_path, y_train_file_path, y_test_file_path)

    # Executing function to load hyperparameter results
    hyperparameter_results = load_results(hyperparameter_results_file_path)

    # Executing naive, baseline MAE prediction for the test data
    mae_baseline = baseline_MAE(y_test)
    print(f"The baseline MAE, based on averaging all the y_test values, is {mae_baseline}. This is the value we are aiming to beat.")

    # Executing function to evaluate the best performing model
    percent_threshold = 20       # the percentage difference between test and train MAE to determine overfitting acceptability
    best_params, best_test_mae = best_tuned_model(hyperparameter_results, percent_threshold)
    print(f"The best model hyperparameters are {best_params}")
    print(f"This best model has a test MAE of {best_test_mae}")

    # Relative MAE reduction versus the baseline: 0 means no better than the
    # baseline, 100 means a perfect model. The plain ratio best/baseline only
    # equals this figure when it happens to be exactly 50 percent.
    mae_improvement_percent = (1 - best_test_mae / mae_baseline) * 100
    print(f"The best model performs {mae_improvement_percent:.1f} percent more accurate than our baseline model")

    # Saving best model. best_params is still the string read from the CSV, so
    # it is parsed into a dict first. The fitted model gets its own name so the
    # best_model() function is not overwritten and this cell can be re-run.
    final_model = best_model(ast.literal_eval(best_params), X_train, y_train)
    joblib.dump(final_model, '../Models/Final_Model.pkl')
    print("💾 final model has been saved as 'Final_Model.pkl'")

    
    

The baseline MAE, based on averaging all the y_test values, is 0.248. This is the value we are aiming to beat.
The best model hyperparameters are {'bootstrap': False, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 500}
This best model has a test MAE of 0.124
The best model performs 50.0 percent more accurate than our baseline model
💾 final model has been saved as 'Final_Model.pkl'
