Importing libraries

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import optuna

Loading data

In [2]:
df = pd.read_csv('../data/autos_clean.csv')

Splitting the dataset

In [3]:
X = df.drop('price', axis=1) # Predictors
y = df['price'] # Explained variable
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    random_state=2812) # Using random_state for reproducibility

Setting common hyperparameters

In [4]:
common_params = {'n_jobs':-1, 
                 'random_state':2812} # Using random_state for reproducibility

Training the baseline model

In [5]:
baseline_model = RandomForestRegressor(min_samples_split=8,
                                       n_estimators=15,
                                       **common_params)
baseline_model.fit(X_train, y_train)

Testing the baseline model

In [6]:
y_pred_test = baseline_model.predict(X_test)
y_pred_train = baseline_model.predict(X_train)


print(f"Train R^2: {np.round(r2_score(y_train, y_pred_train),2)}")
print(f"Test R^2: {np.round(r2_score(y_test, y_pred_test),2)}")
print(f"Diff: {np.round(r2_score(y_train, y_pred_train) - r2_score(y_test, y_pred_test),2)}")

Train R^2: 0.94
Test R^2: 0.87
Diff: 0.07


Declaring optuna objective function

In [7]:
def objective_1(trial): 
    # Maximizing the test R2 score to find hyperparameters that reduce overfitting the most without sacrificing a lot of performance
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    min_samples_split = trial.suggest_int('min_samples_split', 5, 25)

    model = RandomForestRegressor(n_estimators=n_estimators, 
                                  min_samples_split=min_samples_split, 
                                  **common_params)
    model.fit(X_train, y_train)
    
    y_pred_test = model.predict(X_test)
    score = r2_score(y_test, y_pred_test) 
    return score

Running the test

In [8]:
study_1 = optuna.create_study(direction='maximize')

[I 2025-06-06 17:39:57,644] A new study created in memory with name: no-name-7498ab77-f6ce-47d2-b884-e69d076a0f59


In [9]:
study_1.optimize(objective_1, n_trials=50, n_jobs=-1)

[I 2025-06-06 17:40:38,827] Trial 4 finished with value: 0.8684323158055867 and parameters: {'n_estimators': 11, 'min_samples_split': 17}. Best is trial 4 with value: 0.8684323158055867.
[I 2025-06-06 17:41:29,145] Trial 10 finished with value: 0.8718761279935423 and parameters: {'n_estimators': 40, 'min_samples_split': 14}. Best is trial 10 with value: 0.8718761279935423.
[I 2025-06-06 17:41:37,404] Trial 0 finished with value: 0.8714609299353582 and parameters: {'n_estimators': 43, 'min_samples_split': 14}. Best is trial 10 with value: 0.8718761279935423.
[I 2025-06-06 17:41:39,453] Trial 14 finished with value: 0.8698781835284332 and parameters: {'n_estimators': 44, 'min_samples_split': 18}. Best is trial 10 with value: 0.8718761279935423.
[I 2025-06-06 17:41:39,505] Trial 13 finished with value: 0.871069110213035 and parameters: {'n_estimators': 44, 'min_samples_split': 15}. Best is trial 10 with value: 0.8718761279935423.
[I 2025-06-06 17:41:39,530] Trial 3 finished with value: 0.

In [10]:
print("Best hyperparameters:", study_1.best_params)
print("Best test set R2 score:", study_1.best_value)

Best hyperparameters: {'n_estimators': 199, 'min_samples_split': 7}
Best test set R2 score: 0.875140264589207


Training the optimized model

In [11]:
best_model_1 = RandomForestRegressor(**study_1.best_params,
                                   **common_params)
best_model_1.fit(X_train, y_train)

Testing the optimized model

In [12]:
y_pred_test = best_model_1.predict(X_test)
y_pred_train = best_model_1.predict(X_train)

print(f"Train R^2: {np.round(r2_score(y_train, y_pred_train),2)}")
print(f"Test R^2: {np.round(r2_score(y_test, y_pred_test),2)}")
print(f"Diff: {np.round(r2_score(y_train, y_pred_train) - r2_score(y_test, y_pred_test),2)}")

Train R^2: 0.94
Test R^2: 0.88
Diff: 0.07


Although the difference between train and test scores hasn't improved dramatically, we were still able to find a better set of hypereparameters with optuna. A more drastic approach to reducing overfitting would be to minimize the difference between train and test scores. 

In [13]:
def objective_2(trial): 
    # Minimizing the difference between train and test scores for maximum generalisation
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    min_samples_split = trial.suggest_int('min_samples_split', 5, 25)

    model = RandomForestRegressor(n_estimators=n_estimators, 
                                  min_samples_split=min_samples_split, 
                                  **common_params)
    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    score = r2_score(y_train, y_pred_train) - r2_score(y_test, y_pred_test) 
    return score

In [14]:
study_2 = optuna.create_study(direction='minimize') 

[I 2025-06-06 17:50:37,247] A new study created in memory with name: no-name-91de0d41-b114-4746-a514-5e2c7151932e


In [15]:
study_2.optimize(objective_2, n_trials=50, n_jobs=-1)

[I 2025-06-06 17:51:09,167] Trial 0 finished with value: 0.057452232402123715 and parameters: {'n_estimators': 15, 'min_samples_split': 13}. Best is trial 0 with value: 0.057452232402123715.
[I 2025-06-06 17:51:09,603] Trial 11 finished with value: 0.059878896318221964 and parameters: {'n_estimators': 14, 'min_samples_split': 11}. Best is trial 0 with value: 0.057452232402123715.
[I 2025-06-06 17:51:34,531] Trial 1 finished with value: 0.052513559044514935 and parameters: {'n_estimators': 25, 'min_samples_split': 16}. Best is trial 1 with value: 0.052513559044514935.
[I 2025-06-06 17:51:35,304] Trial 15 finished with value: 0.049264207490668954 and parameters: {'n_estimators': 20, 'min_samples_split': 18}. Best is trial 15 with value: 0.049264207490668954.
[I 2025-06-06 17:52:14,034] Trial 5 finished with value: 0.045317153802411414 and parameters: {'n_estimators': 59, 'min_samples_split': 21}. Best is trial 5 with value: 0.045317153802411414.
[I 2025-06-06 17:52:31,552] Trial 17 finis

In [16]:
print("Best hyperparameters:", study_2.best_params)
print("Best test set R2 score:", study_2.best_value)

Best hyperparameters: {'n_estimators': 160, 'min_samples_split': 25}
Best test set R2 score: 0.04070115417628706


Training the optimized model

In [17]:
best_model_2 = RandomForestRegressor(**study_2.best_params,
                                   **common_params)
best_model_2.fit(X_train, y_train)

Testing the optimized model

In [18]:
y_pred_test = best_model_2.predict(X_test)
y_pred_train = best_model_2.predict(X_train)

print(f"Train R^2: {np.round(r2_score(y_train, y_pred_train),2)}")
print(f"Test R^2: {np.round(r2_score(y_test, y_pred_test),2)}")
print(f"Diff: {np.round(r2_score(y_train, y_pred_train) - r2_score(y_test, y_pred_test),2)}")

Train R^2: 0.91
Test R^2: 0.87
Diff: 0.04


Now the model generalizes almost twice as good as the baseline model. Ideally, we should optimise other parameters as well, such as max_depth