Importing libraries

In [24]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import optuna

Loading data

In [None]:
df = pd.read_csv('data/autos_clean.csv')

Splitting the dataset

In [26]:
X = df.drop('price', axis=1) # Predictors
y = df['price'] # Explained variable
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    random_state=2812) # Using random_state for reproducibility

Setting common hyperparameters

In [27]:
common_params = {'n_jobs':-1, 
                 'random_state':2812} # Using random_state for reproducibility

Training the baseline model

In [28]:
baseline_model = RandomForestRegressor(min_samples_split=8,
                                       n_estimators=15,
                                       **common_params)
baseline_model.fit(X_train, y_train)

Testing the baseline model

In [29]:
y_pred_test = baseline_model.predict(X_test)
y_pred_train = baseline_model.predict(X_train)


print(f"Train R^2: {np.round(r2_score(y_train, y_pred_train),2)}")
print(f"Test R^2: {np.round(r2_score(y_test, y_pred_test),2)}")
print(f"Diff: {np.round(r2_score(y_train, y_pred_train) - r2_score(y_test, y_pred_test),2)}")

Train R^2: 0.94
Test R^2: 0.87
Diff: 0.07


Declaring optuna objective function

In [30]:
def objective_1(trial): 
    # Maximizing the test R2 score to find hyperparameters that reduce overfitting the most without sacrificing a lot of performance
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    min_samples_split = trial.suggest_int('min_samples_split', 5, 25)

    model = RandomForestRegressor(n_estimators=n_estimators, 
                                  min_samples_split=min_samples_split, 
                                  **common_params)
    model.fit(X_train, y_train)
    
    y_pred_test = model.predict(X_test)
    score = r2_score(y_test, y_pred_test) 
    return score

Running the test

In [31]:
study_1 = optuna.create_study(direction='maximize')

[I 2025-06-04 13:55:24,748] A new study created in memory with name: no-name-6eb542fb-f2b1-4e4d-bc5f-9c140d458e40


In [32]:
study_1.optimize(objective_1, n_trials=50, n_jobs=-1)

[I 2025-06-04 13:56:34,324] Trial 14 finished with value: 0.8740215468052825 and parameters: {'n_estimators': 28, 'min_samples_split': 9}. Best is trial 14 with value: 0.8740215468052825.
[I 2025-06-04 13:56:37,419] Trial 2 finished with value: 0.8730286238276157 and parameters: {'n_estimators': 37, 'min_samples_split': 11}. Best is trial 14 with value: 0.8740215468052825.
[I 2025-06-04 13:56:39,495] Trial 5 finished with value: 0.8713276329340982 and parameters: {'n_estimators': 41, 'min_samples_split': 13}. Best is trial 14 with value: 0.8740215468052825.
[I 2025-06-04 13:56:55,813] Trial 13 finished with value: 0.8716547577738898 and parameters: {'n_estimators': 49, 'min_samples_split': 5}. Best is trial 14 with value: 0.8740215468052825.
[I 2025-06-04 13:57:28,525] Trial 12 finished with value: 0.8680217016787599 and parameters: {'n_estimators': 81, 'min_samples_split': 25}. Best is trial 14 with value: 0.8740215468052825.
[I 2025-06-04 13:57:29,172] Trial 7 finished with value: 0.

In [33]:
print("Best hyperparameters:", study_1.best_params)
print("Best test set R2 score:", study_1.best_value)

Best hyperparameters: {'n_estimators': 148, 'min_samples_split': 9}
Best test set R2 score: 0.8751293869363935


Training the optimized model

In [34]:
best_model_1 = RandomForestRegressor(**study_1.best_params,
                                   **common_params)
best_model_1.fit(X_train, y_train)

Testing the optimized model

In [35]:
y_pred_test = best_model_1.predict(X_test)
y_pred_train = best_model_1.predict(X_train)

print(f"Train R^2: {np.round(r2_score(y_train, y_pred_train),2)}")
print(f"Test R^2: {np.round(r2_score(y_test, y_pred_test),2)}")
print(f"Diff: {np.round(r2_score(y_train, y_pred_train) - r2_score(y_test, y_pred_test),2)}")

Train R^2: 0.94
Test R^2: 0.88
Diff: 0.06


Although the difference between train and test scores hasn't improved dramatically, we were still able to find a better set of hypereparameters with optuna. A more drastic approach to reducing overfitting would be to minimize the difference between train and test scores. 

In [36]:
def objective_2(trial): 
    # Minimizing the difference between train and test scores for maximum generalisation
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    min_samples_split = trial.suggest_int('min_samples_split', 5, 25)

    model = RandomForestRegressor(n_estimators=n_estimators, 
                                  min_samples_split=min_samples_split, 
                                  **common_params)
    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    score = r2_score(y_train, y_pred_train) - r2_score(y_test, y_pred_test) 
    return score

In [37]:
study_2 = optuna.create_study(direction='minimize') 

[I 2025-06-04 14:05:59,824] A new study created in memory with name: no-name-071e0a2c-a1cd-43b5-829d-1af7cb132053


In [38]:
study_2.optimize(objective_2, n_trials=50, n_jobs=-1)

[I 2025-06-04 14:06:57,455] Trial 4 finished with value: 0.05000640320178662 and parameters: {'n_estimators': 12, 'min_samples_split': 17}. Best is trial 4 with value: 0.05000640320178662.
[I 2025-06-04 14:08:06,914] Trial 14 finished with value: 0.04219940911700071 and parameters: {'n_estimators': 59, 'min_samples_split': 24}. Best is trial 14 with value: 0.04219940911700071.
[I 2025-06-04 14:08:11,233] Trial 6 finished with value: 0.05346037245326085 and parameters: {'n_estimators': 66, 'min_samples_split': 15}. Best is trial 14 with value: 0.04219940911700071.
[I 2025-06-04 14:08:15,818] Trial 15 finished with value: 0.06009666335647623 and parameters: {'n_estimators': 65, 'min_samples_split': 11}. Best is trial 14 with value: 0.04219940911700071.
[I 2025-06-04 14:08:18,154] Trial 5 finished with value: 0.042173333872126184 and parameters: {'n_estimators': 87, 'min_samples_split': 24}. Best is trial 5 with value: 0.042173333872126184.
[I 2025-06-04 14:08:21,949] Trial 10 finished wi

In [39]:
print("Best hyperparameters:", study_2.best_params)
print("Best test set R2 score:", study_2.best_value)

Best hyperparameters: {'n_estimators': 33, 'min_samples_split': 25}
Best test set R2 score: 0.04073219167498687


Training the optimized model

In [40]:
best_model_2 = RandomForestRegressor(**study_2.best_params,
                                   **common_params)
best_model_2.fit(X_train, y_train)

Testing the optimized model

In [41]:
y_pred_test = best_model_2.predict(X_test)
y_pred_train = best_model_2.predict(X_train)

print(f"Train R^2: {np.round(r2_score(y_train, y_pred_train),2)}")
print(f"Test R^2: {np.round(r2_score(y_test, y_pred_test),2)}")
print(f"Diff: {np.round(r2_score(y_train, y_pred_train) - r2_score(y_test, y_pred_test),2)}")

Train R^2: 0.91
Test R^2: 0.87
Diff: 0.04


Now the model generalizes almost twice as good as the baseline model. Ideally, we should optimise other parameters as well, such as max_depth