#### Importing Libraries

In [22]:
import threading

import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

#### Loading Dataset

In [23]:
# Load the 'healthexp' example dataset through seaborn's dataset loader.
healthexp = sns.load_dataset('healthexp')
# Preview the first rows to check the columns that were loaded.
healthexp.head()

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9


In [24]:
# One-hot encode the categorical column(s) so every feature is numeric/boolean.
# NOTE: this rebinds `healthexp`; the un-encoded frame from the loading cell is
# no longer reachable under this name after the cell has run.
healthexp = pd.get_dummies(healthexp)
# Show the first ten rows of the encoded frame.
healthexp.head(10)

Unnamed: 0,Year,Spending_USD,Life_Expectancy,Country_Canada,Country_France,Country_Germany,Country_Great Britain,Country_Japan,Country_USA
0,1970,252.311,70.6,False,False,True,False,False,False
1,1970,192.143,72.2,False,True,False,False,False,False
2,1970,123.993,71.9,False,False,False,True,False,False
3,1970,150.437,72.0,False,False,False,False,True,False
4,1970,326.961,70.9,False,False,False,False,False,True
5,1971,313.391,72.8,True,False,False,False,False,False
6,1971,298.251,70.8,False,False,True,False,False,False
7,1971,134.172,71.9,False,False,False,True,False,False
8,1971,163.854,72.9,False,False,False,False,True,False
9,1971,357.988,71.2,False,False,False,False,False,True


In [25]:
# Separate the regression target from the feature matrix:
# the target is life expectancy, the features are all remaining columns.
y = healthexp['Life_Expectancy']
X = healthexp.drop(columns=['Life_Expectancy'])

In [26]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# Baseline model: random forest with default hyperparameters and a fixed seed.
rfr = RandomForestRegressor(random_state=42)

In [28]:
# Fit the baseline forest on the training split.
rfr.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [29]:
# Baseline predictions for the held-out test set.
y_pred = rfr.predict(X_test)

In [30]:
# Baseline error metrics on the test set.
MAE = mean_absolute_error(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
# RMSE is derived from the MSE so both metrics stay consistent.
RMSE = np.sqrt(MSE)
R2 = r2_score(y_test, y_pred)

In [31]:
# Report the baseline metrics computed in the previous cell.
print('Mean Absolute Error:', MAE)
print('Mean Squared Error:', MSE)
print('Root Mean Squared Error:', RMSE)
print('R^2 Score:', R2)

Mean Absolute Error: 0.274527272727264
Mean Squared Error: 0.12436518181817355
Root Mean Squared Error: 0.35265447936779926
R^2 Score: 0.9898132982462418


#### Hyperparameter Tuning with Optuna

In [32]:
def objective(trial: optuna.trial.Trial):
    """Objective for the Optuna study: cross-validated RMSE of a random forest.

    Samples the forest's hyperparameters from ``trial``, scores the resulting
    model with 3-fold cross-validation on the notebook-level ``X_train`` /
    ``y_train`` split, and returns the square root of the mean fold MSE
    (lower is better).
    """
    # Dict literals are evaluated top to bottom, so the parameters are sampled
    # in the order: n_estimators, max_depth, min_samples_split, min_samples_leaf.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 32),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 32),
    }
    forest = RandomForestRegressor(random_state=42, **params)

    neg_mse_per_fold = cross_val_score(
        forest,
        X_train,
        y_train,
        scoring='neg_mean_squared_error',
        cv=3,
        n_jobs=-1,
    )
    # The scorer yields negated MSE values; flip the sign before taking the root.
    return np.sqrt(-neg_mse_per_fold.mean())

In [33]:
class EarlyStoppingCallback:
    """
    Early stopping callback for Optuna studies.

    Stops the study once the study's best value has failed to improve by more
    than ``min_delta`` for ``patience`` consecutive callback invocations.
    The first invocation that sees a completed trial only records the baseline
    and does not count towards ``patience``.

    Invocations are serialised with a lock so that the counter and the recorded
    best value stay consistent when ``study.optimize`` runs trials in parallel
    threads (``n_jobs != 1``).
    """
    def __init__(self, patience: int, min_delta: float = 0.0):
        """
        Args:
            patience (int): Number of trials to wait for improvement before stopping.
            min_delta (float): Minimum change in the monitored value to qualify as an improvement.
        """
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_value = None
        # Guards counter / best_value against concurrent callback invocations.
        self._lock = threading.Lock()

    def __call__(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial):
        """
        Update the no-improvement counter after a trial and stop the study when
        the counter reaches ``patience``.

        Args:
            study: The study being optimized; its best value and direction are read.
            trial: The trial that just finished (not inspected).
        """
        with self._lock:
            try:
                current_best = study.best_value
            except ValueError:
                # No trial has completed yet (e.g. the first trials failed or
                # were pruned), so there is nothing to compare against.
                return

            if self.best_value is None:
                # First usable observation: record the baseline only.
                self.best_value = current_best
                return

            if study.direction == optuna.study.StudyDirection.MINIMIZE:
                improved = current_best < self.best_value - self.min_delta
            else:
                improved = current_best > self.best_value + self.min_delta

            if improved:
                self.best_value = current_best
                self.counter = 0
            else:
                self.counter += 1

            if self.counter >= self.patience:
                study.stop()
                print(f'Early stopping triggered after {self.counter} trials with no improvement.')

In [34]:
# We want to minimize RMSE, so the study direction is 'minimize'.
# The RandomSampler is seeded (same seed as the models' random_state) so the
# sampled hyperparameters can be reproduced. Running trials in parallel
# (n_jobs != 1 in study.optimize) can still change the order in which trials
# are sampled and finished, so use n_jobs=1 when exact repeatability matters.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.RandomSampler(seed=42),
)

[I 2025-11-14 22:43:12,710] A new study created in memory with name: no-name-51fa876e-b725-46af-8497-27ebf69938f2


In [53]:
# Early-stopping hook with a patience of 5 trials. It is created here but not
# attached to this run; pass callbacks=[early_stopping] to study.optimize to use it.
early_stopping = EarlyStoppingCallback(patience=5)

# Run 100 trials, using all available CPU cores.
study.optimize(objective, n_trials=100, n_jobs=-1)

[I 2025-11-14 22:48:36,895] Trial 70 finished with value: 1.1261408455892088 and parameters: {'n_estimators': 145, 'max_depth': 43, 'min_samples_split': 28, 'min_samples_leaf': 2}. Best is trial 26 with value: 0.6423643673233009.
[I 2025-11-14 22:48:37,869] Trial 73 finished with value: 1.541162305956006 and parameters: {'n_estimators': 507, 'max_depth': 39, 'min_samples_split': 4, 'min_samples_leaf': 19}. Best is trial 26 with value: 0.6423643673233009.
[I 2025-11-14 22:48:37,897] Trial 71 finished with value: 1.4272177778970279 and parameters: {'n_estimators': 648, 'max_depth': 28, 'min_samples_split': 3, 'min_samples_leaf': 12}. Best is trial 26 with value: 0.6423643673233009.
[I 2025-11-14 22:48:38,823] Trial 74 finished with value: 1.7788123761687324 and parameters: {'n_estimators': 485, 'max_depth': 33, 'min_samples_split': 12, 'min_samples_leaf': 26}. Best is trial 26 with value: 0.6423643673233009.
[I 2025-11-14 22:48:39,137] Trial 75 finished with value: 0.918383929133433 and 

In [55]:
# Hyperparameters of the best trial found by the study.
study.best_params

{'n_estimators': 727,
 'max_depth': 19,
 'min_samples_split': 9,
 'min_samples_leaf': 1}

In [57]:
# Refit a forest with the best hyperparameters found by the study and score it
# on the held-out test set, using the same metrics as the baseline model.
optuna_rfr = RandomForestRegressor(**study.best_params, random_state=42)
optuna_rfr.fit(X_train, y_train)

# Keep the tuned predictions under their own name so the baseline `y_pred`
# is not overwritten; re-running the baseline metrics cell stays correct.
y_pred_optuna = optuna_rfr.predict(X_test)

MAE_optuna = mean_absolute_error(y_test, y_pred_optuna)
MSE_optuna = mean_squared_error(y_test, y_pred_optuna)
RMSE_optuna = np.sqrt(MSE_optuna)
R2_optuna = r2_score(y_test, y_pred_optuna)

print('MAE:', MAE_optuna)
print('MSE:', MSE_optuna)
print('RMSE:', RMSE_optuna)
print('R2:', R2_optuna)

MAE: 0.41492813501365927
MSE: 0.2351499653555023
RMSE: 0.4849226385264997
R2: 0.9807389614242252


In [52]:
# Best objective value (cross-validated RMSE) reached during the study.
study.best_value

0.6423643673233009

In [39]:
# Objective value of each trial over the course of the study.
optuna.visualization.plot_optimization_history(study)

In [40]:
# Parallel-coordinate view of the sampled hyperparameter combinations and their objective values.
optuna.visualization.plot_parallel_coordinate(study)

In [41]:
# Objective value plotted against each of the four tuned hyperparameters individually.
optuna.visualization.plot_slice(study, params=['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf'])

In [42]:
# Estimated importance of each hyperparameter for the objective value.
optuna.visualization.plot_param_importances(study)