In [1]:
import pandas as pd
import seaborn as sns


In [2]:
health_exp = sns.load_dataset('healthexp')

In [3]:
health_exp.head()

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9


In [4]:
health_exp = pd.get_dummies(health_exp)

In [5]:
health_exp.head()

Unnamed: 0,Year,Spending_USD,Life_Expectancy,Country_Canada,Country_France,Country_Germany,Country_Great Britain,Country_Japan,Country_USA
0,1970,252.311,70.6,False,False,True,False,False,False
1,1970,192.143,72.2,False,True,False,False,False,False
2,1970,123.993,71.9,False,False,False,True,False,False
3,1970,150.437,72.0,False,False,False,False,True,False
4,1970,326.961,70.9,False,False,False,False,False,True


In [6]:
X = health_exp.drop(['Life_Expectancy'], axis=1)
y = health_exp['Life_Expectancy']

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train , X_test, y_train , y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [9]:
from sklearn.ensemble import RandomForestRegressor

In [10]:
rf = RandomForestRegressor(random_state=42)

In [11]:
rf.fit(X_train, y_train)

In [12]:
y_pred =rf.predict(X_test)

In [13]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [14]:
mean_squared_error(y_test, y_pred)


0.12436518181817355

In [15]:
mean_absolute_error(y_test, y_pred)


0.274527272727264

In [16]:
r2_score(y_test, y_pred)

0.9898132982462418

In [18]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.5.0


In [19]:
import optuna

In [20]:
from sklearn.model_selection import cross_val_score

In [32]:
def objective(trial):
  n_estimators = trial.suggest_int('n_estimators', 100, 1000)
  max_depth = trial.suggest_int('max_depth', 10, 50)
  min_samples_split = trial.suggest_int('min_samples_split', 2, 32)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 32)

  model = RandomForestRegressor(
      n_estimators=n_estimators,
      max_depth=max_depth,
      min_samples_split=min_samples_split,
      min_samples_leaf=min_samples_leaf,
      random_state=42) # Added random_state for reproducibility

  score = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

  return score.mean() # Return the mean of the scores

In [29]:
study = optuna.create_study(direction='maximize')


[I 2025-11-02 15:49:04,910] A new study created in memory with name: no-name-0c55de7e-2e34-40cd-bcf7-7635e29c5cc0


In [34]:
study.optimize(objective, n_trials=200)

[I 2025-11-02 16:00:39,534] Trial 7 finished with value: -2.7875947598321495 and parameters: {'n_estimators': 179, 'max_depth': 33, 'min_samples_split': 14, 'min_samples_leaf': 27}. Best is trial 5 with value: -2.5722624356584376.
[I 2025-11-02 16:00:40,379] Trial 8 finished with value: -2.4175773392338464 and parameters: {'n_estimators': 124, 'max_depth': 20, 'min_samples_split': 12, 'min_samples_leaf': 21}. Best is trial 8 with value: -2.4175773392338464.
[I 2025-11-02 16:00:45,953] Trial 9 finished with value: -2.3784900980300705 and parameters: {'n_estimators': 690, 'max_depth': 35, 'min_samples_split': 17, 'min_samples_leaf': 20}. Best is trial 9 with value: -2.3784900980300705.
[I 2025-11-02 16:00:51,487] Trial 10 finished with value: -2.53431816199118 and parameters: {'n_estimators': 866, 'max_depth': 36, 'min_samples_split': 25, 'min_samples_leaf': 24}. Best is trial 9 with value: -2.3784900980300705.
[I 2025-11-02 16:00:55,329] Trial 11 finished with value: -3.343303867628954 

In [35]:
study.best_params

{'n_estimators': 274,
 'max_depth': 12,
 'min_samples_split': 2,
 'min_samples_leaf': 1}

In [36]:
import matplotlib.pyplot as plt

In [37]:
optuna.visualization.plot_optimization_history(study)


In [38]:
optuna.visualization.plot_parallel_coordinate(study)

In [39]:
optuna.visualization.plot_slice(study, params=['n_estimators', 'max_depth','min_samples_split', 'min_samples_leaf'])

In [40]:
optuna.visualization.plot_param_importances(study)

In [41]:
best_n_estimators = study.best_params['n_estimators']
best_max_depth = study.best_params['max_depth']
best_min_samples_split = study.best_params['min_samples_split']
best_min_samples_leaf = study.best_params['min_samples_leaf']

In [42]:
best_model = RandomForestRegressor(
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    min_samples_split=best_min_samples_split,
    min_samples_leaf=best_min_samples_leaf)

In [43]:
best_model.fit(X_train,y_train)

In [44]:
y_pred = best_model.predict(X_test)

In [45]:
mean_absolute_error(y_test, y_pred)

0.26137973056953157

In [46]:
mean_squared_error(y_test, y_pred)

0.1082646676073263

In [47]:
r2_score(y_test, y_pred)

0.9911320848547625