<a href="https://colab.research.google.com/github/MProtik/Mastering_Hyperparameter_Tuning_with_Optuna/blob/main/Mastering_Hyperparameter_Tuning_with_Optuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.4 colorlog-6.9.0 optuna-4.4.0


In [2]:
import seaborn as sns
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [3]:
df = sns.load_dataset('healthexp')
df.head(10)

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9
5,1971,Canada,313.391,72.8
6,1971,Germany,298.251,70.8
7,1971,Great Britain,134.172,71.9
8,1971,Japan,163.854,72.9
9,1971,USA,357.988,71.2


In [4]:
df = pd.get_dummies(df)
df.head(2)

Unnamed: 0,Year,Spending_USD,Life_Expectancy,Country_Canada,Country_France,Country_Germany,Country_Great Britain,Country_Japan,Country_USA
0,1970,252.311,70.6,False,False,True,False,False,False
1,1970,192.143,72.2,False,True,False,False,False,False


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274 entries, 0 to 273
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   274 non-null    int64  
 1   Spending_USD           274 non-null    float64
 2   Life_Expectancy        274 non-null    float64
 3   Country_Canada         274 non-null    bool   
 4   Country_France         274 non-null    bool   
 5   Country_Germany        274 non-null    bool   
 6   Country_Great Britain  274 non-null    bool   
 7   Country_Japan          274 non-null    bool   
 8   Country_USA            274 non-null    bool   
dtypes: bool(6), float64(2), int64(1)
memory usage: 8.2 KB


In [6]:
X = df.loc[:, df.columns != "Life_Expectancy"].values
Y = df.iloc[:, 2].values

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.20, random_state=42)

In [8]:
rfr = RandomForestRegressor(random_state=42)

In [9]:
rfr.fit(X_train, Y_train)

In [10]:
y_pred = rfr.predict(X_test)

In [11]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [12]:
mean_absolute_error(Y_test, y_pred)

0.274527272727264

In [13]:
mean_squared_error(Y_test, y_pred)

0.12436518181817355

In [14]:
r2_score(Y_test, y_pred)

0.9898132982462418

In [15]:
import optuna

In [16]:
from sklearn.model_selection import cross_val_score

In [17]:
def objective(trial):
  n_estimators = trial.suggest_int('n_estimators', 100, 1000)
  max_depth = trial.suggest_int('max_depth', 10, 50)
  min_samples_split = trial.suggest_int('min_samples_split', 2, 32)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 32)

  model = RandomForestRegressor(n_estimators= n_estimators,
                                max_depth=max_depth,
                                min_samples_split=min_samples_split,
                                min_samples_leaf=min_samples_leaf)


  score = cross_val_score(model, X_train, Y_train, cv=5, scoring='neg_mean_squared_error')

  return score.mean()

In [18]:
study = optuna.create_study(direction="maximize")

[I 2025-07-21 13:54:14,888] A new study created in memory with name: no-name-50db3465-bb3a-4969-9c83-754aea158096


In [19]:
study.optimize(objective, n_trials=200)

[I 2025-07-21 13:54:24,594] Trial 0 finished with value: -1.933050860477279 and parameters: {'n_estimators': 610, 'max_depth': 42, 'min_samples_split': 25, 'min_samples_leaf': 12}. Best is trial 0 with value: -1.933050860477279.
[I 2025-07-21 13:54:32,650] Trial 1 finished with value: -3.2401513592393085 and parameters: {'n_estimators': 715, 'max_depth': 30, 'min_samples_split': 31, 'min_samples_leaf': 30}. Best is trial 0 with value: -1.933050860477279.
[I 2025-07-21 13:54:34,719] Trial 2 finished with value: -2.456412742138801 and parameters: {'n_estimators': 290, 'max_depth': 28, 'min_samples_split': 8, 'min_samples_leaf': 22}. Best is trial 0 with value: -1.933050860477279.
[I 2025-07-21 13:54:36,233] Trial 3 finished with value: -1.2757557603782537 and parameters: {'n_estimators': 242, 'max_depth': 12, 'min_samples_split': 17, 'min_samples_leaf': 8}. Best is trial 3 with value: -1.2757557603782537.
[I 2025-07-21 13:54:40,701] Trial 4 finished with value: -2.4151381250848667 and pa

In [26]:
study.best_trial

FrozenTrial(number=171, state=1, values=[-0.1832914255311086], datetime_start=datetime.datetime(2025, 7, 21, 14, 10, 37, 948323), datetime_complete=datetime.datetime(2025, 7, 21, 14, 10, 41, 959000), params={'n_estimators': 543, 'max_depth': 47, 'min_samples_split': 2, 'min_samples_leaf': 1}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=1000, log=False, low=100, step=1), 'max_depth': IntDistribution(high=50, log=False, low=10, step=1), 'min_samples_split': IntDistribution(high=32, log=False, low=2, step=1), 'min_samples_leaf': IntDistribution(high=32, log=False, low=1, step=1)}, trial_id=171, value=None)

In [20]:
study.best_params

{'n_estimators': 543,
 'max_depth': 47,
 'min_samples_split': 2,
 'min_samples_leaf': 1}

In [40]:
best_params = study.best_params

In [22]:
import matplotlib.pyplot as plt

In [23]:
optuna.visualization.plot_optimization_history(study)

In [24]:
optuna.visualization.plot_parallel_coordinate(study)

In [41]:
optuna.visualization.plot_slice(study, params = ['n_estimators', 'max_depth', "min_samples_split", "min_samples_leaf"])

In [28]:
optuna.visualization.plot_param_importances(study)

In [35]:
best_params

{'n_estimators': 543,
 'max_depth': 47,
 'min_samples_split': 2,
 'min_samples_leaf': 1}

In [43]:
best_model = RandomForestRegressor(n_estimators=best_params['n_estimators'],
                                   max_depth=best_params["max_depth"],
                                   min_samples_split= best_params["min_samples_split"],
                                   min_samples_leaf=best_params["min_samples_leaf"]
                                   )

In [44]:
best_model.fit(X_train, Y_train)

In [42]:
print(best_params['n_estimators'],
                                   best_params["max_depth"],
                                   best_params["min_samples_split"],
                                   best_params["min_samples_leaf"]
                                   )

543 47 2 1


In [47]:
y_pred = best_model.predict(X_test)

In [53]:
mean_absolute_error(Y_test, y_pred)

0.26816675037667376

In [54]:
mean_squared_error(Y_test, y_pred)

0.11581282252641795

In [55]:
r2_score(Y_test, y_pred)

0.9905138185375518