In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.12 (from alembic>=1.5.0->optuna)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
   ---------------------------------------- 0.0/395.9 kB ? eta -:--:--
   ------------- -------------------------- 133.1/395.9 kB 3.8 MB/s eta 0:00:01
   ---------------------------------------- 395.9/395.9 kB 4.9 MB/s eta 0:00:00
Downloading alembic-1.16.2-py3-none-any.whl (242 kB)
   ---------------------------------------- 0.0/242.7 kB ? eta -:--:--
   --------------------------------------- 242.7/242.7 kB 15.5 MB/s 

In [2]:
import optuna
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [3]:
# Load the Pima Indian Diabetes dataset (from UCI repository)
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
           'DiabetesPedigreeFunction', 'Age', 'Outcome']

# Load the dataset
df = pd.read_csv(url, names=columns)

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Replace zero values with NaN in columns where zero is not a valid value
cols_with_missing_vals = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_missing_vals] = df[cols_with_missing_vals].replace(0, np.nan)

# Impute the missing values with the mean of the respective column
df.fillna(df.mean(), inplace=True)

# Check if there are any remaining missing values
print(df.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [5]:
X = df.drop('Outcome',axis=1)
y = df['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# checking the shape of data
print(f'Training set shape: {X_train.shape}')
print(f'Test set shape: {X_test.shape}')

Training set shape: (537, 8)
Test set shape: (231, 8)


Objective Function

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Define the objective function
def objective(trial):
    # Suggest values for the hyperparameters
    n_estimators = trial.suggest_int('n_estimators',50,200)
    max_depth = trial.suggest_int('max_depth',3,20)

    # Create the RandomForestClassifier with suggested hyperparameters
    clf = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,random_state=42)

    score = cross_val_score(clf,X_train,y_train,cv=5,scoring='accuracy').mean()
    return score

Bayesian Search (using TPE Sampler)

In [15]:
# Create a study object and optimize the objective function
study = optuna.create_study(study_name='Learning Optuna',direction='maximize',sampler=optuna.samplers.TPESampler())
study.optimize(objective,n_trials=50) # Run 50 trials to find the best hyperparameters


[I 2025-07-05 10:15:54,104] A new study created in memory with name: Learning Optuna
[I 2025-07-05 10:15:54,759] Trial 0 finished with value: 0.7633956386292835 and parameters: {'n_estimators': 96, 'max_depth': 8}. Best is trial 0 with value: 0.7633956386292835.
[I 2025-07-05 10:15:56,067] Trial 1 finished with value: 0.770889581169955 and parameters: {'n_estimators': 191, 'max_depth': 9}. Best is trial 1 with value: 0.770889581169955.
[I 2025-07-05 10:15:56,898] Trial 2 finished with value: 0.7522845275181723 and parameters: {'n_estimators': 143, 'max_depth': 4}. Best is trial 1 with value: 0.770889581169955.
[I 2025-07-05 10:15:57,408] Trial 3 finished with value: 0.7559709241952232 and parameters: {'n_estimators': 76, 'max_depth': 12}. Best is trial 1 with value: 0.770889581169955.
[I 2025-07-05 10:15:58,185] Trial 4 finished with value: 0.757840083073728 and parameters: {'n_estimators': 123, 'max_depth': 18}. Best is trial 1 with value: 0.770889581169955.
[I 2025-07-05 10:15:58,879

In [16]:
# Print the best result
print(f'Best trial accuracy: {study.best_trial.value}')
print(f'Best hyperparameters: {study.best_trial.params}')

Best trial accuracy: 0.7764624437521632
Best hyperparameters: {'n_estimators': 127, 'max_depth': 9}


In [17]:
from sklearn.metrics import accuracy_score

# Train a RandomForestClassifier using the best hyperparameters from Optuna
best_model = RandomForestClassifier(**study.best_trial.params, random_state=42)

# Fit the model to the training data
best_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate the accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred)

# Print the test accuracy
print(f'Test Accuracy with best hyperparameters: {test_accuracy:.2f}')


Test Accuracy with best hyperparameters: 0.65




Random Search (using Random Sampler)

In [19]:
study = optuna.create_study(study_name='Learning Optuna',direction='maximize',sampler=optuna.samplers.RandomSampler())
study.optimize(objective,n_trials=50) # Run 50 trials to find the best hyperparameters


[I 2025-07-05 10:22:41,733] A new study created in memory with name: Learning Optuna
[I 2025-07-05 10:22:42,800] Trial 0 finished with value: 0.7671339563862929 and parameters: {'n_estimators': 159, 'max_depth': 19}. Best is trial 0 with value: 0.7671339563862929.
[I 2025-07-05 10:22:43,785] Trial 1 finished with value: 0.7671339563862929 and parameters: {'n_estimators': 161, 'max_depth': 16}. Best is trial 0 with value: 0.7671339563862929.
[I 2025-07-05 10:22:44,408] Trial 2 finished with value: 0.7503980616130149 and parameters: {'n_estimators': 97, 'max_depth': 17}. Best is trial 0 with value: 0.7671339563862929.
[I 2025-07-05 10:22:44,905] Trial 3 finished with value: 0.7615784008307374 and parameters: {'n_estimators': 77, 'max_depth': 19}. Best is trial 0 with value: 0.7671339563862929.
[I 2025-07-05 10:22:45,625] Trial 4 finished with value: 0.7577881619937694 and parameters: {'n_estimators': 118, 'max_depth': 12}. Best is trial 0 with value: 0.7671339563862929.
[I 2025-07-05 10:

In [21]:
print(f'Best trial accuracy: {study.best_trial.value}')
print(f'Best Hyperparameters: {study.best_trial.params}')

Best trial accuracy: 0.7746278989269643
Best Hyperparameters: {'n_estimators': 167, 'max_depth': 9}


In [22]:
from sklearn.metrics import accuracy_score

# Train a RandomForestClassifier using the best hyperparameters from Optuna
best_model = RandomForestClassifier(**study.best_trial.params, random_state=42)

# Fit the model to the training data
best_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate the accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred)

# Print the test accuracy
print(f'Test Accuracy with best hyperparameters: {test_accuracy:.2f}')


Test Accuracy with best hyperparameters: 0.65




Grid Search (using Grid Sampler)

In [28]:
# for grid search, you need to explicitly define the search space outside of objective function.
search_space = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [5, 10, 15, 20]
}

In [25]:
study = optuna.create_study(study_name='Learning Optuna',direction='maximize',sampler=optuna.samplers.GridSampler(search_space=search_space))
study.optimize(objective,n_trials=50) # Run 50 trials to find the best hyperparameters


[I 2025-07-05 10:26:19,611] A new study created in memory with name: Learning Optuna
[I 2025-07-05 10:26:20,191] Trial 0 finished with value: 0.7541190723433715 and parameters: {'n_estimators': 100, 'max_depth': 5}. Best is trial 0 with value: 0.7541190723433715.
[I 2025-07-05 10:26:21,103] Trial 1 finished with value: 0.7615264797507788 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 1 with value: 0.7615264797507788.
[I 2025-07-05 10:26:21,453] Trial 2 finished with value: 0.7634302526825891 and parameters: {'n_estimators': 50, 'max_depth': 15}. Best is trial 2 with value: 0.7634302526825891.
[I 2025-07-05 10:26:22,071] Trial 3 finished with value: 0.7578573901003807 and parameters: {'n_estimators': 100, 'max_depth': 15}. Best is trial 2 with value: 0.7634302526825891.
[I 2025-07-05 10:26:22,787] Trial 4 finished with value: 0.7597438560055382 and parameters: {'n_estimators': 100, 'max_depth': 20}. Best is trial 2 with value: 0.7634302526825891.
[I 2025-07-05 10:

In [26]:
print(f'Best trial accuracy: {study.best_trial.value}')
print(f'Best Hyperparameters: {study.best_trial.params}')

Best trial accuracy: 0.7708549671166494
Best Hyperparameters: {'n_estimators': 100, 'max_depth': 10}


Optuna Visualizations

In [29]:
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_contour, plot_param_importances

In [31]:
# Defining study object again for visualization
study = optuna.create_study(study_name='Learning Optuna Visualization',direction='maximize',sampler=optuna.samplers.TPESampler())
study.optimize(objective,n_trials=50) # Run 50 trials to find the best hyperparameters


[I 2025-07-05 10:45:10,585] A new study created in memory with name: Learning Optuna Visualization
[I 2025-07-05 10:45:11,655] Trial 0 finished with value: 0.7709068881966079 and parameters: {'n_estimators': 174, 'max_depth': 9}. Best is trial 0 with value: 0.7709068881966079.
[I 2025-07-05 10:45:12,436] Trial 1 finished with value: 0.7559709241952233 and parameters: {'n_estimators': 123, 'max_depth': 7}. Best is trial 0 with value: 0.7709068881966079.
[I 2025-07-05 10:45:13,530] Trial 2 finished with value: 0.7652647975077882 and parameters: {'n_estimators': 160, 'max_depth': 17}. Best is trial 0 with value: 0.7709068881966079.
[I 2025-07-05 10:45:14,652] Trial 3 finished with value: 0.7615264797507789 and parameters: {'n_estimators': 175, 'max_depth': 11}. Best is trial 0 with value: 0.7709068881966079.
[I 2025-07-05 10:45:15,203] Trial 4 finished with value: 0.750380754586362 and parameters: {'n_estimators': 85, 'max_depth': 12}. Best is trial 0 with value: 0.7709068881966079.
[I 20

In [32]:
# 1. Optimization History
plot_optimization_history(study)

In [34]:
# 2. Parallel Coordinates Plot
plot_parallel_coordinate(study)

# from the below graph we can see most trials are conducted in the range (5,15) for max_depth and (100,160) for n_estimators

In [None]:
# 3. Slice Plot
plot_slice(study)

# again we can see that most optimum range for max_depth and n_estimators are (5,15) and (100,<200)

In [None]:
# 4. Contour Plot
plot_contour(study)

# gives the dense and sparse regions of the conducted trials search space

In [37]:
# 5. Hyperparameter Importance
plot_param_importances(study)

Optimizing Multiple ML Models

In [7]:
# Importing the required libraries
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [8]:
# Define the objective function for Optuna
def objective(trial):
    # Choose the algorithm to tune
    classifier_name = trial.suggest_categorical('classifier', ['SVM', 'RandomForest', 'GradientBoosting'])

    if classifier_name == 'SVM':
        # SVM hyperparameters
        c = trial.suggest_float('C', 0.1, 100, log=True)
        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid'])
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])

        model = SVC(C=c, kernel=kernel, gamma=gamma, random_state=42)

    elif classifier_name == 'RandomForest':
        # Random Forest hyperparameters
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        bootstrap = trial.suggest_categorical('bootstrap', [True, False])

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            bootstrap=bootstrap,
            random_state=42
        )

    elif classifier_name == 'GradientBoosting':
        # Gradient Boosting hyperparameters
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

        model = GradientBoostingClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42
        )

    # Perform cross-validation and return the mean accuracy
    score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()
    return score

In [None]:
# Create a study and optimize it 
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

[I 2025-07-05 11:51:04,041] A new study created in memory with name: no-name-96a6ac05-00c6-43c5-973a-4faa56c84310
[I 2025-07-05 11:51:05,437] Trial 0 finished with value: 0.7374301675977654 and parameters: {'classifier': 'GradientBoosting', 'n_estimators': 177, 'learning_rate': 0.27119294875865, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.7374301675977654.
[I 2025-07-05 11:51:07,999] Trial 1 finished with value: 0.7541899441340781 and parameters: {'classifier': 'GradientBoosting', 'n_estimators': 219, 'learning_rate': 0.01312422153231917, 'max_depth': 16, 'min_samples_split': 3, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.7541899441340781.
[I 2025-07-05 11:51:08,325] Trial 2 finished with value: 0.7579143389199255 and parameters: {'classifier': 'RandomForest', 'n_estimators': 98, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 7, 'bootstrap': True}. Best is trial 2 with value: 0.7579143389199255.
[I 2025-07-05 11

In [None]:
# Retrieve the best trial
best_trial = study.best_trial
print("Best trial parameters:", best_trial.params)
print("Best trial accuracy:", best_trial.value)

In [None]:
study.trials_dataframe()

In [None]:
study.trials_dataframe()['params_classifier'].value_counts()

In [None]:
study.trials_dataframe().groupby('params_classifier')['value'].mean()