In [None]:
# Optuna is an automatic hyperparameter optimization (HPO) framework
# By default, Optuna uses the TPE (Tree-structured Parzen Estimator) algorithm.
# TPE is a Bayesian optimization method.

# How TPE Works:
    # It runs some initial random trials.
    # It separates results into: Good trials, Bad trials
    # It builds probability models:
    # l(x) → distribution of good parameters
    # g(x) → distribution of bad parameters
    # It intelligently selects next hyperparameters
    # It remembers previous trials

# Choose parameters more likely to be good and less likely to be bad,
# i.e. pick the candidate x that maximizes the ratio l(x) / g(x).


In [None]:
# Some Key Terms:

# 1. Study: The end-to-end process of finding the best combination of hyperparameters (e.g. to maximize accuracy) using Optuna.
    # A study in Optuna is an optimization session that encompasses multiple trials.
    # It's essentially a collection of trials aimed at optimizing the objective function.
    # You can think of a study as the overall experiment or search process.
    

# 2. Trial: A trial is a single iteration of the optimization process where a specific set of hyperparameters is evaluated.
    # Each trial runs the objective function once with a distinct set of hyperparameters.
    # One single run.
    # Example: One trial could involve training a model with a learning rate of 0.01 and a max depth of 5.

# 3. Trial Parameters: These are the specific hyperparameter values chosen during a trial.
    # Each trial has its own combination of hyperparameter values, which is evaluated to see how it impacts the objective function.
    # In one trial the learning rate might be 0.001 with a batch size of 32, while
    # in another trial the learning rate could be 0.01 with a batch size of 64.

# 4. Objective Function: The function to be optimized (maximized or minimized) during the hyperparameter search.
    # It takes hyperparameters as input and returns a value (accuracy, loss, or another metric).
    # It captures the hidden relationship between the hyperparameters (max_depth, n_estimators) and the returned value (accuracy, loss).
    # Example: In a classification task, the objective function could return the cross-entropy loss, which Optuna seeks to minimize.

# 5. Sampler: A sampler is the algorithm that suggests which hyperparameters should be evaluated next.
    # Optuna uses TPE by default.

In [5]:
# Example with Code

# Import necessary libraries
import optuna
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [2]:
# Load the Pima Indians Diabetes dataset.
# Note: scikit-learn's built-in 'load_diabetes' is a regression dataset,
# so the classification data is read from an external CSV instead.
import pandas as pd

# Raw CSV hosted on GitHub; column names are supplied explicitly below
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

# names= labels the columns; header=None spells out that the first line is data, not a header
df = pd.read_csv(url, header=None, names=columns)

# Preview the first rows
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
import numpy as np

# For these columns a literal 0 is treated as "missing" and converted to NaN
cols_with_missing_vals = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_missing_vals] = df[cols_with_missing_vals].replace(to_replace=0, value=np.nan)

# Fill each NaN with the mean of its own column.
# NOTE(review): the means are taken over the whole frame, i.e. before the train/test
# split performed later in the notebook, so test rows contribute to the imputed values.
df = df.fillna(df.mean())

# Sanity check: every per-column missing-value count should now be zero
print(df.isna().sum())


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [4]:
# Extract input features (X) and target (y)
# Separate the label column (y) from the predictor columns (X)
y = df['Outcome']
X = df.drop(columns=['Outcome'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Standardize the features: the scaler is fitted on the training rows only,
# then the same transformation is applied to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Report the shape of each split
print(f'Training set shape: {X_train.shape}')
print(f'Test set shape: {X_test.shape}')

Training set shape: (537, 8)
Test set shape: (231, 8)


In [None]:
# Define the objective function
def Objective(trial):
    """Score one hyperparameter combination proposed by Optuna.

    Asks ``trial`` for an integer ``n_estimators`` in [50, 200] and an integer
    ``max_depth`` in [3, 20], builds a RandomForestClassifier with them, and
    evaluates it with 3-fold cross-validation on the notebook-level
    ``X_train`` / ``y_train``.

    Returns:
        The mean cross-validated accuracy, which the study maximizes.
    """
    # Sampled in the same order as they are registered with the trial
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
    }

    # Fixed random_state so the same hyperparameters always give the same score
    forest = RandomForestClassifier(random_state=42, **params)

    fold_scores = cross_val_score(forest, X_train, y_train, cv=3, scoring='accuracy')
    return fold_scores.mean()

In [None]:
# Create a study object and optimize the objective function
study = optuna.create_study(
    direction='maximize',  # the objective returns accuracy, so higher is better
    # Seed the sampler so the sequence of suggested hyperparameters (and therefore
    # the best trial) is repeatable across kernel restarts.
    # Also can use RandomSampler(), GridSampler(search_space)
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Run 50 trials to find the best hyperparameters
study.optimize(Objective, n_trials=50)

[32m[I 2026-02-17 21:26:19,436][0m A new study created in memory with name: no-name-4e9351a4-8f5a-4b96-83ed-e0a9a4fe9887[0m
[32m[I 2026-02-17 21:26:20,315][0m Trial 0 finished with value: 0.7709497206703911 and parameters: {'n_estimators': 165, 'max_depth': 14}. Best is trial 0 with value: 0.7709497206703911.[0m
[32m[I 2026-02-17 21:26:20,714][0m Trial 1 finished with value: 0.7672253258845437 and parameters: {'n_estimators': 80, 'max_depth': 14}. Best is trial 0 with value: 0.7709497206703911.[0m
[32m[I 2026-02-17 21:26:21,017][0m Trial 2 finished with value: 0.7616387337057727 and parameters: {'n_estimators': 60, 'max_depth': 10}. Best is trial 0 with value: 0.7709497206703911.[0m
[32m[I 2026-02-17 21:26:21,413][0m Trial 3 finished with value: 0.7635009310986964 and parameters: {'n_estimators': 81, 'max_depth': 9}. Best is trial 0 with value: 0.7709497206703911.[0m
[32m[I 2026-02-17 21:26:22,144][0m Trial 4 finished with value: 0.7672253258845437 and parameters: {'n_

In [8]:
# Report the winning trial: its cross-validated accuracy and the hyperparameters behind it
best_trial = study.best_trial
print(f'Best trial accuracy: {best_trial.value}')
print(f'Best hyperparameters: {best_trial.params}')

Best trial accuracy: 0.7821229050279329
Best hyperparameters: {'n_estimators': 119, 'max_depth': 20}


In [9]:
from sklearn.metrics import accuracy_score

# Rebuild the forest with the hyperparameters of the best Optuna trial
# and fit it on the full training split
best_model = RandomForestClassifier(random_state=42, **study.best_trial.params)
best_model.fit(X_train, y_train)

# Predict the held-out test split and measure the fraction of correct labels
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

# Report the test accuracy, rounded to two decimals
print(f'Test Accuracy with best hyperparameters: {test_accuracy:.2f}')


Test Accuracy with best hyperparameters: 0.75
