In [3]:
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier 

In [4]:
import optuna

In [5]:
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
           'DiabetesPedigreeFunction', 'Age', 'Outcome']

# Load the dataset
df = pd.read_csv(url, names=columns)

In [6]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
# This is an classification problem with 2 class 
# here in each column where value is 0 , it means it is missing 
# First we will convert the 0 to NAN then we will impute these missing value 

In [8]:
from sklearn.impute import SimpleImputer
cols_with_missing_vals = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_missing_vals] = df[cols_with_missing_vals].replace(0, np.nan)

In [9]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [10]:
si = SimpleImputer(strategy='mean')

X = df.drop('Outcome' , axis=1)
y = df['Outcome']

X = si.fit_transform(X)


In [12]:
X[0]

array([  6.        , 148.        ,  72.        ,  35.        ,
       155.54822335,  33.6       ,   0.627     ,  50.        ])

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.2 , random_state=2)
X_train.shape , X_test.shape 

((614, 8), (154, 8))

In [14]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) 

# Optuna (Powerfull hyperparameter optimization library)

In [21]:
from sklearn.model_selection import cross_val_score

def objective(trial):
    # Suggest values for the hyperparameters
    max_depth = trial.suggest_int('max_depth' , 3 ,20)
    n_estimator = trial.suggest_int('n_estimator' , 50 , 200)
    
    # Now create model using these hyperparameter suggest by smapler 
    model = RandomForestClassifier(n_estimators=n_estimator , max_depth=max_depth , random_state=2)
    
    # Now calculate some score , which will be objective of this function ( it may be accuracy , loss e.t.c)
    score = cross_val_score(model , X_train_scaled , y_train ,scoring='accuracy', cv=3).mean()
    # 3 fold cross validation
    
    return score 
    
    
    

In [22]:
study = optuna.create_study(direction='maximize' , sampler=optuna.samplers.TPESampler())
# This is the by default sampler , sampler will sample the hyperparametres on the basis of previous score
# and here we have set direction ='maximize' , because our objective is to maximize the score
study.optimize(objective , n_trials= 50)    # run for 50 combination of hyperparametrs 

[I 2025-09-13 23:35:44,025] A new study created in memory with name: no-name-032dba15-5be1-4766-8ecf-ac9f28ee2ffb
[I 2025-09-13 23:35:44,881] Trial 0 finished with value: 0.7540889526542324 and parameters: {'max_depth': 10, 'n_estimator': 141}. Best is trial 0 with value: 0.7540889526542324.
[I 2025-09-13 23:35:45,476] Trial 1 finished with value: 0.7426829268292683 and parameters: {'max_depth': 13, 'n_estimator': 103}. Best is trial 0 with value: 0.7540889526542324.
[I 2025-09-13 23:35:46,338] Trial 2 finished with value: 0.7573170731707317 and parameters: {'max_depth': 17, 'n_estimator': 135}. Best is trial 2 with value: 0.7573170731707317.
[I 2025-09-13 23:35:47,250] Trial 3 finished with value: 0.7687549816674637 and parameters: {'max_depth': 4, 'n_estimator': 144}. Best is trial 3 with value: 0.7687549816674637.
[I 2025-09-13 23:35:48,053] Trial 4 finished with value: 0.7524390243902439 and parameters: {'max_depth': 7, 'n_estimator': 180}. Best is trial 3 with value: 0.76875498166

In [23]:
print("best score " , study.best_value)
print("best hyperparameter" , study.best_params)

best score  0.7769010043041606
best hyperparameter {'max_depth': 4, 'n_estimator': 136}


In [27]:
# Now lets train the model using these hyperparametr 
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(n_estimators= study.best_params['n_estimator'],
                            max_depth = study.best_params['max_depth'], random_state=2)
rf.fit(X_train_scaled , y_train)

0,1,2
,n_estimators,136
,criterion,'gini'
,max_depth,4
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [28]:
y_pred = rf.predict(X_test_scaled)
print("accuracy score  on test data" , accuracy_score(y_test , y_pred))

accuracy score  on test data 0.7727272727272727


$ Sampler \ in \ Optuna  $

Here also we can use GridSearchCv and RandomSearchCV

RandomSearchCV

In [53]:
def objective(trial):
    # Sampler will sample the hyperparamater 
    n_estimator = trial.suggest_int('n_estimator' , 50 , 200)
    max_depth = trial.suggest_int('max_depth' , 3 ,20)
    
    # Create the model using the hyperparameter suggested by sampler 
    
    model = RandomForestClassifier(n_estimators=n_estimator , max_depth=max_depth , random_state=3)
    
    score = cross_val_score(model , X_train_scaled , y_train , 
                            scoring='accuracy' , cv=3).mean()
    
    return score

In [54]:
study2 = optuna.create_study(direction='maximize' , sampler=optuna.samplers.RandomSampler())
study2.optimize(objective , n_trials=50) # it will take random 50 trial for the hyperparametr 

# here we are using random search cv 

[I 2025-09-14 00:09:42,032] A new study created in memory with name: no-name-c022d09f-69f8-4416-99bc-def66d829ffc
[I 2025-09-14 00:09:42,996] Trial 0 finished with value: 0.7426829268292683 and parameters: {'n_estimator': 197, 'max_depth': 8}. Best is trial 0 with value: 0.7426829268292683.
[I 2025-09-14 00:09:43,640] Trial 1 finished with value: 0.7508289494659652 and parameters: {'n_estimator': 164, 'max_depth': 3}. Best is trial 1 with value: 0.7508289494659652.
[I 2025-09-14 00:09:44,165] Trial 2 finished with value: 0.7655188904830225 and parameters: {'n_estimator': 129, 'max_depth': 4}. Best is trial 2 with value: 0.7655188904830225.
[I 2025-09-14 00:09:44,530] Trial 3 finished with value: 0.7540730113183485 and parameters: {'n_estimator': 77, 'max_depth': 18}. Best is trial 2 with value: 0.7655188904830225.
[I 2025-09-14 00:09:45,348] Trial 4 finished with value: 0.7703809979276263 and parameters: {'n_estimator': 179, 'max_depth': 16}. Best is trial 4 with value: 0.7703809979276

In [55]:
print("best score " , study2.best_value)
print("best parameter" , study2.best_params)

best score  0.7704049099314524
best parameter {'n_estimator': 101, 'max_depth': 4}


In [56]:
rf2 = RandomForestClassifier(n_estimators= study2.best_params['n_estimator'],
                            max_depth = study2.best_params['max_depth'], random_state=2)
rf2.fit(X_train_scaled , y_train)

0,1,2
,n_estimators,101
,criterion,'gini'
,max_depth,4
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [57]:
y_pred2 = rf2.predict(X_test_scaled)
print("accuracy score  on test data" , accuracy_score(y_test , y_pred2))

accuracy score  on test data 0.7597402597402597


GridSearchCV

In [None]:
# in this type of sampler first we need to define the search space , it cant be continous 
search_space = {
    'n_estimator': [50 , 100 , 150 ,200],  # it should not be more than 200 , beacuse in objective function the maximum value of n_estimator i have set 200
    'max_depth': [3 , 5 , 7, 9 ,11]
}
# Total 4*5 = 20 trails 

In [48]:
study3 = optuna.create_study(direction='maximize' , sampler=optuna.samplers.GridSampler(search_space))
study3.optimize(objective)

[I 2025-09-14 00:03:57,658] A new study created in memory with name: no-name-1eb858df-933a-42c6-9835-6f5895841449
[I 2025-09-14 00:03:58,516] Trial 0 finished with value: 0.7573648971783835 and parameters: {'n_estimator': 150, 'max_depth': 11}. Best is trial 0 with value: 0.7573648971783835.
[I 2025-09-14 00:03:58,946] Trial 1 finished with value: 0.7491869918699187 and parameters: {'n_estimator': 100, 'max_depth': 3}. Best is trial 0 with value: 0.7573648971783835.
[I 2025-09-14 00:04:00,242] Trial 2 finished with value: 0.7557229395823369 and parameters: {'n_estimator': 200, 'max_depth': 11}. Best is trial 0 with value: 0.7573648971783835.
[I 2025-09-14 00:04:00,525] Trial 3 finished with value: 0.7377969073808385 and parameters: {'n_estimator': 50, 'max_depth': 7}. Best is trial 0 with value: 0.7573648971783835.
[I 2025-09-14 00:04:01,183] Trial 4 finished with value: 0.7524709070620118 and parameters: {'n_estimator': 150, 'max_depth': 7}. Best is trial 0 with value: 0.7573648971783

In [49]:
print("best score" , study3.best_value)
print('best params' , study3.best_params)

best score 0.7670652000637653
best params {'n_estimator': 50, 'max_depth': 11}


In [50]:
rf3 = RandomForestClassifier(n_estimators= study3.best_params['n_estimator'],
                            max_depth = study3.best_params['max_depth'], random_state=2)
rf3.fit(X_train_scaled , y_train)

0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,11
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [51]:
y_pred3 = rf3.predict(X_test_scaled)
print("accuracy score  on test data" , accuracy_score(y_test , y_pred3))

accuracy score  on test data 0.7597402597402597


$ Visualization \ in \ Optuna $

In [None]:
from optuna.visualization import plot_optimization_history \
                        , plot_slice , plot_parallel_coordinate,plot_contour , plot_param_importances

In [58]:
# Optimization History 
plot_optimization_history(study).show()

In [59]:
# so from above figure after 30 trial there was no improvement in accuracy

In [60]:
plot_parallel_coordinate(study).show()
# it will tell the region where sampler has sampel each hyparaparmeter 
# each vertical line is score , then hypereparameters

In [62]:
# so most of the time max_depth is choosen in the range 10-20 and n_estimators in the range 140-180

In [63]:
plot_slice(study).show()
# it will show plot of objective function vs hyperparameter (one by one )

In [64]:
# contour plot , possible if we are tuning 2 hyperparameter 
plot_contour(study).show()

In [65]:
# hypereparameter importance 
plot_param_importances(study).show()
# this will show which hypereparameter is importance for objective function 

In [66]:
# so max_depth is 56 % important and n_estimator is 44 % important 

$ Optimizing \ Multiple \ ML \ Model $

Optuna can be used to deside which model is good for the training among so many models and along with\
model it will also give best hyperparameter of that model

In [None]:
def objective(trial):
    # create classifier 
    classifier_name = trial.suggest_categorical('classifier' , ['RandomForest' ,'GradientBoosting' , 'SVM'])
    
    # Now create model for each classifier 
    if classifier_name=='RandomForest' :
        n_estimators = trial.suggest_int('n_estimators' , 50 ,200)
        max_depth = trial.suggest_int('max_depth' , 3 , 25)
        max_samples = trial.suggest_float('max_samples' , 0 ,1)
        max_features = trial.suggest_float('max_faetures' , 0,1)
        
        model = RandomForestClassifier(n_estimators=n_estimators , 
                                    max_depth=max_depth , max_samples=max_samples ,
                                    max_features=max_features , random_state=42)
        
    elif classifier_name == 'GradientBoosting':
        n_estimators = trial.suggest_int('n_estimators' , 50 ,200)
        max_depth = trial.suggest_int("max_depth" , 3 ,25)
        learning_rate = trial.suggest_float("learning_rate" , 0.01 , 0.3)
        max_features = trial.suggest_float('max_faetures' , 0,1)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        
        model = GradientBoostingClassifier(n_estimators=n_estimators, max_depth=max_depth , 
                                           learning_rate=learning_rate , max_features=max_features,
                                         min_samples_split = min_samples_split)
    elif classifier_name == 'SVM':
        kernel = trial.suggest_categorical("kernel" , ['rbf' , 'linear' , 'poly' , 'sigmoid'])
        degree = trial.suggest_int("degree" , 1 , 6) # degree will be ignored for kernel = rbf , linear , sigmoid 
        c = trial.suggest_float("C" , 0.1 , 100)
        model = SVC(kernel=kernel , degree=degree , C=c)
        
    score = cross_val_score(model , X_train_scaled , y_train , scoring='accuracy' , cv=3).mean()
    
    return score 
    
        
        

In [82]:
study_complex = optuna.create_study(direction='maximize' )
study_complex.optimize(objective , n_trials = 100)

[I 2025-09-14 00:50:10,183] A new study created in memory with name: no-name-dd085052-38b5-4fcc-8bdf-3c96ea6eb720
[I 2025-09-14 00:50:10,946] Trial 0 finished with value: 0.7524709070620118 and parameters: {'classifier': 'GradientBoosting', 'n_estimators': 56, 'max_depth': 22, 'learning_rate': 0.19291518007794678, 'max_faetures': 0.1415140608256782, 'min_samples_split': 10}. Best is trial 0 with value: 0.7524709070620118.
[I 2025-09-14 00:50:11,247] Trial 1 finished with value: 0.6417583293479994 and parameters: {'classifier': 'SVM', 'kernel': 'poly', 'degree': 2, 'C': 57.949405616395595}. Best is trial 0 with value: 0.7524709070620118.
[I 2025-09-14 00:50:11,331] Trial 2 finished with value: 0.6237765024709071 and parameters: {'classifier': 'SVM', 'kernel': 'poly', 'degree': 4, 'C': 96.44632767754878}. Best is trial 0 with value: 0.7524709070620118.
[I 2025-09-14 00:50:12,064] Trial 3 finished with value: 0.763860991551092 and parameters: {'classifier': 'RandomForest', 'n_estimators':

In [83]:
print("best values " , study_complex.best_value)

best values  0.7703650565917424


In [84]:
print("best params " , study_complex.best_params)

best params  {'classifier': 'RandomForest', 'n_estimators': 126, 'max_depth': 21, 'max_samples': 0.9665262613100534, 'max_faetures': 0.8019374325259532}


In [86]:
df = study_complex.trials_dataframe()   # it will give the details of each trial 

In [87]:
df.head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_classifier,params_degree,params_kernel,params_learning_rate,params_max_depth,params_max_faetures,params_max_samples,params_min_samples_split,params_n_estimators,state
0,0,0.752471,2025-09-14 00:50:10.185805,2025-09-14 00:50:10.946188,0 days 00:00:00.760383,,GradientBoosting,,,0.192915,22.0,0.141514,,10.0,56.0,COMPLETE
1,1,0.641758,2025-09-14 00:50:10.946188,2025-09-14 00:50:11.247757,0 days 00:00:00.301569,57.949406,SVM,2.0,poly,,,,,,,COMPLETE
2,2,0.623777,2025-09-14 00:50:11.247757,2025-09-14 00:50:11.331581,0 days 00:00:00.083824,96.446328,SVM,4.0,poly,,,,,,,COMPLETE
3,3,0.763861,2025-09-14 00:50:11.331581,2025-09-14 00:50:12.064023,0 days 00:00:00.732442,,RandomForest,,,,23.0,0.950387,0.498448,,141.0,COMPLETE
4,4,0.684075,2025-09-14 00:50:12.064023,2025-09-14 00:50:12.079075,0 days 00:00:00.015052,10.274562,SVM,6.0,sigmoid,,,,,,,COMPLETE


In [88]:
df.params_classifier.value_counts()

params_classifier
RandomForest        77
GradientBoosting    12
SVM                 11
Name: count, dtype: int64

In [None]:
df.groupby('params_classifier')['value'].mean()
# this will give mean of accuracy of all model across all the trials 

params_classifier
GradientBoosting    0.745422
RandomForest        0.759604
SVM                 0.695036
Name: value, dtype: float64

In [91]:
# 1. Optimization History
plot_optimization_history(study_complex).show()

In [92]:
plot_param_importances(study_complex).show()

In [93]:
# it means classifier is most important hyperparametr 