In [2]:
import optuna
import numpy as np 
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
           'DiabetesPedigreeFunction', 'Age', 'Outcome']
df = pd.read_csv(url, names=columns)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:
cols_with_missing_values = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_missing_values] = df[cols_with_missing_values].replace(0,np.nan)

In [5]:
df.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [6]:
df = df.fillna(df.mean())

In [7]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [8]:
x = df.drop('Outcome' , axis=1)

In [9]:
y=df['Outcome']

In [10]:
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size=0.2 , random_state=42)

In [11]:
scalar = StandardScaler()
x_train = scalar.fit_transform(x_train)
x_test  = scalar.transform(x_test)

In [12]:
print(f'Training set shape: {x_train.shape}')
print(f'Test set shape: {x_test.shape}')

Training set shape: (614, 8)
Test set shape: (154, 8)


In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [31]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators' , 15 ,200)
    max_depth  = trial.suggest_int('max_depth' , 10,20)
    model= RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42
    )
    score = cross_val_score(model , x_train , y_train , cv=3 , scoring='accuracy' ).mean()
    return score

In [35]:
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=50)

[I 2025-04-29 22:40:16,745] A new study created in memory with name: no-name-d259fa42-a23f-464d-863e-fd47474c5885
[I 2025-04-29 22:40:17,746] Trial 0 finished with value: 0.7735931771082417 and parameters: {'n_estimators': 156, 'max_depth': 17}. Best is trial 0 with value: 0.7735931771082417.
[I 2025-04-29 22:40:18,943] Trial 1 finished with value: 0.771967160848079 and parameters: {'n_estimators': 189, 'max_depth': 19}. Best is trial 0 with value: 0.7735931771082417.
[I 2025-04-29 22:40:20,222] Trial 2 finished with value: 0.7670811413996493 and parameters: {'n_estimators': 170, 'max_depth': 19}. Best is trial 0 with value: 0.7735931771082417.
[I 2025-04-29 22:40:20,916] Trial 3 finished with value: 0.7784951378925554 and parameters: {'n_estimators': 114, 'max_depth': 20}. Best is trial 3 with value: 0.7784951378925554.
[I 2025-04-29 22:40:22,085] Trial 4 finished with value: 0.7736011477761836 and parameters: {'n_estimators': 192, 'max_depth': 15}. Best is trial 3 with value: 0.77849

In [53]:
print(study.best_trial.values)
print(study.best_params)

[0.7817551410808226]
{'n_estimators': 124, 'max_depth': 16}


In [55]:
from sklearn.metrics import accuracy_score

best_model = RandomForestClassifier(**study.best_trial.params, random_state=42)

best_model.fit(x_train, y_train)

y_pred = best_model.predict(x_test)

test_accuracy = accuracy_score(y_test, y_pred)

print(f'Test Accuracy with best hyperparameters: {test_accuracy:.2f}')

Test Accuracy with best hyperparameters: 0.75
