In [3]:
# Import necessary libraries
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# The Pima Indians Diabetes dataset is not bundled with scikit-learn:
# scikit-learn's built-in 'load_diabetes' is a different, regression dataset.
# The classification data is therefore downloaded from an external source with pandas.
import pandas as pd

# Pima Indians Diabetes data, read as a CSV from a GitHub-hosted copy of the file.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

# Column labels are supplied by hand and applied in this order;
# 'Outcome' is the column later used as the prediction target.
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

# Fetch the file and label its columns; every line of the file is treated as data.
df = pd.read_csv(url, header=None, names=columns)

# Show the first record as a quick check that the labels line up with the values.
df.head(n=1)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1


In [13]:
# Sanity check: display any rows whose recorded Age is 0.
df.loc[df['Age'].eq(0)]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome


In [14]:
import numpy as np

# Zero is not a valid value in these columns, so zeros are recoded as NaN
# and handled as missing data from here on.
cols_with_missing_vals = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_missing_vals] = df[cols_with_missing_vals].replace({0: np.nan})

# Fill each gap with the mean of its own column, modifying `df` in place.
# NOTE(review): the means are taken over the whole frame, i.e. before any
# train/test split, so rows that later land in the test set influence the
# imputed values. Confirm whether imputation should be fitted on the training
# rows only.
df.fillna(value=df.mean(), inplace=True)

# Per-column count of values that are still missing after imputation.
print(df.isna().sum())


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [15]:
# Target is the 'Outcome' column; every other column is a feature.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Standardize the features. The scaler learns its statistics from the training
# rows only and is then applied unchanged to both splits.
# Note: after this step X_train / X_test are NumPy arrays, not DataFrames.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Report the resulting (rows, features) dimensions of each split.
print(f'Training set shape: {X_train.shape}')
print(f'Test set shape: {X_test.shape}')

Training set shape: (537, 8)
Test set shape: (231, 8)


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Objective for the random-forest-only study
def objective(trial):
    """Score one random-forest configuration proposed by an Optuna trial.

    Two hyperparameters are requested from ``trial``, in this order:
    ``n_estimators`` (integer, 50 to 200) and ``max_depth`` (integer, 3 to 20).
    A ``RandomForestClassifier`` with ``random_state=42`` is built from them and
    evaluated with 3-fold cross-validation on the notebook-level ``X_train`` /
    ``y_train``.

    Args:
        trial: The Optuna trial used to suggest hyperparameter values.

    Returns:
        The mean accuracy over the three cross-validation folds.
    """
    # Dict entries are evaluated top to bottom, so the suggestion order is fixed.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
    }

    forest = RandomForestClassifier(random_state=42, **params)

    # One accuracy value per fold; the study receives their average.
    fold_scores = cross_val_score(forest, X_train, y_train, cv=3, scoring='accuracy')
    return fold_scores.mean()


In [19]:
%%time
# Create a study object and optimize the objective function
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())  # We aim to maximize accuracy
# study.optimize(objective, n_trials=50)  # Run 50 trials to find the best hyperparameters
study= optuna.create_study (direction='maximize')
study.optimize (objective, n_trials=50)

[I 2025-06-19 19:23:47,566] A new study created in memory with name: no-name-ebd0252c-4376-4011-9cec-d52fb79ef7be
[I 2025-06-19 19:23:48,249] Trial 0 finished with value: 0.7690875232774674 and parameters: {'n_estimators': 108, 'max_depth': 8}. Best is trial 0 with value: 0.7690875232774674.
[I 2025-06-19 19:23:48,560] Trial 1 finished with value: 0.7597765363128491 and parameters: {'n_estimators': 64, 'max_depth': 4}. Best is trial 0 with value: 0.7690875232774674.
[I 2025-06-19 19:23:49,500] Trial 2 finished with value: 0.7765363128491621 and parameters: {'n_estimators': 177, 'max_depth': 14}. Best is trial 2 with value: 0.7765363128491621.
[I 2025-06-19 19:23:50,141] Trial 3 finished with value: 0.7728119180633147 and parameters: {'n_estimators': 128, 'max_depth': 14}. Best is trial 2 with value: 0.7765363128491621.
[I 2025-06-19 19:23:50,713] Trial 4 finished with value: 0.7802607076350093 and parameters: {'n_estimators': 119, 'max_depth': 18}. Best is trial 4 with value: 0.7802607

CPU times: total: 26.5 s
Wall time: 27.1 s


In [23]:
# Report the best objective value found by the study and the parameters that produced it.
print(f'Best Trial Accuracy : {study.best_value}')
print(f'Best HyperParameters : {study.best_params}')

Best Trial Accuracy : 0.7821229050279329
Best HyperParameters : {'n_estimators': 119, 'max_depth': 20}


In [24]:
from sklearn.metrics import accuracy_score

# Rebuild the random forest from the study's best hyperparameters, with the same fixed seed.
best_model = RandomForestClassifier(random_state=42, **study.best_params)

# Fit on the training split, then predict labels for the held-out test split.
y_pred = best_model.fit(X_train, y_train).predict(X_test)

# Accuracy of those predictions against the true test labels.
test_accuracy = accuracy_score(y_test, y_pred)

# Report the test accuracy rounded to two decimals.
print(f'Test Accuracy with best hyperparameters: {test_accuracy:.2f}')

Test Accuracy with best hyperparameters: 0.75


# Model Tuning 

In [35]:
# Importing the required libraries
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [64]:
# Define the objective function for Optuna
def objective(trial):
    """Score one classifier configuration proposed by an Optuna trial.

    The trial first picks a classifier family through the categorical parameter
    ``'classifier'``, then suggests the hyperparameters belonging to that family.
    The resulting model is evaluated with 3-fold cross-validation on the
    notebook-level ``X_train`` / ``y_train``.

    NOTE(review): this definition reuses the name ``objective`` and therefore
    replaces any earlier function of that name in the kernel.

    NOTE(review): some parameter names are shared between branches: ``'C'`` is
    suggested with different ranges for SVC and LogisticRegression, and
    ``'n_estimators'``, ``'max_depth'``, ``'min_samples_split'`` and
    ``'min_samples_leaf'`` are used by both tree ensembles. Confirm whether
    per-classifier names are wanted; they are left unchanged here because
    renaming would change the keys of ``trial.params``.

    Args:
        trial: The Optuna trial used to suggest the classifier and its
            hyperparameters.

    Returns:
        The mean accuracy over the three cross-validation folds.

    Raises:
        ValueError: If the suggested classifier name has no matching branch.
    """
    classifier_name = trial.suggest_categorical(
        'classifier',
        ['SVC', 'RandomForestClassifier', 'LogisticRegression', 'GradientBoosting'],
    )

    if classifier_name == 'SVC':
        # SVM hyperparameters
        c = trial.suggest_float('C', 0.1, 100, log=True)
        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid'])
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])

        model = SVC(C=c, kernel=kernel, gamma=gamma, random_state=42)

    elif classifier_name == 'RandomForestClassifier':
        # Random Forest hyperparameters
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        bootstrap = trial.suggest_categorical('bootstrap', [True, False])

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            bootstrap=bootstrap,
            random_state=42, n_jobs=-1
        )

    elif classifier_name == 'GradientBoosting':
        # Gradient Boosting hyperparameters
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

        model = GradientBoostingClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42
        )

    elif classifier_name == 'LogisticRegression':
        # Logistic Regression hyperparameters
        penalty = trial.suggest_categorical('penalty', ['l2', 'l1'])
        C = trial.suggest_float('C', 0.01, 10.0, log=True)
        # The solver follows the penalty: 'liblinear' for l1, 'lbfgs' for l2.
        solver = 'liblinear' if penalty == 'l1' else 'lbfgs'

        model = LogisticRegression(
            penalty=penalty,
            C=C,
            solver=solver,
            random_state=42,
            max_iter=1000
        )

    else:
        # Fail loudly if a choice is added to the list above without a matching
        # branch, instead of reaching the scoring call with `model` undefined.
        raise ValueError(f"Unsupported classifier choice: {classifier_name!r}")

    # Perform cross-validation and return the mean accuracy
    score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy', n_jobs=-1).mean()
    return score

In [65]:
# Run the multi-classifier search: 100 trials, maximizing the objective's
# return value (mean CV accuracy).
# TPE is Optuna's default sampler; it is instantiated explicitly here only so it
# can be given a fixed seed. Without a seed the sequence of suggested
# classifiers and hyperparameters changes on every re-run of the notebook.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

[I 2025-06-19 21:25:30,224] A new study created in memory with name: no-name-37026c6e-a0bc-4778-87e4-37db7e00c21d
[I 2025-06-19 21:25:31,245] Trial 0 finished with value: 0.7318435754189944 and parameters: {'classifier': 'GradientBoosting', 'n_estimators': 112, 'learning_rate': 0.1442045449601066, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.7318435754189944.
[I 2025-06-19 21:25:31,620] Trial 1 finished with value: 0.756052141527002 and parameters: {'classifier': 'SVC', 'C': 0.24113813972137085, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 1 with value: 0.756052141527002.
[I 2025-06-19 21:25:31,635] Trial 2 finished with value: 0.7839851024208566 and parameters: {'classifier': 'SVC', 'C': 0.5459907511663041, 'kernel': 'linear', 'gamma': 'scale'}. Best is trial 2 with value: 0.7839851024208566.
[I 2025-06-19 21:25:31,649] Trial 3 finished with value: 0.7094972067039107 and parameters: {'classifier': 'SVC', 'C': 0.11505292880821917, 

In [66]:
# Keep a handle on the study's top-scoring trial, then report its
# parameters and its objective value.
best_trial = study.best_trial
print("Best trial parameters:", study.best_params)
print("Best trial accuracy:", study.best_value)

Best trial parameters: {'classifier': 'SVC', 'C': 77.4478917844088, 'kernel': 'linear', 'gamma': 'scale'}
Best trial accuracy: 0.7858472998137801


In [68]:
# How many trials were run with each classifier choice.
study.trials_dataframe().loc[:, 'params_classifier'].value_counts()

params_classifier
SVC                       72
LogisticRegression        10
GradientBoosting           9
RandomForestClassifier     9
Name: count, dtype: int64

In [69]:
# Average objective value (mean CV accuracy) of the trials, grouped by classifier choice.
study.trials_dataframe().groupby('params_classifier')['value'].agg('mean')

params_classifier
GradientBoosting          0.747569
LogisticRegression        0.778212
RandomForestClassifier    0.766398
SVC                       0.767096
Name: value, dtype: float64