In [None]:
!pip install optuna



In [None]:
# Import necessary libraries
import optuna
import pandas as pd
import numpy as np

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score




# Remote CSV for the Pima Indians diabetes data. Column labels are supplied via
# `names=` below, so the first line of the file is read as data.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

# Load the dataset (header=None is what pandas already assumes when names= is given).
df = pd.read_csv(url, header=None, names=columns)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:



# Columns in which a literal 0 is treated as "missing" rather than a measurement.
cols_with_missing_vals = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

# Step 1: turn the zero placeholders into NaN so they can be imputed.
zero_masked = df[cols_with_missing_vals].replace(0, np.nan)
df[cols_with_missing_vals] = zero_masked

# Step 2: fill every NaN with its column mean.
# NOTE(review): the means are computed over the whole dataset, i.e. before the
# train/test split, so test rows influence the imputed training values.
df = df.fillna(df.mean())

# Sanity check: per-column count of remaining missing values.
print(df.isnull().sum())


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [None]:

# Separate the feature columns from the 'Outcome' target.
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Hold out 30% of the rows for the final evaluation. `stratify=y` keeps the
# proportion of each Outcome class the same in the train and test partitions,
# so the test score is not skewed by an unlucky class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the training partition only, then apply the same
# transformation to the test partition.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(f'Training set shape: {X_train.shape}')
print(f'Test set shape: {X_test.shape}')

Training set shape: (537, 8)
Test set shape: (231, 8)


In [None]:
def objective(trial):
    """Return the mean 3-fold CV accuracy of a random forest for one trial.

    Args:
        trial: Optuna trial used to sample 'n_estimators' (50-200) and
            'max_depth' (3-20).

    Returns:
        Mean accuracy over the three folds, evaluated on the notebook-level
        X_train / y_train.
    """
    # Sample in the same order every time: n_estimators first, then max_depth.
    forest_params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
    }
    forest = RandomForestClassifier(random_state=42, **forest_params)

    fold_accuracies = cross_val_score(
        forest, X_train, y_train, cv=3, scoring='accuracy'
    )
    return fold_accuracies.mean()

In [None]:

# Seed the TPE sampler so the search proposes the same sequence of trials on
# every Restart & Run All; without a seed each run explores different points.
study = optuna.create_study(
    direction='maximize',  # We aim to maximize accuracy
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

[I 2024-10-09 15:57:10,064] A new study created in memory with name: no-name-13cdd921-389f-4607-8199-5f33f4e7611a
[I 2024-10-09 15:57:11,000] Trial 0 finished with value: 0.7635009310986964 and parameters: {'n_estimators': 148, 'max_depth': 14}. Best is trial 0 with value: 0.7635009310986964.
[I 2024-10-09 15:57:12,057] Trial 1 finished with value: 0.7672253258845437 and parameters: {'n_estimators': 160, 'max_depth': 11}. Best is trial 1 with value: 0.7672253258845437.
[I 2024-10-09 15:57:13,194] Trial 2 finished with value: 0.7728119180633147 and parameters: {'n_estimators': 170, 'max_depth': 15}. Best is trial 2 with value: 0.7728119180633147.
[I 2024-10-09 15:57:14,738] Trial 3 finished with value: 0.7709497206703911 and parameters: {'n_estimators': 149, 'max_depth': 15}. Best is trial 2 with value: 0.7728119180633147.
[I 2024-10-09 15:57:17,017] Trial 4 finished with value: 0.7523277467411545 and parameters: {'n_estimators': 189, 'max_depth': 6}. Best is trial 2 with value: 0.77281

In [None]:

# Look up the best trial once and report its score and sampled parameters.
best_trial = study.best_trial
print(f'Best trial accuracy: {best_trial.value}')
print(f'Best hyperparameters: {best_trial.params}')

Best trial accuracy: 0.7821229050279329
Best hyperparameters: {'n_estimators': 125, 'max_depth': 18}


In [None]:
from sklearn.metrics import accuracy_score


# Rebuild the forest with the best sampled hyperparameters and train it on the
# full training partition.
best_model = RandomForestClassifier(random_state=42, **study.best_trial.params)
best_model.fit(X_train, y_train)

# Score the refitted model once on the held-out test partition.
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f'Test Accuracy with best hyperparameters: {test_accuracy:.2f}')

Test Accuracy with best hyperparameters: 0.74


In [None]:

from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_contour, plot_param_importances

In [None]:

# Objective value per trial for the study run above.
history_fig = plot_optimization_history(study)
history_fig.show()

In [None]:

# Parallel-coordinate view of the sampled hyperparameters and their scores.
parallel_fig = plot_parallel_coordinate(study)
parallel_fig.show()

In [None]:

# One slice plot per hyperparameter: sampled value against objective value.
slice_fig = plot_slice(study)
slice_fig.show()

In [None]:

# Contour plot of the objective over pairs of hyperparameters.
contour_fig = plot_contour(study)
contour_fig.show()

In [None]:

# Estimated importance of each hyperparameter for the objective value.
importance_fig = plot_param_importances(study)
importance_fig.show()

In [None]:
# Optimizing multiple ML models: Optuna selects the classifier as well as its hyperparameters

In [None]:

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [None]:
def objective(trial):
    """Return the mean 3-fold CV accuracy for one sampled model configuration.

    The trial first chooses a model family ('SVM', 'RandomForest' or
    'GradientBoosting') and then samples only the hyperparameters that
    belong to that family, so the search space is conditional.

    Note: this definition reuses the name ``objective`` and therefore replaces
    the random-forest-only objective defined earlier in the notebook.

    Args:
        trial: Optuna trial used to sample the classifier and its
            hyperparameters.

    Returns:
        Mean accuracy over the three folds, evaluated on the notebook-level
        X_train / y_train.

    Raises:
        ValueError: If the sampled classifier name has no matching branch.
    """
    classifier_name = trial.suggest_categorical('classifier', ['SVM', 'RandomForest', 'GradientBoosting'])

    if classifier_name == 'SVM':
        c = trial.suggest_float('C', 0.1, 100, log=True)
        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid'])

        svm_params = {'C': c, 'kernel': kernel}
        # gamma is a kernel coefficient for 'rbf', 'poly' and 'sigmoid' only; the
        # linear kernel ignores it. Sampling it there would add a dimension that
        # cannot affect the score, so it is only suggested when it matters.
        if kernel != 'linear':
            svm_params['gamma'] = trial.suggest_categorical('gamma', ['scale', 'auto'])

        model = SVC(random_state=42, **svm_params)

    elif classifier_name == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        bootstrap = trial.suggest_categorical('bootstrap', [True, False])

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            bootstrap=bootstrap,
            random_state=42
        )

    elif classifier_name == 'GradientBoosting':
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

        model = GradientBoostingClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42
        )

    else:
        # Guards against a choice being added to the list above without a
        # matching branch, which would otherwise leave `model` unbound.
        raise ValueError(f'Unsupported classifier: {classifier_name!r}')

    score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()
    return score

In [None]:

# create_study() defaults to an unseeded TPE sampler; pass a seeded one so the
# 100-trial search is repeatable on every Restart & Run All.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

[I 2024-10-09 15:59:04,672] A new study created in memory with name: no-name-791bdcf9-7181-45e7-b580-be6fab4bdd34
[I 2024-10-09 15:59:05,963] Trial 0 finished with value: 0.7635009310986964 and parameters: {'classifier': 'RandomForest', 'n_estimators': 258, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 7, 'bootstrap': True}. Best is trial 0 with value: 0.7635009310986964.
[I 2024-10-09 15:59:10,106] Trial 1 finished with value: 0.7486033519553073 and parameters: {'classifier': 'GradientBoosting', 'n_estimators': 269, 'learning_rate': 0.013060897969628999, 'max_depth': 16, 'min_samples_split': 2, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.7635009310986964.
[I 2024-10-09 15:59:10,651] Trial 2 finished with value: 0.7746741154562384 and parameters: {'classifier': 'RandomForest', 'n_estimators': 134, 'max_depth': 8, 'min_samples_split': 3, 'min_samples_leaf': 8, 'bootstrap': False}. Best is trial 2 with value: 0.7746741154562384.
[I 2024-10-09 15:59:10,968] Trial 3

In [None]:

# Report the winning configuration (including the chosen classifier) and its score.
best_trial = study.best_trial
print(f"Best trial parameters: {best_trial.params}")
print(f"Best trial accuracy: {best_trial.value}")

Best trial parameters: {'classifier': 'SVM', 'C': 0.122840915513057, 'kernel': 'linear', 'gamma': 'auto'}
Best trial accuracy: 0.7895716945996275


In [None]:
# One row per trial; parameters a trial did not sample show up as NaN.
trials_df = study.trials_dataframe()
trials_df

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_bootstrap,params_classifier,params_gamma,params_kernel,params_learning_rate,params_max_depth,params_min_samples_leaf,params_min_samples_split,params_n_estimators,state
0,0,0.763501,2024-10-09 15:59:04.674690,2024-10-09 15:59:05.963009,0 days 00:00:01.288319,,True,RandomForest,,,,9.0,7.0,3.0,258.0,COMPLETE
1,1,0.748603,2024-10-09 15:59:05.964861,2024-10-09 15:59:10.105643,0 days 00:00:04.140782,,,GradientBoosting,,,0.013061,16.0,5.0,2.0,269.0,COMPLETE
2,2,0.774674,2024-10-09 15:59:10.107466,2024-10-09 15:59:10.651074,0 days 00:00:00.543608,,False,RandomForest,,,,8.0,8.0,3.0,134.0,COMPLETE
3,3,0.778399,2024-10-09 15:59:10.652833,2024-10-09 15:59:10.967977,0 days 00:00:00.315144,,False,RandomForest,,,,6.0,2.0,8.0,73.0,COMPLETE
4,4,0.675978,2024-10-09 15:59:10.969753,2024-10-09 15:59:11.114229,0 days 00:00:00.144476,81.244121,,SVM,scale,poly,,,,,,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,0.785847,2024-10-09 15:59:49.559132,2024-10-09 15:59:49.596993,0 days 00:00:00.037861,0.287386,,SVM,auto,linear,,,,,,COMPLETE
96,96,0.737430,2024-10-09 15:59:49.598806,2024-10-09 15:59:51.763054,0 days 00:00:02.164248,,,GradientBoosting,,,0.032234,12.0,4.0,3.0,121.0,COMPLETE
97,97,0.785847,2024-10-09 15:59:51.764853,2024-10-09 15:59:51.813682,0 days 00:00:00.048829,0.232742,,SVM,scale,linear,,,,,,COMPLETE
98,98,0.787709,2024-10-09 15:59:51.815426,2024-10-09 15:59:51.849752,0 days 00:00:00.034326,0.177024,,SVM,auto,linear,,,,,,COMPLETE


In [None]:
# How many trials the sampler spent on each classifier.
classifier_per_trial = study.trials_dataframe()['params_classifier']
classifier_per_trial.value_counts()

Unnamed: 0_level_0,count
params_classifier,Unnamed: 1_level_1
SVM,78
RandomForest,12
GradientBoosting,10


In [None]:
# Average objective value (CV accuracy) of the trials for each classifier.
trials_by_classifier = study.trials_dataframe().groupby('params_classifier')
trials_by_classifier['value'].mean()

Unnamed: 0_level_0,value
params_classifier,Unnamed: 1_level_1
GradientBoosting,0.740223
RandomForest,0.768156
SVM,0.776918
