### BAYESIAN OPTIMIZATION WITH OPTUNA

In [12]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score
import os
import pandas as pd
import kagglehub

In [6]:
# Download latest version
path = kagglehub.dataset_download("fedesoriano/heart-failure-prediction")
print("Path to dataset files:", path)

file_name = os.listdir(path)[0]
data_url = os.path.join(path, file_name)
data = pd.read_csv(data_url)
data.head()

Path to dataset files: /home/codespace/.cache/kagglehub/datasets/fedesoriano/heart-failure-prediction/versions/1


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [9]:
# encode the categorical variables

cat_cols = data.select_dtypes(include='object').columns
for column in cat_cols:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])

data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0


In [11]:
# split and scale the dataset

X = data.drop(columns=['HeartDisease'])
y = data['HeartDisease']


scaler = StandardScaler()

X = scaler.fit_transform(X)
X

array([[-1.4331398 ,  0.51595242,  0.22903206, ..., -0.8235563 ,
        -0.83243239,  1.05211381],
       [-0.47848359, -1.93816322,  1.27505906, ..., -0.8235563 ,
         0.10566353, -0.59607813],
       [-1.75135854,  0.51595242,  0.22903206, ..., -0.8235563 ,
        -0.83243239,  1.05211381],
       ...,
       [ 0.37009972,  0.51595242, -0.81699495, ...,  1.21424608,
         0.29328271, -0.59607813],
       [ 0.37009972, -1.93816322,  0.22903206, ..., -0.8235563 ,
        -0.83243239, -0.59607813],
       [-1.64528563,  0.51595242,  1.27505906, ..., -0.8235563 ,
        -0.83243239,  1.05211381]], shape=(918, 11))

In [13]:
# train the base model.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=23,
                                                    stratify=y)

model = DecisionTreeClassifier(random_state=23)
model.fit(X_train, y_train)
test_preds = model.predict(X_test)
print(f'test score: {f1_score(y_test, test_preds)}')

test score: 0.7461139896373057


#### TUNING WITH OPTUNA

In [17]:
# create objective function

def objective(trial):

    # define the params
    max_features = trial.suggest_float('max_features', 0.2, 0.9)
    max_depth = trial.suggest_int('max_depth', 2, 20)
    criterion = trial.suggest_categorical('criterion', ['entropy','gini','log_loss'])

    # init the model
    classifier = DecisionTreeClassifier(max_depth=max_depth,
                                        criterion=criterion, random_state=23,max_features=max_features)
    score = cross_val_score(estimator=classifier, X=X, y=y,
                            scoring='f1', cv=5).mean()
    
    return score


# Create a study object and optimize the objective function.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=23))

# optimize the study
study.optimize(objective, n_trials=100)

[I 2025-08-08 13:27:57,594] A new study created in memory with name: no-name-3881d097-8fa6-4ce8-8489-2b8af2e1673b
[I 2025-08-08 13:27:57,623] Trial 0 finished with value: 0.7849665158193939 and parameters: {'max_features': 0.5621085186926125, 'max_depth': 19, 'criterion': 'entropy'}. Best is trial 0 with value: 0.7849665158193939.
[I 2025-08-08 13:27:57,645] Trial 1 finished with value: 0.8136358757398648 and parameters: {'max_features': 0.6803554596662268, 'max_depth': 5, 'criterion': 'gini'}. Best is trial 1 with value: 0.8136358757398648.
[I 2025-08-08 13:27:57,668] Trial 2 finished with value: 0.7650332751317521 and parameters: {'max_features': 0.20172541684062034, 'max_depth': 18, 'criterion': 'entropy'}. Best is trial 1 with value: 0.8136358757398648.
[I 2025-08-08 13:27:57,696] Trial 3 finished with value: 0.7546352388500593 and parameters: {'max_features': 0.8848988412103815, 'max_depth': 18, 'criterion': 'gini'}. Best is trial 1 with value: 0.8136358757398648.
[I 2025-08-08 13

### Visualize the Performance

In [18]:
import plotly.express as px

In [19]:
optuna.visualization.plot_param_importances(study)

In [20]:
optuna.visualization.plot_optimization_history(study)

In [21]:
optuna.visualization.plot_parallel_coordinate(study)

In [22]:
study.best_params

{'max_features': 0.6292840455296023, 'max_depth': 2, 'criterion': 'entropy'}