In [3]:
import pandas as pd
import numpy as np
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score


In [4]:
df = pd.read_csv("heart.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [5]:
df_o = pd.get_dummies(df) #One hot encoding
df_o.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                918 non-null    int64  
 1   RestingBP          918 non-null    int64  
 2   Cholesterol        918 non-null    int64  
 3   FastingBS          918 non-null    int64  
 4   MaxHR              918 non-null    int64  
 5   Oldpeak            918 non-null    float64
 6   HeartDisease       918 non-null    int64  
 7   Sex_F              918 non-null    bool   
 8   Sex_M              918 non-null    bool   
 9   ChestPainType_ASY  918 non-null    bool   
 10  ChestPainType_ATA  918 non-null    bool   
 11  ChestPainType_NAP  918 non-null    bool   
 12  ChestPainType_TA   918 non-null    bool   
 13  RestingECG_LVH     918 non-null    bool   
 14  RestingECG_Normal  918 non-null    bool   
 15  RestingECG_ST      918 non-null    bool   
 16  ExerciseAngina_N   918 non

In [6]:
X = df_o.drop("HeartDisease", axis=1).copy()
y = df_o["HeartDisease"].copy()
X.shape, y.shape

((918, 20), (918,))

In [134]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=369)

In [136]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
reporte = classification_report(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print(acc)

0.7934782608695652


In [144]:
def objective(trial):
    global X_train, y_train, X_test
    params = {
        "criterion" : trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"]),
        "splitter" : trial.suggest_categorical("splitter", ["best", "random"]),
        "max_depth" : trial.suggest_int("max_depth", 1,500),
        "min_samples_split" : trial.suggest_float("min_samples_split", 0, 1),
        "min_samples_leaf" : trial.suggest_float("min_samples_leaf", 0, 1),
        #"min_impurity_decrease": trial.suggest_float("min_impurity_decrease", 0, 1),
        "random_state": 369
    }
    model = DecisionTreeClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    return acc 

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials= 1000, n_jobs=-1)


[I 2025-08-21 01:14:26,879] A new study created in memory with name: no-name-b0886908-7e40-4def-ba00-0745a4071626
[I 2025-08-21 01:14:26,921] Trial 3 finished with value: 0.5434782608695652 and parameters: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 122, 'min_samples_split': 0.0374786704944976, 'min_samples_leaf': 0.6499058513424126}. Best is trial 3 with value: 0.5434782608695652.
[I 2025-08-21 01:14:26,934] Trial 1 finished with value: 0.5434782608695652 and parameters: {'criterion': 'entropy', 'splitter': 'random', 'max_depth': 300, 'min_samples_split': 0.6072791417536599, 'min_samples_leaf': 0.571675269722644}. Best is trial 3 with value: 0.5434782608695652.
[I 2025-08-21 01:14:26,934] Trial 0 finished with value: 0.5434782608695652 and parameters: {'criterion': 'log_loss', 'splitter': 'best', 'max_depth': 94, 'min_samples_split': 0.14390378077873245, 'min_samples_leaf': 0.6482334456743469}. Best is trial 3 with value: 0.5434782608695652.
[I 2025-08-21 01:14:26,921] Tria

In [155]:
best_param = study.best_params
print(best_param)
model_tuneado = DecisionTreeClassifier(**best_param)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

{'criterion': 'gini', 'splitter': 'random', 'max_depth': 274, 'min_samples_split': 0.027914544438272612, 'min_samples_leaf': 0.03998550874888765}


0.8115942028985508