# Heart Attack Analysis & Prediction Dataset

In [None]:
# Instalar kagglehub si no está instalado
!pip install kagglehub


import pandas as pd
import kagglehub

In [None]:
# Load the dataset: download it through kagglehub, then read the CSV that
# sits inside the directory returned by the download call.
dataset_handle = "rashikrahmanpritom/heart-attack-analysis-prediction-dataset"
path = kagglehub.dataset_download(dataset_handle)

file_path = f"{path}/heart.csv"
data = pd.read_csv(file_path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/rashikrahmanpritom/heart-attack-analysis-prediction-dataset?dataset_version_number=2...


100%|██████████| 4.11k/4.11k [00:00<00:00, 8.01MB/s]

Extracting files...





In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# Column groups: categorical features are one-hot encoded, continuous ones are standardized.
categorical_cols = ["sex", "cp", "fbs", "restecg", "exng", "slp", "caa", "thall"]
continuous_cols = ["age", "trtbps", "chol", "thalachh", "oldpeak"]

# Separate the features from the target column.
X = data.drop("output", axis=1)
y = data["output"]

# Split BEFORE fitting any transformer or resampling. Fitting the scaler/encoder
# or running SMOTE on the full dataset leaks information from the test rows into
# training (and puts synthetic rows into the test set), which inflates the scores.
# stratify=y keeps the class ratio the same in both partitions.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize continuous features using statistics learned from the training rows only.
scaler = StandardScaler()
X_train_continuous = scaler.fit_transform(X_train_raw[continuous_cols])
X_test_continuous = scaler.transform(X_test_raw[continuous_cols])

# One-hot encode categorical features (sparse_output replaces the old `sparse` argument).
# handle_unknown="ignore" avoids an error if a rare category appears only in the test rows.
encoder = OneHotEncoder(sparse_output=False, drop="first", handle_unknown="ignore")
X_train_categorical = encoder.fit_transform(X_train_raw[categorical_cols])
X_test_categorical = encoder.transform(X_test_raw[categorical_cols])

# Final feature matrices: continuous columns first, then the encoded categorical ones.
X_train_prepared = np.hstack((X_train_continuous, X_train_categorical))
X_test = np.hstack((X_test_continuous, X_test_categorical))

# Oversample the minority class on the training partition only; the test set
# keeps nothing but real observations.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train_prepared, y_train_raw)

print("Datos preparados y divididos:")
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

Datos preparados y divididos:
X_train shape: (264, 22)
y_train shape: (264,)


In [None]:
!pip install lazypredict


## Comparación de modelos utilizando LazyPredict

In [None]:
import sys
import os
from lazypredict.Supervised import LazyClassifier
from contextlib import redirect_stdout


# Fit every LazyPredict classifier on the prepared split while discarding
# whatever is written to stdout during fitting; only the resulting table
# is printed afterwards.
with open(os.devnull, "w") as sink, redirect_stdout(sink):
    clf = LazyClassifier(verbose=0, ignore_warnings=True)
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)


100%|██████████| 31/31 [00:01<00:00, 19.86it/s]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
RandomForestClassifier             0.85               0.85     0.85      0.85   
NearestCentroid                    0.83               0.83     0.83      0.83   
ExtraTreesClassifier               0.82               0.82     0.82      0.82   
KNeighborsClassifier               0.82               0.82     0.82      0.82   
Perceptron                         0.80               0.80     0.80      0.80   
PassiveAggressiveClassifier        0.80               0.80     0.80      0.80   
LGBMClassifier                     0.80               0.80     0.80      0.80   
LabelSpreading                     0.80               0.80     0.80      0.80   
NuSVC                              0.80               0.80     0.80      0.80   
LabelPropagation                   0.80               0.80     0.80      0.80   
BernoulliNB                 




## Mejores modelos
| Model                   | Accuracy | Balanced Accuracy | ROC AUC | F1 Score |
|-------------------------|----------|--------------------|---------|----------|
| RandomForestClassifier  | 0.85     | 0.85               | 0.85    | 0.85     |
| NearestCentroid         | 0.83     | 0.83               | 0.83    | 0.83     |
| ExtraTreesClassifier    | 0.82     | 0.82               | 0.82    | 0.82     |
  

## Búsqueda de hiperparámetros

In [None]:

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import NearestCentroid
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd



# Estimators to tune, each paired with the hyperparameter grid to search.
models_and_parameters = {
    "RandomForestClassifier": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [10, 20, None],
            "min_samples_split": [2, 5, 10]
        }
    },
    "NearestCentroid": {
        "model": NearestCentroid(),
        "params": {
            "metric": ["euclidean", "manhattan"]
        }
    },
    "ExtraTreesClassifier": {
        "model": ExtraTreesClassifier(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [10, 20, None],
            "min_samples_split": [2, 5, 10]
        }
    }
}

# Per-model summary, keyed by model name.
results = {}

for name, model_info in models_and_parameters.items():
    model = model_info["model"]
    params = model_info["params"]

    # Exhaustive 5-fold cross-validated search over the grid, ranked by accuracy.
    grid_search = GridSearchCV(model, params, cv=5, scoring="accuracy", n_jobs=-1)
    grid_search.fit(X_train, y_train)

    results[name] = {
        "Best Params": grid_search.best_params_,
        "Best Score": grid_search.best_score_,
        # With the default refit=True the best configuration is refit on all of
        # X_train, so score() reports its accuracy on the held-out test split.
        "Test Accuracy": grid_search.score(X_test, y_test),
    }

results_df = pd.DataFrame.from_dict(results, orient="index")

# Lift the column-width limit so the "Best Params" dictionaries are printed in
# full instead of being cut off with "...".
with pd.option_context("display.max_colwidth", None):
    print(results_df)


                                                              Best Params  \
RandomForestClassifier  {'max_depth': 20, 'min_samples_split': 2, 'n_e...   
NearestCentroid                                   {'metric': 'manhattan'}   
ExtraTreesClassifier    {'max_depth': 20, 'min_samples_split': 2, 'n_e...   

                        Best Score  
RandomForestClassifier        0.84  
NearestCentroid               0.80  
ExtraTreesClassifier          0.84  


## Mejor modelo

El mejor modelo fue:

RandomForestClassifier        0.84  

Se observan diferencias claras entre utilizar PyCaret en el taller pasado y LazyPredict en este caso.

Los mejores modelos al utilizar PyCaret fueron:
* Ridge Classifier
* Linear Discriminant Analysis
* Logistic Regression

Mientras que usando lazypredict fueron:
* RandomForestClassifier
* NearestCentroid
* ExtraTreesClassifier

Los resultados son bastante diferentes, ya que ambas librerías destacan modelos distintos entre sí; esto se debe principalmente a la validación cruzada que hace PyCaret frente a la división simple (una única partición de entrenamiento y prueba) que hace LazyPredict. Además, PyCaret maneja una configuración inicial más optimizada, lo que se refleja en el buen desempeño de algunos modelos. Adicionalmente, PyCaret hace un ajuste automático de hiperparámetros básicos, lo que ayuda a utilizar mejores configuraciones de los modelos.