<a href="https://colab.research.google.com/github/Many871027/cardio_train/blob/main/cardio_train_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the dataset path and the cached
# search results used later in this notebook both live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from scipy.stats import randint, uniform  # Para las distribuciones de parámetros
import matplotlib.pyplot as plt
import seaborn as sns
import joblib  # Para guardar/cargar modelos
import os
import plotly.express as px

# --- Data loading, preprocessing, and evaluation helpers ---
# (load_and_preprocess_data, train_model, and evaluate_model are defined below.)

def load_and_preprocess_data(file_path, sep=';'):
    """Read the delimited dataset and return it ready for modelling.

    The ``age`` column is divided by 365 and truncated to an integer
    (presumably a days-to-whole-years conversion -- confirm against the
    dataset description), and the ``id`` column is removed.

    Args:
        file_path: Path or file-like object accepted by ``pandas.read_csv``.
        sep: Field delimiter used in the file. Defaults to ``';'``.

    Returns:
        pandas.DataFrame: the loaded data without ``id`` and with an
        integer ``age`` column.
    """
    raw = pd.read_csv(file_path, sep=sep)
    # astype(int) truncates toward zero, so partial units are discarded.
    age_truncated = (raw['age'] / 365).astype(int)
    return raw.assign(age=age_truncated).drop(columns='id')

def train_model(model, X_train, y_train):
    """Fit ``model`` on the training split and hand the same object back.

    Args:
        model: Any object exposing ``fit(X, y)``.
        X_train: Training features, passed to ``fit`` unchanged.
        y_train: Training labels, passed to ``fit`` unchanged.

    Returns:
        The ``model`` object that was passed in, after ``fit`` has run.
    """
    # Whatever ``fit`` returns is ignored on purpose: callers always receive
    # the exact object they supplied.
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Print train/test accuracy, confusion matrix and classification report.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_train: Training features, used only for the training accuracy.
        y_train: True labels for ``X_train``.
        X_test: Held-out features.
        y_test: True labels for ``X_test``.

    Returns:
        The test-set accuracy as returned by ``accuracy_score``.
    """
    # Predict each split exactly once and reuse the result for every metric;
    # prediction can be expensive (e.g. KNN over the full test set).
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    train_accuracy = accuracy_score(y_train, train_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)
    conf_matrix = confusion_matrix(y_test, test_predictions)
    class_report = classification_report(y_test, test_predictions)

    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}\n")
    print(f"Confusion Matrix:\n{conf_matrix}\n")
    print(f"Classification Report:\n{class_report}")

    return test_accuracy

# --- Load the data (adjust the path if needed) ---
file_path = "/content/drive/MyDrive/Colab Notebooks/3. cardio_train.csv"  # your path goes here
df = load_and_preprocess_data(file_path)

# 'cardio' is the target column; every remaining column is used as a feature.
y = df['cardio']
X = df.drop(columns='cardio')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)


In [6]:

# --- Base models (scaling is done inside a Pipeline where it is used) ---
# IMPORTANT: the Pipeline objects are stored *inside* the models dictionary, so
# the scaler travels with the estimator wherever the model is fitted or tuned.
def _scaled(step_name, estimator):
    """Return a Pipeline that applies StandardScaler and then ``estimator``.

    ``step_name`` becomes the prefix of the tunable parameter names
    (e.g. ``'knn'`` -> ``knn__n_neighbors``).
    """
    return Pipeline([('scaler', StandardScaler()), (step_name, estimator)])


models = {
    "KNN": _scaled('knn', KNeighborsClassifier()),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Gradient Boosting": _scaled('gb', GradientBoostingClassifier(random_state=42)),
    "XGBoost": _scaled(
        'xgb',
        XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    ),
    "Extra Trees": ExtraTreesClassifier(random_state=42),
}

# --- Parameter distributions sampled by RandomizedSearchCV ---
# Keys must match the entries of `models`; parameters of pipeline-wrapped
# estimators carry the step name as a prefix (knn__, gb__, xgb__).
def _tree_ensemble_space():
    """Return a fresh search space shared by Random Forest and Extra Trees."""
    return {
        'n_estimators': randint(100, 501),  # number of trees
        'max_depth': randint(3, 21),
        'min_samples_split': randint(2, 21),
        'min_samples_leaf': randint(1, 21),
        'max_features': [None, 'sqrt', 'log2'],  # features considered per split
        'bootstrap': [True, False],
    }


param_distributions = {
    "KNN": {
        'knn__n_neighbors': randint(3, 21),  # number of neighbours
        'knn__weights': ['uniform', 'distance'],
        'knn__p': [1, 2],  # p=1: Manhattan, p=2: Euclidean
    },
    "Decision Tree": {
        'max_depth': randint(3, 21),  # maximum tree depth
        'min_samples_split': randint(2, 21),  # minimum samples needed to split
        'min_samples_leaf': randint(1, 21),  # minimum samples in a leaf
        'criterion': ['gini', 'entropy'],
    },
    "Random Forest": _tree_ensemble_space(),
    "AdaBoost": {
        'n_estimators': randint(50, 201),
        'learning_rate': uniform(0.01, 0.5),  # learning rate
    },
    "Gradient Boosting": {
        'gb__n_estimators': randint(100, 501),
        'gb__learning_rate': uniform(0.01, 0.2),
        'gb__max_depth': randint(3, 11),
        'gb__min_samples_split': randint(2, 11),
        'gb__min_samples_leaf': randint(1, 11),
        'gb__subsample': uniform(0.7, 0.3),  # fraction of samples per tree
        'gb__max_features': [None, 'sqrt', 'log2'],
    },
    "XGBoost": {
        'xgb__n_estimators': randint(100, 501),
        'xgb__learning_rate': uniform(0.01, 0.2),
        'xgb__max_depth': randint(3, 11),
        'xgb__subsample': uniform(0.7, 0.3),
        'xgb__colsample_bytree': uniform(0.7, 0.3),
        'xgb__gamma': uniform(0, 0.5),  # gamma regularisation
    },
    "Extra Trees": _tree_ensemble_space(),
}

# --- Run RandomizedSearchCV with an on-disk checkpoint ---
def optimize_and_evaluate(model_name, model, param_dist, X_train, y_train, X_test, y_test,
                          n_iter=20, cv=5, save_dir="/content/drive/MyDrive"):
    """Tune ``model`` with RandomizedSearchCV, cache the search, and evaluate it.

    If a usable checkpoint exists at ``<save_dir>/<model_name>_random_search.pkl``
    it is loaded instead of running the search. A checkpoint that cannot be
    loaded, or that does not hold a fitted search, is reported and replaced by
    a fresh search rather than making the whole call fail.

    NOTE: the checkpoint is keyed by ``model_name`` only. Changing
    ``param_dist``, ``n_iter``, ``cv`` or the training data does NOT invalidate
    it; delete the file to force a new search.

    Args:
        model_name: Label used in log messages and in the checkpoint file name.
        model: Estimator (or Pipeline) to tune.
        param_dist: Parameter distributions passed to RandomizedSearchCV.
        X_train, y_train: Data the search is fitted on.
        X_test, y_test: Held-out data used for the final evaluation.
        n_iter: Number of parameter settings sampled by the search.
        cv: Cross-validation setting forwarded to RandomizedSearchCV.
        save_dir: Directory holding the checkpoint file. The default keeps the
            original Google Drive location.

    Returns:
        The test accuracy reported by ``evaluate_model``, or ``None`` if the
        search or the evaluation raised an exception.
    """
    save_path = os.path.join(save_dir, f"{model_name}_random_search.pkl")

    try:
        random_search = None

        if os.path.exists(save_path):
            try:
                # SECURITY: joblib.load unpickles the file, which can execute
                # arbitrary code. Only load checkpoints created by this notebook.
                candidate = joblib.load(save_path)
                if not hasattr(candidate, "best_estimator_"):
                    raise TypeError("el archivo no contiene una búsqueda ajustada")
                random_search = candidate
                print(f"Cargado RandomizedSearchCV para {model_name} desde {save_path}")
            except Exception as load_error:
                # A truncated or incompatible checkpoint must not block the
                # model forever: report it and fall through to a new search.
                print(f"Checkpoint inválido para {model_name} en {save_path} ({load_error}). "
                      f"Repitiendo RandomizedSearchCV.")
        else:
            print(f"No se encontró checkpoint para {model_name}. Comenzando RandomizedSearchCV.")

        if random_search is None:
            random_search = RandomizedSearchCV(
                model,
                param_dist,
                n_iter=n_iter,
                cv=cv,
                scoring='accuracy',
                n_jobs=-1,
                random_state=42,
                verbose=0  # kept quiet so the per-model report stays readable
            )
            random_search.fit(X_train, y_train)
            joblib.dump(random_search, save_path)  # persist only after a successful fit
            print(f"RandomizedSearchCV para {model_name} guardado en {save_path}")

        best_model = random_search.best_estimator_
        print(f"Mejores parámetros para {model_name}: {random_search.best_params_}")
        test_accuracy = evaluate_model(best_model, X_train, y_train, X_test, y_test)
        return test_accuracy

    except Exception as e:
        # Deliberate best-effort: one failing model must not abort the whole
        # comparison loop, so the error is reported and None is returned.
        print(f"Error en optimize_and_evaluate para {model_name}: {e}")
        return None

# --- Main optimisation and evaluation loop ---
# `results` maps each model name to its test accuracy; models whose
# optimisation returned None are left out.
results = {}
for name, model in models.items():
    print(f"--- Optimizando y Evaluando {name} ---")

    if name not in param_distributions:
        # No search space defined for this model: fit it as configured.
        print(f"No se encontraron distribuciones de parámetros para {name}. Entrenando sin optimización.")
        trained_model = train_model(model, X_train, y_train)
        test_accuracy = evaluate_model(trained_model, X_train, y_train, X_test, y_test)
        results[name] = test_accuracy
        continue

    test_accuracy = optimize_and_evaluate(
        name, model, param_distributions[name], X_train, y_train, X_test, y_test
    )
    if test_accuracy is not None:
        results[name] = test_accuracy

# --- Model comparison (Plotly) ---
print("\n--- Comparación de Modelos ---")
results_df = (
    pd.DataFrame(list(results.items()), columns=['Model', 'Test Accuracy'])
    .sort_values(by='Test Accuracy', ascending=False)  # sort before plotting
)
print(results_df)

# Horizontal bar chart: one bar per model, coloured and labelled by its accuracy.
fig = px.bar(
    results_df,
    x='Test Accuracy',
    y='Model',
    title='Comparación de Modelos (Test Accuracy)',
    color='Test Accuracy',  # colour encodes the accuracy
    color_continuous_scale=px.colors.sequential.Viridis,
    template='plotly_dark',  # dark theme (optional)
    text='Test Accuracy',  # value shown on each bar
)

# Four-decimal labels placed outside the bars.
fig.update_traces(texttemplate='%{text:.4f}', textposition='outside')
# Axis titles and a fixed 0-1 range for the accuracy axis.
fig.update_layout(xaxis_title='Accuracy', yaxis_title='Model', xaxis_range=[0, 1])
fig.show()

--- Optimizando y Evaluando KNN ---
Cargado RandomizedSearchCV para KNN desde /content/drive/MyDrive/KNN_random_search.pkl
Mejores parámetros para KNN: {'knn__n_neighbors': 18, 'knn__p': 1, 'knn__weights': 'distance'}
Training Accuracy: 0.9819
Test Accuracy: 0.6782

Confusion Matrix:
[[7329 3132]
 [3626 6913]]

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.70      0.68     10461
           1       0.69      0.66      0.67     10539

    accuracy                           0.68     21000
   macro avg       0.68      0.68      0.68     21000
weighted avg       0.68      0.68      0.68     21000

--- Optimizando y Evaluando Decision Tree ---
Cargado RandomizedSearchCV para Decision Tree desde /content/drive/MyDrive/Decision Tree_random_search.pkl
Mejores parámetros para Decision Tree: {'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 14, 'min_samples_split': 17}
Training Accuracy: 0.7329
Test Accuracy: 0.7344

Confus

In [7]:
# prompt: verifica la conexion a git hub de este archivo

!git clone https://github.com/[your_username]/[your_repository_name].git


fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git
