# Heart Disease Prediction — Colab Training and Evaluation

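Open this notebook in Google Colab and run all cells; upload `heart_dataset.csv` when the upload prompt appears. Outside Colab, place `heart_dataset.csv` in the working directory or in a `data/` folder before running.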


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
import sys

# The session counts as Colab only when the `google.colab` package has already
# been loaded into this interpreter.
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    # Deferred import: the package is only expected to be available on Colab.
    from google.colab import files

    # Triggers the upload prompt. The return value is kept in `uploaded`,
    # although the visible cells do not read it afterwards.
    uploaded = files.upload()


In [None]:
from pathlib import Path

# Locations searched, in order, for the dataset: the working directory, a
# local or parent `data/` folder, and `/content`.
possible_paths = ['heart_dataset.csv', 'data/heart_dataset.csv', '../data/heart_dataset.csv', '/content/heart_dataset.csv']

# First candidate that exists, or None when nothing matches. Rebinding
# `data_path` unconditionally also prevents a stale value from an earlier run
# of this cell from being reused.
data_path = next((p for p in possible_paths if Path(p).exists()), None)
if data_path is None:
    # Fail with an actionable message instead of a NameError on `data_path`.
    raise FileNotFoundError(
        "Could not find heart_dataset.csv. Looked in: "
        + ", ".join(str(Path(p).resolve()) for p in possible_paths)
        + ". Upload the file or place it in one of these locations."
    )

df = pd.read_csv(data_path)
df.head()


In [None]:
if 'target' not in df.columns:
    # The CSV has no label column, so a proxy label is derived from simple
    # rules: every rule that fires adds one point, and rows scoring 2 or more
    # are labelled positive (1). Rules whose column is absent are skipped.

    def _numeric(column):
        """Return df[column] as a float array; unparseable or missing entries become NaN."""
        coerced = pd.to_numeric(df[column], errors='coerce')
        return coerced.to_numpy(dtype=float, na_value=np.nan)

    def _flag_set(column):
        """Return a boolean array that is True where df[column] holds a true/1 value.

        Values are compared as trimmed, lower-cased text so that booleans,
        ints, floats (1.0) and strings in any casing are all recognised.
        """
        text = df[column].astype(str).str.strip().str.lower()
        return text.isin(['true', '1', '1.0']).to_numpy()

    risk = np.zeros(len(df), dtype=int)

    # Numeric rules: one point when the value reaches the cut-off. NaN compares
    # as False, so missing or unparseable values add nothing.
    cutoffs = {'age': 55, 'trestbps': 140, 'chol': 240, 'oldpeak': 1.0}
    for column, cutoff in cutoffs.items():
        if column in df.columns:
            risk += (_numeric(column) >= cutoff).astype(int)

    # Flag rules: one point when the indicator is set.
    for column in ('exang', 'cp_asymptomatic'):
        if column in df.columns:
            risk += _flag_set(column).astype(int)

    df['target'] = (risk >= 2).astype(int)


In [None]:
# Split the frame into features (every column except the label) and the label.
target_col = 'target'
feature_cols = [c for c in df.columns if c != target_col]
X = df[feature_cols]

# Plain bool columns match neither the object/category selector nor np.number,
# so they would be left out of both column lists below and never reach the
# preprocessing step. Cast them to 0/1 integers so they count as numeric.
bool_cols = X.select_dtypes(include=['bool']).columns.tolist()
X = X.astype({col: int for col in bool_cols})

y = df[target_col]

# Column lists, grouped by dtype, for the downstream preprocessing cell.
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()


In [None]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; the encoder is configured with handle_unknown='ignore'.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column list to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ],
)


In [None]:
# Hold out 20% of the rows for testing with a fixed seed. The split is
# stratified on the label only when the label has exactly two distinct values
# and there are at least 20 rows; otherwise it is a plain random split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=None if (y.nunique() != 2 or len(y) < 20) else y,
)


In [None]:
# Candidate classifiers, keyed by the name used in the results table.
models = dict(
    logistic_regression=LogisticRegression(max_iter=1000, random_state=42),
    decision_tree=DecisionTreeClassifier(random_state=42),
    random_forest=RandomForestClassifier(random_state=42, n_jobs=-1),
)

# One pipeline per classifier: preprocessing followed by the model.
# NOTE(review): every pipeline references the same `preprocessor` object
# rather than its own copy.
pipelines = {
    model_name: Pipeline(steps=[('preprocess', preprocessor), ('model', estimator)])
    for model_name, estimator in models.items()
}


In [None]:
# Fit every pipeline on the training split and score it on the test split.
# `results` collects one metrics row per model; `trained` keeps the fitted
# pipeline together with its test-set predictions for the plotting cells.
results = []
trained = {}

# ROC AUC is undefined when the test labels contain a single class (possible
# with very small, unstratified splits). Record NaN in that case instead of
# letting the metric call abort the whole loop.
auc_defined = y_test.nunique() > 1

for name, pipe in pipelines.items():
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    metrics = {
        'model': name,
        'accuracy': accuracy_score(y_test, y_pred),
        # zero_division=0 reports 0 rather than warning when a class is never predicted.
        'precision': precision_score(y_test, y_pred, zero_division=0),
        'recall': recall_score(y_test, y_pred, zero_division=0),
        'f1': f1_score(y_test, y_pred, zero_division=0),
    }
    if hasattr(pipe.named_steps['model'], 'predict_proba'):
        # Column 1 holds the score for the second class in the model's class order.
        y_proba = pipe.predict_proba(X_test)[:, 1]
        metrics['roc_auc'] = roc_auc_score(y_test, y_proba) if auc_defined else np.nan
    else:
        y_proba = None
        metrics['roc_auc'] = np.nan
    trained[name] = {'pipeline': pipe, 'y_pred': y_pred, 'y_proba': y_proba}
    results.append(metrics)

results_df = pd.DataFrame(results)
results_df


In [None]:
# Pick the model with the highest F1 score (missing scores count as 0) and
# pull out its stored test-set predictions and probabilities.
best_idx = results_df['f1'].fillna(0).idxmax()
best_name = results_df.at[best_idx, 'model']
best = trained[best_name]
y_pred_best, y_proba_best = best['y_pred'], best['y_proba']


In [None]:
# Confusion matrix of the selected model on the test split.
fig, ax = plt.subplots(figsize=(5,4))
cm = confusion_matrix(y_test, y_pred_best)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title(f'Confusion Matrix - {best_name}')
plt.show()

# The ROC curve needs probability scores and both classes in the test labels;
# with a single class the AUC is undefined, so the plot is skipped rather than
# allowed to fail.
if y_proba_best is not None and y_test.nunique() > 1:
    fpr, tpr, _ = roc_curve(y_test, y_proba_best)
    auc = roc_auc_score(y_test, y_proba_best)
    roc_fig, roc_ax = plt.subplots(figsize=(5,4))
    roc_ax.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
    # Diagonal reference line.
    roc_ax.plot([0,1],[0,1],'--', color='gray')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title(f'ROC Curve - {best_name}')
    roc_ax.legend()
    plt.show()
else:
    print('ROC curve skipped: no probability scores, or the test labels contain a single class.')
