# Heart Disease Prediction - Model Training & Evaluation

This notebook implements the complete machine learning pipeline for heart disease prediction, organized into the following stages:

## Table of Contents
1. Setup & Data Loading
2. Data Preprocessing 
3. Model Training (Logistic Regression & Random Forest)
4. Hyperparameter Tuning
5. Model Evaluation & Comparison
6. Model Persistence
7. Demo Predictions

**Goal**: Build accurate models to predict heart disease risk and save the best-performing model.


In [None]:
# 0) Setup - Import libraries and configure environment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
sys.path.append('../src')

# Scikit-learn imports
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, ConfusionMatrixDisplay, 
                             RocCurveDisplay, classification_report)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import joblib

# Local utilities
from utils import load_and_create_target, plot_confusion_matrix, plot_roc_curve, print_model_metrics

# Plot appearance: matplotlib 'default' style, seaborn "husl" palette,
# and a (10, 6) default figure size.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (10, 6)})

# Silence every warning category for the remainder of the session.
import warnings
warnings.filterwarnings('ignore')

# Environment banner, emitted with one call (one message per line).
print(
    "✅ All libraries imported successfully!",
    f"📊 Pandas: {pd.__version__}",
    f"🔢 NumPy: {np.__version__}",
    "🤖 Scikit-learn imported",
    sep="\n",
)


## 1. Data Loading & Preprocessing


In [None]:
# 1) Read the dataset through the project helper; the returned frame is
#    expected to carry a 'target' column, which is used as the label below.
print("📂 Loading heart disease dataset...")
df = load_and_create_target('../data/heart_dataset.csv')

print("✅ Dataset loaded successfully!")
print(f"📊 Shape: {df.shape}")
target_counts = df['target'].value_counts().to_dict()
print(f"🎯 Target distribution: {target_counts}")

# Split the frame: integer label vector y, and every other column as X.
y = df['target'].astype(int)
X = df.drop(columns=['target'])

sample_count, feature_count = X.shape
print(f"\n📋 Features: {list(X.columns)}")
print(f"🔢 Feature count: {feature_count}")
print(f"📈 Sample count: {sample_count}")

# Preview: the trailing expression is rendered as the cell's output.
print("\n📝 First few samples:")
X.head()


In [None]:
# 2) Inspect data quality and assemble the preprocessing step
print("🧹 Data preprocessing...")

# Total count of null cells over the whole feature matrix.
# This is only reported; nothing in this cell imputes or drops them.
missing_values = X.isna().sum().sum()
print(f"❓ Missing values: {missing_values}")

# Partition the columns by dtype: numeric ones get scaled, the rest do not.
num_cols = list(X.select_dtypes(include='number').columns)
cat_cols = X.columns[~X.columns.isin(num_cols)].tolist()

print(f"🔢 Numeric features ({len(num_cols)}): {num_cols}")
print(f"📋 Categorical features ({len(cat_cols)}): {cat_cols}")

# Standardise numeric columns; forward the remaining columns untouched.
# NOTE(review): cat_cols holds every column whose dtype is NOT numeric, so
# 'passthrough' hands them to the estimator unencoded. That presumably relies
# on cat_cols being empty (or bool-like) for this dataset - confirm.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', 'passthrough', cat_cols),
    ],
    remainder='drop',
)

print("✅ Preprocessor created!")


## 2. Train/Test Split


In [None]:
# 3) Train-test split
print("✂️ Splitting data into train and test sets...")

# Fraction of rows held out for evaluation.
test_fraction = 0.2

# A stratified split is only feasible when every class has at least two
# members and both partitions can hold at least one row per class; otherwise
# train_test_split(..., stratify=y) raises ValueError. The partition sizes are
# computed the same way scikit-learn does for a float test_size (ceil for the
# test partition, the remainder for training).
class_counts = y.value_counts()
n_classes = len(class_counts)
n_test = int(np.ceil(test_fraction * len(X)))
n_train = len(X) - n_test
can_stratify = (
    n_classes > 1
    and class_counts.min() >= 2
    and n_test >= n_classes
    and n_train >= n_classes
)

# Check if we have enough data for proper split
if len(X) > 4:
    if n_classes > 1 and not can_stratify:
        # Previously this situation crashed inside train_test_split.
        print("⚠️ Warning: Too few samples per class to stratify. Falling back to a random split.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_fraction, random_state=42,
        stratify=y if can_stratify else None
    )
else:
    # Degenerate case: evaluation happens on the training rows, so the
    # reported scores are not an estimate of generalisation.
    print("⚠️ Warning: Very small dataset. Using all data for both training and testing.")
    X_train, X_test = X, X
    y_train, y_test = y, y

print(f"📊 Training set: {X_train.shape}")
print(f"📊 Test set: {X_test.shape}")
print(f"🎯 Training target distribution: {y_train.value_counts().to_dict()}")
print(f"🎯 Test target distribution: {y_test.value_counts().to_dict()}")


## 3. Baseline Models Training


In [None]:
# 4) Define and train baseline models
print("🤖 Training baseline models...")

# Untuned baseline estimators; fixed random_state keeps runs repeatable.
models = {
    "logistic_regression": LogisticRegression(max_iter=1000, random_state=42),
    "random_forest": RandomForestClassifier(random_state=42, n_jobs=-1)
}

# results:        model name -> dict of test-set metrics
# trained_models: model name -> fitted Pipeline (preprocessor + classifier)
results = {}
trained_models = {}

# ROC AUC is undefined when the test labels contain a single class, so it is
# skipped (reported as None) in that case instead of failing the whole cell.
roc_auc_defined = len(np.unique(y_test)) > 1

# Train each model
for name, clf in models.items():
    print(f"\n🔄 Training {name.upper()}...")

    # NOTE: every pipeline wraps the same `preprocessor` object (and the
    # classifier object stored in `models`); fit() trains them in place, so the
    # fitted pipelines share one preprocessor instance.
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', clf)
    ])

    # Train model
    pipe.fit(X_train, y_train)

    # Hard predictions, plus positive-class probabilities when available
    y_pred = pipe.predict(X_test)
    y_proba = pipe.predict_proba(X_test)[:, 1] if hasattr(pipe['classifier'], "predict_proba") else None

    can_score_auc = y_proba is not None and roc_auc_defined

    # zero_division=0 reports 0 instead of warning when a class is never predicted
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_test, y_proba) if can_score_auc else None
    }

    # Store results
    results[name] = metrics
    trained_models[name] = pipe

    # Print results
    print(f"✅ {name.upper()} Results:")
    print(f"   Accuracy: {metrics['accuracy']:.4f}")
    print(f"   Precision: {metrics['precision']:.4f}")
    print(f"   Recall: {metrics['recall']:.4f}")
    print(f"   F1 Score: {metrics['f1']:.4f}")
    # Compare against None explicitly: a legitimate score of 0.0 is falsy and
    # would otherwise be silently omitted from the report.
    if metrics['roc_auc'] is not None:
        print(f"   ROC AUC: {metrics['roc_auc']:.4f}")

print("\n" + "="*50)
print("🏆 BASELINE MODELS TRAINING COMPLETE!")
