# Heart Disease Prediction - Model Training & Evaluation

This notebook implements the complete machine learning pipeline for heart disease prediction, following the plan structure:

## Table of Contents
1. Setup & Data Loading
2. Data Preprocessing 
3. Model Training (Logistic Regression & Random Forest)
4. Hyperparameter Tuning
5. Model Evaluation & Comparison
6. Model Persistence
7. Demo Predictions

**Goal**: Build accurate models to predict heart disease risk and save the best performing model.


In [1]:
# 0) Setup - Import libraries and configure environment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
sys.path.append('../src')

# Scikit-learn imports
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, ConfusionMatrixDisplay, 
                             RocCurveDisplay, classification_report)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import joblib

# Local utilities
from utils import load_and_create_target, plot_confusion_matrix, plot_roc_curve, print_model_metrics

# Configure plotting
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (10, 6)

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

print("All libraries imported successfully!")
print(f"Pandas: {pd.__version__}")
print(f"NumPy: {np.__version__}")
print(f"Scikit-learn imported")


All libraries imported successfully!
Pandas: 2.2.3
NumPy: 2.1.3
Scikit-learn imported


## 1. Data Loading & Preprocessing


In [2]:
# 1) Load data with target creation
print("Loading heart disease dataset...")
df = load_and_create_target('../data/heart_dataset.csv')

print(f"Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Target distribution: {df['target'].value_counts().to_dict()}")

# Separate features and target
y = df['target'].astype(int)
X = df.drop(columns=['target'])

print(f"\nFeatures: {list(X.columns)}")
print(f"Feature count: {X.shape[1]}")
print(f"Sample count: {X.shape[0]}")

# Display first few rows
print("\nFirst few samples:")
X.head()


Loading heart disease dataset...
Dataset loaded successfully!
Shape: (5, 18)
Target distribution: {0: 3, 1: 2}

Features: ['age', 'trestbps', 'chol', 'fbs', 'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'sex_Female', 'sex_Male', 'cp_asymptomatic', 'cp_atypical angina', 'cp_non-anginal', 'cp_typical angina']
Feature count: 17
Sample count: 5

First few samples:


Unnamed: 0,age,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,sex_Female,sex_Male,cp_asymptomatic,cp_atypical angina,cp_non-anginal,cp_typical angina
0,58,130,220,1,normal,150,False,1.4,flat,0,fixed defect,0,1,0,0,0,1
1,67,160,276,0,lv hypertrophy,108,True,1.5,flat,3,normal,0,1,1,0,0,0
2,42,120,230,0,normal,170,False,1.0,upsloping,0,reversable defect,1,0,0,0,1,0
3,50,130,210,0,lv hypertrophy,158,False,0.8,flat,0,normal,0,1,0,0,1,0
4,45,114,230,0,normal,165,False,1.1,downsloping,0,normal,1,0,0,1,0,0


In [3]:
# 2) Basic data cleaning and feature identification
print("Data preprocessing...")

# Check for missing values
missing_values = X.isnull().sum().sum()
print(f"Missing values: {missing_values}")

# Identify feature types
num_cols = X.select_dtypes(include=np.number).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

print(f"Numeric features ({len(num_cols)}): {num_cols}")
print(f"Categorical features ({len(cat_cols)}): {cat_cols}")

# Create preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', 'passthrough', cat_cols)  # Keep categorical as-is since they're already encoded
], remainder='drop')

print("Preprocessor created!")


Data preprocessing...
Missing values: 0
Numeric features (13): ['age', 'trestbps', 'chol', 'fbs', 'thalch', 'oldpeak', 'ca', 'sex_Female', 'sex_Male', 'cp_asymptomatic', 'cp_atypical angina', 'cp_non-anginal', 'cp_typical angina']
Categorical features (4): ['restecg', 'exang', 'slope', 'thal']
Preprocessor created!


## 2. Train/Test Split


In [15]:
# 3) Train-test split
print("Splitting data into train and test sets...")

# Check if we have enough data for proper split
if len(X) >= 10:  # Need enough samples for proper stratified split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, 
        stratify=y if len(np.unique(y)) > 1 else None
    )
elif len(X) >= 4:  # Small dataset - no stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=42  # Larger test size, no stratify
    )
    print("Warning: Small dataset. Using simple split without stratification.")
else:
    print("Warning: Very small dataset. Using all data for both training and testing.")
    X_train, X_test = X, X
    y_train, y_test = y, y

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Training target distribution: {y_train.value_counts().to_dict()}")
print(f"Test target distribution: {y_test.value_counts().to_dict()}")


Splitting data into train and test sets...
Training set: (3, 17)
Test set: (2, 17)
Training target distribution: {0: 2, 1: 1}
Test target distribution: {1: 1, 0: 1}


## 3. Baseline Models Training


In [16]:
# 4) Define and train baseline models
print("Training baseline models...")

# Define models
models = {
    "logistic_regression": LogisticRegression(max_iter=1000, random_state=42),
    "random_forest": RandomForestClassifier(random_state=42, n_jobs=-1)
}

# Store results
results = {}
trained_models = {}

# Train each model
for name, clf in models.items():
    print(f"\nTraining {name.upper()}...")
    
    # Create pipeline
    pipe = Pipeline([
        ('preprocessor', preprocessor), 
        ('classifier', clf)
    ])
    
    # Train model
    pipe.fit(X_train, y_train)
    
    # Make predictions
    y_pred = pipe.predict(X_test)
    y_proba = pipe.predict_proba(X_test)[:, 1] if hasattr(pipe['classifier'], "predict_proba") else None
    
    # Calculate metrics
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_test, y_proba) if y_proba is not None else None
    }
    
    # Store results
    results[name] = metrics
    trained_models[name] = pipe
    
    # Print results
    print(f"{name.upper()} Results:")
    print(f"   Accuracy: {metrics['accuracy']:.4f}")
    print(f"   Precision: {metrics['precision']:.4f}")
    print(f"   Recall: {metrics['recall']:.4f}")
    print(f"   F1 Score: {metrics['f1']:.4f}")
    if metrics['roc_auc']:
        print(f"   ROC AUC: {metrics['roc_auc']:.4f}")

print("\n" + "="*50)
print("BASELINE MODELS TRAINING COMPLETE!")


Training baseline models...

Training LOGISTIC_REGRESSION...


ValueError: could not convert string to float: 'normal'

## 4. Hyperparameter Tuning (Optional)


In [6]:
# Optional: Hyperparameter tuning for better performance
print("Hyperparameter tuning (if dataset is large enough)...")

# Define parameter grids
param_grids = {
    'logistic_regression': {
        'classifier__C': [0.1, 1, 10],
        'classifier__penalty': ['l2'],
        'classifier__solver': ['lbfgs']
    },
    'random_forest': {
        'classifier__n_estimators': [100, 300],
        'classifier__max_depth': [None, 5, 10],
        'classifier__min_samples_split': [2, 5]
    }
}

# Only do hyperparameter tuning if we have enough data
if len(X_train) > 10:
    print("Performing hyperparameter tuning...")
    
    tuned_models = {}
    for name, pipe in trained_models.items():
        if name in param_grids:
            print(f"\nTuning {name.upper()}...")
            
            grid_search = GridSearchCV(
                pipe, 
                param_grids[name],
                cv=min(3, len(X_train)//2),  # Adjust CV folds based on data size
                scoring='roc_auc',
                n_jobs=-1
            )
            
            grid_search.fit(X_train, y_train)
            tuned_models[name] = grid_search.best_estimator_
            
            print(f"Best parameters: {grid_search.best_params_}")
            print(f"Best CV score: {grid_search.best_score_:.4f}")
    
    # Update models if tuning was successful
    if tuned_models:
        trained_models.update(tuned_models)
        print("\nModels updated with best hyperparameters!")
    
else:
    print("Dataset too small for hyperparameter tuning. Using default parameters.")


Hyperparameter tuning (if dataset is large enough)...


NameError: name 'X_train' is not defined

## 5. Model Evaluation & Visualization


In [7]:
# 5) Comprehensive model evaluation and comparison
print("=== COMPREHENSIVE MODEL EVALUATION ===")

# Create comparison DataFrame
results_df = pd.DataFrame(results).T
print("\nModel Comparison:")
print(results_df.round(4))

# Find best model
best_model_name = None
if 'roc_auc' in results_df.columns:
    # Sort by ROC AUC first, then F1
    results_df['combined_score'] = results_df['roc_auc'].fillna(0) + results_df['f1']
    best_model_name = results_df['combined_score'].idxmax()
else:
    # Sort by F1 score if ROC AUC not available
    best_model_name = results_df['f1'].idxmax()

best_model = trained_models[best_model_name]

print(f"\nBEST MODEL: {best_model_name.upper()}")
print(f"Performance: {results_df.loc[best_model_name].to_dict()}")

# Get predictions from best model
y_pred_best = best_model.predict(X_test)
try:
    y_proba_best = best_model.predict_proba(X_test)[:, 1]
except:
    y_proba_best = None

print_model_metrics(y_test, y_pred_best, y_proba_best, best_model_name)


=== COMPREHENSIVE MODEL EVALUATION ===

Model Comparison:
Empty DataFrame
Columns: []
Index: []


KeyError: 'f1'

In [8]:
# 6) Visualizations - Confusion Matrix and ROC Curve
print("Creating visualizations...")

# Create figures directory if it doesn't exist
os.makedirs('../reports', exist_ok=True)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
confusion_fig = plot_confusion_matrix(y_test, y_pred_best, f"Confusion Matrix - {best_model_name.title()}")
plt.savefig('../reports/confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Plot ROC curve if probabilities available
if y_proba_best is not None:
    plt.figure(figsize=(8, 6))
    roc_fig = plot_roc_curve(y_test, y_proba_best, f"ROC Curve - {best_model_name.title()}")
    plt.savefig('../reports/roc_curve.png', dpi=300, bbox_inches='tight')
    plt.show()

# Model comparison chart
plt.figure(figsize=(12, 8))
metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1']
x = np.arange(len(metrics_to_plot))
width = 0.35

for i, (model_name, model_results) in enumerate(results.items()):
    model_scores = [model_results[metric] for metric in metrics_to_plot]
    plt.bar(x + i*width, model_scores, width, label=model_name.replace('_', ' ').title())

plt.xlabel('Metrics')
plt.ylabel('Score')
plt.title('Model Performance Comparison')
plt.xticks(x + width/2, metrics_to_plot)
plt.legend()
plt.ylim(0, 1)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('../reports/model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("Visualizations saved to ../reports/ directory")


Creating visualizations...


NameError: name 'y_test' is not defined

<Figure size 800x600 with 0 Axes>

## 6. Model Persistence


In [9]:
# 7) Save models for future use
print("Saving trained models...")

# Create models directory if it doesn't exist
os.makedirs('../models', exist_ok=True)

# Save all models
saved_models = {}
for name, model in trained_models.items():
    model_path = f'../models/{name}.pkl'
    joblib.dump(model, model_path)
    saved_models[name] = model_path
    print(f"Saved {name}: {model_path}")

# Save best model separately
best_model_path = '../models/best_model.pkl'
joblib.dump(best_model, best_model_path)
saved_models['best_model'] = best_model_path

print(f"\nBest model saved: {best_model_path}")
print(f"Best model type: {best_model_name}")

# Save model metadata
model_info = {
    'best_model_name': best_model_name,
    'best_model_path': best_model_path,
    'performance_metrics': results[best_model_name],
    'feature_columns': list(X.columns),
    'target_classes': [0, 1],
    'model_files': saved_models
}

import json
with open('../models/model_info.json', 'w') as f:
    json.dump(model_info, f, indent=2)

print("Model metadata saved: ../models/model_info.json")


Saving trained models...


NameError: name 'best_model' is not defined

## 7. Demo Predictions


In [10]:
# 8) Demo prediction on sample data
print("=== DEMO PREDICTIONS ===")

if len(X_test) > 0:
    # Use first test sample for demo
    sample_data = X_test.iloc[[0]]
    
    print("Sample patient data:")
    for col, val in sample_data.iloc[0].items():
        print(f"  {col}: {val}")
    
    # Make prediction
    prediction = best_model.predict(sample_data)[0]
    if y_proba_best is not None:
        probability = best_model.predict_proba(sample_data)[0][1]
        print(f"\nPrediction: {'Heart Disease Risk' if prediction == 1 else 'No Heart Disease Risk'}")
        print(f"Probability of heart disease: {probability:.4f} ({probability*100:.1f}%)")
    else:
        print(f"\nPrediction: {'Heart Disease Risk' if prediction == 1 else 'No Heart Disease Risk'}")
    
    print(f"Actual label: {'Heart Disease' if y_test.iloc[0] == 1 else 'No Heart Disease'}")
    print(f"Prediction {'CORRECT' if prediction == y_test.iloc[0] else 'INCORRECT'}")

print("\n" + "="*60)
print("TRAINING COMPLETE! Models ready for deployment.")
print("="*60)


=== DEMO PREDICTIONS ===


NameError: name 'X_test' is not defined