## Phase 1: Setup & Data Loading

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
import joblib
import warnings
warnings.filterwarnings('ignore')

# Plot defaults applied to every figure created later in the notebook
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

# Report the versions of the core data libraries for reproducibility
print("‚úì Libraries imported successfully")
for lib_label, lib_module in (("Pandas", pd), ("NumPy", np)):
    print(f"{lib_label} version: {lib_module.__version__}")

### Load Dataset with Column Names

In [None]:
# The data file has no header row, so names are supplied here in file order.
column_names = [
    'age',       # Age in years
    'sex',       # Sex (1 = male; 0 = female)
    'cp',        # Chest pain type (1-4)
    'trestbps',  # Resting blood pressure (mm Hg)
    'chol',      # Serum cholesterol (mg/dl)
    'fbs',       # Fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
    'restecg',   # Resting ECG results (0-2)
    'thalach',   # Maximum heart rate achieved
    'exang',     # Exercise induced angina (1 = yes; 0 = no)
    'oldpeak',   # ST depression induced by exercise
    'slope',     # Slope of peak exercise ST segment (1-3)
    'ca',        # Number of major vessels colored by fluoroscopy (0-3)
    'thal',      # Thalassemia (3 = normal; 6 = fixed defect; 7 = reversible defect)
    'target',    # Diagnosis (0 = no disease; 1-4 = disease present)
]

# Read the headerless CSV; a literal '?' in any field is parsed as NaN
df = pd.read_csv(
    '../data/processed.cleveland.data',
    names=column_names,
    header=None,
    na_values='?',
)

# Collapse the diagnosis into a binary label: 0 stays 0, every other value
# becomes 1
df['target'] = df['target'].ne(0).astype('int64')

# Persist the loaded frame for reference.
# NOTE: this snapshot is written after the target has been binarized, so the
# file does not contain the original multi-level diagnosis values.
df.to_csv('../data/df_raw.csv', index=False)

n_rows, n_cols = df.shape
print(f"‚úì Dataset loaded: {n_rows} rows, {n_cols} columns")
print("‚úì Raw data saved to '../data/df_raw.csv'")

## Phase 2: Exploratory Data Analysis (EDA)

### 2.1 Initial Inspection

In [None]:
# Preview the top of the frame (heading and table on separate lines)
print("First 5 rows of the dataset:", df.head(), sep="\n")

In [None]:
# Dataset information
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly; wrapping it in print() would add a stray "None" line.
print("\nDataset Info:")
df.info()

In [None]:
# Descriptive statistics for the columns of the loaded frame
summary_stats = df.describe()
print("\nSummary Statistics:")
print(summary_stats)

### 2.2 Missing Values Analysis

In [None]:
# Count NaNs per column and express each count as a percentage of all rows
missing_counts = df.isna().sum()
missing_pct = missing_counts / len(df) * 100

# Tabulate both measures, columns with the most gaps first
missing_df = pd.DataFrame(
    {'Missing Count': missing_counts, 'Percentage': missing_pct}
)
missing_df = missing_df.sort_values('Missing Count', ascending=False)

# Only columns that actually contain gaps are listed
has_gaps = missing_df['Missing Count'] > 0
print("Missing Values Summary:")
print(missing_df.loc[has_gaps])
print(f"\nTotal missing values: {missing_counts.sum()}")

### 2.3 Target Variable Analysis

In [None]:
# Target distribution: class counts, indexed by class label (0 / 1)
target_counts = df['target'].value_counts()
print("Target Distribution:")
print(target_counts)
print(f"\nClass Balance:")
print(f"No Disease (0): {target_counts[0]} ({target_counts[0]/len(df)*100:.1f}%)")
print(f"Disease (1): {target_counts[1]} ({target_counts[1]/len(df)*100:.1f}%)")

# Visualize target distribution
plt.figure(figsize=(8, 5))
sns.countplot(data=df, x='target', palette='Set2')
plt.title('Distribution of Target Variable (Heart Disease)', fontsize=14, fontweight='bold')
plt.xlabel('Target (0 = No Disease, 1 = Disease)', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks([0, 1], ['No Disease', 'Disease'])

# value_counts() orders classes by frequency, while the tick labels above place
# class 0 at position 0 and class 1 at position 1. Sorting by class label keeps
# each annotation above its own bar even when class 1 is the majority.
for bar_pos, count in enumerate(target_counts.sort_index()):
    plt.text(bar_pos, count + 5, str(count), ha='center', fontweight='bold')
plt.tight_layout()
plt.show()

### 2.4 Feature Distributions - Numerical Features

In [None]:
# Continuous-valued columns examined in the numeric EDA plots
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# One histogram per feature on a flattened 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for ax, col in zip(axes, numerical_features):
    ax.hist(df[col].dropna(), bins=30, color='skyblue', edgecolor='black')
    ax.set_title(f'Distribution of {col}', fontweight='bold')
    ax.set(xlabel=col, ylabel='Frequency')
    ax.grid(alpha=0.3)

# Five features on six axes: blank out the unused last panel
axes[-1].axis('off')

plt.tight_layout()
plt.show()

### 2.5 Feature Distributions - Categorical Features

In [None]:
# Discrete-valued columns examined in the categorical EDA plots
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# One bar chart per feature on a flattened 3x3 grid
fig, axes = plt.subplots(3, 3, figsize=(16, 12))
axes = axes.ravel()

for ax, col in zip(axes, categorical_features):
    # Category frequencies, ordered by category value rather than by count
    level_counts = df[col].value_counts().sort_index()
    level_counts.plot(kind='bar', ax=ax, color='coral', edgecolor='black')
    ax.set_title(f'Distribution of {col}', fontweight='bold')
    ax.set(xlabel=col, ylabel='Count')
    ax.grid(alpha=0.3, axis='y')
    ax.tick_params(axis='x', rotation=0)

# Eight features on nine axes: blank out the unused last panel
axes[-1].axis('off')

plt.tight_layout()
plt.show()

### 2.6 Correlation Analysis

In [None]:
# Correlation heatmap over every column of the frame (features and target)
plt.figure(figsize=(14, 10))
correlation_matrix = df.corr()
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={'shrink': 0.8}
)
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Features ranked by absolute correlation with the target
print("\nFeatures Most Correlated with Target:")
target_corr = correlation_matrix['target'].abs().sort_values(ascending=False)
# Remove the target's self-correlation by label instead of assuming it sorts
# first; a positional slice would drop the wrong row if another column tied
# at 1.0.
print(target_corr.drop('target'))

### 2.7 Boxplots by Target Class

In [None]:
# Compare each numerical feature's spread between the two target classes
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for ax, col in zip(axes, numerical_features):
    sns.boxplot(data=df, x='target', y=col, ax=ax, palette='Set2')
    ax.set_title(f'{col} by Target', fontweight='bold')
    ax.set(xlabel='Target (0=No Disease, 1=Disease)', ylabel=col)
    ax.grid(alpha=0.3, axis='y')

# The grid has one more panel than there are features; hide the last one
axes[-1].axis('off')

plt.tight_layout()
plt.show()

## Phase 3: Data Preprocessing

### 3.1 Handle Missing Values

In [None]:
# Work on a copy so the loaded frame `df` stays untouched
df_processed = df.copy()

# Strategy for missing values:
# - 'ca' (number of vessels): mode imputation (most common value)
# - 'thal': mode imputation (most common value)

print("Missing values before imputation:")
missing_before = df_processed.isnull().sum()
print(missing_before[missing_before > 0])

# Impute 'ca' with mode.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on `df_processed['ca']` is chained assignment,
# which pandas deprecates and which does not update the parent frame under
# Copy-on-Write.
if df_processed['ca'].isnull().any():
    ca_mode = df_processed['ca'].mode()[0]
    df_processed['ca'] = df_processed['ca'].fillna(ca_mode)
    print(f"\n‚úì Imputed 'ca' missing values with mode: {ca_mode}")

# Impute 'thal' with mode (same assignment pattern as above)
if df_processed['thal'].isnull().any():
    thal_mode = df_processed['thal'].mode()[0]
    df_processed['thal'] = df_processed['thal'].fillna(thal_mode)
    print(f"‚úì Imputed 'thal' missing values with mode: {thal_mode}")

# Verify the result instead of assuming it: only 'ca' and 'thal' are imputed,
# so gaps in any other column would otherwise pass silently.
missing_after = df_processed.isnull().sum()
remaining_missing = int(missing_after.sum())
print("\nMissing values after imputation:")
print(remaining_missing)
if remaining_missing == 0:
    print("‚úì All missing values handled")
else:
    unresolved_cols = list(missing_after[missing_after > 0].index)
    print(f"Warning: missing values remain in columns: {unresolved_cols}")

### 3.2 Feature Engineering - Categorical Encoding

In [None]:
# Encoding plan:
# - sex, fbs, exang are already 0/1 and are left unchanged
# - cp, restecg, slope, thal are multi-class and get one-hot encoded
# - ca is a vessel count (0-3) and is kept as a numeric column
encode_cols = ['cp', 'restecg', 'slope', 'thal']

print(f"Encoding categorical features: {encode_cols}")
print(f"Shape before encoding: {df_processed.shape}")

# One-hot encode; drop_first removes one indicator per feature to avoid
# perfectly collinear columns
df_encoded = pd.get_dummies(
    df_processed,
    columns=encode_cols,
    prefix=encode_cols,
    drop_first=True,
)

# Net change in column count (source columns removed, indicators added)
n_columns_added = df_encoded.shape[1] - df_processed.shape[1]

print(f"Shape after encoding: {df_encoded.shape}")
print("‚úì One-hot encoding complete")
print(f"\nNew columns: {n_columns_added} added")

### 3.3 Prepare Features and Target

In [None]:
# Split the encoded frame into the label column and everything else
y = df_encoded['target']
X = df_encoded.drop(columns=['target'])

# Column order of X, kept so arrays produced later can be re-labelled
feature_names = X.columns.tolist()

print(f"Feature matrix (X): {X.shape}")
print(f"Target vector (y): {y.shape}")
print(f"\nFeatures: {feature_names}")

### 3.4 Train-Test Split

In [None]:
# 80/20 hold-out split; stratifying on y keeps the class ratio the same in
# both partitions, and the fixed seed makes the split reproducible
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

# Show the class counts in each partition
for split_label, split_target in (("training", y_train), ("test", y_test)):
    print(f"\nClass distribution in {split_label} set:")
    print(split_target.value_counts())

### 3.5 Feature Scaling

In [None]:
# Standardize every column of the feature matrix with StandardScaler
# (this includes the binary and one-hot indicator columns, not only the
# continuous ones).
# Important: fit only on training data to prevent data leakage.
from pathlib import Path

scaler = StandardScaler()

# Learn the scaling parameters from the training set and apply them to it
X_train_scaled = scaler.fit_transform(X_train)

# Apply the training-set parameters to the test set (no re-fitting)
X_test_scaled = scaler.transform(X_test)

# The scaler returns plain arrays; restore column names and row indices
X_train_scaled = pd.DataFrame(X_train_scaled, columns=feature_names, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=feature_names, index=X_test.index)

print("‚úì Feature scaling complete")
print(f"\nScaled training data sample:")
print(X_train_scaled.head())

# Save scaler for future use. The output directory is created first so that a
# fresh checkout without a models/ folder does not fail on the dump.
Path('../models').mkdir(parents=True, exist_ok=True)
joblib.dump(scaler, '../models/scaler.pkl')
print("\n‚úì Scaler saved to '../models/scaler.pkl'")

## Phase 4: Model Training

### 4.1 Model 1 - Logistic Regression (Baseline)

In [None]:
# Baseline classifier: fit logistic regression on the standardized training set
print("Training Logistic Regression...")
model_lr = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_scaled, y_train
)

# Hold-out predictions, reused by the evaluation cells
y_pred_lr = model_lr.predict(X_test_scaled)

# Persist the fitted estimator
joblib.dump(model_lr, '../models/logistic_regression.pkl')

print("‚úì Logistic Regression trained and saved")

### 4.2 Model 2 - K-Nearest Neighbors

In [None]:
# Fit a 5-neighbour KNN classifier on the standardized training set
print("Training K-Nearest Neighbors...")
model_knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Hold-out predictions, reused by the evaluation cells
y_pred_knn = model_knn.predict(X_test_scaled)

# Persist the fitted estimator
joblib.dump(model_knn, '../models/knn.pkl')

print("‚úì KNN trained and saved")

### 4.3 Model 3 - Support Vector Machine

In [None]:
# Fit a support vector classifier; probability=True asks the estimator to
# also support probability estimates
print("Training Support Vector Machine...")
model_svc = SVC(random_state=42, probability=True).fit(X_train_scaled, y_train)

# Hold-out predictions, reused by the evaluation cells
y_pred_svc = model_svc.predict(X_test_scaled)

# Persist the fitted estimator
joblib.dump(model_svc, '../models/svm.pkl')

print("‚úì SVM trained and saved")

### 4.4 Model 4 - Random Forest

In [None]:
# Fit a 100-tree random forest on the standardized training set
print("Training Random Forest...")
model_rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(
    X_train_scaled, y_train
)

# Hold-out predictions, reused by the evaluation cells
y_pred_rf = model_rf.predict(X_test_scaled)

# Persist the fitted estimator
joblib.dump(model_rf, '../models/random_forest.pkl')

print("‚úì Random Forest trained and saved")

## Phase 5: Model Evaluation

### 5.1 Evaluation Helper Function

In [None]:
def evaluate_model(model_name, y_true, y_pred):
    """Print an evaluation report for one binary classifier and return its metrics.

    Prints the confusion matrix, its four cells, and a per-class
    classification report, then computes accuracy, precision, recall and F1.

    Args:
        model_name: Label used in the printed header and in the result dict.
        y_true: True binary labels (0 = no disease, 1 = disease).
        y_pred: Predicted binary labels, aligned with ``y_true``.

    Returns:
        dict with keys 'Model', 'Accuracy', 'Precision', 'Recall',
        'F1-Score' and 'False Negatives'.
    """
    # Pin both classes explicitly so the confusion matrix is always 2x2, even
    # when the labels or predictions contain a single class; otherwise the
    # four-way unpacking below raises ValueError.
    class_labels = [0, 1]

    print(f"\n{'='*60}")
    print(f"{model_name} - Evaluation Results")
    print(f"{'='*60}")

    # Confusion Matrix (rows = actual class, columns = predicted class)
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)
    print("\nConfusion Matrix:")
    print(cm)

    # Flattening the 2x2 matrix row by row yields TN, FP, FN, TP
    tn, fp, fn, tp = cm.ravel()
    print(f"\nTrue Negatives: {tn}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn} ‚Üê Critical for medical diagnosis")
    print(f"True Positives: {tp}")

    # Classification Report; `labels` keeps the two target names aligned with
    # the two classes regardless of which classes actually occur
    print("\nClassification Report:")
    print(classification_report(
        y_true,
        y_pred,
        labels=class_labels,
        target_names=['No Disease', 'Disease'],
    ))

    # Summary metrics (precision/recall/F1 are computed for the positive class)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    return {
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        # Plain int rather than a NumPy scalar, for clean display/serialization
        'False Negatives': int(fn)
    }

### 5.2 Evaluate All Models

In [None]:
# Run the shared evaluation routine on every baseline model's hold-out
# predictions, in training order, and collect the metric dicts
model_predictions = (
    ('Logistic Regression', y_pred_lr),
    ('K-Nearest Neighbors', y_pred_knn),
    ('Support Vector Machine', y_pred_svc),
    ('Random Forest', y_pred_rf),
)

results = [
    evaluate_model(display_name, y_test, predicted)
    for display_name, predicted in model_predictions
]

### 5.3 Model Comparison Summary

In [None]:
# Create comparison DataFrame, best model first.
# Recall is the primary criterion. With a small test set several models can
# reach the same recall, so F1-Score and then Accuracy break ties instead of
# leaving the ranking to the order in which the models were evaluated.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(
    ['Recall', 'F1-Score', 'Accuracy'], ascending=False
)

print("\n" + "="*80)
print("MODEL COMPARISON SUMMARY")
print("="*80)
print("\nNote: Recall (Sensitivity) is the most critical metric for medical diagnosis.")
print("High recall minimizes False Negatives (missing actual disease cases).\n")

# Display with formatting
print(results_df.to_string(index=False))

# Highlight best model (first row after sorting)
best_model = results_df.iloc[0]
print(f"\n{'='*80}")
print(f"üèÜ BEST MODEL: {best_model['Model']}")
print(f"{'='*80}")
print(f"Recall: {best_model['Recall']:.4f}")
print(f"Accuracy: {best_model['Accuracy']:.4f}")
print(f"Precision: {best_model['Precision']:.4f}")
print(f"F1-Score: {best_model['F1-Score']:.4f}")
print(f"False Negatives: {int(best_model['False Negatives'])}")

### 5.4 Visualize Model Performance

In [None]:
# One bar chart per metric on a 2x2 grid, each with its own colour
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']

# axes.flat walks the grid row by row, matching the metric order above
for ax, metric, bar_color in zip(axes.flat, metrics, colors):
    bars = ax.bar(results_df['Model'], results_df[metric], color=bar_color, edgecolor='black')
    ax.set_title(f'{metric} Comparison', fontsize=12, fontweight='bold')
    ax.set_ylabel(metric, fontsize=10)
    ax.set_ylim(0, 1.0)
    ax.grid(alpha=0.3, axis='y')
    ax.tick_params(axis='x', rotation=45)

    # Write each score just above the top centre of its bar
    for bar in bars:
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width()/2.
        ax.text(x_center, height, f'{height:.3f}',
                ha='center', va='bottom', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.show()

### 5.5 Confusion Matrix Visualization

In [None]:
# Draw one confusion-matrix heatmap per baseline model on a flattened 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
axes = axes.ravel()

predictions = [
    ('Logistic Regression', y_pred_lr),
    ('K-Nearest Neighbors', y_pred_knn),
    ('Support Vector Machine', y_pred_svc),
    ('Random Forest', y_pred_rf)
]

# Tick labels shared by both axes of every heatmap
class_names = ['No Disease', 'Disease']

for ax, (model_name, y_pred) in zip(axes, predictions):
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=ax,
        xticklabels=class_names,
        yticklabels=class_names,
        cbar=False,
    )
    ax.set_title(f'{model_name}\nConfusion Matrix', fontweight='bold')
    ax.set(xlabel='Predicted', ylabel='Actual')

plt.tight_layout()
plt.show()

## Phase 6: Feature Importance & Hyperparameter Tuning

### 6.1 Feature Importance Analysis (Random Forest)

In [None]:
# Pair each feature name with the importance score from the fitted random
# forest and rank them, highest first
importance_table = pd.DataFrame(
    {'Feature': feature_names, 'Importance': model_rf.feature_importances_}
)
feature_importance = importance_table.sort_values('Importance', ascending=False)

print("Feature Importance Ranking:")
print(feature_importance.to_string(index=False))

# Horizontal bar chart of the ten highest-ranked features
top_features = feature_importance.head(10)
plt.figure(figsize=(10, 8))
plt.barh(top_features['Feature'], top_features['Importance'], color='forestgreen', edgecolor='black')
plt.xlabel('Importance', fontsize=12, fontweight='bold')
plt.ylabel('Feature', fontsize=12, fontweight='bold')
plt.title('Top 10 Most Important Features (Random Forest)', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()  # flip the axis so the highest-ranked bar is on top
plt.grid(alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

# Text summary of the five highest-ranked features
divider = "=" * 60
print("\n" + divider)
print("KEY INSIGHTS - Top 5 Predictive Features:")
print(divider)
top_five = feature_importance.head(5)
for feature_label, score in zip(top_five['Feature'], top_five['Importance']):
    print(f"{feature_label}: {score:.4f}")

### 6.2 Hyperparameter Tuning (Random Forest)

In [None]:
# Candidate values for each random-forest hyperparameter
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Size of the search space = product of the candidate-list lengths
n_combinations = 1
for candidate_values in param_grid.values():
    n_combinations *= len(candidate_values)

print("Starting GridSearchCV for Random Forest...")
print(f"Testing {n_combinations} combinations")
print("This may take a few minutes...\n")

# Exhaustive 5-fold search scored on recall (the metric this notebook
# prioritises for medical diagnosis); n_jobs=-1 requests all available cores
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X_train_scaled, y_train)

print("\n‚úì GridSearchCV complete")
print(f"\nBest Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Recall: {grid_search.best_score_:.4f}")

### 6.3 Evaluate Tuned Model

In [None]:
# Get best model from grid search
best_rf_model = grid_search.best_estimator_

# Make predictions with tuned model
y_pred_rf_tuned = best_rf_model.predict(X_test_scaled)

# Evaluate tuned model with the same routine as the baseline models
tuned_results = evaluate_model('Random Forest (Tuned)', y_test, y_pred_rf_tuned)

# Compare with original Random Forest
print("\n" + "="*80)
print("COMPARISON: Original vs Tuned Random Forest")
print("="*80)

# Look the baseline up by model name rather than by list position, so the
# comparison stays correct if models are added to or reordered in `results`
# (a missing entry raises KeyError instead of silently picking another model).
results_by_model = {entry['Model']: entry for entry in results}
baseline_rf_results = results_by_model['Random Forest']

comparison_df = pd.DataFrame([
    baseline_rf_results,  # Original Random Forest
    tuned_results
])

print(comparison_df.to_string(index=False))

# Save tuned model
joblib.dump(best_rf_model, '../models/random_forest_tuned.pkl')
print("\n‚úì Tuned Random Forest model saved to '../models/random_forest_tuned.pkl'")

## Phase 7: Final Report & Summary

In [None]:
# Generate final summary report from the objects computed in earlier cells
# (df, target_counts, feature_names, results_df, feature_importance)
print("\n" + "#"*80)
print("#" + " "*78 + "#")
print("#" + " "*20 + "HEART DISEASE PREDICTION - FINAL REPORT" + " "*19 + "#")
print("#" + " "*78 + "#")
print("#"*80)

print("\n1. DATASET OVERVIEW")
print("   " + "-"*70)
print(f"   - Total Samples: {len(df)}")
print(f"   - Features: {len(column_names) - 1}")  # every column except target
print(f"   - Target Classes: 2 (No Disease / Disease)")
print(f"   - Class Distribution: {target_counts[0]} no disease, {target_counts[1]} disease")

print("\n2. DATA PREPROCESSING")
print("   " + "-"*70)
print("   - Missing values handled via mode imputation")
print("   - Categorical features encoded (one-hot encoding)")
print("   - Numerical features scaled (StandardScaler)")
print(f"   - Final feature count: {len(feature_names)}")

print("\n3. MODELS TRAINED")
print("   " + "-"*70)
print("   - Logistic Regression (Baseline)")
print("   - K-Nearest Neighbors (k=5)")
print("   - Support Vector Machine")
print("   - Random Forest (100 estimators)")
print("   - Random Forest (Hyperparameter Tuned)")

print("\n4. BEST PERFORMING MODEL")
print("   " + "-"*70)
# First row of the comparison table, i.e. the top-ranked baseline model
best = results_df.iloc[0]
print(f"   Model: {best['Model']}")
print(f"   - Accuracy:  {best['Accuracy']:.4f} ({best['Accuracy']*100:.2f}%)")
print(f"   - Precision: {best['Precision']:.4f} ({best['Precision']*100:.2f}%)")
print(f"   - Recall:    {best['Recall']:.4f} ({best['Recall']*100:.2f}%) ‚òÖ Most Critical")
print(f"   - F1-Score:  {best['F1-Score']:.4f}")
print(f"   - False Negatives: {int(best['False Negatives'])} (missed disease cases)")

print("\n5. TOP PREDICTIVE FEATURES")
print("   " + "-"*70)
# Number the rows by rank. The frame keeps its pre-sort index after
# sort_values(), so numbering from the index label would print each feature's
# original column position rather than 1..5.
top_five = feature_importance.head(5)
for rank, entry in enumerate(top_five.itertuples(index=False), start=1):
    print(f"   {rank}. {entry.Feature}: {entry.Importance:.4f}")

print("\n6. KEY FINDINGS")
print("   " + "-"*70)
# Name the model actually selected in section 4 rather than hard-coding one,
# so the two sections cannot contradict each other
print(f"   - The {best['Model']} model achieved the best overall performance")
print("   - High Recall (Sensitivity) minimizes missed disease cases")
print("   - Chest pain type and thalassemia are highly predictive features")
print("   - Maximum heart rate (thalach) is a strong indicator")
print("   - The model is suitable for clinical decision support")

print("\n7. SAVED ARTIFACTS")
print("   " + "-"*70)
print("   - ../models/logistic_regression.pkl")
print("   - ../models/knn.pkl")
print("   - ../models/svm.pkl")
print("   - ../models/random_forest.pkl")
print("   - ../models/random_forest_tuned.pkl")
print("   - ../models/scaler.pkl")
print("   - ../data/df_raw.csv")

print("\n" + "#"*80)
print("\n‚úì Analysis Complete! All models trained, evaluated, and saved.")
print("‚úì Review the visualizations and metrics above for detailed insights.")

## Next Steps & Recommendations

### For Further Improvement:
1. **Cross-Validation**: Implement k-fold cross-validation for more robust performance estimates
2. **Ensemble Methods**: Try stacking or voting classifiers combining multiple models
3. **Feature Engineering**: Create interaction features or polynomial features
4. **Class Imbalance**: Experiment with SMOTE or class weights if needed
5. **Deep Learning**: Try neural networks for potentially better performance

### For Deployment:
1. Create a simple prediction function using the saved models
2. Build a web interface (Flask/Streamlit) for interactive predictions
3. Implement proper error handling and input validation
4. Add confidence intervals and prediction probabilities
5. Retrain the model regularly as new data becomes available

### Medical Considerations:
- This model is for educational purposes only
- Should not replace professional medical diagnosis
- Always prioritize high Recall to minimize false negatives
- Consider interpretability for clinical acceptance