# Heart Disease Prediction Model Training
## Team: Vi Thi Tuong Nguyen, Lam Nguyen, James Pham, Le Duy Vu

This notebook implements:
1. **Hierarchical Classification Approach**
   - Stage 1: Binary Classification (Disease vs No Disease)
   - Stage 2: Multi-class Classification (Severity Levels 1-4)
2. **Multiple ML Algorithms** (Random Forest, XGBoost, SVM, Logistic Regression, Gradient Boosting)
3. **Hyperparameter Tuning** with RandomizedSearchCV (5-fold stratified cross-validation)
4. **Class Imbalance Handling** (SMOTE for the binary task, BorderlineSMOTE for the multi-class task, class weights)
5. **Ensemble Methods** (Voting, Stacking)
6. **Comprehensive Evaluation** (F1-score focused)

In [None]:
# Import libraries
import os
import pickle
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, confusion_matrix, f1_score,
                             precision_score, recall_score, accuracy_score,
                             roc_auc_score, roc_curve, precision_recall_curve)
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# NOTE(review): this silences every warning for the rest of the notebook,
# including any convergence or deprecation warnings raised by the libraries.
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
# Default size for figures that do not pass their own figsize
plt.rcParams['figure.figsize'] = (12, 6)

# Set random seed for reproducibility
# RANDOM_STATE is also passed explicitly to every sampler, estimator and search below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## 1. Load Preprocessed Data

In [None]:
# Feature matrices for both tasks stay as DataFrames so column names are kept
X_train_bin, X_test_bin, X_train_multi, X_test_multi = (
    pd.read_csv(path)
    for path in (
        '../data/processed/X_train_binary.csv',
        '../data/processed/X_test_binary.csv',
        '../data/processed/X_train_multiclass.csv',
        '../data/processed/X_test_multiclass.csv',
    )
)

# Targets are flattened to 1-D arrays
y_train_bin, y_test_bin, y_train_multi, y_test_multi = (
    pd.read_csv(path).values.ravel()
    for path in (
        '../data/processed/y_train_binary.csv',
        '../data/processed/y_test_binary.csv',
        '../data/processed/y_train_multiclass.csv',
        '../data/processed/y_test_multiclass.csv',
    )
)


def _print_split_summary(heading, X_train, y_train, X_test, y_test):
    """Print the shapes and per-class counts of one train/test split."""
    print(heading)
    print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")
    print(f"Class distribution - Train: {Counter(y_train)}")
    print(f"Class distribution - Test: {Counter(y_test)}")


_print_split_summary("Binary Classification Data:",
                     X_train_bin, y_train_bin, X_test_bin, y_test_bin)
_print_split_summary("\nMulti-class Classification Data:",
                     X_train_multi, y_train_multi, X_test_multi, y_test_multi)

## 2. Handle Class Imbalance with SMOTE and BorderlineSMOTE

**SMOTE (Binary Classification)**:
- Standard SMOTE works well for binary classification
- Creates synthetic samples in feature space
- k=5 neighbors for stable synthetic sample generation

**BorderlineSMOTE (Multi-class Classification)**:
- Focuses on borderline/difficult cases between classes
- Better for multi-class problems with severe imbalance
- `kind='borderline-1'`: Only creates synthetic samples from minority class borderline cases
- More conservative than standard SMOTE
- Helps prevent over-generalization in overlapping class regions
- Particularly effective for rare severity levels (classes 3-4)

In [None]:
# Apply SMOTE to binary classification
print("Applying SMOTE to Binary Classification...")
smote_bin = SMOTE(random_state=RANDOM_STATE, k_neighbors=5)
# Only the training split is resampled; X_test_bin / y_test_bin are left untouched.
X_train_bin_smote, y_train_bin_smote = smote_bin.fit_resample(X_train_bin, y_train_bin)

print(f"Before SMOTE: {Counter(y_train_bin)}")
print(f"After SMOTE: {Counter(y_train_bin_smote)}")
print(f"Shape before: {X_train_bin.shape}, After: {X_train_bin_smote.shape}")

# Apply BorderlineSMOTE to the multi-class training split.
# NOTE(review): nothing in this cell filters to disease cases (classes 1-4);
# every label present in y_train_multi is resampled. Confirm what the
# preprocessed multi-class split actually contains.
print("\nApplying BorderlineSMOTE to Multi-class Classification...")
# BorderlineSMOTE focuses on borderline cases, better for multi-class with imbalance
smote_multi = BorderlineSMOTE(random_state=RANDOM_STATE, k_neighbors=3, kind='borderline-1')
X_train_multi_smote, y_train_multi_smote = smote_multi.fit_resample(X_train_multi, y_train_multi)

print(f"Before BorderlineSMOTE: {Counter(y_train_multi)}")
print(f"After BorderlineSMOTE: {Counter(y_train_multi_smote)}")
print(f"Shape before: {X_train_multi.shape}, After: {X_train_multi_smote.shape}")

## 3. Utility Functions for Evaluation

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name, is_binary=True):
    """
    Score an already-fitted model on the data it was trained on and on the test data.

    Parameters:
        model: fitted estimator exposing ``predict`` (and optionally ``predict_proba``).
        X_train, y_train: features/targets used to compute the train_* metrics.
        X_test, y_test: features/targets used to compute every other metric.
        model_name: label stored under the 'model' key of the result.
        is_binary: when True, macro F1, positive-class F1 and (if the model has
            ``predict_proba``) ROC-AUC are added.

    Returns:
        (results, y_pred): the metrics dict and the test-set predictions.
    """
    def _weighted(metric, truth, predicted):
        # All headline metrics use support-weighted averaging.
        return metric(truth, predicted, average='weighted')

    y_pred = model.predict(X_test)
    train_pred = model.predict(X_train)

    results = {
        'model': model_name,
        'train_accuracy': accuracy_score(y_train, train_pred),
        'test_accuracy': accuracy_score(y_test, y_pred),
        'train_f1': _weighted(f1_score, y_train, train_pred),
        'test_f1': _weighted(f1_score, y_test, y_pred),
        'precision': _weighted(precision_score, y_test, y_pred),
        'recall': _weighted(recall_score, y_test, y_pred)
    }

    # Multi-class callers only get the weighted metrics.
    if not is_binary:
        return results, y_pred

    results['test_f1_macro'] = f1_score(y_test, y_pred, average='macro')
    results['test_f1_binary'] = f1_score(y_test, y_pred, average='binary')

    # ROC-AUC needs a score for the positive class (second probability column).
    if hasattr(model, 'predict_proba'):
        positive_scores = model.predict_proba(X_test)[:, 1]
        results['roc_auc'] = roc_auc_score(y_test, positive_scores)

    return results, y_pred

def plot_confusion_matrix(y_true, y_pred, title, labels=None):
    """
    Draw the confusion matrix of ``y_pred`` against ``y_true`` as an annotated heatmap.

    Parameters:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        title: figure title.
        labels: optional display names for the tick labels (list, NumPy array,
            pandas Index, ...). ``None`` or an empty sequence falls back to
            seaborn's automatic tick labels.

    NOTE(review): ``labels`` are display names only; they are not passed to
    ``confusion_matrix``. If a class is missing from both ``y_true`` and
    ``y_pred`` the matrix has fewer rows than ``labels`` has entries.
    """
    cm = confusion_matrix(y_true, y_pred)

    # Test for None/emptiness explicitly: evaluating the truthiness of a NumPy
    # array or pandas Index raises "truth value ... is ambiguous".
    if labels is None or len(labels) == 0:
        tick_labels = 'auto'
    else:
        tick_labels = list(labels)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels,
                yticklabels=tick_labels)
    plt.title(title, fontweight='bold')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()

def plot_feature_importance(model, feature_names, top_n=20, title="Feature Importance"):
    """
    Plot the ``top_n`` largest ``feature_importances_`` of a model as horizontal bars.

    Parameters:
        model: estimator; only used if it exposes ``feature_importances_``.
        feature_names: indexable sequence of names aligned with the importances.
        top_n: number of features to show.
        title: figure title.

    Returns:
        DataFrame with columns 'feature' and 'importance' sorted from most to
        least important, or None when the model has no ``feature_importances_``.
    """
    if not hasattr(model, 'feature_importances_'):
        return None

    scores = model.feature_importances_
    # argsort is ascending, so the tail holds the largest importances.
    top_idx = np.argsort(scores)[-top_n:]
    top_names = [feature_names[i] for i in top_idx]
    top_scores = scores[top_idx]
    bar_positions = range(len(top_idx))

    plt.figure(figsize=(10, 8))
    plt.barh(bar_positions, top_scores, color='steelblue')
    plt.yticks(bar_positions, top_names)
    plt.xlabel('Importance')
    plt.title(title, fontweight='bold')
    plt.tight_layout()
    plt.show()

    ranking = pd.DataFrame({
        'feature': top_names,
        'importance': top_scores
    })
    return ranking.sort_values('importance', ascending=False)

## 4. Stage 1: Binary Classification (Disease vs No Disease)

### 4.1 Baseline Models

In [None]:
# Initialize models
# Baseline candidates for Stage 1; only the settings shown here are specified.
binary_models = {
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=100),
    'XGBoost': XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'),
    # probability=True so the SVM exposes predict_proba, which evaluate_model uses for ROC-AUC
    'SVM': SVC(random_state=RANDOM_STATE, probability=True),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE)
}

# Train and evaluate baseline models
binary_results = []

print("Training Binary Classification Models (with SMOTE)...\n")
print("="*80)

for name, model in binary_models.items():
    print(f"\nTraining {name}...")
    
    # Train on SMOTE data
    model.fit(X_train_bin_smote, y_train_bin_smote)
    
    # Evaluate
    # Train metrics are computed on the resampled training set; test metrics on the original test split.
    results, y_pred = evaluate_model(
        model, X_train_bin_smote, y_train_bin_smote, 
        X_test_bin, y_test_bin, name, is_binary=True
    )
    binary_results.append(results)
    
    print(f"  Test Accuracy: {results['test_accuracy']:.4f}")
    print(f"  Test F1 (weighted): {results['test_f1']:.4f}")
    print(f"  Test F1 (binary): {results['test_f1_binary']:.4f}")
    # 'roc_auc' is only present when the model has predict_proba; printed unformatted
    print(f"  ROC-AUC: {results.get('roc_auc', 'N/A')}")
    print(f"  Precision: {results['precision']:.4f}")
    print(f"  Recall: {results['recall']:.4f}")

print("\n" + "="*80)

In [None]:
# Rank the baseline models by weighted test F1 (best first)
binary_results_df = pd.DataFrame(binary_results).sort_values('test_f1', ascending=False)

print("\nBinary Classification - Baseline Results:")
print(binary_results_df.to_string(index=False))

# Two side-by-side bar charts: accuracy/F1 on the left, precision/recall on the right
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
ax_scores, ax_pr = axes

binary_results_df.plot(x='model', y=['test_accuracy', 'test_f1'], kind='bar', ax=ax_scores)
ax_scores.set_title('Binary Classification: Accuracy vs F1-Score', fontweight='bold')
ax_scores.legend(['Accuracy', 'F1-Score (weighted)'])

binary_results_df.plot(x='model', y=['precision', 'recall'], kind='bar', ax=ax_pr)
ax_pr.set_title('Binary Classification: Precision vs Recall', fontweight='bold')

# Shared axis cosmetics for both panels
for panel in (ax_scores, ax_pr):
    panel.set_xlabel('Model')
    panel.set_ylabel('Score')
    panel.set_xticklabels(panel.get_xticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

### 4.2 Hyperparameter Tuning for Best Binary Model

In [None]:
# Select top 2 models for tuning
# NOTE(review): this list is only printed. The tuning cells in this section
# always tune Random Forest and XGBoost, whatever the ranking says.
top_binary_models = binary_results_df.head(2)['model'].tolist()
print(f"Top models for tuning: {top_binary_models}")

In [None]:
# Hyperparameter tuning for Random Forest
print("Tuning Random Forest...")

rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'class_weight': ['balanced', None]
}

# SMOTE runs inside the pipeline so that, in every CV split, only the training
# fold is oversampled. Searching on the pre-resampled X_train_bin_smote puts
# synthetic points interpolated from validation rows into the training folds,
# which inflates the cross-validation score.
rf_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=RANDOM_STATE, k_neighbors=5)),
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE))
])

rf_random_search = RandomizedSearchCV(
    rf_pipeline,
    # Route every hyperparameter to the classifier step of the pipeline
    {f'clf__{param}': values for param, values in rf_param_grid.items()},
    n_iter=50,
    cv=StratifiedKFold(n_splits=5),
    scoring='f1_weighted',
    n_jobs=-1,
    random_state=RANDOM_STATE,
    verbose=1
)

# Fit on the ORIGINAL training split; the pipeline does the oversampling.
rf_random_search.fit(X_train_bin, y_train_bin)

# Strip the 'clf__' routing prefix so the report shows plain estimator parameters
best_rf_params = {
    param.split('__', 1)[1]: value
    for param, value in rf_random_search.best_params_.items()
}
print(f"\nBest parameters: {best_rf_params}")
print(f"Best cross-validation F1 score: {rf_random_search.best_score_:.4f}")

# The refit pipeline oversampled the full training split with the same SMOTE
# settings as smote_bin, so the extracted classifier is a plain fitted
# RandomForestClassifier, as the cells that use best_rf_bin expect.
best_rf_bin = rf_random_search.best_estimator_.named_steps['clf']

In [None]:
# Hyperparameter tuning for XGBoost
print("Tuning XGBoost...")

xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'scale_pos_weight': [1, 2, 3]
}

# SMOTE runs inside the pipeline so that, in every CV split, only the training
# fold is oversampled. Searching on the pre-resampled X_train_bin_smote puts
# synthetic points interpolated from validation rows into the training folds,
# which inflates the cross-validation score.
xgb_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=RANDOM_STATE, k_neighbors=5)),
    ('clf', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
])

xgb_random_search = RandomizedSearchCV(
    xgb_pipeline,
    # Route every hyperparameter to the classifier step of the pipeline
    {f'clf__{param}': values for param, values in xgb_param_grid.items()},
    n_iter=50,
    cv=StratifiedKFold(n_splits=5),
    scoring='f1_weighted',
    n_jobs=-1,
    random_state=RANDOM_STATE,
    verbose=1
)

# Fit on the ORIGINAL training split; the pipeline does the oversampling.
xgb_random_search.fit(X_train_bin, y_train_bin)

# Strip the 'clf__' routing prefix so the report shows plain estimator parameters
best_xgb_params = {
    param.split('__', 1)[1]: value
    for param, value in xgb_random_search.best_params_.items()
}
print(f"\nBest parameters: {best_xgb_params}")
print(f"Best cross-validation F1 score: {xgb_random_search.best_score_:.4f}")

# Extract the fitted classifier so the cells that use best_xgb_bin still
# receive a plain XGBClassifier.
best_xgb_bin = xgb_random_search.best_estimator_.named_steps['clf']

In [None]:
# Evaluate tuned models
print("\nEvaluating Tuned Models...\n")
print("="*80)

# Collected here so the final binary comparison can merge them with the ensemble results
tuned_results = []

for name, model in [('Random Forest (Tuned)', best_rf_bin), ('XGBoost (Tuned)', best_xgb_bin)]:
    print(f"\n{name}:")
    results, y_pred = evaluate_model(
        model, X_train_bin_smote, y_train_bin_smote,
        X_test_bin, y_test_bin, name, is_binary=True
    )
    tuned_results.append(results)
    
    print(f"  Test Accuracy: {results['test_accuracy']:.4f}")
    print(f"  Test F1 (weighted): {results['test_f1']:.4f}")
    print(f"  Test F1 (binary): {results['test_f1_binary']:.4f}")
    print(f"  ROC-AUC: {results.get('roc_auc', 'N/A')}")
    print(f"  Precision: {results['precision']:.4f}")
    print(f"  Recall: {results['recall']:.4f}")
    
    # Confusion matrix
    plot_confusion_matrix(y_test_bin, y_pred, f'{name} - Confusion Matrix', 
                         labels=['No Disease', 'Disease'])
    
    # Feature importance
    # Returns None for models without feature_importances_, hence the guard below
    feature_imp = plot_feature_importance(model, X_train_bin.columns, 
                                         title=f'{name} - Top 20 Features')
    if feature_imp is not None:
        print(f"\nTop 10 Important Features:")
        print(feature_imp.head(10))

print("\n" + "="*80)

### 4.3 Ensemble Methods for Binary Classification

In [None]:
# Voting Classifier
print("Training Voting Ensemble...")

# Soft voting over the two tuned models plus an untuned Gradient Boosting model.
# NOTE(review): best_rf_bin / best_xgb_bin are already fitted; presumably the
# ensemble refits its own copies in fit() below - confirm against the sklearn docs.
voting_clf_bin = VotingClassifier(
    estimators=[
        ('rf', best_rf_bin),
        ('xgb', best_xgb_bin),
        ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE))
    ],
    voting='soft'
)

voting_clf_bin.fit(X_train_bin_smote, y_train_bin_smote)

results_voting, y_pred_voting = evaluate_model(
    voting_clf_bin, X_train_bin_smote, y_train_bin_smote,
    X_test_bin, y_test_bin, 'Voting Ensemble', is_binary=True
)

print(f"\nVoting Ensemble Results:")
print(f"  Test Accuracy: {results_voting['test_accuracy']:.4f}")
print(f"  Test F1 (weighted): {results_voting['test_f1']:.4f}")
print(f"  Test F1 (binary): {results_voting['test_f1_binary']:.4f}")
print(f"  ROC-AUC: {results_voting.get('roc_auc', 'N/A')}")

plot_confusion_matrix(y_test_bin, y_pred_voting, 
                     'Voting Ensemble - Confusion Matrix',
                     labels=['No Disease', 'Disease'])

In [None]:
# Stacking Classifier
print("Training Stacking Ensemble...")

# Same three base estimators as the voting ensemble; a logistic regression
# combines their outputs, with 5-fold CV configured via cv=5.
stacking_clf_bin = StackingClassifier(
    estimators=[
        ('rf', best_rf_bin),
        ('xgb', best_xgb_bin),
        ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE))
    ],
    final_estimator=LogisticRegression(random_state=RANDOM_STATE),
    cv=5
)

# NOTE(review): the internal CV runs on the already-oversampled training set.
stacking_clf_bin.fit(X_train_bin_smote, y_train_bin_smote)

results_stacking, y_pred_stacking = evaluate_model(
    stacking_clf_bin, X_train_bin_smote, y_train_bin_smote,
    X_test_bin, y_test_bin, 'Stacking Ensemble', is_binary=True
)

print(f"\nStacking Ensemble Results:")
print(f"  Test Accuracy: {results_stacking['test_accuracy']:.4f}")
print(f"  Test F1 (weighted): {results_stacking['test_f1']:.4f}")
print(f"  Test F1 (binary): {results_stacking['test_f1_binary']:.4f}")
print(f"  ROC-AUC: {results_stacking.get('roc_auc', 'N/A')}")

plot_confusion_matrix(y_test_bin, y_pred_stacking, 
                     'Stacking Ensemble - Confusion Matrix',
                     labels=['No Disease', 'Disease'])

### 4.4 Select Best Binary Model

In [None]:
# Rank the tuned models and both ensembles by weighted test F1 (best first)
all_binary_results = [*tuned_results, results_voting, results_stacking]
all_binary_df = pd.DataFrame(all_binary_results).sort_values('test_f1', ascending=False)

print("\nFinal Binary Classification Results:")
print(all_binary_df.to_string(index=False))

top_binary_row = all_binary_df.iloc[0]
best_binary_model_name = top_binary_row['model']
best_binary_f1 = top_binary_row['test_f1']

print(f"\n{'='*80}")
print(f"BEST BINARY MODEL: {best_binary_model_name}")
print(f"Test F1-Score: {best_binary_f1:.4f}")
print(f"{'='*80}")

# Map the winning row back to its fitted estimator: the first keyword found in
# the model name wins, and the tuned Random Forest is the fallback.
_binary_candidates = (
    ('Voting', voting_clf_bin),
    ('Stacking', stacking_clf_bin),
    ('XGBoost', best_xgb_bin),
)
best_binary_model = next(
    (candidate for keyword, candidate in _binary_candidates
     if keyword in best_binary_model_name),
    best_rf_bin,
)

## 5. Stage 2: Multi-class Classification (Severity Levels)

### 5.1 Baseline Multi-class Models

In [None]:
# Initialize multi-class models
multiclass_models = {
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=200, class_weight='balanced'),
    'XGBoost': XGBClassifier(random_state=RANDOM_STATE, eval_metric='mlogloss'),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'SVM': SVC(random_state=RANDOM_STATE, kernel='rbf', probability=True)
}

# Train and evaluate baseline multi-class models
multiclass_results = []

print("Training Multi-class Classification Models (with BorderlineSMOTE)...\n")
print("="*80)

for name, model in multiclass_models.items():
    print(f"\nTraining {name}...")
    
    # Train on SMOTE data
    model.fit(X_train_multi_smote, y_train_multi_smote)
    
    # Evaluate
    # is_binary=False: only the weighted metrics are computed
    results, y_pred = evaluate_model(
        model, X_train_multi_smote, y_train_multi_smote,
        X_test_multi, y_test_multi, name, is_binary=False
    )
    multiclass_results.append(results)
    
    print(f"  Test Accuracy: {results['test_accuracy']:.4f}")
    print(f"  Test F1 (weighted): {results['test_f1']:.4f}")
    print(f"  Precision: {results['precision']:.4f}")
    print(f"  Recall: {results['recall']:.4f}")
    
    # Classification report
    # NOTE(review): five target names are supplied, so this assumes five distinct
    # labels (presumably 0-4) occur across y_test_multi and y_pred - confirm.
    print(f"\n  Classification Report:")
    print(classification_report(y_test_multi, y_pred, 
                                target_names=[f'Class {i}' for i in range(5)]))
    
    # Confusion matrix
    plot_confusion_matrix(y_test_multi, y_pred, 
                         f'{name} - Multi-class Confusion Matrix',
                         labels=[f'Level {i}' for i in range(5)])

print("\n" + "="*80)

In [None]:
# Rank the multi-class baselines by weighted test F1 (best first)
multiclass_results_df = pd.DataFrame(multiclass_results).sort_values('test_f1', ascending=False)

print("\nMulti-class Classification - Baseline Results:")
print(multiclass_results_df.to_string(index=False))

# One panel per metric pair, each with its own title
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
panel_specs = (
    (axes[0], ['test_accuracy', 'test_f1'], 'Multi-class: Accuracy vs F1-Score'),
    (axes[1], ['precision', 'recall'], 'Multi-class: Precision vs Recall'),
)

for panel, metric_columns, panel_title in panel_specs:
    multiclass_results_df.plot(x='model', y=metric_columns, kind='bar', ax=panel)
    panel.set_title(panel_title, fontweight='bold')
    panel.set_xlabel('Model')
    panel.set_ylabel('Score')
    panel.set_xticklabels(panel.get_xticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

### 5.2 Hyperparameter Tuning for Best Multi-class Model

In [None]:
# The results frame is sorted best-first, so the first row names the model to tune
top_multiclass_model = multiclass_results_df['model'].iloc[0]
print(f"Top multi-class model for tuning: {top_multiclass_model}")

In [None]:
# Tune the best performing model (likely Random Forest or XGBoost)
# Every branch leaves the tuned estimator in `best_multi_model` and its mean
# cross-validation score in `best_cv_score`.
# NOTE(review): every search here is fitted on the BorderlineSMOTE-resampled
# training set, so validation folds also contain synthetic samples.
if 'Random Forest' in top_multiclass_model:
    print("Tuning Random Forest for Multi-class...")
    
    rf_param_grid_multi = {
        'n_estimators': [200, 300, 400],
        'max_depth': [15, 20, 25, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2'],
        'class_weight': ['balanced', 'balanced_subsample']
    }
    
    rf_search_multi = RandomizedSearchCV(
        RandomForestClassifier(random_state=RANDOM_STATE),
        rf_param_grid_multi,
        n_iter=40,
        cv=StratifiedKFold(n_splits=5),
        scoring='f1_weighted',
        n_jobs=-1,
        random_state=RANDOM_STATE,
        verbose=1
    )
    
    rf_search_multi.fit(X_train_multi_smote, y_train_multi_smote)
    best_multi_model = rf_search_multi.best_estimator_
    best_cv_score = rf_search_multi.best_score_
    
elif 'XGBoost' in top_multiclass_model:
    print("Tuning XGBoost for Multi-class...")
    
    xgb_param_grid_multi = {
        'n_estimators': [200, 300, 400],
        'max_depth': [5, 7, 9, 11],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.7, 0.8, 0.9],
        'gamma': [0, 0.1, 0.2],
        'min_child_weight': [1, 3, 5]
    }
    
    xgb_search_multi = RandomizedSearchCV(
        XGBClassifier(random_state=RANDOM_STATE, eval_metric='mlogloss'),
        xgb_param_grid_multi,
        n_iter=40,
        cv=StratifiedKFold(n_splits=5),
        scoring='f1_weighted',
        n_jobs=-1,
        random_state=RANDOM_STATE,
        verbose=1
    )
    
    xgb_search_multi.fit(X_train_multi_smote, y_train_multi_smote)
    best_multi_model = xgb_search_multi.best_estimator_
    best_cv_score = xgb_search_multi.best_score_
else:
    # If neither Random Forest nor XGBoost, tune whichever model is top
    print(f"Tuning {top_multiclass_model} for Multi-class...")
    
    if 'Gradient' in top_multiclass_model:
        param_grid = {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [3, 5, 7],
            'min_samples_split': [2, 5, 10],
            'subsample': [0.8, 0.9, 1.0]
        }
        base_model = GradientBoostingClassifier(random_state=RANDOM_STATE)
    elif 'SVM' in top_multiclass_model:
        param_grid = {
            'C': [0.1, 1, 10, 100],
            'gamma': ['scale', 'auto', 0.001, 0.01],
            'kernel': ['rbf', 'poly']
        }
        base_model = SVC(random_state=RANDOM_STATE, probability=True)
    else:
        # Fallback to Random Forest
        # Not reachable with the four names in multiclass_models; this grid has
        # only 12 combinations, fewer than the n_iter=30 requested below.
        param_grid = {
            'n_estimators': [200, 300],
            'max_depth': [15, 20, None],
            'min_samples_split': [2, 5],
            'class_weight': ['balanced']
        }
        base_model = RandomForestClassifier(random_state=RANDOM_STATE)
    
    search = RandomizedSearchCV(
        base_model,
        param_grid,
        n_iter=30,
        cv=StratifiedKFold(n_splits=5),
        scoring='f1_weighted',
        n_jobs=-1,
        random_state=RANDOM_STATE,
        verbose=1
    )
    
    search.fit(X_train_multi_smote, y_train_multi_smote)
    best_multi_model = search.best_estimator_
    best_cv_score = search.best_score_

print(f"\nBest cross-validation F1 score: {best_cv_score:.4f}")

In [None]:
# Evaluate tuned multi-class model
print("\nEvaluating Tuned Multi-class Model...\n")
print("="*80)

# results_tuned_multi is reused by the final comparison, metadata and summary cells
results_tuned_multi, y_pred_tuned = evaluate_model(
    best_multi_model, X_train_multi_smote, y_train_multi_smote,
    X_test_multi, y_test_multi, f'{top_multiclass_model} (Tuned)', is_binary=False
)

print(f"\n{top_multiclass_model} (Tuned) Results:")
print(f"  Test Accuracy: {results_tuned_multi['test_accuracy']:.4f}")
print(f"  Test F1 (weighted): {results_tuned_multi['test_f1']:.4f}")
print(f"  Precision: {results_tuned_multi['precision']:.4f}")
print(f"  Recall: {results_tuned_multi['recall']:.4f}")

# NOTE(review): five target names / tick labels assume five distinct labels in the test data - confirm.
print(f"\n  Classification Report:")
print(classification_report(y_test_multi, y_pred_tuned,
                           target_names=[f'Class {i}' for i in range(5)]))

plot_confusion_matrix(y_test_multi, y_pred_tuned,
                     f'{top_multiclass_model} (Tuned) - Confusion Matrix',
                     labels=[f'Level {i}' for i in range(5)])

# Feature importance
# None is returned (and nothing plotted) when the tuned model has no feature_importances_
feature_imp_multi = plot_feature_importance(best_multi_model, X_train_multi.columns,
                                           title=f'{top_multiclass_model} (Tuned) - Top 20 Features')
if feature_imp_multi is not None:
    print(f"\nTop 10 Important Features for Multi-class:")
    print(feature_imp_multi.head(10))

print("\n" + "="*80)

## 6. Hierarchical Classification Pipeline

In [None]:
# Create hierarchical classifier
class HierarchicalClassifier:
    """
    Two-stage hierarchical classifier.

    Stage 1: binary model (0 = no disease, 1 = disease).
    Stage 2: multi-class model that assigns a severity level to the rows that
             stage 1 flagged as disease.

    The final label space is 0-4: 0 means "no disease", 1-4 are severity levels.

    NOTE(review): the original design note says the stage-2 model is trained on
    disease cases only (classes 1-4). Other cells in this notebook report five
    classes for the multi-class data, so the model passed in may also have
    learned class 0. ``predict_proba`` handles both layouts; confirm which one
    the preprocessed data actually uses.
    """

    N_SEVERITY_LEVELS = 4  # severity classes 1-4; class 0 is "no disease"

    def __init__(self, binary_model, multiclass_model):
        self.binary_model = binary_model
        self.multiclass_model = multiclass_model

    def predict(self, X):
        """
        Predict final labels for X.

        Rows predicted 0 by the binary model stay 0. Rows predicted 1 receive
        whatever label the multi-class model predicts for them.

        Returns:
            1-D integer array with one label per row of X.
        """
        # np.asarray so the comparison below is element-wise even if the
        # binary model returns a list or a pandas object.
        binary_pred = np.asarray(self.binary_model.predict(X))

        # Initialize final predictions as 0 (no disease)
        final_pred = np.zeros(len(X), dtype=int)

        disease_mask = binary_pred == 1
        if disease_mask.any():
            # Positional boolean row selection for pandas objects and arrays alike
            if hasattr(X, 'loc'):
                X_disease = X.loc[disease_mask]
            else:
                X_disease = np.asarray(X)[disease_mask]
            final_pred[disease_mask] = self.multiclass_model.predict(X_disease)

        return final_pred

    def predict_proba(self, X):
        """
        Combine both stages into probabilities over the five final classes.

        Column 0 is the binary model's "no disease" probability. Columns 1-4
        are P(disease) multiplied by the stage-2 severity distribution, so each
        row sums to 1 whenever both models return proper distributions.

        Raises:
            ValueError: if the multi-class model returns neither 4 nor 5
                probability columns.
        """
        binary_proba = np.asarray(self.binary_model.predict_proba(X))
        multi_proba = np.asarray(self.multiclass_model.predict_proba(X), dtype=float)

        n_severity = self.N_SEVERITY_LEVELS
        n_columns = multi_proba.shape[1]

        if n_columns == n_severity:
            # Stage 2 models severity only: columns are taken as levels 1..4 in order.
            severity_proba = multi_proba
        elif n_columns == n_severity + 1:
            # Stage 2 also learned "no disease". Assumes its columns are ordered
            # 0..4 - TODO confirm against multiclass_model.classes_.
            # Drop the class-0 column and renormalise to get P(level | disease).
            # Using columns 0..3 as levels 1..4 would shift every level by one,
            # discard level 4 and break the row sums.
            severity_proba = multi_proba[:, 1:]
            totals = severity_proba.sum(axis=1, keepdims=True)
            severity_proba = np.divide(
                severity_proba, totals,
                # Rows with no severity mass at all fall back to a uniform split
                out=np.full_like(severity_proba, 1.0 / n_severity),
                where=totals > 0,
            )
        else:
            raise ValueError(
                f"multiclass_model.predict_proba returned {n_columns} columns; "
                f"expected {n_severity} (severity 1-4) or {n_severity + 1} (classes 0-4)"
            )

        # Initialize final probabilities (5 classes: 0-4)
        final_proba = np.zeros((len(X), n_severity + 1))
        final_proba[:, 0] = binary_proba[:, 0]
        # Distribute the disease probability across the severity levels
        final_proba[:, 1:] = binary_proba[:, [1]] * severity_proba

        return final_proba

# Initialize hierarchical classifier
# Wraps the selected Stage 1 model and the tuned Stage 2 model; no fitting happens here.
hierarchical_clf = HierarchicalClassifier(best_binary_model, best_multi_model)

print("Hierarchical Classifier Created!")
print(f"Stage 1 (Binary): {best_binary_model_name}")
print(f"Stage 2 (Multi-class on disease cases 1-4): {top_multiclass_model} (Tuned)")

In [None]:
# Evaluate hierarchical classifier
print("\nEvaluating Hierarchical Classifier...\n")
print("="*80)

# Both stages run on the multi-class test features.
# NOTE(review): this presumes X_test_multi has the same feature columns the
# binary model was trained on - confirm against the preprocessing notebook.
y_pred_hierarchical = hierarchical_clf.predict(X_test_multi)

# Calculate metrics
# Same support-weighted averaging as evaluate_model, so the numbers are
# comparable with the direct multi-class results.
hierarchical_accuracy = accuracy_score(y_test_multi, y_pred_hierarchical)
hierarchical_f1 = f1_score(y_test_multi, y_pred_hierarchical, average='weighted')
hierarchical_precision = precision_score(y_test_multi, y_pred_hierarchical, average='weighted')
hierarchical_recall = recall_score(y_test_multi, y_pred_hierarchical, average='weighted')

print(f"Hierarchical Classifier Results:")
print(f"  Test Accuracy: {hierarchical_accuracy:.4f}")
print(f"  Test F1 (weighted): {hierarchical_f1:.4f}")
print(f"  Precision: {hierarchical_precision:.4f}")
print(f"  Recall: {hierarchical_recall:.4f}")

print(f"\n  Classification Report:")
print(classification_report(y_test_multi, y_pred_hierarchical,
                           target_names=[f'Class {i}' for i in range(5)]))

plot_confusion_matrix(y_test_multi, y_pred_hierarchical,
                     'Hierarchical Classifier - Confusion Matrix',
                     labels=[f'Level {i}' for i in range(5)])

print("\n" + "="*80)

## 7. Final Model Comparison and Selection

In [None]:
# Compare all approaches
def _comparison_row(approach, model_label, accuracy, f1, precision, recall):
    """Build one row of the final comparison table."""
    return {
        'Approach': approach,
        'Model': model_label,
        'Test Accuracy': accuracy,
        'Test F1 (weighted)': f1,
        'Precision': precision,
        'Recall': recall
    }


# The baseline frame is sorted best-first, so its first row is the best baseline
best_baseline = multiclass_results_df.iloc[0]

final_comparison = pd.DataFrame([
    _comparison_row(
        'Direct Multi-class (Best Baseline)',
        best_baseline['model'],
        best_baseline['test_accuracy'],
        best_baseline['test_f1'],
        best_baseline['precision'],
        best_baseline['recall'],
    ),
    _comparison_row(
        'Direct Multi-class (Tuned)',
        f'{top_multiclass_model} (Tuned)',
        results_tuned_multi['test_accuracy'],
        results_tuned_multi['test_f1'],
        results_tuned_multi['precision'],
        results_tuned_multi['recall'],
    ),
    _comparison_row(
        'Hierarchical',
        f'Binary: {best_binary_model_name} + Multi: {top_multiclass_model}',
        hierarchical_accuracy,
        hierarchical_f1,
        hierarchical_precision,
        hierarchical_recall,
    ),
])

print("\n" + "="*100)
print("FINAL MODEL COMPARISON")
print("="*100)
print(final_comparison.to_string(index=False))

# Grouped bar chart: four metrics per approach, centred on each x position
fig, ax = plt.subplots(figsize=(12, 6))
x = np.arange(len(final_comparison))
width = 0.2

bar_specs = (
    (-1.5, 'Test Accuracy', 'Accuracy'),
    (-0.5, 'Test F1 (weighted)', 'F1-Score'),
    (0.5, 'Precision', 'Precision'),
    (1.5, 'Recall', 'Recall'),
)
for offset, column, legend_label in bar_specs:
    ax.bar(x + width * offset, final_comparison[column], width, label=legend_label)

ax.set_xlabel('Approach')
ax.set_ylabel('Score')
ax.set_title('Final Model Comparison - All Metrics', fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(final_comparison['Approach'], rotation=15, ha='right')
ax.legend()
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Pick the approach with the highest weighted test F1
# (idxmax returns an index label, so it is paired with .loc)
best_overall_idx = final_comparison['Test F1 (weighted)'].idxmax()
best_overall = final_comparison.loc[best_overall_idx]

print(f"\n{'='*100}")
print(f"BEST OVERALL MODEL: {best_overall['Approach']}")
print(f"Model: {best_overall['Model']}")
print(f"Test F1-Score (weighted): {best_overall['Test F1 (weighted)']:.4f}")
print(f"Test Accuracy: {best_overall['Test Accuracy']:.4f}")
print(f"{'='*100}")

## 8. Save Models

In [None]:
# Save all important models
# Create the output directory first: open(..., 'wb') does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs('../models', exist_ok=True)

models_to_save = {
    'best_binary_model': best_binary_model,
    'best_multiclass_model': best_multi_model,
    # NOTE: pickle stores this object by reference to the HierarchicalClassifier
    # class, which is defined in this notebook. Code that loads the pickle must
    # have the same class importable under the same module path.
    'hierarchical_classifier': hierarchical_clf,
    'smote_binary': smote_bin,
    'smote_multiclass': smote_multi
}

for model_name, model in models_to_save.items():
    with open(f'../models/{model_name}.pkl', 'wb') as f:
        pickle.dump(model, f)
    print(f"Saved: {model_name}.pkl")

# Save model metadata
# All scores recorded here are test-set scores computed earlier in the notebook.
metadata = {
    'best_binary_model_name': best_binary_model_name,
    'best_binary_f1': best_binary_f1,
    'best_multiclass_model_name': top_multiclass_model,
    'best_multiclass_f1': results_tuned_multi['test_f1'],
    'hierarchical_f1': hierarchical_f1,
    'best_overall_approach': best_overall['Approach'],
    'feature_names': X_train_multi.columns.tolist(),
    'random_state': RANDOM_STATE,
    'smote_method_binary': 'SMOTE',
    'smote_method_multiclass': 'BorderlineSMOTE (kind=borderline-1)'
}

with open('../models/model_metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)

print("\nModel metadata saved!")
print(f"\nAll models saved to: ../models/")

## 9. Final Summary

In [None]:
# Plain-text recap of the run; every value was computed by an earlier cell.
print("\n" + "="*100)
print("HEART DISEASE PREDICTION - TRAINING SUMMARY")
print("="*100)

print(f"\n1. BINARY CLASSIFICATION (Disease Detection)")
print(f"   Best Model: {best_binary_model_name}")
print(f"   Test F1-Score: {best_binary_f1:.4f}")
print(f"   Models Trained: {len(binary_models)} baseline + {len(tuned_results)} tuned + 2 ensemble")

print(f"\n2. MULTI-CLASS CLASSIFICATION (Severity Assessment)")
print(f"   Best Model: {top_multiclass_model} (Tuned)")
print(f"   Test F1-Score: {results_tuned_multi['test_f1']:.4f}")
print(f"   Models Trained: {len(multiclass_models)} baseline + 1 tuned")

print(f"\n3. HIERARCHICAL CLASSIFICATION")
print(f"   Stage 1: {best_binary_model_name}")
print(f"   Stage 2: {top_multiclass_model} (Tuned)")
print(f"   Test F1-Score: {hierarchical_f1:.4f}")

print(f"\n4. BEST OVERALL APPROACH")
print(f"   Approach: {best_overall['Approach']}")
print(f"   Test F1-Score: {best_overall['Test F1 (weighted)']:.4f}")
print(f"   Test Accuracy: {best_overall['Test Accuracy']:.4f}")

print(f"\n5. TECHNIQUES USED")
print(f"   - Class Imbalance (Binary): SMOTE oversampling")
print(f"   - Class Imbalance (Multi-class): BorderlineSMOTE (borderline-1)")
print(f"   - Hyperparameter Tuning: RandomizedSearchCV with 5-fold CV")
print(f"   - Ensemble Methods: Voting, Stacking")
print(f"   - Evaluation: Stratified K-Fold, weighted F1-score")

print(f"\n6. MODELS SAVED")
# List exactly what the save cell wrote. A hard-coded list here had drifted and
# omitted the two pickled resamplers.
for saved_name in models_to_save:
    print(f"   - {saved_name}.pkl")
print(f"   - model_metadata.pkl")

print("\n" + "="*100)
print("TRAINING COMPLETE! Ready for deployment.")
print("="*100)