In [1]:
"""
Member 2: Naive Bayes and Deep Learning Models
Heart Disease Classification using Naive Bayes, MLP, and LSTM
"""

# ============================================================================
# SECTION 1: IMPORT LIBRARIES AND SETUP
# ============================================================================

import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations and array handling
import matplotlib.pyplot as plt  # For creating visualizations
import seaborn as sns  # For advanced statistical visualizations
from sklearn.naive_bayes import GaussianNB  # For Gaussian Naive Bayes classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score  # For model evaluation metrics
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve  # For advanced evaluation
from sklearn.utils import resample  # For bootstrap sampling
import tensorflow as tf  # For deep learning framework
from tensorflow import keras  # For high-level neural network API
from tensorflow.keras.models import Sequential  # For creating sequential neural networks
from tensorflow.keras.layers import Dense, Dropout, LSTM, Reshape  # For neural network layers
from tensorflow.keras.callbacks import EarlyStopping  # For preventing overfitting during training
import joblib  # For loading saved models and preprocessors
import json  # For saving and loading metrics
import os  # For directory and file operations
import glob  # For file pattern matching
import warnings  # For suppressing unnecessary warnings
warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

# Set random seeds for reproducibility across all frameworks
np.random.seed(42)  # NumPy random seed
tf.random.set_seed(42)  # TensorFlow random seed

print("="*80)  # Print separator line for visual clarity
print("MEMBER 2: NAIVE BAYES AND DEEP LEARNING MODELS")  # Print module header
print("="*80)  # Print separator line for visual clarity

# ============================================================================
# SECTION 2: LOAD PREPROCESSED DATA AND SAVED ARTIFACTS
# ============================================================================

print("\n--- PHASE 1: LOADING PREPROCESSED DATA ---")  # Announce data loading phase

# Load the dataset to get original column information
df = pd.read_csv('data/heart_cleveland_upload.csv')  # Load heart disease dataset
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')  # Standardize column names

# Identify target column
target_column = None  # Initialize target column variable
if 'target' in df.columns:  # Check if 'target' column exists
    target_column = 'target'  # Set target column name
elif 'condition' in df.columns:  # Check if 'condition' column exists
    target_column = 'condition'  # Set target column name
else:  # If neither common name exists
    target_column = df.columns[-1]  # Use last column as target

print(f"Target column: {target_column}")  # Display target column name

# Load preprocessor saved by Member 1
preprocessor = joblib.load('artifacts/models/preprocessor.pkl')  # Load fitted preprocessor
print("Preprocessor loaded successfully")  # Confirm loading

# Load train-test split saved by Member 1
X_train, X_test, y_train, y_test = joblib.load('artifacts/models/train_test_split.pkl')  # Load split data
print(f" Train-test split loaded: {X_train.shape[0]} train, {X_test.shape[0]} test samples")  # Confirm loading

# Transform data using loaded preprocessor
X_train_processed = preprocessor.transform(X_train)  # Transform training data
X_test_processed = preprocessor.transform(X_test)  # Transform test data
print(f" Data preprocessed: {X_train_processed.shape[1]} features after transformation")  # Display feature count

# ============================================================================
# SECTION 3: NAIVE BAYES MODEL WITH BOOTSTRAP CONFIDENCE INTERVALS
# ============================================================================

print("\n--- PHASE 2: NAIVE BAYES MODEL ---")  # Announce Naive Bayes phase

# Initialize and train Gaussian Naive Bayes classifier
nb_model = GaussianNB()  # Create Naive Bayes model
nb_model.fit(X_train_processed, y_train)  # Train model on processed training data
y_pred_nb = nb_model.predict(X_test_processed)  # Make predictions on test set
y_pred_proba_nb = nb_model.predict_proba(X_test_processed)[:, 1]  # Get probability predictions for positive class

# Calculate evaluation metrics for Naive Bayes
nb_accuracy = accuracy_score(y_test, y_pred_nb)  # Calculate accuracy
nb_precision = precision_score(y_test, y_pred_nb, average='binary', zero_division=0)  # Calculate precision
nb_recall = recall_score(y_test, y_pred_nb, average='binary', zero_division=0)  # Calculate recall
nb_f1 = f1_score(y_test, y_pred_nb, average='binary', zero_division=0)  # Calculate F1-score

print(f"\nNaive Bayes Performance:")  # Announce NB results
print(f"  Accuracy:  {nb_accuracy:.4f}")  # Display accuracy
print(f"  Precision: {nb_precision:.4f}")  # Display precision
print(f"  Recall:    {nb_recall:.4f}")  # Display recall
print(f"  F1-Score:  {nb_f1:.4f}")  # Display F1-score

# ============================================================================
# PHASE 2A: BOOTSTRAP CONFIDENCE INTERVALS FOR NAIVE BAYES
# ============================================================================

print("\nCalculating Bootstrap Confidence Intervals (1000 iterations)...")  # Announce bootstrap
n_iterations = 1000  # Number of bootstrap iterations
bootstrap_accuracies = []  # List to store bootstrap accuracy scores

for i in range(n_iterations):  # Iterate through bootstrap samples
    # Resample test set with replacement
    X_test_boot, y_test_boot = resample(X_test_processed, y_test, random_state=i)  # Create bootstrap sample
    y_pred_boot = nb_model.predict(X_test_boot)  # Make predictions on bootstrap sample
    boot_acc = accuracy_score(y_test_boot, y_pred_boot)  # Calculate accuracy
    bootstrap_accuracies.append(boot_acc)  # Store accuracy

bootstrap_accuracies = np.array(bootstrap_accuracies)  # Convert to numpy array
ci_lower = np.percentile(bootstrap_accuracies, 2.5)  # Calculate lower bound of 95% CI
ci_upper = np.percentile(bootstrap_accuracies, 97.5)  # Calculate upper bound of 95% CI

print(f"Bootstrap Mean Accuracy: {bootstrap_accuracies.mean():.4f}")  # Display mean accuracy
print(f"95% Confidence Interval: [{ci_lower:.4f}, {ci_upper:.4f}]")  # Display confidence interval

# ============================================================================
# PHASE 2B: CONFUSION MATRIX FOR NAIVE BAYES
# ============================================================================

nb_cm = confusion_matrix(y_test, y_pred_nb)  # Calculate confusion matrix
plt.figure(figsize=(8, 6))  # Create figure
sns.heatmap(nb_cm, annot=True, fmt='d', cmap='Purples', cbar=True,  # Create heatmap
            xticklabels=['No Disease', 'Disease'],  # Label x-axis
            yticklabels=['No Disease', 'Disease'])  # Label y-axis
plt.title('Naive Bayes - Confusion Matrix', fontsize=14, fontweight='bold')  # Add title
plt.ylabel('Actual', fontsize=12)  # Label y-axis
plt.xlabel('Predicted', fontsize=12)  # Label x-axis
plt.tight_layout()  # Adjust layout
plt.savefig('artifacts/figures/nb_confusion_matrix.png', dpi=300, bbox_inches='tight')  # Save figure
plt.close()  # Close figure
print("Naive Bayes confusion matrix saved")  # Confirm save

# ============================================================================
# PHASE 2C: ROC CURVE FOR NAIVE BAYES
# ============================================================================

fpr_nb, tpr_nb, _ = roc_curve(y_test, y_pred_proba_nb)  # Calculate ROC curve points
roc_auc_nb = auc(fpr_nb, tpr_nb)  # Calculate area under ROC curve

plt.figure(figsize=(8, 6))  # Create figure
plt.plot(fpr_nb, tpr_nb, color='purple', lw=2, label=f'ROC curve (AUC = {roc_auc_nb:.2f})')  # Plot ROC curve
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')  # Plot diagonal reference
plt.xlim([0.0, 1.0])  # Set x-axis limits
plt.ylim([0.0, 1.05])  # Set y-axis limits
plt.xlabel('False Positive Rate', fontsize=12)  # Label x-axis
plt.ylabel('True Positive Rate', fontsize=12)  # Label y-axis
plt.title('Naive Bayes - ROC Curve', fontsize=14, fontweight='bold')  # Add title
plt.legend(loc="lower right")  # Add legend
plt.grid(alpha=0.3)  # Add grid
plt.tight_layout()  # Adjust layout
plt.savefig('artifacts/figures/nb_roc_curve.png', dpi=300, bbox_inches='tight')  # Save figure
plt.close()  # Close figure
print("Naive Bayes ROC curve saved")  # Confirm save

# ============================================================================
# PHASE 2D: PRECISION-RECALL CURVE FOR NAIVE BAYES
# ============================================================================

precision_nb, recall_nb, _ = precision_recall_curve(y_test, y_pred_proba_nb)  # Calculate PR curve points
plt.figure(figsize=(8, 6))  # Create figure
plt.plot(recall_nb, precision_nb, color='purple', lw=2, label='PR curve')  # Plot PR curve
plt.xlabel('Recall', fontsize=12)  # Label x-axis
plt.ylabel('Precision', fontsize=12)  # Label y-axis
plt.title('Naive Bayes - Precision-Recall Curve', fontsize=14, fontweight='bold')  # Add title
plt.legend(loc="lower left")  # Add legend
plt.grid(alpha=0.3)  # Add grid
plt.tight_layout()  # Adjust layout
plt.savefig('artifacts/figures/nb_pr_curve.png', dpi=300, bbox_inches='tight')  # Save figure
plt.close()  # Close figure
print("Naive Bayes PR curve saved")  # Confirm save

# Save Naive Bayes model and metrics
joblib.dump(nb_model, 'artifacts/models/naive_bayes_model.pkl')  # Save trained model
nb_metrics = {  # Create metrics dictionary
    'model': 'Naive Bayes',  # Model name
    'accuracy': float(nb_accuracy),  # Convert to float for JSON serialization
    'precision': float(nb_precision),  # Convert to float
    'recall': float(nb_recall),  # Convert to float
    'f1_score': float(nb_f1),  # Convert to float
    'roc_auc': float(roc_auc_nb),  # Convert to float
    'bootstrap_mean': float(bootstrap_accuracies.mean()),  # Bootstrap mean
    'ci_lower': float(ci_lower),  # Confidence interval lower bound
    'ci_upper': float(ci_upper)  # Confidence interval upper bound
}
with open('artifacts/metrics/naive_bayes_metrics.json', 'w') as f:  # Open file for writing
    json.dump(nb_metrics, f, indent=4)  # Save metrics as formatted JSON
print("Naive Bayes model and metrics saved")  # Confirm save

# ============================================================================
# SECTION 4: MULTILAYER PERCEPTRON (MLP) ARTIFICIAL NEURAL NETWORK
# ============================================================================

print("\n--- PHASE 3: MULTILAYER PERCEPTRON (ANN) ---")  # Announce MLP phase

# Build MLP architecture
mlp_model = Sequential([  # Create sequential model
    Dense(128, activation='relu', input_shape=(X_train_processed.shape[1],)),  # First hidden layer with 128 neurons
    Dropout(0.3),  # Dropout layer to prevent overfitting
    Dense(64, activation='relu'),  # Second hidden layer with 64 neurons
    Dropout(0.3),  # Dropout layer
    Dense(32, activation='relu'),  # Third hidden layer with 32 neurons
    Dropout(0.2),  # Dropout layer
    Dense(1, activation='sigmoid')  # Output layer with sigmoid for binary classification
])

# Compile MLP model
mlp_model.compile(optimizer='adam',  # Use Adam optimizer
                  loss='binary_crossentropy',  # Binary cross-entropy loss for binary classification
                  metrics=['accuracy'])  # Track accuracy during training

print("\nMLP Model Architecture:")  # Announce architecture summary
mlp_model.summary()  # Display model architecture

# Define early stopping callback to prevent overfitting
early_stop = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)  # Stop if no improvement

# Train MLP model
print("\nTraining MLP model...")  # Announce training
mlp_history = mlp_model.fit(X_train_processed, y_train,  # Train on processed data
                             epochs=100,  # Maximum number of epochs
                             batch_size=32,  # Batch size for training
                             validation_split=0.2,  # Use 20% of training data for validation
                             callbacks=[early_stop],  # Use early stopping
                             verbose=0)  # Suppress detailed output

print(f" MLP training completed in {len(mlp_history.history['loss'])} epochs")  # Display epochs trained

# Make predictions with MLP
y_pred_proba_mlp = mlp_model.predict(X_test_processed).ravel()  # Get probability predictions
y_pred_mlp = (y_pred_proba_mlp > 0.5).astype(int)  # Convert probabilities to binary predictions

# Calculate evaluation metrics for MLP
mlp_accuracy = accuracy_score(y_test, y_pred_mlp)  # Calculate accuracy
mlp_precision = precision_score(y_test, y_pred_mlp, average='binary', zero_division=0)  # Calculate precision
mlp_recall = recall_score(y_test, y_pred_mlp, average='binary', zero_division=0)  # Calculate recall
mlp_f1 = f1_score(y_test, y_pred_mlp, average='binary', zero_division=0)  # Calculate F1-score

print(f"\nMLP Performance:")  # Announce MLP results
print(f"  Accuracy:  {mlp_accuracy:.4f}")  # Display accuracy
print(f"  Precision: {mlp_precision:.4f}")  # Display precision
print(f"  Recall:    {mlp_recall:.4f}")  # Display recall
print(f"  F1-Score:  {mlp_f1:.4f}")  # Display F1-score

# ============================================================================
# PHASE 3A: TRAINING CURVES FOR MLP
# ============================================================================

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))  # Create figure with two subplots

# Plot training and validation loss
ax1.plot(mlp_history.history['loss'], label='Training Loss', color='blue', linewidth=2)  # Plot training loss
ax1.plot(mlp_history.history['val_loss'], label='Validation Loss', color='red', linewidth=2)  # Plot validation loss
ax1.set_xlabel('Epoch', fontsize=12)  # Label x-axis
ax1.set_ylabel('Loss', fontsize=12)  # Label y-axis
ax1.set_title('MLP - Loss Curves', fontsize=14, fontweight='bold')  # Add title
ax1.legend()  # Add legend
ax1.grid(alpha=0.3)  # Add grid

# Plot training and validation accuracy
ax2.plot(mlp_history.history['accuracy'], label='Training Accuracy', color='blue', linewidth=2)  # Plot train accuracy
ax2.plot(mlp_history.history['val_accuracy'], label='Validation Accuracy', color='red', linewidth=2)  # Plot val accuracy
ax2.set_xlabel('Epoch', fontsize=12)  # Label x-axis
ax2.set_ylabel('Accuracy', fontsize=12)  # Label y-axis
ax2.set_title('MLP - Accuracy Curves', fontsize=14, fontweight='bold')  # Add title
ax2.legend()  # Add legend
ax2.grid(alpha=0.3)  # Add grid

plt.tight_layout()  # Adjust spacing
plt.savefig('artifacts/figures/mlp_training_curves.png', dpi=300, bbox_inches='tight')  # Save figure
plt.close()  # Close figure
print("MLP training curves saved")  # Confirm save

# ============================================================================
# PHASE 3B: CONFUSION MATRIX FOR MLP
# ============================================================================

mlp_cm = confusion_matrix(y_test, y_pred_mlp)  # Calculate confusion matrix
plt.figure(figsize=(8, 6))  # Create figure
sns.heatmap(mlp_cm, annot=True, fmt='d', cmap='Oranges', cbar=True,  # Create heatmap
            xticklabels=['No Disease', 'Disease'],  # Label x-axis
            yticklabels=['No Disease', 'Disease'])  # Label y-axis
plt.title('MLP - Confusion Matrix', fontsize=14, fontweight='bold')  # Add title
plt.ylabel('Actual', fontsize=12)  # Label y-axis
plt.xlabel('Predicted', fontsize=12)  # Label x-axis
plt.tight_layout()  # Adjust layout
plt.savefig('artifacts/figures/mlp_confusion_matrix.png', dpi=300, bbox_inches='tight')  # Save figure
plt.close()  # Close figure
print("MLP confusion matrix saved")  # Confirm save

# ============================================================================
# PHASE 3C: ROC CURVE FOR MLP
# ============================================================================

fpr_mlp, tpr_mlp, _ = roc_curve(y_test, y_pred_proba_mlp)  # Calculate ROC curve points
roc_auc_mlp = auc(fpr_mlp, tpr_mlp)  # Calculate area under ROC curve

plt.figure(figsize=(8, 6))  # Create figure
plt.plot(fpr_mlp, tpr_mlp, color='orange', lw=2, label=f'ROC curve (AUC = {roc_auc_mlp:.2f})')  # Plot ROC curve
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')  # Plot diagonal reference
plt.xlim([0.0, 1.0])  # Set x-axis limits
plt.ylim([0.0, 1.05])  # Set y-axis limits
plt.xlabel('False Positive Rate', fontsize=12)  # Label x-axis
plt.ylabel('True Positive Rate', fontsize=12)  # Label y-axis
plt.title('MLP - ROC Curve', fontsize=14, fontweight='bold')  # Add title
plt.legend(loc="lower right")  # Add legend
plt.grid(alpha=0.3)  # Add grid
plt.tight_layout()  # Adjust layout
plt.savefig('artifacts/figures/mlp_roc_curve.png', dpi=300, bbox_inches='tight')  # Save figure
plt.close()  # Close figure
print("MLP ROC curve saved")  # Confirm save

# ============================================================================
# PHASE 3D: PRECISION-RECALL CURVE FOR MLP
# ============================================================================

precision_mlp, recall_mlp, _ = precision_recall_curve(y_test, y_pred_proba_mlp)  # Calculate PR curve points
plt.figure(figsize=(8, 6))  # Create figure
plt.plot(recall_mlp, precision_mlp, color='orange', lw=2, label='PR curve')  # Plot PR curve
plt.xlabel('Recall', fontsize=12)  # Label x-axis
plt.ylabel('Precision', fontsize=12)  # Label y-axis
plt.title('MLP - Precision-Recall Curve', fontsize=14, fontweight='bold')  # Add title
plt.legend(loc="lower left")  # Add legend
plt.grid(alpha=0.3)  # Add grid
plt.tight_layout()  # Adjust layout
plt.savefig('artifacts/figures/mlp_pr_curve.png', dpi=300, bbox_inches='tight')  # Save figure
plt.close()  # Close figure
print("MLP PR curve saved")  # Confirm save

# Save MLP model and metrics
mlp_model.save('artifacts/models/mlp_model.h5')  # Save trained Keras model
mlp_metrics = {  # Create metrics dictionary
    'model': 'MLP (ANN)',  # Model name
    'accuracy': float(mlp_accuracy),  # Convert to float for JSON serialization
    'precision': float(mlp_precision),  # Convert to float
    'recall': float(mlp_recall),  # Convert to float
    'f1_score': float(mlp_f1),  # Convert to float
    'roc_auc': float(roc_auc_mlp),  # Convert to float
    'epochs_trained': len(mlp_history.history['loss']),  # Number of epochs
    'final_train_loss': float(mlp_history.history['loss'][-1]),  # Final training loss
    'final_val_loss': float(mlp_history.history['val_loss'][-1])  # Final validation loss
}
with open('artifacts/metrics/mlp_metrics.json', 'w') as f:  # Open file for writing
    json.dump(mlp_metrics, f, indent=4)  # Save metrics as formatted JSON
print("MLP model and metrics saved")  # Confirm save

# ============================================================================
# SECTION 5: LSTM MODEL (ADVANCED EXTENSION)
# ============================================================================

print("\n--- PHASE 4: LSTM MODEL (ADVANCED EXTENSION) ---")  # Announce LSTM phase

# Reshape data for LSTM (samples, timesteps, features)
X_train_lstm = X_train_processed.reshape((X_train_processed.shape[0], 1, X_train_processed.shape[1]))  # Reshape training
X_test_lstm = X_test_processed.reshape((X_test_processed.shape[0], 1, X_test_processed.shape[1]))  # Reshape test

print(f"LSTM input shape: {X_train_lstm.shape}")  # Display reshaped input dimensions

# Build LSTM architecture
lstm_model = Sequential([  # Create sequential model
    LSTM(64, activation='tanh', return_sequences=True, input_shape=(1, X_train_processed.shape[1])),  # First LSTM layer
    Dropout(0.3),  # Dropout layer
    LSTM(32, activation='tanh'),  # Second LSTM layer
    Dropout(0.3),  # Dropout layer
    Dense(16, activation='relu'),  # Dense hidden layer
    Dropout(0.2),  # Dropout layer
    Dense(1, activation='sigmoid')  # Output layer with sigmoid
])

# Compile LSTM model
lstm_model.compile(optimizer='adam',  # Use Adam optimizer
                   loss='binary_crossentropy',  # Binary cross-entropy loss
                   metrics=['accuracy'])  # Track accuracy

print("\nLSTM Model Architecture:")  # Announce architecture summary
lstm_model.summary()  # Display model architecture

# Train LSTM model
print("\nTraining LSTM model...")  # Announce training
lstm_history = lstm_model.fit(X_train_lstm, y_train,  # Train on reshaped data
                               epochs=100,  # Maximum number of epochs
                               batch_size=32,  # Batch size for training
                               validation_split=0.2,  # Use 20% for validation
                               callbacks=[early_stop],  # Use early stopping
                               verbose=0)  # Suppress detailed output

print(f" LSTM training completed in {len(lstm_history.history['loss'])} epochs")  # Display epochs trained

# Make predictions with LSTM
y_pred_proba_lstm = lstm_model.predict(X_test_lstm).ravel()  # Get probability predictions
y_pred_lstm = (y_pred_proba_lstm > 0.5).astype(int)  # Convert probabilities to binary predictions

# Calculate evaluation metrics for LSTM
lstm_accuracy = accuracy_score(y_test, y_pred_lstm)  # Calculate accuracy
lstm_precision = precision_score(y_test, y_pred_lstm, average='binary', zero_division=0)  # Calculate precision
lstm_recall = recall_score(y_test, y_pred_lstm, average='binary', zero_division=0)  # Calculate recall
lstm_f1 = f1_score(y_test, y_pred_lstm, average='binary', zero_division=0)  # Calculate F1-score

print(f"\nLSTM Performance:")  # Announce LSTM results
print(f"  Accuracy:  {lstm_accuracy:.4f}")  # Display accuracy
print(f"  Precision: {lstm_precision:.4f}")  # Display precision
print(f"  Recall:    {lstm_recall:.4f}")  # Display recall
print(f"  F1-Score:  {lstm_f1:.4f}")  # Display F1-score

# ============================================================================
# PHASE 4A: TRAINING CURVES FOR LSTM
# ============================================================================

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))  # Create figure with two subplots

# Plot training and validation loss
ax1.plot(lstm_history.history['loss'], label='Training Loss', color='blue', linewidth=2)  # Plot training loss
ax1.plot(lstm_history.history['val_loss'], label='Validation Loss', color='red', linewidth=2)  # Plot validation loss
ax1.set_xlabel('Epoch', fontsize=12)  # Label x-axis
ax1.set_ylabel('Loss', fontsize=12)  # Label y-axis
ax1.set_title('LSTM - Loss Curves', fontsize=14, fontweight='bold')  # Add title
ax1.legend()  # Add legend
ax1.grid(alpha=0.3)  # Add grid

# Plot training and validation accuracy
ax2.plot(lstm_history.history['accuracy'], label='Training Accuracy', color='blue', linewidth=2)  # Plot train accuracy
ax2.plot(lstm_history.history['val_accuracy'], label='Validation Accuracy', color='red', linewidth=2)  # Plot val accuracy
ax2.set_xlabel('Epoch', fontsize=12)  # Label x-axis
ax2.set_ylabel('Accuracy', fontsize=12)  # Label y-axis
ax2.set_title('LSTM - Accuracy Curves', fontsize=14, fontweight='bold')  # Add title
ax2.legend()  # Add legend
ax2.grid(alpha=0.3)  # Add grid

plt.tight_layout()  # Adjust spacing
plt.savefig('artifacts/figures/lstm_training_curves.png', dpi=300, bbox_inches='tight')  # Save figure
plt.close()  # Close figure
print("LSTM training curves saved")  # Confirm save

# ============================================================================
# PHASE 4B: CONFUSION MATRIX FOR LSTM
# ============================================================================

lstm_cm = confusion_matrix(y_test, y_pred_lstm)  # Calculate confusion matrix
plt.figure(figsize=(8, 6))  # Create figure
sns.heatmap(lstm_cm, annot=True, fmt='d', cmap='YlOrRd', cbar=True,  # Create heatmap
            xticklabels=['No Disease', 'Disease'],  # Label x-axis
            yticklabels=['No Disease', 'Disease'])  # Label y-axis
plt.title('LSTM - Confusion Matrix', fontsize=14, fontweight='bold')  # Add title
plt.ylabel('Actual', fontsize=12)  # Label y-axis
plt.xlabel('Predicted', fontsize=12)  # Label x-axis
plt.tight_layout()  # Adjust layout
plt.savefig('artifacts/figures/lstm_confusion_matrix.png', dpi=300, bbox_inches='tight')  # Save figure
plt.close()  # Close figure
print("LSTM confusion matrix saved")  # Confirm save

# ============================================================================
# PHASE 4C: ROC CURVE FOR LSTM
# ============================================================================

fpr_lstm, tpr_lstm, _ = roc_curve(y_test, y_pred_proba_lstm)  # Calculate ROC curve points
roc_auc_lstm = auc(fpr_lstm, tpr_lstm)  # Calculate area under ROC curve

plt.figure(figsize=(8, 6))  # Create figure
plt.plot(fpr_lstm, tpr_lstm, color='red', lw=2, label=f'ROC curve (AUC = {roc_auc_lstm:.2f})')  # Plot ROC curve
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')  # Plot diagonal reference
plt.xlim([0.0, 1.0])  # Set x-axis limits
plt.ylim([0.0, 1.05])  # Set y-axis limits
plt.xlabel('False Positive Rate', fontsize=12)  # Label x-axis
plt.ylabel('True Positive Rate', fontsize=12)  # Label y-axis
plt.title('LSTM - ROC Curve', fontsize=14, fontweight='bold')  # Add title
plt.legend(loc="lower right")  # Add legend
plt.grid(alpha=0.3)  # Add grid
plt.tight_layout()  # Adjust layout
plt.savefig('artifacts/figures/lstm_roc_curve.png', dpi=300, bbox_inches='tight')  # Save figure
plt.close()  # Close figure
print("LSTM ROC curve saved")  # Confirm save

# ============================================================================
# PHASE 4D: PRECISION-RECALL CURVE FOR LSTM
# ============================================================================

precision_lstm, recall_lstm, _ = precision_recall_curve(y_test, y_pred_proba_lstm)  # Calculate PR curve points
plt.figure(figsize=(8, 6))  # Create figure
plt.plot(recall_lstm, precision_lstm, color='red', lw=2, label='PR curve')  # Plot PR curve
plt.xlabel('Recall', fontsize=12)  # Label x-axis
plt.ylabel('Precision', fontsize=12)  # Label y-axis
plt.title('LSTM - Precision-Recall Curve', fontsize=14, fontweight='bold')  # Add title
plt.legend(loc="lower left")  # Add legend
plt.grid(alpha=0.3)  # Add grid
plt.tight_layout()  # Adjust layout
plt.savefig('artifacts/figures/lstm_pr_curve.png', dpi=300, bbox_inches='tight')  # Save figure
plt.close()  # Close figure
print("LSTM PR curve saved")  # Confirm save

# Save LSTM model and metrics
lstm_model.save('artifacts/models/lstm_model.h5')  # Save trained Keras model
lstm_metrics = {  # Create metrics dictionary
    'model': 'LSTM',  # Model name
    'accuracy': float(lstm_accuracy),  # Convert to float for JSON serialization
    'precision': float(lstm_precision),  # Convert to float
    'recall': float(lstm_recall),  # Convert to float
    'f1_score': float(lstm_f1),  # Convert to float
    'roc_auc': float(roc_auc_lstm),  # Convert to float
    'epochs_trained': len(lstm_history.history['loss']),  # Number of epochs
    'final_train_loss': float(lstm_history.history['loss'][-1]),  # Final training loss
    'final_val_loss': float(lstm_history.history['val_loss'][-1])  # Final validation loss
}
with open('artifacts/metrics/lstm_metrics.json', 'w') as f:  # Open file for writing
    json.dump(lstm_metrics, f, indent=4)  # Save metrics as formatted JSON
print("LSTM model and metrics saved")  # Confirm save

# ============================================================================
# SECTION 6: AGGREGATE ALL METRICS AND CREATE COMPARISON
# ============================================================================

print("\n--- PHASE 5: AGGREGATING METRICS ---")  # Announce aggregation phase

# Load all metric JSON files
metric_files = glob.glob('artifacts/metrics/*_metrics.json')  # Find all metric JSON files
all_metrics = []  # List to store all metrics

for file in metric_files:  # Iterate through metric files
    with open(file, 'r') as f:  # Open file for reading
        metrics = json.load(f)  # Load metrics from JSON
        all_metrics.append(metrics)  # Add to list

# Create DataFrame from all metrics
metrics_df = pd.DataFrame(all_metrics)  # Convert list of dicts to DataFrame
metrics_df = metrics_df[['model', 'accuracy', 'precision', 'recall', 'f1_score', 'roc_auc']]  # Select key columns
metrics_df = metrics_df.sort_values('accuracy', ascending=False)  # Sort by accuracy descending

# Save aggregated metrics to CSV
metrics_df.to_csv('artifacts/metrics/all_models_comparison.csv', index=False)  # Save to CSV file
print("\n Aggregated metrics saved to all_models_comparison.csv")  # Confirm save

print("\nAll Models Performance Comparison:")  # Announce comparison table
print(metrics_df.to_string(index=False))  # Display formatted table

# ============================================================================
# PHASE 5A: CREATE COMPARISON BAR CHART
# ============================================================================

# Prepare data for visualization
models = metrics_df['model'].tolist()  # Extract model names
accuracy_scores = metrics_df['accuracy'].tolist()  # Extract accuracy scores
precision_scores = metrics_df['precision'].tolist()  # Extract precision scores
recall_scores = metrics_df['recall'].tolist()  # Extract recall scores
f1_scores = metrics_df['f1_score'].tolist()  # Extract F1 scores

# Set up bar chart parameters
x = np.arange(len(models))  # Create x-axis positions
width = 0.2  # Width of each bar

# Create figure and bar chart
fig, ax = plt.subplots(figsize=(14, 8))  # Create large figure
bar1 = ax.bar(x - 1.5*width, accuracy_scores, width, label='Accuracy', color='#2E86AB')  # Accuracy bars
bar2 = ax.bar(x - 0.5*width, precision_scores, width, label='Precision', color='#A23B72')  # Precision bars
bar3 = ax.bar(x + 0.5*width, recall_scores, width, label='Recall', color='#F18F01')  # Recall bars
bar4 = ax.bar(x + 1.5*width, f1_scores, width, label='F1-Score', color='#C73E1D')  # F1-Score bars

# Add value labels on top of bars
for bars in [bar1, bar2, bar3, bar4]:  # Iterate through bar groups
    for bar in bars:  # Iterate through individual bars
        height = bar.get_height()  # Get bar height
        ax.text(bar.get_x() + bar.get_width()/2., height,  # Position text
                f'{height:.3f}',  # Format value to 3 decimals
                ha='center', va='bottom', fontsize=9)  # Set text alignment and size

# Customize chart appearance
ax.set_xlabel('Models', fontsize=14, fontweight='bold')  # Label x-axis
ax.set_ylabel('Score', fontsize=14, fontweight='bold')  # Label y-axis
ax.set_title('Performance Comparison Across All Models', fontsize=16, fontweight='bold')  # Add title
ax.set_xticks(x)  # Set x-axis tick positions
ax.set_xticklabels(models, fontsize=11)  # Set x-axis tick labels
ax.legend(fontsize=11, loc='lower right')  # Add legend
ax.set_ylim([0, 1.1])  # Set y-axis limits
ax.grid(axis='y', alpha=0.3, linestyle='--')  # Add horizontal grid lines
ax.axhline(y=0.5, color='gray', linestyle='--', linewidth=1, alpha=0.5)  # Add reference line at 0.5

plt.tight_layout()  # Adjust layout
plt.savefig('artifacts/figures/all_models_comparison.png', dpi=300, bbox_inches='tight')  # Save figure
plt.close()  # Close figure
print("Comparison bar chart saved")  # Confirm save

# ============================================================================
# SECTION 7: SUMMARY AND COMPLETION
# ============================================================================

print("\n" + "="*80)  # Print separator line
print("MEMBER 2 COMPLETION SUMMARY")  # Print summary header
print("="*80)  # Print separator line
print("\n Naive Bayes trained and evaluated with bootstrap CI")  # Confirm NB completion
print("MLP (ANN) trained and evaluated with training curves")  # Confirm MLP completion
print("LSTM trained and evaluated with training curves")  # Confirm LSTM completion
print("All models saved to artifacts/models/")  # Confirm model saves
print("All metrics saved to artifacts/metrics/")  # Confirm metrics saves
print("All figures saved to artifacts/figures/")  # Confirm figure saves
print("Aggregated comparison CSV and chart created")  # Confirm aggregation

print("\nFinal Model Rankings (by Accuracy):")  # Announce rankings
for idx, row in metrics_df.iterrows():  # Iterate through sorted dataframe
    print(f"  {idx+1}. {row['model']}: {row['accuracy']:.4f}")  # Display rank and accuracy

print("\n" + "="*80)  # Print separator line
print("ALL MODELS READY FOR PRESENTATION")  # Print completion message
print("="*80)  # Print separator line

MEMBER 2: NAIVE BAYES AND DEEP LEARNING MODELS

--- PHASE 1: LOADING PREPROCESSED DATA ---
Target column: condition
Preprocessor loaded successfully
 Train-test split loaded: 237 train, 60 test samples
 Data preprocessed: 13 features after transformation

--- PHASE 2: NAIVE BAYES MODEL ---

Naive Bayes Performance:
  Accuracy:  0.8667
  Precision: 1.0000
  Recall:    0.7143
  F1-Score:  0.8333

Calculating Bootstrap Confidence Intervals (1000 iterations)...
Bootstrap Mean Accuracy: 0.8667
95% Confidence Interval: [0.7667, 0.9500]
Naive Bayes confusion matrix saved
Naive Bayes ROC curve saved
Naive Bayes PR curve saved
Naive Bayes model and metrics saved

--- PHASE 3: MULTILAYER PERCEPTRON (ANN) ---

MLP Model Architecture:



Training MLP model...
 MLP training completed in 42 epochs
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195ms/step

MLP Performance:
  Accuracy:  0.9167
  Precision: 1.0000
  Recall:    0.8214
  F1-Score:  0.9020
MLP training curves saved
MLP confusion matrix saved
MLP ROC curve saved




MLP PR curve saved
MLP model and metrics saved

--- PHASE 4: LSTM MODEL (ADVANCED EXTENSION) ---
LSTM input shape: (237, 1, 13)

LSTM Model Architecture:



Training LSTM model...
 LSTM training completed in 49 epochs
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 764ms/step

LSTM Performance:
  Accuracy:  0.9000
  Precision: 1.0000
  Recall:    0.7857
  F1-Score:  0.8800
LSTM training curves saved
LSTM confusion matrix saved
LSTM ROC curve saved




LSTM PR curve saved
LSTM model and metrics saved

--- PHASE 5: AGGREGATING METRICS ---

 Aggregated metrics saved to all_models_comparison.csv

All Models Performance Comparison:
        model  accuracy  precision   recall  f1_score  roc_auc
    MLP (ANN)  0.916667   1.000000 0.821429  0.901961 0.950893
         LSTM  0.900000   1.000000 0.785714  0.880000 0.947545
  Naive Bayes  0.866667   1.000000 0.714286  0.833333 0.912946
Random Forest  0.833333   0.875000 0.750000  0.807692 0.919643
Decision Tree  0.800000   0.833333 0.714286  0.769231 0.843750
Comparison bar chart saved

MEMBER 2 COMPLETION SUMMARY

 Naive Bayes trained and evaluated with bootstrap CI
MLP (ANN) trained and evaluated with training curves
LSTM trained and evaluated with training curves
All models saved to artifacts/models/
All metrics saved to artifacts/metrics/
All figures saved to artifacts/figures/
Aggregated comparison CSV and chart created

Final Model Rankings (by Accuracy):
  3. MLP (ANN): 0.9167
  2. LSTM: