In [1]:
"""
Member 1: Data Preprocessing and Traditional Machine Learning Models
Heart Disease Classification using Decision Tree and Random Forest
"""

# ============================================================================
# SECTION 1: IMPORT LIBRARIES AND SETUP
# ============================================================================

import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations and array handling
import matplotlib.pyplot as plt  # For creating visualizations
import seaborn as sns  # For advanced statistical visualizations
from sklearn.model_selection import train_test_split, cross_val_score  # For data splitting and cross-validation
from sklearn.preprocessing import StandardScaler, LabelEncoder  # For feature scaling and encoding
from sklearn.compose import ColumnTransformer  # For applying different preprocessing to different columns
from sklearn.pipeline import Pipeline  # For creating preprocessing pipelines
from sklearn.tree import DecisionTreeClassifier  # For decision tree model
from sklearn.ensemble import RandomForestClassifier  # For random forest ensemble model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score  # For model evaluation metrics
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve  # For advanced evaluation
import joblib  # For saving models and preprocessors
import json  # For saving metrics as JSON files
import os  # For directory and file operations
import warnings  # For suppressing unnecessary warnings
# Silence every warning category so the printed output below stays readable.
warnings.simplefilter('ignore')

# Seed NumPy's global generator so repeated runs draw the same random numbers.
np.random.seed(seed=42)

# ============================================================================
# SECTION 2: CREATE REQUIRED DIRECTORIES
# ============================================================================

# Make sure each artifact folder is present before anything is written to it.
for artifact_dir in ('artifacts/models', 'artifacts/metrics', 'artifacts/figures'):
    os.makedirs(artifact_dir, exist_ok=True)

# Banner announcing this member's part of the workflow.
banner_rule = "=" * 80
print(banner_rule)
print("MEMBER 1: DATA PREPROCESSING AND TRADITIONAL ML MODELS")
print(banner_rule)

# ============================================================================
# SECTION 3: DATA LOADING AND INITIAL EXPLORATION
# ============================================================================

print("\n--- PHASE 1: DATA LOADING ---")

# Read the CSV, then normalise the headers: trim surrounding whitespace,
# lower-case everything, and turn inner spaces into underscores.
df = pd.read_csv('data/heart_cleveland_upload.csv')
normalized_headers = df.columns.str.strip().str.lower().str.replace(' ', '_')
df.columns = normalized_headers

# Report the frame's dimensions and the cleaned header list.
row_count, column_count = df.shape
print(f"Dataset loaded successfully: {row_count} rows, {column_count} columns")
print(f"\nColumn names: {list(df.columns)}")

# Preview the first records to show the data layout.
print("\nFirst 5 rows of the dataset:")
print(df.head())

# ============================================================================
# SECTION 4: DATA DESCRIPTION AND EXPLORATORY DATA ANALYSIS
# ============================================================================

print("\n--- PHASE 2: EXPLORATORY DATA ANALYSIS ---")

# Print three overviews in turn: descriptive statistics, per-column null
# counts, and column dtypes. Each report is computed just before it is shown.
for heading, build_report in (
    ("\nDataset Statistical Summary:", df.describe),
    ("\nMissing Values Count:", lambda: df.isnull().sum()),
    ("\nData Types:", lambda: df.dtypes),
):
    print(heading)
    print(build_report())

# Choose the label column: prefer 'target', then 'condition'; when neither
# header exists, fall back to the final column of the frame.
target_column = next(
    (candidate for candidate in ('target', 'condition') if candidate in df.columns),
    df.columns[-1],
)

print(f"\nTarget column identified: {target_column}")

# Show how many records fall into each class of the label column.
print("\nClass Distribution:")
print(df[target_column].value_counts())

# ============================================================================
# PHASE 2A: VISUALIZE CLASS BALANCE
# ============================================================================

# Draw one bar per class. value_counts() orders its result by frequency, so the
# counts are re-sorted by class label: bar position and colour then stay tied
# to the class itself (lowest label first) no matter which class is more common.
class_counts = df[target_column].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 6))
class_counts.plot(kind='bar', color=['#2E86AB', '#A23B72'], rot=0, ax=ax)  # rot=0 keeps tick labels horizontal
ax.set_title('Class Distribution - Heart Disease', fontsize=14, fontweight='bold')
ax.set_xlabel('Class (0=No Disease, 1=Disease)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
fig.savefig('artifacts/figures/class_distribution.png', dpi=300, bbox_inches='tight')
plt.close(fig)  # release the figure once it has been written to disk
print("Class distribution plot saved")

# ============================================================================
# PHASE 2B: CORRELATION HEATMAP
# ============================================================================

# Pairwise correlations across every numeric column, drawn as an annotated heatmap.
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numeric_cols].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    ax=heatmap_ax,
    annot=True, fmt='.2f',        # write each coefficient with two decimals
    cmap='coolwarm', center=0,    # diverging palette centred on zero
    square=True, linewidths=1,
    cbar_kws={"shrink": 0.8},
)
heatmap_ax.set_title('Feature Correlation Heatmap', fontsize=14, fontweight='bold')
heatmap_fig.tight_layout()
heatmap_fig.savefig('artifacts/figures/correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.close(heatmap_fig)
print("Correlation heatmap saved")

# ============================================================================
# PHASE 2C: FEATURE DISTRIBUTIONS
# ============================================================================

# One histogram per numeric column on a grid four panels wide. The number of
# rows is derived from the column count, so no column is silently left out
# when there are more than 16 of them (the previous fixed 4x4 grid stopped at 16).
hist_columns = list(numeric_cols)
grid_width = 4
grid_height = max(1, int(np.ceil(len(hist_columns) / grid_width)))  # at least one row
fig, axes = plt.subplots(grid_height, grid_width,
                         figsize=(16, 3 * grid_height), squeeze=False)
axes = axes.ravel()  # flatten so panels can be addressed with a single index

for panel, col in zip(axes, hist_columns):
    panel.hist(df[col].dropna(), bins=30, color='skyblue', edgecolor='black', alpha=0.7)
    panel.set_title(f'{col}', fontsize=10, fontweight='bold')
    panel.set_xlabel('Value', fontsize=8)
    panel.set_ylabel('Frequency', fontsize=8)
    panel.grid(axis='y', alpha=0.3)

# Remove the panels left over when the columns do not fill the last row.
for unused_panel in axes[len(hist_columns):]:
    fig.delaxes(unused_panel)

fig.tight_layout()
fig.savefig('artifacts/figures/feature_distributions.png', dpi=300, bbox_inches='tight')
plt.close(fig)
print("Feature distribution histograms saved")

# ============================================================================
# SECTION 5: DATA PREPROCESSING AND FEATURE ENGINEERING
# ============================================================================

print("\n--- PHASE 3: DATA PREPROCESSING ---")

# Split the frame into the feature matrix X and the label vector y.
X = df.drop(columns=[target_column])
y = df[target_column]

# Partition the feature columns by dtype. 'category' and 'bool' columns are
# grouped with the object columns: np.number does not match them, and a column
# that lands in neither list is not handed to any transformer further down,
# which would remove it from the model input without any message.
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

print(f"Numeric features ({len(numeric_features)}): {numeric_features}")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")

# Numeric columns go through a single standard-scaling step.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Assemble the list of (name, transformer, columns) entries. The one-hot
# branch is appended only when the frame actually has categorical columns.
column_transformers = [('num', numeric_transformer, numeric_features)]
if categorical_features:
    from sklearn.preprocessing import OneHotEncoder  # needed on this branch only
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
    ])
    column_transformers.append(('cat', categorical_transformer, categorical_features))

preprocessor = ColumnTransformer(transformers=column_transformers)

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"\nTraining set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")

# Learn the preprocessing parameters from the training rows only, then apply
# the fitted transformer to both partitions.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Persist the fitted preprocessor so other team members can reuse it.
joblib.dump(preprocessor, 'artifacts/models/preprocessor.pkl')
print("\n Preprocessor saved to artifacts/models/preprocessor.pkl")

# Persist the exact split (as a 4-tuple) so every model sees the same rows.
split_bundle = (X_train, X_test, y_train, y_test)
joblib.dump(split_bundle, 'artifacts/models/train_test_split.pkl')
print("Train-test split saved to artifacts/models/train_test_split.pkl")

# ============================================================================
# SECTION 6: DECISION TREE MODEL
# ============================================================================

print("\n--- PHASE 4: DECISION TREE MODEL ---")

# Fit a depth-limited decision tree on the processed training matrix.
dt_settings = {'random_state': 42, 'max_depth': 10, 'min_samples_split': 10}
dt_model = DecisionTreeClassifier(**dt_settings)
dt_model.fit(X_train_processed, y_train)

# Hard labels, plus column 1 of predict_proba as the positive-class score.
y_pred_dt = dt_model.predict(X_test_processed)
y_pred_proba_dt = dt_model.predict_proba(X_test_processed)[:, 1]

# Hold-out metrics; the precision/recall/F1 calls share the same options.
binary_opts = {'average': 'binary', 'zero_division': 0}
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_precision = precision_score(y_test, y_pred_dt, **binary_opts)
dt_recall = recall_score(y_test, y_pred_dt, **binary_opts)
dt_f1 = f1_score(y_test, y_pred_dt, **binary_opts)

print("\nDecision Tree Performance:")
# Labels are left-padded to a common width so the numbers line up in a column.
for metric_label, metric_value in (
    ("Accuracy:", dt_accuracy),
    ("Precision:", dt_precision),
    ("Recall:", dt_recall),
    ("F1-Score:", dt_f1),
):
    print(f"  {metric_label:<11}{metric_value:.4f}")

# ============================================================================
# PHASE 4A: CROSS-VALIDATION FOR DECISION TREE
# ============================================================================

print("\nPerforming 5-Fold Cross-Validation for Decision Tree...")

# Cross-validate on the raw training features with the preprocessor inside the
# pipeline, so each fold fits its own preprocessing on that fold's training
# part only. Scoring the already-transformed matrix would let statistics
# computed from the whole training set reach the validation folds.
# cross_val_score works on clones of the estimator, so the fitted
# `preprocessor` and `dt_model` objects are left as they are.
dt_cv_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', dt_model)])
dt_cv_scores = cross_val_score(dt_cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')
print(f"CV Accuracy Scores: {dt_cv_scores}")
# The printed spread is two standard deviations of the fold scores.
print(f"Mean CV Accuracy: {dt_cv_scores.mean():.4f} (+/- {dt_cv_scores.std() * 2:.4f})")

# ============================================================================
# PHASE 4B: CONFUSION MATRIX FOR DECISION TREE
# ============================================================================

# Confusion matrix of the hold-out predictions, rendered as an annotated heatmap.
dt_cm = confusion_matrix(y_test, y_pred_dt)
dt_class_names = ['No Disease', 'Disease']

dt_cm_fig, dt_cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    dt_cm,
    ax=dt_cm_ax,
    annot=True, fmt='d',          # integer counts inside each cell
    cmap='Blues', cbar=True,
    xticklabels=dt_class_names,
    yticklabels=dt_class_names,
)
dt_cm_ax.set_title('Decision Tree - Confusion Matrix', fontsize=14, fontweight='bold')
dt_cm_ax.set_ylabel('Actual', fontsize=12)
dt_cm_ax.set_xlabel('Predicted', fontsize=12)
dt_cm_fig.tight_layout()
dt_cm_fig.savefig('artifacts/figures/dt_confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.close(dt_cm_fig)
print("Decision Tree confusion matrix saved")

# ============================================================================
# PHASE 4C: ROC CURVE FOR DECISION TREE
# ============================================================================

# ROC points from the positive-class scores, and the area under that curve.
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_pred_proba_dt)
roc_auc_dt = auc(fpr_dt, tpr_dt)

dt_roc_fig, dt_roc_ax = plt.subplots(figsize=(8, 6))
dt_roc_ax.plot(fpr_dt, tpr_dt, color='darkorange', lw=2,
               label=f'ROC curve (AUC = {roc_auc_dt:.2f})')
# Dashed diagonal: the reference line for a random classifier.
dt_roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
               label='Random Classifier')
dt_roc_ax.set_xlim([0.0, 1.0])
dt_roc_ax.set_ylim([0.0, 1.05])  # small headroom above TPR = 1
dt_roc_ax.set_xlabel('False Positive Rate', fontsize=12)
dt_roc_ax.set_ylabel('True Positive Rate', fontsize=12)
dt_roc_ax.set_title('Decision Tree - ROC Curve', fontsize=14, fontweight='bold')
dt_roc_ax.legend(loc="lower right")
dt_roc_ax.grid(alpha=0.3)
dt_roc_fig.tight_layout()
dt_roc_fig.savefig('artifacts/figures/dt_roc_curve.png', dpi=300, bbox_inches='tight')
plt.close(dt_roc_fig)
print("Decision Tree ROC curve saved")

# ============================================================================
# PHASE 4D: PRECISION-RECALL CURVE FOR DECISION TREE
# ============================================================================

# Precision/recall pairs across score thresholds for the decision tree.
precision_dt, recall_dt, _ = precision_recall_curve(y_test, y_pred_proba_dt)

dt_pr_fig, dt_pr_ax = plt.subplots(figsize=(8, 6))
dt_pr_ax.plot(recall_dt, precision_dt, color='blue', lw=2, label='PR curve')
dt_pr_ax.set_xlabel('Recall', fontsize=12)
dt_pr_ax.set_ylabel('Precision', fontsize=12)
dt_pr_ax.set_title('Decision Tree - Precision-Recall Curve', fontsize=14, fontweight='bold')
dt_pr_ax.legend(loc="lower left")
dt_pr_ax.grid(alpha=0.3)
dt_pr_fig.tight_layout()
dt_pr_fig.savefig('artifacts/figures/dt_pr_curve.png', dpi=300, bbox_inches='tight')
plt.close(dt_pr_fig)
print("Decision Tree PR curve saved")

# Persist the fitted tree alongside a JSON summary of its scores.
joblib.dump(dt_model, 'artifacts/models/decision_tree_model.pkl')

# Every score is cast to a built-in float before it is written as JSON.
dt_metrics = {'model': 'Decision Tree'}
dt_metrics.update(
    (metric_name, float(metric_value))
    for metric_name, metric_value in (
        ('accuracy', dt_accuracy),
        ('precision', dt_precision),
        ('recall', dt_recall),
        ('f1_score', dt_f1),
        ('roc_auc', roc_auc_dt),
        ('cv_mean', dt_cv_scores.mean()),
        ('cv_std', dt_cv_scores.std()),
    )
)
with open('artifacts/metrics/decision_tree_metrics.json', 'w') as f:
    json.dump(dt_metrics, f, indent=4)
print("Decision Tree model and metrics saved")

# ============================================================================
# SECTION 7: RANDOM FOREST MODEL
# ============================================================================

print("\n--- PHASE 5: RANDOM FOREST MODEL ---")

# Fit a 100-tree forest with the same depth and split limits as the single tree.
rf_settings = {
    'n_estimators': 100,
    'random_state': 42,
    'max_depth': 10,
    'min_samples_split': 10,
}
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train_processed, y_train)

# Hard labels, plus column 1 of predict_proba as the positive-class score.
y_pred_rf = rf_model.predict(X_test_processed)
y_pred_proba_rf = rf_model.predict_proba(X_test_processed)[:, 1]

# Hold-out metrics; the precision/recall/F1 calls share the same options.
rf_binary_opts = {'average': 'binary', 'zero_division': 0}
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_precision = precision_score(y_test, y_pred_rf, **rf_binary_opts)
rf_recall = recall_score(y_test, y_pred_rf, **rf_binary_opts)
rf_f1 = f1_score(y_test, y_pred_rf, **rf_binary_opts)

print("\nRandom Forest Performance:")
# Labels are left-padded to a common width so the numbers line up in a column.
for metric_label, metric_value in (
    ("Accuracy:", rf_accuracy),
    ("Precision:", rf_precision),
    ("Recall:", rf_recall),
    ("F1-Score:", rf_f1),
):
    print(f"  {metric_label:<11}{metric_value:.4f}")

# ============================================================================
# PHASE 5A: CROSS-VALIDATION FOR RANDOM FOREST
# ============================================================================

print("\nPerforming 5-Fold Cross-Validation for Random Forest...")

# Cross-validate on the raw training features with the preprocessor inside the
# pipeline, so each fold fits its own preprocessing on that fold's training
# part only. Scoring the already-transformed matrix would let statistics
# computed from the whole training set reach the validation folds.
# cross_val_score works on clones of the estimator, so the fitted
# `preprocessor` and `rf_model` objects are left as they are.
rf_cv_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', rf_model)])
rf_cv_scores = cross_val_score(rf_cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')
print(f"CV Accuracy Scores: {rf_cv_scores}")
# The printed spread is two standard deviations of the fold scores.
print(f"Mean CV Accuracy: {rf_cv_scores.mean():.4f} (+/- {rf_cv_scores.std() * 2:.4f})")

# ============================================================================
# PHASE 5B: FEATURE IMPORTANCE FOR RANDOM FOREST
# ============================================================================

feature_importances = rf_model.feature_importances_

# Label source: the transformer's own output names when one-hot encoding
# expanded the columns, otherwise the plain numeric column names.
if len(categorical_features) > 0:
    feature_names = preprocessor.get_feature_names_out()
else:
    feature_names = numeric_features

# Rank features from most to least important and keep at most fifteen.
indices = np.argsort(feature_importances)[::-1]
top_n = min(15, len(feature_importances))
top_indices = indices[:top_n]
bar_positions = range(top_n)

rf_imp_fig, rf_imp_ax = plt.subplots(figsize=(10, 8))
rf_imp_ax.barh(bar_positions, feature_importances[top_indices], color='teal', alpha=0.7)
rf_imp_ax.set_yticks(bar_positions)
rf_imp_ax.set_yticklabels([feature_names[i] for i in top_indices])
rf_imp_ax.set_xlabel('Importance', fontsize=12)
rf_imp_ax.set_ylabel('Features', fontsize=12)
rf_imp_ax.set_title('Random Forest - Top Feature Importances', fontsize=14, fontweight='bold')
rf_imp_ax.invert_yaxis()  # put the highest-ranked feature at the top
rf_imp_ax.grid(axis='x', alpha=0.3)
rf_imp_fig.tight_layout()
rf_imp_fig.savefig('artifacts/figures/rf_feature_importance.png', dpi=300, bbox_inches='tight')
plt.close(rf_imp_fig)
print("Random Forest feature importance plot saved")

# ============================================================================
# PHASE 5C: CONFUSION MATRIX FOR RANDOM FOREST
# ============================================================================

# Confusion matrix of the hold-out predictions, rendered as an annotated heatmap.
rf_cm = confusion_matrix(y_test, y_pred_rf)
rf_class_names = ['No Disease', 'Disease']

rf_cm_fig, rf_cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    rf_cm,
    ax=rf_cm_ax,
    annot=True, fmt='d',          # integer counts inside each cell
    cmap='Greens', cbar=True,
    xticklabels=rf_class_names,
    yticklabels=rf_class_names,
)
rf_cm_ax.set_title('Random Forest - Confusion Matrix', fontsize=14, fontweight='bold')
rf_cm_ax.set_ylabel('Actual', fontsize=12)
rf_cm_ax.set_xlabel('Predicted', fontsize=12)
rf_cm_fig.tight_layout()
rf_cm_fig.savefig('artifacts/figures/rf_confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.close(rf_cm_fig)
print("Random Forest confusion matrix saved")

# ============================================================================
# PHASE 5D: ROC CURVE FOR RANDOM FOREST
# ============================================================================

# ROC points from the positive-class scores, and the area under that curve.
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_proba_rf)
roc_auc_rf = auc(fpr_rf, tpr_rf)

rf_roc_fig, rf_roc_ax = plt.subplots(figsize=(8, 6))
rf_roc_ax.plot(fpr_rf, tpr_rf, color='green', lw=2,
               label=f'ROC curve (AUC = {roc_auc_rf:.2f})')
# Dashed diagonal: the reference line for a random classifier.
rf_roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
               label='Random Classifier')
rf_roc_ax.set_xlim([0.0, 1.0])
rf_roc_ax.set_ylim([0.0, 1.05])  # small headroom above TPR = 1
rf_roc_ax.set_xlabel('False Positive Rate', fontsize=12)
rf_roc_ax.set_ylabel('True Positive Rate', fontsize=12)
rf_roc_ax.set_title('Random Forest - ROC Curve', fontsize=14, fontweight='bold')
rf_roc_ax.legend(loc="lower right")
rf_roc_ax.grid(alpha=0.3)
rf_roc_fig.tight_layout()
rf_roc_fig.savefig('artifacts/figures/rf_roc_curve.png', dpi=300, bbox_inches='tight')
plt.close(rf_roc_fig)
print("Random Forest ROC curve saved")

# ============================================================================
# PHASE 5E: PRECISION-RECALL CURVE FOR RANDOM FOREST
# ============================================================================

# Precision/recall pairs across score thresholds for the random forest.
precision_rf, recall_rf, _ = precision_recall_curve(y_test, y_pred_proba_rf)

rf_pr_fig, rf_pr_ax = plt.subplots(figsize=(8, 6))
rf_pr_ax.plot(recall_rf, precision_rf, color='green', lw=2, label='PR curve')
rf_pr_ax.set_xlabel('Recall', fontsize=12)
rf_pr_ax.set_ylabel('Precision', fontsize=12)
rf_pr_ax.set_title('Random Forest - Precision-Recall Curve', fontsize=14, fontweight='bold')
rf_pr_ax.legend(loc="lower left")
rf_pr_ax.grid(alpha=0.3)
rf_pr_fig.tight_layout()
rf_pr_fig.savefig('artifacts/figures/rf_pr_curve.png', dpi=300, bbox_inches='tight')
plt.close(rf_pr_fig)
print("Random Forest PR curve saved")

# Persist the fitted forest alongside a JSON summary of its scores.
joblib.dump(rf_model, 'artifacts/models/random_forest_model.pkl')

# Every score is cast to a built-in float before it is written as JSON.
rf_metrics = {'model': 'Random Forest'}
rf_metrics.update(
    (metric_name, float(metric_value))
    for metric_name, metric_value in (
        ('accuracy', rf_accuracy),
        ('precision', rf_precision),
        ('recall', rf_recall),
        ('f1_score', rf_f1),
        ('roc_auc', roc_auc_rf),
        ('cv_mean', rf_cv_scores.mean()),
        ('cv_std', rf_cv_scores.std()),
    )
)
with open('artifacts/metrics/random_forest_metrics.json', 'w') as f:
    json.dump(rf_metrics, f, indent=4)
print("Random Forest model and metrics saved")

# ============================================================================
# SECTION 8: SUMMARY AND COMPLETION
# ============================================================================

# Closing banner followed by one status line per completed stage.
summary_rule = "=" * 80
print("\n" + summary_rule)
print("MEMBER 1 COMPLETION SUMMARY")
print(summary_rule)
for status_line in (
    "\n Data preprocessing completed",
    "Decision Tree trained and evaluated",
    "Random Forest trained and evaluated",
    "All models saved to artifacts/models/",
    "All metrics saved to artifacts/metrics/",
    "All figures saved to artifacts/figures/",
    "\nReady for Member 2 to proceed with Naive Bayes and Deep Learning models.",
):
    print(status_line)
print(summary_rule)

MEMBER 1: DATA PREPROCESSING AND TRADITIONAL ML MODELS

--- PHASE 1: DATA LOADING ---
Dataset loaded successfully: 297 rows, 14 columns

Column names: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'condition']

First 5 rows of the dataset:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   69    1   0       160   234    1        2      131      0      0.1      1   
1   69    0   0       140   239    0        0      151      0      1.8      0   
2   66    0   0       150   226    0        0      114      0      2.6      2   
3   65    1   0       138   282    1        2      174      0      1.4      1   
4   64    1   0       110   211    0        2      144      1      1.8      1   

   ca  thal  condition  
0   1     0          0  
1   2     0          0  
2   0     0          0  
3   1     0          1  
4   0     0          0  

--- PHASE 2: EXPLORATORY DATA ANALYSIS ---

Dataset Sta