In [None]:
# Core libraries
import numpy as np
import pandas as pd
import warnings
import sys
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine Learning - Core
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif, RFE

# Machine Learning - Models
from sklearn.ensemble import (
    GradientBoostingClassifier, 
    VotingClassifier, 
    StackingClassifier,
    ExtraTreesClassifier
)
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# For comprehensive model options
try:
    from xgboost import XGBClassifier
    from lightgbm import LGBMClassifier
    advanced_models_available = True
except ImportError:
    print("⚠️ XGBoost/LightGBM not available - using sklearn models only")
    advanced_models_available = False

# Evaluation and Metrics
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    roc_auc_score, 
    roc_curve,
    precision_recall_curve,
    average_precision_score,
    make_scorer,
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score
)

# Statistical Analysis
from scipy import stats
from scipy.stats import chi2_contingency

# Configuration
# Use matplotlib's default style sheet and seaborn's "husl" colour palette for all plots.
plt.style.use('default')
sns.set_palette("husl")
# Never truncate columns when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
# Seed NumPy's global RNG so later np.random.* calls are repeatable.
np.random.seed(42)

# Report the interpreter version and whether the optional XGBoost/LightGBM imports succeeded.
print("✅ All libraries imported successfully!")
print(f"📊 Environment: Python {sys.version.split()[0]}")
print(f"🚀 Advanced models available: {advanced_models_available}")


In [None]:
# Read heart.csv from the working directory; if the file is missing, fall back
# to a purely random stand-in frame so the rest of the notebook can still run.
try:
    df_raw = pd.read_csv('heart.csv')
except FileNotFoundError:
    print("⚠️ heart.csv not found. Please ensure the dataset is in the current directory.")
    print("Creating sample dataset for demonstration...")
    # (column, kind, low, high) — drawn strictly in this order so the seeded
    # random stream yields the same values for every column.
    np.random.seed(42)
    n_samples = 300
    synthetic_spec = [
        ('age', 'int', 30, 80),
        ('sex', 'int', 0, 2),
        ('cp', 'int', 0, 4),
        ('trestbps', 'int', 90, 200),
        ('chol', 'int', 120, 400),
        ('fbs', 'int', 0, 2),
        ('restecg', 'int', 0, 3),
        ('thalach', 'int', 80, 200),
        ('exang', 'int', 0, 2),
        ('oldpeak', 'float', 0, 6),
        ('slope', 'int', 0, 3),
        ('ca', 'int', 0, 5),
        ('thal', 'int', 1, 4),
        ('target', 'int', 0, 2),
    ]
    synthetic_columns = {}
    for column_name, kind, low, high in synthetic_spec:
        if kind == 'int':
            synthetic_columns[column_name] = np.random.randint(low, high, n_samples)
        else:
            synthetic_columns[column_name] = np.random.uniform(low, high, n_samples)
    df_raw = pd.DataFrame(synthetic_columns)
    print("✅ Sample dataset created!")
else:
    print("✅ Dataset loaded successfully!")

# Work on a copy so df_raw stays untouched for later reference
df = df_raw.copy()

n_rows, n_cols = df.shape
print(f"📈 Dataset Shape: {df.shape}")
print(f"🏥 Total Patients: {n_rows:,}")
# One column is the target, the rest are counted as features
print(f"🔬 Features Available: {n_cols - 1}")

# Peek at the first rows
print("\n📋 Dataset Preview:")
display(df.head())


In [None]:
# Enhanced data profiling function
def comprehensive_data_profile(dataframe):
    """
    Print a structural profile of ``dataframe`` and return its summary table.

    The printed report covers, in order: row/column counts and deep memory
    usage, the number of columns per dtype, per-column missing-value counts
    and percentages (only columns that have missing values), and the number
    of fully duplicated rows.

    Returns:
        The result of ``dataframe.describe(include='all')``.
    """
    banner = "=" * 80
    rule = "-" * 40

    def _section(title):
        # Every section is a title line followed by a 40-dash rule.
        print(title)
        print(rule)

    print(banner)
    print("📋 COMPREHENSIVE DATA PROFILING REPORT")
    print(banner)

    # --- structure -------------------------------------------------------
    n_rows, n_cols = dataframe.shape
    megabytes = dataframe.memory_usage(deep=True).sum() / 1024**2
    _section("\n🏗️ DATASET STRUCTURE:")
    print(f"Rows: {n_rows:,}")
    print(f"Columns: {n_cols}")
    print(f"Memory Usage: {megabytes:.2f} MB")

    # --- dtypes ----------------------------------------------------------
    _section("\n📊 DATA TYPES DISTRIBUTION:")
    for dtype, n_columns in dataframe.dtypes.value_counts().items():
        print(f"{dtype}: {n_columns} columns")

    # --- missing values --------------------------------------------------
    _section("\n❌ MISSING VALUES ANALYSIS:")
    null_counts = dataframe.isnull().sum()
    if null_counts.sum() == 0:
        print("✅ No missing values detected!")
    else:
        null_percent = (null_counts / len(dataframe)) * 100
        missing_table = pd.DataFrame({
            'Missing Count': null_counts[null_counts > 0],
            'Percentage': null_percent[null_percent > 0]
        })
        print(missing_table.sort_values('Percentage', ascending=False))

    # --- duplicates ------------------------------------------------------
    _section("\n🔄 DUPLICATE ANALYSIS:")
    n_duplicates = dataframe.duplicated().sum()
    print(f"Duplicate rows: {n_duplicates} ({n_duplicates/len(dataframe)*100:.2f}%)")

    return dataframe.describe(include='all')

# Generate comprehensive profile
# The call prints the profiling report for `df`; the describe() table it
# returns is kept in `desc_stats` and rendered as a rich table.
desc_stats = comprehensive_data_profile(df)
display(desc_stats)


In [None]:
# Clinical feature mapping with medical interpretation
# Maps each raw column name to a human-readable description; used below to
# build the feature reference table.
FEATURE_MAPPING = {
    'age': 'Patient Age (years)',
    'sex': 'Biological Sex (1=Male, 0=Female)',
    'cp': 'Chest Pain Type (0-3)',
    'trestbps': 'Resting Blood Pressure (mmHg)',
    'chol': 'Serum Cholesterol (mg/dl)',
    'fbs': 'Fasting Blood Sugar >120 mg/dl (1=Yes)',
    'restecg': 'Resting ECG Results (0-2)',
    'thalach': 'Maximum Heart Rate Achieved',
    'exang': 'Exercise Induced Angina (1=Yes)',
    'oldpeak': 'ST Depression Induced by Exercise',
    'slope': 'Slope of Peak Exercise ST Segment (0-2)',
    'ca': 'Number of Major Vessels (0-4)',
    'thal': 'Thalassemia Type (1-3)',
    'target': 'Heart Disease Presence (1=Disease, 0=No Disease)'
}

# Clinical risk thresholds (based on medical literature)
# NOTE(review): none of the cells visible in this section read this dict; the
# feature-engineering code hard-codes its own cut-offs — confirm whether the
# two are meant to stay in sync.
CLINICAL_THRESHOLDS = {
    'age_high_risk': 65,
    'chol_high': 240,
    'chol_borderline': 200,
    'trestbps_high': 140,
    'trestbps_elevated': 130,
    'thalach_low_fitness': 100
}

# Two-column reference table: raw column name -> clinical description
feature_df = pd.DataFrame(
    list(FEATURE_MAPPING.items()),
    columns=['Feature', 'Clinical Meaning']
)

print("🏥 CLINICAL FEATURE REFERENCE:")
print("=" * 80)
display(feature_df)

# Class balance of the target, ordered by class value
target_dist = df['target'].value_counts().sort_index()
n_patients = len(df)
print("\n❤️ HEART DISEASE DISTRIBUTION:")
print("-" * 40)
for target_value, n_in_class in target_dist.items():
    if target_value == 0:
        label = "No Disease"
    else:
        label = "Disease Present"
    percentage = (n_in_class / n_patients) * 100
    print(f"{label}: {n_in_class:,} patients ({percentage:.1f}%)")

# Per-class means of a few vital signs (columns absent from df are skipped)
print("\n📊 BASIC STATISTICS BY DISEASE STATUS:")
print("-" * 50)
is_healthy = df['target'] == 0
is_diseased = df['target'] == 1
for feature in ['age', 'chol', 'trestbps', 'thalach']:
    if feature not in df.columns:
        continue
    no_disease_mean = df.loc[is_healthy, feature].mean()
    disease_mean = df.loc[is_diseased, feature].mean()
    print(f"{feature.upper()}: No Disease={no_disease_mean:.1f}, Disease={disease_mean:.1f}")


In [None]:
class AdvancedFeatureEngineer:
    """
    Advanced feature engineering for cardiovascular risk assessment.

    Each public method copies its input frame, adds new columns to the copy
    and returns it; the caller's frame is never mutated. The names of all
    columns added so far are kept in ``self.features_created``.
    """
    
    def __init__(self):
        # Names of engineered columns in creation order, without duplicates.
        self.features_created = []
    
    def _register_features(self, names):
        """Record created column names once, even if a method is run again."""
        for name in names:
            if name not in self.features_created:
                self.features_created.append(name)
    
    def create_clinical_risk_features(self, df):
        """
        Create clinically meaningful risk stratification features.

        Args:
            df: DataFrame with the columns ``age``, ``chol``, ``trestbps``,
                ``thalach``, ``exang``, ``fbs``, ``oldpeak`` and ``ca``.

        Returns:
            A copy of ``df`` with eight extra columns: ``age_risk_category``,
            ``chol_risk``, ``bp_category``, ``hr_reserve``,
            ``hr_reserve_percent``, ``exercise_capacity``,
            ``metabolic_risk_score`` and ``cv_risk_index``.
        """
        df_enhanced = df.copy()
        
        # Age-based risk stratification: (0,40], (40,55], (55,65], (65,100].
        # Ages outside (0, 100] become NaN.
        df_enhanced['age_risk_category'] = pd.cut(
            df_enhanced['age'], 
            bins=[0, 40, 55, 65, 100], 
            labels=['Low', 'Moderate', 'High', 'Very High']
        )
        
        # Cholesterol risk categories (ATP III guidelines):
        # <200 desirable, 200-239 borderline, >=240 high.
        # right=False makes each bin [low, high) so that exactly 200 and 240
        # fall into the higher category; the open upper edge keeps very large
        # readings in 'High' instead of turning them into NaN.
        df_enhanced['chol_risk'] = pd.cut(
            df_enhanced['chol'],
            bins=[0, 200, 240, np.inf],
            labels=['Desirable', 'Borderline', 'High'],
            right=False
        )
        
        # Blood pressure categories (AHA guidelines):
        # <120 normal, 120-129 elevated, 130-139 stage 1, >=140 stage 2.
        # Same left-closed binning as above so 120/130/140 are classified
        # into the higher category.
        df_enhanced['bp_category'] = pd.cut(
            df_enhanced['trestbps'],
            bins=[0, 120, 130, 140, np.inf],
            labels=['Normal', 'Elevated', 'Stage1_HTN', 'Stage2_HTN'],
            right=False
        )
        
        # Heart rate features relative to the age-predicted maximum (220 - age).
        # hr_reserve is the gap between predicted maximum and achieved rate;
        # hr_reserve_percent is the achieved rate as a percentage of the
        # predicted maximum (despite its name it is not a share of the gap).
        max_hr_predicted = 220 - df_enhanced['age']
        df_enhanced['hr_reserve'] = max_hr_predicted - df_enhanced['thalach']
        df_enhanced['hr_reserve_percent'] = (df_enhanced['thalach'] / max_hr_predicted) * 100
        
        # Exercise capacity: 'Good' / 'Fair' require no exercise-induced
        # angina and a peak rate of at least 150 / 120; everything else is 'Poor'.
        no_angina = df_enhanced['exang'] == 0
        df_enhanced['exercise_capacity'] = np.where(
            (df_enhanced['thalach'] >= 150) & no_angina,
            'Good',
            np.where(
                (df_enhanced['thalach'] >= 120) & no_angina,
                'Fair',
                'Poor'
            )
        )
        
        # Metabolic risk score: 2 points each for high cholesterol and raised
        # fasting blood sugar, 3 points for high resting blood pressure.
        # NOTE(review): the strict '>' comparisons are kept from the original
        # scoring, so readings of exactly 240 / 140 score 0 here even though
        # the categories above label them 'High' / 'Stage2_HTN' — confirm.
        df_enhanced['metabolic_risk_score'] = (
            (df_enhanced['chol'] > 240).astype(int) * 2 +
            (df_enhanced['fbs'] == 1).astype(int) * 2 +
            (df_enhanced['trestbps'] > 140).astype(int) * 3
        )
        
        # Composite cardiovascular risk index: weighted sum of rescaled inputs
        # (weights 0.3 / 0.2 / 0.2 / 0.15 / 0.15).
        df_enhanced['cv_risk_index'] = (
            (df_enhanced['age'] / 100) * 0.3 +
            (df_enhanced['chol'] / 400) * 0.2 +
            (df_enhanced['trestbps'] / 200) * 0.2 +
            (df_enhanced['oldpeak'] / 10) * 0.15 +
            (df_enhanced['ca'] / 4) * 0.15
        )
        
        self._register_features([
            'age_risk_category', 'chol_risk', 'bp_category',
            'hr_reserve', 'hr_reserve_percent', 'exercise_capacity',
            'metabolic_risk_score', 'cv_risk_index'
        ])
        
        return df_enhanced
    
    def create_interaction_features(self, df):
        """
        Create meaningful feature interactions.

        Args:
            df: DataFrame with the columns ``age``, ``sex``, ``chol``,
                ``thalach``, ``oldpeak`` and ``trestbps``.

        Returns:
            A copy of ``df`` with four extra columns: ``age_sex_interaction``,
            ``chol_age_ratio``, ``exercise_response`` and ``vascular_health``.
        """
        df_interactions = df.copy()
        
        # Age-gender interaction: age for rows with sex == 1, zero otherwise
        df_interactions['age_sex_interaction'] = df_interactions['age'] * df_interactions['sex']
        
        # Cholesterol-age interaction
        df_interactions['chol_age_ratio'] = df_interactions['chol'] / df_interactions['age']
        
        # Exercise response (thalach vs oldpeak); the +1 avoids division by
        # zero when oldpeak is 0
        df_interactions['exercise_response'] = df_interactions['thalach'] / (df_interactions['oldpeak'] + 1)
        
        # Vascular health score: peak heart rate relative to resting pressure
        df_interactions['vascular_health'] = (
            df_interactions['thalach'] / df_interactions['trestbps']
        )
        
        self._register_features([
            'age_sex_interaction', 'chol_age_ratio', 
            'exercise_response', 'vascular_health'
        ])
        
        return df_interactions

# Build the engineer and run both feature-creation stages in sequence
feature_engineer = AdvancedFeatureEngineer()

print("🔬 ADVANCED FEATURE ENGINEERING PIPELINE")
print("=" * 80)

# Stage 1: clinical risk stratification columns
print("\n⚕️ Creating clinical risk stratification features...")
df_clinical = feature_engineer.create_clinical_risk_features(df)

# Stage 2: interaction columns on top of stage 1
print("🔄 Creating feature interactions...")
df_engineered = feature_engineer.create_interaction_features(df_clinical)

created_names = feature_engineer.features_created
n_original_cols = df.shape[1]
n_enhanced_cols = df_engineered.shape[1]

print("\n✅ Feature engineering complete!")
print(f"📊 Original features: {n_original_cols}")
print(f"🚀 Enhanced features: {n_enhanced_cols}")
print(f"➕ New features created: {len(created_names)}")

# Numbered list of everything the engineer added
print("\n🆕 NEW FEATURES CREATED:")
for position, feature in enumerate(created_names, start=1):
    print(f"{position:2d}. {feature}")

# Preview a handful of raw columns next to their engineered counterparts
print("\n📋 SAMPLE OF ENGINEERED FEATURES:")
print("-" * 50)
sample_cols = ['age', 'age_risk_category', 'chol', 'chol_risk', 'cv_risk_index', 'metabolic_risk_score']
display(df_engineered.loc[:, sample_cols].head())


In [None]:
# Advanced EDA with clinical insights
def analyze_clinical_risk_factors(df):
    """
    Print how the target is distributed across three engineered risk groupings.

    For ``age_risk_category`` the row-normalised percentages and the p-value of
    a chi-square test on the count table are printed; for ``chol_risk`` and
    ``exercise_capacity`` only the row-normalised percentages are printed.

    Returns:
        A tuple ``(age count crosstab, cholesterol percentage table,
        exercise-capacity percentage table)``. Note the first element holds
        raw counts while the other two hold percentages.
    """
    def _percent_by_group(column):
        # Share of each target value within every level of `column`, in percent.
        return pd.crosstab(df[column], df['target'], normalize='index') * 100

    def _heading(title):
        print(title)
        print("-" * 40)

    print("⚕️ CLINICAL RISK FACTOR ANALYSIS")
    print("=" * 80)

    # Age: counts feed the chi-square test, percentages are what gets shown
    age_risk_crosstab = pd.crosstab(df['age_risk_category'], df['target'])
    _statistic, p_value, _dof, _expected = chi2_contingency(age_risk_crosstab)

    _heading("\n📊 AGE RISK STRATIFICATION:")
    print(_percent_by_group('age_risk_category').round(1))
    print(f"\n📈 Chi-square test p-value: {p_value:.2e}")

    # Cholesterol
    _heading("\n🧪 CHOLESTEROL RISK ANALYSIS:")
    chol_risk_pct = _percent_by_group('chol_risk')
    print(chol_risk_pct.round(1))

    # Exercise capacity
    _heading("\n🏃 EXERCISE CAPACITY ASSESSMENT:")
    exercise_risk_pct = _percent_by_group('exercise_capacity')
    print(exercise_risk_pct.round(1))

    return age_risk_crosstab, chol_risk_pct, exercise_risk_pct

# Execute advanced EDA
# Prints the risk tables; the returned tuple is (age count crosstab,
# cholesterol row-% table, exercise-capacity row-% table).
risk_analysis = analyze_clinical_risk_factors(df_engineered)

# Create comprehensive visualization dashboard
def create_eda_visualizations(df):
    """
    Draw a 2x3 dashboard of exploratory plots and return the figure.

    Panels, left to right and top to bottom: age histogram per disease status,
    cholesterol vs resting blood pressure scatter with threshold lines,
    exercise-capacity counts, CV risk index histogram per disease status,
    metabolic risk score counts, and a correlation heatmap of key features.

    Expects the engineered columns ``exercise_capacity``, ``cv_risk_index``
    and ``metabolic_risk_score`` in addition to the raw clinical columns.
    The figure is shown with ``plt.show()`` before being returned.
    """
    def _status_label(target_val):
        return "No Disease" if target_val == 0 else "Heart Disease"

    def _histogram_by_status(ax, column, xlabel, title):
        # One semi-transparent histogram per target value, in order of appearance.
        for target_val in df['target'].unique():
            values = df.loc[df['target'] == target_val, column]
            ax.hist(values, alpha=0.7, label=_status_label(target_val), bins=15)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Frequency')
        ax.set_title(title)
        ax.legend()
        ax.grid(alpha=0.3)

    def _counts_by_status(ax, column, palette, title, xlabel):
        # Grouped bars of target counts for every level of `column`.
        pd.crosstab(df[column], df['target']).plot(kind='bar', ax=ax, color=palette)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Count')
        ax.legend(['No Disease', 'Heart Disease'])

    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle('Advanced Cardiovascular Risk Analysis Dashboard', fontsize=16, fontweight='bold')
    (ax_age, ax_matrix, ax_exercise), (ax_cv, ax_metabolic, ax_corr) = axes

    # 1. Age distribution by heart disease
    _histogram_by_status(ax_age, 'age', 'Age (years)', 'Age Distribution by Disease Status')

    # 2. Cholesterol vs blood pressure, coloured by disease status
    point_colors = df['target'].map(lambda t: 'green' if t == 0 else 'red').tolist()
    ax_matrix.scatter(df['chol'], df['trestbps'], c=point_colors, alpha=0.6)
    ax_matrix.axhline(y=140, color='orange', linestyle='--', alpha=0.7, label='HTN Threshold')
    ax_matrix.axvline(x=240, color='purple', linestyle='--', alpha=0.7, label='High Cholesterol')
    ax_matrix.set_xlabel('Cholesterol (mg/dl)')
    ax_matrix.set_ylabel('Resting BP (mmHg)')
    ax_matrix.set_title('Cholesterol vs BP Risk Matrix')
    ax_matrix.legend()
    ax_matrix.grid(alpha=0.3)

    # 3. Exercise capacity counts (tick labels tilted for readability)
    _counts_by_status(
        ax_exercise, 'exercise_capacity', ['lightgreen', 'lightcoral'],
        'Exercise Capacity vs Heart Disease', 'Exercise Capacity'
    )
    ax_exercise.tick_params(axis='x', rotation=45)

    # 4. Cardiovascular risk index distribution
    _histogram_by_status(ax_cv, 'cv_risk_index', 'CV Risk Index', 'Cardiovascular Risk Index Distribution')

    # 5. Metabolic risk score counts
    _counts_by_status(
        ax_metabolic, 'metabolic_risk_score', ['lightblue', 'orange'],
        'Metabolic Risk Score Analysis', 'Metabolic Risk Score'
    )

    # 6. Correlation heatmap of key features
    key_features = ['age', 'chol', 'trestbps', 'thalach', 'oldpeak',
                    'cv_risk_index', 'metabolic_risk_score', 'target']
    sns.heatmap(df[key_features].corr(), annot=True, cmap='coolwarm', center=0,
                ax=ax_corr, fmt='.2f', square=True)
    ax_corr.set_title('Feature Correlation Matrix')

    plt.tight_layout()
    plt.show()

    return fig

# Create comprehensive visualizations
# The dashboard is drawn from the engineered frame because several panels use
# engineered columns; the returned Figure is kept in `eda_visualizations`.
print("\n📊 GENERATING COMPREHENSIVE VISUALIZATION DASHBOARD")
print("=" * 80)
eda_visualizations = create_eda_visualizations(df_engineered)


In [None]:
class AdvancedFeatureSelector:
    """
    Multi-method feature selection for optimal model performance.

    Results of every method are stored under a method key
    ('univariate', 'rfe', 'importance', 'ensemble') in
    ``self.selected_features`` (lists of column names) and
    ``self.feature_scores`` (per-feature scores, rankings or votes).
    """
    
    def __init__(self):
        self.selected_features = {}
        self.feature_scores = {}
    
    def prepare_features(self, df, target_col='target'):
        """
        Prepare features for selection (handle categorical variables).

        Ordered categorical columns are replaced by their category codes so
        the encoded integers keep the category order (missing values become
        -1). Other object/category columns are label-encoded on their string
        form, which orders the labels alphabetically.

        Args:
            df: Input frame including the target column; it is not mutated.
            target_col: Name of the target column, which is never encoded.

        Returns:
            ``(X, y)``: the encoded frame without ``target_col`` and the
            target column.
        """
        df_processed = df.copy()
        
        # Encode categorical variables
        categorical_cols = df_processed.select_dtypes(include=['object', 'category']).columns
        
        for col in categorical_cols:
            if col == target_col:
                continue
            column = df_processed[col]
            if isinstance(column.dtype, pd.CategoricalDtype) and column.cat.ordered:
                # Alphabetical label encoding would scramble the category
                # order (e.g. 'High' < 'Low' < 'Moderate'); the category codes
                # preserve it.
                df_processed[col] = column.cat.codes
            else:
                le = LabelEncoder()
                df_processed[col] = le.fit_transform(column.astype(str))
        
        # Separate features and target
        X = df_processed.drop(target_col, axis=1)
        y = df_processed[target_col]
        
        return X, y
    
    def univariate_selection(self, X, y, k=15):
        """
        Univariate feature selection using f_classif.

        Args:
            X, y: Feature frame and target.
            k: Number of top-scoring features to keep; an integer larger than
               the number of columns is clamped to the number of columns.

        Returns:
            ``(selected feature names, {feature: F-score})``.
        """
        if isinstance(k, int):
            # SelectKBest rejects k > n_features, so never ask for more
            # features than exist.
            k = min(k, X.shape[1])
        selector = SelectKBest(score_func=f_classif, k=k)
        selector.fit(X, y)
        
        selected_features = X.columns[selector.get_support()].tolist()
        feature_scores = dict(zip(X.columns, selector.scores_))
        
        self.selected_features['univariate'] = selected_features
        self.feature_scores['univariate'] = feature_scores
        
        return selected_features, feature_scores
    
    def recursive_feature_elimination(self, X, y, n_features=15):
        """
        Recursive Feature Elimination with ExtraTreesClassifier.

        Args:
            X, y: Feature frame and target.
            n_features: Number of features to keep; an integer larger than
                the number of columns is clamped to the number of columns.

        Returns:
            ``(selected feature names, {feature: RFE ranking})`` where
            ranking 1 marks a selected feature.
        """
        if isinstance(n_features, int):
            n_features = min(n_features, X.shape[1])
        estimator = ExtraTreesClassifier(n_estimators=100, random_state=42)
        selector = RFE(estimator, n_features_to_select=n_features, step=1)
        selector.fit(X, y)
        
        selected_features = X.columns[selector.get_support()].tolist()
        feature_rankings = dict(zip(X.columns, selector.ranking_))
        
        self.selected_features['rfe'] = selected_features
        self.feature_scores['rfe'] = feature_rankings
        
        return selected_features, feature_rankings
    
    def feature_importance_selection(self, X, y, threshold=0.01):
        """
        Feature selection based on tree-based feature importance.

        Fits an ExtraTrees and a GradientBoosting classifier, averages their
        ``feature_importances_`` and keeps the features whose mean importance
        is strictly above ``threshold``.

        Returns:
            ``(selected feature names, importance DataFrame)``; the frame has
            one column per model plus ``mean_importance``.
        """
        # Use multiple tree-based models
        models = {
            'extra_trees': ExtraTreesClassifier(n_estimators=100, random_state=42),
            'gradient_boost': GradientBoostingClassifier(random_state=42)
        }
        
        importance_scores = pd.DataFrame(index=X.columns)
        
        for name, model in models.items():
            model.fit(X, y)
            importance_scores[name] = model.feature_importances_
        
        # Average importance across models
        importance_scores['mean_importance'] = importance_scores.mean(axis=1)
        
        # Select features above threshold
        selected_features = importance_scores[
            importance_scores['mean_importance'] > threshold
        ].index.tolist()
        
        self.selected_features['importance'] = selected_features
        self.feature_scores['importance'] = importance_scores['mean_importance'].to_dict()
        
        return selected_features, importance_scores
    
    def ensemble_feature_selection(self, X, y):
        """
        Combine multiple selection methods for robust feature selection.

        Runs the univariate, RFE and importance selections and keeps every
        feature chosen by at least two of them.

        Returns:
            ``(consensus feature names, {feature: vote count})``. Both follow
            the column order of ``X`` so the result is the same on every run;
            the vote dict only contains features picked by at least one method.
        """
        # Run all selection methods
        univariate_features, _ = self.univariate_selection(X, y)
        rfe_features, _ = self.recursive_feature_elimination(X, y)
        importance_features, _ = self.feature_importance_selection(X, y)
        
        method_picks = [
            set(univariate_features),
            set(rfe_features),
            set(importance_features),
        ]
        
        # Count votes walking X's columns rather than a set of names: set
        # iteration order for strings can change between interpreter runs,
        # which would reorder the selected columns downstream.
        feature_votes = {}
        for feature in X.columns:
            votes = sum(feature in picks for picks in method_picks)
            if votes:
                feature_votes[feature] = votes
        
        # Select features with 2+ votes
        consensus_features = [f for f, votes in feature_votes.items() if votes >= 2]
        
        self.selected_features['ensemble'] = consensus_features
        self.feature_scores['ensemble'] = feature_votes
        
        return consensus_features, feature_votes

# Build the selector that will run and combine the three selection methods
feature_selector = AdvancedFeatureSelector()

print("🎯 ADVANCED FEATURE SELECTION PIPELINE")
print("=" * 80)

# Encode categoricals and split the engineered frame into features / target
print("\n🔧 Preparing features for selection...")
X, y = feature_selector.prepare_features(df_engineered)
n_available = X.shape[1]
print(f"📊 Total features available: {n_available}")

# Consensus selection: a feature is kept when at least two methods pick it
print("\n🤖 Running ensemble feature selection...")
selected_features, feature_votes = feature_selector.ensemble_feature_selection(X, y)

print("\n✅ Feature selection complete!")
print(f"🎯 Selected features: {len(selected_features)}")

# Numbered list of the winners with their vote counts
print("\n🏆 FINAL SELECTED FEATURES:")
print("-" * 40)
for rank, feature in enumerate(selected_features, start=1):
    print(f"{rank:2d}. {feature:<25} (votes: {feature_votes[feature]}/3)")

# Restrict the feature matrix to the consensus columns
X_selected = X.loc[:, selected_features]
print(f"\n📊 Final dataset shape: {X_selected.shape}")

# How many features each individual method (and the ensemble) selected
print("\n📈 FEATURE SELECTION METHODS SUMMARY:")
print("-" * 50)
for method, method_features in feature_selector.selected_features.items():
    n_chosen = len(method_features)
    print(f"{method.title()}: {n_chosen} features selected")


In [None]:
class AdvancedMLPipeline:
    """
    Trains a catalogue of classifiers (plus voting and stacking ensembles
    built from them) and compares them on a held-out test set.

    Attributes:
        random_state: Seed handed to every estimator and CV splitter.
        models: Display name -> estimator, in registration order.
        results: Display name -> metrics dict for each model that trained.
        best_model: ``{'name', 'model', 'metrics'}`` of the model with the
            highest test ROC-AUC, or ``None`` before evaluation.
    """

    def __init__(self, random_state=42):
        self.random_state = random_state
        self.models = {}
        self.results = {}
        self.best_model = None

    def initialize_models(self):
        """
        (Re)build ``self.models`` with the base classifiers.

        XGBoost and LightGBM are appended only when the module-level flag
        ``advanced_models_available`` is true.
        """
        seed = self.random_state

        sklearn_models = {}
        sklearn_models['Gradient Boosting'] = GradientBoostingClassifier(
            n_estimators=200, max_depth=5, learning_rate=0.1, random_state=seed
        )
        sklearn_models['Extra Trees'] = ExtraTreesClassifier(
            n_estimators=200, max_depth=10, random_state=seed
        )
        # probability=True so the SVM exposes predict_proba for soft voting / ROC-AUC
        sklearn_models['SVM (RBF)'] = SVC(
            kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=seed
        )
        sklearn_models['Neural Network'] = MLPClassifier(
            hidden_layer_sizes=(100, 50),
            activation='relu',
            solver='adam',
            alpha=0.001,
            learning_rate='adaptive',
            max_iter=1000,
            random_state=seed
        )
        sklearn_models['Naive Bayes'] = GaussianNB()
        self.models = sklearn_models

        if not advanced_models_available:
            return

        # Optional gradient-boosting libraries: build both before registering either
        xgb_model = XGBClassifier(
            n_estimators=200, max_depth=6, learning_rate=0.1,
            random_state=seed, eval_metric='logloss'
        )
        lgbm_model = LGBMClassifier(
            n_estimators=200, max_depth=6, learning_rate=0.1,
            random_state=seed, verbose=-1
        )
        self.models['XGBoost'] = xgb_model
        self.models['LightGBM'] = lgbm_model

    def create_ensemble_models(self):
        """
        Register a soft-voting and a stacking ensemble in ``self.models``.

        Both ensembles share the same member list: the first four names of the
        candidate list (three sklearn models, then XGBoost/LightGBM when
        available). Requires ``initialize_models`` to have run first.
        """
        member_names = ['Gradient Boosting', 'Extra Trees', 'SVM (RBF)']
        if advanced_models_available:
            member_names += ['XGBoost', 'LightGBM']

        # (identifier, estimator) pairs; identifiers are lower-cased display
        # names with spaces turned into underscores.
        members = []
        for display_name in member_names[:4]:
            identifier = display_name.lower().replace(' ', '_')
            members.append((identifier, self.models[display_name]))

        voting_clf = VotingClassifier(estimators=members, voting='soft')

        meta_learner = GradientBoostingClassifier(
            n_estimators=50,
            random_state=self.random_state
        )
        stacking_clf = StackingClassifier(
            estimators=members,
            final_estimator=meta_learner,
            cv=5
        )

        self.models['Voting Ensemble'] = voting_clf
        self.models['Stacking Ensemble'] = stacking_clf

    def evaluate_model_performance(self, X_train, X_test, y_train, y_test):
        """
        Fit every registered model and score it on the test split.

        Per model: fit on the training split, predict the test split, run a
        5-fold stratified cross-validation (ROC-AUC) on the training split and
        compute accuracy, precision, recall, F1 and ROC-AUC on the test
        split. A model without ``predict_proba`` gets ROC-AUC 0. A model that
        raises is reported and skipped.

        Side effects: fills ``self.results``, sets ``self.best_model`` to the
        entry with the highest test ROC-AUC, prints progress and a summary.

        Returns:
            DataFrame with one row of formatted metric strings per model that
            trained successfully (empty when none did).
        """
        print("🔍 COMPREHENSIVE MODEL EVALUATION")
        print("=" * 80)

        summary_rows = []

        for name, model in self.models.items():
            print(f"\n🤖 Training {name}...")

            try:
                model.fit(X_train, y_train)

                y_pred = model.predict(X_test)
                if hasattr(model, 'predict_proba'):
                    y_pred_proba = model.predict_proba(X_test)[:, 1]
                else:
                    y_pred_proba = None

                # Cross-validated ROC-AUC on the training split only
                splitter = StratifiedKFold(
                    n_splits=5, shuffle=True, random_state=self.random_state
                )
                cv_scores = cross_val_score(
                    model, X_train, y_train, cv=splitter, scoring='roc_auc'
                )
                cv_mean = cv_scores.mean()
                cv_std = cv_scores.std()

                # Test-split metrics
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, zero_division=0)
                recall = recall_score(y_test, y_pred, zero_division=0)
                f1 = f1_score(y_test, y_pred, zero_division=0)
                if y_pred_proba is None:
                    roc_auc = 0
                else:
                    roc_auc = roc_auc_score(y_test, y_pred_proba)

                self.results[name] = {
                    'model': model,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1_score': f1,
                    'roc_auc': roc_auc,
                    'cv_mean': cv_mean,
                    'cv_std': cv_std,
                    'y_pred': y_pred,
                    'y_pred_proba': y_pred_proba
                }

                summary_rows.append({
                    'Model': name,
                    'Accuracy': f"{accuracy:.4f}",
                    'Precision': f"{precision:.4f}",
                    'Recall': f"{recall:.4f}",
                    'F1-Score': f"{f1:.4f}",
                    'ROC-AUC': f"{roc_auc:.4f}",
                    'CV Score': f"{cv_mean:.4f} ± {cv_std:.4f}"
                })

                print(f"✅ {name} completed - ROC-AUC: {roc_auc:.4f}")

            except Exception as e:
                # Best effort: report the failure and move on to the next model
                print(f"❌ Error training {name}: {e!s}")

        results_df = pd.DataFrame(summary_rows)

        # Winner = highest test ROC-AUC among everything stored in self.results
        if self.results:
            best_model_name = max(
                self.results,
                key=lambda model_name: self.results[model_name]['roc_auc']
            )
            best_metrics = self.results[best_model_name]
            self.best_model = {
                'name': best_model_name,
                'model': best_metrics['model'],
                'metrics': best_metrics
            }

        print("\n📊 MODEL PERFORMANCE SUMMARY:")
        print("=" * 80)
        if results_df.empty:
            print("⚠️ No models completed successfully")
        else:
            display(results_df)
            print(f"\n🏆 BEST MODEL: {self.best_model['name']}")
            print(f"🎯 Best ROC-AUC: {self.results[self.best_model['name']]['roc_auc']:.4f}")

        return results_df

# Initialize and run the ML pipeline
ml_pipeline = AdvancedMLPipeline()

print("🚀 ADVANCED ML PIPELINE INITIALIZATION")
print("=" * 80)

# Initialize models
print("\n🤖 Initializing advanced models...")
ml_pipeline.initialize_models()

# Create ensemble models
print("🔗 Creating ensemble models...")
ml_pipeline.create_ensemble_models()

print(f"✅ Total models initialized: {len(ml_pipeline.models)}")

# Data preparation with robust scaling
print("\n🔧 Preparing data with robust scaling...")

# Train-test split with stratification.
# The split happens BEFORE scaling so that the scaler's medians / IQRs are
# learned from the training rows only; fitting it on the full dataset would
# leak test-set statistics into training. The same rows end up in each split
# as before because the partition depends only on y, test_size and the seed.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_selected, y, 
    test_size=0.25, 
    random_state=42, 
    stratify=y
)

scaler = RobustScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index
)

# Full feature matrix scaled with the train-fitted scaler; kept under its
# original name in case later cells use it.
X_scaled = pd.DataFrame(
    scaler.transform(X_selected),
    columns=X_selected.columns,
    index=X_selected.index
)

print(f"📊 Training set: {X_train.shape[0]} samples")
print(f"📊 Test set: {X_test.shape[0]} samples")

# Run comprehensive evaluation
performance_results = ml_pipeline.evaluate_model_performance(
    X_train, X_test, y_train, y_test
)


In [None]:
class ClinicalModelInterpreter:
    """
    Advanced model interpretation with clinical relevance

    Holds a fitted classifier together with a held-out test set and prints
    three reports: a feature-importance ranking, a plain-language reading of
    the top features, and a performance / recommendation summary.

    Parameters
    ----------
    model : object
        Fitted estimator. `predict` is required by
        `generate_risk_assessment_summary`; `feature_importances_`,
        `predict_proba` and `decision_function` are used when present.
    X_test : array-like
        Test features passed unchanged to the model's prediction methods.
    y_test : array-like
        True labels for `X_test`.
    feature_names : sequence
        Names paired positionally with `model.feature_importances_`.
    """

    # Fallback text for features with no entry in the lookup table below.
    DEFAULT_INTERPRETATION = "Clinical significance being evaluated"

    # Clinical interpretations for common features, keyed by lowercase
    # feature name (or name fragment, e.g. for suffixed/derived columns).
    CLINICAL_INTERPRETATIONS = {
        'cp': "Chest pain type is a primary symptom indicator",
        'thalach': "Maximum heart rate reflects cardiovascular fitness",
        'oldpeak': "Exercise-induced ST depression indicates ischemia",
        'ca': "Number of coronary vessels affected by stenosis",
        'thal': "Thalassemia test results indicate blood flow patterns",
        'age': "Age is a non-modifiable major risk factor",
        'sex': "Gender influences cardiovascular disease presentation",
        'chol': "Cholesterol levels affect arterial health",
        'trestbps': "Resting blood pressure indicates vascular health",
        'exang': "Exercise-induced angina suggests coronary insufficiency",
        'cv_risk_index': "Composite cardiovascular risk score",
        'metabolic_risk_score': "Combined metabolic dysfunction indicators",
        'hr_reserve': "Heart rate reserve indicates fitness level",
        'exercise_capacity': "Overall exercise tolerance assessment"
    }

    def __init__(self, model, X_test, y_test, feature_names):
        self.model = model
        self.X_test = X_test
        self.y_test = y_test
        self.feature_names = feature_names

    def _interpret_feature(self, feature):
        """
        Return the interpretation text for one feature name.

        Every table key contained in the lowercased name is a candidate and
        the longest one wins. Taking the first hit instead would let a short
        key shadow a specific one (e.g. 'ca' is a substring of
        'exercise_capacity'). An exact match is always the longest candidate.
        """
        name = str(feature).lower()
        candidates = [key for key in self.CLINICAL_INTERPRETATIONS if key in name]
        if not candidates:
            return self.DEFAULT_INTERPRETATION
        return self.CLINICAL_INTERPRETATIONS[max(candidates, key=len)]

    def _positive_class_scores(self):
        """
        Return continuous scores for ROC-AUC, or None if the model has none.

        Uses column 1 of `predict_proba` when available, otherwise a
        one-dimensional `decision_function` output.
        """
        if hasattr(self.model, 'predict_proba'):
            return self.model.predict_proba(self.X_test)[:, 1]
        if hasattr(self.model, 'decision_function'):
            scores = np.asarray(self.model.decision_function(self.X_test))
            # Only a 1-D margin can stand in for a positive-class score.
            if scores.ndim == 1:
                return scores
        return None

    def analyze_feature_importance(self):
        """
        Analyze and visualize feature importance

        Returns
        -------
        pandas.DataFrame or None
            Columns 'Feature' and 'Importance', sorted by descending
            importance; None when the model has no `feature_importances_`.
        """
        print("\n🎯 FEATURE IMPORTANCE ANALYSIS")
        print("=" * 60)

        if not hasattr(self.model, 'feature_importances_'):
            print("ℹ️ Feature importance not available for this model type")
            return None

        # Tree-based model feature importance
        importance_df = pd.DataFrame({
            'Feature': self.feature_names,
            'Importance': self.model.feature_importances_
        }).sort_values('Importance', ascending=False)

        print("🏆 TOP 10 MOST IMPORTANT FEATURES:")
        print("-" * 40)
        for i, (_, row) in enumerate(importance_df.head(10).iterrows(), 1):
            print(f"{i:2d}. {row['Feature']:<25} {row['Importance']:.4f}")

        return importance_df

    def clinical_risk_interpretation(self, importance_df):
        """
        Provide clinical interpretation of important features

        Parameters
        ----------
        importance_df : pandas.DataFrame or None
            Ranking as returned by `analyze_feature_importance`. When None,
            only the section header is printed.
        """
        print("\n⚕️ CLINICAL INTERPRETATION OF KEY RISK FACTORS")
        print("=" * 80)

        if importance_df is None:
            return

        print("🔍 TOP CLINICAL RISK FACTORS AND THEIR SIGNIFICANCE:")
        print("-" * 60)

        for i, (_, row) in enumerate(importance_df.head(8).iterrows(), 1):
            feature = row['Feature']
            importance = row['Importance']
            interpretation = self._interpret_feature(feature)

            print(f"\n{i}. {feature} (Importance: {importance:.4f})")
            print(f"   💡 {interpretation}")

    def generate_risk_assessment_summary(self):
        """
        Generate comprehensive risk assessment summary

        Prints accuracy, precision, recall and ROC-AUC on the stored test
        set, then a recommendation tier based on ROC-AUC. When the model
        offers no continuous scores, ROC-AUC is reported as unavailable
        rather than as 0, which would read as a measured (poor) result.
        """
        print("\n📋 CARDIOVASCULAR RISK ASSESSMENT SUMMARY")
        print("=" * 80)

        # Model performance summary
        y_pred = self.model.predict(self.X_test)
        y_scores = self._positive_class_scores()

        accuracy = accuracy_score(self.y_test, y_pred)
        precision = precision_score(self.y_test, y_pred, zero_division=0)
        recall = recall_score(self.y_test, y_pred, zero_division=0)
        roc_auc = roc_auc_score(self.y_test, y_scores) if y_scores is not None else None

        print("🎯 MODEL PERFORMANCE METRICS:")
        print("-" * 40)
        print(f"Accuracy: {accuracy:.1%} - Overall correct predictions")
        print(f"Precision: {precision:.1%} - True positive rate among positive predictions")
        print(f"Recall: {recall:.1%} - Sensitivity to detect heart disease cases")
        if roc_auc is None:
            print("ROC-AUC: N/A - Model provides no probability or decision scores")
        else:
            print(f"ROC-AUC: {roc_auc:.3f} - Overall discriminative ability")

        # Clinical recommendations
        print("\n💡 CLINICAL RECOMMENDATIONS:")
        print("-" * 40)

        if roc_auc is None:
            print("⚪ UNKNOWN: Discriminative ability could not be measured")
            print("   → Model exposes neither predict_proba nor decision_function")
        elif roc_auc >= 0.90:
            print("🟢 EXCELLENT: Model shows excellent predictive performance")
            print("   → Suitable for clinical decision support")
            print("   → Can effectively identify high-risk patients")
        elif roc_auc >= 0.80:
            print("🟡 GOOD: Model shows good predictive performance")
            print("   → Useful for screening and risk stratification")
            print("   → Recommend additional clinical validation")
        else:
            print("🟠 MODERATE: Model shows moderate predictive performance")
            print("   → Requires further optimization")
            print("   → Additional features or data may be needed")

        print("\n⚠️ IMPORTANT CLINICAL CONSIDERATIONS:")
        print("-" * 40)
        print("• This model is for research and educational purposes")
        print("• Clinical decisions should always involve qualified healthcare professionals")
        print("• Model predictions should supplement, not replace, clinical judgment")
        print("• External validation on diverse populations is recommended")

# Interpret the best model from the pipeline, if one was selected
if not ml_pipeline.best_model:
    print("⚠️ No best model available for interpretation")
else:
    best_model_name = ml_pipeline.best_model['name']
    best_model_obj = ml_pipeline.best_model['model']

    print("🔍 ADVANCED MODEL INTERPRETATION & CLINICAL INSIGHTS")
    print("=" * 80)
    print(f"\n🏆 Analyzing best performing model: {best_model_name}")

    # Bind the winning estimator to the held-out split and feature names
    clinical_interpreter = ClinicalModelInterpreter(
        best_model_obj,
        X_test,
        y_test,
        X_selected.columns.tolist(),
    )

    # Importance ranking (None when the model exposes no importances)
    importance_results = clinical_interpreter.analyze_feature_importance()

    # Plain-language reading of the top-ranked features
    clinical_interpreter.clinical_risk_interpretation(importance_results)

    # Test-set metrics and recommendation tier
    clinical_interpreter.generate_risk_assessment_summary()


In [None]:
def create_comprehensive_results_dashboard():
    """
    Create comprehensive visualization dashboard for model results

    Draws a 2x3 matplotlib figure: ROC-AUC per model, confusion matrix and
    ROC curve of the best model, its top-10 feature importances, observed
    disease rate by age, and a pie chart of predicted-risk bands.

    Reads module-level state rather than taking arguments: `ml_pipeline`
    (`results`, `best_model`), `y_test`, `X_selected` and `df_engineered`
    (grouped by its 'age' and 'target' columns).

    Returns
    -------
    matplotlib.figure.Figure or None
        The figure, or None when `ml_pipeline.best_model` is not set.
    """
    if not ml_pipeline.best_model:
        print("⚠️ No model results available for visualization")
        return None

    def _mark_unavailable(ax, title, message):
        """Show a short note in a panel that has nothing to plot."""
        ax.set_title(title, fontweight='bold')
        ax.text(0.5, 0.5, message, ha='center', va='center',
                transform=ax.transAxes, fontsize=10, style='italic')
        ax.set_xticks([])
        ax.set_yticks([])

    # Set up the plotting environment
    fig, axes = plt.subplots(2, 3, figsize=(20, 12))
    fig.suptitle('Advanced Heart Disease Prediction: Comprehensive Results Dashboard',
                 fontsize=16, fontweight='bold', y=0.98)

    # 1. Model Performance Comparison
    ax1 = axes[0, 0]
    if ml_pipeline.results:
        model_names = list(ml_pipeline.results.keys())
        roc_scores = [ml_pipeline.results[name]['roc_auc'] for name in model_names]

        bars = ax1.barh(model_names, roc_scores, color='skyblue', alpha=0.8)
        ax1.set_xlabel('ROC-AUC Score')
        ax1.set_title('Model Performance Comparison\n(ROC-AUC Scores)', fontweight='bold')
        ax1.set_xlim(0, 1)

        # Add value labels on bars
        for bar, score in zip(bars, roc_scores):
            ax1.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height()/2,
                     f'{score:.3f}', va='center', fontweight='bold')

    # 2. Confusion Matrix for Best Model
    ax2 = axes[0, 1]
    best_model_results = ml_pipeline.results[ml_pipeline.best_model['name']]
    cm = confusion_matrix(y_test, best_model_results['y_pred'])

    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax2,
                xticklabels=['No Disease', 'Heart Disease'],
                yticklabels=['No Disease', 'Heart Disease'])
    ax2.set_title(f'Confusion Matrix\n{ml_pipeline.best_model["name"]}', fontweight='bold')
    ax2.set_ylabel('True Label')
    ax2.set_xlabel('Predicted Label')

    # 3. ROC Curve
    ax3 = axes[0, 2]
    if best_model_results['y_pred_proba'] is not None:
        fpr, tpr, _ = roc_curve(y_test, best_model_results['y_pred_proba'])
        roc_auc = best_model_results['roc_auc']

        ax3.plot(fpr, tpr, color='darkorange', lw=2,
                 label=f'ROC Curve (AUC = {roc_auc:.3f})')
        ax3.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', alpha=0.5)
        ax3.set_xlim([0.0, 1.0])
        ax3.set_ylim([0.0, 1.05])
        ax3.set_xlabel('False Positive Rate')
        ax3.set_ylabel('True Positive Rate')
        ax3.set_title(f'ROC Curve\n{ml_pipeline.best_model["name"]}', fontweight='bold')
        ax3.legend(loc="lower right")
        ax3.grid(alpha=0.3)
    else:
        _mark_unavailable(ax3, f'ROC Curve\n{ml_pipeline.best_model["name"]}',
                          'No predicted probabilities available')

    # 4. Feature Importance (if available)
    ax4 = axes[1, 0]
    if hasattr(ml_pipeline.best_model['model'], 'feature_importances_'):
        importances = ml_pipeline.best_model['model'].feature_importances_
        feature_names = X_selected.columns
        importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': importances
        }).sort_values('Importance', ascending=False)

        top_features = importance_df.head(10)
        bars = ax4.barh(top_features['Feature'], top_features['Importance'],
                        color='lightcoral', alpha=0.8)
        ax4.set_xlabel('Feature Importance')
        ax4.set_title('Top 10 Feature Importance\n(Clinical Risk Factors)', fontweight='bold')

        # Add value labels
        for bar, importance in zip(bars, top_features['Importance']):
            ax4.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2,
                     f'{importance:.3f}', va='center', fontsize=8)
    else:
        _mark_unavailable(ax4, 'Top 10 Feature Importance\n(Clinical Risk Factors)',
                          'Feature importance not available\nfor this model type')

    # 5. Age vs Heart Disease Risk
    ax5 = axes[1, 1]
    age_disease = df_engineered.groupby(['age', 'target']).size().unstack(fill_value=0)
    if 1 in age_disease.columns and not age_disease.empty:
        # Share of rows with target == 1 within each age value, in percent
        age_risk_pct = age_disease.div(age_disease.sum(axis=1), axis=0)[1] * 100

        ax5.scatter(age_risk_pct.index, age_risk_pct.values, alpha=0.6, color='red')
        ax5.set_xlabel('Age (years)')
        ax5.set_ylabel('Heart Disease Risk (%)')
        ax5.set_title('Age vs Heart Disease Risk\n(Clinical Pattern)', fontweight='bold')
        ax5.grid(alpha=0.3)

        # Add trend line (needs at least two distinct ages for a linear fit)
        if len(age_risk_pct) > 1:
            z = np.polyfit(age_risk_pct.index, age_risk_pct.values, 1)
            p = np.poly1d(z)
            ax5.plot(age_risk_pct.index, p(age_risk_pct.index), "r--", alpha=0.8, linewidth=2)
    else:
        _mark_unavailable(ax5, 'Age vs Heart Disease Risk\n(Clinical Pattern)',
                          'No positive cases to summarise')

    # 6. Risk Stratification Summary
    ax6 = axes[1, 2]
    risk_title = 'Risk Stratification\n(Model-Based)'

    # Create risk categories based on model predictions
    if best_model_results['y_pred_proba'] is not None:
        risk_proba = np.asarray(best_model_results['y_pred_proba'], dtype=float)
        risk_labels = ['Low Risk', 'Moderate Risk', 'High Risk']
        # Colours are keyed by label so they stay correct whichever bands occur.
        risk_colors = dict(zip(risk_labels, ['lightgreen', 'gold', 'lightcoral']))

        # include_lowest=True keeps probabilities that are exactly 0.0 in the
        # 'Low Risk' band; the default left-open first bin would turn them
        # into NaN and silently drop those patients from the chart.
        risk_categories = pd.cut(risk_proba,
                                 bins=[0, 0.3, 0.7, 1.0],
                                 labels=risk_labels,
                                 include_lowest=True)
        risk_counts = pd.Series(
            {label: int((risk_categories == label).sum()) for label in risk_labels}
        )
        # Drop empty bands: zero-width wedges only add overlapping labels,
        # and an all-zero input cannot be drawn as a pie at all.
        risk_counts = risk_counts[risk_counts > 0]

        if not risk_counts.empty:
            _, _, autotexts = ax6.pie(risk_counts.values,
                                      labels=list(risk_counts.index),
                                      autopct='%1.1f%%',
                                      colors=[risk_colors[label] for label in risk_counts.index],
                                      startangle=90,
                                      explode=[0.05] * len(risk_counts))

            ax6.set_title(risk_title, fontweight='bold')

            # Enhance text appearance
            for autotext in autotexts:
                autotext.set_color('white')
                autotext.set_fontweight('bold')
        else:
            _mark_unavailable(ax6, risk_title, 'No predictions to stratify')
    else:
        _mark_unavailable(ax6, risk_title, 'No predicted probabilities available')

    plt.tight_layout()
    plt.show()

    return fig

# Announce and render the six-panel results dashboard
print("📊 GENERATING COMPREHENSIVE RESULTS DASHBOARD", "=" * 80, sep="\n")
results_dashboard = create_comprehensive_results_dashboard()

# Additional performance metrics visualization
def create_detailed_metrics_plot():
    """
    Plot per-model test metrics beside cross-validation scores

    Left panel: grouped bars of accuracy, precision, recall and F1 for every
    entry in the module-level `ml_pipeline.results`. Right panel: horizontal
    bars of each model's `cv_mean` with `cv_std` as the error bar.

    Returns
    -------
    matplotlib.figure.Figure or None
        The figure, or None when `ml_pipeline.results` is empty.
    """
    all_results = ml_pipeline.results
    if not all_results:
        return None

    fig, (metrics_ax, cv_ax) = plt.subplots(1, 2, figsize=(15, 6))

    # Display label -> key in each model's result dict, in legend order
    metric_keys = {
        'Accuracy': 'accuracy',
        'Precision': 'precision',
        'Recall': 'recall',
        'F1-Score': 'f1_score',
    }
    metrics_df = pd.DataFrame([
        {'Model': model_name,
         **{label: scores[key] for label, key in metric_keys.items()}}
        for model_name, scores in all_results.items()
    ])

    # Four bars per model, centred on the model's tick position
    positions = np.arange(len(metrics_df))
    bar_width = 0.2
    for slot, label in enumerate(metric_keys):
        shift = (slot - 1.5) * bar_width
        metrics_ax.bar(positions + shift, metrics_df[label], bar_width,
                       label=label, alpha=0.8)

    metrics_ax.set_xlabel('Models')
    metrics_ax.set_ylabel('Score')
    metrics_ax.set_title('Detailed Performance Metrics Comparison')
    metrics_ax.set_xticks(positions)
    metrics_ax.set_xticklabels(metrics_df['Model'], rotation=45, ha='right')
    metrics_ax.legend()
    metrics_ax.grid(alpha=0.3)

    # Cross-validation mean with standard deviation as the error bar
    model_names = list(all_results.keys())
    cv_means = [all_results[model_name]['cv_mean'] for model_name in model_names]
    cv_stds = [all_results[model_name]['cv_std'] for model_name in model_names]

    cv_ax.barh(model_names, cv_means, xerr=cv_stds, alpha=0.8, capsize=5)
    cv_ax.set_xlabel('Cross-Validation ROC-AUC Score')
    cv_ax.set_title('Cross-Validation Performance (Mean ± Std)')
    cv_ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

    return fig

# Announce and render the per-model metrics comparison
print("\n📈 GENERATING DETAILED METRICS COMPARISON", "-" * 50, sep="\n")
detailed_metrics_plot = create_detailed_metrics_plot()
