<a href="https://colab.research.google.com/github/Greashajain/Smart-Health-Surveillance/blob/main/ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install all required packages
!pip install xgboost lightgbm catboost scikit-learn tensorflow prophet plotly seaborn
!pip install fastapi uvicorn pyngrok nest-asyncio shap

print("All packages installed successfully!")


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
Collecting pyngrok
  Downloading pyngrok-7.5.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.5.0-py3-none-any.whl (24 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.5.0
All packages installed successfully!


In [2]:
# Import all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

# Visualization
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

print("Libraries imported successfully!")
print(f" TensorFlow version: {tf.__version__}")
print(f" XGBoost version: {xgb.__version__}")


Libraries imported successfully!
 TensorFlow version: 2.19.0
 XGBoost version: 3.1.2


In [3]:
# Load the dataset
df = pd.read_csv('water_pollution_disease.csv')


In [4]:
print(" DATASET OVERVIEW:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\n First few rows:")
print(df.head())
print("\n Basic statistics:")
print(df.describe())

# Check for missing values
print(f"\n Missing values:")
print(df.isnull().sum().sort_values(ascending=False))

 DATASET OVERVIEW:
Shape: (3000, 24)
Columns: ['Country', 'Region', 'Year', 'Water Source Type', 'Contaminant Level (ppm)', 'pH Level', 'Turbidity (NTU)', 'Dissolved Oxygen (mg/L)', 'Nitrate Level (mg/L)', 'Lead Concentration (µg/L)', 'Bacteria Count (CFU/mL)', 'Water Treatment Method', 'Access to Clean Water (% of Population)', 'Diarrheal Cases per 100,000 people', 'Cholera Cases per 100,000 people', 'Typhoid Cases per 100,000 people', 'Infant Mortality Rate (per 1,000 live births)', 'GDP per Capita (USD)', 'Healthcare Access Index (0-100)', 'Urbanization Rate (%)', 'Sanitation Coverage (% of Population)', 'Rainfall (mm per year)', 'Temperature (°C)', 'Population Density (people per km²)']

 First few rows:
     Country   Region  Year Water Source Type  Contaminant Level (ppm)  \
0     Mexico    North  2015              Lake                     6.06   
1     Brazil     West  2017              Well                     5.24   
2  Indonesia  Central  2022              Pond               

In [6]:
class AdvancedFeatureEngineer:
    def __init__(self):
        self.scaler = StandardScaler()

    def create_features(self, df):
        """Create advanced features for outbreak prediction"""

        df = df.copy()

        # Disease columns
        disease_cols = [
            'Diarrheal Cases per 100,000 people',
            'Cholera Cases per 100,000 people',
            'Typhoid Cases per 100,000 people'
        ]

        # 1. TEMPORAL FEATURES
        print(" Creating temporal features...")

        # Sort by location and year for time series features
        df = df.sort_values(['Country', 'Region', 'Year']).reset_index(drop=True)

        for disease in disease_cols:
            # Moving averages (trend detection)
            df[f'{disease}_MA3'] = df.groupby(['Country', 'Region'])[disease].rolling(3, min_periods=1).mean().values
            df[f'{disease}_MA5'] = df.groupby(['Country', 'Region'])[disease].rolling(5, min_periods=1).mean().values

            # Rate of change (momentum detection)
            df[f'{disease}_Change'] = df.groupby(['Country', 'Region'])[disease].pct_change().fillna(0)
            df[f'{disease}_Acceleration'] = df.groupby(['Country', 'Region'])[f'{disease}_Change'].diff().fillna(0)

            # Lag features (historical context)
            df[f'{disease}_Lag1'] = df.groupby(['Country', 'Region'])[disease].shift(1).fillna(df[disease].mean())
            df[f'{disease}_Lag2'] = df.groupby(['Country', 'Region'])[disease].shift(2).fillna(df[disease].mean())

        # 2. ENVIRONMENTAL RISK FEATURES
        print(" Creating environmental risk features...")

        # Temperature risk (extreme temperatures favor disease spread)
        df['Temperature_Risk'] = ((df['Temperature (°C)'] > 35) | (df['Temperature (°C)'] < 5)).astype(int)
        df['Temperature_Normalized'] = (df['Temperature (°C)'] - df['Temperature (°C)'].mean()) / df['Temperature (°C)'].std()

        # Rainfall risk (both drought and flooding increase disease risk)
        df['Rainfall_Risk'] = ((df['Rainfall (mm per year)'] < 300) | (df['Rainfall (mm per year)'] > 2500)).astype(int)
        df['Rainfall_Normalized'] = (df['Rainfall (mm per year)'] - df['Rainfall (mm per year)'].mean()) / df['Rainfall (mm per year)'].std()

        # 3. WATER QUALITY COMPOSITE SCORE
        print(" Creating water quality features...")

        water_quality_features = [
            'Contaminant Level (ppm)', 'pH Level', 'Turbidity (NTU)',
            'Dissolved Oxygen (mg/L)', 'Nitrate Level (mg/L)',
            'Lead Concentration (µg/L)', 'Bacteria Count (CFU/mL)'
        ]

        # Normalize water quality features
        for feature in water_quality_features:
            df[f'{feature}_Normalized'] = (df[feature] - df[feature].mean()) / df[feature].std()

        # Create composite water quality risk score
        df['Water_Quality_Risk'] = (
            (df['pH Level'] < 6.5).astype(int) +
            (df['pH Level'] > 8.5).astype(int) +
            (df['Turbidity (NTU)'] > 5).astype(int) +
            (df['Dissolved Oxygen (mg/L)'] < 4).astype(int) +
            (df['Bacteria Count (CFU/mL)'] > 100).astype(int)
        )

        # 4. INFRASTRUCTURE VULNERABILITY
        print(" Creating infrastructure features...")

        df['Infrastructure_Risk'] = (
            (100 - df['Access to Clean Water (% of Population)']) +
            (100 - df['Healthcare Access Index (0-100)']) +
            (100 - df['Sanitation Coverage (% of Population)'])
        ) / 3

        df['Infrastructure_Critical'] = (df['Infrastructure_Risk'] > 60).astype(int)

        # 5. POPULATION DENSITY CATEGORIES
        df['Pop_Density_High'] = (df['Population Density (people per km²)'] > 1000).astype(int)
        df['Pop_Density_Low'] = (df['Population Density (people per km²)'] < 100).astype(int)

        # 6. ECONOMIC VULNERABILITY
        df['Economic_Risk'] = (df['GDP per Capita (USD)'] < 2000).astype(int)
        df['GDP_Normalized'] = (df['GDP per Capita (USD)'] - df['GDP per Capita (USD)'].mean()) / df['GDP per Capita (USD)'].std()

        # 7. DISEASE INTERACTION FEATURES
        print(" Creating disease interaction features...")

        df['Total_Disease_Load'] = sum([df[disease] for disease in disease_cols])
        df['Disease_Diversity'] = (sum([(df[disease] > 0).astype(int) for disease in disease_cols]))
        df['Max_Disease'] = np.max([df[disease] for disease in disease_cols], axis=0)

        # Disease ratios
        df['Diarrheal_Cholera_Ratio'] = df[disease_cols[0]] / (df[disease_cols[1]] + 1)
        df['Cholera_Typhoid_Ratio'] = df[disease_cols[1]] / (df[disease_cols[2]] + 1)

        print(f"Feature engineering complete! Created {len(df.columns)} features")
        return df

# Create feature engineer and apply
feature_engineer = AdvancedFeatureEngineer()
df_engineered = feature_engineer.create_features(df)

print(f" Original features: {df.shape[1]}")
print(f" Engineered features: {df_engineered.shape[1]}")
print(f"New features added: {df_engineered.shape[1] - df.shape[1]}")


 Creating temporal features...
 Creating environmental risk features...
 Creating water quality features...
 Creating infrastructure features...
 Creating disease interaction features...
Feature engineering complete! Created 65 features
 Original features: 24
 Engineered features: 65
New features added: 41


In [7]:
class OutbreakLabelCreator:
    def __init__(self):
        pass

    def create_outbreak_labels(self, df):
        """Create sophisticated outbreak labels using multiple criteria"""

        df = df.copy()

        disease_cols = [
            'Diarrheal Cases per 100,000 people',
            'Cholera Cases per 100,000 people',
            'Typhoid Cases per 100,000 people'
        ]

        print("Creating outbreak labels...")

        # Multiple outbreak detection criteria
        outbreak_signals = []

        # Criterion 1: Absolute threshold (75th percentile)
        for disease in disease_cols:
            threshold = df[disease].quantile(0.75)
            outbreak_signals.append(df[disease] > threshold)

        # Criterion 2: Rapid increase (50% increase from moving average)
        for disease in disease_cols:
            ma_col = f'{disease}_MA3'
            if ma_col in df.columns:
                rapid_increase = (df[disease] / (df[ma_col] + 1)) > 1.5
                outbreak_signals.append(rapid_increase)

        # Criterion 3: Multiple diseases elevated simultaneously
        elevated_count = sum([
            (df[disease] > df[disease].quantile(0.70)).astype(int)
            for disease in disease_cols
        ])
        outbreak_signals.append(elevated_count >= 2)

        # Criterion 4: High environmental risk + elevated disease
        environmental_risk = (
            (df['Water_Quality_Risk'] >= 3) |
            (df['Infrastructure_Risk'] > 50) |
            (df['Temperature_Risk'] == 1) |
            (df['Rainfall_Risk'] == 1)
        )

        any_disease_elevated = np.any([
            df[disease] > df[disease].quantile(0.60)
            for disease in disease_cols
        ], axis=0)

        outbreak_signals.append(environmental_risk & any_disease_elevated)

        # Final outbreak label: ANY criterion triggers outbreak
        df['outbreak_label'] = np.any(outbreak_signals, axis=0).astype(int)

        # Create severity levels
        signal_count = sum([signal.astype(int) for signal in outbreak_signals])

        conditions = [
            signal_count == 0,
            signal_count == 1,
            signal_count == 2,
            signal_count >= 3
        ]

        choices = [0, 1, 2, 3]  # Normal, Watch, Alert, Critical

        df['outbreak_severity'] = np.select(conditions, choices, default=0)

        # Print label distribution
        print(" OUTBREAK LABEL DISTRIBUTION:")
        print(df['outbreak_label'].value_counts())
        print(f"\n OUTBREAK SEVERITY DISTRIBUTION:")
        print(df['outbreak_severity'].value_counts())

        outbreak_rate = df['outbreak_label'].mean()
        print(f"\n Outbreak rate: {outbreak_rate:.1%}")

        return df

# Create labels
label_creator = OutbreakLabelCreator()
df_labeled = label_creator.create_outbreak_labels(df_engineered)

print(" Outbreak labels created successfully!")


Creating outbreak labels...
 OUTBREAK LABEL DISTRIBUTION:
outbreak_label
1    2275
0     725
Name: count, dtype: int64

 OUTBREAK SEVERITY DISTRIBUTION:
outbreak_severity
3    916
0    725
1    683
2    676
Name: count, dtype: int64

 Outbreak rate: 75.8%
 Outbreak labels created successfully!


In [9]:
# Temporal train-test split (crucial for time series data)
def create_temporal_split(df, test_year_start=2019):
    """Create temporal train-test split"""

    train_mask = df['Year'] < test_year_start
    test_mask = df['Year'] >= test_year_start

    train_data = df[train_mask].copy()
    test_data = df[test_mask].copy()

    print(f" TEMPORAL SPLIT SUMMARY:")
    print(f"Training data: {len(train_data)} samples ({train_data['Year'].min()}-{train_data['Year'].max()})")
    print(f"Test data: {len(test_data)} samples ({test_data['Year'].min()}-{test_data['Year'].max()})")

    return train_data, test_data

# Create split
train_df, test_df = create_temporal_split(df_labeled, test_year_start=2019)

# Feature selection (exclude non-predictive columns)
exclude_cols = ['Country', 'Region', 'Year', 'outbreak_label', 'outbreak_severity']

# Get all numeric feature columns
feature_cols = [col for col in train_df.columns
                if col not in exclude_cols
                and train_df[col].dtype in ['int64', 'float64']]

# Handle missing values
train_features = train_df[feature_cols].fillna(train_df[feature_cols].median())
test_features = test_df[feature_cols].fillna(train_df[feature_cols].median())

# Target variables
train_labels = train_df['outbreak_label'].values
test_labels = test_df['outbreak_label'].values

print(f" Selected {len(feature_cols)} features for training")
print(f" Training samples: {len(train_features)}")
print(f" Test samples: {len(test_features)}")
print(f" Outbreak rate in training: {train_labels.mean():.1%}")
print(f" Outbreak rate in test: {test_labels.mean():.1%}")


 TEMPORAL SPLIT SUMMARY:
Training data: 2278 samples (2000-2018)
Test data: 722 samples (2019-2024)
 Selected 60 features for training
 Training samples: 2278
 Test samples: 722
 Outbreak rate in training: 75.9%
 Outbreak rate in test: 75.8%


In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

def clean_data_for_ml(df):
    """Comprehensive data cleaning for machine learning"""

    print(" Cleaning data for ML training...")

    df_clean = df.copy()

    # 1. Replace infinite values with NaN
    df_clean = df_clean.replace([np.inf, -np.inf], np.nan)

    # 2. Handle extremely large values (cap at 99.9th percentile)
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns

    for col in numeric_cols:
        if df_clean[col].dtype in ['int64', 'float64']:
            # Cap extremely large values
            upper_limit = df_clean[col].quantile(0.999)
            lower_limit = df_clean[col].quantile(0.001)

            df_clean[col] = np.clip(df_clean[col], lower_limit, upper_limit)

    # 3. Fill remaining NaN values
    for col in numeric_cols:
        if df_clean[col].isnull().sum() > 0:
            # Use median for skewed data, mean for normal data
            if abs(df_clean[col].skew()) > 1:  # Skewed data
                fill_value = df_clean[col].median()
            else:  # Normal data
                fill_value = df_clean[col].mean()

            df_clean[col] = df_clean[col].fillna(fill_value)

    # 4. Check for any remaining issues
    inf_count = np.isinf(df_clean.select_dtypes(include=[np.number])).sum().sum()
    nan_count = df_clean.select_dtypes(include=[np.number]).isnull().sum().sum()

    print(f" Cleaning complete!")
    print(f"   Infinite values: {inf_count}")
    print(f"   Missing values: {nan_count}")

    return df_clean

# Temporal train-test split (crucial for time series data)
def create_temporal_split(df, test_year_start=2019):
    """Create temporal train-test split with cleaned data"""

    # Clean data first
    df_clean = clean_data_for_ml(df)

    train_mask = df_clean['Year'] < test_year_start
    test_mask = df_clean['Year'] >= test_year_start

    train_data = df_clean[train_mask].copy()
    test_data = df_clean[test_mask].copy()

    print(f" TEMPORAL SPLIT SUMMARY:")
    print(f"Training data: {len(train_data)} samples ({train_data['Year'].min()}-{train_data['Year'].max()})")
    print(f"Test data: {len(test_data)} samples ({test_data['Year'].min()}-{test_data['Year'].max()})")

    return train_data, test_data

# Create split with cleaned data
train_df, test_df = create_temporal_split(df_labeled, test_year_start=2019)

# Feature selection (exclude non-predictive columns)
exclude_cols = ['Country', 'Region', 'Year', 'outbreak_label', 'outbreak_severity']

# Get all numeric feature columns
feature_cols = [col for col in train_df.columns
                if col not in exclude_cols
                and train_df[col].dtype in ['int64', 'float64']]

# Extract clean features and labels
train_features = train_df[feature_cols]
test_features = test_df[feature_cols]
train_labels = train_df['outbreak_label'].values
test_labels = test_df['outbreak_label'].values

# Final verification - ensure no infinite or NaN values
print(f" FEATURE VERIFICATION:")
print(f"Training features shape: {train_features.shape}")
print(f"Test features shape: {test_features.shape}")
print(f"Infinite values in training: {np.isinf(train_features).sum().sum()}")
print(f"Missing values in training: {train_features.isnull().sum().sum()}")
print(f"Outbreak rate in training: {train_labels.mean():.1%}")
print(f"Outbreak rate in test: {test_labels.mean():.1%}")

if np.isinf(train_features).sum().sum() == 0 and train_features.isnull().sum().sum() == 0:
    print("Data is clean and ready for ML training!")
else:
    print(" Data still has issues - check cleaning process")


 Cleaning data for ML training...
 Cleaning complete!
   Infinite values: 0
   Missing values: 0
 TEMPORAL SPLIT SUMMARY:
Training data: 2278 samples (2000-2018)
Test data: 722 samples (2019-2024)
 FEATURE VERIFICATION:
Training features shape: (2278, 60)
Test features shape: (722, 60)
Infinite values in training: 0
Missing values in training: 0
Outbreak rate in training: 75.9%
Outbreak rate in test: 75.8%
Data is clean and ready for ML training!


In [11]:
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

class EnsembleOutbreakPredictor:
    def __init__(self):
        self.models = {}
        self.predictions = {}
        self.probabilities = {}

    def train_models(self, X_train, y_train, X_val, y_val):
        """Train multiple complementary models - FIXED & OPTIMIZED VERSION"""

        print(" Training ensemble of ML models...")

        # Verify data is clean before training
        if np.isinf(X_train).sum().sum() > 0 or X_train.isnull().sum().sum() > 0:
            raise ValueError("Training data contains infinite or missing values!")

        # Calculate class weights for imbalanced data
        pos_weight = len(y_train[y_train==0])/len(y_train[y_train==1])

        # 1. XGBoost (FIXED - simplified parameters)
        print("Training XGBoost...")
        try:
            xgb_model = xgb.XGBClassifier(
                n_estimators=100,  # Reduced for faster training
                max_depth=4,       # Reduced to prevent overfitting
                learning_rate=0.1,
                subsample=0.8,
                colsample_bytree=0.8,
                scale_pos_weight=pos_weight,
                random_state=42,
                eval_metric='logloss',
                verbosity=0  # Suppress warnings
            )
            xgb_model.fit(X_train, y_train)
            val_acc = xgb_model.score(X_val, y_val)
            print(f"   XGBoost validation accuracy: {val_acc:.1%}")
            self.models['XGBoost'] = xgb_model
        except Exception as e:
            print(f"   XGBoost failed: {e}")

        # 2. LightGBM (FIXED)
        print("Training LightGBM...")
        try:
            lgb_model = lgb.LGBMClassifier(
                n_estimators=100,
                max_depth=4,
                learning_rate=0.1,
                subsample=0.8,
                colsample_bytree=0.8,
                scale_pos_weight=pos_weight,
                random_state=42,
                verbose=-1,
                force_col_wise=True  # Avoid warnings
            )
            lgb_model.fit(X_train, y_train)
            val_acc = lgb_model.score(X_val, y_val)
            print(f"   LightGBM validation accuracy: {val_acc:.1%}")
            self.models['LightGBM'] = lgb_model
        except Exception as e:
            print(f"   LightGBM failed: {e}")

        # 3. Random Forest (Robust baseline)
        print("Training Random Forest...")
        try:
            rf_model = RandomForestClassifier(
                n_estimators=100,
                max_depth=8,
                min_samples_split=5,
                min_samples_leaf=2,
                class_weight='balanced',
                random_state=42,
                n_jobs=-1
            )
            rf_model.fit(X_train, y_train)
            val_acc = rf_model.score(X_val, y_val)
            print(f"   Random Forest validation accuracy: {val_acc:.1%}")
            self.models['RandomForest'] = rf_model
        except Exception as e:
            print(f"   Random Forest failed: {e}")

        # 4. CatBoost (Usually most robust)
        print("Training CatBoost...")
        try:
            cat_model = CatBoostClassifier(
                iterations=100,
                depth=4,
                learning_rate=0.1,
                class_weights=[1, pos_weight],
                verbose=False,
                random_state=42
            )
            cat_model.fit(X_train, y_train)
            val_acc = cat_model.score(X_val, y_val)
            print(f"   CatBoost validation accuracy: {val_acc:.1%}")
            self.models['CatBoost'] = cat_model
        except Exception as e:
            print(f"   CatBoost failed: {e}")

        print(f" Successfully trained {len(self.models)} models!")

    def predict_ensemble(self, X_test, threshold=0.3):
        """Make ensemble predictions with optimized threshold"""

        print(f" Making predictions with threshold: {threshold}")

        if len(self.models) == 0:
            raise ValueError("No models trained successfully!")

        # Get predictions from each model
        for name, model in self.models.items():
            try:
                pred_proba = model.predict_proba(X_test)[:, 1]
                pred_binary = (pred_proba > threshold).astype(int)

                self.predictions[name] = pred_binary
                self.probabilities[name] = pred_proba
            except Exception as e:
                print(f"Prediction failed for {name}: {e}")

        if len(self.probabilities) == 0:
            raise ValueError("No successful predictions!")

        # Ensemble strategies
        prob_values = list(self.probabilities.values())

        # Simple average ensemble
        ensemble_probability = np.mean(prob_values, axis=0)

        # Final prediction
        final_predictions = (ensemble_probability > threshold).astype(int)

        ensemble_methods = {
            'Average_Probability': ensemble_probability,
            'Final_Predictions': final_predictions
        }

        return final_predictions, ensemble_probability, ensemble_methods

# Initialize and train models
predictor = EnsembleOutbreakPredictor()

# Create validation split from training data
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    train_features, train_labels, test_size=0.2, random_state=42, stratify=train_labels
)

# Train models with cleaned data
predictor.train_models(X_train_split, y_train_split, X_val_split, y_val_split)

print(" Ensemble model training completed!")


 Training ensemble of ML models...
Training XGBoost...
   XGBoost validation accuracy: 91.2%
Training LightGBM...
   LightGBM failed: Do not support special JSON characters in feature name.
Training Random Forest...
   Random Forest validation accuracy: 86.8%
Training CatBoost...
   CatBoost validation accuracy: 90.4%
 Successfully trained 3 models!
 Ensemble model training completed!


In [12]:
def evaluate_outbreak_predictions(y_true, y_pred, y_proba, model_name="Ensemble"):
    """Comprehensive evaluation of outbreak predictions"""

    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

    print(f"{model_name.upper()} MODEL EVALUATION:")
    print("=" * 50)

    # Basic metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    try:
        auc = roc_auc_score(y_true, y_proba)
    except:
        auc = 0.5

    print(f" Accuracy: {accuracy:.1%}")
    print(f"Precision: {precision:.1%}")
    print(f" Recall: {recall:.1%}")
    print(f" F1-Score: {f1:.3f}")
    print(f" AUC-ROC: {auc:.3f}")

    # Confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    print(f"\nDETAILED BREAKDOWN:")
    print(f"True Positives (Correctly detected outbreaks): {tp}")
    print(f"False Positives (False alarms): {fp}")
    print(f"False Negatives (Missed outbreaks): {fn}")
    print(f"True Negatives (Correctly identified normal): {tn}")

    # Key rates
    false_negative_rate = fn / (tp + fn) if (tp + fn) > 0 else 0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
    alert_rate = (tp + fp) / len(y_true)

    print(f"\n OPERATIONAL METRICS:")
    print(f"False Negative Rate: {false_negative_rate:.1%} ({fn} missed outbreaks)")
    print(f"False Positive Rate: {false_positive_rate:.1%}")
    print(f"Alert Rate: {alert_rate:.1%} ({tp + fp} total alerts)")

    # Compare with your previous Z-score results
    print(f"\n IMPROVEMENT OVER Z-SCORE SYSTEM:")
    print(f"Previous false negatives: 541")
    print(f"New false negatives: {fn}")
    print(f"Improvement: {541 - fn} fewer missed outbreaks!")
    print(f"Reduction in missed outbreaks: {((541 - fn) / 541) * 100:.1f}%")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc,
        'false_negatives': fn,
        'false_positives': fp,
        'alert_rate': alert_rate
    }

# Make predictions on test set
print(" Making predictions on test set...")
final_predictions, final_probabilities, ensemble_methods = predictor.predict_ensemble(
    test_features, threshold=0.3  # Lower threshold to catch more outbreaks
)

# Evaluate performance
results = evaluate_outbreak_predictions(
    test_labels, final_predictions, final_probabilities, "Ensemble ML System"
)

# Show top high-risk predictions
high_risk_mask = final_probabilities > 0.7
if high_risk_mask.sum() > 0:
    print(f"\n TOP HIGH-RISK LOCATIONS (probability > 70%):")
    high_risk_data = test_df[high_risk_mask].head(10)
    for _, row in high_risk_data.iterrows():
        prob = final_probabilities[high_risk_mask][_]
        print(f"   {row['Country']} - {row['Region']} ({row['Year']}): {prob:.1%} risk")

print(f"\n FINAL PERFORMANCE SUMMARY:")
print(f" Recall (Outbreak Detection): {results['recall']:.1%}")
print(f" Precision (Alert Accuracy): {results['precision']:.1%}")
print(f" False Negatives Reduced: {541 - results['false_negatives']} outbreaks")
print(f" Ready for SIH 2025 presentation!")


 Making predictions on test set...
 Making predictions with threshold: 0.3
ENSEMBLE ML SYSTEM MODEL EVALUATION:
 Accuracy: 93.6%
Precision: 93.9%
 Recall: 98.0%
 F1-Score: 0.959
 AUC-ROC: 0.981

DETAILED BREAKDOWN:
True Positives (Correctly detected outbreaks): 536
False Positives (False alarms): 35
False Negatives (Missed outbreaks): 11
True Negatives (Correctly identified normal): 140

 OPERATIONAL METRICS:
False Negative Rate: 2.0% (11 missed outbreaks)
False Positive Rate: 20.0%
Alert Rate: 79.1% (571 total alerts)

 IMPROVEMENT OVER Z-SCORE SYSTEM:
Previous false negatives: 541
New false negatives: 11
Improvement: 530 fewer missed outbreaks!
Reduction in missed outbreaks: 98.0%

 TOP HIGH-RISK LOCATIONS (probability > 70%):
   Bangladesh - Central (2020): 98.2% risk
   Bangladesh - Central (2021): 96.7% risk
   Bangladesh - Central (2022): 95.5% risk
   Bangladesh - Central (2022): 90.2% risk
   Bangladesh - Central (2023): 99.7% risk
   Bangladesh - Central (2023): 97.4% risk
   

In [13]:
# Create visualization with prediction probabilities - CORRECTED VERSION
import plotly.express as px
import plotly.graph_objects as go

# 1. Add probabilities to your test dataframe
results_df = test_df.copy()
results_df['outbreak_probability'] = final_probabilities
results_df['predicted_outbreak'] = final_predictions
results_df['prediction_correct'] = (final_predictions == test_labels).astype(int)

# 2. Create visualizations
fig = go.Figure()

# Performance metrics bar chart
fig.add_bar(
    x=['Accuracy', 'Precision', 'Recall', 'F1-Score'],
    y=[results['accuracy'], results['precision'], results['recall'], results['f1']],
    name="Model Performance"
)
fig.update_layout(title=" Smart Health Surveillance - Model Performance")
fig.show()

# 3. Geographic distribution of alerts (FIXED)
alert_locations = results_df[results_df['predicted_outbreak'] == 1]

if len(alert_locations) > 0:
    fig2 = px.scatter(
        alert_locations,
        x='Country',
        y='outbreak_probability',
        color='outbreak_probability',
        size='outbreak_probability',
        title=" High-Risk Outbreak Locations Detected",
        labels={'outbreak_probability': 'Outbreak Probability'}
    )
    fig2.show()

    print(f" Found {len(alert_locations)} high-risk locations!")
    print("\n Top 5 High-Risk Locations:")
    top_alerts = alert_locations.nlargest(5, 'outbreak_probability')
    for _, row in top_alerts.iterrows():
        print(f"   {row['Country']} - {row['Region']} ({row['Year']}): {row['outbreak_probability']:.1%} risk")
else:
    print(" No high-risk locations detected with current threshold")

# 4. Performance comparison chart
comparison_data = {
    'Method': ['Previous Z-Score', 'New Ensemble ML'],
    'Recall': [0.263, results['recall']],  # Your old 26.3% vs new performance
    'Precision': [1.0, results['precision']],
    'False Negatives': [541, results['false_negatives']]
}



print(" All visualizations created successfully!")


 Found 571 high-risk locations!

 Top 5 High-Risk Locations:
   Brazil - West (2023): 99.9% risk
   China - East (2022): 99.9% risk
   Mexico - East (2023): 99.9% risk
   India - South (2022): 99.9% risk
   Ethiopia - South (2020): 99.8% risk
 All visualizations created successfully!
