<a href="https://colab.research.google.com/github/IbrahimJenberu/Social-Media-Sentimental-Analysis-ML-/blob/main/%F0%9F%A9%BA_Symptom_Based_Disease_Prediction_Using_Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install xgboost joblib



In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import random
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

def generate_synthetic_data(n_samples=2500):
    """
    Generate synthetic Ethiopian patient data for disease prediction

    Args:
        n_samples: Number of patient records to generate

    Returns:
        DataFrame containing synthetic patient data
    """
    print(f"Generating {n_samples} synthetic patient records...")

    # Ethiopian regions
    regions = ['Addis Ababa', 'Oromia', 'Amhara', 'Tigray', 'SNNPR',
               'Somali', 'Afar', 'Benishangul-Gumuz', 'Gambela', 'Harari', 'Dire Dawa']

    # Common symptoms in Ethiopia
    symptoms = ['fever', 'cough', 'headache', 'fatigue', 'nausea', 'vomiting',
                'diarrhea', 'abdominal_pain', 'chest_pain', 'difficulty_breathing',
                'joint_pain', 'rash', 'sore_throat', 'chills', 'loss_of_appetite',
                'jaundice', 'swelling', 'blood_in_stool', 'night_sweats', 'weight_loss']

    # Pre-existing conditions common in Ethiopia
    conditions = ['hypertension', 'diabetes', 'HIV', 'tuberculosis', 'malaria',
                  'hepatitis', 'anemia', 'asthma', 'heart_disease', 'malnutrition']

    # Common diseases in Ethiopia (target variable)
    diseases = ['Malaria', 'Tuberculosis', 'Typhoid Fever', 'Pneumonia',
                'Diarrheal Disease', 'HIV/AIDS', 'Acute Respiratory Infection',
                'Intestinal Parasites', 'Hepatitis', 'Meningitis']

    # Disease severity mapping (used for creating correlation between symptoms and disease)
    disease_severity = {
        'Malaria': 65,
        'Tuberculosis': 70,
        'Typhoid Fever': 60,
        'Pneumonia': 68,
        'Diarrheal Disease': 55,
        'HIV/AIDS': 75,
        'Acute Respiratory Infection': 58,
        'Intestinal Parasites': 50,
        'Hepatitis': 63,
        'Meningitis': 72
    }

    # Symptom-disease correlation matrix (simplified)
    symptom_disease_corr = {
        'Malaria': ['fever', 'chills', 'headache', 'fatigue', 'nausea'],
        'Tuberculosis': ['cough', 'chest_pain', 'difficulty_breathing', 'night_sweats', 'weight_loss'],
        'Typhoid Fever': ['fever', 'headache', 'fatigue', 'abdominal_pain', 'loss_of_appetite'],
        'Pneumonia': ['cough', 'fever', 'difficulty_breathing', 'chest_pain', 'chills'],
        'Diarrheal Disease': ['diarrhea', 'vomiting', 'abdominal_pain', 'fever', 'dehydration'],
        'HIV/AIDS': ['weight_loss', 'fever', 'night_sweats', 'fatigue', 'rash'],
        'Acute Respiratory Infection': ['cough', 'sore_throat', 'fever', 'difficulty_breathing', 'fatigue'],
        'Intestinal Parasites': ['abdominal_pain', 'diarrhea', 'weight_loss', 'fatigue', 'loss_of_appetite'],
        'Hepatitis': ['jaundice', 'fatigue', 'nausea', 'abdominal_pain', 'loss_of_appetite'],
        'Meningitis': ['headache', 'fever', 'neck_stiffness', 'vomiting', 'sensitivity_to_light']
    }

    # Initialize empty dataframe
    df = pd.DataFrame()

    # Generate data in batches for memory efficiency
    batch_size = min(10000, n_samples)
    remaining = n_samples

    while remaining > 0:
        current_batch = min(batch_size, remaining)

        # Initialize data dictionary for this batch
        data = {
            'patient_id': [f'ETH{i:06d}' for i in range(n_samples - remaining + 1, n_samples - remaining + current_batch + 1)],
            'age': np.random.randint(0, 101, size=current_batch),
            'gender': np.random.choice(['male', 'female'], size=current_batch),
            'region': np.random.choice(regions, size=current_batch),
            'symptom_duration_days': np.random.randint(1, 31, size=current_batch),
            'severity_level': np.random.choice(['mild', 'moderate', 'severe'], size=current_batch),
        }

        # Generate symptom data (binary) - vectorized for speed
        for symptom in symptoms:
            data[f'symptom_{symptom}'] = np.random.choice([0, 1], size=current_batch, p=[0.7, 0.3])

        # Generate pre-existing conditions
        num_conditions = np.random.choice([0, 1, 2, 3], size=current_batch, p=[0.5, 0.3, 0.15, 0.05])

        # Vectorized approach to generate conditions
        data['pre_existing_conditions'] = [''] * current_batch
        for i in range(current_batch):
            if num_conditions[i] > 0:
                data['pre_existing_conditions'][i] = ','.join(random.sample(conditions, num_conditions[i]))

        # Create batch dataframe
        batch_df = pd.DataFrame(data)

        # Generate disease based on symptoms (with some correlation)
        batch_df['diagnosed_disease'] = ''
        batch_df['disease_severity_score'] = 0

        # More efficient approach to disease assignment
        symptom_scores = np.zeros((current_batch, len(diseases)))

        # Calculate symptom scores for each disease
        for d_idx, disease in enumerate(diseases):
            for symptom in symptom_disease_corr.get(disease, []):
                symptom_col = f'symptom_{symptom}'
                if symptom_col in batch_df.columns:
                    symptom_scores[:, d_idx] += batch_df[symptom_col].values

        # Add severity factor
        severity_factor = np.zeros(current_batch)
        severity_factor[batch_df['severity_level'] == 'moderate'] = 0.5
        severity_factor[batch_df['severity_level'] == 'severe'] = 1.0

        for d_idx in range(len(diseases)):
            symptom_scores[:, d_idx] += severity_factor

        # Add duration factor
        duration_factor = (batch_df['symptom_duration_days'] > 14).astype(float) * 0.5
        for d_idx in range(len(diseases)):
            symptom_scores[:, d_idx] += duration_factor

        # Add randomness
        symptom_scores += np.random.uniform(0, 0.2, size=symptom_scores.shape)

        # Get index of disease with highest score
        disease_indices = np.argmax(symptom_scores, axis=1)

        # Assign diseases
        batch_df['diagnosed_disease'] = [diseases[i] for i in disease_indices]

        # Calculate severity scores
        base_scores = np.array([disease_severity[d] for d in batch_df['diagnosed_disease']])
        symptom_count = batch_df[[c for c in batch_df.columns if c.startswith('symptom_')]].sum(axis=1).values
        condition_count = batch_df['pre_existing_conditions'].apply(lambda x: len(x.split(',')) if x else 0).values

        # Age factor
        age_factor = np.zeros(current_batch)
        age_factor[(batch_df['age'] < 5) | (batch_df['age'] > 65)] = 5

        # Final score calculation
        scores = base_scores + (symptom_count * 1.5) + (condition_count * 3) + age_factor
        scores += np.random.normal(0, 5, size=current_batch)

        # Clip scores
        scores = np.clip(scores, 0, 100)
        batch_df['disease_severity_score'] = scores

        # Append batch to main dataframe
        df = pd.concat([df, batch_df], ignore_index=True)

        # Update remaining count
        remaining -= current_batch

    print(f"Generated {n_samples} patient records with {len(symptoms)} symptoms")
    return df

def perform_eda(df):
    """
    Perform exploratory data analysis on patient data

    Args:
        df: DataFrame containing patient data
    """
    print("Performing exploratory data analysis...")

    # Basic statistics - more selective to improve performance
    numeric_cols = ['age', 'symptom_duration_days', 'disease_severity_score']
    print("\nBasic statistics (numeric features):")
    print(df[numeric_cols].describe().T)

    # Disease distribution
    print("\nDisease distribution:")
    disease_counts = df['diagnosed_disease'].value_counts()
    print(disease_counts)

    # More efficient plotting
    plt.figure(figsize=(12, 6))
    # Only plot top 10 diseases for clarity
    top_diseases = disease_counts.nlargest(10)
    sns.barplot(x=top_diseases.values, y=top_diseases.index)
    plt.title('Distribution of Top 10 Diagnosed Diseases')
    plt.tight_layout()
    plt.savefig('disease_distribution.png')
    plt.close()  # Close figure to save memory

    # Severity score distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(df['disease_severity_score'], bins=20, kde=True)
    plt.title('Distribution of Disease Severity Scores')
    plt.savefig('severity_distribution.png')
    plt.close()

    # Age distribution by gender - more efficient
    plt.figure(figsize=(10, 6))
    # Use violin plot for better distribution visualization
    sns.violinplot(x='gender', y='age', data=df)
    plt.title('Age Distribution by Gender')
    plt.savefig('age_gender_distribution.png')
    plt.close()

    # Correlation between symptoms and severity - vectorized approach
    symptom_cols = [col for col in df.columns if col.startswith('symptom_')]

    # Calculate correlations efficiently
    corr_series = pd.Series(index=symptom_cols)
    for col in symptom_cols:
        corr_series[col] = df[col].corr(df['disease_severity_score'])

    # Sort by correlation
    corr_series = corr_series.sort_values(ascending=False)

    # Plot top 15 symptoms for clarity
    plt.figure(figsize=(12, 8))
    sns.barplot(x=corr_series.iloc[:15].values, y=corr_series.iloc[:15].index)
    plt.title('Top 15 Symptoms Correlated with Disease Severity')
    plt.tight_layout()
    plt.savefig('symptom_correlations.png')
    plt.close()

    # Just use basic numeric columns for correlation heatmap
    # Avoid using condition_count and symptom_count which don't exist yet
    numeric_cols = ['age', 'symptom_duration_days', 'disease_severity_score']
    plt.figure(figsize=(10, 8))
    corr_matrix = df[numeric_cols].corr()
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Matrix of Numeric Features')
    plt.tight_layout()
    plt.savefig('correlation_matrix.png')
    plt.close()

    print("EDA complete. Visualizations saved as PNG files.")

def preprocess_data(df):
    """
    Preprocess the patient data for model training

    Args:
        df: DataFrame containing patient data

    Returns:
        Processed X features and y target
    """
    print("Preprocessing data...")

    # Make a copy to avoid modifying the original
    processed_df = df.copy()

    # Handle missing values in numeric columns - use more robust approaches
    processed_df['age'] = processed_df['age'].fillna(processed_df['age'].median())
    processed_df['symptom_duration_days'] = processed_df['symptom_duration_days'].fillna(processed_df['symptom_duration_days'].median())

    # Create symptom feature matrix
    symptom_cols = [col for col in processed_df.columns if col.startswith('symptom_')]

    # Check for and drop any duplicate columns before creating new ones
    processed_df = processed_df.loc[:, ~processed_df.columns.duplicated()]

    # Extract pre-existing conditions and count them more efficiently
    processed_df['condition_count'] = processed_df['pre_existing_conditions'].str.count(',') + processed_df['pre_existing_conditions'].astype(bool)

    # Create aggregate symptom features
    processed_df['symptom_count'] = processed_df[symptom_cols].sum(axis=1)
    processed_df['symptom_density'] = processed_df['symptom_count'] / processed_df['symptom_duration_days'].clip(lower=1)

    # Add interaction features - Age and severity
    processed_df['age_severity_interaction'] = 0
    mask_high_risk = (processed_df['age'] < 5) | (processed_df['age'] > 65)
    mask_severe = processed_df['severity_level'] == 'severe'
    processed_df.loc[mask_high_risk & mask_severe, 'age_severity_interaction'] = 1

    # Prepare features for modeling
    cat_features = ['gender', 'region', 'severity_level']
    num_features = ['age', 'symptom_duration_days', 'condition_count',
                   'symptom_count', 'symptom_density', 'age_severity_interaction']

    # Check if we have all the features needed - fail early if not
    for feature in cat_features + num_features:
        if feature not in processed_df.columns:
            raise ValueError(f"Required feature '{feature}' is missing from the DataFrame")

    # Target variable is the disease severity score
    y = processed_df['disease_severity_score'].values

    # Combine all features
    features = cat_features + num_features + symptom_cols

    # Verify all features exist and get unique columns only
    X = processed_df[features].copy()

    # Final check for duplicated columns before returning
    if X.columns.duplicated().any():
        # Drop duplicate columns if they exist
        X = X.loc[:, ~X.columns.duplicated()]
        print("Warning: Duplicated columns were removed from the feature set")

    return X, y

def build_model(X, y):
    """
    Build and train a regression model for disease severity prediction

    Args:
        X: Feature matrix
        y: Target variable

    Returns:
        Trained model and preprocessor
    """
    print("Building and training model...")

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Verify columns in feature matrix
    print(f"\nFeature columns: {X.columns.tolist()}")
    print(f"Number of features: {len(X.columns)}")
    print(f"Any duplicate columns: {any(X.columns.duplicated())}")

    # Create preprocessor
    categorical_features = ['gender', 'region', 'severity_level']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # Set sparse=False for better compatibility
    ])

    # Ensure we're using only the numeric features that exist in the DataFrame
    numerical_features = [col for col in ['age', 'symptom_duration_days', 'condition_count',
                          'symptom_count', 'symptom_density', 'age_severity_interaction'] if col in X.columns]

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    symptom_features = [col for col in X.columns if col.startswith('symptom_')]
    symptom_transformer = 'passthrough'  # Binary features don't need transformation

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features),
            ('sym', symptom_transformer, symptom_features)
        ],
        remainder='drop'  # Drop any columns not specified
    )

    # Define models to try with optimized defaults
    models = {
        'RandomForest': RandomForestRegressor(random_state=42, n_jobs=-1),  # Use all cores
        'GradientBoosting': GradientBoostingRegressor(random_state=42),
        'XGBoost': XGBRegressor(random_state=42, n_jobs=-1)  # Use all cores
    }

    # Optimized hyperparameter grids - smaller for faster execution
    param_grids = {
        'RandomForest': {
            'model__n_estimators': [100],
            'model__max_depth': [None, 20],
            'model__min_samples_split': [2, 5]
        },
        'GradientBoosting': {
            'model__n_estimators': [100],
            'model__learning_rate': [0.05, 0.1]
        },
        'XGBoost': {
            'model__n_estimators': [100],
            'model__learning_rate': [0.05, 0.1],
            'model__max_depth': [5, 7]
        }
    }

    # Train and evaluate each model
    best_score = 0
    best_model_name = None
    best_model = None

    for model_name, model in models.items():
        print(f"\nTraining {model_name}...")

        # Create pipeline with preprocessor and model
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])

        # Use grid search for hyperparameter tuning
        grid_search = GridSearchCV(
            pipeline,
            param_grids[model_name],
            cv=5,
            scoring='r2',
            n_jobs=-1,  # Use all cores
            verbose=1
        )

        # Fit model
        grid_search.fit(X_train, y_train)

        # Get best model
        y_pred = grid_search.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        print(f"{model_name} - Best parameters: {grid_search.best_params_}")
        print(f"{model_name} - R² Score: {r2:.4f}")
        print(f"{model_name} - MAE: {mae:.4f}")
        print(f"{model_name} - RMSE: {rmse:.4f}")

        # Keep track of best model
        if r2 > best_score:
            best_score = r2
            best_model_name = model_name
            best_model = grid_search.best_estimator_

    print(f"\nBest model: {best_model_name} with R² score of {best_score:.4f}")

    # Evaluate with cross-validation
    cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='r2', n_jobs=-1)
    print(f"Cross-validation R² scores: {cv_scores}")
    print(f"Mean CV R² score: {np.mean(cv_scores):.4f}")

    # Extract preprocessor from pipeline for separate saving
    preprocessor = best_model.named_steps['preprocessor']

    return best_model, preprocessor, best_model_name

def evaluate_model(model, X, y):
    """
    Evaluate the trained model and visualize results

    Args:
        model: Trained model
        X: Feature matrix
        y: Target variable
    """
    print("\nEvaluating model performance...")

    # Make predictions in batches for memory efficiency
    batch_size = 1000
    n_samples = len(X)
    y_pred = np.zeros(n_samples)

    for i in range(0, n_samples, batch_size):
        end = min(i + batch_size, n_samples)
        y_pred[i:end] = model.predict(X.iloc[i:end])

    # Calculate metrics
    r2 = r2_score(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    rmse = np.sqrt(mean_squared_error(y, y_pred))

    print(f"R² Score: {r2:.4f}")
    print(f"Mean Absolute Error: {mae:.4f}")
    print(f"Root Mean Squared Error: {rmse:.4f}")

    # Calculate accuracy as percentage of predictions within ±5 points
    within_5_points = np.mean(np.abs(y - y_pred) <= 5)
    within_10_points = np.mean(np.abs(y - y_pred) <= 10)
    print(f"Accuracy (predictions within ±5 points): {within_5_points:.4f} ({within_5_points*100:.2f}%)")
    print(f"Accuracy (predictions within ±10 points): {within_10_points:.4f} ({within_10_points*100:.2f}%)")

    # Visualize actual vs predicted values - use sample for large datasets
    plt.figure(figsize=(10, 6))
    # If dataset is large, sample for visualization
    if len(y) > 1000:
        sample_indices = np.random.choice(range(len(y)), 1000, replace=False)
        y_sample = y[sample_indices]
        y_pred_sample = y_pred[sample_indices]
        plt.scatter(y_sample, y_pred_sample, alpha=0.5)
    else:
        plt.scatter(y, y_pred, alpha=0.5)

    # Plot reference line
    min_val = min(np.min(y), np.min(y_pred))
    max_val = max(np.max(y), np.max(y_pred))
    plt.plot([min_val, max_val], [min_val, max_val], 'r--')

    plt.xlabel('Actual Severity Score')
    plt.ylabel('Predicted Severity Score')
    plt.title('Actual vs. Predicted Severity Scores')
    plt.savefig('actual_vs_predicted.png')
    plt.close()

    # Visualize residuals
    residuals = y - y_pred
    plt.figure(figsize=(10, 6))

    # If dataset is large, sample for visualization
    if len(y) > 1000:
        plt.scatter(y_pred[sample_indices], residuals[sample_indices], alpha=0.5)
    else:
        plt.scatter(y_pred, residuals, alpha=0.5)

    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel('Predicted Severity Score')
    plt.ylabel('Residuals (Actual - Predicted)')
    plt.title('Residual Plot')
    plt.savefig('residuals.png')
    plt.close()

    # Histogram of residuals
    plt.figure(figsize=(10, 6))
    plt.hist(residuals, bins=30, alpha=0.7)
    plt.axvline(x=0, color='r', linestyle='--')
    plt.xlabel('Residual Value')
    plt.ylabel('Frequency')
    plt.title('Histogram of Residuals')
    plt.savefig('residuals_histogram.png')
    plt.close()

    # Feature importance for tree-based models
    if hasattr(model, 'named_steps') and hasattr(model.named_steps['model'], 'feature_importances_'):
        # Get feature names after preprocessing
        preprocessor = model.named_steps['preprocessor']
        features = []

        # Get feature names from each transformer
        if hasattr(preprocessor, 'transformers_'):
            for name, transformer, columns in preprocessor.transformers_:
                if name == 'cat' and hasattr(transformer, 'named_steps') and 'onehot' in transformer.named_steps:
                    encoder = transformer.named_steps['onehot']
                    if hasattr(encoder, 'get_feature_names_out'):
                        encoded_features = encoder.get_feature_names_out(columns)
                        features.extend(encoded_features)
                elif name == 'num':
                    features.extend(columns)
                elif name == 'sym':
                    features.extend(columns)

        # Get feature importances
        importances = model.named_steps['model'].feature_importances_

        # If we have the right number of feature names
        if len(features) == len(importances):
            # Create DataFrame of features and importances
            feature_importance = pd.DataFrame({'feature': features, 'importance': importances})
            feature_importance = feature_importance.sort_values('importance', ascending=False)

            # Plot top 20 features
            plt.figure(figsize=(12, 8))
            sns.barplot(x='importance', y='feature', data=feature_importance.head(20))
            plt.title('Top 20 Feature Importances')
            plt.tight_layout()
            plt.savefig('feature_importance.png')
            plt.close()

            # Print top 10 features
            print("\nTop 10 important features:")
            print(feature_importance.head(10))

    print("Model evaluation complete. Visualizations saved as PNG files.")

def save_model(model, preprocessor, model_name):
    """
    Save the trained model and preprocessor for later use

    Args:
        model: Trained model
        preprocessor: Data preprocessor
        model_name: Name of the model
    """
    print("\nSaving model and preprocessor...")

    # Create timestamp for model version
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save the full pipeline with compression for smaller file size
    model_filename = f"ethiopian_disease_model_{model_name}_{timestamp}.pkl"
    joblib.dump(model, model_filename, compress=3)

    # Save preprocessor separately
    preprocessor_filename = f"ethiopian_disease_preprocessor_{timestamp}.pkl"
    joblib.dump(preprocessor, preprocessor_filename, compress=3)

    print(f"Model saved as: {model_filename}")
    print(f"Preprocessor saved as: {preprocessor_filename}")

    return model_filename, preprocessor_filename

def create_inference_function(model_filename, preprocessor_filename):
    """
    Create a function for inference with the trained model

    Args:
        model_filename: Filename of the saved model
        preprocessor_filename: Filename of the saved preprocessor

    Returns:
        Inference function
    """
    # Load the model and preprocessor
    model = joblib.load(model_filename)

    # Create a more efficient prediction function using model closure
    def predict_disease_severity(patient_data):
        """
        Predict disease severity for a new patient

        Args:
            patient_data: Dictionary containing patient information
                Required keys: age, gender, region, symptom_*, severity_level,
                symptom_duration_days, pre_existing_conditions

        Returns:
            Dictionary with severity score and risk level
        """
        try:
            # Convert to DataFrame
            patient_df = pd.DataFrame([patient_data])

            # Add derived features needed by the model
            # Count pre-existing conditions
            patient_df['condition_count'] = patient_df['pre_existing_conditions'].apply(
                lambda x: len(x.split(',')) if isinstance(x, str) and x else 0
            )

            # Count symptoms
            symptom_cols = [col for col in patient_df.columns if col.startswith('symptom_')]
            patient_df['symptom_count'] = patient_df[symptom_cols].sum(axis=1)
            patient_df['symptom_density'] = patient_df['symptom_count'] / patient_df['symptom_duration_days'].clip(lower=1)

            # Age severity interaction
            patient_df['age_severity_interaction'] = 0
            age = patient_df['age'].iloc[0]
            severity = patient_df['severity_level'].iloc[0]
            if (age < 5 or age > 65) and severity == 'severe':
                patient_df.loc[0, 'age_severity_interaction'] = 1

            # Make prediction using the full pipeline
            severity_score = model.predict(patient_df)[0]

            # Map severity score to disease risk level
            if severity_score < 40:
                risk_level = "Low"
            elif severity_score < 60:
                risk_level = "Moderate"
            elif severity_score < 80:
                risk_level = "High"
            else:
                risk_level = "Severe"

            return {
                'severity_score': round(severity_score, 2),
                'risk_level': risk_level
            }

        except Exception as e:
            return {
                'error': str(e),
                'severity_score': None,
                'risk_level': None
            }

    print("\nInference function created successfully.")

    # Example usage
    print("\nExample inference usage:")
    example_patient = {
        'age': 35,
        'gender': 'female',
        'region': 'Addis Ababa',
        'symptom_fever': 1,
        'symptom_cough': 1,
        'symptom_headache': 0,
        'symptom_fatigue': 1,
        'symptom_nausea': 0,
        'symptom_vomiting': 0,
        'symptom_diarrhea': 0,
        'symptom_abdominal_pain': 0,
        'symptom_chest_pain': 1,
        'symptom_difficulty_breathing': 1,
        'symptom_joint_pain': 0,
        'symptom_rash': 0,
        'symptom_sore_throat': 1,
        'symptom_chills': 1,
        'symptom_loss_of_appetite': 0,
        'symptom_jaundice': 0,
        'symptom_swelling': 0,
        'symptom_blood_in_stool': 0,
        'symptom_night_sweats': 0,
        'symptom_weight_loss': 0,
        'symptom_duration_days': 5,
        'severity_level': 'moderate',
        'pre_existing_conditions': 'asthma'
    }

    result = predict_disease_severity(example_patient)
    print(f"Example Patient - Predicted Severity Score: {result['severity_score']}")
    print(f"Example Patient - Risk Level: {result['risk_level']}")

    return predict_disease_severity

def main():
    """
    Main function to run the entire pipeline
    """
    start_time = datetime.now()
    print(f"Starting Ethiopian Disease Prediction Pipeline at {start_time}")
    print("=============================================")

    # Step 1: Generate synthetic data
    df = generate_synthetic_data(n_samples=2500)

    # Step 2: Perform EDA
    perform_eda(df)

    # Step 3: Preprocess data
    X, y = preprocess_data(df)

    # Step 4: Build and train model
    model, preprocessor, model_name = build_model(X, y)

    # Step 5: Evaluate model
    evaluate_model(model, X, y)

    # Step 6: Save model
    model_filename, preprocessor_filename = save_model(model, preprocessor, model_name)

    # Step 7: Create inference function
    predict_function = create_inference_function(model_filename, preprocessor_filename)

    end_time = datetime.now()
    duration = end_time - start_time
    print(f"\nEthiopian Disease Prediction Pipeline completed successfully!")
    print(f"Total execution time: {duration}")

if __name__ == "__main__":
    main()

Starting Ethiopian Disease Prediction Pipeline at 2025-04-21 09:45:06.241846
Generating 2500 synthetic patient records...
Generated 2500 patient records with 20 symptoms
Performing exploratory data analysis...

Basic statistics (numeric features):
                         count       mean        std        min        25%  \
age                     2500.0  49.678400  29.632776   0.000000  24.000000   
symptom_duration_days   2500.0  15.453200   8.677484   1.000000   8.000000   
disease_severity_score  2500.0  93.152498   9.526568  47.920038  87.646051   

                          50%    75%    max  
age                      49.0   75.0  100.0  
symptom_duration_days    16.0   23.0   30.0  
disease_severity_score  100.0  100.0  100.0  

Disease distribution:
diagnosed_disease
Tuberculosis                   354
Hepatitis                      316
Intestinal Parasites           306
Pneumonia                      290
HIV/AIDS                       273
Malaria                        264
Acut