In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# Load the dataset
df = pd.read_csv('C:\\Users\\defaultuser0\\Desktop\\Mgala\\Cholera.csv')

# Display basic info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print(df.describe())
print(df.head())

In [None]:
!pip install xgboost


In [None]:
# Report how many entries are missing in each column before any filling
print("Missing values before imputation:")
print(df.isna().sum())

# Numeric columns (int64 / float64): fill gaps with the column median.
# The imputer is only created when at least one of these columns has a NaN.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
if df[numerical_cols].isna().to_numpy().any():
    num_imputer = SimpleImputer(strategy='median').fit(df[numerical_cols])
    df[numerical_cols] = num_imputer.transform(df[numerical_cols])

# Text (object) columns: fill gaps with the most frequent value, again only
# when a NaN is actually present.
categorical_cols = df.select_dtypes(include=['object']).columns
if df[categorical_cols].isna().to_numpy().any():
    cat_imputer = SimpleImputer(strategy='most_frequent').fit(df[categorical_cols])
    df[categorical_cols] = cat_imputer.transform(df[categorical_cols])

# Confirm that nothing is missing any more
print("\nMissing values after imputation:")
print(df.isna().sum())

In [None]:
import pandas as pd

# One-hot encode the categorical columns of the frame that was loaded and
# imputed in the cells above.
# The CSV is deliberately NOT re-read here: reloading it would silently
# discard the missing-value imputation performed in the previous cell.
# drop_first=True drops the first dummy level of every encoded column.
df = pd.get_dummies(df, drop_first=True)

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Demo of the encode -> scale -> rebuild steps on a two-row sample frame
# (replace with your actual DataFrame).
# NOTE(review): the assignment below rebinds `df`, so from this cell on `df`
# holds the sample rows rather than the dataset loaded earlier.
data = {
    'date': ['2023-01-01', '2023-01-02'],
    'location': ['CityA', 'CityB'],
    'temperature': [28, 30],
    'humidity': [80, 75],
    'water-sources-quality': ['Good', 'Bad'],
    'sanitation-facilities': ['Improved', 'Unimproved'],
    'population-density': [200, 150],
    'access_to_healthcare': [0.5, 0.3],
    'previous_outbreak-history': [1, 0],
    'reported_cases': [10, 5]  # target column; must match target_col below
}
df = pd.DataFrame(data)

# Step 1: Encode categorical columns
df = pd.get_dummies(df, columns=['water-sources-quality', 'sanitation-facilities'])

# Step 2: Define target and features
# 'date' and 'location' are identifiers, so they are kept out of the scaler.
target_col = 'reported_cases'  # Verify exact name!
features = df.drop(columns=[target_col, 'date', 'location'])

# Step 3: Scale features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Step 4: Rebuild DataFrame
# fit_transform returns a bare array, so the original row index is re-attached
# explicitly. The column assignments below align on index labels; without the
# shared index they would produce NaN whenever df is not indexed 0..n-1.
scaled_df = pd.DataFrame(
    scaled_features,
    columns=features.columns,
    index=features.index,
)
scaled_df[target_col] = df[target_col]
scaled_df['date'] = df['date']
scaled_df['location'] = df['location']

In [None]:
# Show the available column names so the target name can be checked
print("Columns in DataFrame:", list(df.columns))

# Name of the target column (change this to your actual target column)
target_col = 'reported_cases'

# Target vector, and how often each distinct target value occurs
y = df[target_col]
print(y.value_counts())

# Feature matrix: every column except the target (input for a SMOTE step)
X = df.drop(columns=[target_col])

# Continue with your SMOTE implementation...

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def plot_correlation_matrix(df, figsize=(14, 12)):
    """
    Plot a lower-triangle heatmap of the pairwise correlations in ``df``.

    The columns are prepared on a copy, so the input frame is not modified:
    - datetime columns become whole days elapsed since the column minimum
    - object/category columns with at most 20 distinct values are label
      encoded; missing values stay missing instead of becoming a category
    - object/category columns with more than 20 distinct values are dropped
    - only numeric and boolean columns are passed to the correlation

    Parameters:
    df : pandas DataFrame
    figsize : tuple, size of the output figure

    Raises:
    ValueError : if no column is left to correlate
    """

    # Create a copy to avoid modifying original dataframe
    df_processed = df.copy()

    # 1. Convert dates to numeric (days since min date)
    date_cols = df_processed.select_dtypes(include=['datetime', 'datetime64']).columns
    for col in date_cols:
        df_processed[col] = (df_processed[col] - df_processed[col].min()).dt.days

    # 2. Label encode categorical features (skip high-cardinality)
    cat_cols = df_processed.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        if df_processed[col].nunique() > 20:  # Drop columns with too many categories
            df_processed = df_processed.drop(col, axis=1)
            continue
        codes, _ = pd.factorize(df_processed[col])
        # factorize marks missing entries with -1; turn them back into NaN so
        # they are skipped by corr() instead of acting as an extra category.
        encoded = codes.astype(float)
        encoded[codes < 0] = np.nan
        df_processed[col] = encoded

    # 3. Calculate correlation matrix on numeric/boolean columns only, so a
    #    leftover column of another dtype cannot make corr() fail
    numeric_part = df_processed.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric_part.corr()
    if corr_matrix.empty:
        raise ValueError("No numeric or encodable columns available to correlate.")

    # 4. Create the plot
    plt.figure(figsize=figsize)

    # Mask the upper triangle (diagonal included): the matrix is symmetric,
    # so only the lower half carries information
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    # Heatmap on a fixed -1..1 colour scale centred on 0
    sns.heatmap(corr_matrix,
                mask=mask,
                annot=True,
                fmt=".2f",
                cmap='coolwarm',
                vmin=-1,
                vmax=1,
                center=0,
                square=True,
                linewidths=0.5,
                cbar_kws={"shrink": 0.8})

    # Improve readability
    plt.title('Feature Correlation Matrix', pad=20, fontsize=16)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

In [None]:
import pandas as pd

# Read the cholera dataset from disk
df = pd.read_csv("C:\\Users\\defaultuser0\\Desktop\\Mgala\\Cholera.csv")

# List every column name found in the file
print("Columns in your dataset:", list(df.columns))

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the cholera dataset from disk
df = pd.read_csv("C:\\Users\\defaultuser0\\Desktop\\Mgala\\Cholera.csv")

# Target (y) is the "Reported_cases" column; features (X) are all other columns
y = df["Reported_cases"]
X = df.drop(columns=["Reported_cases"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# (Pass stratify=y as well if y is categorical and you want stratified sampling.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the (rows, columns) of each part
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

In [None]:
# Build a binary (1/0) target from "Reported_cases".
# The two conversions are alternatives, not steps: which one applies depends
# on how the column is stored, so the dtype decides.
reported = df["Reported_cases"]
if pd.api.types.is_numeric_dtype(reported):
    # Numeric counts: 1 when at least one case was reported, else 0
    y = (reported > 0).astype(int)
else:
    # "Yes"/"No" labels: map to 1/0 (any other label becomes NaN)
    y = reported.map({"Yes": 1, "No": 0})

In [None]:
pip install xgboost

In [None]:
# Show how many rows fall into each distinct value of the target `y`
print(y.value_counts())

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor

# 1. Load and prepare data
df = pd.read_csv("C:\\Users\\defaultuser0\\Desktop\\Mgala\\Cholera.csv")

# 2. Process Date: replace the raw date with year / month / weekday features
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['DayOfWeek'] = df['Date'].dt.dayofweek
df = df.drop('Date', axis=1)

# 3. Encode categorical variables
# handle_unknown='ignore' lets the fitted encoder transform categories it has
# not seen during fitting without raising.
cat_cols = ['Location', 'Water_source_quality', 'Sanitation_facilities']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = encoder.fit_transform(df[cat_cols])
# Reuse df's row index so the concat below lines rows up by label even when
# df is not indexed 0..n-1.
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(cat_cols),
    index=df.index,
)

# 4. Combine features: the non-categorical columns plus the one-hot columns
X = pd.concat([df.drop(cat_cols + ['Reported_cases'], axis=1), encoded_df], axis=1)
y = df['Reported_cases']

# 5. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train XGBoost regression model
xgb = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

xgb.fit(X_train, y_train)

# 7. Evaluate
train_pred = xgb.predict(X_train)
test_pred = xgb.predict(X_test)

# RMSE is computed as the square root of the MSE: the `squared=False` keyword
# of mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# while this form works on every version.
print(f"Train RMSE: {mean_squared_error(y_train, train_pred) ** 0.5:.2f}")
print(f"Test RMSE: {mean_squared_error(y_test, test_pred) ** 0.5:.2f}")
print(f"Train MAE: {mean_absolute_error(y_train, train_pred):.2f}")
print(f"Test MAE: {mean_absolute_error(y_test, test_pred):.2f}")

In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
# 3 values for each of 6 parameters = 729 combinations; with cv=5 that is
# 3,645 model fits plus the final refit, so expect a long run.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2]
}

# Initialize GridSearchCV
# `xgb` is the XGBRegressor trained on the `Reported_cases` target, so the
# candidates are ranked with a regression score (negated RMSE, higher is
# better). ROC AUC and predict_proba only exist for classifiers.
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid,
                          cv=5, scoring='neg_root_mean_squared_error',
                          n_jobs=-1, verbose=1)

# Perform grid search
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validated RMSE:", -grid_search.best_score_)

# Best model (refit on the whole training set by GridSearchCV)
best_xgb = grid_search.best_estimator_

# Evaluate best model on the held-out test set with the same regression
# metrics used for the untuned model
y_pred_tuned = best_xgb.predict(X_test)

print("\nTuned Model Performance:")
print(f"Test RMSE: {mean_squared_error(y_test, y_pred_tuned) ** 0.5:.2f}")
print(f"Test MAE: {mean_absolute_error(y_test, y_pred_tuned):.2f}")

In [None]:
# Plot feature importance
# In this notebook `xgb` is bound to the fitted XGBRegressor, not to the
# xgboost package, so the plotting helper is imported by name.
from xgboost import plot_importance

# Draw on an axes created here so the requested figure size is honoured;
# without `ax`, plot_importance opens a new figure of its own and the sized
# one stays empty.
fig, ax = plt.subplots(figsize=(10, 8))
plot_importance(best_xgb, max_num_features=15, ax=ax)
ax.set_title('Feature Importance')
plt.show()

In [None]:
import shap

# SHAP explainer
# Build a tree-model explainer for the tuned model and compute the SHAP
# values of every row in the test set.
explainer = shap.TreeExplainer(best_xgb)
shap_values = explainer.shap_values(X_test)

# Summary plot
# plot_type="bar" requests the bar-style summary.
shap.summary_plot(shap_values, X_test, plot_type="bar")

# Force plot for a single prediction
# initjs() is called before the force plot is drawn; the plot explains the
# first test row (index 0 of shap_values paired with X_test.iloc[0]).
# NOTE(review): assumes shap_values is a 2-D array (rows x features) - confirm.
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0,:], X_test.iloc[0,:])

In [None]:
import joblib

# Save the model
joblib.dump(best_xgb, 'cholera_prediction_xgboost_model.pkl')

# Save the scaler
# NOTE(review): `scaler` is the StandardScaler fitted on the two-row sample
# frame earlier in the notebook, and the model is trained on unscaled
# features - confirm whether this artifact is really wanted.
joblib.dump(scaler, 'scaler.pkl')

# Save the fitted one-hot encoder: new data has to be encoded with it before
# it can be passed to the model
joblib.dump(encoder, 'encoder.pkl')

# Save the feature list
# `selected_features` is not assigned in any earlier cell shown here, so it is
# derived from the training matrix: the column names, in order, that the
# model was fitted on.
selected_features = list(X_train.columns)
joblib.dump(selected_features, 'selected_features.pkl')

# To load the model later (joblib files are pickles - only load trusted files):
# loaded_model = joblib.load('cholera_prediction_xgboost_model.pkl')

In [None]:
# Example of monitoring model performance over time
def monitor_performance(model, X_new, y_new, threshold=0.05,
                        baseline_accuracy=None, baseline_auc=None):
    """
    Compare a classifier's accuracy and ROC AUC on new data with a baseline.

    Parameters:
    model : fitted classifier exposing predict() and predict_proba()
    X_new : features of the new data
    y_new : true labels of the new data
    threshold : largest tolerated drop (baseline minus new) for either metric
    baseline_accuracy : reference accuracy; when None it is computed from the
        notebook globals y_test and y_pred_tuned
    baseline_auc : reference ROC AUC; when None it is computed from the
        notebook globals y_test and y_pred_proba_tuned

    Returns:
    True when both metrics stay within `threshold` of the baseline, False
    otherwise (a warning with both drops is printed in that case).
    """
    y_pred_new = model.predict(X_new)
    # Column 1 of predict_proba is used as the score for ROC AUC
    y_pred_proba_new = model.predict_proba(X_new)[:, 1]

    new_accuracy = accuracy_score(y_new, y_pred_new)
    new_auc = roc_auc_score(y_new, y_pred_proba_new)

    # Prefer explicitly supplied baselines; fall back to the notebook globals
    # only when a baseline is not given, so the function can be used without
    # depending on leftover kernel state.
    if baseline_accuracy is None:
        baseline_accuracy = accuracy_score(y_test, y_pred_tuned)
    if baseline_auc is None:
        baseline_auc = roc_auc_score(y_test, y_pred_proba_tuned)

    accuracy_drop = baseline_accuracy - new_accuracy
    auc_drop = baseline_auc - new_auc

    if accuracy_drop > threshold or auc_drop > threshold:
        print("Warning: Significant performance drop detected!")
        print(f"Accuracy drop: {accuracy_drop:.4f}")
        print(f"AUC drop: {auc_drop:.4f}")
        return False

    print("Model performance is stable.")
    return True

# Example usage with new data
# new_data = pd.read_csv('new_cholera_data.csv')
# X_new = preprocess_new_data(new_data)  # You would need to implement this
# y_new = new_data['target_column']
# monitor_performance(best_xgb, X_new, y_new)