# In [None]:
"""
Disease Prediction - CodeAlpha Internship (Task 4)
Single-file project:
- Loads a structured medical dataset (CSV)
- Preprocesses, feature-engineers
- Trains: Logistic Regression, SVM, RandomForest, XGBoost
- Evaluates: Accuracy, Precision, Recall, F1, ROC-AUC
- Saves best model artifact (pickle)
- Usage:
    python disease_prediction.py --data path/to/diabetes.csv --target Outcome --out_dir ./artifacts
"""

import argparse
import os
import json
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# xgboost is treated as an optional dependency: `xgb_available` records whether
# the import succeeded so that training can skip the XGBoost model otherwise.
try:
    from xgboost import XGBClassifier
    xgb_available = True
except Exception:
    xgb_available = False
    # If xgboost is not installed, it can be added with `pip install xgboost`.

def load_data(path):
    """Read the CSV located at ``path`` and return it as a pandas DataFrame."""
    return pd.read_csv(path)

def basic_feature_engineering(df):
    """
    Add a few derived columns to a copy of ``df``.

    Each feature is created only when its source column(s) are present:
    - 'bmi_high': 1 where 'BMI' is greater than 30, otherwise 0
    - 'age_over_50': 1 where 'Age' is greater than 50, otherwise 0
    - 'glucose_insulin_ratio': 'Glucose' divided by ('Insulin' + 1e-6)

    The input DataFrame is not modified; the augmented copy is returned.
    These are simple examples; adapt them for your chosen dataset.
    """
    enriched = df.copy()

    # (source column, cut-off, name of the resulting 0/1 indicator)
    threshold_flags = (
        ('BMI', 30, 'bmi_high'),
        ('Age', 50, 'age_over_50'),
    )
    for source, cutoff, flag in threshold_flags:
        if source in enriched.columns:
            exceeds = enriched[source] > cutoff
            enriched[flag] = exceeds.astype(int)

    # The small constant keeps the denominator non-zero when Insulin is 0.
    if all(col in enriched.columns for col in ('Glucose', 'Insulin')):
        denominator = enriched['Insulin'] + 1e-6
        enriched['glucose_insulin_ratio'] = enriched['Glucose'] / denominator

    return enriched

def prepare_features(df, target_col):
    """
    Split ``df`` into a model-ready feature matrix and an integer target.

    Steps:
    - drop rows whose target value is missing
    - one-hot encode object/category/bool feature columns (first level dropped)
    - fill missing values in numeric feature columns with the column median

    Numeric columns without a single observed value are removed, because no
    median can be computed for them.

    Note: the medians are computed on all rows passed in, i.e. before any
    train/test split performed by the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset containing the features and the target column.
    target_col : str
        Name of the target column; its values must be convertible to int.

    Returns
    -------
    X : pandas.DataFrame
        Encoded and imputed feature matrix.
    y : pandas.Series
        Target values cast to int.
    numeric_cols : list
        Names of the numeric feature columns that were imputed.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df``.
    """
    if target_col not in df.columns:
        raise KeyError(f"Target column '{target_col}' not found in dataset")

    # Drop rows where target is missing
    df = df.dropna(subset=[target_col])

    # Separate X/y
    X = df.drop(columns=[target_col])
    y = df[target_col].astype(int)

    # Identify numeric and categorical
    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

    # One-hot encode the categorical columns
    if categorical_cols:
        X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

    # SimpleImputer discards columns that have no observed values, which would
    # make its output narrower than numeric_cols and break the assignment
    # below; remove such columns explicitly instead.
    empty_cols = [col for col in numeric_cols if X[col].isna().all()]
    if empty_cols:
        X = X.drop(columns=empty_cols)
        numeric_cols = [col for col in numeric_cols if col not in empty_cols]

    # Impute numeric with median (skipped when there is nothing to impute,
    # since the imputer rejects an input with zero columns)
    if numeric_cols:
        imputer = SimpleImputer(strategy='median')
        X[numeric_cols] = imputer.fit_transform(X[numeric_cols])

    return X, y, numeric_cols

def _build_models(random_state):
    """Return the unfitted candidate classifiers keyed by display name."""
    # Each pipeline owns its own StandardScaler so that no fitted state is
    # shared between models (or with the pickled artifact).
    models = {
        'LogisticRegression': Pipeline([
            ('scaler', StandardScaler()),
            ('clf', LogisticRegression(max_iter=2000, random_state=random_state)),
        ]),
        # probability=True so the SVM exposes predict_proba for ROC-AUC
        'SVM': Pipeline([
            ('scaler', StandardScaler()),
            ('clf', SVC(kernel='rbf', probability=True, random_state=random_state)),
        ]),
        'RandomForest': RandomForestClassifier(
            n_estimators=200, random_state=random_state, n_jobs=-1
        ),
    }
    # XGBoost is only included when the optional import succeeded
    if xgb_available:
        models['XGBoost'] = XGBClassifier(
            use_label_encoder=False, eval_metric='logloss', random_state=random_state
        )
    return models


def _positive_class_scores(model, X_test, y_pred):
    """Return per-sample scores used to rank predictions for ROC-AUC."""
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X_test)[:, 1]
    if hasattr(model, "decision_function"):
        # ROC-AUC depends only on the ordering of the scores, so the raw
        # decision values can be used without rescaling to [0, 1].
        return model.decision_function(X_test)
    # Last resort: rank by the hard class predictions
    return y_pred


def _evaluate_model(model, X_test, y_test):
    """Compute the metric dictionary for one fitted model on the test split."""
    y_pred = model.predict(X_test)
    y_score = _positive_class_scores(model, X_test, y_pred)
    return {
        "accuracy": float(accuracy_score(y_test, y_pred)),
        "precision": float(precision_score(y_test, y_pred, zero_division=0)),
        "recall": float(recall_score(y_test, y_pred, zero_division=0)),
        "f1": float(f1_score(y_test, y_pred, zero_division=0)),
        "roc_auc": float(roc_auc_score(y_test, y_score)),
        "confusion_matrix": confusion_matrix(y_test, y_pred).tolist(),
    }


def train_and_evaluate(X_train, X_test, y_train, y_test, out_dir, random_state=42):
    """
    Train every candidate model, score it on the test split and save the best.

    The model with the highest test-set ROC-AUC is pickled together with all
    metrics and the training feature names; the metrics are also written as
    JSON. Because the winner is picked on the same test split that is
    reported, its score is an optimistic estimate.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices (``X_train.columns`` is stored in the artifact).
    y_train, y_test : array-like
        Binary target values for the two splits.
    out_dir : str
        Directory for the output files; created if it does not exist.
    random_state : int, default 42
        Seed passed to every model.

    Returns
    -------
    results : dict
        Mapping of model name to its metric dictionary.
    artifact_path : str
        Path of the pickled artifact.
    """
    models = _build_models(random_state)

    # Fit and evaluate each candidate
    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        results[name] = _evaluate_model(model, X_test, y_test)

    # Choose best by roc_auc (ties go to the model listed first)
    best_name = max(results, key=lambda name: results[name]['roc_auc'])
    best_model = models[best_name]
    print(f"Best model: {best_name} (ROC-AUC = {results[best_name]['roc_auc']:.4f})")

    # Save artifact
    artifact = {
        "model": best_model,
        "results": results,
        "feature_columns": X_train.columns.tolist()
    }
    os.makedirs(out_dir, exist_ok=True)
    artifact_path = os.path.join(out_dir, "disease_model_artifact.pkl")
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

    # Save results summary
    summary_path = os.path.join(out_dir, "results_summary.json")
    with open(summary_path, "w") as f:
        json.dump(results, f, indent=4)

    print("Saved artifact to:", artifact_path)
    print("Saved results summary to:", summary_path)
    return results, artifact_path

def main(args):
    """
    Run the full workflow for the parsed command-line arguments.

    Reads ``args.data``, adds engineered features, prepares the feature
    matrix for the ``args.target`` column, performs a stratified train/test
    split using ``args.test_size`` and ``args.random_state``, trains and
    evaluates the models (outputs go to ``args.out_dir``) and prints the
    scalar metrics of every model.
    """
    print("Loading dataset:", args.data)
    raw_frame = load_data(args.data)

    # Derived columns first, then encoding / imputation
    engineered = basic_feature_engineering(raw_frame)
    X, y, _numeric_cols = prepare_features(engineered, args.target)
    print("Feature matrix shape:", X.shape)

    # Stratified split keeps the class balance equal in both parts
    split = train_test_split(
        X,
        y,
        test_size=args.test_size,
        stratify=y,
        random_state=args.random_state,
    )
    X_train, X_test, y_train, y_test = split

    results, _artifact_path = train_and_evaluate(
        X_train, X_test, y_train, y_test, args.out_dir, args.random_state
    )

    # Report every metric except the (non-scalar) confusion matrix
    print("\nModel evaluation summary:")
    for model_name, metrics in results.items():
        print(f"\n--- {model_name} ---")
        scalar_metrics = {
            key: value for key, value in metrics.items() if key != 'confusion_matrix'
        }
        for metric_name, value in scalar_metrics.items():
            print(f"{metric_name}: {value:.4f}")
    print("\nDone.")

if __name__ == "__main__":
    # Script entry point: declare the CLI options, parse them and run main().
    parser = argparse.ArgumentParser(
        description="Disease Prediction (Task 4) - train and evaluate models"
    )
    parser.add_argument(
        "--data",
        type=str,
        required=True,
        help="Path to CSV dataset (structured).",
    )
    parser.add_argument(
        "--target",
        type=str,
        default="Outcome",
        help="Name of target column (binary: 0/1).",
    )
    parser.add_argument(
        "--out_dir",
        type=str,
        default="./artifacts",
        help="Output directory for model and results.",
    )
    parser.add_argument(
        "--test_size",
        type=float,
        default=0.2,
        help="Test split fraction.",
    )
    parser.add_argument(
        "--random_state",
        type=int,
        default=42,
        help="Random seed.",
    )
    args = parser.parse_args()
    main(args)
