# AKI Prediction - Training Example

This notebook demonstrates how to use the modular AKI prediction package to:
1. Load and preprocess data
2. Train multiple models with hyperparameter tuning
3. Evaluate models and save the best one
4. Generate SHAP explanations

## Simple Usage Example


In [3]:
# Make the project's src/ directory importable from this notebook.
# '../src' is resolved against the current working directory, so the kernel
# must be started from the folder that contains this notebook.
import sys
import os

src_dir = os.path.abspath('../src')
# Only register the directory once: re-running this cell in the same kernel
# would otherwise keep appending duplicate entries to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Import all functions from the package
from utils import (
    setup_plotting, load_vitaldb_data, preprocess_data, prepare_train_test_data
)
from train import (
    get_default_model_configs, hyperparameter_tuning, save_best_model
)
from evaluate import (
    evaluate_models, print_evaluation_summary
)
from visualization import (
    plot_roc_curves, plot_pr_curves, plot_model_comparison, plot_confusion_matrices
)
from shap_explainer import (
    explain_model_with_shap, analyze_logistic_regression_coefficients
)

# Setup plotting
# Apply the project's plotting configuration (imported from utils above) once,
# before any of the later cells draw figures.
# NOTE(review): the concrete settings live in utils.setup_plotting and are not
# visible from this notebook.
setup_plotting()


## 1. Data Loading and Preprocessing


In [4]:
# Data pipeline: raw VitalDB records -> features/labels -> train/test bundle.
# `data_dict` is the object every later cell reads its splits from.
df = load_vitaldb_data()
preprocessed = preprocess_data(df)
X, y, feature_names = preprocessed
data_dict = prepare_train_test_data(X, y)


🔄 Loading VitalDB dataset...
✅ Dataset loaded: 3989 records
📊 Features available: 75
🔧 Preprocessing data...
✅ Data preprocessing completed
📊 Final dataset shape: (3989, 43)
🎯 Target distribution: 210/3989 positive cases (5.26%)
🔧 Preparing train/test data...
📊 Training set: (3191, 43)
📊 Test set: (798, 43)


## 2. Model Training with Hyperparameter Tuning


In [None]:
# Two ways to choose which models get tuned. Only Option 2 actually runs the
# tuning in this cell; the Option 1 call is left commented out.

# Option 1: Train all models (default)
print("🎯 Option 1: Training all models...")
models_config_all = get_default_model_configs()
# tuned_models = hyperparameter_tuning(models_config_all, data_dict['X_train_dict'], data_dict['y_train'])

# Option 2: Train only specific models
print("\n🎯 Option 2: Training only specific models...")

# Get all default configs
# (get_default_model_configs is already imported in the first cell, so it is
# not re-imported here.)
all_configs = get_default_model_configs()

# Select only the models you want to train
specific_models = {
    'LogisticRegression': all_configs['LogisticRegression'],
    # Trailing comma on the last active entry so the optional lines below can
    # be uncommented without causing a SyntaxError.
    'XGBoost': all_configs['XGBoost'],
    # Uncomment to add more models:
    # 'RandomForest': all_configs['RandomForest'],
    # 'SVM': all_configs['SVM'],
}

print(f"Selected models: {list(specific_models.keys())}")

# Train only the selected models; `tuned_models` is what the evaluation,
# saving and SHAP cells below operate on.
tuned_models = hyperparameter_tuning(
    specific_models,
    data_dict['X_train_dict'],
    data_dict['y_train']
)


## 2.1. Custom Model Configuration (Alternative)


In [None]:
# Option 3: Create custom model configurations with different parameters
print("🎯 Option 3: Custom model configurations...")

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Candidate weight for the positive class in XGBoost. The load cell reports
# 210 positives out of 3989 records, i.e. 3779 / 210 ≈ 18 negatives per
# positive, which is the ratio this value approximates.
imbalance_pos_weight = 18

# Each entry provides an unfitted estimator ('model'), the grid to search
# ('params') and a 'data_type' tag.
# NOTE(review): 'data_type' presumably selects the matching feature variant
# ('scaled' / 'imputed') inside hyperparameter_tuning — confirm in train.py.
# Create custom configurations with simplified parameter grids for faster training
custom_models = {
    'LogisticRegression_Fast': {
        'model': LogisticRegression(random_state=0),
        'params': {
            'C': [0.1, 1, 10],  # Reduced parameter grid
            'solver': ['lbfgs'],  # Only one solver
            'class_weight': [None, 'balanced']
        },
        'data_type': 'scaled'
    },
    'XGBoost_Fast': {
        'model': XGBClassifier(random_state=0, eval_metric='logloss'),
        'params': {
            'n_estimators': [50, 100],  # Reduced parameter grid
            'max_depth': [3, 6],
            'learning_rate': [0.1, 0.2],
            # 1 = no reweighting; the second value compensates for class imbalance
            'scale_pos_weight': [1, imbalance_pos_weight]
        },
        'data_type': 'imputed'
    }
}

print(f"Custom models: {list(custom_models.keys())}")

# Uncomment to train custom models instead:
# tuned_models = hyperparameter_tuning(
#     custom_models,
#     data_dict['X_train_dict'],
#     data_dict['y_train']
# )


## 3. Model Evaluation


In [None]:
# Decide which test-feature variant each trained model is evaluated on.
# The choice is inferred purely from the model's name: names containing one of
# the markers below get the 'scaled' features, everything else gets 'imputed'.
scaled_name_markers = ('Logistic', 'SVM')

model_data_mapping = {}
for model_name in tuned_models:
    wants_scaled = any(marker in model_name for marker in scaled_name_markers)
    model_data_mapping[model_name] = 'scaled' if wants_scaled else 'imputed'

print(f"Model data mapping: {model_data_mapping}")

# Score every tuned model on the test split, then print the summary table.
results_df = evaluate_models(tuned_models, data_dict['X_test_dict'], data_dict['y_test'], model_data_mapping)
print_evaluation_summary(results_df)


## 4. Save Best Model


In [None]:
# Find and save the best of the tuned models, judged on the test split.
# How "best" is chosen is decided inside train.save_best_model.
best_result = save_best_model(tuned_models, data_dict['X_test_dict'], data_dict['y_test'], model_data_mapping)
best_model_name, best_model = best_result


## 5. SHAP Explanations


In [None]:
# Explain the tuned models on the test features. Only the two model names
# checked below are covered; each section is skipped when that model was not
# trained.
shap_top_n = 15  # value passed as max_display to both SHAP calls

# Logistic regression: report the coefficients first, then SHAP on the scaled features.
if 'LogisticRegression' in tuned_models:
    lr_model = tuned_models['LogisticRegression']
    analyze_logistic_regression_coefficients(lr_model, feature_names)
    explain_model_with_shap(lr_model, data_dict['X_test_dict']['scaled'], feature_names, 'LogisticRegression', max_display=shap_top_n)

# XGBoost: SHAP on the imputed features.
if 'XGBoost' in tuned_models:
    xgb_model = tuned_models['XGBoost']
    explain_model_with_shap(xgb_model, data_dict['X_test_dict']['imputed'], feature_names, 'XGBoost', max_display=shap_top_n)
