# Model Comparison and Evaluation

This notebook compares the logistic regression baseline with multiple machine learning models for predicting insufficient pain management.

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add the current directory to path to import our modules
sys.path.append(os.getcwd())

from analgesia.prediction_of_insufficient_pain_management.data_preprocessing import load_and_preprocess_data
from analgesia.prediction_of_insufficient_pain_management.logistic_regression_baseline import run_logistic_regression_baseline
from analgesia.prediction_of_insufficient_pain_management.ml_models import run_complete_ml_pipeline

# Plot defaults for the whole notebook. Order matters: resetting the style
# first, then applying the palette and the figure size on top of it.
plt.style.use('default')
sns.set_palette(palette="husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

print("All modules imported successfully!")

In [None]:
# Load and preprocess the data.
#
# The workbook location can be overridden with the ANALGESIA_DATA_PATH
# environment variable so the notebook runs on machines other than the one it
# was authored on. The original absolute path is kept as the default, so the
# behaviour is unchanged when the variable is not set.
DEFAULT_DATA_PATH = '/Users/jk1/Library/CloudStorage/OneDrive-unige.ch/icu_research/prehospital/analgesia/data/trauma_categories_Rega Pain Study15.09.2025_v2.xlsx'
data_path = os.environ.get('ANALGESIA_DATA_PATH', DEFAULT_DATA_PATH)

# Fail fast with an actionable message rather than an opaque reader error
# raised from deep inside the preprocessing module.
if not os.path.isfile(data_path):
    raise FileNotFoundError(
        f"Data file not found: {data_path!r}. "
        "Set the ANALGESIA_DATA_PATH environment variable to the location of the Excel workbook."
    )

print("Loading and preprocessing data...")
processed_data, processor = load_and_preprocess_data(data_path)

# NOTE(review): the feature count assumes exactly one non-feature (target)
# column in processed_data -- confirm against the preprocessing module.
print(f"Processed data shape: {processed_data.shape}")
print(f"Features: {processed_data.shape[1] - 1}")
print(f"Samples: {processed_data.shape[0]}")

In [None]:
# Split the preprocessed data into train and test partitions, then report the
# shape and class balance of each partition.
print("Preparing train/test splits...")
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = processor.prepare_modeling_data(**split_options)

for split_label, split_features in (("Training set", X_train), ("Test set", X_test)):
    print(f"{split_label}: {split_features.shape}")

for split_label, split_target in (
    ("Training target distribution", y_train),
    ("Test target distribution", y_test),
):
    print(f"{split_label}: {split_target.value_counts().to_dict()}")

In [None]:
# Fit and evaluate the logistic regression baseline (with hyperparameter
# tuning and 5-fold cross-validation) on the splits prepared above.
section_rule = "=" * 60
print(f"\n{section_rule}")
print("RUNNING LOGISTIC REGRESSION BASELINE")
print(section_rule)

baseline_options = dict(tune_hyperparams=True, cv_folds=5)
lr_baseline = run_logistic_regression_baseline(
    X_train, X_test, y_train, y_test, **baseline_options
)

In [None]:
# Train and evaluate the machine learning models (plus an ensemble) on the
# same splits used for the baseline.
section_rule = "=" * 60
print(f"\n{section_rule}")
print("RUNNING MACHINE LEARNING MODELS")
print(section_rule)

# Only a subset of the available models is run, to keep the cell fast.
models_to_run = ['random_forest', 'gradient_boosting', 'svm']

ml_pipeline_options = dict(
    models_to_run=models_to_run,
    create_ensemble=True,
    cv_folds=5,
)
ml_evaluator = run_complete_ml_pipeline(
    X_train, X_test, y_train, y_test, **ml_pipeline_options
)

In [None]:
# Build one comparison table holding the ML models and the logistic
# regression baseline, ordered from highest to lowest test ROC-AUC.
section_rule = "=" * 60
print(f"\n{section_rule}")
print("MODEL COMPARISON SUMMARY")
print(section_rule)

# Summary table produced by the ML evaluator
ml_summary = ml_evaluator.get_model_comparison_summary()
print("\nMachine Learning Models Performance:")
print(ml_summary.round(4).to_string(index=False))

# Baseline metrics: held-out evaluation plus cross-validation results
lr_results = lr_baseline.evaluation_results
lr_cv_results = lr_baseline.cv_results
lr_cv_auc = lr_cv_results['roc_auc']

# Summary-table column name -> key in the baseline's evaluation results
column_to_result_key = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1',
    'ROC-AUC': 'roc_auc',
}

lr_row = {'Model': 'logistic_regression'}
for column_name, result_key in column_to_result_key.items():
    lr_row[column_name] = lr_results[result_key]
lr_row['CV_AUC_Mean'] = lr_cv_auc['test_mean']
lr_row['CV_AUC_Std'] = lr_cv_auc['test_std']

# Append the baseline row and rank every model by test ROC-AUC
all_results = (
    pd.concat([ml_summary, pd.DataFrame([lr_row])], ignore_index=True)
    .sort_values('ROC-AUC', ascending=False)
)

print("\n\nAll Models Performance Comparison:")
print(all_results.round(4).to_string(index=False))

In [None]:
# Create comprehensive visualization: a 2x2 dashboard built from `all_results`
# (one row per model, sorted by test ROC-AUC in the comparison cell).
#
# Panels: (1) test ROC-AUC per model, (2) grouped bars of several metrics for
# the top models, (3) cross-validated ROC-AUC with +/- std error bars,
# (4) heatmap of min-max normalized metrics.

TOP_N_MODELS = 4  # number of best models shown in the grouped-bar panel


def _set_model_ticks(ax, names):
    """Put one rotated, right-aligned tick label per model at positions 0..n-1.

    Tick locations are fixed explicitly before the labels are set; calling
    set_xticklabels on its own triggers a matplotlib warning and can mislabel
    bars if the locator changes.
    """
    ax.set_xticks(np.arange(len(names)))
    ax.set_xticklabels(names, rotation=45, ha='right')


fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Model Performance Comparison', fontsize=16)

models = all_results['Model'].tolist()
positions = np.arange(len(models))
colors = plt.cm.Set3(np.linspace(0, 1, len(models)))

# 1. ROC-AUC Comparison
ax = axes[0, 0]
aucs = all_results['ROC-AUC']
bars = ax.bar(positions, aucs, color=colors)
ax.set_title('ROC-AUC Comparison')
ax.set_ylabel('ROC-AUC')
_set_model_ticks(ax, models)
ax.grid(True, alpha=0.3)

# Add value labels on bars
for bar, auc in zip(bars, aucs):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
            f'{auc:.3f}', ha='center', va='bottom', fontsize=10)

# 2. Multiple Metrics Comparison
ax = axes[0, 1]
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
x = np.arange(len(metrics))
width = 0.15

top_models = all_results.head(TOP_N_MODELS)
n_top = len(top_models)  # may be smaller than TOP_N_MODELS if few models ran
for i, row in enumerate(top_models.to_dict('records')):
    values = [row[metric] for metric in metrics]
    ax.bar(x + i*width, values, width, label=row['Model'], alpha=0.8)

ax.set_title(f'Multiple Metrics Comparison (Top {n_top} Models)')
ax.set_ylabel('Score')
# Centre each metric label under its group of bars, whatever the group size.
ax.set_xticks(x + width * (n_top - 1) / 2)
ax.set_xticklabels(metrics)
if n_top:
    ax.legend()
ax.grid(True, alpha=0.3)

# 3. Cross-validation AUC with error bars
ax = axes[1, 0]
cv_means = all_results['CV_AUC_Mean']
cv_stds = all_results['CV_AUC_Std']
bars = ax.bar(positions, cv_means, yerr=cv_stds, capsize=5, color=colors, alpha=0.7)
ax.set_title('Cross-Validation ROC-AUC (Mean ± Std)')
ax.set_ylabel('CV ROC-AUC')
_set_model_ticks(ax, models)
ax.grid(True, alpha=0.3)

# 4. Model Ranking
ax = axes[1, 1]
rank_data = all_results[['Model', 'ROC-AUC', 'F1-Score', 'Precision', 'Recall']]
rank_data_norm = rank_data.set_index('Model')

# Min-max normalize each metric across models. A metric that is identical for
# every model (or a table with a single model) has zero range; dividing by it
# would fill the column with NaN, so such columns get the neutral value 0.5.
metric_min = rank_data_norm.min()
metric_span = rank_data_norm.max() - metric_min
rank_data_norm = (rank_data_norm - metric_min) / metric_span.where(metric_span > 0)
rank_data_norm.loc[:, metric_span == 0] = 0.5

# Rows = metrics, columns = models. vmin/vmax pin the colour scale to the
# normalized range so that colours are comparable between runs.
heat_values = rank_data_norm.T.to_numpy(dtype=float)
im = ax.imshow(heat_values, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)
_set_model_ticks(ax, list(rank_data_norm.index))
ax.set_yticks(np.arange(len(rank_data_norm.columns)))
ax.set_yticklabels(rank_data_norm.columns)
ax.set_title('Normalized Performance Heatmap')
fig.colorbar(im, ax=ax, label='Min-max normalized score')

# Add text annotations (x = model column, y = metric row)
for (metric_idx, model_idx), value in np.ndenumerate(heat_values):
    ax.text(model_idx, metric_idx, f'{value:.2f}',
            ha="center", va="center", color="black", fontsize=8)

plt.tight_layout()
plt.show()

In [None]:
# Feature importance for the best-performing model (first row of
# `all_results`): prints the top 15 features and draws them as a horizontal
# bar chart. The same column is used for the table and for the plot.
print("\n" + "="*60)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*60)

# Get feature importance for best performing models
best_model_name = all_results.iloc[0]['Model']
print(f"\nBest performing model: {best_model_name}")

if best_model_name == 'logistic_regression':
    importance_df = lr_baseline.get_feature_importance()
    importance_col = 'coefficient'
    model_label = 'Logistic Regression'
else:
    # May be None when the model exposes no importances (the original cell
    # already guarded against this).
    importance_df = ml_evaluator.get_feature_importance(best_model_name)
    # Second column is the importance
    importance_col = importance_df.columns[1] if importance_df is not None else None
    model_label = best_model_name

if importance_df is None:
    # Say so explicitly instead of silently producing no output.
    print(f"\nNo feature importance available for {best_model_name}; skipping table and plot.")
else:
    top_features = importance_df.head(15)

    print(f"\nTop 15 most important features ({model_label}):")
    print(top_features[['feature', importance_col]].to_string(index=False))

    # Plot feature importance
    fig, ax = plt.subplots(figsize=(12, 8))
    bar_positions = range(len(top_features))
    ax.barh(bar_positions, top_features[importance_col])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features['feature'])
    ax.set_xlabel(f'Feature Importance ({importance_col})')
    ax.set_title(f'Top 15 Feature Importance - {best_model_name}')
    ax.invert_yaxis()  # first row of the table at the top of the chart
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

In [None]:
# Print a textual wrap-up: the top-ranked model, its gain over the logistic
# regression baseline, a plain-language reading of precision/recall, and
# suggested follow-up work.
report_rule = "=" * 80
print(f"\n{report_rule}")
print("FINAL RECOMMENDATIONS")
print(report_rule)

# all_results is ordered by test ROC-AUC, so the first row is the best model.
best_model = all_results.iloc[0]
print(f"\n1. BEST PERFORMING MODEL: {best_model['Model']}")
for metric_name in ('ROC-AUC', 'Precision', 'Recall', 'F1-Score'):
    print(f"   - {metric_name}: {best_model[metric_name]:.4f}")

# Absolute and relative ROC-AUC gain over the logistic regression row
is_baseline_row = all_results['Model'] == 'logistic_regression'
lr_performance = all_results.loc[is_baseline_row].iloc[0]
improvement = best_model['ROC-AUC'] - lr_performance['ROC-AUC']
improvement_pct = improvement / lr_performance['ROC-AUC'] * 100

print("\n2. COMPARISON WITH LOGISTIC REGRESSION BASELINE:")
print(f"   - Baseline ROC-AUC: {lr_performance['ROC-AUC']:.4f}")
print(f"   - Best model ROC-AUC: {best_model['ROC-AUC']:.4f}")
print(f"   - Improvement: {improvement:.4f} ({improvement_pct:.1f}%)")

# Gains above 0.02 AUC favour the best model, gains above 0.01 count as
# modest, anything else leaves the baseline as the competitive choice.
if improvement > 0.02:
    recommendation = f"Use {best_model['Model']} for better performance"
elif improvement > 0.01:
    recommendation = f"{best_model['Model']} shows modest improvement"
else:
    recommendation = "Logistic regression baseline is competitive"
print(f"   - Recommendation: {recommendation}")

print("\n3. CLINICAL INTERPRETATION:")
precision = best_model['Precision']
recall = best_model['Recall']
print(f"   - Precision ({precision:.3f}): Of patients predicted to have insufficient pain management,")
print(f"     {precision*100:.1f}% actually do")
print(f"   - Recall ({recall:.3f}): Of patients with insufficient pain management,")
print(f"     {recall*100:.1f}% are correctly identified")

print("\n4. NEXT STEPS:")
next_steps = (
    f"Validate the {best_model['Model']} on external dataset",
    "Consider feature engineering to improve performance",
    "Implement in clinical decision support system",
    "Monitor performance in real-world deployment",
)
for step in next_steps:
    print(f"   - {step}")

print(f"\n{report_rule}")