# XGBoost Feature Importance Analysis

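This notebook trains an XGBoost model that predicts insufficient pain management in prehospital trauma care and identifies its most important predictors: features are ranked by importance, the top 15 are plotted, and importance is aggregated by clinical category.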

In [None]:
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score
warnings.filterwarnings('ignore')

# Add the parent directory to the path
sys.path.append('/Users/jk1/icu_research/PreHosp')

from analgesia.prediction_of_insufficient_pain_management.data_preprocessing import load_and_preprocess_data
from analgesia.prediction_of_insufficient_pain_management.ml_models import MLModelEvaluator

In [None]:
# Load and preprocess data
#
# The workbook location used to be fixed to one user's OneDrive folder, which
# made the notebook unusable on any other machine. It can now be overridden by
# setting the PREHOSP_DATA_PATH environment variable; when the variable is not
# set, the original path is used unchanged.
print("📊 Loading and preprocessing data...")
DEFAULT_DATA_PATH = '/Users/jk1/Library/CloudStorage/OneDrive-unige.ch/icu_research/prehospital/analgesia/data/trauma_categories_Rega Pain Study15.09.2025_v2.xlsx'
data_path = os.environ.get('PREHOSP_DATA_PATH', DEFAULT_DATA_PATH)

# load_and_preprocess_data returns the processed data plus the processor object;
# the processor then hands back the train/test split used by every later cell.
processed_data, processor = load_and_preprocess_data(data_path)
X_train, X_test, y_train, y_test = processor.prepare_modeling_data()

print(f"✅ Data loaded: {X_train.shape[0]} training samples, {X_train.shape[1]} features")

In [None]:
# Train XGBoost model
#
# MLModelEvaluator does the fitting. Once tune_model() returns, the fitted
# estimator and the chosen hyperparameters are read back from the evaluator's
# `models` and `best_params` dictionaries under the 'xgboost' key.
print("🔄 Training XGBoost model with hyperparameter tuning...")
evaluator = MLModelEvaluator()
evaluator.tune_model('xgboost', X_train, y_train)

xgb_model = evaluator.models['xgboost']
# An empty dict is used when no parameters were recorded for 'xgboost'.
best_params = evaluator.best_params.get('xgboost', {})

# Score the test split: hard labels feed accuracy, and the second
# predict_proba column feeds ROC-AUC.
# NOTE(review): column 1 is presumably the "insufficient pain management"
# class - confirm against the label encoding.
y_pred = xgb_model.predict(X_test)
class_probabilities = xgb_model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]
accuracy, roc_auc = accuracy_score(y_test, y_pred), roc_auc_score(y_test, y_prob)

summary_line = f"✅ Model trained - Accuracy: {accuracy:.3f}, ROC-AUC: {roc_auc:.3f}"
print(summary_line)
print(f"Best parameters: {best_params}")

In [None]:
# Build the ranked feature-importance table
#
# Each training column is paired with the importance score reported by the
# fitted model, and the rows are ordered from most to least important.
# The sort keeps the original row labels (the index is not reset).
feature_names = X_train.columns.tolist()
feature_importance = xgb_model.feature_importances_

importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importance}
)
importance_df = importance_df.sort_values('importance', ascending=False)

# Readable labels for the raw column names used in plots and printouts.
clinical_mapping = dict([
    ('VAS_on_scene', 'VAS Pain Score at Scene'),
    ('GCS', 'Glasgow Coma Scale'),
    ('HR', 'Heart Rate'),
    ('SPO2', 'Oxygen Saturation'),
    ('Bewusstseinlage', 'Consciousness Level'),
    ('HR5', 'Heart Rate (5 min)'),
    ('GCS7', 'Glasgow Coma Scale (7 min)'),
    ('SPO211', 'Oxygen Saturation (11 min)'),
    ('Lagerungen', 'Patient Positioning'),
    ('Abfahrtsort', 'Departure Location'),
    ('Geschlecht_Weiblich', 'Female Gender'),
    ('Geschlecht_Unbekannt', 'Unknown Gender'),
])


def _to_clinical_name(raw_name):
    """Return the display label for one raw feature name.

    Names listed in ``clinical_mapping`` use their mapped label. Any other name
    has the 'Thoraxdrainage_' prefix rewritten to 'Chest Tube: ' and every
    ', 0' fragment removed.
    """
    fallback_label = raw_name.replace('Thoraxdrainage_', 'Chest Tube: ').replace(', 0', '')
    if raw_name in clinical_mapping:
        return clinical_mapping[raw_name]
    return fallback_label


importance_df['clinical_name'] = importance_df['feature'].map(_to_clinical_name)

print("Top 10 Most Important Features:")
top_ten = importance_df[['clinical_name', 'importance']].head(10)
print(top_ten)

In [None]:
# Create feature importance visualization
#
# Horizontal bar chart of the 15 highest-ranked features, most important at the
# top, with each bar annotated with its importance value. The chart is written
# to 'xgboost_top_features.png' and then displayed.
fig, ax = plt.subplots(figsize=(12, 8))

# Top 15 features (importance_df is already sorted in descending order)
top_features = importance_df.head(15)
colors = plt.cm.viridis(np.linspace(0, 1, len(top_features)))

bar_positions = range(len(top_features))
bars = ax.barh(bar_positions, top_features['importance'], color=colors)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['clinical_name'])
ax.set_xlabel('Feature Importance', fontweight='bold', fontsize=12)
ax.set_title('Top 15 Most Important Predictors\nXGBoost Model for Insufficient Pain Management Prediction',
             fontweight='bold', fontsize=14, pad=20)
ax.grid(axis='x', alpha=0.3)
# barh draws the first row at the bottom; flip so rank 1 is on top.
ax.invert_yaxis()

# Add importance values just to the right of each bar
for bar, importance in zip(bars, top_features['importance']):
    ax.text(importance + 0.005, bar.get_y() + bar.get_height() / 2,
            f'{importance:.3f}', va='center', fontweight='bold')

fig.tight_layout()

# Save BEFORE showing: with the notebook's inline backend plt.show() finalises
# the figure, so a later plt.savefig() would write a new, empty canvas.
# Saving through the explicit figure handle also avoids relying on whichever
# figure pyplot currently treats as "current".
fig.savefig('xgboost_top_features.png', dpi=300, bbox_inches='tight')
plt.show()
print("\n📁 Figure saved as 'xgboost_top_features.png'")

In [None]:
# Feature importance by clinical category
#
# A feature belongs to a category when its raw column name contains any of the
# category's substrings; its importance is then added to that category's total.
# Matching is done per category, so a feature whose name matches substrings of
# several categories contributes to each of them.
categories = {
    'Vital Signs': ['HR', 'SPO2'],
    'Neurological': ['GCS', 'Bewusstseinlage'],
    'Pain Assessment': ['VAS_on_scene'],
    'Demographics': ['Geschlecht'],
    'Scene/Transport': ['Abfahrtsort', 'Lagerungen', 'Ist Reanimation'],
    'Medical Interventions': ['Thoraxdrainage']
}


def _category_total(name_fragments):
    """Sum the importance of every feature whose name contains one of the fragments.

    Rows are visited in the order of ``importance_df`` and each matching row is
    counted once, however many fragments it matches. Returns 0 when nothing matches.
    """
    running_total = 0
    for _, feature_row in importance_df.iterrows():
        if any(fragment in feature_row['feature'] for fragment in name_fragments):
            running_total += feature_row['importance']
    return running_total


category_importance = {
    label: _category_total(fragments) for label, fragments in categories.items()
}

# Horizontal bar chart of the per-category totals, sorted ascending.
fig, ax = plt.subplots(figsize=(10, 6))
cat_df = pd.DataFrame({
    'Category': list(category_importance.keys()),
    'Importance': list(category_importance.values()),
}).sort_values('Importance', ascending=True)

palette = plt.cm.Set3(np.linspace(0, 1, len(cat_df)))
bars = ax.barh(cat_df['Category'], cat_df['Importance'], color=palette)
ax.set_xlabel('Cumulative Feature Importance', fontweight='bold')
ax.set_title('Feature Importance by Clinical Category', fontweight='bold', pad=20)
ax.grid(axis='x', alpha=0.3)

# Write each total just past the end of its bar
for bar, total in zip(bars, cat_df['Importance']):
    label_y = bar.get_y() + bar.get_height() / 2
    ax.text(total + 0.01, label_y, f'{total:.3f}', va='center', fontweight='bold')

fig.tight_layout()
plt.show()

# Text summary, highest total first
print("\n🏥 Clinical Category Importance:")
ranked_categories = sorted(category_importance.items(), key=lambda item: item[1], reverse=True)
for category, importance in ranked_categories:
    print(f"   {category}: {importance:.4f}")

In [None]:
# Summary insights
#
# importance_df is sorted by importance but still carries its original,
# shuffled integer row labels. Indexing a Series built from it with `[2]` is a
# LABEL lookup (the row of the third column of X_train), not "the third-ranked
# feature". The index is therefore reset so label and rank position coincide,
# and positional access (`.iloc`) is used throughout.
ranked_importance = importance_df['importance'].reset_index(drop=True)
cumulative_importance = ranked_importance.cumsum()


def _features_needed(threshold):
    """Return how many top-ranked features are needed to reach `threshold`.

    Falls back to the total number of features when the cumulative importance
    never reaches the threshold (a bare argmax would wrongly report 1 then).
    """
    reached = (cumulative_importance >= threshold).to_numpy()
    if not reached.any():
        return len(reached)
    return int(reached.argmax()) + 1


# NOTE(review): the 0.8 / 0.9 thresholds assume the importances sum to ~1
# (XGBoost's sklearn wrapper is expected to normalise them) - confirm.
features_80 = _features_needed(0.8)
features_90 = _features_needed(0.9)

# Combined share of the (up to) three highest-ranked features.
top3_share = ranked_importance.iloc[:3].sum()
top_row = importance_df.iloc[0]

print("\n📊 KEY INSIGHTS FROM XGBoost FEATURE IMPORTANCE:")
print("="*60)
print(f"🎯 Most Important Predictor: {top_row['clinical_name']} ({top_row['importance']:.3f})")
print(f"📈 Top 3 predictors account for {top3_share:.1%} of total importance")
print(f"📉 {features_80} features capture 80% of predictive power")
print(f"📉 {features_90} features capture 90% of predictive power")
# NOTE(review): the implications below are fixed text, not derived from the
# values computed above - re-check them whenever the model or data changes.
print(f"\n🏥 Clinical Implications:")
print(f"   • Pain assessment at scene is the dominant predictor")
print(f"   • Neurological status (GCS, consciousness) is highly important")
print(f"   • Vital signs contribute significantly to prediction")
print(f"   • Model is clinically interpretable and actionable")
print(f"   • Focus should be on accurate initial pain assessment and neurological evaluation")