# Test Data Preprocessing Pipeline

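This notebook exercises the data preprocessing pipeline end to end: it loads and preprocesses the dataset, inspects the target variable distribution, runs the train/test split, and verifies that the known leakage features are excluded from the training features.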

In [None]:
import sys
import os

# Add the current directory to path to import our modules
sys.path.append(os.getcwd())

from analgesia.prediction_of_insufficient_pain_management.data_preprocessing import PainManagementDataProcessor, load_and_preprocess_data
import pandas as pd
import numpy as np

In [None]:
# Test the preprocessing pipeline
# The dataset location can be overridden with the PAIN_DATA_PATH environment
# variable so the notebook is not tied to one machine's directory layout.
# When the variable is unset, the original hard-coded location is used.
DEFAULT_DATA_PATH = '/Users/jk1/Library/CloudStorage/OneDrive-unige.ch/icu_research/prehospital/analgesia/data/trauma_categories_Rega Pain Study15.09.2025_v2.xlsx'
data_path = os.environ.get('PAIN_DATA_PATH', DEFAULT_DATA_PATH)

print("Testing data preprocessing pipeline...")
# load_and_preprocess_data returns the processed frame and the processor
# object; the processor is reused by the train/test split cell below.
processed_data, processor = load_and_preprocess_data(data_path)

print(f"\nProcessed data shape: {processed_data.shape}")
print(f"Columns: {list(processed_data.columns)}")

In [None]:
# Report the class balance of the target column, first as raw counts and
# then as percentages of the counted rows. Skipped if the column is absent.
if 'insufficient_pain_mgmt' in processed_data.columns:
    target_dist = processed_data['insufficient_pain_mgmt'].value_counts()
    target_pct = target_dist / target_dist.sum() * 100

    print("Target variable distribution:")
    print(target_dist)
    print(f"\nPercentages:")
    print(target_pct)

In [None]:
# Test train/test split
# Only the split call is guarded. On failure the message is printed and the
# exception is re-raised, so the notebook stops at the real cause instead of
# continuing and failing later with a NameError on X_train.
try:
    X_train, X_test, y_train, y_test = processor.prepare_modeling_data()
except Exception as e:
    print(f"Error in train/test split: {e}")
    raise

# Report the size of each partition and the target counts within it.
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Training target distribution: {y_train.value_counts().to_dict()}")
print(f"Test target distribution: {y_test.value_counts().to_dict()}")

In [None]:
# Verify that none of the columns listed in leakage_features are present
# in the training feature matrix.
print("Checking for data leakage...")
leakage_features = ['VAS_on_arrival', 'VAS_change', 'VAS_improved']
train_columns = X_train.columns
found_leakage = list(filter(lambda name: name in train_columns, leakage_features))

if not found_leakage:
    print(f"✅ GOOD: All leakage features excluded - no data leakage")
    print(f"   Excluded: {leakage_features}")
else:
    print(f"❌ WARNING: Found leakage features in training set: {found_leakage}")

# Show what the model will actually be trained on.
print(f"\nFeatures in training set: {list(X_train.columns)}")
print(f"Number of features: {X_train.shape[1]}")

# Correlate the target with each excluded feature that is still present in
# the processed frame (should still be high for VAS_on_arrival).
print(f"\nTarget correlations with excluded features:")
for feat in leakage_features:
    if feat not in processed_data.columns:
        continue
    # 2x2 correlation matrix; the off-diagonal entry is target vs. feature.
    pair_corr = processed_data[['insufficient_pain_mgmt', feat]].corr()
    corr = pair_corr.iloc[0, 1]
    print(f"  {feat}: {corr:.4f}")

print("\n(High correlations with excluded features confirm they would cause data leakage)")

In [None]:
# Print the summary statistics that DataFrame.describe() produces for the
# processed frame.
print("Basic statistics of processed features:")
summary_stats = processed_data.describe()
print(summary_stats)