# Table 1: Characteristics of Training and Test Datasets

This notebook creates a comprehensive Table 1 summarizing the characteristics of the training and test datasets used for the XGBoost insufficient pain management prediction model.

In [None]:
import sys
import pandas as pd
import numpy as np
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Add the parent directory to the path
sys.path.append('/Users/jk1/icu_research/PreHosp')

from analgesia.prediction_of_insufficient_pain_management.data_preprocessing import load_and_preprocess_data

In [None]:
# Run the project preprocessing pipeline on the study workbook.
print("📊 Loading and preprocessing data for Table 1 analysis...")
data_path = '/Users/jk1/Library/CloudStorage/OneDrive-unige.ch/icu_research/prehospital/analgesia/data/trauma_categories_Rega Pain Study15.09.2025_v2.xlsx'
processed_data, processor = load_and_preprocess_data(data_path)

# Feature matrices and targets for the train/test partitions used in modeling.
X_train, X_test, y_train, y_test = processor.prepare_modeling_data()

n_train_rows, n_model_features = X_train.shape[0], X_train.shape[1]
print("✅ Data loaded:")
print(f"   Training set: {n_train_rows} samples")
print(f"   Test set: {X_test.shape[0]} samples")
print(f"   Total features: {n_model_features}")


def _attach_target(features, target, split_name):
    """Return a copy of `features` with the target column and a split label added."""
    labelled = features.copy()
    labelled['insufficient_pain_mgmt'] = target
    labelled['dataset'] = split_name
    return labelled


# One labelled frame per split, plus their concatenation for overall statistics.
train_data = _attach_target(X_train, y_train, 'Training')
test_data = _attach_target(X_test, y_test, 'Test')
all_data = pd.concat([train_data, test_data], ignore_index=True)

print("\n📋 Target distribution:")
for split_name, target in (('Training', y_train), ('Test', y_test)):
    print(f"   {split_name} - Adequate: {(target == 0).sum()}, Insufficient: {(target == 1).sum()}")

In [None]:
# Load original data to get additional clinical variables not in the model features
original_data = pd.read_excel(data_path)

# Create the insufficient pain management target for original data
original_data['VAS_change'] = original_data['VAS_on_arrival'] - original_data['VAS_on_scene']
original_data['VAS_improved'] = original_data['VAS_change'] < 0
insufficient_flag = (
    (original_data['VAS_on_arrival'] >= 4) |
    (~original_data['VAS_improved'] & (original_data['VAS_on_scene'] >= 4))
)

# Comparisons against NaN evaluate to False, so the boolean flag above is never
# missing on its own. Mask it explicitly wherever a VAS measurement is absent,
# otherwise the "missing target" filter below is a no-op and incomplete cases
# are silently labelled.
# NOTE(review): assumes the preprocessing pipeline also requires both VAS values;
# the "Matches processed data" check printed below will reveal a mismatch.
vas_complete = original_data[['VAS_on_scene', 'VAS_on_arrival']].notna().all(axis=1)
original_data['insufficient_pain_mgmt'] = insufficient_flag.astype(float).where(vas_complete)

# Remove cases with missing target
original_data = original_data[original_data['insufficient_pain_mgmt'].notna()]

print(f"📋 Original dataset: {len(original_data)} cases with complete target information")
print(f"   Matches processed data: {len(original_data) == len(all_data)}")

In [None]:
# Define helper functions for creating Table 1 statistics

def get_continuous_stats(data, column, dataset=None):
    """Summarise a continuous column as "mean ± SD".

    Args:
        data: DataFrame holding the column; it must contain a 'dataset'
            column when `dataset` is given.
        column: Name of the column to summarise.
        dataset: Optional value of the 'dataset' column to restrict to.

    Returns:
        "N/A" when the column is absent or has no non-missing values;
        otherwise "mean ± SD" with one decimal each, followed by
        " (n=<observed>, missing=<missing>)" when any values are missing.
    """
    frame = data[data['dataset'] == dataset] if dataset else data

    if column not in frame.columns:
        return "N/A"

    observed = frame[column].dropna()
    n_observed = len(observed)
    if n_observed == 0:
        return "N/A"

    summary = f"{observed.mean():.1f} ± {observed.std():.1f}"
    n_missing = len(frame) - n_observed
    if n_missing > 0:
        return f"{summary} (n={n_observed}, missing={n_missing})"
    return summary

def get_categorical_stats(data, column, category=None, dataset=None):
    """Summarise a categorical/indicator column as "n (%)".

    Args:
        data: DataFrame holding the column; it must contain a 'dataset'
            column when `dataset` is given.
        column: Name of the column to summarise.
        category: Optional value, or list/tuple of values, to count. When
            omitted, numeric/boolean columns are treated as indicators and
            summed; any other column counts its non-missing entries.
        dataset: Optional value of the 'dataset' column to restrict to.

    Returns:
        "N/A" when the column is absent or the selected subset is empty;
        otherwise "<count> (<percentage>%)" with the percentage taken over
        all rows of the subset (missing values included in the denominator).
    """
    subset = data[data['dataset'] == dataset] if dataset else data

    if column not in subset.columns:
        return "N/A"

    total_n = len(subset)
    if total_n == 0:
        # Avoid a 0/0 percentage ("nan%") for an empty subset.
        return "N/A"

    values = subset[column]

    if category is not None:
        # Specific category (or any of several categories)
        if isinstance(category, (list, tuple)):
            count = values.isin(category).sum()
        else:
            count = (values == category).sum()
    elif pd.api.types.is_bool_dtype(values) or pd.api.types.is_numeric_dtype(values):
        # Indicator column: sum the 0/1 (or False/True) values. The dtype helpers
        # also recognise uint8/int32/nullable dtypes, which a fixed list of
        # dtype names would miss and then report as 100% positive.
        count = values.sum()
    else:
        count = values.notna().sum()

    # Show whole-number counts without a trailing ".0" (float indicator columns).
    if float(count).is_integer():
        count = int(count)

    percentage = (count / total_n) * 100
    return f"{count} ({percentage:.1f}%)"

def get_p_value(data, column, is_continuous=True):
    """Format the p-value of a Training-vs-Test comparison for one column.

    Continuous columns are compared with `scipy.stats.ttest_ind`; otherwise a
    2x2 table of positive/non-positive counts per split is passed to
    `scipy.stats.chi2_contingency`. Missing values are dropped per split first.

    Args:
        data: DataFrame with a 'dataset' column holding 'Training' / 'Test'.
        column: Name of the column to compare.
        is_continuous: Selects the t-test (True) or the chi-square test (False).

    Returns:
        "N/A" when the column is absent, either split has no observed values,
        the test raises, or the p-value is not finite; "<0.001" for very small
        p-values; otherwise the p-value with three decimals (p < 0.01) or two.
    """
    if column not in data.columns:
        return "N/A"

    train_vals = data[data['dataset'] == 'Training'][column].dropna()
    test_vals = data[data['dataset'] == 'Test'][column].dropna()

    if len(train_vals) == 0 or len(test_vals) == 0:
        return "N/A"

    def _positive_count(values):
        # Indicator columns are summed. The dtype helpers also cover
        # uint8/int32/nullable dtypes, which a fixed list of dtype names
        # would miss and then count every row as positive.
        if pd.api.types.is_bool_dtype(values) or pd.api.types.is_numeric_dtype(values):
            return float(values.sum())
        return float(len(values))

    try:
        if is_continuous:
            # Use t-test for continuous variables
            _, p_value = stats.ttest_ind(train_vals, test_vals)
        else:
            # Use chi-square test for categorical variables
            contingency_table = np.array([
                [_positive_count(vals), len(vals) - _positive_count(vals)]
                for vals in (train_vals, test_vals)
            ])
            _, p_value, _, _ = stats.chi2_contingency(contingency_table)
    except Exception:
        # Best effort: a failed test (e.g. a degenerate contingency table or
        # non-numeric data) is reported as unavailable rather than aborting the
        # table build. Unlike a bare `except`, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return "N/A"

    if not np.isfinite(p_value):
        # e.g. a t-test on single-observation groups yields NaN
        return "N/A"

    if p_value < 0.001:
        return "<0.001"
    if p_value < 0.01:
        return f"{p_value:.3f}"
    return f"{p_value:.2f}"

print("✅ Helper functions defined")

In [None]:
# Create Table 1 structure
table1_data = []

# Dataset characteristics
table1_data.append({
    'Characteristic': 'Dataset Size',
    'Overall': f"{len(all_data)}",
    'Training': f"{len(train_data)}",
    'Test': f"{len(test_data)}",
    'P-value': "N/A"
})

# Target variable distribution
table1_data.append({
    'Characteristic': 'Insufficient Pain Management',
    'Overall': get_categorical_stats(all_data, 'insufficient_pain_mgmt'),
    'Training': get_categorical_stats(train_data, 'insufficient_pain_mgmt'),
    'Test': get_categorical_stats(test_data, 'insufficient_pain_mgmt'),
    'P-value': get_p_value(all_data, 'insufficient_pain_mgmt', is_continuous=False)
})

print("📊 Basic dataset characteristics added to table")

In [None]:
# Add vital signs (continuous variables) - using original unscaled data
vital_signs = {
    'Heart Rate (bpm)': 'HR',
    'Heart Rate at 5 min (bpm)': 'HR5', 
    'Oxygen Saturation (%)': 'SPO2',
    'Oxygen Saturation at 11 min (%)': 'SPO211',
    'Glasgow Coma Scale': 'GCS',
    'Glasgow Coma Scale at 7 min': 'GCS7',
    'VAS Pain Score at Scene': 'VAS_on_scene'
}

for label, column in vital_signs.items():
    table1_data.append({
        'Characteristic': label,
        'Overall': get_continuous_stats(orig_all_data, column),
        'Training': get_continuous_stats(orig_all_data, column, 'Training'),
        'Test': get_continuous_stats(orig_all_data, column, 'Test'),
        'P-value': get_p_value(orig_all_data, column, is_continuous=True)
    })

print("📈 Vital signs added to table (using original unscaled values)")

In [None]:
# Fix: Use original unscaled data for vital signs to show actual clinical values
# Create train/test splits from original data to match the modeling splits
from sklearn.model_selection import train_test_split

# Get the same train/test split as used in modeling (using same random state)
original_X = original_data.drop(['insufficient_pain_mgmt'], axis=1)
original_y = original_data['insufficient_pain_mgmt']

# Split with same parameters as in preprocessing
orig_X_train, orig_X_test, orig_y_train, orig_y_test = train_test_split(
    original_X, original_y, test_size=0.2, random_state=42, stratify=original_y
)

# Create original data with dataset labels
orig_train_data = orig_X_train.copy()
orig_train_data['insufficient_pain_mgmt'] = orig_y_train
orig_train_data['dataset'] = 'Training'

orig_test_data = orig_X_test.copy()
orig_test_data['insufficient_pain_mgmt'] = orig_y_test
orig_test_data['dataset'] = 'Test'

# Combine for overall statistics
orig_all_data = pd.concat([orig_train_data, orig_test_data], ignore_index=True)

print("✅ Original unscaled data prepared for vital signs display")

In [None]:
# Rebuild Table 1 with correct data - reset and start fresh
table1_data = []

# Dataset characteristics (using processed data for counts)
table1_data.append({
    'Characteristic': 'Dataset Size',
    'Overall': f"{len(all_data)}",
    'Training': f"{len(train_data)}",
    'Test': f"{len(test_data)}",
    'P-value': "N/A"
})

# Target variable distribution (using processed data)
table1_data.append({
    'Characteristic': 'Insufficient Pain Management',
    'Overall': get_categorical_stats(all_data, 'insufficient_pain_mgmt'),
    'Training': get_categorical_stats(train_data, 'insufficient_pain_mgmt'),
    'Test': get_categorical_stats(test_data, 'insufficient_pain_mgmt'),
    'P-value': get_p_value(all_data, 'insufficient_pain_mgmt', is_continuous=False)
})

print("🔄 Table 1 data reset and basic characteristics re-added")

In [None]:
# One-hot vital-sign category indicators (processed feature columns) to report,
# keyed by the label shown in the table.
categorical_vitals = {
    'Normal Heart Rate': 'HR_category_Normal',
    'Tachycardia': 'HR_category_Tachycardia',
    'Severe Tachycardia': 'HR_category_Severe_Tachycardia',
    'Normal Oxygen Saturation': 'SPO2_category_Normal',
    'Severe Hypoxia': 'SPO2_category_Severe_Hypoxia'
}

for label, column in categorical_vitals.items():
    # n (%) overall and per split, then the Training-vs-Test chi-square p-value.
    row = {'Characteristic': label}
    row['Overall'] = get_categorical_stats(all_data, column)
    row['Training'] = get_categorical_stats(all_data, column, dataset='Training')
    row['Test'] = get_categorical_stats(all_data, column, dataset='Test')
    row['P-value'] = get_p_value(all_data, column, is_continuous=False)
    table1_data.append(row)

print("📊 Categorical vital signs added to table")

In [None]:
# Demographic / clinical indicator columns (processed feature columns) to report,
# keyed by the label shown in the table.
demographics = {
    'Female Gender': 'Geschlecht_Weiblich',
    'Unknown Gender': 'Geschlecht_Unbekannt',
    'No Resuscitation Performed': 'Ist Reanimation durchgeführt_Nein'
}

for label, column in demographics.items():
    # n (%) overall and per split, then the Training-vs-Test chi-square p-value.
    row = {'Characteristic': label}
    row['Overall'] = get_categorical_stats(all_data, column)
    row['Training'] = get_categorical_stats(all_data, column, dataset='Training')
    row['Test'] = get_categorical_stats(all_data, column, dataset='Test')
    row['P-value'] = get_p_value(all_data, column, is_continuous=False)
    table1_data.append(row)

print("👥 Demographics added to table")

In [None]:
# Collapse all 'Thoraxdrainage_*' indicator columns into one
# "any chest tube drainage" flag and report it as a single row.
chest_tube_columns = [name for name in all_data.columns if name.startswith('Thoraxdrainage_')]
if chest_tube_columns:
    # The flag is True when the row sum over the indicator columns is positive.
    for frame in (all_data, train_data, test_data):
        frame['any_chest_tube'] = frame[chest_tube_columns].sum(axis=1) > 0

    chest_tube_row = {'Characteristic': 'Any Chest Tube Drainage'}
    chest_tube_row['Overall'] = get_categorical_stats(all_data, 'any_chest_tube')
    chest_tube_row['Training'] = get_categorical_stats(all_data, 'any_chest_tube', dataset='Training')
    chest_tube_row['Test'] = get_categorical_stats(all_data, 'any_chest_tube', dataset='Test')
    chest_tube_row['P-value'] = get_p_value(all_data, 'any_chest_tube', is_continuous=False)
    table1_data.append(chest_tube_row)

print(f"🏥 Medical interventions added to table ({len(chest_tube_columns)} chest tube variables summarized)")

In [None]:
# Assemble the collected rows into the final Table 1 frame.
table1_df = pd.DataFrame(table1_data)

# Title banner
banner = "=" * 80
print("\n" + banner)
print("📋 TABLE 1: Characteristics of Training and Test Datasets")
print("   XGBoost Model for Insufficient Pain Management Prediction")
print(banner)

# Show every column without width limits, capping each cell at 30 characters.
# These are global pandas display options.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', 30),
):
    pd.set_option(option_name, option_value)

display(table1_df)

n_total, n_train, n_test = len(all_data), len(train_data), len(test_data)
print("\n📊 Summary Statistics:")
print(f"   • Total cases: {n_total:,}")
print(f"   • Training set: {n_train:,} ({n_train/n_total*100:.1f}%)")
print(f"   • Test set: {n_test:,} ({n_test/n_total*100:.1f}%)")
print(f"   • Features used in model: {X_train.shape[1]}")
print(f"   • Insufficient pain management rate: {all_data['insufficient_pain_mgmt'].mean()*100:.1f}%")

In [None]:
# Save the table to CSV for further use
output_dir = "/Users/jk1/icu_research/PreHosp/analgesia/prediction_of_insufficient_pain_management"
csv_path = f"{output_dir}/table1_model_datasets.csv"
table1_df.to_csv(csv_path, index=False)
print(f"\n💾 Table 1 saved to: {csv_path}")

# Also save with formatting for publication.
# The N in each header is derived from the actual frames instead of being
# hard-coded, so the headers cannot drift from the data when the input changes.
formatted_table = table1_df.copy()
formatted_table.columns = [
    'Variable',
    f"Overall (N={len(all_data):,})",
    f"Training (N={len(train_data):,})",
    f"Test (N={len(test_data):,})",
    'P-value',
]

publication_path = f"{output_dir}/table1_formatted.csv"
formatted_table.to_csv(publication_path, index=False)
print(f"📄 Formatted table saved to: {publication_path}")

In [None]:
# Show just the vital signs rows to verify the correction
print("🔍 CORRECTED VITAL SIGNS VALUES:")
print("="*50)
vital_signs_rows = table1_df[table1_df['Characteristic'].str.contains('Rate|Saturation|Coma|VAS', na=False)]

# Only labels containing one of these substrings are printed; e.g.
# 'Heart Rate at 5 min (bpm)' does not contain 'Rate (bpm)' and is skipped.
shown_keywords = ['Rate (bpm)', 'Saturation (%)', 'Coma Scale', 'VAS']
n_shown = 0
for _, row in vital_signs_rows.iterrows():
    if any(keyword in row['Characteristic'] for keyword in shown_keywords):
        print(f"{row['Characteristic']:35}: {row['Overall']}")
        n_shown += 1

# Only claim success when there was something to verify; an empty listing means
# the vital-sign rows never made it into the table.
if n_shown:
    print("\n✅ Values now show actual clinical measurements instead of scaled values!")
else:
    print("\n⚠️ No vital sign rows found in Table 1 - nothing to verify.")

In [None]:
# Create additional summary statistics for the paper
print("\n" + "="*60)
print("📈 ADDITIONAL SUMMARY STATISTICS")
print("="*60)

# Overall dataset characteristics
print("\n🏥 Dataset Composition:")
print(f"   • Total eligible cases: {len(all_data):,}")
print(f"   • Training/Test split: {len(train_data):,} / {len(test_data):,} ({len(train_data)/len(all_data)*100:.0f}%/{len(test_data)/len(all_data)*100:.0f}%)")
print(f"   • Random state: 42 (for reproducibility)")

# Target variable balance
# Counts are cast to int so a float-typed target does not print as "1,234.0".
print("\n🎯 Target Variable Distribution:")
overall_insufficient = int(all_data['insufficient_pain_mgmt'].sum())
train_insufficient = int(train_data['insufficient_pain_mgmt'].sum())
test_insufficient = int(test_data['insufficient_pain_mgmt'].sum())

print(f"   • Overall insufficient pain management: {overall_insufficient:,} ({overall_insufficient/len(all_data)*100:.1f}%)")
print(f"   • Training set: {train_insufficient:,} ({train_insufficient/len(train_data)*100:.1f}%)")
print(f"   • Test set: {test_insufficient:,} ({test_insufficient/len(test_data)*100:.1f}%)")

# Key clinical variables summary
# Read from the original unscaled frame (`orig_all_data`), consistent with the
# vital-sign rows of Table 1; this notebook treats the processed `all_data`
# feature values as scaled, so they are not in clinical units.
print("\n📊 Key Clinical Variables (Overall):")
clinical_summaries = [
    # (column, label, number format, unit suffix)
    ('VAS_on_scene', 'VAS at scene', '.1f', ''),
    ('GCS', 'Glasgow Coma Scale', '.1f', ''),
    ('HR', 'Heart Rate', '.0f', ' bpm'),
    ('SPO2', 'Oxygen Saturation', '.1f', '%'),
]
for column, label, number_format, unit in clinical_summaries:
    if column in orig_all_data.columns:
        mean_val = orig_all_data[column].mean()
        std_val = orig_all_data[column].std()
        print(f"   • {label}: {mean_val:{number_format}} ± {std_val:{number_format}}{unit}")

print("\n✅ Table 1 analysis completed successfully!")