# Data Exploration for Insufficient Pain Management Prediction

This notebook explores the dataset to understand:
- Available variables and their types
- Target variable (VAS on arrival > 3) distribution
- Missing data patterns
- Potential predictive variables available in the prehospital setting
- Data quality issues

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Pandas display: show every column, cap printed rows at 100
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Plot defaults. The style is applied first so that the palette and the
# figure size configured afterwards are the values that remain in effect.
plt.style.use('default')
sns.set_palette("husl")
plt.rc('figure', figsize=(12, 8))

In [None]:
# Load the data
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run on the original author's machine until it is made configurable.
data_path = '/Users/jk1/Library/CloudStorage/OneDrive-unige.ch/icu_research/prehospital/analgesia/data/trauma_categories_Rega Pain Study15.09.2025_v2.xlsx'

try:
    # Read the Excel file
    df = pd.read_excel(data_path)
except Exception as e:
    print(f"Error loading data: {e}")
    # Tell apart "file missing" from "file present but unreadable"
    if Path(data_path).exists():
        print("File exists but cannot be read")
    else:
        print("File does not exist")
    # Re-raise: every later cell needs `df`. Swallowing the error here would
    # leave `df` undefined (or stale from a previous run of the kernel) and
    # move the failure to an unrelated cell further down.
    raise
else:
    print("Data loaded successfully!")
    print(f"Shape: {df.shape}")
    print(f"Columns: {len(df.columns)}")
    print(f"Rows: {len(df)}")

In [None]:
# Dataset overview: overall dimensions followed by a 1-based numbered
# listing of every column name
print("=== DATASET OVERVIEW ===")
print(f"Dataset shape: {df.shape}")
print("\nColumn names:")
for position, name in enumerate(df.columns, start=1):
    print(f"{position:2d}. {name}")

In [None]:
# Preview the first five records; as the last expression of the cell the
# resulting frame is what the notebook renders
print("=== FIRST 5 ROWS ===")
df.head(n=5)

In [None]:
# Data types and basic info
# DataFrame.info() prints its summary itself (per-column dtype and non-null
# count), so no surrounding print() or display() is needed.
print("=== DATA TYPES ===")
df.info()

In [None]:
# Find candidate columns for the target variable by name.
# The match is a case-insensitive substring test, so any column whose name
# merely contains "vas" or "pain" is listed as well.
print("=== VAS-RELATED COLUMNS ===")
vas_columns = []
for col in df.columns:
    lowered = col.lower()
    if 'vas' in lowered or 'pain' in lowered:
        vas_columns.append(col)

print(f"Found {len(vas_columns)} VAS/pain-related columns:")
for col in vas_columns:
    print(f"  - {col}")
    series = df[col]
    if series.empty:
        # Nothing to describe for a column without rows
        continue
    print(f"    Type: {series.dtype}, Non-null: {series.notna().sum()}/{len(df)}")
    # A min/max range is only reported for numeric columns
    if pd.api.types.is_numeric_dtype(series):
        print(f"    Range: {series.min():.2f} - {series.max():.2f}")
    print()

In [None]:
# Missing data analysis: per-column count and percentage of missing cells,
# ordered from the most to the least incomplete column
print("=== MISSING DATA ANALYSIS ===")
missing_data = df.isna().sum()
missing_pct = missing_data / len(df) * 100
missing_df = pd.DataFrame(
    {'Missing_Count': missing_data, 'Missing_Percentage': missing_pct}
).sort_values(by='Missing_Percentage', ascending=False)

# Keep only the columns that actually have gaps
columns_with_missing = missing_df.loc[missing_df['Missing_Count'] > 0]
print(f"Columns with missing data: {len(columns_with_missing)}/{len(df.columns)}")
print("\nTop 20 columns with most missing data:")
print(columns_with_missing.head(20))

In [None]:
# Horizontal bar chart of the 20 most incomplete columns
# (columns_with_missing is already sorted by missing percentage, descending)
missing_top20 = columns_with_missing.head(20)
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x=missing_top20['Missing_Percentage'], y=missing_top20.index, ax=ax)
ax.set_title('Top 20 Columns with Missing Data')
ax.set_xlabel('Missing Percentage (%)')
fig.tight_layout()
plt.show()

In [None]:
# Flag columns that may be available in the prehospital setting, based purely
# on their names. This is a case-insensitive substring test, so short keywords
# such as 'hr', 'bp' or 'age' will also hit unrelated names that happen to
# contain them; treat the result as a candidate list to review by hand.
print("=== IDENTIFYING PREHOSPITAL VARIABLES ===")
prehospital_keywords = [
    'age', 'sex', 'gender', 'weight', 'height', 'bmi',
    'mechanism', 'injury', 'trauma', 'accident',
    'systolic', 'diastolic', 'bp', 'pressure', 'hr', 'heart_rate',
    'spo2', 'oxygen', 'saturation', 'resp', 'respiratory',
    'gcs', 'consciousness', 'alert', 'awake',
    'time', 'duration', 'transport', 'distance',
    'medication', 'drug', 'analgesic', 'morphine', 'fentanyl',
    'scene', 'location', 'weather', 'temperature'
]

# A column is kept once (in original column order) if any keyword matches
potential_prehospital = [
    column
    for column in df.columns
    if any(keyword in column.lower() for keyword in prehospital_keywords)
]

print(f"Found {len(potential_prehospital)} potential prehospital variables:")
for col in potential_prehospital:
    print(f"  - {col}")

In [None]:
# Descriptive statistics for every numeric column; the describe() table is
# the cell's displayed output
print("=== NUMERIC VARIABLES SUMMARY ===")
numeric_cols = df.select_dtypes(include='number').columns
print(f"Found {len(numeric_cols)} numeric columns")
df.loc[:, numeric_cols].describe()

In [None]:
# Summarise object/category columns: cardinality plus the value frequencies
print("=== CATEGORICAL VARIABLES ANALYSIS ===")
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
print(f"Found {len(categorical_cols)} categorical columns")

# Only the first 10 categorical columns are inspected to keep the output short
for col in categorical_cols[:10]:
    print(f"\n{col}:")
    n_unique = df[col].nunique()
    print(f"  Unique values: {n_unique}")
    frequencies = df[col].value_counts()
    if n_unique > 10:
        # High-cardinality column: show only the five most frequent values
        print("  Top 5 values:")
        print(frequencies.head().to_string())
    else:
        # Few categories: show the complete frequency table
        print("  Value counts:")
        print(frequencies.to_string())

In [None]:
# Define target variable: Insufficient pain management (VAS on arrival > 3)
print("=== TARGET VARIABLE DEFINITION ===")
print("Insufficient pain management defined as VAS_on_arrival > 3")

vas_arrival = df['VAS_on_arrival']

# Rows without a recorded VAS on arrival have no defined outcome
target_available = vas_arrival.notna()

# Check VAS_on_arrival distribution
print("\nVAS_on_arrival statistics:")
print(f"Non-null values: {target_available.sum()}/{len(df)} ({(target_available.sum()/len(df)*100):.1f}%)")
print(f"Mean: {vas_arrival.mean():.2f}")
print(f"Median: {vas_arrival.median():.2f}")
print(f"Std: {vas_arrival.std():.2f}")

# Create target variable.
# `NaN > 3` evaluates to False, so a plain `.astype(int)` would label every
# patient with a missing VAS as "adequate" (0). A nullable integer column
# keeps those rows as <NA>, so they cannot be mistaken for observed outcomes.
df['insufficient_pain_mgmt'] = (vas_arrival > 3).astype('Int64').where(target_available)

# Handle missing values in target (for now, exclude them)
print(f"\nCases with VAS_on_arrival data: {target_available.sum()}")

# Plain-int labels restricted to rows with an observed VAS; used for the
# counts and plots below
target_observed = (vas_arrival[target_available] > 3).astype(int)

# Target variable distribution.
# value_counts() orders by frequency and omits absent classes; reindexing
# pins the order to [0, 1] and fills a missing class with 0, so the lookups
# and pie labels below always refer to the intended class.
target_dist = target_observed.value_counts().reindex([0, 1], fill_value=0)
n_scored = target_dist.sum()
adequate_pct = target_dist[0] / n_scored * 100 if n_scored else float('nan')
insufficient_pct = target_dist[1] / n_scored * 100 if n_scored else float('nan')
print("\nTarget variable distribution (insufficient pain management):")
print(f"No (VAS ≤ 3): {target_dist[0]} ({adequate_pct:.1f}%)")
print(f"Yes (VAS > 3): {target_dist[1]} ({insufficient_pct:.1f}%)")

# VAS distribution visualization
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: raw VAS histogram
vas_arrival.hist(bins=11, edgecolor='black', alpha=0.7, ax=axes[0])
axes[0].set_title('VAS on Arrival Distribution')
axes[0].set_xlabel('VAS Score')
axes[0].set_ylabel('Frequency')

# Panel 2: class balance; counts are in [0, 1] order, matching the labels
target_counts = target_dist
axes[1].pie(target_counts.values, labels=['Adequate (≤3)', 'Insufficient (>3)'], autopct='%1.1f%%')
axes[1].set_title('Pain Management Adequacy')

# Panel 3: VAS by class, observed rows only, built from plain int/float data
boxplot_data = pd.DataFrame({
    'insufficient_pain_mgmt': target_observed,
    'VAS_on_arrival': vas_arrival[target_available],
})
sns.boxplot(x='insufficient_pain_mgmt', y='VAS_on_arrival', data=boxplot_data, ax=axes[2])
axes[2].set_xlabel('Insufficient Pain Management')
axes[2].set_ylabel('VAS on Arrival')
axes[2].set_title('VAS Distribution by Target Class')

fig.tight_layout()
plt.show()