## Arteriovenous lactate difference

Paired arterial and venous lactate measurements must be taken within 2 h of each other.

In [None]:
import pandas as pd
import os
from scipy.stats import pearsonr
from statsmodels.miscmodels.ordinal_model import OrderedModel
import seaborn as sns
import matplotlib.pyplot as plt
from utils import create_registry_case_identification_column, create_ehr_case_identification_column, patient_selection
from utils import load_data_from_main_dir
from lab_preprocessing import preprocess_labs
from outcome_preprocessing import preprocess_outcomes


In [None]:
# Absolute, machine-specific input locations; adjust when running elsewhere.
# EDS extraction file (read as a ';'-separated CSV when loaded into eds_df)
eds_path = '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20221117/eds_j1.csv'
# Directory of the EHR extraction; the lab files are loaded from here
ehr_data_path = '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20221117/'
# Stroke registry workbook (Excel)
registry_path = '/Users/jk1/Library/CloudStorage/OneDrive-unige.ch/stroke_research/geneva_stroke_unit_dataset/data/stroke_registry/post_hoc_modified/stroke_registry_post_hoc_modified.xlsx'

In [None]:
# Maximum gap, in hours, between an arterial and a venous sample for the two
# to be treated as a concomitant pair.
max_time_delta_h = 2

In [None]:
# Load both sources with every column kept as text (dtype=str).
eds_df = pd.read_csv(eds_path, sep=';', encoding='utf-8', dtype=str)
registry_df = pd.read_excel(registry_path, dtype=str)


In [None]:
# Add the shared 'case_admission_id' key to both frames using the project
# helpers from utils (one helper per data source).
registry_df['case_admission_id'] = create_registry_case_identification_column(registry_df)
eds_df['case_admission_id'] = create_ehr_case_identification_column(eds_df)

In [None]:
# Preview the first EDS rows
eds_df.head()

In [None]:
# Run the project's cohort selection helper with all three exclusion flags on.
# The helper returns two objects, unpacked here as the included registry rows
# and the excluded patients.
inclusion_registry_df, excluded_patients_df = patient_selection(
    registry_path=registry_path,
    eds_path=eds_path,
    exclude_patients_under_18=True,
    exclude_non_ischemic_stroke=True,
    exclude_non_acute_stroke=True,
    verbose=True
)

In [None]:
# Keep only EDS rows whose admission passed patient selection. Dropping the
# rows (instead of merely blanking the id on excluded rows) is what makes the
# patient count printed below describe the selected cohort.
eds_df = eds_df[eds_df['case_admission_id'].isin(inclusion_registry_df['case_admission_id'])]
print(f'Number of patients in EDS after selection: {eds_df.patient_id.nunique()}')

In [None]:
# Number of distinct admissions in the full (unfiltered) registry
registry_df.case_admission_id.nunique()

In [None]:
# Load the lab data from the EHR extraction directory.
# NOTE(review): presumably the helper reads every file whose name starts with
# `lab_file_start` — confirm in utils.load_data_from_main_dir.
lab_file_start = 'labo'
lab_df = load_data_from_main_dir(ehr_data_path, lab_file_start)

In [None]:
# Same admission key as on the EDS frame, so labs can be joined to the registry
lab_df['case_admission_id'] = create_ehr_case_identification_column(lab_df)

In [None]:
# Run the project's lab preprocessing restricted to the "lactate" variable
preprocessed_lactate_df = preprocess_labs(lab_df, ["lactate"])

In [None]:
# Restrict lactate measurements to admissions that passed patient selection
is_included_case = preprocessed_lactate_df['case_admission_id'].isin(
    inclusion_registry_df['case_admission_id']
)
preprocessed_lactate_df = preprocessed_lactate_df[is_included_case]

In [None]:
# Preview the lactate rows kept after cohort filtering
preprocessed_lactate_df.head()

In [None]:
# List the sample material labels present; the AV analysis further down
# selects 'sga' as arterial and 'sgv'/'sgvm' as venous.
preprocessed_lactate_df.original_material_label.unique()

In [None]:
# Histogram of all lactate values (100 bins) as a quick distribution check
preprocessed_lactate_df.value.hist(bins=100)

In [None]:
# Preview the registry rows of the selected cohort
inclusion_registry_df.head()

In [None]:
# Count missing stroke-onset and arrival timestamps (shown as a 2-tuple);
# these two columns are combined into the reference time T0 afterwards.
inclusion_registry_df['stroke_dt'].isna().sum(), inclusion_registry_df['arrival_dt'].isna().sum()

In [None]:
# Reference time T0: stroke onset, falling back to arrival when onset is missing.
inclusion_registry_df['T0'] = inclusion_registry_df['stroke_dt'].fillna(inclusion_registry_df['arrival_dt'])

# Keep one T0 per admission: a repeated case_admission_id in the lookup would
# multiply the lactate rows in the left merge below.
t0_lookup_df = inclusion_registry_df[['case_admission_id', 'T0']].drop_duplicates(
    subset='case_admission_id', keep='first'
)

# Drop any T0 left over from a previous run of this cell so that re-running
# does not produce T0_x / T0_y columns.
preprocessed_lactate_df = preprocessed_lactate_df.drop(columns='T0', errors='ignore').merge(
    t0_lookup_df,
    on='case_admission_id',
    how='left'
)

In [None]:
dt_format = '%d.%m.%Y %H:%M'
sample_dt = pd.to_datetime(preprocessed_lactate_df['sample_date'], format=dt_format)
t0_dt = pd.to_datetime(preprocessed_lactate_df['T0'], format=dt_format)
# Hours elapsed from T0 to sampling (negative when the sample precedes T0)
preprocessed_lactate_df['relative_sample_date'] = (sample_dt - t0_dt).dt.total_seconds() / 3600


In [None]:
# Inspect the raw sample_date values (parsed above with '%d.%m.%Y %H:%M')
preprocessed_lactate_df['sample_date'].values

In [None]:
import numpy as np
# Hour bin of each sample: relative time rounded down to the whole hour
preprocessed_lactate_df['relative_sample_date_hcat'] = np.floor(
    preprocessed_lactate_df['relative_sample_date']
)

In [None]:
# Show reference time, sampling time and their difference in hours side by side
preprocessed_lactate_df[['T0', 'sample_date', 'relative_sample_date']]

In [None]:

# sns.set(style="whitegrid")
# plt.figure(figsize=(10, 6))
# ax = sns.lineplot(x='relative_sample_date_hcat', y='value', data=preprocessed_lactate_df)

# ax.set_xlim(-24, 7*24)

In [None]:
# Number of distinct admissions with at least one lactate row
preprocessed_lactate_df.case_admission_id.nunique()

In [None]:
# Count distinct admissions with a lactate sample in each time window
# (both bounds exclusive; the "first 24h" window starts 12 h before T0).
rel_hours = preprocessed_lactate_df.relative_sample_date
in_first_24h = (rel_hours > -12) & (rel_hours < 24)
in_24_to_72h = (rel_hours > 24) & (rel_hours < 3*24)

n_patients_with_lactate_in_first_24h = preprocessed_lactate_df.loc[in_first_24h, 'case_admission_id'].nunique()
n_patients_with_lactate_in_24_to_72h = preprocessed_lactate_df.loc[in_24_to_72h, 'case_admission_id'].nunique()

print(f'Number of patients with lactate in first 24h: {n_patients_with_lactate_in_first_24h}')
print(f'Number of patients with lactate in 24 to 72h: {n_patients_with_lactate_in_24_to_72h}')

In [None]:
# Outcomes for the selected cohort, one row per admission (first row wins)
outcome_df = preprocess_outcomes(registry_path)
included_case_ids = inclusion_registry_df.case_admission_id.unique()
outcome_df = outcome_df[outcome_df.case_admission_id.isin(included_case_ids)]
outcome_df = outcome_df.drop_duplicates(subset='case_admission_id', keep='first')

In [None]:
# Attach the '3M mRS' outcome column to every lactate row; the left join keeps
# lactate rows of admissions without an outcome (their '3M mRS' becomes NaN).
preprocessed_lactate_df = preprocessed_lactate_df.merge(
    outcome_df[['case_admission_id', '3M mRS']],
    on='case_admission_id',
    how='left'
)

In [None]:
# Preview the lactate rows now that the outcome column has been merged in
preprocessed_lactate_df.head()

In [None]:
# Subsets of lactate samples by time since T0, in hours (bounds exclusive).
# Upper bounds are multiples of 24 h so each frame covers the day(s) in its name.
rel_h = preprocessed_lactate_df.relative_sample_date
early_lactate_df = preprocessed_lactate_df[(rel_h > -12) & (rel_h < 24)]      # -12 h to 24 h
lactate_d2_df = preprocessed_lactate_df[(rel_h > 24) & (rel_h < 2*24)]        # 24 h to 48 h
lactate_d3_df = preprocessed_lactate_df[(rel_h > 2*24) & (rel_h < 3*24)]      # 48 h to 72 h
lactate_d_2_3_df = preprocessed_lactate_df[(rel_h > 1*24) & (rel_h < 3*24)]   # 24 h to 72 h

## Arterio-Venous Lactate Difference Analysis

Analyzing the arterio-venous (AV) difference in lactate between concomitant arterial (sga) and venous (sgv, sgvm) samples within 2 hours for each patient.

In [None]:
# Split lactate rows by sample material: 'sga' -> arterial, 'sgv'/'sgvm' -> venous
material_label = preprocessed_lactate_df['original_material_label']
arterial_df = preprocessed_lactate_df[material_label == 'sga'].copy()
venous_df = preprocessed_lactate_df[material_label.isin(['sgv', 'sgvm'])].copy()

n_arterial_patients = arterial_df['case_admission_id'].nunique()
n_venous_patients = venous_df['case_admission_id'].nunique()

print(f"Arterial samples: {len(arterial_df)}")
print(f"Venous samples: {len(venous_df)}")
print(f"Patients with arterial samples: {n_arterial_patients}")
print(f"Patients with venous samples: {n_venous_patients}")

In [None]:
# Function to find concomitant arterial-venous pairs
def find_av_pairs(arterial_df, venous_df, max_time_delta_h=2):
    """
    Find concomitant arterial-venous lactate pairs within max_time_delta_h hours.

    Every arterial sample is compared with every venous sample sharing the same
    ``case_admission_id``. Each combination whose absolute time gap is at most
    ``max_time_delta_h`` hours produces one output row, so a single sample can
    take part in several pairs.

    Parameters
    ----------
    arterial_df, venous_df : pd.DataFrame
        Must contain ``case_admission_id``, ``sample_date`` (text in
        ``%d.%m.%Y %H:%M`` format) and ``value``. ``arterial_df`` must also
        contain ``relative_sample_date`` and ``3M mRS``, which are copied from
        the arterial sample into each pair. Neither frame is modified.
    max_time_delta_h : float, default 2
        Largest allowed gap (inclusive) between the two samples, in hours.

    Returns
    -------
    pd.DataFrame
        One row per pair with columns ``case_admission_id``, ``arterial_value``,
        ``venous_value``, ``av_difference`` (arterial - venous),
        ``arterial_sample_date``, ``venous_sample_date``, ``time_diff_h``,
        ``relative_sample_date`` and ``3M mRS``. When no pair is found the
        frame is empty but still has these columns.
    """
    dt_format = '%d.%m.%Y %H:%M'
    pair_columns = [
        'case_admission_id',
        'arterial_value',
        'venous_value',
        'av_difference',
        'arterial_sample_date',
        'venous_sample_date',
        'time_diff_h',
        'relative_sample_date',
        '3M mRS',
    ]

    # assign() returns new frames, so the parsed timestamps never leak into
    # the caller's DataFrames.
    arterial = arterial_df.assign(
        sample_datetime=pd.to_datetime(arterial_df['sample_date'], format=dt_format)
    )
    venous = venous_df.assign(
        sample_datetime=pd.to_datetime(venous_df['sample_date'], format=dt_format)
    )

    # Split the venous samples by patient once instead of rescanning the whole
    # frame for every patient.
    venous_by_patient = {
        patient_id: group
        for patient_id, group in venous.groupby('case_admission_id', sort=False)
    }

    av_pairs = []
    # sort=False keeps patients in order of first appearance.
    for patient_id, patient_arterial in arterial.groupby('case_admission_id', sort=False):
        patient_venous = venous_by_patient.get(patient_id)
        if patient_venous is None:
            continue  # no venous sample for this patient

        for _, art_row in patient_arterial.iterrows():
            for _, ven_row in patient_venous.iterrows():
                time_diff_h = abs((art_row['sample_datetime'] - ven_row['sample_datetime']).total_seconds() / 3600)

                # A missing timestamp gives NaN here, which fails the
                # comparison, so such combinations are skipped.
                if time_diff_h <= max_time_delta_h:
                    av_pairs.append({
                        'case_admission_id': patient_id,
                        'arterial_value': art_row['value'],
                        'venous_value': ven_row['value'],
                        'av_difference': art_row['value'] - ven_row['value'],
                        'arterial_sample_date': art_row['sample_date'],
                        'venous_sample_date': ven_row['sample_date'],
                        'time_diff_h': time_diff_h,
                        'relative_sample_date': art_row['relative_sample_date'],
                        '3M mRS': art_row['3M mRS']
                    })

    # Explicit columns keep the schema stable even when no pair was found.
    return pd.DataFrame(av_pairs, columns=pair_columns)

# Find all AV pairs, using the notebook-level time window (max_time_delta_h),
# then report the number of pairs and of distinct admissions contributing them.
av_pairs_df = find_av_pairs(arterial_df, venous_df, max_time_delta_h=max_time_delta_h)
print(f"Total AV pairs found: {len(av_pairs_df)}")
print(f"Patients with AV pairs: {av_pairs_df['case_admission_id'].nunique()}")

In [None]:
# Display the first 10 AV pairs as a sanity check
av_pairs_df.head(10)

In [None]:
# Dichotomise the 3-month mRS: 0-2 is "good", above 2 is "poor", missing is "unknown"
def _outcome_group_label(mrs):
    """Return the outcome group label for a single 3-month mRS value."""
    if not pd.notna(mrs):
        return 'Unknown'
    if mrs <= 2:
        return 'Good (mRS 0-2)'
    return 'Poor (mRS >2)'

av_pairs_df['outcome_group'] = av_pairs_df['3M mRS'].apply(_outcome_group_label)

# Categorize by day (0, 1, 2)
def categorize_day(relative_hours):
    """
    Map hours since the reference time to a day label.

    Returns 'Pre-stroke' for negative values, 'Day 0' for [0, 24), 'Day 1' for
    [24, 48), 'Day 2' for [48, 72) and 'Day 3+' from 72 hours onwards. A
    missing value (NaN/None) returns 'Unknown'; without that guard it would
    fail every comparison and be labelled 'Day 3+'.
    """
    if pd.isna(relative_hours):
        return 'Unknown'
    if relative_hours < 0:
        return 'Pre-stroke'
    elif relative_hours < 24:
        return 'Day 0'
    elif relative_hours < 48:
        return 'Day 1'
    elif relative_hours < 72:
        return 'Day 2'
    else:
        return 'Day 3+'

# Day label of each pair, based on the relative time stored with the pair
# (find_av_pairs copies it from the arterial sample).
av_pairs_df['day_category'] = av_pairs_df['relative_sample_date'].apply(categorize_day)

# Display distribution: rows are days, columns are outcome groups, and
# combinations that never occur are shown as 0.
print("\nDistribution of AV pairs by day and outcome:")
print(av_pairs_df.groupby(['day_category', 'outcome_group']).size().unstack(fill_value=0))

In [None]:
# Summary statistics for AV difference by day and outcome.
# Pairs with an unknown outcome are dropped first; each (name, aggregator)
# tuple below becomes one output column, rounded to 3 decimals.
summary_stats = av_pairs_df[av_pairs_df['outcome_group'] != 'Unknown'].groupby(['day_category', 'outcome_group'])['av_difference'].agg([
    ('n', 'count'),
    ('mean', 'mean'),
    ('std', 'std'),
    ('median', 'median'),
    ('q25', lambda x: x.quantile(0.25)),  # lower quartile
    ('q75', lambda x: x.quantile(0.75)),  # upper quartile
    ('min', 'min'),
    ('max', 'max')
]).round(3)

print("\nSummary statistics of AV lactate difference (arterial - venous) by day and outcome:")
print(summary_stats)

### Visualization: AV Lactate Difference by Day and Outcome

In [None]:
# Plot data: pairs from Day 0-2 with a known outcome
on_plotted_day = av_pairs_df['day_category'].isin(['Day 0', 'Day 1', 'Day 2'])
has_known_outcome = av_pairs_df['outcome_group'] != 'Unknown'
plot_df = av_pairs_df[on_plotted_day & has_known_outcome].copy()

# Order of the days on the x axis
day_order = ['Day 0', 'Day 1', 'Day 2']

# One colour per outcome group
outcome_colors = {
    'Good (mRS 0-2)': '#27ae60',  # Green
    'Poor (mRS >2)': '#e74c3c'    # Red
}

# Two panels side by side: box plot on the left, violin plot on the right
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
box_ax, violin_ax = axes

sns.boxplot(x='day_category', y='av_difference', hue='outcome_group',
            data=plot_df, ax=box_ax, order=day_order,
            palette=outcome_colors, showfliers=False)
box_ax.set_title('AV Lactate Difference by Day and Outcome',
                 fontsize=14, fontweight='bold')

sns.violinplot(x='day_category', y='av_difference', hue='outcome_group',
               data=plot_df, ax=violin_ax, order=day_order,
               palette=outcome_colors, split=False, inner='quartile')
violin_ax.set_title('AV Lactate Difference Distribution by Day and Outcome',
                    fontsize=14, fontweight='bold')

# Axis labels, legend, zero reference line and grid are the same on both panels
for panel_ax in (box_ax, violin_ax):
    panel_ax.set_xlabel('Day', fontsize=12, fontweight='bold')
    panel_ax.set_ylabel('AV Lactate Difference (mmol/L)\n(Arterial - Venous)', fontsize=12, fontweight='bold')
    panel_ax.legend(title='Outcome', title_fontsize=11, fontsize=10)
    panel_ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5, linewidth=1)
    panel_ax.grid(True, alpha=0.3, linestyle='--')

plt.tight_layout()
plt.show()

n_good_pairs = (plot_df['outcome_group'] == 'Good (mRS 0-2)').sum()
n_poor_pairs = (plot_df['outcome_group'] == 'Poor (mRS >2)').sum()

print(f"\nTotal pairs plotted: {len(plot_df)}")
print(f"Good outcome pairs: {n_good_pairs}")
print(f"Poor outcome pairs: {n_poor_pairs}")

In [None]:
# Line plot showing mean AV difference over days with error bars (SEM)
mean_av_by_day = plot_df.groupby(['day_category', 'outcome_group'])['av_difference'].agg(['mean', 'std', 'sem', 'count']).reset_index()

# Fixed x axis: one position per plotted day
line_plot_days = ['Day 0', 'Day 1', 'Day 2']
x_pos = list(range(len(line_plot_days)))

fig, ax = plt.subplots(figsize=(10, 6))

for outcome in ['Good (mRS 0-2)', 'Poor (mRS >2)']:
    # Align this outcome's rows to the fixed day axis. A day without any pair
    # for this outcome becomes NaN (a gap in the line) rather than making the
    # data shorter than x_pos, which would raise a shape mismatch.
    data = (
        mean_av_by_day[mean_av_by_day['outcome_group'] == outcome]
        .set_index('day_category')
        .reindex(line_plot_days)
    )

    ax.errorbar(x_pos, data['mean'], yerr=data['sem'],
                marker='o', linewidth=2, markersize=8, capsize=5,
                label=outcome, color=outcome_colors[outcome])

ax.set_xticks(x_pos)
ax.set_xticklabels(line_plot_days)
ax.set_xlabel('Day', fontsize=12, fontweight='bold')
ax.set_ylabel('Mean AV Lactate Difference (mmol/L)\n(Arterial - Venous)', fontsize=12, fontweight='bold')
ax.set_title('Mean AV Lactate Difference Over Time by Outcome', fontsize=14, fontweight='bold')
ax.legend(title='Outcome', title_fontsize=11, fontsize=10)
ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5, linewidth=1)
ax.grid(True, alpha=0.3, linestyle='--')

plt.tight_layout()
plt.show()

### Statistical Testing: AV Difference Between Outcome Groups

In [None]:
from scipy.stats import mannwhitneyu, ttest_ind

# Compare the AV difference between good- and poor-outcome pairs, one day at a
# time, with a non-parametric and a parametric two-sample test.
# NOTE(review): every pair is treated as an independent observation, although
# one admission can contribute several pairs — confirm this is acceptable.
print("Statistical comparison of AV difference between outcome groups:\n")
print("="*80)

for day in ['Day 0', 'Day 1', 'Day 2']:
    day_data = plot_df[plot_df['day_category'] == day]
    good_outcome = day_data[day_data['outcome_group'] == 'Good (mRS 0-2)']['av_difference'].dropna()
    poor_outcome = day_data[day_data['outcome_group'] == 'Poor (mRS >2)']['av_difference'].dropna()
    
    # Both tests need at least one observation in each group
    if len(good_outcome) > 0 and len(poor_outcome) > 0:
        # Mann-Whitney U test (non-parametric)
        u_stat, p_value_mw = mannwhitneyu(good_outcome, poor_outcome, alternative='two-sided')
        
        # t-test (parametric); called with scipy's default settings
        t_stat, p_value_t = ttest_ind(good_outcome, poor_outcome)
        
        print(f"\n{day}:")
        print(f"  Good outcome (n={len(good_outcome)}): mean={good_outcome.mean():.3f}, median={good_outcome.median():.3f}, std={good_outcome.std():.3f}")
        print(f"  Poor outcome (n={len(poor_outcome)}): mean={poor_outcome.mean():.3f}, median={poor_outcome.median():.3f}, std={poor_outcome.std():.3f}")
        print(f"  Mann-Whitney U test: U={u_stat:.1f}, p={p_value_mw:.4f}")
        print(f"  t-test: t={t_stat:.3f}, p={p_value_t:.4f}")
    else:
        print(f"\n{day}: Insufficient data for comparison")

print("\n" + "="*80)

### Summary of Findings

**Key Results:**

1. **Sample Identification**: Out of 4,930 arterial and 2,326 venous lactate samples, we identified **183 concomitant arterio-venous (AV) pairs** within 2 hours from **114 patients**.

2. **AV Difference Pattern**: 
   - The arterio-venous lactate difference is **predominantly negative** (arterial < venous) across all time periods
   - Mean AV differences range from -0.14 to -0.49 mmol/L across days and outcome groups
   
3. **Temporal Trends**:
   - **Good outcome group (mRS 0-2)**: AV difference trends from -0.37 mmol/L (Day 0) toward 0 mmol/L (Day 2), suggesting normalization
   - **Poor outcome group (mRS >2)**: AV difference trends from -0.18 mmol/L (Day 0) to -1.0 mmol/L (Day 2), though Day 2 has limited data (n=1)
   
4. **Outcome Comparison**:
   - **Day 0**: No significant difference between outcome groups (p=0.667, Mann-Whitney U test)
   - **Day 1**: No significant difference between outcome groups (p=0.791)
   - **Day 2**: Limited data preclude meaningful comparison (n=5 vs n=1)
   
5. **Clinical Interpretation**:
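   - The negative AV difference (arterial < venous) is described here as **unexpected**, on the assumption that arterial lactate is typically higher than venous in physiological conditions. This assumption should be verified against the literature: if peripheral venous lactate is normally slightly higher than arterial, a negative difference is the expected finding rather than an anomaly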
   - Possible explanations: sampling timing, local tissue metabolism, measurement variability
   - No clear discriminatory value for outcome prediction based on AV lactate difference in the first 48 hours

In [None]:
# Per day and outcome group: number of pairs and patients, mean ± SD of the
# arterial value, venous value and AV difference, and median sampling gap.
breakdown_days = ['Day 0', 'Day 1', 'Day 2']
breakdown_outcomes = ['Good (mRS 0-2)', 'Poor (mRS >2)']

print("Detailed breakdown by day and outcome:")
print("="*80)

for day in breakdown_days:
    print(f"\n{day}:")
    day_data = plot_df[plot_df['day_category'] == day]

    for outcome in breakdown_outcomes:
        outcome_data = day_data[day_data['outcome_group'] == outcome]
        if len(outcome_data) == 0:
            continue  # nothing to report for this combination

        arterial_values = outcome_data['arterial_value']
        venous_values = outcome_data['venous_value']
        av_differences = outcome_data['av_difference']

        print(f"\n  {outcome}:")
        print(f"    N pairs: {len(outcome_data)}")
        print(f"    N patients: {outcome_data['case_admission_id'].nunique()}")
        print(f"    Arterial lactate: {arterial_values.mean():.2f} ± {arterial_values.std():.2f} mmol/L")
        print(f"    Venous lactate: {venous_values.mean():.2f} ± {venous_values.std():.2f} mmol/L")
        print(f"    AV difference: {av_differences.mean():.2f} ± {av_differences.std():.2f} mmol/L")
        print(f"    Median time between samples: {outcome_data['time_diff_h'].median():.2f} hours")

print("\n" + "="*80)

### Analysis Notes

**Methodology:**
- Concomitant samples defined as arterial (sga) and venous (sgv/sgvm) samples within 2 hours
- AV difference calculated as: Arterial lactate - Venous lactate
- Outcome groups: Good (mRS 0-2) vs Poor (mRS >2) at 3 months
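- Time periods: Day 0 (0-24h), Day 1 (24-48h), Day 2 (48-72h), measured from stroke onset or, when the onset time is missing, from hospital arrival; samples taken before this reference time are labelled "Pre-stroke" and are not plotted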

**Data Quality:**
- Good sample sizes for Day 0 (n=97 pairs)
- Moderate sample sizes for Day 1 (n=16 pairs)  
- Limited data for Day 2 (n=6 pairs) - results should be interpreted cautiously
- Median time between arterial and venous samples: 1.2-1.7 hours (well within 2h threshold)

**Future Directions:**
- Investigate the unexpected negative AV difference (arterial < venous)
- Consider analyzing absolute lactate values separately for arterial vs venous
- Explore whether specific sampling conditions explain the reversed gradient
- Increase sample size for Day 2-3 analysis if additional data becomes available