In [None]:
import pandas as pd

In [None]:
import os

# Input locations. The defaults are the original absolute paths; set the environment
# variables below to run the notebook on another machine without editing this cell.
#   MIMIC_EXTRACTION_DIR       - directory holding lab_df.csv, mortality_df.csv, admission_df.csv
#   MIMIC_ADMISSION_NOTES_PATH - full path to the admission notes workbook
extraction_dir = os.environ.get(
    'MIMIC_EXTRACTION_DIR',
    '/Users/jk1/temp/preprocessed_mimic_data/extraction',
)
data_path = os.path.join(extraction_dir, 'lab_df.csv')
outcome_path = os.path.join(extraction_dir, 'mortality_df.csv')
admission_data_path = os.path.join(extraction_dir, 'admission_df.csv')
admission_notes_data_path = os.environ.get(
    'MIMIC_ADMISSION_NOTES_PATH',
    '/Users/jk1/Library/CloudStorage/OneDrive-unige.ch/stroke_research/geneva_stroke_unit_dataset/data/mimic_data/combined_notes_labels_v2.xlsx',
)
# Passed through to preprocess_admission as its `verbose` argument.
verbose = True

In [None]:
# Load the lab table and the outcome table from their CSV files.
lab_df, outcome_df = (pd.read_csv(csv_path) for csv_path in (data_path, outcome_path))


In [None]:
from mimic_admission_preprocessing import preprocess_admission

# Build the admission table from the notes workbook and the admission extract, then add
# a per-stay key of the form '<hadm_id>_<icustay_id>'.
admission_data_df = preprocess_admission(
    admission_notes_data_path,
    admission_data_path,
    verbose=verbose,
).assign(
    case_admission_id=lambda df: df['hadm_id'].astype(str) + '_' + df['icustay_id'].astype(str)
)

In [None]:
# Number of distinct case_admission_id values in the admission table.
admission_data_df['case_admission_id'].nunique()

In [None]:
# The keys present in the admission table define the cohort.
patient_selection = admission_data_df['case_admission_id'].unique()

# Give every lab row the same '<hadm_id>_<icustay_id>' key, then keep only cohort rows.
lab_df = lab_df.assign(
    case_admission_id=lambda df: df['hadm_id'].astype(str) + '_' + df['icustay_id'].astype(str)
)
lab_df = lab_df.loc[lab_df['case_admission_id'].isin(patient_selection)]

In [None]:
from lab_preprocessing import mimic_preprocess_labs
import numpy as np

# Run the lab preprocessing for lactate only.
lab_variables = ['lactate']
lactate_df = mimic_preprocess_labs(lab_df, selected_variables=lab_variables)

In [None]:
# Distinct entries of the valueuom column among the lactate rows.
lactate_df['valueuom'].unique()

In [None]:
# Display the frame returned by mimic_preprocess_labs for a visual check.
lactate_df

In [None]:
# Histogram of the `value` column over all lactate rows. Title and axis labels are set so
# the figure can be read on its own; the trailing ';' suppresses the return-value repr.
ax = lactate_df['value'].hist(bins=50)
ax.set(title='Distribution of lactate values', xlabel='Lactate value', ylabel='Number of samples');

In [None]:
# Display the admission table (including the derived case_admission_id) for a visual check.
admission_data_df

In [None]:
# Load the admission notes workbook as-is; the T0 reference time is derived from it later.
admission_notes_df = pd.read_excel(io=admission_notes_data_path)

In [None]:
# Per-stay key in the same '<hadm_id>_<icustay_id>' form used on the lab rows.
admission_notes_df['case_admission_id'] = admission_notes_df['hadm_id'].astype(str) + '_' + admission_notes_df['icustay_id'].astype(str)
# T0 is the reference time: the recorded stroke onset time, falling back to the admission
# time when the onset is missing or marked 'unknown'.
admission_notes_df['T0'] = admission_notes_df['stroke onset time'].replace('unknown', np.nan).fillna(admission_notes_df['admittime'])

# Exact duplicate (key, T0) rows in the notes table would multiply lab rows in the merge.
t0_lookup = admission_notes_df[['case_admission_id', 'T0']].drop_duplicates()
# Drop any T0 left over from a previous execution so that re-running this cell does not
# produce T0_x / T0_y columns (which would break the later lookup of 'T0').
lactate_df = lactate_df.drop(columns=['T0'], errors='ignore').merge(t0_lookup, on='case_admission_id', how='left')

In [None]:
# Display the lactate frame again after the T0 merge.
lactate_df

In [None]:
# Timestamps are formatted like '2159-11-26 08:41:00'.
dt_format = '%Y-%m-%d %H:%M:%S'
sample_time = pd.to_datetime(lactate_df['charttime'], format=dt_format)
reference_time = pd.to_datetime(lactate_df['T0'], format=dt_format)
# Hours elapsed from T0 to the sample; negative when the sample was charted before T0.
lactate_df['relative_sample_date'] = (sample_time - reference_time).dt.total_seconds() / 3600


In [None]:
# Hour bucket of each sample: the relative time in hours rounded down to a whole hour.
lactate_df['relative_sample_date_hcat'] = np.floor(lactate_df['relative_sample_date'])

In [None]:
# Count distinct stays with at least one lactate sample in each window (bounds are strict).
# The "first 24h" window also accepts samples charted up to 12 h before T0.
hours_from_t0 = lactate_df['relative_sample_date']
in_first_24h = (hours_from_t0 > -12) & (hours_from_t0 < 24)
in_24_to_72h = (hours_from_t0 > 24) & (hours_from_t0 < 3 * 24)

n_patients_with_lactate_in_first_24h = lactate_df.loc[in_first_24h, 'case_admission_id'].nunique()
n_patients_with_lactate_in_24_to_72h = lactate_df.loc[in_24_to_72h, 'case_admission_id'].nunique()

print(f'Number of patients with lactate in first 24h: {n_patients_with_lactate_in_first_24h}')
print(f'Number of patients with lactate in 24 to 72h: {n_patients_with_lactate_in_24_to_72h}')

In [None]:
# Per-stay key in the same '<hadm_id>_<icustay_id>' form used elsewhere in the notebook.
outcome_df['case_admission_id'] = outcome_df['hadm_id'].astype(str) + '_' + outcome_df['icustay_id'].astype(str)

# Parse each timestamp column once.
death_time = pd.to_datetime(outcome_df['dod'], format=dt_format)
discharge_time = pd.to_datetime(outcome_df['dischtime'], format=dt_format)
admission_time = pd.to_datetime(outcome_df['admittime'], format=dt_format)

# In-hospital death: date of death on or before discharge. A missing date of death
# compares as False and is therefore coded 0.
outcome_df['Death in hospital'] = (death_time <= discharge_time).astype(int)

# 3M Death: date of death on or before three calendar months after admission.
outcome_df['3m_date'] = admission_time + pd.DateOffset(months=3)
outcome_df['3M Death'] = (death_time <= outcome_df['3m_date']).astype(int)

In [None]:
# Display the outcome table, now including the derived 'Death in hospital' and '3M Death' columns.
outcome_df

In [None]:
# Attach the 3-month mortality label to every lactate row.
# Exact duplicate (key, label) rows in the outcome table would multiply lab rows in the merge.
outcome_lookup = outcome_df[['case_admission_id', '3M Death']].drop_duplicates()
# Drop any '3M Death' left over from a previous execution so that re-running this cell
# does not produce '3M Death_x' / '3M Death_y' columns.
lactate_df = lactate_df.drop(columns=['3M Death'], errors='ignore').merge(
    outcome_lookup,
    on='case_admission_id',
    how='left'
)

In [None]:
# Time windows in hours relative to T0. The upper bounds previously used multiples of 72
# (144 h / 216 h) where multiples of 24 were meant, so the day-2, day-3 and day-2-to-3
# windows overlapped and extended far beyond 72 h.
# Bounds are strict: a sample at exactly 24 h falls in no window, and a sample at exactly
# 48 h falls only in the combined day-2-to-3 window.
early_lactate_df = lactate_df[(lactate_df.relative_sample_date > -12) & (lactate_df.relative_sample_date < 24)]  # -12 h to 24 h
lactate_d2_df = lactate_df[(lactate_df.relative_sample_date > 24) & (lactate_df.relative_sample_date < 2*24)]  # 24 h to 48 h
lactate_d3_df = lactate_df[(lactate_df.relative_sample_date > 2*24) & (lactate_df.relative_sample_date < 3*24)]  # 48 h to 72 h
lactate_d_2_3_df = lactate_df[(lactate_df.relative_sample_date > 1*24) & (lactate_df.relative_sample_date < 3*24)]  # 24 h to 72 h

In [None]:
# Univariable logistic regression of 3M Death on the lactate value (early_lactate_df rows).
import statsmodels.api as sm

# Build the modelling frame in one chained expression. Assigning columns onto a frame
# derived from a filtered slice can raise SettingWithCopyWarning.
# NOTE(review): rows appear to be individual samples, so a stay with several measurements
# contributes several observations to the fit - confirm this is intended.
temp_df = (
    early_lactate_df
    .dropna(subset=['value', '3M Death'])
    .astype({'3M Death': int, 'value': float})  # binary outcome as int, predictor as float
)
X = sm.add_constant(temp_df['value'])
y = temp_df['3M Death']
model = sm.Logit(y, X)
result = model.fit(disp=0)
print(result.summary())


In [None]:
# Univariable logistic regression of 3M Death on the lactate value (lactate_d2_df rows).
# Build the modelling frame in one chained expression. Assigning columns onto a frame
# derived from a filtered slice can raise SettingWithCopyWarning.
temp_df = (
    lactate_d2_df
    .dropna(subset=['value', '3M Death'])
    .astype({'3M Death': int, 'value': float})  # binary outcome as int, predictor as float
)
X = sm.add_constant(temp_df['value'])
y = temp_df['3M Death']
model = sm.Logit(y, X)
result = model.fit(disp=0)
print(result.summary())

In [None]:
# Univariable logistic regression of 3M Death on the lactate value (lactate_d3_df rows).
# Build the modelling frame in one chained expression. Assigning columns onto a frame
# derived from a filtered slice can raise SettingWithCopyWarning.
temp_df = (
    lactate_d3_df
    .dropna(subset=['value', '3M Death'])
    .astype({'3M Death': int, 'value': float})  # binary outcome as int, predictor as float
)
X = sm.add_constant(temp_df['value'])
y = temp_df['3M Death']
model = sm.Logit(y, X)
result = model.fit(disp=0)
print(result.summary())

In [None]:
# Univariable logistic regression of 3M Death on the lactate value (lactate_d_2_3_df rows).
# Build the modelling frame in one chained expression. Assigning columns onto a frame
# derived from a filtered slice can raise SettingWithCopyWarning.
temp_df = (
    lactate_d_2_3_df
    .dropna(subset=['value', '3M Death'])
    .astype({'3M Death': int, 'value': float})  # binary outcome as int, predictor as float
)
X = sm.add_constant(temp_df['value'])
y = temp_df['3M Death']
model = sm.Logit(y, X)
result = model.fit(disp=0)
print(result.summary())