## Compute length of stay in intermediate/intensive care unit

Note: in the 2022 extractions, end_date is overwritten by passage_end_date.

In [None]:
import pandas as pd
import os
import numpy as np
from preprocessing.geneva_stroke_unit_preprocessing.utils import create_ehr_case_identification_column

In [None]:
# File-name prefix of the CSV exports to load from data_path
location_file_start = 'passages'
# Directory holding the extraction files (machine-specific absolute path)
data_path = (
    '/Users/jk1/stroke_datasets/stroke_unit_dataset'
    '/per_value/Extraction_20220815'
)

In [None]:
# Load every CSV in data_path whose name starts with location_file_start and stack
# them into a single frame. All columns are read as strings (dtype=str), so date
# columns are still unparsed text at this point.
# os.listdir returns names in arbitrary order; sorting them keeps the row order of
# the concatenated frame reproducible between runs and machines.
location_file_names = sorted(f for f in os.listdir(data_path)
                             if f.startswith(location_file_start))
location_files = [pd.read_csv(os.path.join(data_path, f), delimiter=';', encoding='utf-8', dtype=str)
                  for f in location_file_names]
location_df = pd.concat(location_files, ignore_index=True)
# Identifier used below to group the rows of one case/admission
# (built by a project helper; its exact composition is not visible here).
location_df['case_admission_id'] = create_ehr_case_identification_column(location_df)

In [None]:
# Preview the first five rows of the combined table
location_df.head(n=5)

In [None]:
# Number of rows per care unit code
location_df['care_unit'].value_counts()

In [None]:
# care_unit codes counted as intermediate care
intermediate_care_equivalents = ['2EL+-US', 'JUL033-US']
# care_unit codes counted as intensive care
intensive_care_equivalents = ['OPERASI-US', 'JULSI-US']
# Combined list: intermediate care codes first, then intensive care codes
imc_and_icu_equivalents = [*intermediate_care_equivalents, *intensive_care_equivalents]

In [None]:
# Rows whose care unit is one of the intermediate or intensive care codes
imc_and_icu_df = location_df.loc[location_df['care_unit'].isin(imc_and_icu_equivalents)]

In [None]:
# For every case_admission_id, find the chronologically last exit (end_date column) from
# an intermediate care unit or ICU.
# end_date is text in 'dd.mm.yyyy HH:MM' form, so sorting the raw strings orders rows by
# day-of-month first (e.g. '02.03.2022' sorts before '15.01.2022'). Sort on the parsed
# timestamps instead, but keep the original string as the reported value.
imc_icu_passages = location_df[location_df.care_unit.isin(imc_and_icu_equivalents)]
imc_icu_passages = imc_icu_passages.assign(
    _end_timestamp=pd.to_datetime(imc_icu_passages['end_date'], format='%d.%m.%Y %H:%M'))
# Missing end dates sort last (pandas default), so a case with a passage lacking an end
# date still gets a missing last exit date, exactly as before.
last_imc_icu_passages = (imc_icu_passages
                         .sort_values('_end_timestamp')
                         .groupby('case_admission_id')
                         .tail(1))
# One row per case with exactly two columns; cases without any IMC/ICU passage are absent
# here and therefore end up as NaN after a left merge.
last_exit_date_imc_and_icu_df = (last_imc_icu_passages[['case_admission_id', 'end_date']]
                                 .rename(columns={'end_date': 'last_exit_date_imc_and_icu'})
                                 .sort_values('case_admission_id')
                                 .reset_index(drop=True))

In [None]:
# Attach each case's last IMC/ICU exit date to all of its rows; the left join keeps
# cases that have no entry in last_exit_date_imc_and_icu_df
location_df = pd.merge(location_df, last_exit_date_imc_and_icu_df, how='left', on='case_admission_id')

In [None]:
datetime_format = '%d.%m.%Y %H:%M'
# Hours elapsed between eds_final_begin and the last exit from IMC/ICU
last_exit_timestamps = pd.to_datetime(location_df['last_exit_date_imc_and_icu'], format=datetime_format)
begin_timestamps = pd.to_datetime(location_df['eds_final_begin'], format=datetime_format)
seconds_per_hour = 60 * 60
location_df['length_of_continuous_care_stay'] = (
    (last_exit_timestamps - begin_timestamps).dt.total_seconds() / seconds_per_hour
)

In [None]:
# Reduce to one row per case_admission_id (the first row encountered is kept)
los_columns = ['case_admission_id', 'length_of_continuous_care_stay']
los_df = location_df[los_columns].drop_duplicates(subset='case_admission_id')

In [None]:
# Summary statistics of the length of stay (hours)
los_df['length_of_continuous_care_stay'].describe()

In [None]:
# Plot the distribution of the length of stay as a histogram with a KDE overlay
import seaborn as sns
import matplotlib.pyplot as plt

g = sns.displot(los_df.length_of_continuous_care_stay, kde=True, bins=250)
ax = g.ax

ax.set(xlabel='Length of stay in intermediate/intensive care unit (hours)', ylabel='Number of cases')
ax.set_xlim(0, 250)

# histogram bars: light blue, semi-transparent, thin black outline
for bar in ax.patches:
    bar.set_color('lightblue')
    bar.set_edgecolor('black')
    bar.set_linewidth(0.5)
    bar.set_alpha(0.5)

# KDE curve in purple
for kde_line in ax.lines:
    kde_line.set_color('purple')
    kde_line.set_linewidth(1.5)
    kde_line.set_alpha(0.5)

# show legend for KDE and histogram
# Handles are passed explicitly: a labels-only legend call pairs labels with artists
# purely by draw order, which attaches both labels to histogram bars instead of
# one to the KDE line and one to a bar.
legend_handles = []
legend_labels = []
if ax.lines:
    legend_handles.append(ax.lines[0])
    legend_labels.append('KDE')
if ax.patches:
    legend_handles.append(ax.patches[0])
    legend_labels.append('Histogram')
ax.legend(legend_handles, legend_labels)

# set figure size to 10x10
fig = plt.gcf()
fig.set_size_inches(10, 10)

plt.show()

In [None]:
# save figure
# fig.savefig('/Users/jk1/temp/length_of_stay_in_intermediate_intensive_care_unit.tif', dpi=600, bbox_inches='tight')