#### Admission data

Goal: Preprocess the admission data that was manually extracted from admission and discharge notes.


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Excel workbook holding the labels extracted from the clinical notes.
admission_data_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/mimic_data/combined_notes_labels_v2.xlsx'
# CSV file holding the admission table.
admission_table_path = '/Users/jk1/temp/mimic/extraction/admission_df.csv'

In [None]:
# Load both inputs into DataFrames: the note-derived labels (Excel) and the admission table (CSV).
admission_data_df = pd.read_excel(admission_data_path)
admission_table_df = pd.read_csv(admission_table_path)

In [None]:
# Preview the first rows of the admission table.
admission_table_df.head()

In [None]:
# List the distinct values of the admission_location column.
admission_table_df.admission_location.unique()

In [None]:
# List the distinct values of the gender column.
admission_table_df.gender.unique()

## Data extracted from notes

In [None]:
# Preview the first rows of the note-derived data.
admission_data_df.head()

In [None]:
# Print every column name wrapped in single quotes and followed by a comma,
# one per line, so the output can be copied into a Python list literal.
for column in admission_data_df.columns:
    quoted_name = f"'{column}',"
    print(quoted_name)

In [None]:
# Medication-related columns.
medication_columns = [
    'Antihypert. drugs pre-stroke',
    'Lipid lowering drugs pre-stroke',
    'Antiplatelet drugs',
    'Anticoagulants',
]

# Columns carrying the 'MedHist' prefix.
history_columns = [
    'MedHist Hypertension',
    'MedHist Diabetes',
    'MedHist Hyperlipidemia',
    'MedHist Smoking',
    'MedHist Atrial Fibr.',
    'MedHist CHD',
    'MedHist PAD',
    'MedHist cerebrovascular_event',
]

# Full list of medication / medical-history columns, medication columns first.
med_hist_columns = medication_columns + history_columns

In [None]:
# Show the distinct raw values of each medication / medical-history column
# to see how they are encoded in the spreadsheet.
for column in med_hist_columns:
    distinct_values = admission_data_df[column].unique()
    print(distinct_values)

In [None]:
# List the distinct values of the 'admitted to ICU for stroke' column.
admission_data_df['admitted to ICU for stroke'].unique()

In [None]:
# List the distinct values of the 'onset to ICU admission > 7d' column.
admission_data_df['onset to ICU admission > 7d'].unique()

# Preprocess timings

In [None]:
# Parse 'intime' with the explicit timestamp format; the result is only displayed, not stored.
pd.to_datetime(admission_data_df['intime'], format='%Y-%m-%d %H:%M:%S')

In [None]:
# Parse 'stroke onset time' after turning entries that match unk(nown|own)
# ('unknown' and the misspelling 'unkown') into NaN; the result is only displayed, not stored.
pd.to_datetime(admission_data_df['stroke onset time'].replace(to_replace=r"unk(nown|own)", value=np.nan, regex=True), format='%Y-%m-%d %H:%M:%S')

In [None]:
# Inspect 'stroke onset time' with the 'unknown' / 'unkown' entries replaced by NaN (display only).
admission_data_df['stroke onset time'].replace(to_replace=r"unk(nown|own)", value=np.nan, regex=True)

In [None]:
# Timestamp layout used when parsing the time columns; reused by later cells.
date_format = '%Y-%m-%d %H:%M:%S'

# Hospital admission timestamps.
admission_timestamps = pd.to_datetime(admission_data_df['admittime'], format=date_format)

# Stroke onset timestamps; entries matching unk(nown|own) ('unknown' / 'unkown')
# are turned into NaN first so that they parse to NaT.
onset_timestamps = pd.to_datetime(
    admission_data_df['stroke onset time'].replace(to_replace=r"unk(nown|own)", value=np.nan, regex=True),
    format=date_format)

# Delay from stroke onset to admission, in minutes (NaN when the onset is unknown).
admission_data_df['onset_to_admission_min'] = (
    admission_timestamps - onset_timestamps).dt.total_seconds() / 60

In [None]:
# Show the rows whose onset-to-admission delay exceeds 7 days (7 * 24 * 60 minutes).
admission_data_df[admission_data_df.onset_to_admission_min > (7 * 24 * 60)]

In [None]:
# Summary statistics of the onset-to-admission delay in minutes.
admission_data_df.onset_to_admission_min.describe()

In [None]:
# Bucket the onset-to-admission delay (minutes) into four ordered time windows.
# Rows with a missing delay stay NaN at this stage.
onset_to_admission_bins = [-float("inf"), 270, 540, 1440, float("inf")]
onset_to_admission_labels = ['<270min', '271-540min', '541-1440min', '>1440min']

admission_data_df['categorical_onset_to_admission_time'] = pd.cut(
    admission_data_df['onset_to_admission_min'],
    bins=onset_to_admission_bins,
    labels=onset_to_admission_labels,
)

In [None]:
# Summary (count, number of categories, most frequent category) of the binned delay.
admission_data_df['categorical_onset_to_admission_time'].describe()

In [None]:
# Rows without a computable onset-to-admission delay get an explicit
# 'onset_unknown' category instead of staying NaN.
# add_categories raises ValueError when the category already exists, so only
# add it when missing; this keeps the cell safe to re-run on its own.
if 'onset_unknown' not in admission_data_df['categorical_onset_to_admission_time'].cat.categories:
    admission_data_df['categorical_onset_to_admission_time'] = admission_data_df[
        'categorical_onset_to_admission_time'].cat.add_categories('onset_unknown')
admission_data_df.loc[admission_data_df.onset_to_admission_min.isna(), 'categorical_onset_to_admission_time'] = 'onset_unknown'

In [None]:
# Show the raw timestamps next to the derived delay and its category for a visual check.
admission_data_df[['admittime', 'stroke onset time', 'onset_to_admission_min', 'categorical_onset_to_admission_time']]

# Preprocess procedures

In [None]:
# Inspect the raw 'IVT time' column.
admission_data_df['IVT time']

In [None]:
# Any non-missing entry in 'IVT time' marks the patient as having received IVT.
admission_data_df['IVT'] = admission_data_df['IVT time'].notna()

# IVT timestamps; entries matching the regex "y" are turned into NaN before parsing
# (presumably a "yes, time not documented" marker - confirm against the spreadsheet).
ivt_timestamps = pd.to_datetime(
    admission_data_df['IVT time'].replace(to_replace=r"y", value=np.nan, regex=True),
    format=date_format)

# Stroke onset timestamps; 'unknown' / 'unkown' entries become NaN and parse to NaT.
onset_timestamps = pd.to_datetime(
    admission_data_df['stroke onset time'].replace(to_replace=r"unk(nown|own)", value=np.nan, regex=True),
    format=date_format)

# Delay from stroke onset to IVT, in minutes (NaN when either timestamp is missing).
admission_data_df['onset_to_IVT_min'] = (
    ivt_timestamps - onset_timestamps).dt.total_seconds() / 60

In [None]:
# Show the admission id, the raw timestamps and the derived onset-to-IVT delay for a visual check.
admission_data_df[['hadm_id', 'stroke onset time', 'IVT time', 'onset_to_IVT_min']]

In [None]:
# Bucket the onset-to-IVT delay (minutes) into four ordered time windows.
# Rows with a missing delay stay NaN at this stage.
ivt_bins = [-float("inf"), 90, 270, 540, float("inf")]
ivt_labels = ['<90min', '91-270min', '271-540min', '>540min']

admission_data_df['categorical_IVT'] = pd.cut(
    admission_data_df['onset_to_IVT_min'],
    bins=ivt_bins,
    labels=ivt_labels,
)

In [None]:
# Patients flagged as having received IVT but without a computable
# onset-to-IVT delay are imputed with the most frequent delay category.
# 'IVT' is a boolean column, so it can be used as a mask directly.
ivt_timing_missing = admission_data_df['categorical_IVT'].isna() & admission_data_df['IVT']
admission_data_df.loc[ivt_timing_missing, 'categorical_IVT'] = \
    admission_data_df['categorical_IVT'].mode()[0]

# Every row that is still missing is labelled 'no_IVT'.
# add_categories raises ValueError when the category already exists, so only
# add it when missing; this keeps the cell safe to re-run on its own.
if 'no_IVT' not in admission_data_df['categorical_IVT'].cat.categories:
    admission_data_df['categorical_IVT'] = admission_data_df['categorical_IVT'].cat.add_categories('no_IVT')
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: in-place modification through a selection is deprecated in
# pandas 2.x and does not update the DataFrame under Copy-on-Write.
admission_data_df['categorical_IVT'] = admission_data_df['categorical_IVT'].fillna('no_IVT')

In [None]:
# Side-by-side view of the IVT-related columns to check the derived values.
temp = admission_data_df[['stroke onset time', 'IVT time', 'onset_to_IVT_min', 'categorical_IVT', 'IVT']]
temp

In [None]:
# Restrict the view to rows whose onset time is recorded as 'unknown' or the misspelling 'unkown'.
temp[temp['stroke onset time'].isin(['unknown', 'unkown'])]

In [None]:
# Most frequent value of the IVT timing category.
admission_data_df['categorical_IVT'].mode()[0]

In [None]:
# Any non-missing entry in 'IAT time' marks the patient as having received IAT.
admission_data_df['IAT'] = admission_data_df['IAT time'].notna()

# IAT timestamps; entries matching the regex "y" are turned into NaN before parsing
# (presumably a "yes, time not documented" marker - confirm against the spreadsheet).
iat_timestamps = pd.to_datetime(
    admission_data_df['IAT time'].replace(to_replace=r"y", value=np.nan, regex=True),
    format=date_format)

# Stroke onset timestamps; 'unknown' / 'unkown' entries become NaN and parse to NaT.
onset_timestamps = pd.to_datetime(
    admission_data_df['stroke onset time'].replace(to_replace=r"unk(nown|own)", value=np.nan, regex=True),
    format=date_format)

# Delay from stroke onset to IAT, in minutes (NaN when either timestamp is missing).
admission_data_df['onset_to_IAT_min'] = (
    iat_timestamps - onset_timestamps).dt.total_seconds() / 60

In [None]:
# Show the admission id, the raw timestamps and the derived onset-to-IAT delay for a visual check.
admission_data_df[['hadm_id', 'stroke onset time', 'IAT time', 'onset_to_IAT_min']]

In [None]:
# Bucket the onset-to-IAT delay (minutes) into three ordered time windows.
# Rows with a missing delay stay NaN at this stage.
iat_bins = [-float("inf"), 270, 540, float("inf")]
iat_labels = ['<270min', '271-540min', '>540min']

admission_data_df['categorical_IAT'] = pd.cut(
    admission_data_df['onset_to_IAT_min'],
    bins=iat_bins,
    labels=iat_labels,
)

In [None]:
# Most frequent value of the IAT timing category.
admission_data_df['categorical_IAT'].mode()[0]