In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import os

In [None]:
# --- Configuration: input / output locations -------------------------------
# Directory that holds the extraction outputs.
out_dir = '/Users/jk1/temp/opsum_extration_output'

# CSV listing the patients / admissions selected for the high-frequency dataset.
patient_selection = '/Users/jk1/temp/opsum_extration_output/high_frequency_data_patient_selection.csv'

# Excel export of the (post-hoc modified) stroke registry.
stroke_registry_data_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/stroke_registry/post_hoc_modified/stroke_registry_post_hoc_modified.xlsx'

In [None]:
# Load the patient selection table. dtype=str reads every column as text, so no
# numeric type inference is applied to the identifier and date columns.
patient_selection_df = pd.read_csv(patient_selection, dtype=str)

In [None]:
# Load the registry and derive the admission identifier used for matching.
full_data_df = pd.read_excel(stroke_registry_data_path)

# 'Case ID' is sliced positionally: characters from index 8 up to (not
# including) the last four become patient_id; the last four are kept apart.
case_ids = full_data_df['Case ID']
full_data_df['patient_id'] = case_ids.apply(lambda case_id: case_id[8:-4])
full_data_df['EDS_last_4_digits'] = case_ids.apply(lambda case_id: case_id[-4:])

# case_admission_id = <patient_id><EDS_last_4_digits>_<arrival date as DDMMYYYY>
arrival_ddmmyyyy = (
    pd.to_datetime(full_data_df['Arrival at hospital'], format='%Y%m%d')
    .dt.strftime('%d%m%Y')
    .astype(str)
)
full_data_df['case_admission_id'] = (
    full_data_df['patient_id'].astype(str)
    + full_data_df['EDS_last_4_digits'].astype(str)
    + '_'
    + arrival_ddmmyyyy
)

In [None]:
# Build the same admission identifier on the selection table as on the registry:
# <patient_id><EDS_last_4_digits>_<arrival date as DDMMYYYY>
selection_arrival_ddmmyyyy = (
    pd.to_datetime(patient_selection_df['Arrival at hospital'], format='%Y%m%d')
    .dt.strftime('%d%m%Y')
    .astype(str)
)
patient_selection_df['case_admission_id'] = (
    patient_selection_df['patient_id'].astype(str)
    + patient_selection_df['EDS_last_4_digits'].astype(str)
    + '_'
    + selection_arrival_ddmmyyyy
)

# Keep only the registry rows whose admission identifier is in the selection.
# .copy() makes the result an independent frame: later cells add and overwrite
# columns on it, which on a plain boolean-indexed slice triggers
# SettingWithCopyWarning and leaves it unclear whether the write takes effect.
is_selected = full_data_df['case_admission_id'].isin(patient_selection_df['case_admission_id'])
selected_full_data_df = full_data_df[is_selected].copy()


In [None]:
# begin_date is a text column: the arrival date re-rendered as DD.MM.YYYY,
# a space, then the 'Arrival time' value as stored in the registry.
arrival_day = pd.to_datetime(selected_full_data_df['Arrival at hospital'], format='%Y%m%d')
arrival_day_text = arrival_day.dt.strftime('%d.%m.%Y')
selected_full_data_df['begin_date'] = arrival_day_text + ' ' + selected_full_data_df['Arrival time']

In [None]:
def _registry_timestamp(date_column, time_column):
    """Combine a registry date column and time column into one datetime Series.

    The date column is parsed as YYYYMMDD and re-rendered as DD-MM-YYYY, joined
    to the time column with a single space, and the resulting text is parsed
    with the format '%d-%m-%Y %H:%M'.
    """
    day_text = pd.to_datetime(selected_full_data_df[date_column], format='%Y%m%d').dt.strftime('%d-%m-%Y')
    return pd.to_datetime(day_text + ' ' + selected_full_data_df[time_column], format='%d-%m-%Y %H:%M')


# Timestamps for symptom onset, IVT start and groin puncture.
selected_full_data_df['onset_datetime'] = _registry_timestamp('Onset date', 'Onset time')
selected_full_data_df['IVT_datetime'] = _registry_timestamp('IVT start date', 'IVT start time')
selected_full_data_df['groin_puncture_datetime'] = _registry_timestamp('Date of groin puncture', 'Time of groin puncture')

In [None]:
# Show the raw groin-puncture date and time next to the parsed timestamp.
selected_full_data_df[['Date of groin puncture', 'Time of groin puncture', 'groin_puncture_datetime']]

In [None]:
# Delay from symptom onset to each treatment, in minutes (seconds / 60).
# NOTE(review): the *_datetime columns are created with pd.to_datetime in an
# earlier cell, so the conversions below are expected to be pass-throughs.
timestamp_format = '%d-%m-%Y %H:%M:%S'
for treatment_column, delay_column in (('IVT_datetime', 'onset_to_IVT_min'),
                                       ('groin_puncture_datetime', 'onset_to_groin_min')):
    treatment_time = pd.to_datetime(selected_full_data_df[treatment_column], format=timestamp_format)
    onset_time = pd.to_datetime(selected_full_data_df['onset_datetime'], format=timestamp_format)
    selected_full_data_df[delay_column] = (treatment_time - onset_time).dt.total_seconds() / 60

In [None]:
# Show the parsed timestamps together with the derived delays (in minutes).
selected_full_data_df[['onset_datetime','IVT_datetime', 'groin_puncture_datetime', 'onset_to_IVT_min', 'onset_to_groin_min']]

In [None]:
# Scatter both treatment delays against the door-to-image time on shared axes:
# onset-to-IVT in the default colour, onset-to-groin in red.
door_to_image = "Door to image (min.)"
ax = selected_full_data_df.plot.scatter(x=door_to_image, y='onset_to_IVT_min')
selected_full_data_df.plot.scatter(x=door_to_image, y='onset_to_groin_min', color='red', ax=ax)
# Restrict the view to 0-100 on x and 0-800 on y.
ax.set_xlim(0, 100)
ax.set_ylim(0, 800)
plt.show()

In [None]:
# Summary statistics of the onset-to-IVT delay (minutes).
selected_full_data_df["onset_to_IVT_min"].describe()

In [None]:
# 5th percentile of the onset-to-IVT delay (minutes).
selected_full_data_df["onset_to_IVT_min"].quantile(0.05)

Finding wrong onset to IVT timings

In [None]:
# Flag onset-to-IVT delays longer than 24 h or shorter than 10 min for review.
ivt_delay = selected_full_data_df["onset_to_IVT_min"]
is_implausible = (ivt_delay > 24 * 60) | (ivt_delay < 10)
temp_df = selected_full_data_df[is_implausible]
temp_df[['Case ID', 'onset_to_IVT_min', 'onset_datetime', 'Arrival at hospital', 'IVT_datetime']]

In [None]:
# temp_df.to_excel(os.path.join(out_dir, 'wrong_onset_to_IVT_min.xlsx'))

In [None]:
# Histogram of onset-to-IVT delays strictly between 10 min and 24 h.
ivt_delay = selected_full_data_df["onset_to_IVT_min"]
temp_df = selected_full_data_df[ivt_delay.lt(24 * 60) & ivt_delay.gt(10)]
temp_df.plot(kind='hist', y='onset_to_IVT_min', bins=100)
plt.show()

## Categorizing IVT treatment

Categories (bin upper bounds are inclusive, e.g. exactly 90 min falls in '<90min'):
'no_IVT', '<90min', '91-270min', '271-540min', '>540min'

In [None]:
# Bin the onset-to-IVT delay (minutes). pd.cut uses right-closed intervals by
# default, so each upper edge belongs to the lower bin; missing delays stay NaN.
ivt_bin_edges = [-float("inf"), 90, 270, 540, float("inf")]
ivt_bin_labels = ['<90min', '91-270min', '271-540min', '>540min']
selected_full_data_df['categorical_IVT'] = pd.cut(
    selected_full_data_df['onset_to_IVT_min'],
    bins=ivt_bin_edges,
    labels=ivt_bin_labels,
)

In [None]:
# Label rows without a computed IVT delay as 'no_IVT'.
# The result is assigned back explicitly instead of calling
# fillna(..., inplace=True) on the column selection: that chained in-place form
# is deprecated and does not update the frame under pandas Copy-on-Write.
ivt_category = selected_full_data_df['categorical_IVT']
# Guard so the cell can be re-run: add_categories raises if the category exists.
if 'no_IVT' not in ivt_category.cat.categories:
    ivt_category = ivt_category.cat.add_categories('no_IVT')
selected_full_data_df['categorical_IVT'] = ivt_category.fillna('no_IVT')

In [None]:
# Show the IVT category next to the delay (minutes) it was derived from.
selected_full_data_df[['categorical_IVT', 'onset_to_IVT_min']]

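For patients who received IVT but whose IVT timing is unknown, replace the 'no_IVT' placeholder with the most frequent timing category (the mode) among patients with a known timing.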

In [None]:
# Rows labelled 'no_IVT' whose 'IVT with rtPA' value is anything other than 'no'.
# Note: the != comparison is also True for missing 'IVT with rtPA' values.
selected_full_data_df[(selected_full_data_df['categorical_IVT'] == 'no_IVT') & (selected_full_data_df['IVT with rtPA'] != 'no')]

In [None]:
# Most frequent IVT timing category among rows that are not labelled 'no_IVT'.
selected_full_data_df[(selected_full_data_df['categorical_IVT'] != 'no_IVT')]['categorical_IVT'].mode()

In [None]:
# Impute the timing category of rows labelled 'no_IVT' whose 'IVT with rtPA'
# is not 'no', using the most frequent category among rows with a timing.
ivt_category = selected_full_data_df['categorical_IVT']
most_common_ivt_category = ivt_category[ivt_category != 'no_IVT'].mode()[0]
needs_ivt_imputation = (ivt_category == 'no_IVT') & (selected_full_data_df['IVT with rtPA'] != 'no')
selected_full_data_df.loc[needs_ivt_imputation, 'categorical_IVT'] = most_common_ivt_category

In [None]:
# Number of rows per IVT category after imputation.
selected_full_data_df['categorical_IVT'].value_counts()

# Preprocessing IAT

In [None]:
# Summary statistics of the onset-to-groin-puncture delay (minutes).
selected_full_data_df["onset_to_groin_min"].describe()

Find wrong IAT timings

In [None]:
# 95th percentile of the onset-to-groin-puncture delay (minutes).
selected_full_data_df["onset_to_groin_min"].quantile(0.95)

In [None]:
# Flag onset-to-groin delays longer than 30 h or shorter than 10 min for review.
groin_delay = selected_full_data_df["onset_to_groin_min"]
is_implausible = (groin_delay > 30 * 60) | (groin_delay < 10)
temp_df = selected_full_data_df[is_implausible]
temp_df[['Case ID', 'onset_to_groin_min', 'onset_datetime', 'Arrival at hospital', 'Date of groin puncture', 'Time of groin puncture', 'Arrival time']]

In [None]:
# temp_df.to_excel(os.path.join(out_dir, 'wrong_onset_to_IAT_timings.xlsx'))

## Categorizing IAT treatment

Categories
- not treated
- < 270min (4.5h)
- 271-540min (4.5h to 9h)
-  \> 540min

In [None]:
# Histogram of onset-to-groin delays strictly between 10 min and 30 h.
groin_delay = selected_full_data_df["onset_to_groin_min"]
temp_df = selected_full_data_df[groin_delay.lt(30 * 60) & groin_delay.gt(10)]
temp_df.plot(kind='hist', y='onset_to_groin_min', bins=100)
plt.show()

In [None]:
# Bin the onset-to-groin delay (minutes). pd.cut uses right-closed intervals by
# default, so each upper edge belongs to the lower bin; missing delays stay NaN.
iat_bin_edges = [-float("inf"), 270, 540, float("inf")]
iat_bin_labels = ['<270min', '271-540min', '>540min']
selected_full_data_df['categorical_IAT'] = pd.cut(
    selected_full_data_df['onset_to_groin_min'],
    bins=iat_bin_edges,
    labels=iat_bin_labels,
)

In [None]:
# Label rows without a computed groin-puncture delay as 'no_IAT'.
# The result is assigned back explicitly instead of calling
# fillna(..., inplace=True) on the column selection: that chained in-place form
# is deprecated and does not update the frame under pandas Copy-on-Write.
iat_category = selected_full_data_df['categorical_IAT']
# Guard so the cell can be re-run: add_categories raises if the category exists.
if 'no_IAT' not in iat_category.cat.categories:
    iat_category = iat_category.cat.add_categories('no_IAT')
selected_full_data_df['categorical_IAT'] = iat_category.fillna('no_IAT')

In [None]:
# Show the onset-to-groin delay (minutes) next to the category derived from it.
selected_full_data_df[['onset_to_groin_min', 'categorical_IAT']]

Replacing any missing timings with the mode

In [None]:
# Impute the timing category of rows labelled 'no_IAT' whose 'IAT' value is
# 'yes', using the most frequent category among rows with a timing.
iat_category = selected_full_data_df['categorical_IAT']
most_common_iat_category = iat_category[iat_category != 'no_IAT'].mode()[0]
needs_iat_imputation = (iat_category == 'no_IAT') & (selected_full_data_df['IAT'] == 'yes')
selected_full_data_df.loc[needs_iat_imputation, 'categorical_IAT'] = most_common_iat_category

In [None]:
# Number of rows per IAT category after imputation.
selected_full_data_df['categorical_IAT'].value_counts()

In [None]:
# The two categorical treatment-timing columns built in the cells above.
treatment_columns = ['categorical_IVT', 'categorical_IAT']

In [None]:
# Treatment table: the treatment categories plus the admission id and begin date.
treatment_table_columns = [*treatment_columns, 'case_admission_id', 'begin_date']
treatment_df = selected_full_data_df[treatment_table_columns]

In [None]:
# Total number of missing values across all columns of treatment_df.
treatment_df.isna().sum().sum()

In [None]:
# Long format: one row per (admission, treatment column); the former column
# name goes into 'sample_label'. The result is displayed, not stored.
treatment_df.melt(id_vars=['case_admission_id', 'begin_date'], var_name='sample_label')


In [None]:
# Number of rows per raw 'IVT with rtPA' value.
selected_full_data_df["IVT with rtPA"].value_counts()

In [None]:
# Recode IVT that was started before admission as a plain "yes".
# The mask is built from selected_full_data_df itself: treatment_df holds only
# the categorical treatment columns, 'case_admission_id' and 'begin_date', so
# indexing it with "IVT with rtPA" raises a KeyError.
started_before_admission = selected_full_data_df["IVT with rtPA"] == "started before admission"
selected_full_data_df.loc[started_before_admission, "IVT with rtPA"] = "yes"

In [None]:
# Show the raw 'IAT' column.
selected_full_data_df["IAT"]

In [None]:
# Number of rows per raw 'IAT' value.
selected_full_data_df["IAT"].value_counts()