In [None]:
import pandas as pd

In [None]:
# Input locations handed to assemble_variable_database / restrict_to_patient_selection.
# Long literals are split with implicit string concatenation; the values are unchanged.
ehr_data_path = (
    '/Users/jk1/stroke_datasets/stroke_unit_dataset'
    '/per_value/Extraction_20220815'
)
stroke_registry_data_path = (
    '/Users/jk1/Library/CloudStorage/OneDrive-unige.ch/stroke_research'
    '/geneva_stroke_unit_dataset/data/stroke_registry/post_hoc_modified'
    '/stroke_registry_post_hoc_modified.xlsx'
)
patient_selection_path = (
    '/Users/jk1/temp/treatment_effects/gsu_extraction_14112023_100007'
    '/high_frequency_data_patient_selection_with_details.csv'
)
variable_selection_path = (
    '/Users/jk1/temp/treatment_effects'
    '/selected_variables_treatment_effect.xlsx'
)

# Output location; the log directory reuses the same folder.
output_path = '/Users/jk1/temp/treatment_effects/temporal_prepro_test'
log_dir = output_path

# Forwarded as the `verbose` flag to the preprocessing helpers called below.
verbose = True

In [None]:
from geneva_stroke_unit_preprocessing.variable_assembly.variable_database_assembly import assemble_variable_database

# Assemble the variable database from the four configured input paths.
feature_df = assemble_variable_database(
    ehr_data_path,
    stroke_registry_data_path,
    patient_selection_path,
    variable_selection_path,
    log_dir=log_dir,
    verbose=verbose,
)

In [None]:
# Preview the first 1000 rows of the assembled feature table.
feature_df.iloc[:1000]

In [None]:
from geneva_stroke_unit_preprocessing.utils import create_registry_case_identification_column
from geneva_stroke_unit_preprocessing.patient_selection.restrict_to_patient_selection import \
    restrict_to_patient_selection

# Load the stroke registry spreadsheet.
stroke_registry_df = pd.read_excel(stroke_registry_data_path)

# Derive identifier columns by slicing the 'Case ID' string:
#   patient_id        -> characters from position 8 up to (excluding) the last four
#   EDS_last_4_digits -> the last four characters
case_ids = stroke_registry_df['Case ID']
stroke_registry_df['patient_id'] = case_ids.apply(lambda case_id: case_id[8:-4])
stroke_registry_df['EDS_last_4_digits'] = case_ids.apply(lambda case_id: case_id[-4:])
stroke_registry_df['case_admission_id'] = create_registry_case_identification_column(stroke_registry_df)

# Keep only the registry rows belonging to the patient selection
# (without restricting to the event period).
restricted_stroke_registry_df = restrict_to_patient_selection(
    stroke_registry_df,
    patient_selection_path,
    verbose=verbose,
    restrict_to_event_period=False,
)

In [None]:
# Preview the first five rows of the restricted registry.
restricted_stroke_registry_df.iloc[:5]

In [None]:
# Check that every patient with a groin puncture date also has an IAT end date:
# count the rows where 'Date of groin puncture' is present but 'IAT end date' is missing.
# A result of 0 means every such patient has an end date.
restricted_stroke_registry_df.loc[
    restricted_stroke_registry_df['Date of groin puncture'].notna(), 'IAT end date'
].isna().sum()

In [None]:
# Build the IAT start timestamp from the groin-puncture date and time columns.
# Inputs are read from the restricted frame that receives the result, so the
# assignment does not depend on index alignment with the unrestricted registry.
# The deprecated `infer_datetime_format` argument is dropped; it has no effect
# when an explicit `format` is supplied.
iat_start_date_str = pd.to_datetime(
    restricted_stroke_registry_df['Date of groin puncture'], format='%Y%m%d'
).dt.strftime('%d-%m-%Y')
iat_start_time_str = pd.to_datetime(
    restricted_stroke_registry_df['Time of groin puncture'], format='%H:%M'
).dt.strftime('%H:%M')

# Rows missing either the date or the time end up as NaT.
restricted_stroke_registry_df['IAT_start_datetime'] = pd.to_datetime(
    iat_start_date_str + ' ' + iat_start_time_str, format='%d-%m-%Y %H:%M'
)

In [None]:
# Build the IAT end timestamp from the 'IAT end date' and 'IAT end time' columns.
# Inputs are read from the restricted frame that receives the result, so the
# assignment does not depend on index alignment with the unrestricted registry.
# The deprecated `infer_datetime_format` argument is dropped; it has no effect
# when an explicit `format` is supplied.
iat_end_date_str = pd.to_datetime(
    restricted_stroke_registry_df['IAT end date'], format='%Y%m%d'
).dt.strftime('%d-%m-%Y')
iat_end_time_str = pd.to_datetime(
    restricted_stroke_registry_df['IAT end time'], format='%H:%M'
).dt.strftime('%H:%M')

# Rows missing either the date or the time end up as NaT.
restricted_stroke_registry_df['IAT_end_datetime'] = pd.to_datetime(
    iat_end_date_str + ' ' + iat_end_time_str, format='%d-%m-%Y %H:%M'
)

In [None]:
# IAT duration = end timestamp minus start timestamp (NaT where either is missing).
iat_end = restricted_stroke_registry_df['IAT_end_datetime']
iat_start = restricted_stroke_registry_df['IAT_start_datetime']
restricted_stroke_registry_df['IAT_duration'] = iat_end.sub(iat_start)

In [None]:
# Where the IAT end timestamp is missing, fall back to IAT start + 2 hours.
missing_iat_end = restricted_stroke_registry_df['IAT_end_datetime'].isna()
restricted_stroke_registry_df.loc[missing_iat_end, 'IAT_end_datetime'] = (
    restricted_stroke_registry_df.loc[missing_iat_end, 'IAT_start_datetime']
    + pd.Timedelta(hours=2)
)

In [None]:
# Display the derived IAT columns side by side.
restricted_stroke_registry_df.loc[:, ['IAT_start_datetime', 'IAT_end_datetime', 'IAT_duration']]

In [None]:
# Build the IVT start timestamp from the 'IVT start date' and 'IVT start time' columns.
# Inputs are read from the restricted frame that receives the result, so the
# assignment does not depend on index alignment with the unrestricted registry.
# The deprecated `infer_datetime_format` argument is dropped; it has no effect
# when an explicit `format` is supplied.
ivt_start_date_str = pd.to_datetime(
    restricted_stroke_registry_df['IVT start date'], format='%Y%m%d'
).dt.strftime('%d-%m-%Y')
ivt_start_time_str = pd.to_datetime(
    restricted_stroke_registry_df['IVT start time'], format='%H:%M'
).dt.strftime('%H:%M')

# Rows missing either the date or the time end up as NaT.
restricted_stroke_registry_df['IVT_start_datetime'] = pd.to_datetime(
    ivt_start_date_str + ' ' + ivt_start_time_str, format='%d-%m-%Y %H:%M'
)

In [None]:
# Count the patients flagged 'IVT with rtPA' == 'yes' that lack an IVT start timestamp.
received_rtpa = restricted_stroke_registry_df['IVT with rtPA'].eq('yes')
restricted_stroke_registry_df.loc[received_rtpa, 'IVT_start_datetime'].isna().sum()

In [None]:
# IVT end timestamp is taken as the IVT start timestamp plus one hour.
ivt_start = restricted_stroke_registry_df['IVT_start_datetime']
restricted_stroke_registry_df['IVT_end_datetime'] = ivt_start + pd.Timedelta(hours=1)

In [None]:
# Display the IVT start and end timestamps side by side.
restricted_stroke_registry_df.loc[:, ['IVT_start_datetime', 'IVT_end_datetime']]

In [None]:
# The acute treatment ends at the later of the IVT end and IAT end timestamps
# (row-wise maximum over the two columns).
treatment_end_columns = ['IVT_end_datetime', 'IAT_end_datetime']
restricted_stroke_registry_df['acute_treatment_end_datetime'] = (
    restricted_stroke_registry_df[treatment_end_columns].max(axis=1)
)

In [None]:
# Display both treatment end timestamps next to the combined value.
restricted_stroke_registry_df.loc[:, ['IVT_end_datetime', 'IAT_end_datetime', 'acute_treatment_end_datetime']]

In [None]:
# Add the acute_treatment_end_datetime column to feature_df with a left join on
# case_admission_id, so every feature row is kept.
# Any existing copy of the column is dropped first so that re-running this cell
# is idempotent; without it a second run would produce suffixed duplicates
# (acute_treatment_end_datetime_x / _y) and break the lookups that follow.
feature_df = feature_df.drop(columns='acute_treatment_end_datetime', errors='ignore').merge(
    restricted_stroke_registry_df[['case_admission_id', 'acute_treatment_end_datetime']],
    on='case_admission_id',
    how='left',
)

In [None]:
# Display the case identifier next to the merged treatment end timestamp.
feature_df.loc[:, ['case_admission_id', 'acute_treatment_end_datetime']]

In [None]:
# Spot check: distinct treatment end timestamps recorded for one specific case.
case_mask = feature_df['case_admission_id'] == '124928_5998'
feature_df.loc[case_mask, 'acute_treatment_end_datetime'].unique()