# Select patients from stroke registry data
- Inclusion criteria: ≥ 18y, ischemic stroke, inpatient/non-transferred, not refusing to participate
- Exclusion criteria: hospital stay < 12h, hospitalisation > 7d after stroke onset
    - Optional exclusion criteria: international transfer (from France), intra-hospital stroke

In [None]:
import pandas as pd
import numpy as np
import os
from preprocessing.geneva_stroke_unit_preprocessing.utils import create_registry_case_identification_column, \
    create_ehr_case_identification_column

In [None]:
# Excel file of the stroke registry (post-hoc modified version); read with pd.read_excel below
stroke_registry_data_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/stroke_registry/post_hoc_modified/stroke_registry_post_hoc_modified.xlsx'

In [None]:
# Folder whose .xlsx files hold the manually completed EDS / patient ids merged in further below
manual_eds_completion_folder = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/stroke_registry/manuel_eds_completion'

In [None]:
# general consent is present for the extraction of 20221117
# (semicolon-separated CSV, loaded below to exclude patients refusing research use)
general_consent_eds_path = '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20221117/eds_j1.csv'

In [None]:
# Base folder under which a timestamped sub-folder is created when the results are saved
output_path = '/Users/jk1/temp/opsum_extraction_output'

In [None]:
# Load the complete stroke registry; all selection steps below start from this frame
all_data_df = pd.read_excel(stroke_registry_data_path)

In [None]:
# Split the registry 'Case ID' string: characters 8..-4 are kept as the patient id,
# the last four characters as the EDS suffix (both stored as text).
case_ids = all_data_df['Case ID']
all_data_df['patient_id'] = case_ids.map(lambda case_id: case_id[8:-4]).astype(str)
all_data_df['EDS_last_4_digits'] = case_ids.map(lambda case_id: case_id[-4:]).astype(str)
# Identifier used throughout this notebook to refer to one registry case
# (built by the project helper; its exact composition is defined there)
all_data_df['case_admission_id'] = create_registry_case_identification_column(all_data_df)

In [None]:
# Remove registry rows flagged as duplicates and count what remains.
# The screened total is counted AFTER filtering: subtracting the duplicate ids from the
# overall id count would wrongly drop an admission whose id appears both on a regular
# row and on a 'duplicate' row.
is_duplicate = all_data_df['Type of event'] == 'duplicate'
n_duplicates = all_data_df.loc[is_duplicate, 'case_admission_id'].nunique(dropna=False)

all_data_df = all_data_df[~is_duplicate]
n_records_screened = all_data_df['case_admission_id'].nunique(dropna=False)

print('Number of records screened (after removing duplicates): ', n_records_screened,
      '- duplicate records removed: ', n_duplicates)

### Exclude patients refusing participation in research

In [None]:
# Load the EHR extraction carrying the general consent status; every column is read as text
general_consent_eds_df = pd.read_csv(
    general_consent_eds_path,
    sep=';',
    encoding='utf-8',
    dtype=str,
)
# Case identifier for EHR rows (project helper); used as the merge key against the registry
general_consent_eds_df['case_admission_id'] = create_ehr_case_identification_column(general_consent_eds_df)

In [None]:
# Put the registry refusal flag next to the EHR general-consent value for every registry case;
# a left join keeps registry cases that have no EHR consent entry.
registry_consent = all_data_df[['case_admission_id', 'Patient refuses use of data for research']]
ehr_consent = general_consent_eds_df[['case_admission_id', 'patient_id_gc']]
consent_df = pd.merge(registry_consent, ehr_consent, on='case_admission_id', how='left')

In [None]:
# Inspect which distinct values occur in the two consent sources before building the filter
consent_df.patient_id_gc.unique(), consent_df['Patient refuses use of data for research'].unique()

In [None]:
# A case counts as refusing when the general consent says 'Non', or when the registry
# flags a refusal and the general consent does not explicitly say 'Oui'
# (a missing general consent therefore does not override a registry refusal).
gc_refused = consent_df.patient_id_gc == 'Non'
gc_not_granted = consent_df.patient_id_gc != 'Oui'
registry_refused = consent_df['Patient refuses use of data for research'] == 'yes'
patients_refusing_use_of_data = consent_df.loc[
    gc_refused | (registry_refused & gc_not_granted), 'case_admission_id'
].unique()

In [None]:
# Remove patients not wanting to participate in research
n_patient_refuses_research = len(patients_refusing_use_of_data)
print(f'{n_patient_refuses_research} patients refuse the use of their data')
refuses_research = all_data_df.case_admission_id.isin(patients_refusing_use_of_data)
full_data_df = all_data_df[~refuses_research]

### Include only ischemic stroke patients

In [None]:
# Distribution of event types among the remaining records (before the ischemic-stroke filter)
full_data_df['Type of event'].value_counts()

In [None]:
# all_data_df['Type of event'].value_counts().to_excel(os.path.join(output_path, 'type_of_event.xlsx'))

In [None]:
# select only ischemic stroke patients
is_ischemic_stroke = full_data_df['Type of event'].eq('Ischemic stroke')
all_stroke_df = full_data_df.loc[is_ischemic_stroke]

In [None]:
# Cases lost at this step = distinct cases before the filter minus distinct cases after it
n_cases_before_event_filter = full_data_df['case_admission_id'].nunique(dropna=False)
n_cases_after_event_filter = all_stroke_df['case_admission_id'].nunique(dropna=False)
n_patients_not_ischemic_stroke = n_cases_before_event_filter - n_cases_after_event_filter
print('Number of patients excluded because not ischemic stroke: ', n_patients_not_ischemic_stroke)

### Exclude patients not hospitalised in our center (outpatient management or referral elsewhere)

In [None]:
# Distribution of initial hospitalisation categories among ischemic stroke cases
all_stroke_df['Initial hospitalization'].value_counts()

In [None]:
# all_data_df['Initial hospitalization'].value_counts().to_excel(os.path.join(output_path, 'initial_hospitalization.xlsx'))

In [None]:
# exclude patients that were immediately discharged or referred to other center
not_hospitalised_in_house = [
    'Outpatient management',
    'Referral to other Stroke Unit or Stroke Center',
    'Referral to other hospital or care institution',
]
# .copy() detaches the result from all_stroke_df: later cells add columns to stroke_df,
# which would otherwise be assignments on a slice (SettingWithCopyWarning).
# Rows with a missing 'Initial hospitalization' are kept, as before.
stroke_df = all_stroke_df[~all_stroke_df['Initial hospitalization'].isin(not_hospitalised_in_house)].copy()

In [None]:
# Cases lost at this step = distinct ischemic stroke cases minus those still in stroke_df
n_cases_before_hosp_filter = all_stroke_df['case_admission_id'].nunique(dropna=False)
n_cases_after_hosp_filter = stroke_df['case_admission_id'].nunique(dropna=False)
n_not_hospitalised_in_house = n_cases_before_hosp_filter - n_cases_after_hosp_filter
print('Number of patients excluded because discharged or referred to other center: ', n_not_hospitalised_in_house)

### Exclude patients with less than 12h of hospitalization

In [None]:
# set end of reference period to stroke onset or arrival at hospital, whichever is later
# this takes into account potential in-hospital stroke events

# Text format of all combined timestamps in this notebook
# (variable name kept as is because later cells reference it)
datatime_format = '%d.%m.%Y %H:%M'

# `infer_datetime_format` is not passed any more: it is ignored whenever an explicit
# `format` is given and is deprecated in pandas 2.x.
arrival_day = pd.to_datetime(stroke_df['Arrival at hospital'], format='%Y%m%d').dt.strftime('%d.%m.%Y')
arrival_clock = pd.to_datetime(stroke_df['Arrival time'], format='%H:%M').dt.strftime('%H:%M')
stroke_df['arrival_dt'] = arrival_day + ' ' + arrival_clock

onset_day = pd.to_datetime(stroke_df['Onset date'], format='%Y%m%d').dt.strftime('%d.%m.%Y')
onset_clock = pd.to_datetime(stroke_df['Onset time'], format='%H:%M').dt.strftime('%H:%M')
stroke_df['stroke_dt'] = onset_day + ' ' + onset_clock

# Seconds from arrival to onset: positive when the stroke happened after arrival,
# NaN when either timestamp is missing or unparsable.
stroke_df['delta_onset_arrival'] = (
        pd.to_datetime(stroke_df['stroke_dt'], format=datatime_format, errors='coerce')
        - pd.to_datetime(stroke_df['arrival_dt'], format=datatime_format, errors='coerce')
).dt.total_seconds()

# Vectorised replacement of the former row-wise apply: take the onset timestamp only when
# it lies after arrival; in every other case (including NaN deltas) fall back to arrival.
onset_after_arrival = stroke_df['delta_onset_arrival'] > 0
stroke_df['registry_sampling_start_upper_bound_reference'] = stroke_df['stroke_dt'] \
    .where(onset_after_arrival, stroke_df['arrival_dt'])


In [None]:
# Build discharge and in-hospital death timestamps as 'dd.mm.YYYY HH:MM' text.
# `infer_datetime_format` is not passed any more: it is ignored whenever an explicit
# `format` is given and is deprecated in pandas 2.x.
discharge_day = pd.to_datetime(stroke_df['Discharge date'], format='%Y%m%d').dt.strftime('%d.%m.%Y')
discharge_clock = pd.to_datetime(stroke_df['Discharge time'], format='%H:%M').dt.strftime('%H:%M')
stroke_df['discharge_dt'] = discharge_day + ' ' + discharge_clock

death_day = pd.to_datetime(stroke_df['Death at hospital date'], format='%Y%m%d').dt.strftime('%d.%m.%Y')
death_clock = pd.to_datetime(stroke_df['Death at hospital time'], format='%H:%M').dt.strftime('%H:%M')
stroke_df['death_dt'] = death_day + ' ' + death_clock

# End of the stay: discharge when available, otherwise the time of death in hospital
stroke_df['registry_sampling_end'] = stroke_df['discharge_dt'].fillna(stroke_df['death_dt'])


In [None]:
# Length of the stay covered by the registry: end of stay minus the later of onset / arrival
sampling_end = pd.to_datetime(stroke_df['registry_sampling_end'], format=datatime_format)
sampling_start = pd.to_datetime(stroke_df['registry_sampling_start_upper_bound_reference'], format=datatime_format)
stroke_df['registry_sample_range'] = sampling_end - sampling_start

In [None]:
# Cases whose registry-documented stay is shorter than 12 hours
# (rows with an unknown duration compare as False and are therefore not selected)
min_hospitalization_duration = pd.Timedelta('12h')
stay_too_short = stroke_df['registry_sample_range'] < min_hospitalization_duration
cid_with_hospitalization_duration_less_than_12h = stroke_df.loc[stay_too_short, 'case_admission_id'].unique()

In [None]:
# Record and report how many cases the 12h criterion removes
n_with_hospitalization_duration_less_than_12h = len(cid_with_hospitalization_duration_less_than_12h)
print('Number of patients excluded because hospitalization duration less than 12h: ',
      n_with_hospitalization_duration_less_than_12h)
print('NB: more patients will be excluded programmatically if total span of data is less than 12h')

In [None]:
# exclude patients with less than 12h of hospitalization
stayed_less_than_12h = stroke_df['case_admission_id'].isin(cid_with_hospitalization_duration_less_than_12h)
stroke_df = stroke_df[~stayed_less_than_12h]

### Exclude non-acute stroke (hospitalisation > 7 days after stroke onset)

In [None]:
# delta_onset_arrival is (onset - arrival) in seconds, so a value below minus seven days
# means the patient arrived more than 7 days after stroke onset
seconds_in_7_days = 7 * 24 * 60 * 60
arrived_over_7_days_after_onset = stroke_df['delta_onset_arrival'] < -seconds_in_7_days
cid_with_non_acute_stroke = stroke_df.loc[arrived_over_7_days_after_onset, 'case_admission_id'].unique()
n_with_non_acute_stroke = len(cid_with_non_acute_stroke)
print('Number of patients excluded because non acute stroke: ', n_with_non_acute_stroke)

In [None]:
# Drop the non-acute cases identified above
is_non_acute_stroke = stroke_df['case_admission_id'].isin(cid_with_non_acute_stroke)
stroke_df = stroke_df[~is_non_acute_stroke]

### Optional exclusion criteria

In [None]:
# Switches for the optional exclusion steps below; each step is skipped (and its
# counter stays at 0) when the corresponding flag is False
exclude_transfers_from_france = True
exclude_intra_hospital_stroke = True
exclude_patients_under_18 = True

In [None]:
n_patients_transferred_from_france = 0
if exclude_transfers_from_france:
    # find cids from transfers from France (where Non-Swiss == yes & referral == other hospital)
    referred_from_other_hospital = stroke_df['Referral'] == 'Other hospital'
    is_non_swiss = stroke_df['Non-Swiss'] == 'yes'
    # .unique() (instead of .values) so that the count is in distinct cases, like every
    # other exclusion step, even if a case occupies more than one row
    cids_transfers_from_france = stroke_df[referred_from_other_hospital & is_non_swiss]['case_admission_id'].unique()
    n_patients_transferred_from_france = len(cids_transfers_from_france)
    print('Number of patients excluded because transfers from France: ', n_patients_transferred_from_france)
    stroke_df = stroke_df[~stroke_df['case_admission_id'].isin(cids_transfers_from_france)]

In [None]:
n_patients_with_intra_hospital_stroke = 0
if exclude_intra_hospital_stroke:
    # find cids with intra-hospital stroke (registry referral recorded as an in-hospital event)
    is_in_hospital_event = stroke_df['Referral'].eq('In-hospital event')
    cids_intra_hospital_stroke = stroke_df.loc[is_in_hospital_event, 'case_admission_id'].unique()
    n_patients_with_intra_hospital_stroke = len(cids_intra_hospital_stroke)
    print('Number of patients excluded because intra-hospital stroke: ', n_patients_with_intra_hospital_stroke)
    had_intra_hospital_stroke = stroke_df['case_admission_id'].isin(cids_intra_hospital_stroke)
    stroke_df = stroke_df[~had_intra_hospital_stroke]

In [None]:
# Visual check of the selection after the transfer / intra-hospital exclusions
stroke_df

In [None]:
n_patients_under_18 = 0
if exclude_patients_under_18:
    # find cids with patients under 18; ages below 1 are not treated as minors because
    # they represent a wrong DOB. The lower bound is inclusive so that a patient aged
    # exactly 1 is excluded like any other minor.
    age = stroke_df['Age (calc.)']
    cids_under_18 = stroke_df[(age >= 1) & (age < 18)]['case_admission_id'].unique()
    n_patients_under_18 = len(cids_under_18)
    print('Number of patients excluded because under 18: ', n_patients_under_18)
    stroke_df = stroke_df[~stroke_df['case_admission_id'].isin(cids_under_18)]

### End of exclusion criteria

In [None]:
# Number of distinct cases left after all exclusion criteria
stroke_df['case_admission_id'].nunique(dropna=False)

In [None]:
# counting patients with outcome variables (rows where '3M Death' is not missing)
int(stroke_df['3M Death'].count())

In [None]:
# Distribution of the in-hospital death outcome in the final selection
stroke_df['Death in hospital'].value_counts()


In [None]:
# Remaining referral categories (sanity check for the optional exclusions above)
stroke_df['Referral'].value_counts()

In [None]:
# Remaining initial hospitalisation categories (sanity check for the in-house filter)
stroke_df['Initial hospitalization'].value_counts()

## Adding additional information

In [None]:
# The combined strings are built day-first ('dd-mm-YYYY HH:MM'). Parsing them with an
# explicit format keeps pandas from guessing month-first for days <= 12
# (e.g. '05-03-2020' being read as 3 May instead of 5 March).
combined_datetime_format = '%d-%m-%Y %H:%M'

onset_date = pd.to_datetime(
    pd.to_datetime(stroke_df['Onset date'], format='%Y%m%d').dt.strftime('%d-%m-%Y')
    + ' ' + stroke_df['Onset time'],
    format=combined_datetime_format)

admission_date = pd.to_datetime(
    pd.to_datetime(stroke_df['Arrival at hospital'], format='%Y%m%d').dt.strftime('%d-%m-%Y')
    + ' ' + stroke_df['Arrival time'],
    format=combined_datetime_format)

discharge_date = pd.to_datetime(
    pd.to_datetime(stroke_df['Discharge date'], format='%Y%m%d').dt.strftime('%d-%m-%Y')
    + ' ' + stroke_df['Discharge time'],
    format=combined_datetime_format)


In [None]:
# Preview of the selected cases before the manual EDS completions are merged in
stroke_df.head()

#### Fuse with databases of manually completed EDS

In [None]:
# Preview one manual completion file. The path is built from manual_eds_completion_folder
# instead of repeating the folder as a literal, and only dtype=str is passed: it already
# reads EDS_last_4_digits as text, whereas giving both dtype and a converter for the same
# column makes pandas warn and ignore the dtype.
pd.read_excel(os.path.join(manual_eds_completion_folder, 'patients_with_missing_data_manual_completion.xlsx'),
              dtype=str)

In [None]:
# Read every manual completion workbook in the folder.
# - files starting with '~$' are lock files Excel creates while a workbook is open; they
#   end in .xlsx but are not readable workbooks, so they are skipped
# - sorting makes the load (and thus row) order independent of the file system
manual_eds_completion_files = sorted(
    f for f in os.listdir(manual_eds_completion_folder)
    if f.endswith('.xlsx') and not f.startswith('~$')
)
manual_eds_completion_dfs = [
    pd.read_excel(os.path.join(manual_eds_completion_folder, f),
                  converters={"EDS_last_4_digits": str,  # Ensure EDS_last_4_digits is read as string, maintaining leading 0s
                              })
    for f in manual_eds_completion_files
]

In [None]:
# Stack all completion files into one table (original row indices of each file are kept)
all_manual_eds_completions = pd.concat(manual_eds_completion_dfs)

In [None]:
# Keep only the key columns and the manually completed ids, all as text
completion_columns = ['patient_id', 'EDS_last_4_digits', 'manual_eds', 'manual_patient_id']
all_manual_eds_completions = all_manual_eds_completions[completion_columns].astype(str)
# Restore leading zeros of the 4-character EDS suffix
all_manual_eds_completions['EDS_last_4_digits'] = all_manual_eds_completions['EDS_last_4_digits'].str.zfill(4)
# Ids that went through a float representation end in '.0'; strip that suffix
for id_column in ('manual_patient_id', 'manual_eds'):
    all_manual_eds_completions[id_column] = all_manual_eds_completions[id_column].str.replace(r'\.0$', "", regex=True)

In [None]:
# Visual check of the normalised manual completion table
all_manual_eds_completions

In [None]:
# Spot check: look up the completion entries of one specific patient id
all_manual_eds_completions[all_manual_eds_completions.patient_id == '157394']

In [None]:
# Attach the manually completed ids to the selected cases.
# Exact duplicate completion rows (e.g. the same patient listed in several completion files)
# are dropped first; otherwise the left merge would emit the registry row once per copy.
stroke_df = stroke_df.merge(all_manual_eds_completions.drop_duplicates(),
                            how='left', on=['patient_id', 'EDS_last_4_digits'])

In [None]:
# Columns exported for the high-frequency data extraction, in output order
selected_columns = [
    # identifiers (registry and manually completed)
    'patient_id', 'EDS_last_4_digits', 'manual_eds', 'manual_patient_id',
    'DOB',
    # hospital stay
    'Arrival at hospital', 'Arrival time',
    'Discharge date', 'Discharge time',
    'Death at hospital date', 'Death at hospital time',
    # stroke onset and referral
    'Time of symptom onset known', 'Onset date', 'Onset time',
    'Referral',
]

In [None]:
# ensure leading zeros are kept
for key_column in ('patient_id', 'EDS_last_4_digits'):
    stroke_df[key_column] = stroke_df[key_column].astype(str)

In [None]:
extraction_target_df = stroke_df.copy()
# for extraction replace missing stroke onset date with admission_date (to have a reference date in case of in-hospital strokes)
# The mask is computed once, before either column is changed, so that the onset time is
# replaced for exactly the rows whose onset date was missing.
onset_date_missing = extraction_target_df['Onset date'].isnull()
extraction_target_df['Onset time'] = extraction_target_df['Onset time'] \
    .mask(onset_date_missing, extraction_target_df['Arrival time'])
extraction_target_df['Onset date'] = extraction_target_df['Onset date'] \
    .mask(onset_date_missing, extraction_target_df['Arrival at hospital'])

In [None]:
# Restrict both exports to the selected columns: the "details" table keeps the registry
# values as they are, the extraction target carries the onset fallback applied above
high_frequency_data_patient_selection_with_details, extraction_target_df = (
    stroke_df[selected_columns],
    extraction_target_df[selected_columns],
)

In [None]:
# Use explicit "Stroke onset" column names in both exports.
# The renamed frame is re-assigned instead of using inplace=True: both frames are column
# selections of another frame, and modifying such a selection in place can raise
# SettingWithCopyWarning.
onset_column_names = {'Onset date': 'Stroke onset date', 'Onset time': 'Stroke onset time'}
high_frequency_data_patient_selection_with_details = \
    high_frequency_data_patient_selection_with_details.rename(columns=onset_column_names)
extraction_target_df = extraction_target_df.rename(columns=onset_column_names)


In [None]:
# Preview of the final patient selection with registry details
high_frequency_data_patient_selection_with_details.head()

In [None]:
import pandas as pd
# excluded patients logs: one entry per selection step, in the order the steps are applied
excluded_patients_df = pd.DataFrame({
    'n_records_screened': n_records_screened,
    'n_patient_refuses_research': n_patient_refuses_research,
    'n_patients_not_ischemic_stroke': n_patients_not_ischemic_stroke,
    'n_not_hospitalised_in_house': n_not_hospitalised_in_house,
    'n_with_hospitalization_duration_less_than_12h': n_with_hospitalization_duration_less_than_12h,
    'n_with_non_acute_stroke': n_with_non_acute_stroke,
    'n_patients_transferred_from_france': n_patients_transferred_from_france,
    'n_patients_with_intra_hospital_stroke': n_patients_with_intra_hospital_stroke,
    # the under-18 exclusion is applied above as well, so it belongs in the log
    # (the counter is 0 when that optional step is switched off)
    'n_patients_under_18': n_patients_under_18,
    'Comments': 'more patients will be excluded programmatically (1. insufficient length of hosp, 2. patient not found in EHR)'
}, index=[0])

# Transpose to one row per criterion with a single value column
excluded_patients_df = excluded_patients_df.T
excluded_patients_df.columns = ['number of patients']

In [None]:
# Set to False to run the notebook without writing any output files
save_data = True

In [None]:
# Preview the extraction target rendered as text
extraction_target_df.astype(str).head()

In [None]:
from modun.file_io import ensure_dir
import time

if save_data:
    timestamp = time.strftime("%d%m%Y_%H%M%S")
    # Write into a fresh timestamped sub-folder. `output_path` itself is left untouched:
    # overwriting it would nest a new folder inside the previous one every time this
    # cell is re-run.
    run_output_path = os.path.join(output_path, f'gsu_extraction_{timestamp}')
    ensure_dir(run_output_path)

    high_frequency_data_patient_selection_with_details.to_csv(
        os.path.join(run_output_path, 'high_frequency_data_patient_selection_with_details.csv'))
    extraction_target_df.to_csv(
        os.path.join(run_output_path, 'high_frequency_data_patient_selection_extraction_target.csv'))
    excluded_patients_df.to_csv(
        os.path.join(run_output_path, f'excluded_patients_df_{timestamp}.csv'))