# Exclude cases with mismatch between EHR and registry data

*Exclusion criteria based on each case's EHR sampling start date*
- EHR sampling start date needs to be at most 10 days before stroke onset (so that the 14-day period includes 72h of stroke monitoring) [when stroke onset is not available, the arrival date from the registry is used]
- EHR sampling start date should be at most 7 days after reference date in registry (stroke onset or arrival date, whichever is later)

*Exclusion criteria for individual samples*
- Samples occurring before the day of stroke onset should be excluded


__This code is now in:__ `patient_selection/restrict_to_patient_selection.py`


In [None]:
import pandas as pd

In [None]:
# Machine-specific absolute paths; adjust before running on another computer.
# raw_data_path: main directory passed to load_data_from_main_dir to load the vitals files.
raw_data_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/Extraction_20211110'
# patient_selection_path: CSV read with pd.read_csv; the cells below use its columns
# 'patient_id', 'EDS_last_4_digits', 'Stroke onset date' and 'Arrival at hospital'.
patient_selection_path = '/Users/jk1/temp/opsum_extration_output/high_frequency_data_patient_selection_with_details.csv'

In [None]:
from preprocessing.vitals_preprocessing.vitals_preprocessing import preprocess_vitals
from preprocessing.variable_assembly.variable_database_assembly import load_data_from_main_dir

# Load the raw vitals extraction and run the project's vitals preprocessing
vitals_file_start = 'patientvalue'
vitals_df = load_data_from_main_dir(raw_data_path, vitals_file_start)
vitals_df = preprocess_vitals(vitals_df, verbose=True)

# Keep only the columns used downstream, rename them to the generic sample
# schema (sample_date / value / sample_label) and tag the data source.
# rename/assign are chained so that a new frame is returned instead of mutating
# a column subset in place, which avoids pandas' SettingWithCopyWarning.
vitals_df = (
    vitals_df[['case_admission_id', 'datetime', 'vital_value', 'vital_name']]
    .rename(columns={'vital_name': 'sample_label', 'vital_value': 'value', 'datetime': 'sample_date'})
    .assign(source='EHR')
)

In [None]:
# Read the patient selection table, keeping every column as a string
patient_selection_df = pd.read_csv(patient_selection_path, dtype=str)

# Build the case identifier as "<patient_id>_<EDS_last_4_digits>"
patient_id_str = patient_selection_df['patient_id'].astype(str)
eds_suffix_str = patient_selection_df['EDS_last_4_digits'].astype(str)
patient_selection_df['case_admission_id'] = patient_id_str + '_' + eds_suffix_str

In [None]:
# Keep only the vitals rows whose case_admission_id is part of the patient selection
is_selected_case = vitals_df['case_admission_id'].isin(patient_selection_df['case_admission_id'])
restricted_to_selection_df = vitals_df[is_selected_case]

In [None]:
# strptime pattern (day.month.year hour:minute) passed to pd.to_datetime when
# parsing sample dates in the cells below.
# NOTE: the name is misspelled ("datatime") but is kept because later cells reference it.
datatime_format = '%d.%m.%Y %H:%M'

In [None]:
# Work on a copy so that the restricted frame itself is left untouched,
# and parse the sample dates into timestamps
temp_df = restricted_to_selection_df.copy()
temp_df['sample_date'] = pd.to_datetime(temp_df['sample_date'], format=datatime_format)

# Earliest sample timestamp of each case_admission_id, as a two-column frame
# (case_admission_id, first_sample_date)
first_sample_date = (
    temp_df.groupby('case_admission_id')['sample_date']
    .min()
    .rename('first_sample_date')
    .reset_index()
)

In [None]:
# Preview the first rows of the per-case first-sample table
first_sample_date.head()

In [None]:
# Attach the patient selection columns (including the registry dates) to each case
first_sample_date = first_sample_date.merge(patient_selection_df, on='case_admission_id', how='left')

# Reference date: stroke onset, falling back to arrival at hospital when onset is missing
onset_or_arrival = first_sample_date['Stroke onset date'].fillna(first_sample_date['Arrival at hospital'])
first_sample_date['event_start_date_reference'] = onset_or_arrival

# Whole days between the reference date and the first EHR sample
# (negative when sampling started before the reference date)
first_sample_ts = pd.to_datetime(first_sample_date['first_sample_date'], format=datatime_format)
onset_reference_ts = pd.to_datetime(first_sample_date['event_start_date_reference'], format='%Y%m%d')
first_sample_date['registry_onset_to_first_sample_date_days'] = (first_sample_ts - onset_reference_ts).dt.days

In [None]:
# Inspect the chosen reference date next to the two registry dates it is derived from
first_sample_date[['event_start_date_reference', 'Stroke onset date', 'Arrival at hospital']]

In [None]:
import pandas as pd

# Scratch check: parse a clock time that includes seconds and render it as HH:MM.
# The format has to cover the seconds field ('%H:%M:%S'); the previous '%H:%M'
# did not match the input and only worked through the deprecated
# infer_datetime_format fallback, which current pandas no longer applies.
pd.to_datetime('14:18:00', format='%H:%M:%S').strftime('%H:%M')


In [None]:
# Inspect, per case, the day offset between the reference date and the first EHR sample
first_sample_date[['case_admission_id', 'registry_onset_to_first_sample_date_days', 'event_start_date_reference', 'first_sample_date', 'Arrival at hospital']]

In [None]:
# Cases whose EHR sampling started more than 10 days before the reference date
started_too_early = first_sample_date['registry_onset_to_first_sample_date_days'] < -10
cid_sampled_too_early = first_sample_date.loc[started_too_early, 'case_admission_id'].unique()
cid_sampled_too_early

In [None]:
# Upper-bound reference for the sampling start: stroke onset or arrival at
# hospital, whichever is later
onset_ts = pd.to_datetime(first_sample_date['Stroke onset date'], format='%Y%m%d')
arrival_ts = pd.to_datetime(first_sample_date['Arrival at hospital'], format='%Y%m%d')
first_sample_date['delta_onset_arrival'] = (onset_ts - arrival_ts).dt.total_seconds()

# Onset is used only when it is strictly later than arrival; in every other
# case (including a NaN delta, i.e. when one of the two dates is missing) the
# arrival date is used
onset_is_later = first_sample_date['delta_onset_arrival'] > 0
first_sample_date['sampling_start_upper_bound_reference'] = first_sample_date['Stroke onset date'].where(
    onset_is_later, first_sample_date['Arrival at hospital'])
first_sample_date[['sampling_start_upper_bound_reference', 'Arrival at hospital', 'Stroke onset date', 'delta_onset_arrival']]

In [None]:
# Whole days between the upper-bound reference date and the first EHR sample
first_sample_start_ts = pd.to_datetime(first_sample_date['first_sample_date'], format=datatime_format)
upper_bound_ts = pd.to_datetime(first_sample_date['sampling_start_upper_bound_reference'], format='%Y%m%d')
first_sample_date['registry_upper_bound_to_first_sample_date_days'] = (first_sample_start_ts - upper_bound_ts).dt.days

In [None]:
# Inspect, per case, the day offset between the upper-bound reference date and the first EHR sample
first_sample_date[['case_admission_id', 'registry_upper_bound_to_first_sample_date_days', 'sampling_start_upper_bound_reference', 'first_sample_date', 'Arrival at hospital', 'Stroke onset date']]

In [None]:
# Cases whose EHR sampling started more than 7 days after the upper-bound reference date
started_too_late = first_sample_date['registry_upper_bound_to_first_sample_date_days'] > 7
cid_sampled_too_late = first_sample_date.loc[started_too_late, 'case_admission_id'].unique()

In [None]:
# Remove every sample belonging to a case flagged as sampled too early or too late
sample_case_ids = temp_df['case_admission_id']
is_excluded_case = sample_case_ids.isin(cid_sampled_too_early) | sample_case_ids.isin(cid_sampled_too_late)
temp_df = temp_df[~is_excluded_case]

In [None]:
# Combined size of the two exclusion lists (cases sampled too early plus cases sampled too late)
len(cid_sampled_too_early) + len(cid_sampled_too_late)

In [None]:
# Compute, for every sample, the whole-day offset from the reference date
# (stroke onset, or arrival at hospital when onset is missing). A negative
# offset marks a sample taken before the reference day.
registry_dates_df = patient_selection_df[['case_admission_id', 'Stroke onset date', 'Arrival at hospital']]
temp_df = temp_df.merge(registry_dates_df, on='case_admission_id', how='left')
temp_df['event_start_date_reference'] = temp_df['Stroke onset date'].fillna(temp_df['Arrival at hospital'])

sample_ts = pd.to_datetime(temp_df['sample_date'], format=datatime_format)
sample_reference_ts = pd.to_datetime(temp_df['event_start_date_reference'], format='%Y%m%d')
temp_df['delta_sample_date_stroke_onset'] = (sample_ts - sample_reference_ts).dt.days

In [None]:
# Keep only samples from the reference day onwards (day offset >= 0), then
# remove the helper columns that were only needed to compute the offset.
# The drop is chained (not in place) so that a new frame is returned instead of
# mutating a filtered slice, which avoids pandas' SettingWithCopyWarning.
helper_columns = ['delta_sample_date_stroke_onset', 'event_start_date_reference', 'Stroke onset date', 'Arrival at hospital']
temp_df = temp_df[temp_df['delta_sample_date_stroke_onset'] >= 0].drop(columns=helper_columns)

In [None]:
from preprocessing.patient_selection.restrict_to_patient_selection import restrict_to_patient_selection

# Run the packaged version of this notebook's logic on the preprocessed vitals,
# using the same patient selection file.
# NOTE(review): the third positional argument (True) is presumably a verbosity
# flag — confirm against the function signature.
restrict_to_patient_selection(vitals_df, patient_selection_path, True)