In [None]:
import pandas as pd
import numpy as np
import os
from preprocessing.utils import create_ehr_case_identification_column, create_registry_case_identification_column
from preprocessing.variable_assembly.variable_database_assembly import load_data_from_main_dir

In [None]:
# Notebook configuration: input and output locations.
# NOTE(review): these are absolute paths on one machine; adapt them before running elsewhere.
# Directory handed to load_data_from_main_dir for every EHR table read in this notebook
raw_ehr_data_dir = '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20221117'
# CSV with the selected patients (read with pd.read_csv, all columns as strings)
patient_selection_path = '/Users/jk1/temp/opsum_extraction_output/high_frequency_data_patient_selection_with_details.csv'
# Directory in which the final TSV of patients with missing EHR data is written
out_dir = '/Users/jk1/temp/opsum_extraction_output/ehr_extraction_errors/Extraction_20221117'

In [None]:
# Load the patient selection; dtype=str keeps every column as text.
patient_selection_df = pd.read_csv(patient_selection_path, dtype=str)
patient_selection_df['case_admission_id'] = create_registry_case_identification_column(patient_selection_df)

# Column labels as they are right now (the identifier column is already included);
# used at the end of the notebook to re-attach the registry details to flagged cases.
patient_selection_df_initial_columns = patient_selection_df.columns

# Distinct case identifiers, used to restrict every EHR table to the selection.
cids_in_selection = set(patient_selection_df['case_admission_id'].unique())

In [None]:
# Lab table: load, tag each row with its case identifier, keep only selected cases.
# NOTE(review): load_data_from_main_dir presumably gathers the files whose name starts
# with the given prefix - confirm against its implementation.
lab_file_start = 'labo'
lab_df = load_data_from_main_dir(raw_ehr_data_dir, lab_file_start)
lab_df['case_admission_id'] = create_ehr_case_identification_column(lab_df)
lab_df = lab_df.loc[lab_df['case_admission_id'].isin(cids_in_selection)]

In [None]:
# Scales table: load, tag each row with its case identifier, keep only selected cases,
# and expose the timestamp column under the shared name 'sample_date'.
scales_file_start = 'scale'
scales_df = load_data_from_main_dir(raw_ehr_data_dir, scales_file_start)
scales_df['case_admission_id'] = create_ehr_case_identification_column(scales_df)
scales_df = (
    scales_df
    .loc[scales_df['case_admission_id'].isin(cids_in_selection)]
    .rename(columns={'event_date': 'sample_date'})
)


In [None]:
# Ventilation table: load, tag each row with its case identifier, keep only selected cases,
# and expose the timestamp column under the shared name 'sample_date'.
ventilation_file_start = 'ventilation'
ventilation_df = load_data_from_main_dir(raw_ehr_data_dir, ventilation_file_start)
ventilation_df['case_admission_id'] = create_ehr_case_identification_column(ventilation_df)
ventilation_df = (
    ventilation_df
    .loc[ventilation_df['case_admission_id'].isin(cids_in_selection)]
    .rename(columns={'datetime': 'sample_date'})
)


In [None]:
# Vitals table: load, tag each row with its case identifier, keep only selected cases,
# and expose the timestamp column under the shared name 'sample_date'.
vitals_file_start = 'patientvalue'
vitals_df = load_data_from_main_dir(raw_ehr_data_dir, vitals_file_start)
vitals_df['case_admission_id'] = create_ehr_case_identification_column(vitals_df)
vitals_df = (
    vitals_df
    .loc[vitals_df['case_admission_id'].isin(cids_in_selection)]
    .rename(columns={'datetime': 'sample_date'})
)

In [None]:
# Preview the first rows of the filtered vitals table
vitals_df.head()

In [None]:
# Preview the first rows of the filtered scales table
scales_df.head()

In [None]:
# Stack the (case, timestamp) pairs of all four EHR tables into one long frame.
# Row labels of the individual tables are kept as they are.
ehr_sample_dates = pd.concat([
    source_df[['case_admission_id', 'sample_date']]
    for source_df in (lab_df, scales_df, ventilation_df, vitals_df)
])

In [None]:
# Preview the stacked (case, timestamp) pairs
ehr_sample_dates.head()

In [None]:
# Parse the timestamps with the explicit day.month.year hour:minute format so that the
# per-case min/max computed afterwards compare chronologically.
ehr_sample_dates['sample_date'] = pd.to_datetime(ehr_sample_dates['sample_date'], format='%d.%m.%Y %H:%M')

In [None]:
# Earliest EHR timestamp per case, as a two-column frame
# (case_admission_id, ehr_first_sample_date).
ehr_first_sample_date = (
    ehr_sample_dates
    .groupby('case_admission_id')['sample_date']
    .min()
    .rename('ehr_first_sample_date')
    .reset_index()
)

In [None]:
# Latest EHR timestamp per case, as a two-column frame
# (case_admission_id, ehr_last_sample_date).
ehr_last_sample_date = (
    ehr_sample_dates
    .groupby('case_admission_id')['sample_date']
    .max()
    .rename('ehr_last_sample_date')
    .reset_index()
)

In [None]:
# Join first and last timestamp per case and derive the time span covered by EHR samples.
ehr_sample_date_range = (
    ehr_first_sample_date
    .merge(ehr_last_sample_date, on='case_admission_id')
    .assign(
        ehr_sample_range=lambda dates: pd.to_datetime(dates['ehr_last_sample_date'])
        - pd.to_datetime(dates['ehr_first_sample_date'])
    )
)

In [None]:
# Show the cases whose EHR samples span less than one day
ehr_sample_date_range[ehr_sample_date_range['ehr_sample_range'] < pd.Timedelta('1 days 00:00:00')]

In [None]:
# Upper bound for the START of the registry sampling period: stroke onset or arrival at
# hospital, whichever is later. Both registry columns hold dates as '%Y%m%d' strings.
# delta_onset_arrival is (onset - arrival) in seconds; it is NaN when either date is missing.
patient_selection_df['delta_onset_arrival'] = (
    pd.to_datetime(patient_selection_df['Stroke onset date'], format='%Y%m%d')
    - pd.to_datetime(patient_selection_df['Arrival at hospital'], format='%Y%m%d')
).dt.total_seconds()

# Vectorized selection (replaces the row-wise apply); the raw date strings are kept as-is.
#  - onset later than arrival    -> onset
#  - otherwise / onset missing   -> arrival
#  - arrival missing             -> fall back to onset, so a known date is never discarded
patient_selection_df['registry_sampling_start_upper_bound_reference'] = (
    patient_selection_df['Stroke onset date']
    .where(patient_selection_df['delta_onset_arrival'] > 0, patient_selection_df['Arrival at hospital'])
    .fillna(patient_selection_df['Stroke onset date'])
)

In [None]:
# End of the registry sampling period: discharge date, or the in-hospital death date
# when no discharge date is recorded.
patient_selection_df['registry_sampling_end'] = (
    patient_selection_df['Discharge date']
    .fillna(patient_selection_df['Death at hospital date'])
)

In [None]:
# Length of the registry sampling period: end minus start reference, both parsed from
# '%Y%m%d' date strings.
patient_selection_df['registry_sample_range'] = (
    pd.to_datetime(patient_selection_df['registry_sampling_end'], format='%Y%m%d')
    .sub(pd.to_datetime(patient_selection_df['registry_sampling_start_upper_bound_reference'], format='%Y%m%d'))
)

In [None]:
# Inspect the registry-derived sampling period per case
patient_selection_df[['case_admission_id', 'registry_sample_range', 'registry_sampling_start_upper_bound_reference', 'registry_sampling_end']]

In [None]:
# Put EHR-derived and registry-derived sampling periods side by side, one row per case.
# The merge uses the default inner join, so cases absent from either side are not kept.
merged_data = ehr_sample_date_range.merge(
    patient_selection_df[[
        'case_admission_id',
        'registry_sample_range',
        'registry_sampling_start_upper_bound_reference',
        'registry_sampling_end',
    ]],
    on='case_admission_id',
)
merged_data.head()

In [None]:
# How much longer the registry period is than the span covered by EHR samples
merged_data['delta_sample_range'] = merged_data['registry_sample_range'].sub(merged_data['ehr_sample_range'])
merged_data.head()

In [None]:
# Flag cases with probably missing EHR data: the registry period exceeds the EHR span
# by more than 1 day AND the EHR samples cover less than 3 days.
patients_with_missing_data = merged_data.loc[
    (merged_data['delta_sample_range'] > pd.Timedelta('1 days'))
    & (merged_data['ehr_sample_range'] < pd.Timedelta('3 days'))
]
patients_with_missing_data

In [None]:
# Spot check: look up one specific case among the flagged patients
patients_with_missing_data[patients_with_missing_data.case_admission_id == '224339_5465']

Patients with a probably incorrect discharge date (registry stay shorter than 1 day although EHR samples span at least 1 day)

In [None]:
# Cases not already flagged as missing data whose registry period is shorter than 1 day
# while their EHR samples span at least 1 day.
merged_data.loc[
    (~merged_data['case_admission_id'].isin(patients_with_missing_data['case_admission_id']))
    & (merged_data['registry_sample_range'] < pd.Timedelta('1 days'))
    & (merged_data['ehr_sample_range'] >= pd.Timedelta('1 days'))
]

In [None]:
# Re-attach the registry columns captured right after loading (they include
# 'case_admission_id', which serves as the join key) to the flagged cases.
patients_with_missing_data_with_info = patients_with_missing_data.merge(
    patient_selection_df[patient_selection_df_initial_columns],
    on='case_admission_id',
)

In [None]:
# Preview the flagged cases together with their registry details
patients_with_missing_data_with_info.head()

In [None]:
# Export the flagged cases as a tab-separated file.
# Create the output directory first: to_csv does not create missing directories and would
# fail on a fresh machine where out_dir does not exist yet.
os.makedirs(out_dir, exist_ok=True)
patients_with_missing_data_with_info.to_csv(
    os.path.join(out_dir, 'patients_with_some_missing_EHR_data_with_info.tsv'),
    index=False,
    sep='\t',
)