In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import numpy as np

from preprocessing.utils import create_ehr_case_identification_column

In [None]:
# Directory holding the per-value extraction files. Set the STROKE_UNIT_DATA_PATH environment
# variable to run the notebook on another machine; without it the original local path is used.
data_path = os.environ.get(
    'STROKE_UNIT_DATA_PATH',
    '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20220815',
)
# File-name prefix used to select the ventilation extracts inside `data_path`.
ventilation_file_start = 'ventilation'

In [None]:
# Read every file in `data_path` whose name starts with `ventilation_file_start`; all columns are
# kept as strings. os.listdir returns names in arbitrary order, so they are sorted to keep the row
# order of the concatenated frame reproducible between runs and machines.
ventilation_files = [
    pd.read_csv(os.path.join(data_path, file_name), delimiter=';', encoding='utf-8', dtype=str)
    for file_name in sorted(os.listdir(data_path))
    if file_name.startswith(ventilation_file_start)
]

In [None]:
# Stack the individual extracts row-wise into one frame with a fresh 0..n-1 index.
ventilation_df = pd.concat(objs=ventilation_files, axis=0, ignore_index=True)

In [None]:
# Columns removed once the case identifier has been derived.
columns_to_drop = [
    'nr', 'patient_id', 'eds_end_4digit', 'eds_manual', 'DOB', 'begin_date',
    'end_date', 'death_date', 'death_hosp', 'eds_final_id',
    'eds_final_begin', 'eds_final_end', 'eds_final_patient_id',
    'eds_final_birth', 'eds_final_death', 'eds_final_birth_str',
    'date_from', 'date_to', 'patient_id_manual', 'stroke_onset_date',
    'Referral', 'match_by', 'multiple_id',
]

# The identifier is built before the drop, while all original columns are still present.
ventilation_df['case_admission_id'] = create_ehr_case_identification_column(ventilation_df)
ventilation_df = ventilation_df.drop(columns=columns_to_drop)

In [None]:
# Show the column labels currently present in ventilation_df.
ventilation_df.columns

In [None]:
# The ranges workbook is looked up in `preprocessing/`, two directory levels above the current
# working directory (os.path.abspath('') resolves to the working directory).
working_dir = os.path.abspath('')
two_levels_up = os.path.dirname(os.path.dirname(working_dir))
possible_value_ranges_file = os.path.join(
    two_levels_up, 'preprocessing', 'possible_ranges_for_variables.xlsx'
)
possible_value_ranges = pd.read_excel(possible_value_ranges_file)


In [None]:
# FIO2 was read as text (dtype=str); cast it to float so it can be compared and filled numerically.
ventilation_df = ventilation_df.astype({'FIO2': float})

In [None]:
# Summary statistics of the FIO2 column after the cast to float.
ventilation_df['FIO2'].describe()

In [None]:
# Distinct values found in the FIO2_unit column.
ventilation_df['FIO2_unit'].unique()

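## Converting O2 flow to FiO2

Where FiO2 is missing, it is derived from the O2 column: values recorded in % are used directly, and flow rates in L/min are converted with the approximation FiO2 ≈ 20 + 4 × flow (L/min).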

In [None]:
# O2 values use a decimal comma; normalise to a decimal point and cast to float in one chain.
ventilation_df['O2'] = (
    ventilation_df['O2']
    .astype(str)
    .str.replace(',', '.', regex=False)
    .astype(float)
)

In [None]:
# Number of rows per O2 unit label.
ventilation_df['O2_unit'].value_counts()

In [None]:
# Rows whose O2 is already a percentage and that have no FiO2: use the O2 value as the FiO2.
percent_o2_without_fio2 = (ventilation_df['O2_unit'] == '%') & ventilation_df['FIO2'].isnull()
ventilation_df.loc[percent_o2_without_fio2, 'FIO2'] = ventilation_df.loc[percent_o2_without_fio2, 'O2']

In [None]:
# Inspect rows with a negative O2 flow rate (unit L/min).
is_flow_rate = ventilation_df['O2_unit'] == 'L/min'
ventilation_df.loc[is_flow_rate & (ventilation_df['O2'] < 0)]

In [None]:
# Flow rates (L/min) above 15 or below 0 are replaced by NaN.
is_flow_rate = ventilation_df['O2_unit'] == 'L/min'
flow_out_of_range = (ventilation_df['O2'] > 15) | (ventilation_df['O2'] < 0)
ventilation_df.loc[is_flow_rate & flow_out_of_range, 'O2'] = np.nan


In [None]:
# Estimate FiO2 (%) from the O2 flow rate for rows measured in L/min that have a valid flow but no
# FiO2 value yet: FiO2 = 20 + 4 * flow.
# The mask is computed once, BEFORE any assignment. Previously the zero-flow rule re-tested
# `FIO2.isnull()` after the formula had already filled those rows with 20, so it never matched and
# zero-flow rows kept 20 instead of 21.
flow_without_fio2 = (
    (ventilation_df['O2_unit'] == 'L/min')
    & ventilation_df['O2'].notnull()
    & ventilation_df['FIO2'].isnull()
)
ventilation_df.loc[flow_without_fio2, 'FIO2'] = 20 + 4 * ventilation_df.loc[flow_without_fio2, 'O2']
# A flow of 0 L/min is recorded as 21% rather than the 20% the formula yields.
ventilation_df.loc[flow_without_fio2 & (ventilation_df['O2'] == 0), 'FIO2'] = 21


In [None]:
# Ventilation settings (and their unit columns) that are not used further in this notebook.
variables_to_drop = [
    'air', 'air_unit', 'peep', 'peep_unit', 'startingFlow', 'startingFlow_unit',
    'flow', 'flow_unit', 'temperature', 'temperature_unit',
    'ai', 'ai_unit', 'epap', 'epap_unit', 'ipap', 'ipap_unit',
    'slop', 'slop_unit', 'ti_max', 'ti_max_unit', 'ti_min', 'ti_min_unit',
    'trigger_insp', 'trigger_insp_unit', 'duration', 'duration_unit',
]
ventilation_df = ventilation_df.drop(columns=variables_to_drop)

In [None]:
# Preview the first rows of ventilation_df.
ventilation_df.head()

In [None]:
from preprocessing.utils import safe_conversion_to_numeric

# Build one frame per variable (identifier, value, unit, timestamp), keeping only rows where the
# value is present, then pass each through safe_conversion_to_numeric for its value column.
fio2_measurements = ventilation_df[['case_admission_id', 'FIO2', 'FIO2_unit', 'datetime']].dropna(subset=['FIO2'])
spo2_measurements = ventilation_df[['case_admission_id', 'spo2', 'spo2_unit', 'datetime']].dropna(subset=['spo2'])

fio2_df = safe_conversion_to_numeric(fio2_measurements, 'FIO2')
spo2_df = safe_conversion_to_numeric(spo2_measurements, 'spo2')

In [None]:
# Display fio2_df as returned by safe_conversion_to_numeric (rows with missing FIO2 were dropped before the call).
fio2_df

In [None]:
from preprocessing.utils import restrict_variable_to_possible_ranges

# Apply restrict_variable_to_possible_ranges to FIO2; the call returns the retained frame followed
# by the excluded rows.
fio2_restriction = restrict_variable_to_possible_ranges(
    fio2_df, 'FIO2', possible_value_ranges, verbose=True
)
fio2_df, excluded_fio2_df = fio2_restriction

In [None]:
# Apply restrict_variable_to_possible_ranges to spo2; the call returns the retained frame followed
# by the excluded rows.
spo2_restriction = restrict_variable_to_possible_ranges(
    spo2_df, 'spo2', possible_value_ranges, verbose=True
)
spo2_df, excluded_spo2_df = spo2_restriction

In [None]:
# Preview the first rows of fio2_df.
fio2_df.head()

In [None]:
# Preview the first rows of spo2_df.
spo2_df.head()

In [None]:
# Summary statistics of the numeric columns of fio2_df.
fio2_df.describe()

In [None]:
# Histogram of the FiO2 values, 50 bins.
fio2_df['FIO2'].plot(kind='hist', bins=50)
plt.show()

In [None]:
# Histogram of the SpO2 values, 50 bins.
spo2_df['spo2'].plot(kind='hist', bins=50)
plt.show()

In [None]:
# Summary statistics of the numeric columns of spo2_df.
spo2_df.describe()

In [None]:
import seaborn as sns
# Distribution plot of FiO2 with a kernel density estimate overlaid; the legend is disabled.
g = sns.displot(
    data=fio2_df,
    x="FIO2",
    kde=True,
    legend=False,
)
plt.show()

## Impute missing FiO2 to 21%

In [None]:
# Load the EDS extract (eds_j1.csv) from the same extraction directory; all columns stay strings.
eds_csv_path = os.path.join(data_path, 'eds_j1.csv')
eds_df = pd.read_csv(eds_csv_path, delimiter=';', encoding='utf-8', dtype=str)

In [None]:
# Add the same case identifier column to the EDS frame that ventilation_df received.
eds_df = eds_df.assign(case_admission_id=create_ehr_case_identification_column(eds_df))

In [None]:
# Number of case identifiers present in eds_df that have no row in fio2_df.
len(set(eds_df['case_admission_id']).difference(fio2_df['case_admission_id']))

In [None]:
# Admissions that appear in eds_df but have no FiO2 value in fio2_df; each receives one
# imputed row with FiO2 = 21 %.
case_admission_ids_with_no_fio2 = set(eds_df['case_admission_id']) - set(fio2_df['case_admission_id'])

# Admission start per eds_df row: 'eds_final_begin' when present, otherwise 'begin_date'.
eds_admission_start = eds_df['eds_final_begin'].fillna(eds_df['begin_date'])

# Build the frame from the matching eds_df rows so every case_admission_id keeps the start date of
# its OWN row. Previously the ids came from an unordered set with a fresh 0..k-1 index, and the
# datetime series was assigned by index alignment against eds_df, which attached the dates of
# unrelated admissions.
# If an id occurs on several eds_df rows, the first occurrence is kept.
room_air_fio2_df = (
    eds_df.loc[eds_df['case_admission_id'].isin(case_admission_ids_with_no_fio2), ['case_admission_id']]
    .assign(FIO2=21, FIO2_unit='%', datetime=eds_admission_start)
    .drop_duplicates(subset='case_admission_id')
    .reset_index(drop=True)
)

In [None]:
# Display the imputed FiO2 rows for admissions that had no FiO2 value.
room_air_fio2_df