In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import numpy as np

from preprocessing.utils import create_ehr_case_identification_column
from preprocessing.vitals_preprocessing.vitals_preprocessing import string_to_numeric
from preprocessing.utils import restrict_variable_to_possible_ranges


In [None]:
# Root folder of the per-value extraction. The STROKE_VITALS_DATA_PATH environment variable
# overrides the default, so the notebook can run on machines other than the author's
# without editing this cell.
data_path = os.environ.get(
    'STROKE_VITALS_DATA_PATH',
    '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20220815')
# Only files whose name starts with this prefix are loaded as vitals tables.
vitals_file_start = 'patientvalue'

In [None]:
# Read every vitals export as strings (numeric parsing happens per variable further down).
# os.listdir returns entries in arbitrary order, so the names are sorted to make the row
# order of the concatenated table reproducible between runs and machines.
vitals_files = [pd.read_csv(os.path.join(data_path, f), delimiter=';', encoding='utf-8', dtype=str)
                for f in sorted(os.listdir(data_path))
                if f.startswith(vitals_file_start)]

In [None]:
# Stack all loaded extraction files into one long table with a fresh 0..n-1 index.
vitals_df = pd.concat(vitals_files, ignore_index=True)

In [None]:
# The ranges workbook is looked up in the 'preprocessing' folder located two directory
# levels above the current working directory.
repo_root = os.path.dirname(os.path.dirname(os.path.abspath('')))
possible_value_ranges_file = os.path.join(repo_root, 'preprocessing', 'possible_ranges_for_variables.xlsx')
possible_value_ranges = pd.read_excel(possible_value_ranges_file)

In [None]:
# Add the per-admission key used to group measurements in the rest of the notebook
# (how the key is composed is defined in preprocessing.utils, not visible here).
vitals_df['case_admission_id'] = create_ehr_case_identification_column(vitals_df)


In [None]:
# First rows of the raw table, now including case_admission_id.
vitals_df.head()

In [None]:
# Columns removed from the raw table before the per-variable preprocessing below.
columns_to_drop = [
    'nr', 'patient_id', 'eds_end_4digit', 'eds_manual', 'DOB', 'begin_date',
    'end_date', 'death_date', 'death_hosp', 'eds_final_id',
    'eds_final_begin', 'eds_final_end', 'eds_final_patient_id',
    'eds_final_birth', 'eds_final_death', 'eds_final_birth_str',
    'date_from', 'date_to', 'patient_id_manual', 'stroke_onset_date', 'Referral',
    'match_by', 'multiple_id',
]
vitals_df = vitals_df.drop(columns=columns_to_drop)

In [None]:
# Columns that remain after dropping the ones listed above.
vitals_df.columns

## Extracting relevant information from table (only needed for extractions after August 2022)

In [None]:
# Number of rows per patient_value label, most frequent first (missing labels are not counted).
pv_values = vitals_df.patient_value.value_counts().reset_index()
pv_values

In [None]:
# Labels whose name does not contain 'lab'.
# After reset_index() the label column is called 'index' in pandas < 2.0 but
# 'patient_value' in pandas >= 2.0, so it is addressed by position instead of by name.
pv_values[~pv_values.iloc[:, 0].str.contains('lab')]

__Target parameters__: pv.ta, pv.pulse, pv.spo2, pv.fr, pv.temperature, pv.glycemia, pv.weight

In [None]:
import time

# Time how long it takes to pull a single parameter out of the full table.
# perf_counter is a monotonic clock, so the difference cannot be distorted by
# system clock adjustments.
start = time.perf_counter()

# Rows of the 'patient.sv.poids' parameter; temp_df is reused by the exploration cells below.
temp_df = vitals_df[(vitals_df.patient_value.values == 'patient.sv.poids')]

end = time.perf_counter()
print(end - start)

In [None]:
# Distinct subkeys recorded for the selected parameter, followed by its first rows.
# NOTE(review): head(1000) renders up to 1000 rows; a smaller sample keeps the saved notebook light.
print(temp_df.subkey.unique())
temp_df.head(1000)

In [None]:
# Distinct values stored under the 'Unite' subkey (presumably the measurement unit - confirm).
temp_df[temp_df.subkey == 'Unite'].value.unique()

In [None]:
# Admissions that have at least one row with subkey 'Valeur' in temp_df.
cid2 = temp_df[temp_df.subkey == 'Valeur'].case_admission_id.unique()

In [None]:
# Admissions that have at least one row with subkey 'weight' in temp_df.
# NOTE(review): temp_df is filtered to 'patient.sv.poids' in the timing cell; the weight section
# pairs the 'weight' subkey with 'pv.weight' instead, so this may come back empty - confirm.
cid1 = temp_df[temp_df.subkey == 'weight'].case_admission_id.unique()

In [None]:
# Number of admissions present in cid2 ('Valeur' rows) but absent from cid1 ('weight' rows).
len(set(cid2) - set(cid1))

In [None]:
# Rows of temp_df with subkey 'o2'.
# NOTE(review): the FiO2 section uses the 'o2' subkey with 'pv.spo2', whereas temp_df holds
# 'patient.sv.poids' rows - this looks like a leftover from exploring another parameter.
temp_df[temp_df.subkey == 'o2'].head()

### Preprocessing temperature

In [None]:
if 'subkey' in vitals_df.columns:
    # Long format (one row per parameter / subkey): keep the temperature rows and rename the
    # generic columns to the names used by the old wide format.
    is_temperature = (vitals_df.patient_value == 'pv.temperature') & (vitals_df.subkey == 'temperature')
    temperature_df = (vitals_df[is_temperature]
                      .drop(columns=['patient_value', 'subkey'])
                      .rename(columns={'value': 'temperature', 'unit': 'temp_unit'}))
else:
    temperature_df = vitals_df[['case_admission_id', 'datetime', 'temperature', 'temp_unit']]

# Non-inplace calls return new frames, so no slice of vitals_df is mutated
# (the in-place variants trigger SettingWithCopyWarning).
temperature_df = temperature_df.dropna(subset=['temperature'])

In [None]:
# First rows of the extracted temperature table.
temperature_df.head()

In [None]:
from preprocessing.utils import safe_conversion_to_numeric

# Normalise the raw strings: decimal comma -> decimal point, then strip trailing dots.
temperature_text = temperature_df['temperature'].astype(str)
temperature_text = temperature_text.str.replace(',', '.', regex=False).str.rstrip('.')
temperature_df['temperature'] = temperature_text
# Discard entries that are empty after cleaning.
temperature_df = temperature_df[temperature_df['temperature'] != '']

temperature_df = safe_conversion_to_numeric(temperature_df, 'temperature')

In [None]:
# All temperature readings must share one unit label; abort otherwise.
temperature_units = temperature_df['temp_unit'].unique()
if len(temperature_units) > 1:
    raise ValueError('Temperature units not unified:', temperature_units)
temperature_units

In [None]:
# Summary statistics of the numeric temperature values before range filtering.
temperature_df['temperature'].describe()

In [None]:
from preprocessing.utils import restrict_variable_to_possible_ranges

# Project helper called with the ranges table; as used here it returns the retained frame
# first and the excluded rows second.
temperature_df, excluded_temperature_df = restrict_variable_to_possible_ranges(temperature_df, 'temperature', possible_value_ranges, verbose=True)

In [None]:
# Histogram of the retained temperature values.
temperature_df['temperature'].plot.hist(bins=50)
plt.show()



### Processing glycemia

In [None]:

# Keep the glycemia rows of the long-format table. Non-inplace calls return new frames, so
# no slice of vitals_df is mutated (the in-place variants trigger SettingWithCopyWarning).
is_glycemia = (vitals_df.patient_value == 'pv.glycemia') & (vitals_df.subkey == 'glycemia')
glycemia_df = (vitals_df[is_glycemia]
               .drop(columns=['patient_value', 'subkey'])
               .dropna(subset=['value']))

In [None]:
# All glycemia readings must share one unit label; abort otherwise.
glycemia_units = glycemia_df['unit'].unique()
if len(glycemia_units) > 1:
    raise ValueError('Glycemia units not unified:', glycemia_units)
glycemia_units

In [None]:
# Normalise the raw glycemia strings: decimal comma -> decimal point, then strip trailing dots.
glycemia_text = glycemia_df['value'].astype(str)
glycemia_text = glycemia_text.str.replace(',', '.', regex=False).str.rstrip('.')
glycemia_df['value'] = glycemia_text
# Discard entries that are empty after cleaning.
glycemia_df = glycemia_df[glycemia_df['value'] != '']

In [None]:
# Convert the cleaned strings to numbers with the project helper.
# NOTE: safe_conversion_to_numeric is imported in the temperature section, which must run first.
glycemia_df = safe_conversion_to_numeric(glycemia_df, 'value')


In [None]:
# Rename the value column to 'glucose', the variable name passed to the range check below,
# then split into retained and excluded rows.
glycemia_df = glycemia_df.rename(columns={'value': 'glucose'})
glycemia_df, excluded_glycemia_df = restrict_variable_to_possible_ranges(
    glycemia_df, 'glucose', possible_value_ranges, verbose=True)

### Preprocessing systolic blood pressure


In [None]:
if 'subkey' in vitals_df.columns:
    # Long format: keep the systolic entries of the blood-pressure parameter and rename the
    # generic columns to the names used by the old wide format.
    is_systolic = (vitals_df.patient_value == 'pv.ta') & (vitals_df.subkey == 'sys')
    sys_bp_df = (vitals_df[is_systolic]
                 .drop(columns=['patient_value', 'subkey'])
                 .rename(columns={'value': 'sys', 'unit': 'sys_unit'}))
else:
    sys_bp_df = vitals_df[['case_admission_id', 'datetime', 'sys', 'sys_unit']]

# Non-inplace calls return new frames, so no slice of vitals_df is mutated
# (the in-place variants trigger SettingWithCopyWarning).
sys_bp_df = sys_bp_df.dropna(subset=['sys'])



In [None]:
# Rows without a unit are labelled 'mmHg'.
# The result is assigned back to the column: a chained `df[col].fillna(..., inplace=True)` acts on
# a temporary object and leaves the frame unchanged under pandas Copy-on-Write.
sys_bp_df['sys_unit'] = sys_bp_df['sys_unit'].fillna('mmHg')
if len(sys_bp_df['sys_unit'].unique()) > 1:
    raise ValueError('Systolic blood pressure units not unified:', sys_bp_df['sys_unit'].unique())
sys_bp_df['sys_unit'].unique()

In [None]:
# Normalise the raw systolic strings: decimal comma -> decimal point, then strip trailing dots.
sys_text = sys_bp_df['sys'].astype(str)
sys_text = sys_text.str.replace(',', '.', regex=False).str.rstrip('.')
sys_bp_df['sys'] = sys_text
# Discard entries that are empty after cleaning.
sys_bp_df = sys_bp_df[sys_bp_df['sys'] != '']

In [None]:
# Convert the cleaned strings to numbers with the project helper.
# NOTE: safe_conversion_to_numeric is imported in the temperature section, which must run first.
sys_bp_df = safe_conversion_to_numeric(sys_bp_df, 'sys')

In [None]:
# Split into retained rows and rows excluded by the range check for 'sys'.
sys_bp_df, excluded_sys_bp_df = restrict_variable_to_possible_ranges(sys_bp_df, 'sys', possible_value_ranges, verbose=True)

In [None]:
# Summary statistics of the retained systolic values.
sys_bp_df['sys'].describe()


In [None]:
# Histogram of the retained systolic values.
sys_bp_df['sys'].plot.hist(bins=50)
plt.show()

### Preprocessing diastolic blood pressure


In [None]:
if 'subkey' in vitals_df.columns:
    # Long format: keep the diastolic entries of the blood-pressure parameter and rename the
    # generic columns to the names used by the old wide format.
    is_diastolic = (vitals_df.patient_value == 'pv.ta') & (vitals_df.subkey == 'dia')
    dia_bp_df = (vitals_df[is_diastolic]
                 .drop(columns=['patient_value', 'subkey'])
                 .rename(columns={'value': 'dia', 'unit': 'dia_unit'}))
else:
    dia_bp_df = vitals_df[['case_admission_id', 'datetime', 'dia', 'dia_unit']]

# Non-inplace calls return new frames, so no slice of vitals_df is mutated
# (the in-place variants trigger SettingWithCopyWarning).
dia_bp_df = dia_bp_df.dropna(subset=['dia'])


In [None]:
# Display the extracted diastolic table (pandas truncates long frames when rendering).
dia_bp_df

In [None]:
# Rows without a unit are labelled 'mmHg'.
# The result is assigned back to the column: a chained `df[col].fillna(..., inplace=True)` acts on
# a temporary object and leaves the frame unchanged under pandas Copy-on-Write.
dia_bp_df['dia_unit'] = dia_bp_df['dia_unit'].fillna('mmHg')
if len(dia_bp_df['dia_unit'].unique()) > 1:
    # Message typo fixed ('Diasystolic' -> 'Diastolic').
    raise ValueError('Diastolic blood pressure units not unified:', dia_bp_df['dia_unit'].unique())

In [None]:
# Convert the raw 'dia' strings to numbers with the project helper imported at the top of the notebook.
dia_bp_df = string_to_numeric(dia_bp_df, 'dia')

In [None]:
# Split into retained rows and rows excluded by the range check for 'dia'.
dia_bp_df, excluded_dia_bp_df = restrict_variable_to_possible_ranges(dia_bp_df, 'dia', possible_value_ranges, verbose=True)

In [None]:
# Summary statistics of the numeric columns of the retained diastolic table.
dia_bp_df.describe()

In [None]:
# Histogram of the retained diastolic values.
dia_bp_df['dia'].plot.hist(bins=50)
plt.show()

### Preprocessing mean blood pressure

In [None]:
if 'subkey' in vitals_df.columns:
    # Long format: keep the mean-pressure entries of the blood-pressure parameter and rename
    # the generic columns to the names used by the old wide format.
    is_mean_bp = (vitals_df.patient_value == 'pv.ta') & (vitals_df.subkey == 'mean')
    mean_bp_df = (vitals_df[is_mean_bp]
                  .drop(columns=['patient_value', 'subkey'])
                  .rename(columns={'value': 'mean', 'unit': 'mean_unit'}))
else:
    mean_bp_df = vitals_df[['case_admission_id', 'datetime', 'mean', 'mean_unit']]

# Non-inplace calls return new frames, so no slice of vitals_df is mutated
# (the in-place variants trigger SettingWithCopyWarning).
mean_bp_df = mean_bp_df.dropna(subset=['mean'])

In [None]:
# Display the extracted mean blood pressure table (pandas truncates long frames when rendering).
mean_bp_df

In [None]:
# Rows without a unit are labelled 'mmHg'.
# The result is assigned back to the column: a chained `df[col].fillna(..., inplace=True)` acts on
# a temporary object and leaves the frame unchanged under pandas Copy-on-Write.
mean_bp_df['mean_unit'] = mean_bp_df['mean_unit'].fillna('mmHg')
if len(mean_bp_df['mean_unit'].unique()) > 1:
    raise ValueError('Mean blood pressure units not unified:', mean_bp_df['mean_unit'].unique())

In [None]:
# Convert the raw 'mean' strings to numbers with the project helper imported at the top of the notebook.
mean_bp_df = string_to_numeric(mean_bp_df, 'mean')

In [None]:
# Split into retained rows and rows excluded by the range check for 'mean'.
mean_bp_df, excluded_mean_bp_df = restrict_variable_to_possible_ranges(mean_bp_df, 'mean', possible_value_ranges, verbose=True)

In [None]:
# Summary statistics of the numeric columns of the retained mean blood pressure table.
mean_bp_df.describe()

In [None]:
# Histogram of the retained mean blood pressure values.
mean_bp_df['mean'].plot.hist(bins=50)
plt.show()

### Preprocessing heart rate


In [None]:
if 'subkey' in vitals_df.columns:
    # Long format: keep the pulse rows and rename the generic columns to the names used by
    # the old wide format.
    is_pulse = (vitals_df.patient_value == 'pv.pulse') & (vitals_df.subkey == 'pulse')
    pulse_df = (vitals_df[is_pulse]
                .drop(columns=['patient_value', 'subkey'])
                .rename(columns={'value': 'pulse', 'unit': 'pulse_unit'}))
else:
    pulse_df = vitals_df[['case_admission_id', 'datetime', 'pulse', 'pulse_unit']]

# Non-inplace calls return new frames, so no slice of vitals_df is mutated
# (the in-place variants trigger SettingWithCopyWarning).
pulse_df = pulse_df.dropna(subset=['pulse'])


In [None]:
# Display the extracted pulse table (pandas truncates long frames when rendering).
pulse_df

In [None]:
# Unit listed for 'pulse' in the ranges table; every unit label is mapped onto it.
pulse_target_unit = possible_value_ranges[possible_value_ranges.variable_label == 'pulse'].units.iloc[0]
# Spellings treated as the same unit.
pulse_equivalent_units = ['bpm', 'puls./min.', '/min']
if pulse_target_unit not in pulse_equivalent_units:
    raise ValueError(f'Pulse target unit as defined in {possible_value_ranges_file}, not part of {pulse_equivalent_units}')

# Rows without a unit receive the target unit, then every equivalent spelling is mapped to it.
# - The result is assigned back to the column: chained `df[col].method(..., inplace=True)` acts
#   on a temporary object and leaves the frame unchanged under pandas Copy-on-Write.
# - Passing the list makes replace() match whole cell values literally. The former '|'-joined
#   regex treated '.' as a wildcard and also rewrote substrings of unrelated unit labels.
pulse_df['pulse_unit'] = (pulse_df['pulse_unit']
                          .fillna(pulse_target_unit)
                          .replace(pulse_equivalent_units, pulse_target_unit))
if len(pulse_df['pulse_unit'].unique()) > 1:
    raise ValueError('Pulse units not unified:', pulse_df['pulse_unit'].unique())

In [None]:
# Inspect raw pulse entries that contain a '-' before the numeric conversion.
pulse_df[pulse_df.pulse.str.contains('-')]

In [None]:
# Exclude rows whose pulse entry is exactly '-', then convert the remaining strings to numbers.
is_dash_only = pulse_df['pulse'] == '-'
pulse_df = string_to_numeric(pulse_df[~is_dash_only], 'pulse')


In [None]:
# Split into retained rows and rows excluded by the range check for 'pulse'.
pulse_df, excluded_pulse_df = restrict_variable_to_possible_ranges(pulse_df, 'pulse', possible_value_ranges, verbose=True)

In [None]:
# Summary statistics of the numeric columns of the retained pulse table.
pulse_df.describe()

In [None]:
# Histogram of the retained pulse values. Only the 'pulse' column is plotted, as in the
# temperature and blood-pressure sections, so the figure does not depend on which other
# numeric columns the frame happens to carry.
pulse_df['pulse'].plot.hist(bins=50)
plt.show()

### Preprocessing respiratory rate


In [None]:
if 'subkey' in vitals_df.columns:
    # Long format: keep the respiratory-rate rows and rename the generic columns to the names
    # used by the old wide format.
    # NOTE(review): unlike the other vitals, no subkey filter is applied here - confirm that
    # 'pv.fr' only carries a single subkey.
    resp_rate_df = (vitals_df[vitals_df.patient_value == 'pv.fr']
                    .drop(columns=['patient_value', 'subkey'])
                    .rename(columns={'value': 'fr', 'unit': 'fr_unit'}))
else:
    resp_rate_df = vitals_df[['case_admission_id', 'datetime', 'fr', 'fr_unit']]

# Non-inplace calls return new frames, so no slice of vitals_df is mutated
# (the in-place variants trigger SettingWithCopyWarning).
resp_rate_df = resp_rate_df.dropna(subset=['fr'])


In [None]:
from preprocessing.vitals_preprocessing.vitals_preprocessing import harmonize_units

# Unit spellings treated as equivalent for respiratory rate. harmonize_units is a project helper
# (implementation not visible here) that receives them together with the ranges table.
resp_rate_equivalent_units = ['/min', 'cycles/min.']
resp_rate_df = harmonize_units(resp_rate_df, 'fr', 'fr_unit', possible_value_ranges, resp_rate_equivalent_units)

In [None]:
# Convert the raw 'fr' strings to numbers with the project helper imported at the top of the notebook.
resp_rate_df = string_to_numeric(resp_rate_df, 'fr')

In [None]:
# Split into retained rows and rows excluded by the range check for 'fr'.
resp_rate_df, excluded_resp_rate_df = restrict_variable_to_possible_ranges(resp_rate_df, 'fr', possible_value_ranges, verbose=True)

In [None]:
# Inspect the respiratory-rate rows rejected by the range check.
excluded_resp_rate_df

In [None]:
# Summary statistics of the numeric columns of the retained respiratory-rate table.
resp_rate_df.describe()

In [None]:
# Histogram of the retained respiratory-rate values. Only the 'fr' column is plotted, as in
# the temperature and blood-pressure sections, so the figure does not depend on which other
# numeric columns the frame happens to carry.
resp_rate_df['fr'].plot.hist(bins=50)
plt.show()

### Preprocessing oxygen saturation


In [None]:
if 'subkey' in vitals_df.columns:
    # Long format: keep the saturation rows and rename the generic columns to the names used
    # by the old wide format.
    is_spo2 = (vitals_df.patient_value == 'pv.spo2') & (vitals_df.subkey == 'spo2')
    spo2_df = (vitals_df[is_spo2]
               .drop(columns=['patient_value', 'subkey'])
               .rename(columns={'value': 'spo2', 'unit': 'spo2_unit'}))
else:
    spo2_df = vitals_df[['case_admission_id', 'datetime', 'spo2', 'spo2_unit']]

# Non-inplace calls return new frames, so no slice of vitals_df is mutated
# (the in-place variants trigger SettingWithCopyWarning).
spo2_df = spo2_df.dropna(subset=['spo2'])


In [None]:
# Harmonise the SpO2 unit column with the project helper; '%' is the only spelling passed as equivalent.
# NOTE: harmonize_units is imported in the respiratory-rate section, which must run first.
spo2_df = harmonize_units(spo2_df, 'spo2', 'spo2_unit', possible_value_ranges, ['%'])

In [None]:
# Convert the raw 'spo2' strings to numbers with the project helper imported at the top of the notebook.
spo2_df = string_to_numeric(spo2_df, 'spo2')

In [None]:
# Split into retained rows and rows excluded by the range check for 'spo2'.
spo2_df, excluded_spo2_df = restrict_variable_to_possible_ranges(spo2_df, 'spo2', possible_value_ranges, verbose=True)

In [None]:
# Summary statistics of the numeric columns of the retained SpO2 table.
spo2_df.describe()

In [None]:
# Histogram of the retained SpO2 values. Only the 'spo2' column is plotted, as in the
# temperature and blood-pressure sections, so the figure does not depend on which other
# numeric columns the frame happens to carry.
spo2_df['spo2'].plot.hist(bins=50)
plt.show()

### Processing FiO2

In [None]:
# Oxygen-support entries are stored under the 'pv.spo2' parameter with subkey 'o2' or 'fio2'.
# The 'subkey' column is kept for now and dropped after range filtering.
# Non-inplace calls return new frames, so no slice of vitals_df is mutated
# (the in-place variants trigger SettingWithCopyWarning).
is_o2_support = (vitals_df.patient_value == 'pv.spo2') & vitals_df.subkey.isin(['o2', 'fio2'])
fio2_df = (vitals_df[is_o2_support]
           .drop(columns=['patient_value'])
           .rename(columns={'value': 'FIO2', 'unit': 'FIO2_unit'})
           .dropna(subset=['FIO2']))

In [None]:
# Convert the raw 'FIO2' strings to numbers with the project helper imported at the top of the notebook.
fio2_df = string_to_numeric(fio2_df, 'FIO2')


In [None]:
# Display the oxygen-support table before flow values are converted (pandas truncates long frames).
fio2_df

In [None]:
# Converting O2 flow (L/min) to FiO2 (%).
# All masks are derived once from the untouched flow values. Previously the zero-flow rows
# were first set to 21 and then, still labelled 'L/min', run through the formula again
# (20 + 4 * 21 = 104), which turned room-air measurements into out-of-range values.
is_flow = fio2_df['FIO2_unit'] == 'L/min'
flow = fio2_df['FIO2']

implausible_flow = is_flow & ((flow > 15) | (flow < 0))
no_flow = is_flow & (flow == 0)
convertible_flow = is_flow & flow.notnull() & ~implausible_flow & ~no_flow

# Computed before any assignment so it is based on the original flow values.
fio2_from_flow = 20 + 4 * flow[convertible_flow]

# Flows outside 0-15 L/min are discarded.
fio2_df.loc[implausible_flow, 'FIO2'] = np.nan
# Set to 21% when flow == 0
fio2_df.loc[no_flow, 'FIO2'] = 21
fio2_df.loc[convertible_flow, 'FIO2'] = fio2_from_flow



In [None]:
# The former flow rows now hold values converted by the previous cell, so relabel their unit as '%'.
fio2_df.loc[fio2_df['FIO2_unit'] == 'L/min', 'FIO2_unit'] = '%'

In [None]:
# Harmonise the FiO2 unit column with the project helper; '%' is the only spelling passed as equivalent.
# NOTE: harmonize_units is imported in the respiratory-rate section, which must run first.
fio2_df = harmonize_units(fio2_df, 'FIO2', 'FIO2_unit', possible_value_ranges, ['%'])


In [None]:
# Split into retained rows and rows excluded by the range check for 'FIO2'.
fio2_df, excluded_fio2_df = restrict_variable_to_possible_ranges(
    fio2_df, 'FIO2', possible_value_ranges, verbose=True)

In [None]:
# Inspect the FiO2 rows rejected by the range check.
excluded_fio2_df

In [None]:
# Remove rows left without a FiO2 value, then discard the 'subkey' column.
fio2_df = fio2_df.dropna(subset=['FIO2']).drop(columns=['subkey'])

In [None]:
# Summary statistics of the numeric columns of the retained FiO2 table.
fio2_df.describe()

In [None]:
# Histogram of the retained FiO2 values. Only the 'FIO2' column is plotted, as in the
# temperature and blood-pressure sections, so the figure does not depend on which other
# numeric columns the frame happens to carry.
fio2_df['FIO2'].plot.hist(bins=50)
plt.show()

### Preprocessing weight



In [None]:
if 'subkey' in vitals_df.columns:
    # Long format: weight is taken from two parameters, 'pv.weight' (subkey 'weight') and
    # 'patient.sv.poids' (subkey 'Valeur'). The generic columns are renamed to the names used
    # by the old wide format.
    is_pv_weight = (vitals_df.patient_value == 'pv.weight') & (vitals_df.subkey == 'weight')
    is_sv_poids = (vitals_df.patient_value == 'patient.sv.poids') & (vitals_df.subkey == 'Valeur')
    weight_df = (vitals_df[is_pv_weight | is_sv_poids]
                 .drop(columns=['patient_value', 'subkey'])
                 .rename(columns={'value': 'weight', 'unit': 'weight_unit'}))
else:
    weight_df = vitals_df[['case_admission_id', 'datetime', 'weight', 'weight_unit']]

# Non-inplace calls return new frames, so no slice of vitals_df is mutated
# (the in-place variants trigger SettingWithCopyWarning).
weight_df = weight_df.dropna(subset=['weight'])

In [None]:
# Harmonise the weight unit column with the project helper; 'kg' is the only spelling passed as equivalent.
# NOTE: harmonize_units is imported in the respiratory-rate section, which must run first.
weight_df = harmonize_units(weight_df, 'weight', 'weight_unit', possible_value_ranges, ['kg'])

In [None]:
# Convert the raw 'weight' strings to numbers with the project helper imported at the top of the notebook.
weight_df = string_to_numeric(weight_df, 'weight')

In [None]:
# Split into retained rows and rows excluded by the range check for 'weight'.
weight_df, excluded_weight_df = restrict_variable_to_possible_ranges(weight_df, 'weight', possible_value_ranges, verbose=True)

In [None]:
# Remove rows that are identical across all columns (weight is collected from two parameters above).
weight_df = weight_df.drop_duplicates()

In [None]:
# Summary statistics of the numeric columns of the retained weight table.
weight_df.describe()

In [None]:
# Histogram of the retained weight values. Only the 'weight' column is plotted, as in the
# temperature and blood-pressure sections, so the figure does not depend on which other
# numeric columns the frame happens to carry.
weight_df['weight'].plot.hist(bins=50)
plt.show()

# Testing entire pipeline

In [None]:
from preprocessing.vitals_preprocessing.vitals_preprocessing import preprocess_vitals

# Run the packaged pipeline on the raw table.
# NOTE: by this point vitals_df has had case_admission_id added and the administrative
# columns dropped by the cells near the top of the notebook.
preprocessed_vitals_df = preprocess_vitals(vitals_df, verbose=True)

In [None]:
# First rows of the pipeline output.
preprocessed_vitals_df.head()

In [None]:
# Distinct vital-sign names present in the pipeline output.
preprocessed_vitals_df.vital_name.unique()

In [None]:
from matplotlib.dates import DateFormatter
import seaborn as sns
import matplotlib.pyplot as plt

# Spot check: plot one vital sign over time for a randomly chosen admission.
pa_id = np.random.choice(preprocessed_vitals_df.case_admission_id.unique())
vital = 'heart_rate'
temp = preprocessed_vitals_df[(preprocessed_vitals_df['case_admission_id'] == pa_id) & (preprocessed_vitals_df.vital_name == vital)].copy()
# Timestamps are parsed from day.month.year hour:minute strings.
temp['datetime'] = pd.to_datetime(temp['datetime'], format='%d.%m.%Y %H:%M')
ax = sns.scatterplot(x='datetime', y='vital_value', data=temp, hue='vital_value', legend=False)
# Define the date format
date_form = DateFormatter("%m-%d-%Y")
ax.xaxis.set_major_formatter(date_form)
ax.tick_params(axis="x", rotation=45)
# Anchor the axis at 0 but let the upper limit follow the data: the former fixed
# top of 100 silently hid every reading above 100.
ax.set_ylim(bottom=0)
ax.set_title(f'{vital} for subj {pa_id}')
plt.show()