In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import numpy as np

from preprocessing.utils import create_ehr_case_identification_column

In [None]:
# Folder holding the per-value extraction files. The location can be overridden
# with the STROKE_VITALS_DATA_PATH environment variable so the notebook is not
# tied to one machine; when the variable is unset the original path is used.
data_path = os.environ.get(
    'STROKE_VITALS_DATA_PATH',
    '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20211110',
)
# Only files whose name starts with this prefix are loaded as vitals files.
vitals_file_start = 'patientvalue'

In [None]:
# Read every file in data_path whose name starts with the vitals prefix.
# os.listdir returns names in arbitrary order, so they are sorted first to keep
# the row order of the concatenated frame reproducible between runs.
vitals_files = [pd.read_csv(os.path.join(data_path, file_name), delimiter=';', encoding='utf-8')
                for file_name in sorted(os.listdir(data_path))
                if file_name.startswith(vitals_file_start)]

In [None]:
# Stack all loaded files into a single frame; ignore_index=True discards the
# per-file row labels and assigns a fresh 0..n-1 index.
vitals_df = pd.concat(vitals_files, ignore_index=True)

In [None]:
# The ranges workbook is looked up in the 'preprocessing' folder located two
# directory levels above the current working directory.
base_dir = os.path.dirname(os.path.dirname(os.path.abspath('')))
possible_value_ranges_file = os.path.join(
    base_dir,
    'preprocessing',
    'possible_ranges_for_variables.xlsx',
)
possible_value_ranges = pd.read_excel(possible_value_ranges_file)

In [None]:
def restrict_variable_to_possible_ranges(df, variable_name, possible_value_ranges, verbose=False):
    """
    Restricts a variable to the possible ranges in the possible_value_ranges dataframe.

    The first row of possible_value_ranges whose 'variable_label' equals
    variable_name supplies the bounds. Values strictly below 'Min' or strictly
    above 'Max' are treated as out of range; values equal to a bound are kept.

    :param df: dataframe containing a numeric column named variable_name
    :param variable_name: name of the column to filter, also used as the lookup
        key in possible_value_ranges['variable_label']
    :param possible_value_ranges: dataframe with the columns 'variable_label',
        'Min' and 'Max'
    :param verbose: if True, print how many observations were out of range
    :return: tuple (clean_df, excluded_df); clean_df holds the rows whose value is
        present and within range, excluded_df holds the out-of-range rows. Rows
        whose value was already missing appear in neither frame. Both are copies
        and df itself is left untouched.
    :raises IndexError: if no range is defined for variable_name
    """
    matching_ranges = possible_value_ranges[possible_value_ranges['variable_label'] == variable_name]
    if matching_ranges.empty:
        # Same exception type as the previous bare .iloc[0] lookup, but with a message
        # that names the missing variable.
        raise IndexError(f'No possible value range defined for variable "{variable_name}"')
    variable_range = matching_ranges.iloc[0]

    values = df[variable_name]
    # Comparisons against NaN are False, so missing values are never flagged as
    # out of range and do not inflate the reported count.
    out_of_range = (values < variable_range['Min']) | (values > variable_range['Max'])
    if verbose:
        print(f'Excluding {out_of_range.sum()} observations because out of range')

    excluded_df = df[out_of_range].copy()
    # Only missing values of the filtered variable remove a row; a missing entry
    # in an unrelated column no longer discards a valid measurement.
    clean_df = df[~out_of_range].dropna(subset=[variable_name]).copy()
    return clean_df, excluded_df



In [None]:
# Store the result of the project helper as a new 'case_admission_id' column.
# NOTE(review): this runs before the raw identifier/date columns are dropped in
# the next cell — presumably the helper reads some of them; confirm before reordering.
vitals_df['case_admission_id'] = create_ehr_case_identification_column(vitals_df)


In [None]:
# Columns that are not used by any of the per-variable sections below.
columns_to_drop = ['nr', 'patient_id', 'eds_end_4digit', 'eds_manual', 'DOB', 'begin_date',
       'end_date', 'death_date', 'death_hosp', 'eds_final_id',
       'eds_final_begin', 'eds_final_end', 'eds_final_patient_id',
       'eds_final_birth', 'eds_final_death', 'eds_final_birth_str',
       'date_from', 'date_to', 'patient_value']
# Rebind instead of dropping in place, and skip columns that are already gone,
# so re-running this cell does not fail with a KeyError.
vitals_df = vitals_df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Show which columns remain after the drop above.
vitals_df.columns

Preprocessing temperature

In [None]:
# Temperature subset: keep only rows where all four selected columns are present.
temperature_df = vitals_df.loc[:, ['case_admission_id', 'datetime', 'temperature', 'temp_unit']].dropna(how='any')

In [None]:
# convert ',' to '.' in temperature column
temperature_df['temperature'] = temperature_df['temperature'].astype(str).apply(lambda t: t.replace(',', '.'))
# remove trailing '.'
temperature_df['temperature'] = temperature_df['temperature'].apply(lambda t: t.rstrip('.'))
temperature_df['temperature'] = temperature_df['temperature'].astype(float)

In [None]:
# List the distinct unit labels recorded for temperature.
temperature_df['temp_unit'].unique()

In [None]:
# Summary statistics of the parsed temperatures, before range filtering.
temperature_df['temperature'].describe()

In [None]:
# Keep the in-range temperature rows; the frame of excluded rows is discarded.
temperature_df, _ = restrict_variable_to_possible_ranges(
    df=temperature_df,
    variable_name='temperature',
    possible_value_ranges=possible_value_ranges,
    verbose=True,
)

In [None]:
# Histogram of the range-filtered temperatures, labelled so the figure can be
# read on its own.
ax = temperature_df['temperature'].plot.hist(bins=50)
ax.set(title='Temperature distribution', xlabel='temperature')
plt.show()



Preprocessing systolic blood pressure


In [None]:
# Systolic pressure subset: keep only rows where all four selected columns are present.
sys_bp_df = vitals_df.loc[:, ['case_admission_id', 'datetime', 'sys', 'sys_unit']].dropna(how='any')

In [None]:
# Split the systolic readings into in-range rows and excluded rows.
sys_bp_df, excluded_sys_bp_df = restrict_variable_to_possible_ranges(
    df=sys_bp_df,
    variable_name='sys',
    possible_value_ranges=possible_value_ranges,
    verbose=True,
)

In [None]:
# List the distinct unit labels recorded for systolic blood pressure.
sys_bp_df['sys_unit'].unique()

In [None]:
# Summary statistics of the range-filtered systolic values.
sys_bp_df['sys'].describe()


In [None]:
# Histogram of the range-filtered systolic values, labelled so the figure can
# be read on its own.
ax = sys_bp_df['sys'].plot.hist(bins=50)
ax.set(title='Systolic blood pressure distribution', xlabel='sys')
plt.show()

Preprocessing diastolic blood pressure


In [None]:
# Diastolic pressure subset: keep only rows where all four selected columns are present.
dia_bp_df = vitals_df.loc[:, ['case_admission_id', 'datetime', 'dia', 'dia_unit']].dropna(how='any')

In [None]:
# Split the diastolic readings into in-range rows and excluded rows.
dia_bp_df, excluded_dia_bp_df = restrict_variable_to_possible_ranges(
    df=dia_bp_df,
    variable_name='dia',
    possible_value_ranges=possible_value_ranges,
    verbose=True,
)

In [None]:
# List the distinct unit labels recorded for diastolic blood pressure.
dia_bp_df['dia_unit'].unique()

In [None]:
# Summary statistics of the numeric columns after range filtering.
dia_bp_df.describe()

In [None]:
# Histogram of the range-filtered diastolic values, labelled so the figure can
# be read on its own.
ax = dia_bp_df['dia'].plot.hist(bins=50)
ax.set(title='Diastolic blood pressure distribution', xlabel='dia')
plt.show()

Preprocessing mean blood pressure

In [None]:
# Mean pressure subset: keep only rows where all four selected columns are present.
mean_bp_df = vitals_df.loc[:, ['case_admission_id', 'datetime', 'mean', 'mean_unit']].dropna(how='any')

In [None]:
# Split the mean pressure readings into in-range rows and excluded rows.
mean_bp_df, excluded_mean_bp_df = restrict_variable_to_possible_ranges(
    df=mean_bp_df,
    variable_name='mean',
    possible_value_ranges=possible_value_ranges,
    verbose=True,
)

In [None]:
# List the distinct unit labels recorded for mean blood pressure.
mean_bp_df['mean_unit'].unique()

In [None]:
# Summary statistics of the numeric columns after range filtering.
mean_bp_df.describe()

In [None]:
# Histogram of the range-filtered mean pressure values, labelled so the figure
# can be read on its own.
ax = mean_bp_df['mean'].plot.hist(bins=50)
ax.set(title='Mean blood pressure distribution', xlabel='mean')
plt.show()

Preprocessing heart rate


In [None]:
# Heart rate subset: keep only rows where all four selected columns are present.
pulse_df = vitals_df.loc[:, ['case_admission_id', 'datetime', 'pulse', 'pulse_unit']].dropna(how='any')

In [None]:
# Use '.' as decimal separator in the textual pulse readings.
pulse_text = pulse_df['pulse'].astype(str).str.replace(',', '.', regex=False)
pulse_df = pulse_df.assign(pulse=pulse_text)
# Entries consisting of a lone '.' carry no number and cannot be parsed; drop them.
pulse_df = pulse_df[pulse_df['pulse'] != '.']
# assign() returns a new frame, so the float conversion does not write into the
# filtered slice above (which would raise pandas' SettingWithCopyWarning).
pulse_df = pulse_df.assign(pulse=pulse_df['pulse'].astype(float))

In [None]:
# Split the pulse readings into in-range rows and excluded rows.
pulse_df, excluded_pulse_df = restrict_variable_to_possible_ranges(
    df=pulse_df,
    variable_name='pulse',
    possible_value_ranges=possible_value_ranges,
    verbose=True,
)

In [None]:
# Print the raw unit labels before they are overwritten: as a bare expression
# that is not the last line of the cell, the result of unique() was never shown.
print(pulse_df['pulse_unit'].unique())
# Give every row the same unit label. Only the label changes, values are not converted.
# NOTE(review): confirm that all raw labels printed above denote a per-minute rate.
pulse_df = pulse_df.assign(pulse_unit='/min')

In [None]:
# Summary statistics of the numeric columns after range filtering.
pulse_df.describe()

In [None]:
# Plot the pulse column explicitly (as done for the blood pressure variables)
# rather than every numeric column of the frame, and label the figure.
ax = pulse_df['pulse'].plot.hist(bins=50)
ax.set(title='Heart rate distribution', xlabel='pulse')
plt.show()

Preprocessing respiratory rate


In [None]:
# Respiratory rate subset: keep only rows where all four selected columns are present.
resp_rate_df = vitals_df.loc[:, ['case_admission_id', 'datetime', 'fr', 'fr_unit']].dropna(how='any')

In [None]:
# Use '.' as decimal separator in the textual respiratory rate readings.
fr_text = resp_rate_df['fr'].astype(str).str.replace(',', '.', regex=False)
resp_rate_df = resp_rate_df.assign(fr=fr_text)
# Entries consisting of a lone '.' carry no number and cannot be parsed; drop them.
resp_rate_df = resp_rate_df[resp_rate_df['fr'] != '.']
# assign() returns a new frame, so the float conversion does not write into the
# filtered slice above (which would raise pandas' SettingWithCopyWarning).
resp_rate_df = resp_rate_df.assign(fr=resp_rate_df['fr'].astype(float))

In [None]:
# Split the respiratory rate readings into in-range rows and excluded rows.
resp_rate_df, excluded_resp_rate_df = restrict_variable_to_possible_ranges(
    df=resp_rate_df,
    variable_name='fr',
    possible_value_ranges=possible_value_ranges,
    verbose=True,
)

In [None]:
# Inspect the respiratory rate rows that the range filter removed.
excluded_resp_rate_df

In [None]:
# Print the raw unit labels before they are overwritten: as a bare expression
# that is not the last line of the cell, the result of unique() was never shown.
print(resp_rate_df['fr_unit'].unique())
# Give every row the same unit label. Only the label changes, values are not converted.
# NOTE(review): confirm that all raw labels printed above denote a per-minute rate.
resp_rate_df = resp_rate_df.assign(fr_unit='/min')

In [None]:
# Summary statistics of the numeric columns after range filtering.
resp_rate_df.describe()

In [None]:
# Plot the respiratory rate column explicitly (as done for the blood pressure
# variables) rather than every numeric column of the frame, and label the figure.
ax = resp_rate_df['fr'].plot.hist(bins=50)
ax.set(title='Respiratory rate distribution', xlabel='fr')
plt.show()

Preprocessing oxygen saturation


In [None]:
# Oxygen saturation subset: keep only rows where all four selected columns are present.
spo2_df = vitals_df.loc[:, ['case_admission_id', 'datetime', 'spo2', 'spo2_unit']].dropna(how='any')

In [None]:
# Split the oxygen saturation readings into in-range rows and excluded rows.
spo2_df, excluded_spo2_df = restrict_variable_to_possible_ranges(
    df=spo2_df,
    variable_name='spo2',
    possible_value_ranges=possible_value_ranges,
    verbose=True,
)

In [None]:
# List the distinct unit labels recorded for oxygen saturation.
spo2_df['spo2_unit'].unique()

In [None]:
# Summary statistics of the numeric columns after range filtering.
spo2_df.describe()

In [None]:
# Plot the spo2 column explicitly (as done for the blood pressure variables)
# rather than every numeric column of the frame, and label the figure.
ax = spo2_df['spo2'].plot.hist(bins=50)
ax.set(title='Oxygen saturation distribution', xlabel='spo2')
plt.show()

Preprocessing weight



In [None]:
# Weight subset: keep only rows where all four selected columns are present.
weight_df = vitals_df.loc[:, ['case_admission_id', 'datetime', 'weight', 'weight_unit']].dropna(how='any')

In [None]:
# List the distinct unit labels recorded for weight.
weight_df['weight_unit'].unique()

In [None]:
# Split the weight readings into in-range rows and excluded rows.
weight_df, excluded_weight_df = restrict_variable_to_possible_ranges(
    df=weight_df,
    variable_name='weight',
    possible_value_ranges=possible_value_ranges,
    verbose=True,
)

In [None]:
# Summary statistics of the numeric columns after range filtering.
weight_df.describe()

In [None]:
# Plot the weight column explicitly (as done for the blood pressure variables)
# rather than every numeric column of the frame, and label the figure.
ax = weight_df['weight'].plot.hist(bins=50)
ax.set(title='Weight distribution', xlabel='weight')
plt.show()

In [None]:
from matplotlib.dates import DateFormatter
import seaborn as sns
import matplotlib.pyplot as plt

# Case to inspect and the vital sign to plot for it; change these three values
# to look at another case or variable.
# NOTE(review): pa_id looks like a real case identifier — confirm it may stay in the notebook.
pa_id = '9857_17062020'
selected_vital_df = weight_df
vital = 'weight'

# Work on a copy so parsing the timestamps does not modify the source frame.
temp = selected_vital_df[(selected_vital_df['case_admission_id'] == pa_id)].copy()
temp['datetime'] = pd.to_datetime(temp['datetime'], format='%d.%m.%Y %H:%M')
ax = sns.scatterplot(x='datetime', y=vital, data=temp, hue=vital, legend=False)
# Define the date format
date_form = DateFormatter("%m-%d-%Y")
ax.xaxis.set_major_formatter(date_form)
ax.tick_params(axis="x", rotation=45)
# Anchor the y axis at zero but leave the upper limit automatic: a fixed upper
# limit of 100 hid larger values and did not suit other choices of `vital`.
ax.set_ylim(bottom=0)
ax.set_title(f'{vital} over time for case {pa_id}')
plt.show()