In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Input locations: raw monitoring extraction (CSV) and patient selection workbook (Excel)
monitoring_path = (
    '/Users/jk1/temp/mimic/extraction/'
    'monitoring_df.csv'
)
patient_selection_path = (
    '/Users/jk1/OneDrive - unige.ch/stroke_research/'
    'geneva_stroke_unit_dataset/data/mimic_data/'
    'combined_notes_labels.xlsx'
)

In [None]:
# Read the monitoring extraction CSV into a DataFrame
monitoring_df = pd.read_csv(filepath_or_buffer=monitoring_path)

In [None]:
# Read the patient selection workbook into a DataFrame
selection_df = pd.read_excel(io=patient_selection_path)


In [None]:
# The ranges workbook is resolved two directory levels above the current working
# directory (os.path.abspath('') is the absolute path of the cwd).
possible_value_ranges_file = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(''))),
    'preprocessing/possible_ranges_for_variables.xlsx',
)
possible_value_ranges = pd.read_excel(io=possible_value_ranges_file)

In [None]:
# Number of rows per monitoring label (value_counts returns a Series despite the `_df` suffix)
all_label_df = monitoring_df['label'].value_counts()
all_label_df

In [None]:
# all_label_df.to_csv('./all_monitoring_labels.csv')

### GCS extraction

In [None]:
# Labels under which Glasgow Coma Scale entries are looked up in monitoring_df
GCS_components = [
    'GCS - Eye Opening',
    'GCS - Motor Response',
    'GCS - Verbal Response',
    'Eye Opening',
    'Verbal Response',
    'Motor Response',
    'GCS Total',
]

In [None]:
# For every GCS label: how many non-null values exist, and which distinct values occur
for label in GCS_components:
    values_for_label = monitoring_df.loc[monitoring_df['label'] == label, 'value']
    print(label, values_for_label.count())
    print(values_for_label.unique())

In [None]:
# Labels that carry the verbal component of the GCS
verbal_components = [
    'GCS - Verbal Response',
    'Verbal Response',
]

In [None]:
# Admissions present in monitoring_df that have no row with a verbal-response label
set(monitoring_df['hadm_id']).difference(
    set(monitoring_df.loc[monitoring_df['label'].isin(verbal_components), 'hadm_id'])
)

In [None]:
# Fraction of selected admissions that have at least one GCS-labelled row
len(
    set(monitoring_df.loc[monitoring_df['label'].isin(GCS_components), 'hadm_id'])
    .intersection(set(selection_df['hadm_id']))
) / len(set(selection_df['hadm_id']))

### FiO2 extraction

Target: FiO2 expressed as a percentage (21–100 %)

In [None]:
# Labels holding an FiO2 value directly
FiO2_labels = [
    'FiO2 Set',
    'Inspired O2 Fraction',
]
# Labels holding an oxygen flow, converted to FiO2 further down
O2_flow_labels = [
    'O2 Flow',
    'O2 Flow (lpm)',
    'O2 Flow (lpm) #2',
]
# Every oxygen-related label, FiO2 labels first
O2_labels = [*FiO2_labels, *O2_flow_labels]

In [None]:
# Oxygen-related rows that carry a numeric value.
# The frame is built in one chain and explicitly copied: calling
# dropna(inplace=True) on a filtered slice of monitoring_df (and assigning into
# that slice later) triggers SettingWithCopyWarning and leaves it ambiguous
# whether the parent frame is affected.
fio2_df = (
    monitoring_df[monitoring_df.label.isin(O2_labels)]
    .dropna(subset=['valuenum'])
    .copy()
)

In [None]:
# Scale 'FiO2 Set' values by 100 so they are expressed in percent
# NOTE(review): this presumes 'FiO2 Set' is charted as a fraction — confirm against the data.
# Re-running this cell scales the values again.
is_fio2_set = fio2_df['label'] == 'FiO2 Set'
fio2_df.loc[is_fio2_set, 'valuenum'] = fio2_df.loc[is_fio2_set, 'valuenum'] * 100

In [None]:
# Converting O2 flow to FiO2 (%)

# Work on an independent frame so the assignments below only affect fio2_df
fio2_df = fio2_df.copy()

# All masks are derived from the ORIGINAL flow values before anything is
# overwritten. Previously the 21 written for zero flow was itself passed
# through the flow formula afterwards, producing 20 + 4 * 21 = 104.
is_flow = fio2_df.label.isin(O2_flow_labels)
flow = fio2_df.valuenum.copy()
invalid_flow = is_flow & ((flow > 15) | (flow < 0))
zero_flow = is_flow & (flow == 0)
positive_flow = is_flow & (flow > 0) & (flow <= 15)

# Flows outside the 0-15 range are treated as invalid
fio2_df.loc[invalid_flow, 'valuenum'] = np.nan
# Set to 21% when flow == 0
fio2_df.loc[zero_flow, 'valuenum'] = 21
# Remaining valid flows: FiO2 = 20 + 4 * flow
fio2_df.loc[positive_flow, 'valuenum'] = 20 + 4 * flow[positive_flow]

# Unify unit, string value and label for all oxygen rows
fio2_df['valueuom'] = '%'
fio2_df['value'] = fio2_df.valuenum.astype(str)
fio2_df['label'] = 'FIO2'
fio2_df = fio2_df.drop_duplicates()

### O2 Saturation preprocessing

In [None]:
# Labels under which oxygen saturation is looked up
o2_sat_labels = [
    'O2 saturation pulseoxymetry',
    'SpO2',
]

In [None]:
# Rows whose label is one of the oxygen saturation labels; preview the first rows
spo2_df = monitoring_df.loc[monitoring_df['label'].isin(o2_sat_labels)]
spo2_df.head(n=5)

In [None]:
# Per saturation label: units in use and summary statistics of the numeric values
for label in o2_sat_labels:
    rows_for_label = monitoring_df[monitoring_df['label'].isin([label])]
    print(label)
    print(rows_for_label['valueuom'].unique())
    print(rows_for_label['valuenum'].describe())

In [None]:
# Drop rows without a numeric value, then exact duplicate rows.
# Chained and copied instead of dropna(inplace=True): spo2_df is a filtered
# slice of monitoring_df, and in-place edits / column assignments on such a
# slice trigger SettingWithCopyWarning.
spo2_df = spo2_df.dropna(subset=['valuenum']).drop_duplicates().copy()
# NOTE(review): range restriction is currently disabled; the helper and `verbose`
# are not defined in the cells visible here.
# spo2_df, _ = restrict_variable_to_possible_ranges(spo2_df, 'spo2', possible_value_ranges,
#                                                   verbose=verbose)
spo2_df['label'] = 'oxygen_saturation'

### systolic_blood_pressure

In [None]:
# Labels under which systolic blood pressure is looked up
sys_bp_labels = [
    'Arterial BP [Systolic]',
    'Non Invasive Blood Pressure systolic',
    'NBP [Systolic]',
    'Arterial Blood Pressure systolic',
    'ART BP Systolic',
    'Arterial BP #2 [Systolic]',
    'Manual Blood Pressure Systolic Left',
    'Manual Blood Pressure Systolic Right',
    'Manual BP [Systolic]',
]

In [None]:
# Summary statistics of the numeric values, one systolic label at a time
for label in sys_bp_labels:
    rows_for_label = monitoring_df[monitoring_df['label'].isin([label])]
    print(label)
    print(rows_for_label['valuenum'].describe())


In [None]:
# Rows whose label is one of the systolic blood pressure labels
sys_bp_df = monitoring_df.loc[monitoring_df['label'].isin(sys_bp_labels)]

In [None]:
# dtype of the numeric value column for systolic rows
sys_bp_df['valuenum'].dtype

### diastolic_blood_pressure

In [None]:
# Labels under which diastolic blood pressure is looked up
dia_bp_labels = [
    'Arterial BP [Diastolic]',
    'Non Invasive Blood Pressure diastolic',
    'NBP [Diastolic]',
    'Arterial Blood Pressure diastolic',
    'ART BP Diastolic',
    'Arterial BP #2 [Diastolic]',
    'Manual BP [Diastolic]',
    'Manual Blood Pressure Diastolic Left',
    'Manual Blood Pressure Diastolic Right',
]

In [None]:
# Rows whose label is one of the diastolic blood pressure labels
dia_bp_df = monitoring_df.loc[monitoring_df['label'].isin(dia_bp_labels)]

In [None]:
# dtype of the numeric value column for diastolic rows
dia_bp_df['valuenum'].dtype

### mean_blood_pressure

In [None]:
# Labels under which mean blood pressure is looked up
mean_bp_labels = [
    'Arterial BP Mean',
    'Non Invasive Blood Pressure mean',
    'NBP Mean',
    'Arterial Blood Pressure mean',
    'ART BP mean',
    'Arterial BP Mean #2',
    'Manual BP Mean(calc)',
]

In [None]:
# Rows whose label is one of the mean blood pressure labels
mean_bp_df = monitoring_df.loc[monitoring_df['label'].isin(mean_bp_labels)]


In [None]:
# dtype of the numeric value column for mean blood pressure rows
mean_bp_df['valuenum'].dtype

In [None]:
# Summary statistics of the numeric values for mean blood pressure rows
mean_bp_df['valuenum'].describe()

### heart rate

In [None]:
# Labels under which heart rate is looked up (a single label at present)
heart_rate_labels = [
    'Heart Rate',
]

In [None]:
# Rows labelled with the first (and currently only) heart rate label
heart_rate_df = monitoring_df.loc[monitoring_df['label'] == heart_rate_labels[0]]

In [None]:
# Distinct units recorded for heart rate rows
heart_rate_df['valueuom'].unique()

### respiratory rate

In [None]:
# Labels under which respiratory rate is looked up
resp_rate_labels = [
    'Respiratory Rate',
    'Respiratory Rate (spontaneous)',
    'Respiratory Rate (Total)',
]

In [None]:
# Inspect the rows for the third respiratory rate label ('Respiratory Rate (Total)')
monitoring_df.loc[monitoring_df['label'] == resp_rate_labels[2]]

In [None]:
# Rows whose label is one of the respiratory rate labels
resp_rate_df = monitoring_df.loc[monitoring_df['label'].isin(resp_rate_labels)]


In [None]:
# Distinct units recorded for respiratory rate rows
resp_rate_df['valueuom'].unique()

In [None]:
# Summary statistics of the numeric values for respiratory rate rows
resp_rate_df['valuenum'].describe()

### temperature

In [None]:
# Labels under which temperature is looked up (Fahrenheit and Celsius variants)
temperature_labels = [
    'Temperature F',
    'Temperature C (calc)',
    'Temperature Fahrenheit',
    'Temperature C',
    'Temperature F (calc)',
    'Temperature Celsius',
]

In [None]:
# Rows whose label is one of the temperature labels
temperature_df = monitoring_df.loc[monitoring_df['label'].isin(temperature_labels)]

In [None]:
# Distinct units recorded under each temperature label
temperature_df.groupby('label')['valueuom'].unique()

In [None]:
# Distinct numeric values among temperature rows that have no unit recorded
temperature_df.loc[temperature_df['valueuom'].isna(), 'valuenum'].unique()

In [None]:
# Unit strings treated as Fahrenheit / Celsius in the valueuom column
fahrenheit_equivalents = ['Deg. F', '?F']
celsius_equivalents = ['Deg. C', '?C']

# Work on an independent frame: temperature_df is a filtered slice of
# monitoring_df and assigning into it would trigger SettingWithCopyWarning.
temperature_df = temperature_df.copy()

# Convert Fahrenheit readings to Celsius. The arithmetic is applied to the
# 'valuenum' column only; subtracting 32 from the whole filtered DataFrame
# (as before) also hits the non-numeric columns and fails.
is_fahrenheit = temperature_df.valueuom.isin(fahrenheit_equivalents)
temperature_df.loc[is_fahrenheit, 'valuenum'] = (temperature_df.loc[is_fahrenheit, 'valuenum'] - 32) * (5 / 9)

# After conversion, label every recognised unit with the first Celsius spelling.
# Rows with other or missing units are left untouched.
temperature_df.loc[temperature_df.valueuom.isin(fahrenheit_equivalents + celsius_equivalents), 'valueuom'] = celsius_equivalents[0]

# if len(temperature_df['temp_unit'].unique()) > 1:
#     raise ValueError('Temperature units not unified:', temperature_df['temp_unit'].unique())


In [None]:
# dtype of the numeric value column for temperature rows
temperature_df['valuenum'].dtype

### weight

itemids: 762, 763, 3723, 3580, 3581, 3582

In [None]:
# Labels for weights recorded at / before admission
admission_weight_labels = [
    'Admit Wt',
    'Admission Weight (lbs.)',
    'Admission Weight (Kg)',
    'Previous WeightF',
    'Previous Weight',
]
# Labels for weights recorded during the stay
monitoring_weight_labels = [
    'Daily Weight',
]

In [None]:
# Distinct units recorded across all weight labels
monitoring_df.loc[
    monitoring_df['label'].isin(admission_weight_labels + monitoring_weight_labels),
    'valueuom',
].unique()

In [None]:
# Rows whose label is any admission or monitoring weight label
weight_df = monitoring_df.loc[
    monitoring_df['label'].isin(admission_weight_labels + monitoring_weight_labels)
]

In [None]:
# Fraction of selected admissions that have at least one weight-labelled row
len(
    set(monitoring_df.loc[
        monitoring_df['label'].isin(admission_weight_labels + monitoring_weight_labels),
        'hadm_id',
    ]).intersection(set(selection_df['hadm_id']))
) / len(set(selection_df['hadm_id']))

### height

itemids: 920, 1394, 4187, 3486, 3485, 4188

In [None]:
# Labels under which height is looked up
height_labels = [
    'Height (cm)',
    'Height',
    'Admit Ht',
]
# Item ids listed for height (not referenced by the cells visible in this section)
height_itemids = [
    920,
    1394,
    4187,
    3486,
    3485,
    4188,
]

In [None]:
# Rows whose label is one of the height labels
height_df = monitoring_df.loc[monitoring_df['label'].isin(height_labels)]

In [None]:
# Element-wise check of the height rows against the inch unit strings.
# NOTE(review): this tests every column and displays a full boolean frame;
# presumably only the unit column was meant to be checked — confirm.
monitoring_df.loc[monitoring_df['label'].isin(height_labels)].isin(['inches', 'Inch'])

In [None]:
# Fraction of selected admissions that have at least one height-labelled row
len(
    set(monitoring_df.loc[monitoring_df['label'].isin(height_labels), 'hadm_id'])
    .intersection(set(selection_df['hadm_id']))
) / len(set(selection_df['hadm_id']))

### glycemia

In [None]:
# Labels under which glucose measurements are looked up
glucose_labels = [
    'Fingerstick Glucose',
    'Glucose finger stick',
    'Glucose',
    'Glucose (serum)',
    'Glucose (whole blood)',
    'Glucose (70-105)',
]

In [None]:
# dtype of the numeric value column for glucose rows
monitoring_df.loc[monitoring_df['label'].isin(glucose_labels), 'valuenum'].dtype

In [None]:
# Per glucose label: units in use and summary statistics of the numeric values
for label in glucose_labels:
    rows_for_label = monitoring_df[monitoring_df['label'].isin([label])]
    print(label)
    print(rows_for_label['valueuom'].unique())
    print(rows_for_label['valuenum'].describe())

In [None]:
# Preview the first five rows of the raw monitoring data
monitoring_df.head(n=5)

# Verifying preprocessing

In [None]:
# Location of the preprocessed monitoring data (CSV)
preprocessed_data_path = (
    '/Users/jk1/temp/mimic/preprocessing/'
    'preprocessed_monitoring_df.csv'
)

In [None]:
# Read the preprocessed monitoring CSV into a DataFrame
preprocessed_dd = pd.read_csv(filepath_or_buffer=preprocessed_data_path)

In [None]:
# Preview the first five rows of the preprocessed data
preprocessed_dd.head(n=5)

In [None]:
# Distinct labels present after preprocessing
preprocessed_dd['label'].unique()

In [None]:
# Count of missing (True) versus present (False) numeric values after preprocessing
preprocessed_dd['valuenum'].isna().value_counts()