In [8]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.cluster import KMeans

from sklearn.preprocessing import OneHotEncoder

# Fixed random seed for reproducibility.
# NOTE(review): not referenced by any cell visible in this section — confirm it is used downstream.
SEED = 90089

In [20]:
# Load the hyperglycemic-patient table from CSV (path is relative to this notebook)
patient_df = pd.read_csv('../data/hyperglycemic_patients.csv')

# Report the frame's dimensions and column names, then preview the first rows
for caption, value in (
    ("Dataframe shape:   ", patient_df.shape),
    ("Dataframe Features:", patient_df.columns),
):
    print(caption, value)
patient_df.head()

Dataframe shape:    (13424, 17)
Dataframe Features: Index(['glucose_max', 'anchor_age', 'dod', 'gender', 'dbp_mean', 'sbp_mean',
       'glucose_mean', 'heart_rate_mean', 'spo2_mean', 'resp_rate_mean',
       'temperature_mean', 'apsiii', 'glucose_score', 'los', 'avg_bmi_value',
       'avg_sofa', 'admission_type'],
      dtype='object')


Unnamed: 0,glucose_max,anchor_age,dod,gender,dbp_mean,sbp_mean,glucose_mean,heart_rate_mean,spo2_mean,resp_rate_mean,temperature_mean,apsiii,glucose_score,los,avg_bmi_value,avg_sofa,admission_type
0,277.0,36,0,0,64.32,112.88,221.272727,101.8,92.0,17.346154,37.18625,38,3,1.938414,26.75,3.0,OBSERVATION ADMIT
1,2340.0,35,0,0,77.769231,122.961538,468.592593,105.361111,96.093023,19.813953,36.727586,87,5,4.472905,30.558477,7.0,EW EMER.
2,259.0,19,1,1,49.3,74.2,259.0,89.75,70.0,26.333333,31.9,88,3,0.309664,30.558477,10.0,EW EMER.
3,406.0,24,0,0,57.333333,92.848485,292.0,86.631579,96.162162,15.289474,33.59,150,5,17.513646,30.558477,13.0,EW EMER.
4,398.0,31,0,0,78.6,133.8,285.0,100.826087,90.304348,25.173913,36.593333,34,5,2.794167,30.558477,1.0,EW EMER.


### Assign labels to `los` based on the `> 3 days` and `> 7 days` thresholds used in previous research:

For reference, see [Wang, S., McDermott, M.B., Chauhan, G., Ghassemi, M., Hughes, M.C. and Naumann, T., 2020, April. Mimic-extract: A data extraction, preprocessing, and representation pipeline for mimic-iii. In Proceedings of the ACM conference on health, inference, and learning (pp. 222-235).](https://arxiv.org/pdf/1907.08322v2.pdf)

The `los` data is strongly right-skewed, as shown in `exploratory_data_analysis.ipynb` under `visualizations`.

In [21]:
# Bin length of stay (`los`) into three ordinal classes on a copy of the raw frame:
#   2 -> los >= 7
#   1 -> 3 < los < 7
#   0 -> everything else (los <= 3, and rows where `los` is missing)
# NOTE(review): the markdown heading cites `> 7 days`, but this cell has always
# used `>= 7`; the comparison is kept as-is so existing labels do not change.
labelled_patient_df = patient_df.copy()

los = labelled_patient_df['los']

# np.select picks the first matching condition, mirroring the original
# if / elif / else order. The result is assigned as an array (positionally), so
# the labels cannot be misaligned or turned into NaN by a non-default index,
# unlike assigning a freshly built pd.Series with its own RangeIndex.
labelled_patient_df['label'] = np.select(
    [los >= 7, los > 3],
    [2, 1],
    default=0,
).astype('int64')

# Kept for backward compatibility with any later cell that reads the plain list.
label_lst = labelled_patient_df['label'].tolist()

In [22]:
# Class balance check: number of patients per length-of-stay label
dict(labelled_patient_df['label'].value_counts())

{0: 8365, 1: 3211, 2: 1848}

### One-hot encode the `admission_type` column

In [23]:
# One-hot encode `admission_type`: one 0/1 indicator column per admission
# category is appended to the right of the existing columns. The original
# `admission_type` column is kept (converted to a categorical).
labelled_patient_df['admission_type'] = labelled_patient_df['admission_type'].astype('category')

# The dtype is pinned because pandas >= 2.0 makes get_dummies return bool by
# default, which would turn the 0/1 indicators (as displayed below and written
# to CSV) into True/False. uint8 is what older pandas produced by default.
admission_dummies = pd.get_dummies(labelled_patient_df['admission_type'], dtype='uint8')

# Drop indicator columns left over from a previous run of this cell, so that
# re-running it does not append a second, duplicate set of columns.
labelled_patient_df = pd.concat(
    [
        labelled_patient_df.drop(columns=list(admission_dummies.columns), errors='ignore'),
        admission_dummies,
    ],
    axis=1,
)
labelled_patient_df.head()

Unnamed: 0,glucose_max,anchor_age,dod,gender,dbp_mean,sbp_mean,glucose_mean,heart_rate_mean,spo2_mean,resp_rate_mean,...,label,AMBULATORY OBSERVATION,DIRECT EMER.,DIRECT OBSERVATION,ELECTIVE,EU OBSERVATION,EW EMER.,OBSERVATION ADMIT,SURGICAL SAME DAY ADMISSION,URGENT
0,277.0,36,0,0,64.32,112.88,221.272727,101.8,92.0,17.346154,...,0,0,0,0,0,0,0,1,0,0
1,2340.0,35,0,0,77.769231,122.961538,468.592593,105.361111,96.093023,19.813953,...,1,0,0,0,0,0,1,0,0,0
2,259.0,19,1,1,49.3,74.2,259.0,89.75,70.0,26.333333,...,0,0,0,0,0,0,1,0,0,0
3,406.0,24,0,0,57.333333,92.848485,292.0,86.631579,96.162162,15.289474,...,2,0,0,0,0,0,1,0,0,0
4,398.0,31,0,0,78.6,133.8,285.0,100.826087,90.304348,25.173913,...,0,0,0,0,0,0,1,0,0,0


In [24]:
# Persist the labelled, one-hot-encoded frame for downstream notebooks.
# index=True is the pandas default, spelled out here: the row index is written
# as an unnamed first column, so readers should load with index_col=0 (or drop it).
labelled_patient_df.to_csv(
    '../data/hyperglycemic_patients_w_bin_categories.csv',
    index=True,
)

In [78]:
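# Data split (disabled draft)
# NOTE(review): `label_df` and `train_test_split` are not defined or imported in
# the cells above — rename/import them before re-enabling this cell.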
# X = label_df.loc[:, label_df.columns != 'label']
# y = label_df.loc[:, label_df.columns == 'label']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=89)
# X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=89)