# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import time
from sklearn.preprocessing import scale
from scipy import stats
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# Load the source table; the path is relative to the notebook's working directory.
# The file name suggests the label columns are already included — TODO confirm.
data = pd.read_csv('data(with_label).csv')

### Remove Unwanted Columns

* **subject_id**: Categorical data with one category for each patient. Unusable.
* **hadm_id**: No information in the code assigned to each admission.
* **discharge_location**: One of the possible values for this feature shows that the patient died. This means that this column is unsuitable to predict patient death.
* **dod**: Date of death. Unsuitable for predicting patient death, since it reveals the outcome.
* **deathtime**: Unsuitable to predict patient death.
* **time_before_death**: Unsuitable to predict patient death.
* **los_hospital**: Aka. Length of stay in hospital. Can't be used to predict patient death.
* **admittime**: Admission timestamp. Not used as a feature.
* **dischtime**: Discharge timestamp. Not used as a feature.


In [3]:
# Columns excluded from the feature set: timestamps, identifiers, and fields
# that reveal (or depend on) the patient's outcome.
str_cols = [
    'admittime', 'dischtime',
    'subject_id', 'hadm_id',
    'deathtime', 'dod', 'time_before_death',
    'discharge_location', 'los_hospital',
]
data = data.drop(columns=str_cols)

### Marital Status NaN to UNKNOWN (DEFAULT)

In [4]:
# Rows with no recorded marital status get the explicit 'UNKNOWN (DEFAULT)' label.
data['marital_status'] = data['marital_status'].fillna('UNKNOWN (DEFAULT)')

### Diagnosis NaN to PNEUMONIA

In [5]:
# Rows with no recorded diagnosis are assigned 'PNEUMONIA'.
# NOTE(review): presumably chosen because it is the most common diagnosis — confirm.
data['diagnosis'] = data['diagnosis'].fillna('PNEUMONIA')

### Correct Age

Patients older than 89 appear in the data with an age above 300. These values are not outliers: they are placeholders, so we replace them with a single representative age (95).

In [6]:
# Ages of 100 or more are placeholder values for patients older than 89; map
# them to 95. `mask` only touches rows where the condition is True, so missing
# ages stay NaN (a plain `x if x < 100 else 95` turns NaN into 95, because
# `NaN < 100` is False) and can still be imputed and flagged later on.
data['age'] = data['age'].mask(data['age'] >= 100, 95)

### Convert Categorical Data to Number

Automatically convert every categorical column to numeric codes. Each category in a column is assigned an integer, with the highest integer corresponding to the most frequently occurring category.

In [7]:
# Preview the codes for one column: categories are walked from least to most
# frequent, so the most common care unit receives the largest number.
for code, category in enumerate(data['first_careunit'].value_counts().index.values[::-1], start=1):
    print(code, category)

1 TSICU
2 CCU
3 SICU
4 CSRU
5 MICU


In [8]:
def categ_to_num(df, cols):
    """Replace each categorical column in ``cols`` with frequency-ranked integer codes.

    For every column the categories are ranked by how often they occur in
    ``df``: the rarest category becomes 1 and the most frequent category gets
    the largest code. Missing values are not counted by ``value_counts`` and
    therefore remain NaN after mapping.

    Args:
        df: DataFrame to encode. It is modified in place.
        cols: Iterable of column names in ``df`` to encode.

    Returns:
        The same ``df`` object, with the listed columns replaced by their codes.
    """
    for col in cols:
        # Frequencies are taken from the frame being encoded (``df``) rather
        # than from a notebook-level global, so any DataFrame can be passed in.
        rarest_first = df[col].value_counts().index[::-1]
        codes = {category: rank for rank, category in enumerate(rarest_first, start=1)}
        df[col] = df[col].map(codes)
    return df

In [9]:
# Columns to encode as frequency-ranked integers.
categorical = [
    'first_careunit', 'last_careunit',
    'gender', 'marital_status', 'insurance',
    'icustay_age_group',
    'admission_type', 'admission_location',
    'ethnicity', 'diagnosis',
]
data = categ_to_num(data, categorical)

### Split before filling and scaling

Split the data into train and test data

In [10]:
columns = data.columns.tolist()
# X keeps the label columns next to the features; later cells group rows by them.
X = data  # features + expected outputs
# Assumes the two label columns (30 days, 1 year) are the last two columns.
y = data[columns[-2:]]

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

print('X_train shape:', X_train.shape)
print(len(X_train), 'train samples')
print(len(X_test), 'test samples')

X_train shape: (38857, 60)
38857 train samples
19139 test samples


In [11]:
# One independent working copy per (split, prediction target) pair, so that the
# 30-day and 1-year pipelines can be imputed separately.
data_train_30d, data_train_1y = X_train.copy(), X_train.copy()
data_test_30d, data_test_1y = X_test.copy(), X_test.copy()

In [12]:
# Renumber the rows 0..n-1 after the shuffle. reset_index() keeps the previous
# row labels as a new 'index' column in each frame.
data_train_30d, data_test_30d, data_train_1y, data_test_1y = (
    frame.reset_index()
    for frame in (data_train_30d, data_test_30d, data_train_1y, data_test_1y)
)

### Fill Missing Values

Fill each missing value with the mean of that feature within the group the patient belongs to.
For example, if a patient did not die within thirty days, the missing value is replaced with the average over all patients who did not die within thirty days.
A `<feature>_mv` column is also appended for each feature to indicate which values were missing.

In [13]:
def fill_missing_by_group(df, cols, group_col):
    """Impute missing values in ``cols`` with the mean of the row's group.

    Rows are grouped by their value in ``group_col``. For every column in
    ``cols`` a ``<col>_mv`` indicator column (1 = value was missing, 0 = value
    was present) is added, and each missing value is replaced by the mean of
    that column over the rows of the same group. Group means are computed
    once, before anything is filled, and only over ``cols`` — other columns of
    ``df`` (including non-numeric ones) are never averaged.

    Note: the filled values depend on ``group_col``, so if that column is the
    prediction target the imputed features carry information about the label.

    Args:
        df: DataFrame to impute. It is modified in place.
        cols: Names of the numeric columns to impute.
        group_col: Name of the column whose values define the groups.

    Returns:
        The same ``df`` object with the gaps filled and the ``_mv`` columns added.
    """
    cols = list(cols)
    group_values = df[group_col]

    # One boolean row mask and one Series of per-column means for each group.
    group_masks = [group_values == value for value in group_values.unique()]
    group_means = [df.loc[mask, cols].mean() for mask in group_masks]

    for col in cols:
        was_missing = df[col].isna()
        df[col + '_mv'] = was_missing * 1

        for mask, means in zip(group_masks, group_means):
            df.loc[was_missing & mask, col] = means[col]
    return df

In [14]:
# Features that are imputed by group mean and get a `<name>_mv` indicator column.
cols = [
    'age',
    'urea_n_min', 'urea_n_max', 'urea_n_mean',
    'platelets_min', 'platelets_max', 'platelets_mean',
    'magnesium_max', 'albumin_min', 'calcium_min',
    'resprate_min', 'resprate_max', 'resprate_mean',
    'glucose_min', 'glucose_max', 'glucose_mean',
    'hr_min', 'hr_max', 'hr_mean',
    'sysbp_min', 'sysbp_max', 'sysbp_mean',
    'diasbp_min', 'diasbp_max', 'diasbp_mean',
    'temp_min', 'temp_max', 'temp_mean',
    'sapsii', 'sofa',
    'urine_min', 'urine_mean', 'urine_max',
    'elixhauser_vanwalraven', 'elixhauser_sid29', 'elixhauser_sid30',
    'meanbp_min', 'meanbp_max', 'meanbp_mean',
    'spo2_min', 'spo2_max', 'spo2_mean',
    'vent', 'rrt', 'urineoutput',
    'oasis', 'lods', 'sirs',
]

In [15]:
# Impute the 30-day training split, grouping rows by the `thirty_days` label.
data_train_30d = fill_missing_by_group(data_train_30d, cols, 'thirty_days')

In [16]:
# NOTE(review): the test split is imputed from its own rows, grouped by its own
# `thirty_days` label — confirm that conditioning on the label here is intended.
data_test_30d = fill_missing_by_group(data_test_30d, cols, 'thirty_days')

In [17]:
# Impute the 1-year training split, grouping rows by the `one_year` label.
data_train_1y = fill_missing_by_group(data_train_1y, cols, 'one_year')

In [18]:
# NOTE(review): the test split is imputed from its own rows, grouped by its own
# `one_year` label — confirm that conditioning on the label here is intended.
data_test_1y = fill_missing_by_group(data_test_1y, cols, 'one_year')

### Convert Types

In [20]:
# Cast every column of every split to float64 so all frames share one numeric dtype.
data_train_30d, data_test_30d, data_train_1y, data_test_1y = (
    frame.astype(np.float64)
    for frame in (data_train_30d, data_test_30d, data_train_1y, data_test_1y)
)

### Scale Values

In [21]:
# Feature columns to standardize (the `_mv` flags and the labels are not listed).
cols = [
    'first_careunit', 'last_careunit',
    'age', 'gender', 'marital_status', 'insurance',
    'urea_n_min', 'urea_n_max', 'urea_n_mean',
    'platelets_min', 'platelets_max', 'platelets_mean',
    'magnesium_max', 'albumin_min', 'calcium_min',
    'resprate_min', 'resprate_max', 'resprate_mean',
    'glucose_min', 'glucose_max', 'glucose_mean',
    'hr_min', 'hr_max', 'hr_mean',
    'sysbp_min', 'sysbp_max', 'sysbp_mean',
    'diasbp_min', 'diasbp_max', 'diasbp_mean',
    'temp_min', 'temp_max', 'temp_mean',
    'sapsii', 'sofa',
    'urine_min', 'urine_mean', 'urine_max',
    'elixhauser_vanwalraven', 'elixhauser_sid29', 'elixhauser_sid30',
    'meanbp_min', 'meanbp_max', 'meanbp_mean',
    'spo2_min', 'spo2_max', 'spo2_mean',
    'vent', 'rrt', 'urineoutput',
    'icustay_age_group',
    'oasis', 'lods', 'sirs',
    'admission_type', 'admission_location', 'ethnicity', 'diagnosis',
]
# Standardize every listed feature to zero mean / unit variance. The mean and
# the population standard deviation (ddof=0, the convention sklearn's `scale`
# uses) are learned from the training split only and then applied to the
# matching test split, so both splits share one scale and no test-set
# statistics enter the transformation.
for train_df, test_df in (
    (data_train_30d, data_test_30d),
    (data_train_1y, data_test_1y),
):
    train_mean = train_df[cols].mean()
    # A constant column has zero spread; divide by 1 so it becomes 0 instead of NaN/inf.
    train_std = train_df[cols].std(ddof=0).replace(0, 1.0)
    train_df[cols] = (train_df[cols] - train_mean) / train_std
    test_df[cols] = (test_df[cols] - train_mean) / train_std

### Remove Outliers

The distributions of the features show that there is not a strong presence of outliers. Some of the features have values that can be mistakenly considered as outliers but are actually not. For example, age has values >300, but those cases correspond to patients with age > 89.

In [22]:
def remove_outliers(df, cols):
    """Return the sorted index labels of rows that are outliers in any of ``cols``.

    A value counts as an outlier when it lies more than 1.5 times the
    interquartile range below the 25th percentile or above the 75th percentile
    of its column. Despite the name, nothing is removed: ``df`` is left
    untouched and the caller decides what to do with the returned labels.

    Args:
        df: DataFrame to inspect.
        cols: Names of the numeric columns to check.

    Returns:
        A list of unique index labels in ascending order. A row that is an
        outlier in several columns appears once.
    """
    flagged = set()
    for col in cols:
        values = df[col]
        # NaN-aware quartiles: a plain percentile returns NaN as soon as the
        # column contains one NaN, which would silently disable the check.
        q1, q3 = np.nanpercentile(values, [25, 75])

        # Outlier step: 1.5 times the interquartile range.
        step = 1.5 * (q3 - q1)

        is_outlier = (values < q1 - step) | (values > q3 + step)
        flagged.update(values.index[is_outlier])
    # Sort after de-duplicating; a set does not preserve any ordering.
    return sorted(flagged)

### Rearrange Columns

In [23]:
# Final column layout: the 58 features, then one `<feature>_mv` missing-value
# flag for every feature that is not an encoded categorical (same relative
# order as the features), then the two label columns.
_feature_cols = [
    'first_careunit', 'last_careunit',
    'age', 'gender', 'marital_status', 'insurance',
    'urea_n_min', 'urea_n_max', 'urea_n_mean',
    'platelets_min', 'platelets_max', 'platelets_mean',
    'magnesium_max', 'albumin_min', 'calcium_min',
    'resprate_min', 'resprate_max', 'resprate_mean',
    'glucose_min', 'glucose_max', 'glucose_mean',
    'hr_min', 'hr_max', 'hr_mean',
    'sysbp_min', 'sysbp_max', 'sysbp_mean',
    'diasbp_min', 'diasbp_max', 'diasbp_mean',
    'temp_min', 'temp_max', 'temp_mean',
    'sapsii', 'sofa',
    'urine_min', 'urine_mean', 'urine_max',
    'elixhauser_vanwalraven', 'elixhauser_sid29', 'elixhauser_sid30',
    'meanbp_min', 'meanbp_max', 'meanbp_mean',
    'spo2_min', 'spo2_max', 'spo2_mean',
    'vent', 'rrt', 'urineoutput',
    'icustay_age_group',
    'oasis', 'lods', 'sirs',
    'admission_type', 'admission_location', 'ethnicity', 'diagnosis',
]
# Encoded categorical features have no `_mv` flag column.
_encoded_categoricals = {
    'first_careunit', 'last_careunit', 'gender', 'marital_status', 'insurance',
    'icustay_age_group', 'admission_type', 'admission_location', 'ethnicity',
    'diagnosis',
}
cols_ordered = (
    _feature_cols
    + [name + '_mv' for name in _feature_cols if name not in _encoded_categoricals]
    + ['thirty_days', 'one_year']
)

In [24]:
# Keep only the columns named in cols_ordered, in that order, for every split.
data_train_30d, data_test_30d, data_train_1y, data_test_1y = (
    frame[cols_ordered]
    for frame in (data_train_30d, data_test_30d, data_train_1y, data_test_1y)
)

In [25]:
# Write each prepared split to its own CSV file; the row index is not written.
for frame, path in (
    (data_train_30d, 'data_pp_train_30d.csv'),
    (data_test_30d, 'data_pp_test_30d.csv'),
    (data_train_1y, 'data_pp_train_1y.csv'),
    (data_test_1y, 'data_pp_test_1y.csv'),
):
    frame.to_csv(path, index=False)