# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import time
from sklearn.preprocessing import scale
from scipy import stats

%matplotlib inline

In [2]:
# Read the input CSV (path is relative to the notebook's working directory).
csv_path = 'data(with_label).csv'
data = pd.read_csv(csv_path)

In [3]:
# Show only the first row to inspect the column layout.
data.head(n=1)

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,dod,first_careunit,last_careunit,age,gender,...,lods,sirs,admission_type,admission_location,discharge_location,ethnicity,diagnosis,time_before_death,thirty_days,one_year
0,3,145834,2101-10-20 19:08:00,2101-10-31 13:58:00,2102-06-14 00:00:00,2102-06-14 00:00:00,MICU,MICU,76.526788,M,...,10,3,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,WHITE,HYPOTENSION,236 days 04:52:00.000000000,0,1


### Remove Unwanted Columns

In [4]:
# Columns removed before any further processing.
str_cols = ['subject_id', 'hadm_id', 'deathtime', 'dod', 'time_before_death']
data = data.drop(columns=str_cols)

### Marital Status NaN to UNKNOWN (DEFAULT)

In [5]:
# Missing marital status is replaced by the 'UNKNOWN (DEFAULT)' category.
data['marital_status'] = data['marital_status'].fillna('UNKNOWN (DEFAULT)')

### Diagnosis NaN to PNEUMONIA

In [6]:
# Missing diagnosis is replaced by 'PNEUMONIA'.
data['diagnosis'] = data['diagnosis'].fillna('PNEUMONIA')

### Convert Categorical Data to Number

In [7]:
# List the 'first_careunit' categories from least to most frequent, numbered
# from 1 — the same numbering scheme categ_to_num assigns below.
for n, c in enumerate(data['first_careunit'].value_counts().index.values[::-1], start=1):
    print(n, c)

1 TSICU
2 CCU
3 SICU
4 CSRU
5 MICU


In [8]:
def categ_to_num(df, cols):
    """Replace categorical values with integer codes ranked by frequency.

    For every column in ``cols`` the distinct values are ordered from least
    to most frequent (the reverse of ``value_counts()``) and numbered from 1,
    so the most common category receives the largest code.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.
    cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns replaced by codes.
        Missing values are not counted by ``value_counts()`` and therefore
        stay missing after the mapping.
    """
    for col in cols:
        # Frequencies come from the frame that was passed in, not from a
        # notebook-level global, so the function works on any DataFrame.
        least_to_most_frequent = df[col].value_counts().index[::-1]
        codes = {
            category: rank
            for rank, category in enumerate(least_to_most_frequent, start=1)
        }
        df[col] = df[col].map(codes)
    return df

In [9]:
# Columns holding category labels; each one is turned into frequency-ranked
# integer codes by categ_to_num.
categorical = [
    'first_careunit', 'last_careunit',
    'gender', 'marital_status', 'insurance',
    'icustay_age_group',
    'admission_type', 'admission_location', 'discharge_location',
    'ethnicity', 'diagnosis',
]
data = categ_to_num(df=data, cols=categorical)

### Convert String Time Data to Timestamp

In [10]:
def str_to_timestamp(row, cols):
    """Convert 'YYYY-MM-DD HH:MM:SS' strings in ``row`` to epoch seconds.

    Each entry of ``row`` named in ``cols`` is parsed and replaced, in place,
    by the float returned from ``time.mktime`` (which interprets the parsed
    time as local time). Missing entries are left untouched.

    Returns the same ``row`` object so the function can be used with
    ``DataFrame.apply(..., axis=1)``.
    """
    time_format = "%Y-%m-%d %H:%M:%S"
    for name in cols:
        raw_value = row[name]
        if pd.isna(raw_value):
            # Nothing to parse; keep the missing marker as it is.
            continue
        parsed = datetime.strptime(raw_value, time_format)
        row[name] = time.mktime(parsed.timetuple())
    return row

In [11]:
str_time = ['admittime', 'dischtime']

# Row-wise conversion: every row is handed to str_to_timestamp together with
# the list of time columns to convert.
data = data.apply(str_to_timestamp, axis=1, args=(str_time,))

### Fill Missing Values

#### Fill with Group Means (Grouped by `thirty_days`)

In [12]:
def fill_missing_by_group(df, cols, group_col):
    """Impute missing values with per-group means and flag what was imputed.

    For every column in ``cols`` a companion indicator column named
    ``<col>_mv`` is added (1 where the value was missing, 0 otherwise), and
    each missing value is replaced by the mean of that column over the rows
    sharing the same ``group_col`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.
    cols : iterable of str
        Columns whose missing values are filled.
    group_col : str
        Column whose distinct values define the groups.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with filled values and added ``_mv`` columns.

    Notes
    -----
    Rows whose ``group_col`` value is itself missing match no group and keep
    their missing values. A group with no observed value for a column has a
    NaN mean, so its gaps in that column remain NaN.
    """
    cols = list(cols)

    # One boolean row mask per distinct group value.
    group_masks = [df[group_col] == value for value in df[group_col].unique()]

    # Means are computed up front (before any value is filled) and only for
    # the requested columns, so unrelated non-numeric columns elsewhere in
    # the frame can neither break the computation nor cost extra work.
    group_means = [df.loc[mask, cols].mean() for mask in group_masks]

    for col in cols:
        missing = pd.isna(df[col])
        df[col + '_mv'] = missing * 1  # 0/1 "was missing" indicator

        for mask, means in zip(group_masks, group_means):
            df.loc[missing & mask, col] = means[col]
    return df

In [13]:
# Columns to impute. Each gets a '<name>_mv' indicator column, and its gaps
# are filled with the mean of the rows sharing the same 'thirty_days' value.
cols = [
    'age',
    'urea_n_min', 'urea_n_max', 'urea_n_mean',
    'platelets_min', 'platelets_max', 'platelets_mean',
    'magnesium_max', 'albumin_min', 'calcium_min',
    'resprate_min', 'resprate_max', 'resprate_mean',
    'glucose_min', 'glucose_max', 'glucose_mean',
    'hr_min', 'hr_max', 'hr_mean',
    'sysbp_min', 'sysbp_max', 'sysbp_mean',
    'diasbp_min', 'diasbp_max', 'diasbp_mean',
    'temp_min', 'temp_max', 'temp_mean',
    'sapsii', 'sofa',
    'urine_min', 'urine_mean', 'urine_max',
    'elixhauser_vanwalraven', 'elixhauser_sid29', 'elixhauser_sid30',
    'los_hospital',
    'meanbp_min', 'meanbp_max', 'meanbp_mean',
    'spo2_min', 'spo2_max', 'spo2_mean',
    'vent', 'rrt', 'urineoutput',
    'oasis', 'lods', 'sirs',
]
data = fill_missing_by_group(data, cols=cols, group_col='thirty_days')

In [14]:
# Show the first five rows after imputation.
data.head(n=5)

Unnamed: 0,admittime,dischtime,first_careunit,last_careunit,age,gender,marital_status,insurance,urea_n_min,urea_n_max,...,meanbp_mean_mv,spo2_min_mv,spo2_max_mv,spo2_mean_mv,vent_mv,rrt_mv,urineoutput_mv,oasis_mv,lods_mv,sirs_mv
0,4159293000.0,4160225000.0,5,5,76.526788,2,7,5,15.0,53.0,...,0,0,0,0,0,0,0,0,0,0
1,6482114000.0,6483528000.0,3,4,65.94067,1,7,5,16.0,91.0,...,0,0,0,0,0,0,0,0,0,0
2,5675768000.0,5676189000.0,5,5,41.790226,2,3,3,16.0,33.0,...,0,0,0,0,0,0,0,0,0,0
3,4247562000.0,4248659000.0,3,4,72.372364,2,7,5,28.0,41.0,...,0,0,0,0,0,0,0,0,0,0
4,6217429000.0,6218022000.0,2,3,39.866116,1,3,3,13.0,22.0,...,0,0,0,0,0,0,0,0,0,0


### Remove Unwanted Columns

In [15]:
# The admission and discharge time columns are removed from the frame here.
str_cols = ['admittime', 'dischtime']
data = data.drop(columns=str_cols)

### Convert types

In [16]:
# Cast every remaining column to 64-bit floats.
data = data.astype('float64')

### Scale Values

In [17]:
# Feature columns to standardise with sklearn's `scale`.
#
# The outcome columns 'thirty_days' and 'one_year' are deliberately NOT in
# this list: they are 0/1 flags in the loaded data, and standardising them
# would replace the labels with arbitrary floats that no longer compare
# equal to 0 or 1.
cols = [
    'first_careunit', 'last_careunit',
    'age', 'gender', 'marital_status', 'insurance',
    'urea_n_min', 'urea_n_max', 'urea_n_mean',
    'platelets_min', 'platelets_max', 'platelets_mean',
    'magnesium_max', 'albumin_min', 'calcium_min',
    'resprate_min', 'resprate_max', 'resprate_mean',
    'glucose_min', 'glucose_max', 'glucose_mean',
    'hr_min', 'hr_max', 'hr_mean',
    'sysbp_min', 'sysbp_max', 'sysbp_mean',
    'diasbp_min', 'diasbp_max', 'diasbp_mean',
    'temp_min', 'temp_max', 'temp_mean',
    'sapsii', 'sofa',
    'urine_min', 'urine_mean', 'urine_max',
    'elixhauser_vanwalraven', 'elixhauser_sid29', 'elixhauser_sid30',
    'los_hospital',
    'meanbp_min', 'meanbp_max', 'meanbp_mean',
    'spo2_min', 'spo2_max', 'spo2_mean',
    'vent', 'rrt', 'urineoutput',
    'icustay_age_group',
    'oasis', 'lods', 'sirs',
    'admission_type', 'admission_location', 'discharge_location',
    'ethnicity', 'diagnosis',
]
for col in cols:
    data[col] = scale(data[col])

### Remove Outliers

In [18]:
def remove_outliers(df):
    """Return the index labels of rows that are outliers in any column.

    A value is treated as an outlier when it lies outside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` for its column, where Q1 and Q3 are
    the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Despite its name, the function does not remove anything: it only
    reports the labels, and the caller decides what to drop.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame; every column is checked.

    Returns
    -------
    list
        Unique index labels of the flagged rows, in ascending order.

    Notes
    -----
    ``np.percentile`` returns NaN for a column that contains NaN, and every
    comparison against NaN is False, so such a column flags all of its rows.
    """
    flagged = set()
    for feature in df.keys():
        values = df[feature]
        q1, q3 = np.percentile(values, [25, 75])

        # Outlier step: 1.5 times the interquartile range.
        step = 1.5 * (q3 - q1)

        within_range = (values >= q1 - step) & (values <= q3 + step)
        flagged.update(df.index[~within_range])

    # Sort after de-duplicating: a set has no defined iteration order, so
    # sorting first and then converting through a set would lose the order.
    return sorted(flagged)

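We need to choose which features to apply `remove_outliers` to.
'spo2_max' has almost every value near 100, so its quartiles collapse: the cells below show that the 25th and 50th percentiles are identical. If the 75th percentile coincides as well, the interquartile range is zero and the 1.5×IQR rule would flag every row whose value differs from it.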

In [19]:
# 25th percentile of 'spo2_max' (values are already standardised here).
Q1 = np.percentile(a=data['spo2_max'], q=25)
print(Q1)

0.30086056591981153


In [20]:
# 50th percentile (median) of 'spo2_max', for comparison with Q1.
Q2 = np.percentile(a=data['spo2_max'], q=50)
print(Q2)

0.30086056591981153


### Rearrange Columns

In [21]:
# Final column order: the feature columns first, then one '<name>_mv'
# missing-value indicator per imputed column, then the two outcome columns.
cols_ordered = (
    [
        'first_careunit', 'last_careunit',
        'age', 'gender', 'marital_status', 'insurance',
        'urea_n_min', 'urea_n_max', 'urea_n_mean',
        'platelets_min', 'platelets_max', 'platelets_mean',
        'magnesium_max', 'albumin_min', 'calcium_min',
        'resprate_min', 'resprate_max', 'resprate_mean',
        'glucose_min', 'glucose_max', 'glucose_mean',
        'hr_min', 'hr_max', 'hr_mean',
        'sysbp_min', 'sysbp_max', 'sysbp_mean',
        'diasbp_min', 'diasbp_max', 'diasbp_mean',
        'temp_min', 'temp_max', 'temp_mean',
        'sapsii', 'sofa',
        'urine_min', 'urine_mean', 'urine_max',
        'elixhauser_vanwalraven', 'elixhauser_sid29', 'elixhauser_sid30',
        'los_hospital',
        'meanbp_min', 'meanbp_max', 'meanbp_mean',
        'spo2_min', 'spo2_max', 'spo2_mean',
        'vent', 'rrt', 'urineoutput',
        'icustay_age_group',
        'oasis', 'lods', 'sirs',
        'admission_type', 'admission_location', 'discharge_location',
        'ethnicity', 'diagnosis',
    ]
    + [
        stem + '_mv'
        for stem in [
            'age',
            'urea_n_min', 'urea_n_max', 'urea_n_mean',
            'platelets_min', 'platelets_max', 'platelets_mean',
            'magnesium_max', 'albumin_min', 'calcium_min',
            'resprate_min', 'resprate_max', 'resprate_mean',
            'glucose_min', 'glucose_max', 'glucose_mean',
            'hr_min', 'hr_max', 'hr_mean',
            'sysbp_min', 'sysbp_max', 'sysbp_mean',
            'diasbp_min', 'diasbp_max', 'diasbp_mean',
            'temp_min', 'temp_max', 'temp_mean',
            'sapsii', 'sofa',
            'urine_min', 'urine_mean', 'urine_max',
            'elixhauser_vanwalraven', 'elixhauser_sid29', 'elixhauser_sid30',
            'los_hospital',
            'meanbp_min', 'meanbp_max', 'meanbp_mean',
            'spo2_min', 'spo2_max', 'spo2_mean',
            'vent', 'rrt', 'urineoutput',
            'oasis', 'lods', 'sirs',
        ]
    ]
    + ['thirty_days', 'one_year']
)

In [22]:
# Keep only the columns listed in cols_ordered, in that order.
data = data.loc[:, cols_ordered]