# Data Preparation

In [2]:
import pandas as pd
import os
import numpy as np
import sys
sys.path.insert(0, os.path.abspath("../src"))
from data_preprocessing import (
    suggested_imputations,
    column_deletion,
    imputation,
    encode_categorical,
    high_corr_deletion
)
import time
import warnings
warnings.filterwarnings("ignore")
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from ucimlrepo import fetch_ucirepo

def save_object(obj, filename):
    """Pickle ``obj`` into the file at ``filename``.

    The file is opened in binary write mode (so an existing file is
    overwritten) and the object is serialized with
    ``pickle.HIGHEST_PROTOCOL``.

    Parameters
    ----------
    obj : any picklable object
        The object to serialize.
    filename : str or path-like
        Destination path handed to ``open``.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

## Data

In [None]:
# Fetch dataset id 880 from the UCI ML Repository.
support2 = fetch_ucirepo(id=880)

# All feature columns, plus only the `death` column from the targets.
X = support2.data.features
y = support2.data.targets[['death']]

# Put features and target side by side in one frame.
data = pd.concat([X, y], axis="columns")

print(data)

           age     sex            dzgroup             dzclass  num.co   edu  \
0     62.84998    male        Lung Cancer              Cancer       0  11.0   
1     60.33899  female          Cirrhosis  COPD/CHF/Cirrhosis       2  12.0   
2     52.74698  female          Cirrhosis  COPD/CHF/Cirrhosis       2  12.0   
3     42.38498  female        Lung Cancer              Cancer       2  11.0   
4     79.88495  female  ARF/MOSF w/Sepsis            ARF/MOSF       1   NaN   
...        ...     ...                ...                 ...     ...   ...   
9100  66.07300    male  ARF/MOSF w/Sepsis            ARF/MOSF       1   8.0   
9101  55.15399  female               Coma                Coma       1  11.0   
9102  70.38196    male  ARF/MOSF w/Sepsis            ARF/MOSF       1   NaN   
9103  47.01999    male       MOSF w/Malig            ARF/MOSF       1  13.0   
9104  81.53894  female  ARF/MOSF w/Sepsis            ARF/MOSF       1   8.0   

          income  scoma  charges      totcst  ...  

In [4]:
# List the column labels of the combined features + `death` frame.
data.columns

Index(['age', 'sex', 'dzgroup', 'dzclass', 'num.co', 'edu', 'income', 'scoma',
       'charges', 'totcst', 'totmcst', 'avtisst', 'race', 'sps', 'aps',
       'surv2m', 'surv6m', 'hday', 'diabetes', 'dementia', 'ca', 'prg2m',
       'prg6m', 'dnr', 'dnrday', 'meanbp', 'wblc', 'hrt', 'resp', 'temp',
       'pafi', 'alb', 'bili', 'crea', 'sod', 'ph', 'glucose', 'bun', 'urine',
       'adlp', 'adls', 'adlsc', 'death'],
      dtype='object')

# Data Preparation

In [6]:
# Percentage of missing values per column, highest first.
# The fraction comes from the mean of the boolean NaN mask, so the denominator
# is always the frame's current row count rather than a hard-coded 9105.
data.isna().mean().mul(100).sort_values(ascending=False)

adlp        61.954970
urine       53.399231
glucose     49.423394
bun         47.797913
totmcst     38.165843
alb         37.034596
income      32.751236
adls        31.488193
bili        28.566722
pafi        25.535420
ph          25.085118
prg2m       18.110928
edu         17.946183
prg6m       17.935200
totcst       9.752883
wblc         2.328391
charges      1.889072
avtisst      0.900604
crea         0.735859
race         0.461285
dnr          0.329489
dnrday       0.329489
scoma        0.010983
sod          0.010983
sps          0.010983
meanbp       0.010983
surv2m       0.010983
hrt          0.010983
resp         0.010983
temp         0.010983
aps          0.010983
surv6m       0.010983
adlsc        0.000000
age          0.000000
sex          0.000000
ca           0.000000
dementia     0.000000
diabetes     0.000000
hday         0.000000
num.co       0.000000
dzclass      0.000000
dzgroup      0.000000
death        0.000000
dtype: float64

## Dealing with missing values - imputing the values suggested by the author of the dataset for features with a high percentage of missing values

- Serum albumin (alb)	3.5
- PaO2/FiO2 ratio (pafi) 	333.3
- Bilirubin (bili)	1.01
- Creatinine (crea)	1.01
- Blood urea nitrogen (bun)	6.51
- White blood count (wblc)	9 (thousands)
- Urine output (urine)	2502

In [7]:
# Rebind `data` to the result of the project helper `suggested_imputations`
# (imported from data_preprocessing).
# NOTE(review): presumably it fills NaNs with the author-suggested constants
# listed in the markdown cell above — confirm against the helper.
data = suggested_imputations(data)

In [9]:
# Percentage of missing values per column after the suggested imputations,
# highest first. The mean of the boolean NaN mask is used so the denominator
# tracks the frame's current row count instead of a hard-coded 9105.
data.isna().mean().mul(100).sort_values(ascending=False)

adlp        61.954970
glucose     49.423394
totmcst     38.165843
income      32.751236
adls        31.488193
ph          25.085118
prg2m       18.110928
edu         17.946183
prg6m       17.935200
totcst       9.752883
charges      1.889072
avtisst      0.900604
race         0.461285
dnrday       0.329489
dnr          0.329489
resp         0.010983
temp         0.010983
hrt          0.010983
meanbp       0.010983
sod          0.010983
surv2m       0.010983
surv6m       0.010983
aps          0.010983
sps          0.010983
scoma        0.010983
bun          0.000000
urine        0.000000
alb          0.000000
crea         0.000000
adlsc        0.000000
bili         0.000000
age          0.000000
pafi         0.000000
wblc         0.000000
sex          0.000000
ca           0.000000
dementia     0.000000
diabetes     0.000000
hday         0.000000
num.co       0.000000
dzclass      0.000000
dzgroup      0.000000
death        0.000000
dtype: float64

## Dealing with missing values - deleting a few columns based on their high percentage of missing values

Columns we are going to delete: 
- adlp (61.954970%)
- glucose (49.423394%)
- totmcst (38.165843%)
- income (32.751236%)
- adls (31.488193%)
- ph (25.085118%)

In [10]:
# Columns to drop: the six with the highest share of missing values.
columns_to_delete = [
    'adlp',
    'glucose',
    'totmcst',
    'income',
    'adls',
    'ph',
]

In [11]:
# Rebind `data` to the result of the project helper `column_deletion`, called
# with the frame and the list `columns_to_delete`.
# NOTE(review): presumably returns the frame without those columns — confirm
# whether the helper also mutates its input.
data = column_deletion(data,columns_to_delete)

In [12]:
# Percentage of missing values per remaining column, highest first.
# The mean of the boolean NaN mask is used so the denominator tracks the
# frame's current row count instead of a hard-coded 9105.
data.isna().mean().mul(100).sort_values(ascending=False)

prg2m       18.110928
edu         17.946183
prg6m       17.935200
totcst       9.752883
charges      1.889072
avtisst      0.900604
race         0.461285
dnrday       0.329489
dnr          0.329489
surv2m       0.010983
surv6m       0.010983
hrt          0.010983
meanbp       0.010983
sod          0.010983
resp         0.010983
temp         0.010983
aps          0.010983
sps          0.010983
scoma        0.010983
crea         0.000000
bili         0.000000
alb          0.000000
bun          0.000000
urine        0.000000
pafi         0.000000
adlsc        0.000000
age          0.000000
ca           0.000000
wblc         0.000000
sex          0.000000
dementia     0.000000
diabetes     0.000000
hday         0.000000
num.co       0.000000
dzclass      0.000000
dzgroup      0.000000
death        0.000000
dtype: float64

## Dealing with missing values - using a KNN imputer for numerical values and deleting rows with missing categorical values

In [13]:
# Rebind `data` to the result of the project helper `imputation`.
# NOTE(review): the second argument (5) is presumably the number of neighbours
# for the KNN imputer — confirm against data_preprocessing.imputation.
# The describe() output further down reports 9033 rows, versus the 9105 used
# as the row count earlier, so this step also removes rows.
data = imputation(data,5)

In [14]:
# Count of missing values per column after imputation, largest first.
(
    data
    .isna()
    .sum()
    .sort_values(ascending=False)
)

age         0
prg2m       0
dnr         0
dnrday      0
meanbp      0
wblc        0
hrt         0
resp        0
temp        0
pafi        0
alb         0
bili        0
crea        0
sod         0
bun         0
urine       0
adlsc       0
prg6m       0
ca          0
sex         0
dementia    0
dzgroup     0
dzclass     0
num.co      0
edu         0
scoma       0
charges     0
totcst      0
avtisst     0
race        0
sps         0
aps         0
surv2m      0
surv6m      0
hday        0
diabetes    0
death       0
dtype: int64

In [15]:
# Summary statistics for the frame after imputation.
data.describe()

Unnamed: 0,age,num.co,edu,scoma,charges,totcst,avtisst,sps,aps,surv2m,...,temp,pafi,alb,bili,crea,sod,bun,urine,adlsc,death
count,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,...,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0,9033.0
mean,62.677624,1.870475,11.72472,12.020702,60292.33,32756.196504,22.578699,25.503032,37.560943,0.636525,...,37.104041,263.550738,3.153353,2.112286,1.763744,137.562781,19.983742,2357.902676,1.88586,0.680726
std,15.591624,1.346043,3.174467,24.602534,102421.5,45831.007706,13.185062,9.841931,19.857789,0.247597,...,1.251305,103.170174,0.744674,4.553252,1.681046,6.028766,23.249991,1004.20083,2.00614,0.466221
min,18.04199,0.0,0.0,0.0,1169.0,0.0,1.0,0.199982,0.0,0.0,...,31.69922,12.0,0.399963,0.099991,0.099991,110.0,1.0,0.0,0.0,0.0
25%,52.82999,1.0,10.0,0.0,9865.195,6412.2891,12.0,19.0,23.0,0.509888,...,36.19531,180.0,2.699707,0.599976,0.899902,134.0,6.51,2080.0,0.0,0.0
50%,64.909,2.0,12.0,0.0,25441.0,16159.3828,19.5,23.898438,34.0,0.715942,...,36.69531,276.1875,3.5,1.01,1.199951,137.0,6.51,2502.0,1.0,1.0
75%,74.02899,3.0,13.0,9.0,65194.0,40000.1452,31.5,30.199219,48.0,0.825928,...,38.19531,333.3,3.5,1.299805,1.899902,141.0,24.0,2502.0,3.0,1.0
max,101.84796,9.0,31.0,100.0,1435423.0,633212.0,83.0,99.1875,143.0,0.969971,...,41.69531,890.375,29.0,63.0,21.5,181.0,300.0,9000.0,7.073242,1.0


In [16]:
# Show each column's dtype; the `object` columns are the ones still to be encoded.
data.dtypes

age         float64
sex          object
dzgroup      object
dzclass      object
num.co      float64
edu         float64
scoma       float64
charges     float64
totcst      float64
avtisst     float64
race         object
sps         float64
aps         float64
surv2m      float64
surv6m      float64
hday        float64
diabetes    float64
dementia    float64
ca           object
prg2m       float64
prg6m       float64
dnr          object
dnrday      float64
meanbp      float64
wblc        float64
hrt         float64
resp        float64
temp        float64
pafi        float64
alb         float64
bili        float64
crea        float64
sod         float64
bun         float64
urine       float64
adlsc       float64
death       float64
dtype: object

## Encoding of the categorical columns

In [17]:
# Run the project helper `encode_categorical`; it returns two values, bound
# here to the transformed frame (`data`) and to `label_encoders`.
# NOTE(review): `label_encoders` presumably holds the fitted encoders needed
# to decode or re-apply the mapping — confirm against the helper.
data, label_encoders = encode_categorical(data)

In [18]:
# Display the frame after categorical encoding.
data

Unnamed: 0,age,sex,num.co,edu,scoma,charges,totcst,avtisst,sps,aps,...,dzclass_Cancer,dzclass_Coma,race_black,race_hispanic,race_other,race_white,ca_no,ca_yes,dnr_dnr before sadm,dnr_no dnr
0,62.84998,1,0.0,11.0,0.0,9715.0,8606.34528,7.000000,33.898438,20.0,...,True,False,False,False,True,False,False,False,False,True
2,52.74698,0,2.0,12.0,0.0,41094.0,33709.64216,13.000000,20.500000,45.0,...,False,False,False,False,False,True,True,False,False,True
3,42.38498,0,2.0,11.0,0.0,3075.0,2439.67072,7.000000,20.097656,19.0,...,True,False,False,False,False,True,False,False,False,True
4,79.88495,0,1.0,11.2,26.0,50127.0,36290.29064,18.666656,23.500000,30.0,...,False,False,False,False,False,True,True,False,False,True
5,93.01599,1,1.0,14.0,55.0,6884.0,24111.78474,5.000000,19.398438,27.0,...,False,True,False,False,False,True,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9100,66.07300,1,1.0,8.0,0.0,52870.0,34329.31250,20.333328,16.296875,22.0,...,False,False,False,False,False,True,True,False,False,True
9101,55.15399,0,1.0,11.0,41.0,35377.0,23558.50000,18.000000,25.796875,31.0,...,False,True,False,False,False,True,True,False,False,True
9102,70.38196,1,1.0,9.8,0.0,46564.0,31409.01560,23.000000,22.699219,39.0,...,False,False,False,False,False,True,True,False,False,True
9103,47.01999,1,1.0,13.0,0.0,58439.0,29165.55154,35.500000,40.195312,51.0,...,False,False,False,False,False,True,False,True,False,False


## Deleting highly correlated columns with lower information gain

In [19]:
# Run the project helper `high_corr_deletion`; it returns the reduced frame
# and `drop_cols`, which is displayed below as a list of column names.
# NOTE(review): 0.75 is presumably the correlation threshold above which one
# column of a pair is dropped — confirm against the helper.
data, drop_cols = high_corr_deletion(data, 0.75)

In [20]:
# Display the frame returned by high_corr_deletion.
data

Unnamed: 0,age,sex,num.co,edu,scoma,totcst,avtisst,surv6m,hday,diabetes,...,dzclass_Cancer,dzclass_Coma,race_black,race_hispanic,race_other,race_white,ca_no,ca_yes,dnr_dnr before sadm,dnr_no dnr
0,62.84998,1,0.0,11.0,0.0,8606.34528,7.000000,0.036995,1.0,0.0,...,True,False,False,False,True,False,False,False,False,True
2,52.74698,0,2.0,12.0,0.0,33709.64216,13.000000,0.664917,4.0,0.0,...,False,False,False,False,False,True,True,False,False,True
3,42.38498,0,2.0,11.0,0.0,2439.67072,7.000000,0.411987,1.0,0.0,...,True,False,False,False,False,True,False,False,False,True
4,79.88495,0,1.0,11.2,26.0,36290.29064,18.666656,0.532959,3.0,0.0,...,False,False,False,False,False,True,True,False,False,True
5,93.01599,1,1.0,14.0,55.0,24111.78474,5.000000,0.214996,1.0,0.0,...,False,True,False,False,False,True,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9100,66.07300,1,1.0,8.0,0.0,34329.31250,20.333328,0.801880,13.0,0.0,...,False,False,False,False,False,True,True,False,False,True
9101,55.15399,0,1.0,11.0,41.0,23558.50000,18.000000,0.485962,1.0,0.0,...,False,True,False,False,False,True,True,False,False,True
9102,70.38196,1,1.0,9.8,0.0,31409.01560,23.000000,0.660889,18.0,0.0,...,False,False,False,False,False,True,True,False,False,True
9103,47.01999,1,1.0,13.0,0.0,29165.55154,35.500000,0.091995,22.0,0.0,...,False,False,False,False,False,True,False,True,False,False


In [21]:
# Display the second value returned by high_corr_deletion.
drop_cols

['charges', 'sps', 'aps', 'surv2m', 'prg2m']

In [22]:
# Strip the characters '<', '>', '[' and ']' from every column name.
# NOTE(review): presumably needed because a downstream modelling library
# rejects these characters in feature names — confirm.
data.columns = data.columns.str.replace(r'[<>[\]]', '', regex=True)

In [23]:
# Display the frame after the column names have been sanitised.
data

Unnamed: 0,age,sex,num.co,edu,scoma,totcst,avtisst,surv6m,hday,diabetes,...,dzclass_Cancer,dzclass_Coma,race_black,race_hispanic,race_other,race_white,ca_no,ca_yes,dnr_dnr before sadm,dnr_no dnr
0,62.84998,1,0.0,11.0,0.0,8606.34528,7.000000,0.036995,1.0,0.0,...,True,False,False,False,True,False,False,False,False,True
2,52.74698,0,2.0,12.0,0.0,33709.64216,13.000000,0.664917,4.0,0.0,...,False,False,False,False,False,True,True,False,False,True
3,42.38498,0,2.0,11.0,0.0,2439.67072,7.000000,0.411987,1.0,0.0,...,True,False,False,False,False,True,False,False,False,True
4,79.88495,0,1.0,11.2,26.0,36290.29064,18.666656,0.532959,3.0,0.0,...,False,False,False,False,False,True,True,False,False,True
5,93.01599,1,1.0,14.0,55.0,24111.78474,5.000000,0.214996,1.0,0.0,...,False,True,False,False,False,True,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9100,66.07300,1,1.0,8.0,0.0,34329.31250,20.333328,0.801880,13.0,0.0,...,False,False,False,False,False,True,True,False,False,True
9101,55.15399,0,1.0,11.0,41.0,23558.50000,18.000000,0.485962,1.0,0.0,...,False,True,False,False,False,True,True,False,False,True
9102,70.38196,1,1.0,9.8,0.0,31409.01560,23.000000,0.660889,18.0,0.0,...,False,False,False,False,False,True,True,False,False,True
9103,47.01999,1,1.0,13.0,0.0,29165.55154,35.500000,0.091995,22.0,0.0,...,False,False,False,False,False,True,False,True,False,False


In [24]:
# Persist the prepared frame for the modelling notebooks.
# DataFrame.to_csv returns None when it is given a path, so binding its return
# value left `data_final` as None. Bind the name to the frame first, then
# write the file (the row index is not written).
data_final = data
data_final.to_csv('../data/data_final.csv', index=False)

### Data is now prepared for modeling