# Libraries

! pip install numpy pandas sklearn

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

import warnings
warnings.filterwarnings("ignore")

# Import data

## X

In [2]:
# Feature matrices, one CSV per split; the first CSV column becomes the index.
# Order: training (model fitting), validation (cross validation), test (prediction).
X_train_realmean, X_valid_realmean, X_test_realmean = (
    pd.read_csv(csv_path, index_col=[0])
    for csv_path in (
        "../../X/Xtrainmean.csv",
        "../../X/Xvalidmean.csv",
        "../../X/Xtestmean.csv",
    )
)

## Task 1

In [3]:
# Task 1 label tables for the training and validation splits.
y_train_t1, y_valid_t1 = map(
    pd.read_csv,
    ("../../Task1/Y_train.csv", "../../Task1/Y_valid.csv"),
)

In [4]:
# Task 1 target: the "mort_icu" column of each label table.
y_train_t1_value, y_valid_t1_value = (
    labels["mort_icu"] for labels in (y_train_t1, y_valid_t1)
)

## Task 2

In [5]:
# Task 2 label tables for the training and validation splits.
y_train_t2, y_valid_t2 = map(
    pd.read_csv,
    ("../../Task2/Y_train.csv", "../../Task2/Y_valid.csv"),
)

In [6]:
# Task 2 target: the "los_icu" column of each label table.
y_train_t2_value, y_valid_t2_value = (
    labels["los_icu"] for labels in (y_train_t2, y_valid_t2)
)

# Data Pre-processing

In [7]:
# Show the training feature matrix as this cell's output.
X_train_realmean

Unnamed: 0,alanine aminotransferase,albumin,albumin ascites,albumin pleural,albumin urine,alkaline phosphate,anion gap,asparate aminotransferase,basophils,bicarbonate,...,tidal volume set,tidal volume spontaneous,total protein,total protein urine,troponin-i,troponin-t,venous pvo2,weight,white blood cell count,white blood cell count urine
3_145834_211552,-0.254460,-1.979855,0.0,0.0,0.0,-0.318615,0.931458,-0.226618,0.000000,-1.694872,...,0.440628,0.000000,0.0,0.0,0.0,0.000000,0.0,1.026004,0.634186,0.160067
6_107064_228232,-0.256599,-0.251806,0.0,0.0,0.0,-0.422405,1.723627,-0.254291,-0.742403,-1.444821,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,-0.181589,0.000000
9_150750_220597,-0.269432,0.000000,0.0,0.0,0.0,-0.367050,-0.330641,-0.271686,2.097036,1.019964,...,1.039571,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,-0.181589,0.000000
11_194540_229441,0.000000,0.000000,0.0,0.0,0.0,0.000000,-0.088963,0.000000,0.000000,0.484141,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,-0.139623,0.000000
12_112213_232669,0.000000,0.000000,0.0,0.0,0.0,0.000000,1.240269,0.000000,0.000000,-1.694872,...,0.680205,-1.333059,0.0,0.0,0.0,-0.317391,0.0,0.000000,-0.414731,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99966_167228_252173,0.000000,1.044231,0.0,0.0,0.0,0.000000,-0.572320,0.000000,0.290121,0.091204,...,0.000000,0.000000,0.0,0.0,0.0,-0.361918,0.0,0.584864,-0.156720,0.000000
99973_150202_275083,0.000000,0.000000,0.0,0.0,0.0,0.000000,-0.290362,0.000000,-0.677870,-2.230694,...,0.000000,0.000000,0.0,0.0,0.0,-0.364145,0.0,0.973702,0.429243,0.000000
99982_151454_221194,-0.260876,1.044231,0.0,0.0,0.0,-0.297857,-0.330641,-0.261407,0.000000,1.448622,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,-0.443803,-0.682067,0.000000
99991_151118_226241,0.000000,0.000000,0.0,0.0,0.0,0.000000,-0.753579,0.000000,0.000000,-0.051682,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.782719,-0.633884,-0.303768


## Process 2: Elimination of features containing 70% zero values (called "nozero") and Imputation

In [8]:
# Drop sparse features: every column whose zero count in the TRAINING split
# exceeds MAX_ZERO_ROWS is removed, and the validation/test splits are reduced
# to the same surviving columns so all three frames stay aligned.
# NOTE(review): the section heading says 70%, but the training frame displayed
# in this notebook ends at row index 16758 (16,759 rows), so 12000 zeros is
# about 71.6% -- confirm which cutoff is intended before changing it.
import copy  # unused by this cell now; kept in case other cells rely on the name
MAX_ZERO_ROWS = 12000

# One vectorised pass over the training frame instead of a per-column loop with
# in-place drops; the source frames are never mutated, so no deep copies are needed.
zero_counts = (X_train_realmean == 0).sum()
X_train_nozero = X_train_realmean.loc[:, zero_counts <= MAX_ZERO_ROWS].copy()

headnozero = X_train_nozero.columns.tolist()
X_valid_nozero = X_valid_realmean[headnozero].copy()
X_test_nozero = X_test_realmean[headnozero].copy()

In [9]:
# Replace the remaining zeros (treated as missing values) with per-feature means.
# The imputer is fitted on the training split ONLY and then applied unchanged to
# the validation and test splits. Re-fitting on each split would fill every split
# with its own means (leaking validation/test statistics into preprocessing) and
# could drop a column that happens to be all-zero in one split, which would break
# the column assignment below.
imp = SimpleImputer(missing_values=0, strategy='mean')
X_train_nozero = pd.DataFrame(imp.fit_transform(X_train_nozero), columns=headnozero)
X_valid_nozero = pd.DataFrame(imp.transform(X_valid_nozero), columns=headnozero)
X_test_nozero = pd.DataFrame(imp.transform(X_test_nozero), columns=headnozero)
# As before, the rebuilt frames carry a fresh 0..n-1 row index.
X_train_nozero

Unnamed: 0,alanine aminotransferase,albumin,alkaline phosphate,anion gap,asparate aminotransferase,basophils,bicarbonate,bilirubin,blood urea nitrogen,calcium,...,respiratory rate set,sodium,systolic blood pressure,temperature,tidal volume observed,tidal volume set,tidal volume spontaneous,troponin-t,weight,white blood cell count
0,-0.254460,-1.979855,-0.318615,0.931458,-0.226618,0.014621,-1.694872,-0.298127,0.792807,-1.004507,...,0.280562,0.260075,-0.638602,-0.002273,0.800876,0.440628,0.000309,-0.053738,1.026004,0.634186
1,-0.256599,-0.251806,-0.422405,1.723627,-0.254291,-0.742403,-1.444821,-0.444605,1.737977,-0.016061,...,-0.075873,-0.417323,1.537514,-0.518339,0.069184,0.052263,0.000309,-0.053738,-0.010417,-0.181589
2,-0.269432,0.061017,-0.367050,-0.330641,-0.271686,2.097036,1.019964,-0.395779,-0.439621,0.636039,...,-0.133014,-0.047833,1.723172,0.101098,0.801063,1.039571,0.000309,-0.053738,-0.010417,-0.181589
3,-0.140093,0.061017,-0.052863,-0.088963,-0.136461,0.014621,0.484141,-0.115158,-0.439621,0.910607,...,-0.075873,0.598773,-0.661305,-0.188482,0.069184,0.052263,0.000309,-0.053738,-0.010417,-0.139623
4,-0.140093,0.061017,-0.052863,1.240269,-0.136461,0.014621,-1.694872,-0.115158,0.348021,0.224186,...,0.035793,0.875890,0.603034,-0.157922,0.476344,0.680205,-1.333059,-0.317391,-0.010417,-0.414731
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16755,-0.140093,1.044231,-0.052863,-0.572320,-0.136461,0.290121,0.091204,-0.115158,-0.200239,0.407232,...,-0.075873,-0.232578,0.667703,-1.013160,0.069184,0.052263,0.000309,-0.361918,0.584864,-0.156720
16756,-0.140093,0.061017,-0.052863,-0.290362,-0.136461,-0.677870,-2.230694,-0.115158,-0.331513,-0.599519,...,-0.075873,0.260075,-0.389591,-0.067515,0.069184,0.052263,0.000309,-0.364145,0.973702,0.429243
16757,-0.260876,1.044231,-0.297857,-0.330641,-0.261407,0.014621,1.448622,-0.151648,0.116362,0.361470,...,-0.075873,-0.546644,-1.128922,-0.353408,0.069184,0.052263,0.000309,-0.053738,-0.443803,-0.682067
16758,-0.140093,0.061017,-0.052863,-0.753579,-0.136461,0.014621,-0.051682,-0.115158,-0.219544,-0.788284,...,-0.075873,1.014449,2.021813,0.149653,0.069184,0.052263,0.000309,-0.053738,0.782719,-0.633884


In [14]:
# Task 2 uses the zero-filtered, mean-imputed feature frames as they are.
# NOTE(review): the normalization cell also reads X_train_selected_t1,
# X_valid_selected_t1 and X_test_selected_t1, but no visible cell assigns them
# (execution counts jump from 9 to 14) -- confirm where they should come from.
X_train_selected_t2, X_test_selected_t2, X_valid_selected_t2 = (
    X_train_nozero,
    X_test_nozero,
    X_valid_nozero,
)

## Normalization

In [15]:
def _standardize_splits(train, valid, test):
    """Standardize three data splits with statistics learned from ``train`` only.

    Parameters
    ----------
    train, valid, test : DataFrame or ndarray
        Feature matrices with the same column layout.

    Returns
    -------
    raw : tuple of ndarray
        The inputs converted to plain arrays, ordered (train, valid, test).
    scaled : tuple of ndarray
        The standardized arrays, in the same order.
    fitted : StandardScaler
        The scaler fitted on ``train``.
    """
    # np.asarray accepts both DataFrames and arrays, so re-running the cell after
    # the names below have been rebound to arrays no longer fails on ``.values``.
    raw = tuple(np.asarray(split) for split in (train, valid, test))
    # Fit on the training split only; validation and test reuse its mean/std so
    # all three splits live on the same scale and no held-out statistics leak in.
    fitted = preprocessing.StandardScaler().fit(raw[0])
    scaled = tuple(fitted.transform(split) for split in raw)
    return raw, scaled, fitted


# NOTE(review): X_train_selected_t1 / X_valid_selected_t1 / X_test_selected_t1 are
# not assigned in any visible cell; on a fresh kernel this cell raises NameError
# until they are defined -- confirm where they should come from.
(
    (X_train_selected_t1, X_valid_selected_t1, X_test_selected_t1),
    (X_train_selected_t1_scaled, X_valid_selected_t1_scaled, X_test_selected_t1_scaled),
    scaler_t1,
) = _standardize_splits(X_train_selected_t1, X_valid_selected_t1, X_test_selected_t1)

(
    (X_train_selected_t2, X_valid_selected_t2, X_test_selected_t2),
    (X_train_selected_t2_scaled, X_valid_selected_t2_scaled, X_test_selected_t2_scaled),
    scaler_t2,
) = _standardize_splits(X_train_selected_t2, X_valid_selected_t2, X_test_selected_t2)

# DataFrame views of the standardized arrays (default integer row/column labels).
X_train_selected_t1_norm = pd.DataFrame(X_train_selected_t1_scaled)
X_valid_selected_t1_norm = pd.DataFrame(X_valid_selected_t1_scaled)
X_test_selected_t1_norm = pd.DataFrame(X_test_selected_t1_scaled)

X_train_selected_t2_norm = pd.DataFrame(X_train_selected_t2_scaled)
X_valid_selected_t2_norm = pd.DataFrame(X_valid_selected_t2_scaled)
X_test_selected_t2_norm = pd.DataFrame(X_test_selected_t2_scaled)

# The old single `scaler` name stays bound (to the Task 2 scaler) in case other
# cells still refer to it; prefer scaler_t1 / scaler_t2 in new code.
scaler = scaler_t2


# Data report

In [16]:
# Print a summary of the variable names this notebook prepares for modelling.
banner = "*" * 60
rule = "-" * 60
report_lines = [
    banner,
    "There are 6 set of X",
    "X_train_selected_t1, X_train_selected_t2, X_valid_selected_t1,X_valid_selected_t2,X_test_selected_t1,X_test_selected_t2",
    rule,
    "Normalized version",
    "X_train_selected_t1_norm, X_train_selected_t2_norm, X_valid_selected_t1_norm,X_valid_selected_t2_norm,X_test_selected_t1_norm,X_test_selected_t2_norm",
    rule,
    "There are 4 set of Y",
    "y_train_t1, y_train_t2, y_valid_t1, y_valid_t2",
    "when training, please use: 'y_train_t1_value,y_train_t2_value,y_valid_t1_value,y_valid_t2_value'",
    banner,
]
print("\n".join(report_lines))

************************************************************
There are 6 set of X
X_train_selected_t1, X_train_selected_t2, X_valid_selected_t1,X_valid_selected_t2,X_test_selected_t1,X_test_selected_t2
------------------------------------------------------------
Normalized version
X_train_selected_t1_norm, X_train_selected_t2_norm, X_valid_selected_t1_norm,X_valid_selected_t2_norm,X_test_selected_t1_norm,X_test_selected_t2_norm
------------------------------------------------------------
There are 4 set of Y
y_train_t1, y_train_t2, y_valid_t1, y_valid_t2
when training, please use: 'y_train_t1_value,y_train_t2_value,y_valid_t1_value,y_valid_t2_value'
************************************************************
