In [22]:
import pandas as pd
import numpy as np
import os
import math
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_curve
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import pickle as pkl
%matplotlib inline

In [3]:
# Both splits are pipe-delimited CSV files stored under ../dataset.
train_df, val_df = (
    pd.read_csv(csv_path, sep='|')
    for csv_path in ('../dataset/train_cleaned.csv', '../dataset/val_cleaned.csv')
)

In [4]:
# Show the first five rows of the training split as the cell output.
train_df.head()

Unnamed: 0,patient_id,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,BaseExcess,HCO3,...,Magnesium,Potassium,Hct,Hgb,WBC,Age,Gender,HospAdmTime,ICULOS,SepsisLabel
0,1,,,,,,,,,,...,,,,,,84.31,1.0,-0.03,1.0,0.0
1,1,87.0,98.0,,94.5,71.5,,21.0,,,...,,,,,,84.31,1.0,-0.03,2.0,0.0
2,1,85.0,97.0,,89.0,62.0,,22.0,,,...,,,,,,84.31,1.0,-0.03,3.0,0.0
3,1,83.0,97.0,36.28,104.0,66.0,,22.0,,30.0,...,2.1,4.0,32.6,10.7,9.5,84.31,1.0,-0.03,4.0,0.0
4,1,81.0,98.0,,87.0,67.0,,18.0,,,...,,,,,,84.31,1.0,-0.03,5.0,0.0


In [5]:
# First five rows recorded for patient 5.
train_df.loc[train_df['patient_id'] == 5].head()

Unnamed: 0,patient_id,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,BaseExcess,HCO3,...,Magnesium,Potassium,Hct,Hgb,WBC,Age,Gender,HospAdmTime,ICULOS,SepsisLabel
178,5,,,,,,,,,,...,,,,,,65.28,0.0,-123.97,1.0,0.0
179,5,99.5,100.0,34.72,96.5,80.5,78.0,20.5,,,...,,,,,,65.28,0.0,-123.97,2.0,0.0
180,5,93.0,100.0,35.61,121.0,96.0,82.0,20.0,-1.0,,...,,,,,,65.28,0.0,-123.97,3.0,0.0
181,5,91.0,100.0,35.94,119.0,102.0,91.0,22.0,,,...,,,,,,65.28,0.0,-123.97,4.0,0.0
182,5,84.5,99.0,36.06,106.5,98.0,85.5,22.0,0.0,,...,,,,,,65.28,0.0,-123.97,5.0,0.0


In [6]:
# Data preparation: impute missing Temp and vital-sign readings.

def fill_temp(df, default_temp=37.0, vital_cols=('HR', 'SBP', 'Resp', 'O2Sat', 'MAP')):
    """Impute missing temperature and vital-sign readings per patient, in place.

    Rows are split into runs of consecutive identical ``patient_id`` values
    and every fill is confined to one run, so a value is never carried from
    one patient to the next.

    * ``Temp`` is forward-filled inside each run; rows that precede the run's
      first measurement receive ``default_temp``.
    * Each column in ``vital_cols`` takes the next measurement in the run
      (back-fill); rows after the run's last measurement take the most recent
      one (forward-fill). A run without any measurement in a column keeps NaN
      there, leaving the decision to whatever missing-value handling follows.

    Deliberate differences from the previous row-by-row implementation:

    * rows are addressed by position throughout, so the result no longer
      depends on the frame having a default 0..n-1 index;
    * a patient without any reading no longer inherits the previous patient's
      value (or, for the very first row, the last row's value);
    * trailing gaps of the final patient in the frame are now filled;
    * the first patient uses the same default temperature as everyone else
      (previously 36.3 was used, and only when that patient's id was 1).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``patient_id``, ``Temp`` and every column named in
        ``vital_cols``. The frame is modified in place.
    default_temp : float, default 37.0
        Value used for ``Temp`` before a patient's first measurement.
    vital_cols : sequence of str
        Columns imputed with the back-fill-then-forward-fill rule.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, for call-chaining convenience.
    """
    if df.empty:
        return df

    patient = df['patient_id']
    # A new run starts wherever patient_id differs from the previous row.
    # A plain array is used as the group key so grouping is purely positional
    # and independent of the frame's index labels.
    run_id = patient.ne(patient.shift()).cumsum().to_numpy()

    temp_filled = df.groupby(run_id)['Temp'].ffill().fillna(default_temp)
    df['Temp'] = temp_filled.to_numpy()

    vitals = list(vital_cols)
    if vitals:
        # Prefer the next reading in the run; fall back to the previous one
        # only for rows that come after the run's last reading.
        next_reading = df.groupby(run_id)[vitals].bfill()
        completed = next_reading.groupby(run_id).ffill()
        df[vitals] = completed.to_numpy()

    return df


In [7]:
# Run the imputation on both splits and rebind each name to the returned frame.
train_df, val_df = map(fill_temp, (train_df, val_df))

In [8]:
# Percentage of missing values in each column of the training split.
n_rows = len(train_df)
missing_pct = train_df.isna().sum().div(n_rows).mul(100)
missing_pct

patient_id      0.000000
HR              0.000000
O2Sat           0.000000
Temp            0.000000
SBP             0.000000
MAP             0.000000
DBP            48.482230
Resp            0.000000
BaseExcess     89.641083
HCO3           91.979409
FiO2           85.851514
pH             88.594990
PaCO2          91.261501
BUN            91.867048
Chloride       91.719837
Glucose        87.757280
Magnesium      92.252776
Potassium      89.166593
Hct            88.230863
Hgb            91.207772
WBC            92.525780
Age             0.000000
Gender          0.000000
HospAdmTime     0.000000
ICULOS          0.000000
SepsisLabel     0.000000
dtype: float64

In [9]:
# Replace every remaining NaN in both splits with one sentinel value, in place.
MISSING_SENTINEL = -999
for split_df in (train_df, val_df):
    split_df.fillna(MISSING_SENTINEL, inplace=True)