In [90]:
import pandas as pd

In [110]:
# Load the pipe-delimited train and validation splits (train first, then val).
train_df, val_df = (
    pd.read_csv(csv_path, sep='|')
    for csv_path in ('../dataset/train_cleaned.csv', '../dataset/val_cleaned.csv')
)

In [92]:
# Per-column percentage of rows whose value equals the -999 placeholder.
missing_pct = (train_df == -999).sum().div(train_df.shape[0]).mul(100)
missing_pct

patient_id      0.000000
HR              0.002020
O2Sat           0.000000
Temp            0.000000
SBP             0.000000
MAP             0.000000
DBP            31.333650
Resp            0.000092
FiO2           91.721048
Glucose        82.780607
Potassium      90.722248
Hct            91.161863
Hgb            92.643718
Age             0.000000
Gender          0.000000
HospAdmTime     0.000000
ICULOS          0.000000
SepsisLabel     0.000000
dtype: float64

In [93]:
# Preview the loaded training frame; -999.0 entries are the placeholder
# values counted in the percentage table above.
train_df

Unnamed: 0,patient_id,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,FiO2,Glucose,Potassium,Hct,Hgb,Age,Gender,HospAdmTime,ICULOS,SepsisLabel
0,1,87.0,98.0,36.30,94.5,71.5,-999.0,21.0,-999.0,-999.0,-999.0,-999.0,-999.0,84.31,1.0,-0.03,1.0,0.0
1,1,87.0,98.0,36.30,94.5,71.5,-999.0,21.0,-999.0,-999.0,-999.0,-999.0,-999.0,84.31,1.0,-0.03,2.0,0.0
2,1,85.0,97.0,36.30,89.0,62.0,-999.0,22.0,-999.0,-999.0,-999.0,-999.0,-999.0,84.31,1.0,-0.03,3.0,0.0
3,1,83.0,97.0,36.28,104.0,66.0,-999.0,22.0,-999.0,109.0,4.0,32.6,10.7,84.31,1.0,-0.03,4.0,0.0
4,1,81.0,98.0,36.28,87.0,67.0,-999.0,18.0,-999.0,-999.0,-999.0,-999.0,-999.0,84.31,1.0,-0.03,5.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1088901,28236,-999.0,95.0,36.40,137.0,91.0,63.0,22.0,-999.0,-999.0,-999.0,-999.0,-999.0,42.00,0.0,-4.24,18.0,0.0
1088902,28236,-999.0,97.0,36.40,164.0,120.0,90.0,18.0,-999.0,111.0,-999.0,-999.0,-999.0,42.00,0.0,-4.24,19.0,0.0
1088903,28236,-999.0,97.0,36.40,162.0,116.0,84.0,18.0,-999.0,-999.0,4.1,-999.0,-999.0,42.00,0.0,-4.24,20.0,0.0
1088904,28236,-999.0,95.0,36.40,154.0,107.0,76.0,19.0,-999.0,-999.0,-999.0,-999.0,-999.0,42.00,0.0,-4.24,21.0,0.0


### HR

In [94]:
def get_hr_label(df):
    """Add an ``hr_label`` column derived from the ``HR`` column.

    Labels:
        1 -- HR above 90
        0 -- HR at or below 90 (and actually recorded)
        2 -- HR is the -999 missing-value placeholder

    The placeholder used to satisfy ``HR <= 90`` and was therefore labelled
    0; it now gets its own class, the same way ``get_dbp_label`` treats DBP.

    Parameters:
        df: DataFrame with a numeric ``HR`` column. It is modified in place.

    Returns:
        The same DataFrame object, with ``hr_label`` set. Rows whose HR is
        NaN match none of the conditions and are not assigned.
    """
    heart_rate = df['HR']

    df.loc[heart_rate > 90, 'hr_label'] = 1
    df.loc[(heart_rate <= 90) & (heart_rate > -999), 'hr_label'] = 0
    # Keep missing readings apart from genuinely low/normal heart rates.
    df.loc[heart_rate <= -999, 'hr_label'] = 2

    return df

In [95]:
# Label both splits with the same heart-rate rule (train first, then val).
train_df, val_df = get_hr_label(train_df), get_hr_label(val_df)

### Age

In [111]:
def get_age_label(df):
    """Add an ``age_label`` column that buckets the ``Age`` column.

    Labels:
        1 -- Age of 80 or more
        0 -- Age from 18 up to (but not including) 80
        2 -- Age below 18

    Parameters:
        df: DataFrame with a numeric ``Age`` column. It is modified in place.

    Returns:
        The same DataFrame object, with ``age_label`` set. Rows whose Age is
        NaN match none of the conditions and are not assigned.
    """
    age = df['Age']

    is_elderly = age >= 80
    is_adult = (age >= 18) & (age < 80)
    is_minor = age < 18

    df.loc[is_elderly, 'age_label'] = 1
    df.loc[is_adult, 'age_label'] = 0
    df.loc[is_minor, 'age_label'] = 2

    return df

In [112]:
# Bucket ages on both splits (train first, then val).
train_df, val_df = get_age_label(train_df), get_age_label(val_df)

### Temp

In [98]:
def get_temp_label(df):
    """Add a ``temp_label`` column derived from the ``Temp`` column.

    Labels:
        1 -- Temp strictly between 36 and 38
        0 -- Temp at or below 36, or at or above 38

    Note that a -999 placeholder satisfies ``Temp <= 36`` and is labelled 0.

    Parameters:
        df: DataFrame with a numeric ``Temp`` column. It is modified in place.

    Returns:
        The same DataFrame object, with ``temp_label`` set. Rows whose Temp
        is NaN match neither condition and are not assigned.
    """
    temp = df['Temp']

    outside_range = temp.ge(38) | temp.le(36)
    inside_range = temp.lt(38) & temp.gt(36)

    df.loc[outside_range, 'temp_label'] = 0
    df.loc[inside_range, 'temp_label'] = 1

    return df

In [99]:
# Label temperatures on both splits (train first, then val).
train_df, val_df = get_temp_label(train_df), get_temp_label(val_df)

### Resp

Normal respiration rate (breaths per minute) by age (years):
- 0 ~ 1: 30 ~ 60
- 1 ~ 3: 24 ~ 40
- 3 ~ 6: 22 ~ 34
- 6 ~ 12: 18 ~ 30
- 12 ~ 18: 12 ~ 16
- 18+: 12 ~ 20

In [100]:
def get_resp_label(df):
    """Add a ``resp_label`` column: is ``Resp`` inside the normal range for ``Age``?

    Labels (strings, as before):
        '0' -- Resp within the inclusive normal range of the row's age band
        '1' -- Resp below or above that range
        '2' -- Resp is the -999 missing-value placeholder

    Age bands and their inclusive normal ranges:
        Age <= 1        30 - 60
        1 < Age <= 3    24 - 40
        3 < Age <= 6    22 - 34
        6 < Age <= 12   18 - 30
        12 < Age < 18   12 - 16
        Age >= 18       12 - 20

    The bands are contiguous, so fractional ages such as 1.5 or 17.5 are
    labelled. The earlier integer-edged bands (``between(2, 3)``,
    ``between(4, 6)``, ...) left such rows without a label. Whole-number
    ages fall into exactly the same band as before.

    The -999 placeholder used to be labelled '1' because it is below every
    lower bound; it now gets its own class, mirroring ``get_dbp_label``.

    Parameters:
        df: DataFrame with numeric ``Resp`` and ``Age`` columns. It is
            modified in place.

    Returns:
        The same DataFrame object, with ``resp_label`` set. Rows whose Resp
        or Age is NaN match no condition and are not assigned.
    """
    age = df['Age']
    resp = df['Resp']

    # (rows belonging to the age band, lowest normal rate, highest normal rate)
    age_bands = [
        (age <= 1, 30, 60),
        ((age > 1) & (age <= 3), 24, 40),
        ((age > 3) & (age <= 6), 22, 34),
        ((age > 6) & (age <= 12), 18, 30),
        ((age > 12) & (age < 18), 12, 16),
        (age >= 18, 12, 20),
    ]

    for in_band, low, high in age_bands:
        within = resp.between(low, high)
        outside = (resp < low) | (resp > high)
        df.loc[in_band & within, 'resp_label'] = '0'
        df.loc[in_band & outside, 'resp_label'] = '1'

    # Applied last so missing readings are not reported as abnormally low.
    df.loc[resp <= -999, 'resp_label'] = '2'

    return df

In [101]:
# Label respiration rates on both splits (train first, then val).
train_df, val_df = get_resp_label(train_df), get_resp_label(val_df)

### SBP

In [102]:
def get_sbp_label(df):
    """Add an ``sbp_label`` column derived from the ``SBP`` column.

    Labels:
        1 -- SBP at or below 100
        0 -- SBP above 100

    Note that a -999 placeholder satisfies ``SBP <= 100`` and is labelled 1.

    Parameters:
        df: DataFrame with a numeric ``SBP`` column. It is modified in place.

    Returns:
        The same DataFrame object, with ``sbp_label`` set. Rows whose SBP is
        NaN match neither condition and are not assigned.
    """
    systolic = df['SBP']

    at_or_below_100 = systolic.le(100)
    above_100 = systolic.gt(100)

    df.loc[at_or_below_100, 'sbp_label'] = 1
    df.loc[above_100, 'sbp_label'] = 0

    return df

In [103]:
# Label systolic pressure on both splits (train first, then val).
train_df, val_df = get_sbp_label(train_df), get_sbp_label(val_df)

### MAP

In [104]:
def get_map_label(df):
    """Add a ``map_label`` column derived from ``MAP`` (mean arterial pressure).

    The normal range is taken to be 70 to 100, both ends included.

    Labels:
        0 -- MAP between 70 and 100 inclusive
        1 -- MAP below 70 or above 100

    Note that a -999 placeholder satisfies ``MAP < 70`` and is labelled 1.

    Parameters:
        df: DataFrame with a numeric ``MAP`` column. It is modified in place.

    Returns:
        The same DataFrame object, with ``map_label`` set. Rows whose MAP is
        NaN match neither condition and are not assigned.
    """
    pressure = df['MAP']

    in_normal_range = (pressure >= 70) & (pressure <= 100)
    out_of_range = (pressure > 100) | (pressure < 70)

    df.loc[in_normal_range, 'map_label'] = 0
    df.loc[out_of_range, 'map_label'] = 1

    return df

In [105]:
# Label mean arterial pressure on both splits (train first, then val).
train_df, val_df = get_map_label(train_df), get_map_label(val_df)

### DBP

In [106]:
def get_dbp_label(df):
    """Add a ``dbp_label`` column derived from the ``DBP`` column.

    Labels:
        1 -- DBP of 80 or more
        0 -- DBP below 80 but above the -999 placeholder
        2 -- DBP at or below -999, i.e. the missing-value placeholder

    Parameters:
        df: DataFrame with a numeric ``DBP`` column. It is modified in place.

    Returns:
        The same DataFrame object, with ``dbp_label`` set. Rows whose DBP is
        NaN match none of the conditions and are not assigned.
    """
    diastolic = df['DBP']

    is_high = diastolic >= 80
    is_recorded_below_80 = (diastolic < 80) & (diastolic > -999)
    is_placeholder = diastolic <= -999

    df.loc[is_high, 'dbp_label'] = 1
    df.loc[is_recorded_below_80, 'dbp_label'] = 0
    df.loc[is_placeholder, 'dbp_label'] = 2

    return df

In [107]:
# Label diastolic pressure on both splits (train first, then val).
train_df, val_df = get_dbp_label(train_df), get_dbp_label(val_df)

In [108]:
# Spot-check the first rows to confirm the new *_label columns were added.
train_df.head()

Unnamed: 0,patient_id,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,FiO2,Glucose,...,HospAdmTime,ICULOS,SepsisLabel,hr_label,age_label,temp_label,resp_label,sbp_label,map_label,dbp_label
0,1,87.0,98.0,36.3,94.5,71.5,-999.0,21.0,-999.0,-999.0,...,-0.03,1.0,0.0,0.0,1.0,1.0,1,1.0,0.0,2.0
1,1,87.0,98.0,36.3,94.5,71.5,-999.0,21.0,-999.0,-999.0,...,-0.03,2.0,0.0,0.0,1.0,1.0,1,1.0,0.0,2.0
2,1,85.0,97.0,36.3,89.0,62.0,-999.0,22.0,-999.0,-999.0,...,-0.03,3.0,0.0,0.0,1.0,1.0,1,1.0,1.0,2.0
3,1,83.0,97.0,36.28,104.0,66.0,-999.0,22.0,-999.0,109.0,...,-0.03,4.0,0.0,0.0,1.0,1.0,1,0.0,1.0,2.0
4,1,81.0,98.0,36.28,87.0,67.0,-999.0,18.0,-999.0,-999.0,...,-0.03,5.0,0.0,0.0,1.0,1.0,0,1.0,1.0,2.0


In [113]:
# Persist the labelled splits. These are the same paths the notebook loads
# from, so the input files are overwritten with the extra *_label columns.
train_df.to_csv(path_or_buf='../dataset/train_cleaned.csv', sep='|', index=False)
val_df.to_csv(path_or_buf='../dataset/val_cleaned.csv', sep='|', index=False)