# Imports

In [1]:
import os
import sys
import pandas as pd
from IPython.display import display, HTML
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Loading dataset

In [4]:
from pypotsModify.benchpots.datasets import preprocess_physionet2012
physionet2012_dataset = preprocess_physionet2012(subset="all", rate=0.1)

2024-11-29 08:42:13 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-11-29 08:42:13 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-11-29 08:42:13 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-11-29 08:42:13 [INFO]: Loaded successfully!


# Training data

<h4>Loading training dataset</h4>

In [5]:
train_X = physionet2012_dataset['train_X']

<h4>Sum of 48 hours of all patients</h4>

In [55]:
total_pacientes = train_X.groupby("RecordID").first().reset_index()
total_pacientes = total_pacientes["RecordID"].count()
total_pacientes = total_pacientes*48
total_pacientes

368208

<h4>Female gender missing rate</h4>

In [65]:
female_gender_ids = train_X[train_X["Gender"] == 0.0]
female_gender_ids = female_gender_ids["RecordID"]
female_gender_missing_rate = train_X[train_X["RecordID"].isin(female_gender_ids)]
female_gender_missing_rate = round((female_gender_missing_rate.isna().sum()/total_pacientes) * 100, 2)
female_gender_missing_rate

RecordID        0.00
level_1         0.00
Time            0.00
ALP            43.03
ALT            43.01
AST            43.01
Age             2.67
Albumin        43.19
BUN            40.60
Bilirubin      43.00
Cholesterol    43.67
Creatinine     40.58
DiasABP        21.28
FiO2           36.98
GCS            29.54
Gender         42.84
Glucose        40.74
HCO3           40.65
HCT            39.71
HR              4.29
Height          2.67
ICUType        42.84
K              40.40
Lactate        42.03
MAP            21.42
MechVent       37.11
Mg             40.65
NIDiasABP      24.49
NIMAP          24.74
NISysABP       24.47
Na             40.62
PaCO2          39.02
PaO2           39.02
Platelets      40.61
RespRate       32.42
SaO2           42.11
SysABP         21.28
Temp           28.18
TroponinI      43.65
TroponinT      43.28
Urine          13.35
WBC            40.84
Weight         20.58
pH             38.84
dtype: float64

<h4>Female gender measurements</h4>

In [66]:
female_gender_measurements_training = train_X[train_X["RecordID"].isin(female_gender_ids)]
female_gender_measurements_training = round((female_gender_measurements_training.count()/total_pacientes)*100, 2)
female_gender_measurements_training

RecordID       43.75
level_1        43.75
Time           43.75
ALP             0.72
ALT             0.74
AST             0.74
Age            41.08
Albumin         0.56
BUN             3.15
Bilirubin       0.75
Cholesterol     0.07
Creatinine      3.16
DiasABP        22.47
FiO2            6.77
GCS            14.21
Gender          0.91
Glucose         3.01
HCO3            3.10
HCT             4.04
HR             39.46
Height         41.08
ICUType         0.91
K               3.35
Lactate         1.72
MAP            22.33
MechVent        6.64
Mg              3.10
NIDiasABP      19.26
NIMAP          19.01
NISysABP       19.28
Na              3.13
PaCO2           4.73
PaO2            4.73
Platelets       3.13
RespRate       11.33
SaO2            1.64
SysABP         22.47
Temp           15.57
TroponinI       0.10
TroponinT       0.47
Urine          30.40
WBC             2.91
Weight         23.17
pH              4.91
dtype: float64

<h4>Male gender missing rate</h4>

In [67]:
male_gender_ids = train_X[train_X["Gender"] == 1.0]
male_gender_ids = male_gender_ids["RecordID"]
male_gender_missing_rate = train_X[train_X["RecordID"].isin(male_gender_ids)]
male_gender_missing_rate = round((male_gender_missing_rate.isna().sum()/total_pacientes)*100, 2)
male_gender_missing_rate

RecordID        0.00
level_1         0.00
Time            0.00
ALP            55.18
ALT            55.15
AST            55.15
Age             3.31
Albumin        55.41
BUN            52.05
Bilirubin      55.15
Cholesterol    56.02
Creatinine     52.03
DiasABP        24.54
FiO2           47.28
GCS            38.19
Gender         54.95
Glucose        52.31
HCO3           52.15
HCT            50.67
HR              5.59
Height          3.31
ICUType        54.95
K              51.88
Lactate        53.75
MAP            24.72
MechVent       47.67
Mg             52.13
NIDiasABP      33.51
NIMAP          33.79
NISysABP       33.49
Na             52.17
PaCO2          49.34
PaO2           49.35
Platelets      51.89
RespRate       43.63
SaO2           53.74
SysABP         24.53
Temp           34.52
TroponinI      56.00
TroponinT      55.51
Urine          17.35
WBC            52.31
Weight         27.01
pH             48.98
dtype: float64

<h4>Male gender measurements</h4>

In [68]:
male_gender_measurements_training = train_X[train_X["RecordID"].isin(male_gender_ids)]
male_gender_measurements_training  = round((male_gender_measurements_training.count()/total_pacientes)*100, 2)
male_gender_measurements_training 

RecordID       56.12
level_1        56.12
Time           56.12
ALP             0.94
ALT             0.97
AST             0.97
Age            52.81
Albumin         0.71
BUN             4.07
Bilirubin       0.97
Cholesterol     0.10
Creatinine      4.09
DiasABP        31.58
FiO2            8.84
GCS            17.93
Gender          1.17
Glucose         3.81
HCO3            3.97
HCT             5.45
HR             50.53
Height         52.81
ICUType         1.17
K               4.24
Lactate         2.37
MAP            31.41
MechVent        8.46
Mg              3.99
NIDiasABP      22.61
NIMAP          22.33
NISysABP       22.63
Na              3.95
PaCO2           6.78
PaO2            6.77
Platelets       4.23
RespRate       12.49
SaO2            2.38
SysABP         31.59
Temp           21.60
TroponinI       0.12
TroponinT       0.61
Urine          38.77
WBC             3.81
Weight         29.12
pH              7.14
dtype: float64

<h4>Undefined gender missing rate</h4> 

In [69]:
undefined_gender_ids = train_X[train_X["Gender"] == -1.0]
undefined_gender_ids = undefined_gender_ids["RecordID"]
undefined_gender_missing_rate = train_X[train_X["RecordID"].isin(undefined_gender_ids)]
undefined_gender_missing_rate = round((undefined_gender_missing_rate.isna().sum()/total_pacientes)*100, 2)
undefined_gender_missing_rate

RecordID       0.00
level_1        0.00
Time           0.00
ALP            0.13
ALT            0.13
AST            0.13
Age            0.04
Albumin        0.13
BUN            0.12
Bilirubin      0.13
Cholesterol    0.13
Creatinine     0.12
DiasABP        0.08
FiO2           0.12
GCS            0.10
Gender         0.13
Glucose        0.12
HCO3           0.12
HCT            0.12
HR             0.04
Height         0.04
ICUType        0.13
K              0.12
Lactate        0.13
MAP            0.08
MechVent       0.12
Mg             0.12
NIDiasABP      0.09
NIMAP          0.09
NISysABP       0.09
Na             0.12
PaCO2          0.13
PaO2           0.13
Platelets      0.12
RespRate       0.09
SaO2           0.13
SysABP         0.08
Temp           0.10
TroponinI      0.13
TroponinT      0.13
Urine          0.07
WBC            0.12
Weight         0.07
pH             0.12
dtype: float64

<h4>Undefined gender measurements</h4>

In [70]:
undefined_gender_measurements_training = train_X[train_X["RecordID"].isin(undefined_gender_ids)]
undefined_gender_measurements_training = round((undefined_gender_measurements_training.count()/total_pacientes)*100, 2)
undefined_gender_measurements_training

RecordID       0.13
level_1        0.13
Time           0.13
ALP            0.00
ALT            0.00
AST            0.00
Age            0.09
Albumin        0.00
BUN            0.01
Bilirubin      0.00
Cholesterol    0.00
Creatinine     0.01
DiasABP        0.05
FiO2           0.01
GCS            0.03
Gender         0.00
Glucose        0.01
HCO3           0.01
HCT            0.01
HR             0.09
Height         0.09
ICUType        0.00
K              0.01
Lactate        0.00
MAP            0.05
MechVent       0.01
Mg             0.01
NIDiasABP      0.05
NIMAP          0.05
NISysABP       0.05
Na             0.01
PaCO2          0.00
PaO2           0.00
Platelets      0.01
RespRate       0.04
SaO2           0.00
SysABP         0.05
Temp           0.03
TroponinI      0.00
TroponinT      0.00
Urine          0.06
WBC            0.01
Weight         0.06
pH             0.01
dtype: float64

<h4>ICUType 1 missing rate</h4>

In [73]:
ICUType_1_training_ids = train_X[train_X['ICUType'] == 1.0]
ICUType_1_training_ids = ICUType_1_training_ids[ICUType_1_training_ids["Time"] == 0.0]
ICUType_1_training_ids = ICUType_1_training_ids["RecordID"]
ICUType_1_training = train_X[train_X["RecordID"].isin(ICUType_1_training_ids)]
ICUType_1_training_missing = round((ICUType_1_training.isna().sum()/total_pacientes)*100, 2)
ICUType_1_training_missing

RecordID        0.00
level_1         0.00
Time            0.00
ALP            14.30
ALT            14.29
AST            14.29
Age             1.24
Albumin        14.36
BUN            13.48
Bilirubin      14.30
Cholesterol    14.43
Creatinine     13.46
DiasABP         8.47
FiO2           12.89
GCS            10.79
Gender         14.22
Glucose        13.53
HCO3           13.52
HCT            13.27
HR              1.87
Height          1.24
ICUType        14.22
K              13.31
Lactate        14.18
MAP             8.49
MechVent       12.99
Mg             13.49
NIDiasABP       7.57
NIMAP           7.61
NISysABP        7.57
Na             13.52
PaCO2          13.28
PaO2           13.28
Platelets      13.49
RespRate        9.41
SaO2           13.79
SysABP          8.47
Temp           10.15
TroponinI      14.47
TroponinT      14.21
Urine           6.33
WBC            13.59
Weight          7.94
pH             13.25
dtype: float64

<h4>ICUType 1 mesurements</h4>

In [75]:
ICUType_1_measurements_training = round((ICUType_1_training.count()/total_pacientes)*100, 2)
ICUType_1_measurements_training

RecordID       14.52
level_1        14.52
Time           14.52
ALP             0.22
ALT             0.23
AST             0.23
Age            13.28
Albumin         0.17
BUN             1.04
Bilirubin       0.22
Cholesterol     0.09
Creatinine      1.06
DiasABP         6.05
FiO2            1.64
GCS             3.74
Gender          0.30
Glucose         1.00
HCO3            1.00
HCT             1.26
HR             12.66
Height         13.28
ICUType         0.30
K               1.21
Lactate         0.34
MAP             6.03
MechVent        1.53
Mg              1.03
NIDiasABP       6.95
NIMAP           6.91
NISysABP        6.95
Na              1.00
PaCO2           1.24
PaO2            1.24
Platelets       1.03
RespRate        5.12
SaO2            0.73
SysABP          6.05
Temp            4.38
TroponinI       0.05
TroponinT       0.31
Urine           8.19
WBC             0.93
Weight          6.58
pH              1.27
dtype: float64

<h4>ICUType missing rate</h4>

In [76]:
ICUType_2_training_ids = train_X[train_X['ICUType'] == 2.0]
ICUType_2_training_ids = ICUType_2_training_ids[ICUType_2_training_ids["Time"] == 0.0]
ICUType_2_training_ids = ICUType_2_training_ids["RecordID"]
ICUType_2_training = train_X[train_X["RecordID"].isin(ICUType_2_training_ids)]
ICUType_2_training_missing = round((ICUType_2_training.isna().sum()/total_pacientes)*100,2)
ICUType_2_training_missing

RecordID        0.00
level_1         0.00
Time            0.00
ALP            21.12
ALT            21.11
AST            21.12
Age             0.78
Albumin        21.17
BUN            19.89
Bilirubin      21.12
Cholesterol    21.26
Creatinine     19.89
DiasABP         4.58
FiO2           17.72
GCS            15.65
Gender         20.82
Glucose        20.26
HCO3           19.99
HCT            18.94
HR              1.82
Height          0.78
ICUType        20.82
K              20.13
Lactate        20.53
MAP             4.57
MechVent       17.76
Mg             19.93
NIDiasABP      16.75
NIMAP          16.79
NISysABP       16.74
Na             20.19
PaCO2          17.00
PaO2           17.01
Platelets      19.55
RespRate       20.32
SaO2           19.00
SysABP          4.58
Temp            8.92
TroponinI      21.23
TroponinT      21.21
Urine           3.47
WBC            19.83
Weight         10.20
pH             16.59
dtype: float64

<h4>ICUType 2 measurements</h4>

In [77]:
ICUType_2_measurements_training = round((ICUType_2_training.count()/total_pacientes)*100, 2)
ICUType_2_measurements_training

RecordID       21.26
level_1        21.26
Time           21.26
ALP             0.14
ALT             0.15
AST             0.15
Age            20.48
Albumin         0.10
BUN             1.37
Bilirubin       0.14
Cholesterol     0.01
Creatinine      1.37
DiasABP        16.68
FiO2            3.54
GCS             5.61
Gender          0.44
Glucose         1.00
HCO3            1.28
HCT             2.32
HR             19.45
Height         20.48
ICUType         0.44
K               1.14
Lactate         0.74
MAP            16.69
MechVent        3.51
Mg              1.34
NIDiasABP       4.51
NIMAP           4.47
NISysABP        4.52
Na              1.07
PaCO2           4.26
PaO2            4.25
Platelets       1.72
RespRate        0.94
SaO2            2.26
SysABP         16.68
Temp           12.34
TroponinI       0.03
TroponinT       0.05
Urine          17.80
WBC             1.43
Weight         11.07
pH              4.68
dtype: float64

<h4>ICUType 3 missing rate</h4>

In [78]:
ICUType_3_training_ids = train_X[train_X['ICUType'] == 3.0]
ICUType_3_training_ids = ICUType_3_training_ids[ICUType_3_training_ids["Time"] == 0.0]
ICUType_3_training_ids = ICUType_3_training_ids["RecordID"]
ICUType_3_training = train_X[train_X["RecordID"].isin(ICUType_3_training_ids)]
ICUType_3_training_missing = round((ICUType_3_training.isna().sum()/total_pacientes)*100, 2)
ICUType_3_training_missing

RecordID        0.00
level_1         0.00
Time            0.00
ALP            35.03
ALT            35.00
AST            35.00
Age             2.71
Albumin        35.21
BUN            33.11
Bilirubin      34.97
Cholesterol    35.77
Creatinine     33.11
DiasABP        23.35
FiO2           30.61
GCS            26.26
Gender         35.06
Glucose        33.12
HCO3           33.11
HCT            32.65
HR              3.94
Height          2.71
ICUType        35.06
K              32.89
Lactate        34.44
MAP            23.50
MechVent       30.95
Mg             33.26
NIDiasABP      15.35
NIMAP          15.70
NISysABP       15.33
Na             33.05
PaCO2          33.12
PaO2           33.12
Platelets      33.37
RespRate       24.88
SaO2           35.38
SysABP         23.35
Temp           25.60
TroponinI      35.72
TroponinT      35.32
Urine          14.04
WBC            33.48
Weight         13.04
pH             33.08
dtype: float64

<h4>ICUType 3 measurements</h4>

In [79]:
ICUType_3_measurements_training = round((ICUType_3_training.count()/total_pacientes)*100, 2)
ICUType_3_measurements_training

RecordID       35.81
level_1        35.81
Time           35.81
ALP             0.78
ALT             0.81
AST             0.81
Age            33.10
Albumin         0.60
BUN             2.70
Bilirubin       0.84
Cholesterol     0.04
Creatinine      2.70
DiasABP        12.46
FiO2            5.20
GCS             9.55
Gender          0.75
Glucose         2.69
HCO3            2.70
HCT             3.16
HR             31.87
Height         33.10
ICUType         0.75
K               2.92
Lactate         1.37
MAP            12.31
MechVent        4.86
Mg              2.55
NIDiasABP      20.46
NIMAP          20.11
NISysABP       20.48
Na              2.76
PaCO2           2.69
PaO2            2.69
Platelets       2.44
RespRate       10.93
SaO2            0.43
SysABP         12.46
Temp           10.22
TroponinI       0.09
TroponinT       0.49
Urine          21.77
WBC             2.33
Weight         22.77
pH              2.73
dtype: float64

<h4>ICUType 4 missing rate</h4>

In [80]:
ICUType_4_training_ids = train_X[train_X['ICUType'] == 4.0]
ICUType_4_training_ids = ICUType_4_training_ids[ICUType_4_training_ids["Time"] == 0.0]
ICUType_4_training_ids = ICUType_4_training_ids["RecordID"]
ICUType_4_training = train_X[train_X["RecordID"].isin(ICUType_4_training_ids)]
ICUType_4_training_missing = round((ICUType_4_training.isna().sum()/total_pacientes)*100, 2)
ICUType_4_training_missing

RecordID        0.00
level_1         0.00
Time            0.00
ALP            27.89
ALT            27.88
AST            27.88
Age             1.28
Albumin        28.00
BUN            26.28
Bilirubin      27.89
Cholesterol    28.37
Creatinine     26.28
DiasABP         9.50
FiO2           23.16
GCS            15.13
Gender         27.81
Glucose        26.26
HCO3           26.31
HCT            25.65
HR              2.31
Height          1.28
ICUType        27.81
K              26.07
Lactate        26.76
MAP             9.65
MechVent       23.21
Mg             26.23
NIDiasABP      18.41
NIMAP          18.52
NISysABP       18.40
Na             26.15
PaCO2          25.08
PaO2           25.09
Platelets      26.23
RespRate       21.54
SaO2           27.81
SysABP          9.50
Temp           18.14
TroponinI      28.36
TroponinT      28.19
Urine           6.93
WBC            26.37
Weight         16.47
pH             25.03
dtype: float64

<h4>ICUType 4 measurements</h4>

In [81]:
ICUType_4_measurements_training = round((ICUType_4_training.count()/total_pacientes)*100,2)
ICUType_4_measurements_training 

RecordID       28.41
level_1        28.41
Time           28.41
ALP             0.51
ALT             0.52
AST             0.52
Age            27.12
Albumin         0.41
BUN             2.12
Bilirubin       0.52
Cholesterol     0.04
Creatinine      2.12
DiasABP        18.91
FiO2            5.24
GCS            13.28
Gender          0.59
Glucose         2.14
HCO3            2.10
HCT             2.76
HR             26.10
Height         27.12
ICUType         0.59
K               2.33
Lactate         1.64
MAP            18.76
MechVent        5.20
Mg              2.18
NIDiasABP       9.99
NIMAP           9.88
NISysABP       10.01
Na              2.26
PaCO2           3.32
PaO2            3.31
Platelets       2.18
RespRate        6.86
SaO2            0.60
SysABP         18.91
Temp           10.27
TroponinI       0.05
TroponinT       0.21
Urine          21.47
WBC             2.04
Weight         11.93
pH              3.37
dtype: float64

<h4>+65 missing rate</h4>

In [82]:
more_than_or_equal_to_65_train_ids = train_X[train_X["Age"] >= 65]
more_than_or_equal_to_65_train_ids = more_than_or_equal_to_65_train_ids[more_than_or_equal_to_65_train_ids["Time"] == 0.0]
more_than_or_equal_to_65_train_ids = more_than_or_equal_to_65_train_ids["RecordID"]
more_than_or_equal_to_65_train = train_X[train_X["RecordID"].isin(more_than_or_equal_to_65_train_ids)]
more_than_or_equal_to_65_train_missing = round((more_than_or_equal_to_65_train.isna().sum()/total_pacientes)*100,2)
more_than_or_equal_to_65_train_missing

RecordID        0.00
level_1         0.00
Time            0.00
ALP            53.63
ALT            53.62
AST            53.62
Age             3.07
Albumin        53.74
BUN            50.54
Bilirubin      53.60
Cholesterol    54.24
Creatinine     50.52
DiasABP        24.58
FiO2           45.86
GCS            37.35
Gender         53.22
Glucose        50.79
HCO3           50.62
HCT            49.25
HR              5.11
Height          3.07
ICUType        53.22
K              50.37
Lactate        52.25
MAP            24.74
MechVent       46.29
Mg             50.60
NIDiasABP      31.43
NIMAP          31.68
NISysABP       31.41
Na             50.66
PaCO2          48.02
PaO2           48.04
Platelets      50.45
RespRate       40.95
SaO2           51.90
SysABP         24.58
Temp           33.40
TroponinI      54.19
TroponinT      53.60
Urine          15.98
WBC            50.79
Weight         25.54
pH             47.72
dtype: float64

<h4>+65 measurements</h4>

In [83]:
age_65_and_above_measurements_training = round((more_than_or_equal_to_65_train.count()/total_pacientes)*100,2)
age_65_and_above_measurements_training

RecordID       54.35
level_1        54.35
Time           54.35
ALP             0.72
ALT             0.73
AST             0.73
Age            51.27
Albumin         0.60
BUN             3.81
Bilirubin       0.75
Cholesterol     0.11
Creatinine      3.83
DiasABP        29.76
FiO2            8.49
GCS            17.00
Gender          1.13
Glucose         3.55
HCO3            3.73
HCT             5.10
HR             49.24
Height         51.27
ICUType         1.13
K               3.97
Lactate         2.10
MAP            29.61
MechVent        8.06
Mg              3.75
NIDiasABP      22.92
NIMAP          22.67
NISysABP       22.94
Na              3.69
PaCO2           6.32
PaO2            6.31
Platelets       3.90
RespRate       13.40
SaO2            2.45
SysABP         29.76
Temp           20.95
TroponinI       0.16
TroponinT       0.75
Urine          38.37
WBC             3.56
Weight         28.81
pH              6.63
dtype: float64

<h4>-65 missing rate</h4>

In [84]:
less_than_65_train_ids = train_X[train_X["Age"] < 65]
less_than_65_train_ids = less_than_65_train_ids[less_than_65_train_ids["Time"] == 0.0]
less_than_65_train_ids = less_than_65_train_ids["RecordID"]
less_than_65_train = train_X[train_X["RecordID"].isin(less_than_65_train_ids)]
less_than_65_train_missing = round((less_than_65_train.isna().sum()/total_pacientes)*100,2)
less_than_65_train_missing


RecordID        0.00
level_1         0.00
Time            0.00
ALP            44.71
ALT            44.68
AST            44.68
Age             2.94
Albumin        44.99
BUN            42.23
Bilirubin      44.68
Cholesterol    45.58
Creatinine     42.22
DiasABP        21.31
FiO2           38.52
GCS            30.48
Gender         44.70
Glucose        42.38
HCO3           42.30
HCT            41.25
HR              4.82
Height          2.94
ICUType        44.70
K              42.03
Lactate        43.66
MAP            21.47
MechVent       38.61
Mg             42.31
NIDiasABP      26.65
NIMAP          26.93
NISysABP       26.63
Na             42.25
PaCO2          40.46
PaO2           40.47
Platelets      42.19
RespRate       35.20
SaO2           44.08
SysABP         21.31
Temp           29.40
TroponinI      45.59
TroponinT      45.32
Urine          14.79
WBC            42.48
Weight         22.12
pH             40.22
dtype: float64

<h4>-65 measurements</h4>

In [86]:
age_under_65_measurements_training  = round((less_than_65_train.count()/total_pacientes)*100,2)
age_under_65_measurements_training 

RecordID       45.65
level_1        45.65
Time           45.65
ALP             0.94
ALT             0.97
AST             0.97
Age            42.71
Albumin         0.66
BUN             3.42
Bilirubin       0.98
Cholesterol     0.07
Creatinine      3.43
DiasABP        24.34
FiO2            7.13
GCS            15.17
Gender          0.95
Glucose         3.27
HCO3            3.35
HCT             4.40
HR             40.83
Height         42.71
ICUType         0.95
K               3.62
Lactate         1.99
MAP            24.18
MechVent        7.04
Mg              3.34
NIDiasABP      19.00
NIMAP          18.72
NISysABP       19.02
Na              3.40
PaCO2           5.20
PaO2            5.19
Platelets       3.47
RespRate       10.46
SaO2            1.57
SysABP         24.34
Temp           16.25
TroponinI       0.07
TroponinT       0.33
Urine          30.86
WBC             3.17
Weight         23.53
pH              5.43
dtype: float64

<h4>Filtering only patients who have the same height and weight</h4>

In [24]:
filtered_train_X = train_X[(train_X['Height'] != -1) & (train_X['Weight'] != -1) & (train_X['Height'].notna()) & (train_X['Weight'].notna())] 

<h4>Classify BMI</h4>

In [25]:
def classify_BMI(BMI):
    if BMI <= 18.5:
        return "Baixo peso"
    elif BMI >= 18.6 and BMI <= 24.9:
        return "Peso normal"
    elif BMI >= 25 and BMI <= 29.9:
        return "Sobrepeso"
    elif BMI >= 30 and BMI <= 34.9:
        return "Obesidade grau 1"
    elif BMI >= 35 and BMI <= 39.9:
        return "Obesidade grau 2"
    elif BMI >= 40:
        return "Obesidade grau 3"

<h4>Set the height to meters</h4>

In [26]:
filtered_train_X_metros = filtered_train_X.copy()
filtered_train_X_metros["Height"] = filtered_train_X["Height"]/100
filtered_train_X_metros["Height"]

48        1.753
67        1.753
68        1.753
69        1.753
70        1.753
          ...  
575321    1.727
575322    1.727
575323    1.727
575325    1.727
575327    1.727
Name: Height, Length: 100877, dtype: float64

<h4>BMI Calculation and Classification</h4>

In [27]:
bmi_data_train = filtered_train_X_metros
bmi_data_train["BMI"] = round(filtered_train_X_metros["Weight"] / (filtered_train_X_metros["Height"]**2), 1)
bmi_data_train["Classificacao"] = bmi_data_train["BMI"].apply(classify_BMI)
bmi_data_train.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
48,132540,0,0.0,,,,76.0,,,,...,,,,,,,76.0,7.45,24.7,Peso normal
67,132540,19,19.0,,,,76.0,,,,...,122.0,37.5,,,50.0,,80.6,,26.2,Sobrepeso
68,132540,20,20.0,,,,76.0,,,,...,107.0,37.4,,,380.0,,80.6,,26.2,Sobrepeso
69,132540,21,21.0,,,,76.0,,,,...,121.0,37.5,,,170.0,,80.6,,26.2,Sobrepeso
70,132540,22,22.0,,,,76.0,,,,...,128.0,37.5,,,130.0,,80.6,,26.2,Sobrepeso


<h4>Taking only the first occurrence of each patient</h4>

In [29]:
bmi_data_train = bmi_data_train.groupby("RecordID").first().reset_index()
bmi_data_train

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132540,0,0.0,,,,76.0,,21.0,,...,122.00,37.50,,,50.0,13.3,76.0,7.45,24.7,Peso normal
1,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,,36.30,,,600.0,11.5,84.6,,26.0,Sobrepeso
2,132548,0,0.0,,,,68.0,,32.0,,...,205.00,36.30,0.7,,120.0,6.2,87.0,,32.9,Obesidade grau 1
3,132551,0,0.0,47.0,46.0,82.0,78.0,1.9,81.0,0.3,...,102.75,38.00,3.5,,120.0,16.1,48.4,7.40,18.3,Baixo peso
4,132555,0,0.0,,,,74.0,,19.0,,...,98.00,34.80,,,35.0,9.0,66.1,7.39,21.5,Peso normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4001,163003,0,0.0,124.0,254.0,204.0,36.0,2.6,3.0,30.3,...,,36.30,,,50.0,10.6,57.7,7.47,21.2,Peso normal
4002,163007,0,0.0,42.0,30.0,40.0,19.0,2.8,16.0,0.6,...,0.00,40.55,,1.00,150.0,14.1,114.3,7.36,34.2,Obesidade grau 1
4003,163008,0,0.0,,,,59.0,,24.0,,...,97.00,37.60,,,45.0,6.9,98.5,7.38,34.0,Obesidade grau 1
4004,163013,0,0.0,82.0,11.0,30.0,74.0,2.5,30.0,1.2,...,118.00,36.50,,0.03,40.0,9.6,68.6,7.35,29.5,Sobrepeso


In [30]:
bmi_data_train["Classificacao"].value_counts()

Classificacao
Sobrepeso           1377
Peso normal         1200
Obesidade grau 1     702
Obesidade grau 2     319
Obesidade grau 3     292
Baixo peso           116
Name: count, dtype: int64

<h4>Classification Undefined missing rate</h4>

In [105]:
classificacao_undefined_ids = bmi_data_train["RecordID"]
classificacao_undefined = train_X[~train_X["RecordID"].isin(classificacao_undefined_ids)]
classificacao_undefined_missing = round((classificacao_undefined.isna().sum()/total_pacientes)*100,2)
classificacao_undefined_missing

RecordID        0.00
level_1         0.00
Time            0.00
ALP            46.97
ALT            46.94
AST            46.94
Age             3.85
Albumin        47.13
BUN            44.43
Bilirubin      46.93
Cholesterol    47.70
Creatinine     44.42
DiasABP        28.99
FiO2           41.25
GCS            31.81
Gender         46.78
Glucose        44.44
HCO3           44.46
HCT            43.69
HR              5.58
Height          3.85
ICUType        46.78
K              44.10
Lactate        46.17
MAP            29.19
MechVent       41.67
Mg             44.52
NIDiasABP      22.40
NIMAP          22.78
NISysABP       22.38
Na             44.31
PaCO2          44.41
PaO2           44.42
Platelets      44.66
RespRate       31.74
SaO2           47.24
SysABP         28.99
Temp           34.93
TroponinI      47.69
TroponinT      47.20
Urine          17.61
WBC            44.79
Weight         22.83
pH             44.35
dtype: float64

<h4>Classification Undefined measurements</h4>

In [88]:
classification_undefined_measurements = round((classificacao_undefined.count()/total_pacientes)*100,2)
classification_undefined_measurements

RecordID       47.78
level_1        47.78
Time           47.78
ALP             0.81
ALT             0.84
AST             0.84
Age            43.93
Albumin         0.65
BUN             3.34
Bilirubin       0.85
Cholesterol     0.08
Creatinine      3.36
DiasABP        18.79
FiO2            6.53
GCS            15.97
Gender          1.00
Glucose         3.34
HCO3            3.32
HCT             4.08
HR             42.20
Height         43.93
ICUType         1.00
K               3.67
Lactate         1.61
MAP            18.58
MechVent        6.11
Mg              3.26
NIDiasABP      25.37
NIMAP          24.99
NISysABP       25.39
Na              3.46
PaCO2           3.37
PaO2            3.36
Platelets       3.12
RespRate       16.04
SaO2            0.54
SysABP         18.79
Temp           12.85
TroponinI       0.08
TroponinT       0.58
Urine          30.17
WBC             2.98
Weight         24.95
pH              3.43
dtype: float64

<h4>Low weight classification missing rate</h4>

In [89]:
classificacao_baixo_peso_ids = bmi_data_train[bmi_data_train["Classificacao"] == "Baixo peso"]
classificacao_baixo_peso_ids = classificacao_baixo_peso_ids["RecordID"]
classificacao_baixo_peso = train_X[train_X["RecordID"].isin(classificacao_baixo_peso_ids)]
classificacao_baixo_peso_missing = round((classificacao_baixo_peso.isna().sum()/total_pacientes)*100,2)
classificacao_baixo_peso_missing

RecordID       0.00
level_1        0.00
Time           0.00
ALP            1.49
ALT            1.49
AST            1.49
Age            0.04
Albumin        1.49
BUN            1.40
Bilirubin      1.49
Cholesterol    1.51
Creatinine     1.40
DiasABP        0.49
FiO2           1.24
GCS            1.03
Gender         1.48
Glucose        1.40
HCO3           1.40
HCT            1.37
HR             0.09
Height         0.04
ICUType        1.48
K              1.39
Lactate        1.43
MAP            0.49
MechVent       1.23
Mg             1.40
NIDiasABP      0.99
NIMAP          1.00
NISysABP       0.99
Na             1.40
PaCO2          1.30
PaO2           1.30
Platelets      1.40
RespRate       1.24
SaO2           1.42
SysABP         0.49
Temp           0.85
TroponinI      1.51
TroponinT      1.49
Urine          0.38
WBC            1.41
Weight         0.67
pH             1.29
dtype: float64

<h4>Classificacao low weight measurements</h4>

In [90]:
classificacao_baixo_peso_measurements = round((classificacao_baixo_peso.count()/total_pacientes)*100,2)
classificacao_baixo_peso_measurements

RecordID       1.51
level_1        1.51
Time           1.51
ALP            0.02
ALT            0.02
AST            0.02
Age            1.47
Albumin        0.02
BUN            0.11
Bilirubin      0.02
Cholesterol    0.00
Creatinine     0.12
DiasABP        1.03
FiO2           0.27
GCS            0.49
Gender         0.03
Glucose        0.11
HCO3           0.11
HCT            0.15
HR             1.42
Height         1.47
ICUType        0.03
K              0.12
Lactate        0.08
MAP            1.03
MechVent       0.28
Mg             0.11
NIDiasABP      0.52
NIMAP          0.51
NISysABP       0.52
Na             0.11
PaCO2          0.21
PaO2           0.21
Platelets      0.12
RespRate       0.27
SaO2           0.09
SysABP         1.03
Temp           0.66
TroponinI      0.01
TroponinT      0.02
Urine          1.13
WBC            0.11
Weight         0.84
pH             0.22
dtype: float64

<h4>Classification normal weight missing rate</h4>

In [91]:
classificacao_normal_peso_ids = bmi_data_train[bmi_data_train["Classificacao"] == "Peso normal"]
#classificacao_normal_peso_ids = classificacao_normal_peso_ids[classificacao_normal_peso_ids["Time"] == 0.0]
classificacao_normal_peso_ids = classificacao_normal_peso_ids["RecordID"]
classificacao_normal_peso = train_X[train_X["RecordID"].isin(classificacao_normal_peso_ids)]
classificacao_normal_peso_missing = round((classificacao_normal_peso.isna().sum()/total_pacientes)*100,2)
classificacao_normal_peso_missing

RecordID        0.00
level_1         0.00
Time            0.00
ALP            15.39
ALT            15.39
AST            15.39
Age             0.63
Albumin        15.45
BUN            14.49
Bilirubin      15.38
Cholesterol    15.62
Creatinine     14.49
DiasABP         5.29
FiO2           13.07
GCS            10.55
Gender         15.32
Glucose        14.59
HCO3           14.53
HCT            14.03
HR              1.27
Height          0.63
ICUType        15.32
K              14.45
Lactate        14.91
MAP             5.33
MechVent       13.05
Mg             14.50
NIDiasABP      10.44
NIMAP          10.47
NISysABP       10.43
Na             14.55
PaCO2          13.38
PaO2           13.39
Platelets      14.37
RespRate       13.09
SaO2           14.74
SysABP          5.29
Temp            8.66
TroponinI      15.60
TroponinT      15.50
Urine           4.09
WBC            14.53
Weight          7.68
pH             13.23
dtype: float64

<h4>Classification normal weight measurements</h4>

In [92]:
classificacao_normal_peso_measurements = round((classificacao_normal_peso.count()/total_pacientes)*100,2)
classificacao_normal_peso_measurements

RecordID       15.64
level_1        15.64
Time           15.64
ALP             0.25
ALT             0.26
AST             0.26
Age            15.01
Albumin         0.19
BUN             1.15
Bilirubin       0.26
Cholesterol     0.03
Creatinine      1.16
DiasABP        10.36
FiO2            2.57
GCS             5.09
Gender          0.33
Glucose         1.05
HCO3            1.12
HCT             1.61
HR             14.38
Height         15.01
ICUType         0.33
K               1.19
Lactate         0.73
MAP            10.32
MechVent        2.60
Mg              1.15
NIDiasABP       5.21
NIMAP           5.17
NISysABP        5.21
Na              1.10
PaCO2           2.26
PaO2            2.25
Platelets       1.27
RespRate        2.55
SaO2            0.90
SysABP         10.36
Temp            6.98
TroponinI       0.04
TroponinT       0.14
Urine          11.55
WBC             1.12
Weight          7.96
pH              2.41
dtype: float64

<h4>Classification overweight missing rate</h4>

In [94]:
classificacao_sobrepeso_ids = bmi_data_train[bmi_data_train["Classificacao"] == "Sobrepeso"]
classificacao_sobrepeso_ids = classificacao_sobrepeso_ids["RecordID"]
classificacao_sobrepeso = train_X[train_X["RecordID"].isin(classificacao_sobrepeso_ids)]
classificacao_sobrepeso_missing = round((classificacao_sobrepeso.isna().sum()/total_pacientes)*100,2)
classificacao_sobrepeso_missing

RecordID        0.00
level_1         0.00
Time            0.00
ALP            17.66
ALT            17.65
AST            17.65
Age             0.83
Albumin        17.75
BUN            16.61
Bilirubin      17.65
Cholesterol    17.92
Creatinine     16.61
DiasABP         5.70
FiO2           14.87
GCS            12.43
Gender         17.58
Glucose        16.76
HCO3           16.66
HCT            16.07
HR              1.62
Height          0.83
ICUType        17.58
K              16.62
Lactate        17.17
MAP             5.73
MechVent       14.91
Mg             16.63
NIDiasABP      12.39
NIMAP          12.46
NISysABP       12.39
Na             16.72
PaCO2          15.14
PaO2           15.15
Platelets      16.47
RespRate       15.26
SaO2           16.69
SysABP          5.70
Temp            9.46
TroponinI      17.90
TroponinT      17.79
Urine           4.62
WBC            16.65
Weight          8.76
pH             14.96
dtype: float64

<h4>Classification overweight measurements</h4>

In [95]:
classificacao_sobrepeso_measurements = round((classificacao_sobrepeso.count()/total_pacientes)*100,2)
classificacao_baixo_peso_measurements

RecordID       1.51
level_1        1.51
Time           1.51
ALP            0.02
ALT            0.02
AST            0.02
Age            1.47
Albumin        0.02
BUN            0.11
Bilirubin      0.02
Cholesterol    0.00
Creatinine     0.12
DiasABP        1.03
FiO2           0.27
GCS            0.49
Gender         0.03
Glucose        0.11
HCO3           0.11
HCT            0.15
HR             1.42
Height         1.47
ICUType        0.03
K              0.12
Lactate        0.08
MAP            1.03
MechVent       0.28
Mg             0.11
NIDiasABP      0.52
NIMAP          0.51
NISysABP       0.52
Na             0.11
PaCO2          0.21
PaO2           0.21
Platelets      0.12
RespRate       0.27
SaO2           0.09
SysABP         1.03
Temp           0.66
TroponinI      0.01
TroponinT      0.02
Urine          1.13
WBC            0.11
Weight         0.84
pH             0.22
dtype: float64

<h4>Grade 1 obesity missing rate</h4>

In [97]:
classificacao_obesidade_1_ids = bmi_data_train[bmi_data_train["Classificacao"] == "Obesidade grau 1"]
classificacao_obesidade_1_ids = classificacao_obesidade_1_ids["RecordID"]
classificacao_obesidade_1 = train_X[train_X["RecordID"].isin(classificacao_obesidade_1_ids)]
classificacao_obesidade_1_missing = round((classificacao_obesidade_1.isna().sum()/total_pacientes)*100,2)
classificacao_obesidade_1_missing

RecordID       0.00
level_1        0.00
Time           0.00
ALP            9.00
ALT            9.00
AST            9.00
Age            0.33
Albumin        9.05
BUN            8.47
Bilirubin      9.00
Cholesterol    9.13
Creatinine     8.47
DiasABP        2.82
FiO2           7.51
GCS            6.38
Gender         8.96
Glucose        8.55
HCO3           8.50
HCT            8.18
HR             0.72
Height         0.33
ICUType        8.96
K              8.47
Lactate        8.70
MAP            2.85
MechVent       7.55
Mg             8.49
NIDiasABP      6.34
NIMAP          6.36
NISysABP       6.34
Na             8.53
PaCO2          7.62
PaO2           7.62
Platelets      8.41
RespRate       7.90
SaO2           8.49
SysABP         2.82
Temp           4.68
TroponinI      9.12
TroponinT      9.06
Urine          2.10
WBC            8.50
Weight         4.25
pH             7.53
dtype: float64

<h4>Grade 1 obesity measurements </h4>

In [98]:
classificacao_obesidade_1_measurements = round((classificacao_obesidade_1.count()/total_pacientes)*100,2)
classificacao_obesidade_1_measurements

RecordID       9.15
level_1        9.15
Time           9.15
ALP            0.15
ALT            0.15
AST            0.15
Age            8.82
Albumin        0.11
BUN            0.68
Bilirubin      0.15
Cholesterol    0.02
Creatinine     0.68
DiasABP        6.33
FiO2           1.64
GCS            2.77
Gender         0.19
Glucose        0.60
HCO3           0.65
HCT            0.97
HR             8.43
Height         8.82
ICUType        0.19
K              0.68
Lactate        0.45
MAP            6.30
MechVent       1.60
Mg             0.66
NIDiasABP      2.81
NIMAP          2.79
NISysABP       2.81
Na             0.62
PaCO2          1.53
PaO2           1.53
Platelets      0.74
RespRate       1.25
SaO2           0.66
SysABP         6.33
Temp           4.47
TroponinI      0.03
TroponinT      0.09
Urine          7.05
WBC            0.65
Weight         4.90
pH             1.62
dtype: float64

<h4>Grade 2 Obesity missing rate</h4>

In [99]:
classificacao_obesidade_2_ids = bmi_data_train[bmi_data_train["Classificacao"] == "Obesidade grau 2"]
classificacao_obesidade_2_ids = classificacao_obesidade_2_ids["RecordID"]
classificacao_obesidade_2 = train_X[train_X["RecordID"].isin(classificacao_obesidade_2_ids)]
classificacao_obesidade_2_missing = round((classificacao_obesidade_2.isna().sum()/total_pacientes)*100,2)
classificacao_obesidade_2_missing

RecordID       0.00
level_1        0.00
Time           0.00
ALP            4.09
ALT            4.09
AST            4.09
Age            0.18
Albumin        4.11
BUN            3.84
Bilirubin      4.09
Cholesterol    4.15
Creatinine     3.84
DiasABP        1.30
FiO2           3.40
GCS            2.91
Gender         4.07
Glucose        3.88
HCO3           3.85
HCT            3.71
HR             0.35
Height         0.18
ICUType        4.07
K              3.85
Lactate        3.93
MAP            1.30
MechVent       3.44
Mg             3.85
NIDiasABP      2.90
NIMAP          2.91
NISysABP       2.90
Na             3.86
PaCO2          3.46
PaO2           3.46
Platelets      3.81
RespRate       3.53
SaO2           3.85
SysABP         1.30
Temp           2.06
TroponinI      4.15
TroponinT      4.12
Urine          1.02
WBC            3.85
Weight         1.84
pH             3.43
dtype: float64

<h4>Grade 2 Obesity measurements</h4>

In [100]:
classificacao_obesidade_2_measurements = round((classificacao_obesidade_2.count()/total_pacientes)*100,2)
classificacao_obesidade_2_measurements

RecordID       4.16
level_1        4.16
Time           4.16
ALP            0.07
ALT            0.07
AST            0.07
Age            3.98
Albumin        0.05
BUN            0.32
Bilirubin      0.07
Cholesterol    0.01
Creatinine     0.32
DiasABP        2.86
FiO2           0.75
GCS            1.24
Gender         0.09
Glucose        0.28
HCO3           0.31
HCT            0.44
HR             3.81
Height         3.98
ICUType        0.09
K              0.31
Lactate        0.22
MAP            2.86
MechVent       0.72
Mg             0.31
NIDiasABP      1.25
NIMAP          1.24
NISysABP       1.26
Na             0.29
PaCO2          0.70
PaO2           0.70
Platelets      0.35
RespRate       0.62
SaO2           0.31
SysABP         2.86
Temp           2.10
TroponinI      0.01
TroponinT      0.04
Urine          3.14
WBC            0.31
Weight         2.32
pH             0.73
dtype: float64

<h4>Grade 3 Obesity missing rate</h4>

In [101]:
classificacao_obesidade_3_ids = bmi_data_train[bmi_data_train["Classificacao"] == "Obesidade grau 3"]
classificacao_obesidade_3_ids = classificacao_obesidade_3_ids["RecordID"]
classificacao_obesidade_3 = train_X[train_X["RecordID"].isin(classificacao_obesidade_3_ids)]
classificacao_obesidade_3_missing = round((classificacao_obesidade_3.isna().sum()/total_pacientes)*100,2)
classificacao_obesidade_3_missing

RecordID       0.00
level_1        0.00
Time           0.00
ALP            3.74
ALT            3.74
AST            3.74
Age            0.15
Albumin        3.76
BUN            3.52
Bilirubin      3.74
Cholesterol    3.80
Creatinine     3.52
DiasABP        1.32
FiO2           3.04
GCS            2.71
Gender         3.73
Glucose        3.55
HCO3           3.53
HCT            3.45
HR             0.30
Height         0.15
ICUType        3.73
K              3.52
Lactate        3.59
MAP            1.33
MechVent       3.05
Mg             3.53
NIDiasABP      2.61
NIMAP          2.62
NISysABP       2.61
Na             3.54
PaCO2          3.17
PaO2           3.17
Platelets      3.52
RespRate       3.38
SaO2           3.54
SysABP         1.32
Temp           2.16
TroponinI      3.80
TroponinT      3.77
Urine          0.95
WBC            3.55
Weight         1.62
pH             3.15
dtype: float64

<h4>Grade 3 Obesity measurements</h4>

In [102]:
classificacao_obesidade_3_measurements = round((classificacao_obesidade_3.count()/total_pacientes)*100,2)
classificacao_obesidade_3_measurements

RecordID       3.81
level_1        3.81
Time           3.81
ALP            0.06
ALT            0.07
AST            0.07
Age            3.66
Albumin        0.05
BUN            0.28
Bilirubin      0.07
Cholesterol    0.01
Creatinine     0.28
DiasABP        2.49
FiO2           0.77
GCS            1.09
Gender         0.08
Glucose        0.26
HCO3           0.28
HCT            0.36
HR             3.51
Height         3.66
ICUType        0.08
K              0.29
Lactate        0.22
MAP            2.48
MechVent       0.76
Mg             0.28
NIDiasABP      1.20
NIMAP          1.18
NISysABP       1.20
Na             0.26
PaCO2          0.64
PaO2           0.64
Platelets      0.29
RespRate       0.42
SaO2           0.26
SysABP         2.49
Temp           1.65
TroponinI      0.01
TroponinT      0.04
Urine          2.85
WBC            0.26
Weight         2.19
pH             0.66
dtype: float64

<h4>Columns for tables</h4>

In [50]:
df_columns = train_X.columns
df_columns

Index(['RecordID', 'level_1', 'Time', 'ALP', 'ALT', 'AST', 'Age', 'Albumin',
       'BUN', 'Bilirubin', 'Cholesterol', 'Creatinine', 'DiasABP', 'FiO2',
       'GCS', 'Gender', 'Glucose', 'HCO3', 'HCT', 'HR', 'Height', 'ICUType',
       'K', 'Lactate', 'MAP', 'MechVent', 'Mg', 'NIDiasABP', 'NIMAP',
       'NISysABP', 'Na', 'PaCO2', 'PaO2', 'Platelets', 'RespRate', 'SaO2',
       'SysABP', 'Temp', 'TroponinI', 'TroponinT', 'Urine', 'WBC', 'Weight',
       'pH'],
      dtype='object')

<h4>Building the missing rate table</h4>

In [106]:
df_missing = pd.DataFrame(columns=df_columns)
df_missing_transpose = df_missing.T
df_missing_transpose["Female"] = female_gender_missing_rate
df_missing_transpose["Male"] = male_gender_missing_rate
df_missing_transpose["Undefined gender"] = undefined_gender_missing_rate
df_missing_transpose["ICUType 1"] = ICUType_1_training_missing
df_missing_transpose["ICUType 2"] = ICUType_2_training_missing
df_missing_transpose["ICUType 3"] = ICUType_3_training_missing
df_missing_transpose["ICUType 4"] = ICUType_4_training_missing
df_missing_transpose["Age 65+"] = more_than_or_equal_to_65_train_missing
df_missing_transpose["Age 65-"] = less_than_65_train_missing
df_missing_transpose["Low Weight"] = classificacao_baixo_peso_missing
df_missing_transpose["Normal Weight"] = classificacao_normal_peso_missing
df_missing_transpose["Overweight"] = classificacao_sobrepeso_missing
df_missing_transpose["Obesity Grade 1"] = classificacao_obesidade_1_missing
df_missing_transpose["Obesity Grade 2"] = classificacao_obesidade_2_missing
df_missing_transpose["Obesity Grade 3"] = classificacao_obesidade_3_missing
df_missing_transpose["Undefined classification"] = classificacao_undefined_missing
df_missing_transpose = df_missing_transpose.drop("RecordID", axis=0)
df_missing_transpose = df_missing_transpose.drop("level_1", axis=0)
df_missing_transpose = df_missing_transpose.drop("Time", axis=0)
df_missing_transpose = df_missing_transpose.drop("Age", axis=0)
df_missing_transpose = df_missing_transpose.drop("Gender", axis=0)
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>Original Missing Rate per Variable by demographics - Train</h2>"))
df_missing_transpose

Unnamed: 0,Female,Male,Undefined gender,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3,Undefined classification
ALP,43.03,55.18,0.13,14.3,21.12,35.03,27.89,53.63,44.71,1.49,15.39,17.66,9.0,4.09,3.74,46.97
ALT,43.01,55.15,0.13,14.29,21.11,35.0,27.88,53.62,44.68,1.49,15.39,17.65,9.0,4.09,3.74,46.94
AST,43.01,55.15,0.13,14.29,21.12,35.0,27.88,53.62,44.68,1.49,15.39,17.65,9.0,4.09,3.74,46.94
Albumin,43.19,55.41,0.13,14.36,21.17,35.21,28.0,53.74,44.99,1.49,15.45,17.75,9.05,4.11,3.76,47.13
BUN,40.6,52.05,0.12,13.48,19.89,33.11,26.28,50.54,42.23,1.4,14.49,16.61,8.47,3.84,3.52,44.43
Bilirubin,43.0,55.15,0.13,14.3,21.12,34.97,27.89,53.6,44.68,1.49,15.38,17.65,9.0,4.09,3.74,46.93
Cholesterol,43.67,56.02,0.13,14.43,21.26,35.77,28.37,54.24,45.58,1.51,15.62,17.92,9.13,4.15,3.8,47.7
Creatinine,40.58,52.03,0.12,13.46,19.89,33.11,26.28,50.52,42.22,1.4,14.49,16.61,8.47,3.84,3.52,44.42
DiasABP,21.28,24.54,0.08,8.47,4.58,23.35,9.5,24.58,21.31,0.49,5.29,5.7,2.82,1.3,1.32,28.99
FiO2,36.98,47.28,0.12,12.89,17.72,30.61,23.16,45.86,38.52,1.24,13.07,14.87,7.51,3.4,3.04,41.25


<h4>Building the measurements table</h4>

In [107]:
df_measurements = pd.DataFrame(columns=df_columns)
df_measurements_transpose = df_measurements.T
df_measurements_transpose["Female"] = female_gender_measurements_training
df_measurements_transpose["Male"] = male_gender_measurements_training
df_measurements_transpose["Undefined gender"] = undefined_gender_measurements_training
df_measurements_transpose["ICUType 1"] = ICUType_1_measurements_training
df_measurements_transpose["ICUType 2"] = ICUType_2_measurements_training
df_measurements_transpose["ICUType 3"] = ICUType_3_measurements_training
df_measurements_transpose["ICUType 4"] = ICUType_4_measurements_training
df_measurements_transpose["Age 65+"] = age_65_and_above_measurements_training
df_measurements_transpose["Age 65-"] = age_under_65_measurements_training
df_measurements_transpose["Low Weight"] = classificacao_baixo_peso_measurements
df_measurements_transpose["Normal Weight"] = classificacao_normal_peso_measurements
df_measurements_transpose["Overweight"] = classificacao_sobrepeso_measurements
df_measurements_transpose["Obesity Grade 1"] = classificacao_obesidade_1_measurements
df_measurements_transpose["Obesity Grade 2"] = classificacao_obesidade_2_measurements
df_measurements_transpose["Obesity Grade 3"] = classificacao_obesidade_3_measurements
df_measurements_transpose["Undefined classification"] = classification_undefined_measurements
df_measurements_transpose = df_measurements_transpose.drop("RecordID", axis=0)
df_measurements_transpose = df_measurements_transpose.drop("level_1", axis=0)
df_measurements_transpose = df_measurements_transpose.drop("Time", axis=0)
df_measurements_transpose = df_measurements_transpose.drop("Age", axis=0)
df_measurements_transpose = df_measurements_transpose.drop("Gender", axis=0)
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>Repeated Measurements per Variable by Demographics - Train Set</h2>"))
df_measurements_transpose

Unnamed: 0,Female,Male,Undefined gender,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3,Undefined classification
ALP,0.72,0.94,0.0,0.22,0.14,0.78,0.51,0.72,0.94,0.02,0.25,0.29,0.15,0.07,0.06,0.81
ALT,0.74,0.97,0.0,0.23,0.15,0.81,0.52,0.73,0.97,0.02,0.26,0.3,0.15,0.07,0.07,0.84
AST,0.74,0.97,0.0,0.23,0.15,0.81,0.52,0.73,0.97,0.02,0.26,0.3,0.15,0.07,0.07,0.84
Albumin,0.56,0.71,0.0,0.17,0.1,0.6,0.41,0.6,0.66,0.02,0.19,0.2,0.11,0.05,0.05,0.65
BUN,3.15,4.07,0.01,1.04,1.37,2.7,2.12,3.81,3.42,0.11,1.15,1.34,0.68,0.32,0.28,3.34
Bilirubin,0.75,0.97,0.0,0.22,0.14,0.84,0.52,0.75,0.98,0.02,0.26,0.3,0.15,0.07,0.07,0.85
Cholesterol,0.07,0.1,0.0,0.09,0.01,0.04,0.04,0.11,0.07,0.0,0.03,0.03,0.02,0.01,0.01,0.08
Creatinine,3.16,4.09,0.01,1.06,1.37,2.7,2.12,3.83,3.43,0.12,1.16,1.34,0.68,0.32,0.28,3.36
DiasABP,22.47,31.58,0.05,6.05,16.68,12.46,18.91,29.76,24.34,1.03,10.36,12.25,6.33,2.86,2.49,18.79
FiO2,6.77,8.84,0.01,1.64,3.54,5.2,5.24,8.49,7.13,0.27,2.57,3.08,1.64,0.75,0.77,6.53


# Validation data

In [138]:
validation_X = physionet2012_dataset['val_X']

In [139]:
female_gender_validation_ids = validation_X[validation_X["Gender"] == 0.0]
female_gender_validation_ids = female_gender_validation_ids["RecordID"]
female_gender_missing_rate_validation = validation_X[validation_X["RecordID"].isin(female_gender_validation_ids)]
female_gender_missing_rate_validation = female_gender_missing_rate_validation.isna().sum()
female_gender_missing_rate_validation

RecordID           0
level_1            0
Time               0
ALP            39880
ALT            39859
AST            39858
Age             2487
Albumin        40005
BUN            37661
Bilirubin      39845
Cholesterol    40501
Creatinine     37641
DiasABP        20758
FiO2           34366
GCS            27942
Gender         39715
Glucose        37806
HCO3           37705
HCT            36908
HR              4074
Height          2487
ICUType        39715
K              37497
Lactate        38918
MAP            20946
MechVent       34582
Mg             37765
NIDiasABP      21833
NIMAP          22119
NISysABP       21812
Na             37667
PaCO2          36312
PaO2           36325
Platelets      37677
RespRate       29363
SaO2           39138
SysABP         20755
Temp           26846
TroponinI      40509
TroponinT      40122
Urine          12457
WBC            37916
Weight         18136
pH             36157
dtype: int64

In [140]:
male_gender_validation_ids = validation_X[validation_X["Gender"] == 1.0]
male_gender_validation_ids = male_gender_validation_ids["RecordID"]
male_gender_missing_rate_validation = validation_X[validation_X["RecordID"].isin(male_gender_validation_ids)]
male_gender_missing_rate_validation = male_gender_missing_rate_validation.isna().sum()
male_gender_missing_rate_validation

RecordID           0
level_1            0
Time               0
ALP            50553
ALT            50528
AST            50524
Age             3082
Albumin        50778
BUN            47622
Bilirubin      50506
Cholesterol    51314
Creatinine     47607
DiasABP        22892
FiO2           43325
GCS            35166
Gender         50337
Glucose        47921
HCO3           47720
HCT            46207
HR              5196
Height          3082
ICUType        50337
K              47512
Lactate        49334
MAP            23094
MechVent       43634
Mg             47737
NIDiasABP      30305
NIMAP          30608
NISysABP       30288
Na             47733
PaCO2          45095
PaO2           45103
Platelets      47361
RespRate       39973
SaO2           49118
SysABP         22891
Temp           31498
TroponinI      51322
TroponinT      50876
Urine          16053
WBC            47774
Weight         24097
pH             44755
dtype: int64

In [141]:
undefined_gender_ids_validation = validation_X[validation_X["Gender"] == -1.0]
undefined_gender_ids_validation = undefined_gender_ids_validation["RecordID"]
undefined_gender_missing_rate_validation = validation_X[validation_X["RecordID"].isin(undefined_gender_ids_validation)]
undefined_gender_missing_rate_validation = undefined_gender_missing_rate_validation.isna().sum()
undefined_gender_missing_rate_validation

RecordID        0
level_1         0
Time            0
ALP            96
ALT            96
AST            96
Age            46
Albumin        95
BUN            90
Bilirubin      96
Cholesterol    96
Creatinine     90
DiasABP        50
FiO2           85
GCS            81
Gender         94
Glucose        90
HCO3           90
HCT            90
HR             51
Height         46
ICUType        94
K              90
Lactate        96
MAP            52
MechVent       89
Mg             91
NIDiasABP      86
NIMAP          86
NISysABP       86
Na             90
PaCO2          89
PaO2           89
Platelets      90
RespRate       96
SaO2           96
SysABP         50
Temp           74
TroponinI      96
TroponinT      93
Urine          56
WBC            90
Weight         60
pH             89
dtype: int64

In [142]:
ICUType_1_validation_ids = validation_X[validation_X["ICUType"] == 1.0]
ICUType_1_validation_ids = ICUType_1_validation_ids[ICUType_1_validation_ids["Time"] == 0.0]
ICUType_1_validation_ids = ICUType_1_validation_ids["RecordID"]
ICUType_1_validation_missing = validation_X[validation_X["RecordID"].isin(ICUType_1_validation_ids)]
ICUType_1_validation_missing = ICUType_1_validation_missing.isna().sum()
ICUType_1_validation_missing

RecordID           0
level_1            0
Time               0
ALP            13056
ALT            13046
AST            13044
Age             1036
Albumin        13107
BUN            12316
Bilirubin      13044
Cholesterol    13165
Creatinine     12301
DiasABP         8175
FiO2           11931
GCS             9814
Gender         12972
Glucose        12369
HCO3           12354
HCT            12095
HR              1615
Height          1036
ICUType        12972
K              12145
Lactate        12978
MAP             8196
MechVent       12054
Mg             12308
NIDiasABP       6459
NIMAP           6489
NISysABP        6450
Na             12363
PaCO2          12278
PaO2           12278
Platelets      12283
RespRate        8255
SaO2           12660
SysABP          8175
Temp            9262
TroponinI      13218
TroponinT      12970
Urine           5716
WBC            12392
Weight          7190
pH             12254
dtype: int64

In [143]:
ICUType_2_validation_ids = validation_X[validation_X["ICUType"] == 2.0]
ICUType_2_validation_ids = ICUType_2_validation_ids[ICUType_2_validation_ids["Time"] == 0.0]
ICUType_2_validation_ids = ICUType_2_validation_ids["RecordID"]
ICUType_2_validation_missing = validation_X[validation_X["RecordID"].isin(ICUType_2_validation_ids)]
ICUType_2_validation_missing = ICUType_2_validation_missing.isna().sum()
ICUType_2_validation_missing

RecordID           0
level_1            0
Time               0
ALP            19447
ALT            19444
AST            19445
Age              616
Albumin        19489
BUN            18315
Bilirubin      19442
Cholesterol    19578
Creatinine     18312
DiasABP         4147
FiO2           16176
GCS            14404
Gender         19176
Glucose        18695
HCO3           18411
HCT            17364
HR              1588
Height           616
ICUType        19176
K              18582
Lactate        18813
MAP             4131
MechVent       16286
Mg             18388
NIDiasABP      15243
NIMAP          15275
NISysABP       15234
Na             18612
PaCO2          15398
PaO2           15413
Platelets      17921
RespRate       18758
SaO2           17328
SysABP          4147
Temp            8218
TroponinI      19574
TroponinT      19537
Urine           2958
WBC            18223
Weight          9685
pH             15003
dtype: int64

In [144]:
ICUType_3_validation_ids = validation_X[validation_X["ICUType"] == 3.0]
ICUType_3_validation_ids = ICUType_3_validation_ids[ICUType_3_validation_ids["Time"] == 0.0]
ICUType_3_validation_ids = ICUType_3_validation_ids["RecordID"]
ICUType_3_validation_missing = validation_X[validation_X["RecordID"].isin(ICUType_3_validation_ids)]
ICUType_3_validation_missing = ICUType_3_validation_missing.isna().sum()
ICUType_3_validation_missing

RecordID           0
level_1            0
Time               0
ALP            33716
ALT            33688
AST            33687
Age             2821
Albumin        33888
BUN            31879
Bilirubin      33646
Cholesterol    34437
Creatinine     31864
DiasABP        23066
FiO2           29546
GCS            25529
Gender         33746
Glucose        31885
HCO3           31866
HCT            31433
HR              4030
Height          2821
ICUType        33746
K              31676
Lactate        33175
MAP            23225
MechVent       29985
Mg             32048
NIDiasABP      14576
NIMAP          14947
NISysABP       14571
Na             31788
PaCO2          31933
PaO2           31936
Platelets      32118
RespRate       23825
SaO2           34042
SysABP         23063
Temp           24994
TroponinI      34394
TroponinT      34042
Urine          14045
WBC            32215
Weight         11346
pH             31901
dtype: int64

In [145]:
ICUType_4_validation_ids = validation_X[validation_X["ICUType"] == 4.0]
ICUType_4_validation_ids = ICUType_4_validation_ids[ICUType_4_validation_ids["Time"] == 0.0]
ICUType_4_validation_ids = ICUType_4_validation_ids["RecordID"]
ICUType_4_validation_missing = validation_X[validation_X["RecordID"].isin(ICUType_4_validation_ids)]
ICUType_4_validation_missing = ICUType_4_validation_missing.isna().sum()
ICUType_4_validation_missing

RecordID           0
level_1            0
Time               0
ALP            24310
ALT            24305
AST            24302
Age             1142
Albumin        24394
BUN            22863
Bilirubin      24315
Cholesterol    24731
Creatinine     22861
DiasABP         8312
FiO2           20123
GCS            13442
Gender         24252
Glucose        22868
HCO3           22884
HCT            22313
HR              2088
Height          1142
ICUType        24252
K              22696
Lactate        23382
MAP             8540
MechVent       19980
Mg             22849
NIDiasABP      15946
NIMAP          16102
NISysABP       15931
Na             22727
PaCO2          21887
PaO2           21890
Platelets      22806
RespRate       18594
SaO2           24322
SysABP          8311
Temp           15944
TroponinI      24741
TroponinT      24542
Urine           5847
WBC            22950
Weight         14072
pH             21843
dtype: int64

In [146]:
more_than_or_equal_to_65_validation_ids = validation_X[validation_X["Age"] >= 65]
more_than_or_equal_to_65_validation_ids = more_than_or_equal_to_65_validation_ids[more_than_or_equal_to_65_validation_ids["Time"] == 0.0]
more_than_or_equal_to_65_validation_ids = more_than_or_equal_to_65_validation_ids["RecordID"]
more_than_or_equal_to_65_validation_missing = validation_X[validation_X["RecordID"].isin(more_than_or_equal_to_65_validation_ids)]
more_than_or_equal_to_65_validation_missing = more_than_or_equal_to_65_validation_missing.isna().sum()
more_than_or_equal_to_65_validation_missing

RecordID           0
level_1            0
Time               0
ALP            49697
ALT            49685
AST            49684
Age             2908
Albumin        49837
BUN            46834
Bilirubin      49670
Cholesterol    50303
Creatinine     46813
DiasABP        24103
FiO2           42583
GCS            35045
Gender         49350
Glucose        47149
HCO3           46921
HCT            45566
HR              4814
Height          2908
ICUType        49350
K              46739
Lactate        48479
MAP            24225
MechVent       43171
Mg             46961
NIDiasABP      28178
NIMAP          28447
NISysABP       28156
Na             46981
PaCO2          44596
PaO2           44610
Platelets      46659
RespRate       37471
SaO2           48074
SysABP         24101
Temp           31263
TroponinI      50302
TroponinT      49719
Urine          14529
WBC            47024
Weight         22608
pH             44300
dtype: int64

In [147]:
less_than_65_validation_ids = validation_X[validation_X["Age"] < 65]
less_than_65_validation_ids = less_than_65_validation_ids[less_than_65_validation_ids["Time"] == 0.0]
less_than_65_validation_ids = less_than_65_validation_ids["RecordID"]
less_than_65_validation_missing = validation_X[validation_X["RecordID"].isin(less_than_65_validation_ids)]
less_than_65_validation_missing = less_than_65_validation_missing.isna().sum()
less_than_65_validation_missing

RecordID           0
level_1            0
Time               0
ALP            40832
ALT            40798
AST            40794
Age             2707
Albumin        41041
BUN            38539
Bilirubin      40777
Cholesterol    41608
Creatinine     38525
DiasABP        19597
FiO2           35193
GCS            28144
Gender         40796
Glucose        38668
HCO3           38594
HCT            37639
HR              4507
Height          2707
ICUType        40796
K              38360
Lactate        39869
MAP            19867
MechVent       35134
Mg             38632
NIDiasABP      24046
NIMAP          24366
NISysABP       24030
Na             38509
PaCO2          36900
PaO2           36907
Platelets      38469
RespRate       31961
SaO2           40278
SysABP         19595
Temp           27155
TroponinI      41625
TroponinT      41372
Urine          14037
WBC            38756
Weight         19685
pH             36701
dtype: int64

In [148]:
filtered_validation_X = validation_X[(validation_X['Height'] != -1) & (validation_X['Weight'] != -1) & (validation_X['Height'].notna()) & (validation_X['Weight'].notna())] 

In [149]:
filtered_validation_X_metros = filtered_validation_X.copy()
filtered_validation_X_metros["Height"] = filtered_validation_X["Height"]/100
filtered_validation_X_metros["Height"]

48        1.753
67        1.753
68        1.753
69        1.753
70        1.753
          ...  
575034    1.600
575035    1.600
575037    1.600
575038    1.600
575039    1.600
Name: Height, Length: 25076, dtype: float64

In [150]:
bmi_data_validation = filtered_validation_X_metros
bmi_data_validation["BMI"] = round(filtered_validation_X_metros["Weight"] / (filtered_validation_X_metros["Height"]**2), 1)
bmi_data_validation["Classificacao"] = bmi_data_validation["BMI"].apply(classify_BMI)
bmi_data_validation.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
48,132540,0,0.0,,,,76.0,,,,...,,,,,,,76.0,7.45,24.7,Peso normal
67,132540,19,19.0,,,,76.0,,,,...,122.0,37.5,,,50.0,,80.6,,26.2,Sobrepeso
68,132540,20,20.0,,,,76.0,,,,...,107.0,37.4,,,380.0,,80.6,,26.2,Sobrepeso
69,132540,21,21.0,,,,76.0,,,,...,121.0,37.5,,,170.0,,80.6,,26.2,Sobrepeso
70,132540,22,22.0,,,,76.0,,,,...,128.0,37.5,,,130.0,,80.6,,26.2,Sobrepeso


In [151]:
bmi_data_validation = bmi_data_validation.groupby("RecordID").first().reset_index()
bmi_data_validation

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132540,0,0.0,,,,76.0,,21.0,,...,122.0,37.5,,,50.0,13.3,76.0,7.45,24.7,Peso normal
1,132547,0,0.0,,,,64.0,,,,...,,,,,,,114.0,,35.1,Obesidade grau 2
2,132555,0,0.0,,,,74.0,,19.0,,...,98.0,34.8,,,35.0,9.0,66.1,7.39,21.5,Peso normal
3,132575,0,0.0,,,,78.0,,18.0,,...,122.0,37.4,,,38.0,12.5,63.0,7.34,22.4,Peso normal
4,132588,0,0.0,,,,48.0,,,,...,,,,,,,42.3,,17.6,Baixo peso
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
975,162926,0,0.0,,,,83.0,,18.0,,...,117.0,36.9,,,60.0,12.3,104.5,7.34,35.0,Obesidade grau 2
976,162942,0,0.0,67.0,61.0,92.0,40.0,3.3,12.0,0.3,...,,36.3,,,600.0,20.6,120.7,7.38,37.1,Obesidade grau 2
977,162952,0,0.0,,,,64.0,,,,...,,,,,,,47.7,,16.5,Baixo peso
978,162983,0,0.0,95.0,369.0,366.0,75.0,3.1,28.0,6.4,...,124.0,35.3,1.2,,80.0,25.0,90.0,7.33,31.1,Obesidade grau 1


In [152]:
bmi_data_validation["Classificacao"].value_counts()

Classificacao
Sobrepeso           326
Peso normal         299
Obesidade grau 1    181
Obesidade grau 2     84
Obesidade grau 3     60
Baixo peso           30
Name: count, dtype: int64

In [153]:
classificacao_undefined_ids_validation = bmi_data_validation["RecordID"]
classificacao_undefined_missing_validation = validation_X[~validation_X["RecordID"].isin(classificacao_undefined_ids_validation)]
classificacao_undefined_missing_validation = classificacao_undefined_missing_validation.isna().sum()
classificacao_undefined_missing_validation

RecordID           0
level_1            0
Time               0
ALP            44265
ALT            44239
AST            44235
Age             3566
Albumin        44398
BUN            41819
Bilirubin      44218
Cholesterol    44950
Creatinine     41796
DiasABP        28025
FiO2           38879
GCS            30328
Gender         44086
Glucose        41827
HCO3           41831
HCT            41167
HR              5172
Height          3566
ICUType        44086
K              41532
Lactate        43524
MAP            28284
MechVent       39303
Mg             41937
NIDiasABP      20562
NIMAP          21021
NISysABP       20547
Na             41672
PaCO2          41930
PaO2           41939
Platelets      42057
RespRate       29758
SaO2           44509
SysABP         28023
Temp           33064
TroponinI      44977
TroponinT      44473
Urine          16586
WBC            42170
Weight         20329
pH             41879
dtype: int64

In [154]:
classificacao_baixo_peso_ids_validation = bmi_data_validation[bmi_data_validation["Classificacao"] == "Baixo peso"]
classificacao_baixo_peso_ids_validation = classificacao_baixo_peso_ids_validation["RecordID"]
classificacao_baixo_peso_missing_validation = validation_X[validation_X["RecordID"].isin(classificacao_baixo_peso_ids_validation)]
classificacao_baixo_peso_missing_validation = classificacao_baixo_peso_missing_validation.isna().sum()
classificacao_baixo_peso_missing_validation

RecordID          0
level_1           0
Time              0
ALP            1410
ALT            1410
AST            1410
Age              42
Albumin        1417
BUN            1325
Bilirubin      1409
Cholesterol    1437
Creatinine     1326
DiasABP         514
FiO2           1223
GCS            1024
Gender         1410
Glucose        1332
HCO3           1329
HCT            1280
HR              117
Height           42
ICUType        1410
K              1320
Lactate        1349
MAP             517
MechVent       1206
Mg             1328
NIDiasABP       888
NIMAP           892
NISysABP        887
Na             1329
PaCO2          1272
PaO2           1273
Platelets      1321
RespRate       1084
SaO2           1398
SysABP          513
Temp            835
TroponinI      1436
TroponinT      1427
Urine           435
WBC            1332
Weight          655
pH             1262
dtype: int64

In [155]:
teste = classificacao_baixo_peso_ids_validation.unique()
teste.size

30

In [156]:
classificacao_peso_normal_ids_validation = bmi_data_validation[bmi_data_validation["Classificacao"] == "Peso normal"]
classificacao_peso_normal_ids_validation = classificacao_peso_normal_ids_validation["RecordID"]
classificacao_peso_normal_missing_validation = validation_X[validation_X["RecordID"].isin(classificacao_peso_normal_ids_validation)]
classificacao_peso_normal_missing_validation = classificacao_peso_normal_missing_validation.isna().sum()
classificacao_peso_normal_missing_validation

RecordID           0
level_1            0
Time               0
ALP            14122
ALT            14116
AST            14116
Age              531
Albumin        14179
BUN            13277
Bilirubin      14115
Cholesterol    14327
Creatinine     13275
DiasABP         4404
FiO2           11993
GCS             9849
Gender         14053
Glucose        13396
HCO3           13319
HCT            12855
HR              1130
Height           531
ICUType        14053
K              13278
Lactate        13736
MAP             4453
MechVent       11886
Mg             13322
NIDiasABP       9958
NIMAP           9982
NISysABP        9953
Na             13338
PaCO2          12187
PaO2           12192
Platelets      13139
RespRate       12218
SaO2           13424
SysABP          4404
Temp            7637
TroponinI      14320
TroponinT      14243
Urine           3738
WBC            13303
Weight          6754
pH             12028
dtype: int64

In [157]:
teste = classificacao_peso_normal_ids_validation.unique()
teste.size

299

In [158]:
classificacao_sobrepeso_ids_validation = bmi_data_validation[bmi_data_validation["Classificacao"] == "Sobrepeso"]
classificacao_sobrepeso_ids_validation = classificacao_sobrepeso_ids_validation["RecordID"]
classificacao_sobrepeso_missing_validation = validation_X[validation_X["RecordID"].isin(classificacao_sobrepeso_ids_validation)]
classificacao_sobrepeso_missing_validation = classificacao_sobrepeso_missing_validation.isna().sum()
classificacao_sobrepeso_missing_validation

RecordID           0
level_1            0
Time               0
ALP            15394
ALT            15390
AST            15388
Age              777
Albumin        15475
BUN            14522
Bilirubin      15385
Cholesterol    15623
Creatinine     14511
DiasABP         5868
FiO2           13010
GCS            10893
Gender         15322
Glucose        14668
HCO3           14564
HCT            13935
HR              1527
Height           777
ICUType        15322
K              14520
Lactate        14901
MAP             5888
MechVent       13143
Mg             14557
NIDiasABP       9930
NIMAP           9983
NISysABP        9922
Na             14617
PaCO2          13250
PaO2           13257
Platelets      14290
RespRate       12815
SaO2           14607
SysABP          5867
Temp            8888
TroponinI      15621
TroponinT      15510
Urine           4101
WBC            14497
Weight          7667
pH             13109
dtype: int64

In [159]:
teste = classificacao_sobrepeso_ids_validation.unique()
teste.size

326

In [160]:
classificacao_obesidade_1_ids_validation = bmi_data_validation[bmi_data_validation["Classificacao"] == "Obesidade grau 1"]
classificacao_obesidade_1_ids_validation = classificacao_obesidade_1_ids_validation["RecordID"]
classificacao_obesidade_1_missing_validation = validation_X[validation_X["RecordID"].isin(classificacao_obesidade_1_ids_validation)]
classificacao_obesidade_1_missing_validation = classificacao_obesidade_1_missing_validation.isna().sum()
classificacao_obesidade_1_missing_validation

RecordID          0
level_1           0
Time              0
ALP            8555
ALT            8551
AST            8550
Age             384
Albumin        8599
BUN            8052
Bilirubin      8545
Cholesterol    8677
Creatinine     8051
DiasABP        2674
FiO2           7130
GCS            6157
Gender         8507
Glucose        8148
HCO3           8080
HCT            7805
HR              786
Height          384
ICUType        8507
K              8050
Lactate        8324
MAP            2709
MechVent       7123
Mg             8066
NIDiasABP      6093
NIMAP          6120
NISysABP       6085
Na             8108
PaCO2          7202
PaO2           7203
Platelets      8005
RespRate       7703
SaO2           8030
SysABP         2674
Temp           4316
TroponinI      8677
TroponinT      8605
Urine          1937
WBC            8096
Weight         3973
pH             7118
dtype: int64

In [161]:
teste = classificacao_obesidade_1_ids_validation.unique()
teste.size

181

In [162]:
classificacao_obesidade_2_ids_validation = bmi_data_validation[bmi_data_validation["Classificacao"] == "Obesidade grau 2"]
classificacao_obesidade_2_ids_validation = classificacao_obesidade_2_ids_validation["RecordID"]
classificacao_obesidade_2_missing_validation = validation_X[validation_X["RecordID"].isin(classificacao_obesidade_2_ids_validation)]
classificacao_obesidade_2_missing_validation = classificacao_obesidade_2_missing_validation.isna().sum()
classificacao_obesidade_2_missing_validation

RecordID          0
level_1           0
Time              0
ALP            3953
ALT            3949
AST            3950
Age             216
Albumin        3975
BUN            3710
Bilirubin      3949
Cholesterol    4023
Creatinine     3710
DiasABP        1214
FiO2           3301
GCS            2905
Gender         3948
Glucose        3754
HCO3           3717
HCT            3579
HR              383
Height          216
ICUType        3948
K              3730
Lactate        3788
MAP            1231
MechVent       3329
Mg             3727
NIDiasABP      2874
NIMAP          2889
NISysABP       2873
Na             3737
PaCO2          3297
PaO2           3300
Platelets      3666
RespRate       3294
SaO2           3713
SysABP         1214
Temp           2035
TroponinI      4022
TroponinT      3988
Urine          1038
WBC            3711
Weight         1719
pH             3271
dtype: int64

In [163]:
teste = classificacao_obesidade_2_ids_validation.unique()
teste.size

84

In [164]:
classificacao_obesidade_3_ids_validation = bmi_data_validation[bmi_data_validation["Classificacao"] == "Obesidade grau 3"]
classificacao_obesidade_3_ids_validation = classificacao_obesidade_3_ids_validation["RecordID"]
classificacao_obesidade_3_missing_validation = validation_X[validation_X["RecordID"].isin(classificacao_obesidade_3_ids_validation)]
classificacao_obesidade_3_missing_validation = classificacao_obesidade_3_missing_validation.isna().sum()
classificacao_obesidade_3_missing_validation

RecordID          0
level_1           0
Time              0
ALP            2830
ALT            2828
AST            2829
Age              99
Albumin        2835
BUN            2668
Bilirubin      2826
Cholesterol    2874
Creatinine     2669
DiasABP        1001
FiO2           2240
GCS            2033
Gender         2820
Glucose        2692
HCO3           2675
HCT            2584
HR              206
Height           99
ICUType        2820
K              2669
Lactate        2726
MAP            1010
MechVent       2315
Mg             2656
NIDiasABP      1919
NIMAP          1926
NISysABP       1919
Na             2689
PaCO2          2358
PaO2           2353
Platelets      2650
RespRate       2560
SaO2           2671
SysABP         1001
Temp           1643
TroponinI      2874
TroponinT      2845
Urine           731
WBC            2671
Weight         1196
pH             2334
dtype: int64

In [165]:
teste = classificacao_obesidade_3_ids_validation.unique()
teste.size

60

In [166]:
df_missing_validation = pd.DataFrame(columns=df_columns)
df_missing_transpose_validation = df_missing_validation.T
df_missing_transpose_validation["Female"] = female_gender_missing_rate_validation
df_missing_transpose_validation["Male"] = male_gender_missing_rate_validation
df_missing_transpose_validation["Undefined gender"] = undefined_gender_missing_rate_validation
df_missing_transpose_validation["ICUType 1"] = ICUType_1_validation_missing
df_missing_transpose_validation["ICUType 2"] = ICUType_2_validation_missing
df_missing_transpose_validation["ICUType 3"] = ICUType_3_validation_missing
df_missing_transpose_validation["ICUType 4"] = ICUType_4_validation_missing
df_missing_transpose_validation["Age 65+"] = more_than_or_equal_to_65_validation_missing
df_missing_transpose_validation["Age 65-"] = less_than_65_validation_missing
df_missing_transpose_validation["Low Weight"] = classificacao_baixo_peso_missing_validation
df_missing_transpose_validation["Normal Weight"] = classificacao_peso_normal_missing_validation
df_missing_transpose_validation["Overweight"] = classificacao_sobrepeso_missing_validation
df_missing_transpose_validation["Obesity Grade 1"] = classificacao_obesidade_1_missing_validation
df_missing_transpose_validation["Obesity Grade 2"] = classificacao_obesidade_2_missing_validation
df_missing_transpose_validation["Obesity Grade 3"] = classificacao_obesidade_3_missing_validation
df_missing_transpose_validation["Undefined classification"] = classificacao_undefined_missing_validation
df_missing_transpose_validation = df_missing_transpose_validation.drop("RecordID", axis=0)
df_missing_transpose_validation = df_missing_transpose_validation.drop("level_1", axis=0)
df_missing_transpose_validation = df_missing_transpose_validation.drop("Time", axis=0)
df_missing_transpose_validation = df_missing_transpose_validation.drop("Age", axis=0)
df_missing_transpose_validation = df_missing_transpose_validation.drop("Gender", axis=0)
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>Original Missing Rate per Variable by demographcs - Validation</h2>"))
df_missing_transpose_validation

Unnamed: 0,Female,Male,Undefined gender,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3,Undefined classification
ALP,39880,50553,96,13056,19447,33716,24310,49697,40832,1410,14122,15394,8555,3953,2830,44265
ALT,39859,50528,96,13046,19444,33688,24305,49685,40798,1410,14116,15390,8551,3949,2828,44239
AST,39858,50524,96,13044,19445,33687,24302,49684,40794,1410,14116,15388,8550,3950,2829,44235
Albumin,40005,50778,95,13107,19489,33888,24394,49837,41041,1417,14179,15475,8599,3975,2835,44398
BUN,37661,47622,90,12316,18315,31879,22863,46834,38539,1325,13277,14522,8052,3710,2668,41819
Bilirubin,39845,50506,96,13044,19442,33646,24315,49670,40777,1409,14115,15385,8545,3949,2826,44218
Cholesterol,40501,51314,96,13165,19578,34437,24731,50303,41608,1437,14327,15623,8677,4023,2874,44950
Creatinine,37641,47607,90,12301,18312,31864,22861,46813,38525,1326,13275,14511,8051,3710,2669,41796
DiasABP,20758,22892,50,8175,4147,23066,8312,24103,19597,514,4404,5868,2674,1214,1001,28025
FiO2,34366,43325,85,11931,16176,29546,20123,42583,35193,1223,11993,13010,7130,3301,2240,38879


<h3>Test data</h3>

In [167]:
test_X = physionet2012_dataset['test_X']

In [168]:
test_X["Gender"].value_counts()

Gender
 1.0    1329
 0.0    1066
-1.0       4
Name: count, dtype: int64

In [169]:
female_gender_test_ids = test_X[test_X['Gender'] == 0.0]
female_gender_test_ids = female_gender_test_ids["RecordID"]
female_gender_missing_rate_test = test_X[test_X["RecordID"].isin(female_gender_test_ids)]
female_gender_missing_rate_test = female_gender_missing_rate_test.isna().sum()
female_gender_missing_rate_test

RecordID           0
level_1            0
Time               0
ALP            50277
ALT            50258
AST            50258
Age             2732
Albumin        50473
BUN            47452
Bilirubin      50247
Cholesterol    51078
Creatinine     47439
DiasABP        25131
FiO2           43317
GCS            34195
Gender         50102
Glucose        47583
HCO3           47502
HCT            46324
HR              4638
Height          2732
ICUType        50102
K              47222
Lactate        49202
MAP            25301
MechVent       43649
Mg             47495
NIDiasABP      27861
NIMAP          28188
NISysABP       27847
Na             47482
PaCO2          45856
PaO2           45867
Platelets      47402
RespRate       36878
SaO2           49415
SysABP         25131
Temp           32762
TroponinI      51071
TroponinT      50668
Urine          15175
WBC            47697
Weight         23577
pH             45649
dtype: int64

In [170]:
female_gender_test_ids.unique().size

1066

In [171]:
male_gender_test_ids = test_X[test_X['Gender'] == 1.0]
male_gender_test_ids = male_gender_test_ids["RecordID"]
male_gender_missing_rate_test = test_X[test_X["RecordID"].isin(male_gender_test_ids)]
male_gender_missing_rate_test = male_gender_missing_rate_test.isna().sum()
male_gender_missing_rate_test

RecordID           0
level_1            0
Time               0
ALP            62696
ALT            62662
AST            62657
Age             3389
Albumin        62980
BUN            59107
Bilirubin      62646
Cholesterol    63659
Creatinine     59091
DiasABP        27553
FiO2           52860
GCS            43760
Gender         62463
Glucose        59458
HCO3           59242
HCT            57463
HR              6036
Height          3389
ICUType        62463
K              58945
Lactate        60953
MAP            27693
MechVent       53627
Mg             59223
NIDiasABP      37848
NIMAP          38165
NISysABP       37822
Na             59299
PaCO2          55729
PaO2           55745
Platelets      58953
RespRate       50392
SaO2           61105
SysABP         27549
Temp           38899
TroponinI      63668
TroponinT      63055
Urine          19083
WBC            59405
Weight         30330
pH             55308
dtype: int64

In [172]:
male_gender_test_ids.unique().size

1329

In [173]:
undefined_gender_ids_test = test_X[test_X["Gender"] == -1.0]
undefined_gender_ids_test = undefined_gender_ids_test["RecordID"]
undefined_gender_missing_rate_test = test_X[test_X["RecordID"].isin(undefined_gender_ids_test)]
undefined_gender_missing_rate_test = undefined_gender_missing_rate_test.isna().sum()
undefined_gender_missing_rate_test

RecordID         0
level_1          0
Time             0
ALP            189
ALT            189
AST            189
Age              6
Albumin        191
BUN            180
Bilirubin      189
Cholesterol    192
Creatinine     180
DiasABP         66
FiO2           176
GCS            119
Gender         188
Glucose        181
HCO3           180
HCT            182
HR               9
Height           6
ICUType        188
K              181
Lactate        190
MAP             68
MechVent       166
Mg             181
NIDiasABP      101
NIMAP          101
NISysABP       101
Na             181
PaCO2          183
PaO2           183
Platelets      182
RespRate       150
SaO2           191
SysABP          66
Temp           147
TroponinI      192
TroponinT      192
Urine           87
WBC            181
Weight          65
pH             182
dtype: int64

In [174]:
ICUType_1_test_ids = test_X[test_X["ICUType"] == 1.0]
ICUType_1_test_ids = ICUType_1_test_ids[ICUType_1_test_ids["Time"] == 0.0]
ICUType_1_test_ids = ICUType_1_test_ids["RecordID"]
ICUType_1_test_missing = test_X[test_X["RecordID"].isin(ICUType_1_test_ids)]
ICUType_1_test_missing = ICUType_1_test_missing.isna().sum()
ICUType_1_test_missing

RecordID           0
level_1            0
Time               0
ALP            15168
ALT            15153
AST            15152
Age             1283
Albumin        15202
BUN            14259
Bilirubin      15160
Cholesterol    15292
Creatinine     14240
DiasABP         9261
FiO2           13756
GCS            11459
Gender         15087
Glucose        14292
HCO3           14289
HCT            14023
HR              1954
Height          1283
ICUType        15087
K              14060
Lactate        15050
MAP             9291
MechVent       13843
Mg             14301
NIDiasABP       7773
NIMAP           7824
NISysABP        7772
Na             14290
PaCO2          14163
PaO2           14164
Platelets      14251
RespRate        9554
SaO2           14724
SysABP          9261
Temp           10882
TroponinI      15354
TroponinT      15042
Urine           6854
WBC            14384
Weight          8942
pH             14139
dtype: int64

In [175]:
ICUType_1_test_ids.unique().size

321

In [176]:
ICUType_2_test_ids = test_X[test_X["ICUType"] == 2.0]
ICUType_2_test_ids = ICUType_2_test_ids[ICUType_2_test_ids["Time"] == 0.0]
ICUType_2_test_ids = ICUType_2_test_ids["RecordID"]
ICUType_2_test_missing = test_X[test_X["RecordID"].isin(ICUType_2_test_ids)]
ICUType_2_test_missing = ICUType_2_test_missing.isna().sum()
ICUType_2_test_missing

RecordID           0
level_1            0
Time               0
ALP            24337
ALT            24332
AST            24331
Age              800
Albumin        24403
BUN            22925
Bilirubin      24336
Cholesterol    24517
Creatinine     22927
DiasABP         5403
FiO2           20258
GCS            18058
Gender         24017
Glucose        23384
HCO3           23047
HCT            21845
HR              2061
Height           800
ICUType        24017
K              23212
Lactate        23643
MAP             5333
MechVent       20483
Mg             22929
NIDiasABP      19196
NIMAP          19218
NISysABP       19186
Na             23296
PaCO2          19511
PaO2           19533
Platelets      22539
RespRate       23517
SaO2           21893
SysABP          5401
Temp           10270
TroponinI      24495
TroponinT      24469
Urine           3670
WBC            22874
Weight         11691
pH             19019
dtype: int64

In [177]:
ICUType_2_test_ids.unique().size

511

In [178]:
ICUType_3_test_ids = test_X[test_X["ICUType"] == 3.0]
ICUType_3_test_ids = ICUType_3_test_ids[ICUType_3_test_ids["Time"] == 0.0]
ICUType_3_test_ids = ICUType_3_test_ids["RecordID"]
ICUType_3_test_missing = test_X[test_X["RecordID"].isin(ICUType_3_test_ids)]
ICUType_3_test_missing = ICUType_3_test_missing.isna().sum()
ICUType_3_test_missing

RecordID           0
level_1            0
Time               0
ALP            40640
ALT            40618
AST            40615
Age             2683
Albumin        40892
BUN            38426
Bilirubin      40579
Cholesterol    41524
Creatinine     38418
DiasABP        26561
FiO2           35155
GCS            30359
Gender         40702
Glucose        38434
HCO3           38426
HCT            37722
HR              4102
Height          2683
ICUType        40702
K              38149
Lactate        39934
MAP            26787
MechVent       35797
Mg             38590
NIDiasABP      17872
NIMAP          18295
NISysABP       17859
Na             38356
PaCO2          38342
PaO2           38327
Platelets      38678
RespRate       28814
SaO2           41077
SysABP         26560
Temp           29208
TroponinI      41481
TroponinT      41008
Urine          16008
WBC            38796
Weight         15052
pH             38282
dtype: int64

In [179]:
ICUType_3_test_ids.unique().size

866

In [180]:
ICUType_4_test_ids = test_X[test_X["ICUType"] == 4.0]
ICUType_4_test_ids = ICUType_4_test_ids[ICUType_4_test_ids["Time"] == 0.0]
ICUType_4_test_ids = ICUType_4_test_ids["RecordID"]
ICUType_4_test_missing = test_X[test_X["RecordID"].isin(ICUType_4_test_ids)]
ICUType_4_test_missing = ICUType_4_test_missing.isna().sum()
ICUType_4_test_missing

RecordID           0
level_1            0
Time               0
ALP            33017
ALT            33006
AST            33006
Age             1361
Albumin        33147
BUN            31129
Bilirubin      33007
Cholesterol    33596
Creatinine     31125
DiasABP        11525
FiO2           27184
GCS            18198
Gender         32947
Glucose        31112
HCO3           31162
HCT            30379
HR              2566
Height          1361
ICUType        32947
K              30927
Lactate        31718
MAP            11651
MechVent       27319
Mg             31079
NIDiasABP      20969
NIMAP          21117
NISysABP       20953
Na             31020
PaCO2          29752
PaO2           29771
Platelets      31069
RespRate       25535
SaO2           33017
SysABP         11524
Temp           21448
TroponinI      33601
TroponinT      33396
Urine           7813
WBC            31229
Weight         18287
pH             29699
dtype: int64

In [181]:
ICUType_4_test_ids.unique().size

701

In [182]:
more_than_or_equal_to_65_test_ids = test_X[test_X["Age"] >= 65]
more_than_or_equal_to_65_test_ids = more_than_or_equal_to_65_test_ids[more_than_or_equal_to_65_test_ids["Time"] == 0.0]
more_than_or_equal_to_65_test_ids = more_than_or_equal_to_65_test_ids["RecordID"]
more_than_or_equal_to_65_test_missing = test_X[test_X["RecordID"].isin(more_than_or_equal_to_65_test_ids)]
more_than_or_equal_to_65_test_missing = more_than_or_equal_to_65_test_missing.isna().sum()
more_than_or_equal_to_65_test_missing

RecordID           0
level_1            0
Time               0
ALP            61303
ALT            61284
AST            61276
Age             3149
Albumin        61437
BUN            57719
Bilirubin      61271
Cholesterol    62030
Creatinine     57703
DiasABP        27707
FiO2           52014
GCS            42493
Gender         60865
Glucose        58008
HCO3           57822
HCT            56158
HR              5548
Height          3149
ICUType        60865
K              57510
Lactate        59611
MAP            27879
MechVent       52663
Mg             57801
NIDiasABP      35810
NIMAP          36110
NISysABP       35793
Na             57889
PaCO2          54825
PaO2           54833
Platelets      57586
RespRate       46914
SaO2           59522
SysABP         27706
Temp           37962
TroponinI      62007
TroponinT      61281
Urine          17440
WBC            58000
Weight         29064
pH             54459
dtype: int64

In [183]:
more_than_or_equal_to_65_test_ids.unique().size

1295

In [184]:
less_than_65_test_ids = test_X[test_X["Age"] < 65]
less_than_65_test_ids = less_than_65_test_ids[less_than_65_test_ids["Time"] == 0.0]
less_than_65_test_ids = less_than_65_test_ids["RecordID"]
less_than_65_test_missing = test_X[test_X["RecordID"].isin(less_than_65_test_ids)]
less_than_65_test_missing = less_than_65_test_missing.isna().sum()
less_than_65_test_missing

RecordID           0
level_1            0
Time               0
ALP            51859
ALT            51825
AST            51828
Age             2978
Albumin        52207
BUN            49020
Bilirubin      51811
Cholesterol    52899
Creatinine     49007
DiasABP        25043
FiO2           44339
GCS            35581
Gender         51888
Glucose        49214
HCO3           49102
HCT            47811
HR              5135
Height          2978
ICUType        51888
K              48838
Lactate        50734
MAP            25183
MechVent       44779
Mg             49098
NIDiasABP      30000
NIMAP          30344
NISysABP       29977
Na             49073
PaCO2          46943
PaO2           46962
Platelets      48951
RespRate       40506
SaO2           51189
SysABP         25040
Temp           33846
TroponinI      52924
TroponinT      52634
Urine          16905
WBC            49283
Weight         24908
pH             46680
dtype: int64

In [185]:
less_than_65_test_ids.unique().size

1104

In [186]:
filtered_test_X = test_X[(test_X['Height'] != -1) & (test_X['Weight'] != -1) & (test_X['Height'].notna()) & (test_X['Weight'].notna())] 

In [187]:
filtered_test_X_metros = filtered_test_X.copy()
filtered_test_X_metros["Height"] = filtered_test_X["Height"]/100
filtered_test_X_metros["Height"]

528       1.575
549       1.575
550       1.575
551       1.575
552       1.575
          ...  
573884    1.600
573885    1.600
573886    1.600
573887    1.600
574752    1.473
Name: Height, Length: 32426, dtype: float64

In [188]:
bmi_data_test = filtered_test_X_metros
bmi_data_test["BMI"] = round(filtered_test_X_metros["Weight"] / (filtered_test_X_metros["Height"]**2), 1)
bmi_data_test["Classificacao"] = bmi_data_test["BMI"].apply(classify_BMI)
bmi_data_test.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
528,132567,0,0.0,,,,71.0,,,,...,111.5,35.6,,,,,56.0,7.44,22.6,Peso normal
549,132567,21,21.0,,,,71.0,,,,...,110.0,37.5,,,15.0,,55.8,,22.5,Peso normal
550,132567,22,22.0,,,,71.0,,,,...,106.0,37.6,,,20.0,,55.8,,22.5,Peso normal
551,132567,23,23.0,,,,71.0,,,,...,129.0,37.7,,,30.0,,55.8,,22.5,Peso normal
552,132567,24,24.0,,,,71.0,,,,...,94.0,37.8,,,20.0,,55.8,,22.5,Peso normal


In [189]:
bmi_data_test = bmi_data_test.groupby("RecordID").first().reset_index()
bmi_data_test

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132567,0,0.0,,,,71.0,,9.0,,...,111.500000,35.600000,,,15.0,9.0,56.0,7.44,22.6,Peso normal
1,132573,0,0.0,,,,77.0,,,,...,,36.900000,,,120.0,,90.1,,34.1,Obesidade grau 1
2,132602,0,0.0,,,,80.0,,,,...,,37.300000,,,150.0,,70.0,,21.5,Peso normal
3,132614,0,0.0,,,,77.0,,,,...,,,,,,,59.0,,22.3,Peso normal
4,132622,0,0.0,,,,71.0,,64.0,,...,,37.400000,19.0,,80.0,7.2,79.0,,30.9,Obesidade grau 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1248,162899,0,0.0,,,,64.0,,11.0,,...,91.666667,36.966667,,,50.0,10.1,74.7,7.39,24.3,Peso normal
1249,162912,0,0.0,34.0,17.0,24.0,63.0,2.3,6.0,0.4,...,123.133333,35.000000,,,370.0,5.4,80.0,7.03,25.3,Sobrepeso
1250,162929,0,0.0,,,,63.0,,26.0,,...,113.000000,37.000000,,,55.0,13.0,100.0,7.41,36.7,Obesidade grau 2
1251,162944,0,0.0,,,,58.0,,21.0,,...,115.000000,35.150000,,,400.0,19.6,121.1,7.56,47.3,Obesidade grau 3


In [190]:
bmi_data_test["Classificacao"].value_counts()

Classificacao
Sobrepeso           460
Peso normal         356
Obesidade grau 1    205
Obesidade grau 2    115
Obesidade grau 3     82
Baixo peso           35
Name: count, dtype: int64

In [191]:
classificacao_undefined_ids_test = bmi_data_test["RecordID"]
classificacao_undefined_missing_test = test_X[~test_X["RecordID"].isin(classificacao_undefined_ids_test)]
classificacao_undefined_missing_test = classificacao_undefined_missing_test.isna().sum()
classificacao_undefined_missing_test

RecordID           0
level_1            0
Time               0
ALP            54082
ALT            54052
AST            54053
Age             3760
Albumin        54274
BUN            51138
Bilirubin      54038
Cholesterol    54912
Creatinine     51119
DiasABP        33253
FiO2           46835
GCS            36523
Gender         53862
Glucose        51134
HCO3           51168
HCT            50178
HR              5729
Height          3760
ICUType        53862
K              50761
Lactate        53151
MAP            33489
MechVent       47669
Mg             51256
NIDiasABP      24923
NIMAP          25356
NISysABP       24907
Na             51023
PaCO2          50980
PaO2           50981
Platelets      51375
RespRate       36717
SaO2           54460
SysABP         33251
Temp           39831
TroponinI      54915
TroponinT      54362
Urine          19248
WBC            51528
Weight         26254
pH             50905
dtype: int64

In [192]:
classificacao_baixo_peso_ids_test = bmi_data_test[bmi_data_test["Classificacao"] == "Baixo peso"]
classificacao_baixo_peso_ids_test = classificacao_baixo_peso_ids_test["RecordID"]
classificacao_baixo_peso_missing_test = test_X[test_X["RecordID"].isin(classificacao_baixo_peso_ids_test)]
classificacao_baixo_peso_missing_test = classificacao_baixo_peso_missing_test.isna().sum()
classificacao_baixo_peso_missing_test

RecordID          0
level_1           0
Time              0
ALP            1654
ALT            1654
AST            1654
Age              73
Albumin        1660
BUN            1553
Bilirubin      1653
Cholesterol    1676
Creatinine     1552
DiasABP         599
FiO2           1389
GCS            1126
Gender         1645
Glucose        1563
HCO3           1554
HCT            1495
HR              122
Height           73
ICUType        1645
K              1550
Lactate        1604
MAP             585
MechVent       1396
Mg             1552
NIDiasABP      1134
NIMAP          1137
NISysABP       1130
Na             1547
PaCO2          1466
PaO2           1462
Platelets      1532
RespRate       1390
SaO2           1564
SysABP          599
Temp            998
TroponinI      1679
TroponinT      1660
Urine           460
WBC            1551
Weight          903
pH             1447
dtype: int64

In [193]:
classificacao_baixo_peso_ids_test.unique().size

35

In [194]:
classificacao_normal_peso_ids_test = bmi_data_test[bmi_data_test["Classificacao"] == "Peso normal"]
classificacao_normal_peso_ids_test = classificacao_normal_peso_ids_test["RecordID"]
classificacao_normal_peso_missing_test = test_X[test_X["RecordID"].isin(classificacao_normal_peso_ids_test)]
classificacao_normal_peso_missing_test = classificacao_normal_peso_missing_test.isna().sum()
classificacao_normal_peso_missing_test

RecordID           0
level_1            0
Time               0
ALP            16803
ALT            16794
AST            16793
Age              734
Albumin        16869
BUN            15804
Bilirubin      16788
Cholesterol    17055
Creatinine     15799
DiasABP         5978
FiO2           14125
GCS            11588
Gender         16732
Glucose        15911
HCO3           15833
HCT            15312
HR              1438
Height           734
ICUType        16732
K              15754
Lactate        16221
MAP             6016
MechVent       14244
Mg             15788
NIDiasABP      11147
NIMAP          11226
NISysABP       11142
Na             15880
PaCO2          14575
PaO2           14587
Platelets      15680
RespRate       14309
SaO2           16066
SysABP          5977
Temp            9198
TroponinI      17046
TroponinT      16932
Urine           4343
WBC            15850
Weight          8451
pH             14430
dtype: int64

In [195]:
classificacao_normal_peso_ids_test.unique().size

356

In [196]:
classificacao_sobrepeso_ids_test = bmi_data_test[bmi_data_test["Classificacao"] == "Sobrepeso"]
classificacao_sobrepeso_ids_test = classificacao_sobrepeso_ids_test["RecordID"]
classificacao_sobrepeso_missing_test = test_X[test_X["RecordID"].isin(classificacao_sobrepeso_ids_test)]
classificacao_sobrepeso_missing_test = classificacao_sobrepeso_missing_test.isna().sum()
classificacao_sobrepeso_missing_test

RecordID           0
level_1            0
Time               0
ALP            21696
ALT            21685
AST            21684
Age              933
Albumin        21799
BUN            20416
Bilirubin      21683
Cholesterol    22022
Creatinine     20413
DiasABP         6561
FiO2           18190
GCS            15405
Gender         21620
Glucose        20622
HCO3           20491
HCT            19703
HR              1937
Height           933
ICUType        21620
K              20454
Lactate        21111
MAP             6584
MechVent       18210
Mg             20412
NIDiasABP      15594
NIMAP          15654
NISysABP       15588
Na             20581
PaCO2          18627
PaO2           18639
Platelets      20248
RespRate       19201
SaO2           20616
SysABP          6560
Temp           11422
TroponinI      22035
TroponinT      21863
Urine           5505
WBC            20459
Weight         10051
pH             18381
dtype: int64

In [197]:
classificacao_sobrepeso_ids_test.unique().size

460

In [198]:
classificacao_obesidade_1_ids_test = bmi_data_test[bmi_data_test["Classificacao"] == "Obesidade grau 1"]
classificacao_obesidade_1_ids_test = classificacao_obesidade_1_ids_test["RecordID"]
classificacao_obesidade_1_missing_test = test_X[test_X["RecordID"].isin(classificacao_obesidade_1_ids_test)]
classificacao_obesidade_1_missing_test = classificacao_obesidade_1_missing_test.isna().sum()
classificacao_obesidade_1_missing_test

RecordID          0
level_1           0
Time              0
ALP            9614
ALT            9611
AST            9608
Age             316
Albumin        9698
BUN            9067
Bilirubin      9611
Cholesterol    9820
Creatinine     9068
DiasABP        3205
FiO2           7967
GCS            6781
Gender         9635
Glucose        9156
HCO3           9094
HCT            8764
HR              721
Height          316
ICUType        9635
K              9057
Lactate        9265
MAP            3219
MechVent       8055
Mg             9085
NIDiasABP      6496
NIMAP          6517
NISysABP       6491
Na             9129
PaCO2          8224
PaO2           8227
Platelets      8996
RespRate       8235
SaO2           9223
SysABP         3205
Temp           5270
TroponinI      9815
TroponinT      9723
Urine          2235
WBC            9092
Weight         4377
pH             8141
dtype: int64

In [199]:
classificacao_obesidade_1_ids_test.unique().size

205

In [200]:
classificacao_obesidade_2_ids_test = bmi_data_test[bmi_data_test["Classificacao"] == "Obesidade grau 2"]
classificacao_obesidade_2_ids_test = classificacao_obesidade_2_ids_test["RecordID"]
classificacao_obesidade_2_missing_test = test_X[test_X["RecordID"].isin(classificacao_obesidade_2_ids_test)]
classificacao_obesidade_2_missing_test = classificacao_obesidade_2_missing_test.isna().sum()
classificacao_obesidade_2_missing_test

RecordID          0
level_1           0
Time              0
ALP            5429
ALT            5430
AST            5429
Age             179
Albumin        5453
BUN            5100
Bilirubin      5426
Cholesterol    5513
Creatinine     5097
DiasABP        1763
FiO2           4504
GCS            3849
Gender         5405
Glucose        5146
HCO3           5116
HCT            4928
HR              449
Height          179
ICUType        5405
K              5108
Lactate        5222
MAP            1766
MechVent       4553
Mg             5122
NIDiasABP      3838
NIMAP          3871
NISysABP       3837
Na             5126
PaCO2          4566
PaO2           4568
Platelets      5045
RespRate       4445
SaO2           5112
SysABP         1763
Temp           2901
TroponinI      5507
TroponinT      5467
Urine          1493
WBC            5112
Weight         2504
pH             4533
dtype: int64

In [201]:
classificacao_obesidade_2_ids_test.unique().size

115

In [202]:
classificacao_obesidade_3_ids_test = bmi_data_test[bmi_data_test["Classificacao"] == "Obesidade grau 3"]
classificacao_obesidade_3_ids_test = classificacao_obesidade_3_ids_test["RecordID"]
classificacao_obesidade_3_missing_test = test_X[test_X["RecordID"].isin(classificacao_obesidade_3_ids_test)]
classificacao_obesidade_3_missing_test = classificacao_obesidade_3_missing_test.isna().sum()
classificacao_obesidade_3_missing_test

RecordID          0
level_1           0
Time              0
ALP            3884
ALT            3883
AST            3883
Age             132
Albumin        3891
BUN            3661
Bilirubin      3883
Cholesterol    3931
Creatinine     3662
DiasABP        1391
FiO2           3343
GCS            2802
Gender         3854
Glucose        3690
HCO3           3668
HCT            3589
HR              287
Height          132
ICUType        3854
K              3664
Lactate        3771
MAP            1403
MechVent       3315
Mg             3684
NIDiasABP      2678
NIMAP          2693
NISysABP       2675
Na             3676
PaCO2          3330
PaO2           3331
Platelets      3661
RespRate       3123
SaO2           3670
SysABP         1391
Temp           2188
TroponinI      3934
TroponinT      3908
Urine          1061
WBC            3691
Weight         1432
pH             3302
dtype: int64

In [203]:
classificacao_obesidade_3_ids_test.unique().size

82

In [204]:
df_missing_test = pd.DataFrame(columns=df_columns)
df_missing_transpose_test = df_missing_test.T
df_missing_transpose_test ["Female"] = female_gender_missing_rate_test
df_missing_transpose_test ["Male"] = male_gender_missing_rate_test
df_missing_transpose_test["Undefined gender"] = undefined_gender_missing_rate_test
df_missing_transpose_test["ICUType 1"] = ICUType_1_test_missing
df_missing_transpose_test ["ICUType 2"] = ICUType_2_test_missing
df_missing_transpose_test ["ICUType 3"] = ICUType_3_test_missing
df_missing_transpose_test ["ICUType 4"] = ICUType_4_test_missing
df_missing_transpose_test["Age 65+"] = more_than_or_equal_to_65_test_missing
df_missing_transpose_test["Age 65-"] = less_than_65_test_missing
df_missing_transpose_test ["Low Weight"] = classificacao_baixo_peso_missing_test
df_missing_transpose_test ["Normal Weight"] = classificacao_normal_peso_missing_test
df_missing_transpose_test ["Overweight"] = classificacao_sobrepeso_missing_test
df_missing_transpose_test ["Obesity Grade 1"] = classificacao_obesidade_1_missing_test
df_missing_transpose_test ["Obesity Grade 2"] = classificacao_obesidade_2_missing_test
df_missing_transpose_test ["Obesity Grade 3"] = classificacao_obesidade_3_missing_test
df_missing_transpose_test["Undefined classification"] = classificacao_undefined_missing_test
df_missing_transpose_test = df_missing_transpose_test.drop("RecordID", axis=0)
df_missing_transpose_test = df_missing_transpose_test.drop("level_1", axis=0)
df_missing_transpose_test = df_missing_transpose_test.drop("Time", axis=0)
df_missing_transpose_test = df_missing_transpose_test.drop("Age", axis=0)
df_missing_transpose_test = df_missing_transpose_test.drop("Gender", axis=0)
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>original Missing rate per Variable by demographics - Test</h2>"))
df_missing_transpose_test 

Unnamed: 0,Female,Male,Undefined gender,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3,Undefined classification
ALP,50277,62696,189,15168,24337,40640,33017,61303,51859,1654,16803,21696,9614,5429,3884,54082
ALT,50258,62662,189,15153,24332,40618,33006,61284,51825,1654,16794,21685,9611,5430,3883,54052
AST,50258,62657,189,15152,24331,40615,33006,61276,51828,1654,16793,21684,9608,5429,3883,54053
Albumin,50473,62980,191,15202,24403,40892,33147,61437,52207,1660,16869,21799,9698,5453,3891,54274
BUN,47452,59107,180,14259,22925,38426,31129,57719,49020,1553,15804,20416,9067,5100,3661,51138
Bilirubin,50247,62646,189,15160,24336,40579,33007,61271,51811,1653,16788,21683,9611,5426,3883,54038
Cholesterol,51078,63659,192,15292,24517,41524,33596,62030,52899,1676,17055,22022,9820,5513,3931,54912
Creatinine,47439,59091,180,14240,22927,38418,31125,57703,49007,1552,15799,20413,9068,5097,3662,51119
DiasABP,25131,27553,66,9261,5403,26561,11525,27707,25043,599,5978,6561,3205,1763,1391,33253
FiO2,43317,52860,176,13756,20258,35155,27184,52014,44339,1389,14125,18190,7967,4504,3343,46835
