In [1]:
import os
import sys
import pandas as pd
from IPython.display import display, HTML
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

<h2>Loading dataset</h2>

In [2]:
from pypots.benchpots.datasets import preprocess_physionet2012
physionet2012_dataset = preprocess_physionet2012(subset="all", rate=0.1)

2024-11-06 12:01:35 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-11-06 12:01:35 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-11-06 12:01:35 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-11-06 12:01:36 [INFO]: Loaded successfully!


<h3>Training data</h3>

<h4>Loading training dataset</h4>

In [3]:
train_X = physionet2012_dataset['train_X']

In [4]:
female_gender_ids = train_X[train_X["Gender"] == 0.0]
female_gender_ids = female_gender_ids["RecordID"]
female_gender_missing_rate = train_X[train_X["RecordID"].isin(female_gender_ids)]
female_gender_missing_rate = female_gender_missing_rate.isna().sum()
female_gender_missing_rate

RecordID            0
level_1             0
Time                0
ALP            160184
ALT            160123
AST            160128
Age              9445
Albumin        160700
BUN            151006
Bilirubin      160065
Cholesterol    162533
Creatinine     150959
DiasABP         77685
FiO2           137551
GCS            110132
Gender         159424
Glucose        151558
HCO3           151207
HCT            147826
HR              15560
Height           9445
ICUType        159424
K              150363
Lactate        156328
MAP             78071
MechVent       137994
Mg             151221
NIDiasABP       91969
NIMAP           93103
NISysABP        91910
Na             151109
PaCO2          144971
PaO2           145001
Platelets      151184
RespRate       119554
SaO2           156539
SysABP          77679
Temp           103846
TroponinI      162455
TroponinT      161101
Urine           48745
WBC            152012
Weight          75129
pH             144293
dtype: int64

In [5]:
male_gender_ids = train_X[train_X["Gender"] == 1.0]
male_gender_ids = male_gender_ids["RecordID"]
male_gender_missing_rate = train_X[train_X["RecordID"].isin(male_gender_ids)]
male_gender_missing_rate = male_gender_missing_rate.isna().sum()
male_gender_missing_rate

RecordID            0
level_1             0
Time                0
ALP            201531
ALT            201441
AST            201440
Age             12005
Albumin        202384
BUN            190032
Bilirubin      201412
Cholesterol    204530
Creatinine     189962
DiasABP         89869
FiO2           172101
GCS            139478
Gender         200643
Glucose        191034
HCO3           190418
HCT            184872
HR              20625
Height          12005
ICUType        200643
K              189456
Lactate        196246
MAP             90609
MechVent       173848
Mg             190320
NIDiasABP      121470
NIMAP          122534
NISysABP       121403
Na             190484
PaCO2          179862
PaO2           179913
Platelets      189387
RespRate       160192
SaO2           196169
SysABP          89862
Temp           126293
TroponinI      204507
TroponinT      202640
Urine           64314
WBC            190966
Weight          97626
pH             178599
dtype: int64

In [6]:
ICUType_1_training_ids = train_X[train_X['ICUType'] == 1.0]
ICUType_1_training_ids = ICUType_1_training_ids[ICUType_1_training_ids["Gender"] != -1.0]
ICUType_1_training_ids = ICUType_1_training_ids[ICUType_1_training_ids["Time"] == 0.0]
ICUType_1_training_ids = ICUType_1_training_ids["RecordID"]
ICUType_1_training_missing = train_X[train_X["RecordID"].isin(ICUType_1_training_ids)]
ICUType_1_training_missing = ICUType_1_training_missing.isna().sum()
ICUType_1_training_missing

RecordID           0
level_1            0
Time               0
ALP            53387
ALT            53344
AST            53347
Age             4620
Albumin        53566
BUN            50333
Bilirubin      53356
Cholesterol    53853
Creatinine     50269
DiasABP        32057
FiO2           48323
GCS            40259
Gender         53063
Glucose        50488
HCO3           50452
HCT            49432
HR              7077
Height          4620
ICUType        53063
K              49707
Lactate        52988
MAP            32125
MechVent       48660
Mg             50339
NIDiasABP      28091
NIMAP          28231
NISysABP       28073
Na             50470
PaCO2          49680
PaO2           49677
Platelets      50248
RespRate       34966
SaO2           51520
SysABP         32055
Temp           37545
TroponinI      54015
TroponinT      53046
Urine          23519
WBC            50693
Weight         29326
pH             49591
dtype: int64

In [7]:
ICUType_2_training_ids = train_X[train_X['ICUType'] == 2.0]
ICUType_2_training_ids = ICUType_2_training_ids[ICUType_2_training_ids["Gender"] != -1.0]
ICUType_2_training_ids = ICUType_2_training_ids[ICUType_2_training_ids["Time"] == 0.0]
ICUType_2_training_ids = ICUType_2_training_ids["RecordID"]
ICUType_2_training_missing = train_X[train_X["RecordID"].isin(ICUType_2_training_ids)]
ICUType_2_training_missing = ICUType_2_training_missing.isna().sum()
ICUType_2_training_missing

RecordID           0
level_1            0
Time               0
ALP            78128
ALT            78115
AST            78116
Age             2578
Albumin        78300
BUN            73541
Bilirubin      78117
Cholesterol    78650
Creatinine     73530
DiasABP        16622
FiO2           65170
GCS            57731
Gender         77033
Glucose        74965
HCO3           73944
HCT            70130
HR              6403
Height          2578
ICUType        77033
K              74475
Lactate        75909
MAP            16530
MechVent       65510
Mg             73658
NIDiasABP      61634
NIMAP          61787
NISysABP       61596
Na             74690
PaCO2          62702
PaO2           62753
Platelets      72331
RespRate       74972
SaO2           70061
SysABP         16619
Temp           32786
TroponinI      78554
TroponinT      78457
Urine          12490
WBC            73382
Weight         37331
pH             61189
dtype: int64

In [8]:
ICUType_3_training_ids = train_X[train_X['ICUType'] == 3.0]
ICUType_3_training_ids = ICUType_3_training_ids[ICUType_3_training_ids["Gender"] != -1.0]
ICUType_3_training_ids = ICUType_3_training_ids[ICUType_3_training_ids["Time"] == 0.0]
ICUType_3_training_ids = ICUType_3_training_ids["RecordID"]
ICUType_3_training_missing = train_X[train_X["RecordID"].isin(ICUType_3_training_ids)]
ICUType_3_training_missing = ICUType_3_training_missing.isna().sum()
ICUType_3_training_missing

RecordID            0
level_1             0
Time                0
ALP            126500
ALT            126431
AST            126433
Age              9424
Albumin        127116
BUN            119457
Bilirubin      126305
Cholesterol    129126
Creatinine     119425
DiasABP         83422
FiO2           110282
GCS             94876
Gender         126571
Glucose        119484
HCO3           119433
HCT            117835
HR              13973
Height           9424
ICUType        126571
K              118675
Lactate        124276
MAP             84093
MechVent       111451
Mg             120001
NIDiasABP       55725
NIMAP           57143
NISysABP        55686
Na             119193
PaCO2          119373
PaO2           119367
Platelets      120440
RespRate        90085
SaO2           127709
SysABP          83418
Temp            92252
TroponinI      128956
TroponinT      127488
Urine           51142
WBC            120853
Weight          44950
pH             119223
dtype: int64

In [9]:
ICUType_4_training_ids = train_X[train_X['ICUType'] == 4.0]
ICUType_4_training_ids = ICUType_4_training_ids[ICUType_4_training_ids["Gender"] != -1.0]
ICUType_4_training_ids = ICUType_4_training_ids[ICUType_4_training_ids["Time"] == 0.0]
ICUType_4_training_ids = ICUType_4_training_ids["RecordID"]
ICUType_4_training_missing = train_X[train_X["RecordID"].isin(ICUType_4_training_ids)]
ICUType_4_training_missing = ICUType_4_training_missing.isna().sum()
ICUType_4_training_missing

RecordID            0
level_1             0
Time                0
ALP            103700
ALT            103674
AST            103672
Age              4828
Albumin        104102
BUN             97707
Bilirubin      103699
Cholesterol    105434
Creatinine      97697
DiasABP         35453
FiO2            85877
GCS             56744
Gender         103400
Glucose         97655
HCO3            97796
HCT             95301
HR               8732
Height           4828
ICUType        103400
K               96962
Lactate         99401
MAP             35932
MechVent        86221
Mg              97543
NIDiasABP       67989
NIMAP           68476
NISysABP        67958
Na              97240
PaCO2           93078
PaO2            93117
Platelets       97552
RespRate        79723
SaO2           103418
SysABP          35449
Temp            67556
TroponinI      105437
TroponinT      104750
Urine           25908
WBC             98050
Weight          61148
pH              92889
dtype: int64

In [10]:
more_than_or_equal_to_65_train_ids = train_X[train_X["Age"] >= 65]
more_than_or_equal_to_65_train_ids = more_than_or_equal_to_65_train_ids[more_than_or_equal_to_65_train_ids["Gender"] != -1.0]
more_than_or_equal_to_65_train_ids = more_than_or_equal_to_65_train_ids[more_than_or_equal_to_65_train_ids["Time"] == 0.0]
more_than_or_equal_to_65_train_ids = more_than_or_equal_to_65_train_ids["RecordID"]
more_than_or_equal_to_65_train_missing = train_X[train_X["RecordID"].isin(more_than_or_equal_to_65_train_ids)]
more_than_or_equal_to_65_train_missing = more_than_or_equal_to_65_train_missing.isna().sum()
more_than_or_equal_to_65_train_missing

RecordID            0
level_1             0
Time                0
ALP            197236
ALT            197187
AST            197186
Age             11030
Albumin        197671
BUN            185688
Bilirubin      197122
Cholesterol    199519
Creatinine     185618
DiasABP         90249
FiO2           168150
GCS            137528
Gender         195755
Glucose        186726
HCO3           186043
HCT            180921
HR              18747
Height          11030
ICUType        195755
K              185162
Lactate        192104
MAP             90809
MechVent       170156
Mg             185916
NIDiasABP      115711
NIMAP          116769
NISysABP       115641
Na             186207
PaCO2          176351
PaO2           176390
Platelets      185373
RespRate       150565
SaO2           190703
SysABP          90244
Temp           122184
TroponinI      199390
TroponinT      197122
Urine           57768
WBC            186708
Weight          93467
pH             175229
dtype: int64

In [11]:
less_than_65_train_ids = train_X[train_X["Age"] < 65]
less_than_65_train_ids = less_than_65_train_ids[less_than_65_train_ids["Gender"] != -1.0]
less_than_65_train_ids = less_than_65_train_ids[less_than_65_train_ids["Time"] == 0.0]
less_than_65_train_ids = less_than_65_train_ids["RecordID"]
less_than_65_train_missing = train_X[train_X["RecordID"].isin(less_than_65_train_ids)]
less_than_65_train_missing = less_than_65_train_missing.isna().sum()
less_than_65_train_missing


RecordID            0
level_1             0
Time                0
ALP            164479
ALT            164377
AST            164382
Age             10420
Albumin        165413
BUN            155350
Bilirubin      164355
Cholesterol    167544
Creatinine     155303
DiasABP         77305
FiO2           141502
GCS            112082
Gender         164312
Glucose        155866
HCO3           155582
HCT            151777
HR              17438
Height          10420
ICUType        164312
K              154657
Lactate        160470
MAP             77871
MechVent       141686
Mg             155625
NIDiasABP       97728
NIMAP           98868
NISysABP        97672
Na             155386
PaCO2          148482
PaO2           148524
Platelets      155198
RespRate       129181
SaO2           162005
SysABP          77297
Temp           107955
TroponinI      167572
TroponinT      166619
Urine           55291
WBC            156270
Weight          79288
pH             147663
dtype: int64

In [12]:
filtered_train_X = train_X[(train_X['Height'] != -1) & (train_X['Weight'] != -1) & (train_X['Height'].notna()) & (train_X['Weight'].notna())] 

In [13]:
def classify_BMI(BMI):
    if BMI <= 18.5:
        return "Baixo peso"
    elif BMI >= 18.6 and BMI <= 24.9:
        return "Peso normal"
    elif BMI >= 25 and BMI <= 29.9:
        return "Sobrepeso"
    elif BMI >= 30 and BMI <= 34.9:
        return "Obesidade grau 1"
    elif BMI >= 35 and BMI <= 39.9:
        return "Obesidade grau 2"
    elif BMI >= 40:
        return "Obesidade grau 3"

In [14]:
filtered_train_X_metros = filtered_train_X.copy()
filtered_train_X_metros["Height"] = filtered_train_X["Height"]/100
filtered_train_X_metros["Height"]

48        1.753
67        1.753
68        1.753
69        1.753
70        1.753
          ...  
575321    1.727
575322    1.727
575323    1.727
575325    1.727
575327    1.727
Name: Height, Length: 102487, dtype: float64

In [15]:
bmi_data_train = filtered_train_X_metros
bmi_data_train["BMI"] = round(filtered_train_X_metros["Weight"] / (filtered_train_X_metros["Height"]**2), 1)
bmi_data_train["Classificacao"] = bmi_data_train["BMI"].apply(classify_BMI)
bmi_data_train.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
48,132540,0,0.0,,,,76.0,,,,...,,,,,,,76.0,7.45,24.7,Peso normal
67,132540,19,19.0,,,,76.0,,,,...,122.0,37.5,,,50.0,,80.6,,26.2,Sobrepeso
68,132540,20,20.0,,,,76.0,,,,...,107.0,37.4,,,380.0,,80.6,,26.2,Sobrepeso
69,132540,21,21.0,,,,76.0,,,,...,121.0,37.5,,,170.0,,80.6,,26.2,Sobrepeso
70,132540,22,22.0,,,,76.0,,,,...,128.0,37.5,,,130.0,,80.6,,26.2,Sobrepeso


In [16]:
bmi_data_train = bmi_data_train.groupby("RecordID").first().reset_index()
bmi_data_train

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132540,0,0.0,,,,76.0,,21.0,,...,122.00,37.50,,,50.0,13.3,76.0,7.45,24.7,Peso normal
1,132547,0,0.0,,,,64.0,,,,...,,,,,,,114.0,,35.1,Obesidade grau 2
2,132551,0,0.0,47.0,46.0,82.0,78.0,1.9,81.0,0.3,...,102.75,38.00,3.5,,120.0,16.1,48.4,7.40,18.3,Baixo peso
3,132555,0,0.0,,,,74.0,,19.0,,...,98.00,34.80,,,35.0,9.0,66.1,7.39,21.5,Peso normal
4,132567,0,0.0,,,,71.0,,9.0,,...,111.50,35.60,,,15.0,9.0,56.0,7.44,22.6,Peso normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4015,163007,0,0.0,42.0,30.0,40.0,19.0,2.8,16.0,0.6,...,0.00,40.55,,1.00,150.0,14.1,114.3,7.36,34.2,Obesidade grau 1
4016,163013,0,0.0,82.0,11.0,30.0,74.0,2.5,30.0,1.2,...,118.00,36.50,,0.03,40.0,9.6,68.6,7.35,29.5,Sobrepeso
4017,163016,0,0.0,,27.0,120.0,65.0,,29.0,0.4,...,101.00,38.10,,,75.0,8.0,63.6,7.37,24.8,Peso normal
4018,163029,0,0.0,,,,61.0,,,,...,,,,,,,85.0,,28.5,Sobrepeso


In [17]:
bmi_data_train  = bmi_data_train[bmi_data_train["Gender"] != -1.0]
bmi_data_train['RecordID'].count()

4018

In [18]:
bmi_data_train["Classificacao"].value_counts()

Classificacao
Sobrepeso           1383
Peso normal         1191
Obesidade grau 1     740
Obesidade grau 2     289
Obesidade grau 3     286
Baixo peso           129
Name: count, dtype: int64

In [33]:
classificacao_baixo_peso_ids = bmi_data_train[bmi_data_train["Classificacao"] == "Baixo peso"]
#classificacao_baixo_peso_ids = classificacao_baixo_peso_ids[classificacao_baixo_peso_ids["Time"] == 0.0]
classificacao_baixo_peso_ids = classificacao_baixo_peso_ids["RecordID"]
classificacao_baixo_peso_missing = train_X[train_X["RecordID"].isin(classificacao_baixo_peso_ids)]
classificacao_baixo_peso_missing = classificacao_baixo_peso_missing.isna().sum()
classificacao_baixo_peso_missing

RecordID          0
level_1           0
Time              0
ALP            6100
ALT            6098
AST            6098
Age             231
Albumin        6113
BUN            5717
Bilirubin      6098
Cholesterol    6180
Creatinine     5715
DiasABP        2050
FiO2           5182
GCS            4191
Gender         6063
Glucose        5734
HCO3           5726
HCT            5554
HR              465
Height          231
ICUType        6063
K              5694
Lactate        5852
MAP            2025
MechVent       5111
Mg             5706
NIDiasABP      4122
NIMAP          4163
NISysABP       4121
Na             5735
PaCO2          5327
PaO2           5323
Platelets      5712
RespRate       5161
SaO2           5838
SysABP         2049
Temp           3555
TroponinI      6169
TroponinT      6130
Urine          1652
WBC            5756
Weight         2939
pH             5283
dtype: int64

In [34]:
teste = classificacao_baixo_peso_ids.unique()
teste.size

129

In [35]:
classificacao_normal_peso_ids = bmi_data_train[bmi_data_train["Classificacao"] == "Peso normal"]
#classificacao_normal_peso_ids = classificacao_normal_peso_ids[classificacao_normal_peso_ids["Time"] == 0.0]
classificacao_normal_peso_ids = classificacao_normal_peso_ids["RecordID"]
classificacao_normal_peso_missing = train_X[train_X["RecordID"].isin(classificacao_normal_peso_ids)]
classificacao_normal_peso_missing = classificacao_normal_peso_missing.isna().sum()
classificacao_normal_peso_missing

RecordID           0
level_1            0
Time               0
ALP            56255
ALT            56235
AST            56236
Age             2320
Albumin        56450
BUN            52967
Bilirubin      56220
Cholesterol    57060
Creatinine     52946
DiasABP        19108
FiO2           47448
GCS            38921
Gender         55977
Glucose        53334
HCO3           53100
HCT            51299
HR              4737
Height          2320
ICUType        55977
K              52866
Lactate        54473
MAP            19326
MechVent       47520
Mg             52993
NIDiasABP      38435
NIMAP          38598
NISysABP       38417
Na             53186
PaCO2          48680
PaO2           48711
Platelets      52520
RespRate       48398
SaO2           53646
SysABP         19107
Temp           31120
TroponinI      57024
TroponinT      56614
Urine          15128
WBC            53113
Weight         27376
pH             48158
dtype: int64

In [36]:
teste = classificacao_normal_peso_ids.unique()
teste.size

1191

In [37]:
classificacao_sobrepeso_ids = bmi_data_train[bmi_data_train["Classificacao"] == "Sobrepeso"]
#classificacao_sobrepeso_ids = classificacao_sobrepeso_ids[classificacao_sobrepeso_ids["Time"] == 0.0]
classificacao_sobrepeso_ids = classificacao_sobrepeso_ids["RecordID"]
classificacao_sobrepeso_missing = train_X[train_X["RecordID"].isin(classificacao_sobrepeso_ids)]
classificacao_sobrepeso_missing = classificacao_sobrepeso_missing.isna().sum()
classificacao_sobrepeso_missing

RecordID           0
level_1            0
Time               0
ALP            65300
ALT            65277
AST            65282
Age             2926
Albumin        65605
BUN            61418
Bilirubin      65284
Cholesterol    66251
Creatinine     61392
DiasABP        21082
FiO2           55045
GCS            45929
Gender         65001
Glucose        62035
HCO3           61619
HCT            59356
HR              5913
Height          2926
ICUType        65001
K              61503
Lactate        63317
MAP            21170
MechVent       55264
Mg             61467
NIDiasABP      45500
NIMAP          45744
NISysABP       45471
Na             61868
PaCO2          55856
PaO2           55880
Platelets      60897
RespRate       56426
SaO2           61721
SysABP         21077
Temp           34535
TroponinI      66232
TroponinT      65762
Urine          16765
WBC            61586
Weight         31974
pH             55225
dtype: int64

In [38]:
teste = classificacao_sobrepeso_ids.unique()
teste.size

1383

In [39]:
classificacao_obesidade_1_ids = bmi_data_train[bmi_data_train["Classificacao"] == "Obesidade grau 1"]
#classificacao_obesidade_1_ids = classificacao_obesidade_1_ids[classificacao_obesidade_1_ids["Time"] == 0.0]
classificacao_obesidade_1_ids = classificacao_obesidade_1_ids["RecordID"]
classificacao_obesidade_1_missing = train_X[train_X["RecordID"].isin(classificacao_obesidade_1_ids)]
classificacao_obesidade_1_missing = classificacao_obesidade_1_missing.isna().sum()
classificacao_obesidade_1_missing

RecordID           0
level_1            0
Time               0
ALP            34928
ALT            34917
AST            34918
Age             1337
Albumin        35089
BUN            32842
Bilirubin      34903
Cholesterol    35467
Creatinine     32832
DiasABP        10914
FiO2           29067
GCS            24515
Gender         34780
Glucose        33151
HCO3           32960
HCT            31767
HR              2856
Height          1337
ICUType        34780
K              32836
Lactate        33841
MAP            11048
MechVent       29177
Mg             32910
NIDiasABP      24828
NIMAP          24955
NISysABP       24818
Na             33051
PaCO2          29656
PaO2           29659
Platelets      32626
RespRate       30675
SaO2           33076
SysABP         10914
Temp           18424
TroponinI      35428
TroponinT      35157
Urine           8267
WBC            32975
Weight         16579
pH             29331
dtype: int64

In [40]:
teste = classificacao_obesidade_1_ids.unique()
teste.size

740

In [41]:
classificacao_obesidade_2_ids = bmi_data_train[bmi_data_train["Classificacao"] == "Obesidade grau 2"]
#classificacao_obesidade_2_ids = classificacao_obesidade_2_ids[classificacao_obesidade_2_ids["Time"] == 0.0]
classificacao_obesidade_2_ids = classificacao_obesidade_2_ids["RecordID"]
classificacao_obesidade_2_missing = train_X[train_X["RecordID"].isin(classificacao_obesidade_2_ids)]
classificacao_obesidade_2_missing = classificacao_obesidade_2_missing.isna().sum()
classificacao_obesidade_2_missing

RecordID           0
level_1            0
Time               0
ALP            13651
ALT            13651
AST            13649
Age              640
Albumin        13723
BUN            12826
Bilirubin      13646
Cholesterol    13848
Creatinine     12820
DiasABP         4405
FiO2           11342
GCS             9701
Gender         13583
Glucose        12958
HCO3           12862
HCT            12387
HR              1237
Height           640
ICUType        13583
K              12850
Lactate        13196
MAP             4402
MechVent       11446
Mg             12852
NIDiasABP       9601
NIMAP           9668
NISysABP        9597
Na             12908
PaCO2          11495
PaO2           11502
Platelets      12701
RespRate       11858
SaO2           12775
SysABP          4404
Temp            7138
TroponinI      13843
TroponinT      13738
Urine           3575
WBC            12837
Weight          6068
pH             11397
dtype: int64

In [42]:
teste = classificacao_obesidade_2_ids.unique()
teste.size

289

In [43]:
classificacao_obesidade_3_ids = bmi_data_train[bmi_data_train["Classificacao"] == "Obesidade grau 3"]
#classificacao_obesidade_3_ids = classificacao_obesidade_3_ids[classificacao_obesidade_3_ids["Time"] == 0.0]
classificacao_obesidade_3_ids = classificacao_obesidade_3_ids["RecordID"]
classificacao_obesidade_3_missing = train_X[train_X["RecordID"].isin(classificacao_obesidade_3_ids)]
classificacao_obesidade_3_missing = classificacao_obesidade_3_missing.isna().sum()
classificacao_obesidade_3_missing

RecordID           0
level_1            0
Time               0
ALP            13494
ALT            13490
AST            13491
Age              498
Albumin        13569
BUN            12702
Bilirubin      13483
Cholesterol    13707
Creatinine     12703
DiasABP         4764
FiO2           10994
GCS             9771
Gender         13442
Glucose        12803
HCO3           12725
HCT            12435
HR              1062
Height           498
ICUType        13442
K              12690
Lactate        12963
MAP             4758
MechVent       11034
Mg             12719
NIDiasABP       9392
NIMAP           9453
NISysABP        9386
Na             12769
PaCO2          11456
PaO2           11456
Platelets      12711
RespRate       11849
SaO2           12818
SysABP          4764
Temp            7767
TroponinI      13702
TroponinT      13572
Urine           3495
WBC            12798
Weight          5523
pH             11376
dtype: int64

In [44]:
teste = classificacao_obesidade_3_ids.unique()
teste.size

286

In [31]:
df_columns = train_X.columns
df_columns

Index(['RecordID', 'level_1', 'Time', 'ALP', 'ALT', 'AST', 'Age', 'Albumin',
       'BUN', 'Bilirubin', 'Cholesterol', 'Creatinine', 'DiasABP', 'FiO2',
       'GCS', 'Gender', 'Glucose', 'HCO3', 'HCT', 'HR', 'Height', 'ICUType',
       'K', 'Lactate', 'MAP', 'MechVent', 'Mg', 'NIDiasABP', 'NIMAP',
       'NISysABP', 'Na', 'PaCO2', 'PaO2', 'Platelets', 'RespRate', 'SaO2',
       'SysABP', 'Temp', 'TroponinI', 'TroponinT', 'Urine', 'WBC', 'Weight',
       'pH'],
      dtype='object')

In [32]:
df_missing = pd.DataFrame(columns=df_columns)
df_missing_transpose = df_missing.T
df_missing_transpose["Female"] = female_gender_missing_rate
df_missing_transpose["Male"] = male_gender_missing_rate
df_missing_transpose["ICUType 1"] = ICUType_1_training_missing
df_missing_transpose["ICUType 2"] = ICUType_2_training_missing
df_missing_transpose["ICUType 3"] = ICUType_3_training_missing
df_missing_transpose["ICUType 4"] = ICUType_4_training_missing
df_missing_transpose["Age 65+"] = more_than_or_equal_to_65_train_missing
df_missing_transpose["Age 65-"] = less_than_65_train_missing
df_missing_transpose["Low Weight"] = classificacao_baixo_peso_missing
df_missing_transpose["Normal Weight"] = classificacao_normal_peso_missing
df_missing_transpose["Overweight"] = classificacao_sobrepeso_missing
df_missing_transpose["Obesity Grade 1"] = classificacao_obesidade_1_missing
df_missing_transpose["Obesity Grade 2"] = classificacao_obesidade_2_missing
df_missing_transpose["Obesity Grade 3"] = classificacao_obesidade_3_missing
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>Original Missing Rate per Variable by demographics - Train</h2>"))
df_missing_transpose

Unnamed: 0,Female,Male,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3
RecordID,0,0,0,0,0,0,0,0,0,0,0,0,0,0
level_1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Time,0,0,0,0,0,0,0,0,0,0,0,0,0,0
ALP,160184,201531,53387,78128,126500,103700,197236,164479,6100,56255,65300,34928,13651,13494
ALT,160123,201441,53344,78115,126431,103674,197187,164377,6098,56235,65277,34917,13651,13490
AST,160128,201440,53347,78116,126433,103672,197186,164382,6098,56236,65282,34918,13649,13491
Age,9445,12005,4620,2578,9424,4828,11030,10420,231,2320,2926,1337,640,498
Albumin,160700,202384,53566,78300,127116,104102,197671,165413,6113,56450,65605,35089,13723,13569
BUN,151006,190032,50333,73541,119457,97707,185688,155350,5717,52967,61418,32842,12826,12702
Bilirubin,160065,201412,53356,78117,126305,103699,197122,164355,6098,56220,65284,34903,13646,13483


<h3>Validation data</h3>

In [74]:
validation_X = physionet2012_dataset['val_X']

In [112]:
female_gender_validation_ids = validation_X[validation_X["Gender"] == 0.0]
female_gender_validation_ids = female_gender_validation_ids["RecordID"]
female_gender_missing_rate_validation = validation_X[validation_X["RecordID"].isin(female_gender_validation_ids)]
female_gender_missing_rate_validation = female_gender_missing_rate_validation.isna().sum()
female_gender_missing_rate_validation

RecordID           0
level_1            0
Time               0
ALP            39248
ALT            39229
AST            39230
Age             2407
Albumin        39382
BUN            37020
Bilirubin      39213
Cholesterol    39824
Creatinine     37012
DiasABP        19563
FiO2           33762
GCS            27136
Gender         39057
Glucose        37162
HCO3           37057
HCT            36148
HR              3943
Height          2407
ICUType        39057
K              36840
Lactate        38200
MAP            19682
MechVent       33870
Mg             37064
NIDiasABP      22234
NIMAP          22542
NISysABP       22222
Na             37060
PaCO2          35580
PaO2           35595
Platelets      36965
RespRate       28957
SaO2           38373
SysABP         19560
Temp           25522
TroponinI      39814
TroponinT      39429
Urine          12187
WBC            37192
Weight         18270
pH             35434
dtype: int64

In [113]:
male_gender_validation_ids = validation_X[validation_X["Gender"] == 1.0]
male_gender_validation_ids = male_gender_validation_ids["RecordID"]
male_gender_missing_rate_validation = validation_X[validation_X["RecordID"].isin(male_gender_validation_ids)]
male_gender_missing_rate_validation = male_gender_missing_rate_validation.isna().sum()
male_gender_missing_rate_validation

RecordID           0
level_1            0
Time               0
ALP            51264
ALT            51247
AST            51247
Age             3026
Albumin        51476
BUN            48351
Bilirubin      51232
Cholesterol    52025
Creatinine     48336
DiasABP        23065
FiO2           43622
GCS            35945
Gender         51042
Glucose        48614
HCO3           48469
HCT            47029
HR              5164
Height          3026
ICUType        51042
K              48201
Lactate        49919
MAP            23265
MechVent       44162
Mg             48457
NIDiasABP      30742
NIMAP          30962
NISysABP       30724
Na             48465
PaCO2          45890
PaO2           45906
Platelets      48143
RespRate       41166
SaO2           49895
SysABP         23064
Temp           32428
TroponinI      52035
TroponinT      51535
Urine          16665
WBC            48546
Weight         24538
pH             45579
dtype: int64

In [115]:
ICUType_1_validation_ids = validation_X[validation_X["ICUType"] == 1.0]
ICUType_1_validation_ids = ICUType_1_validation_ids[ICUType_1_validation_ids["Gender"] != -1.0]
ICUType_1_validation_ids = ICUType_1_validation_ids[ICUType_1_validation_ids["Time"] == 0.0]
ICUType_1_validation_ids = ICUType_1_validation_ids["RecordID"]
ICUType_1_validation_missing = validation_X[validation_X["RecordID"].isin(ICUType_1_validation_ids)]
ICUType_1_validation_missing = ICUType_1_validation_missing.isna().sum()
ICUType_1_validation_missing

RecordID           0
level_1            0
Time               0
ALP            13976
ALT            13968
AST            13967
Age             1163
Albumin        14034
BUN            13173
Bilirubin      13968
Cholesterol    14125
Creatinine     13159
DiasABP         8059
FiO2           12532
GCS            10587
Gender         13912
Glucose        13212
HCO3           13205
HCT            12958
HR              1851
Height          1163
ICUType        13912
K              12995
Lactate        13827
MAP             8086
MechVent       12616
Mg             13163
NIDiasABP       7655
NIMAP           7672
NISysABP        7648
Na             13213
PaCO2          12955
PaO2           12958
Platelets      13158
RespRate        9900
SaO2           13461
SysABP          8058
Temp            9830
TroponinI      14171
TroponinT      13871
Urine           6210
WBC            13283
Weight          8046
pH             12916
dtype: int64

In [116]:
ICUType_2_validation_ids = validation_X[validation_X["ICUType"] == 2.0]
ICUType_2_validation_ids = ICUType_2_validation_ids[ICUType_2_validation_ids["Gender"] != -1.0]
ICUType_2_validation_ids = ICUType_2_validation_ids[ICUType_2_validation_ids["Time"] == 0.0]
ICUType_2_validation_ids = ICUType_2_validation_ids["RecordID"]
ICUType_2_validation_missing = validation_X[validation_X["RecordID"].isin(ICUType_2_validation_ids)]
ICUType_2_validation_missing = ICUType_2_validation_missing.isna().sum()
ICUType_2_validation_missing

RecordID           0
level_1            0
Time               0
ALP            19323
ALT            19320
AST            19320
Age              742
Albumin        19375
BUN            18201
Bilirubin      19321
Cholesterol    19479
Creatinine     18197
DiasABP         4403
FiO2           16036
GCS            14306
Gender         19082
Glucose        18577
HCO3           18297
HCT            17307
HR              1739
Height           742
ICUType        19082
K              18430
Lactate        18718
MAP             4403
MechVent       16197
Mg             18311
NIDiasABP      15177
NIMAP          15206
NISysABP       15171
Na             18483
PaCO2          15621
PaO2           15639
Platelets      17867
RespRate       18649
SaO2           17410
SysABP          4402
Temp            8332
TroponinI      19471
TroponinT      19447
Urine           3334
WBC            18124
Weight          9512
pH             15267
dtype: int64

In [117]:
ICUType_3_validation_ids = validation_X[validation_X["ICUType"] == 3.0]
ICUType_3_validation_ids = ICUType_3_validation_ids[ICUType_3_validation_ids["Gender"] != -1.0]
ICUType_3_validation_ids = ICUType_3_validation_ids[ICUType_3_validation_ids["Time"] == 0.0]
ICUType_3_validation_ids = ICUType_3_validation_ids["RecordID"]
ICUType_3_validation_missing = validation_X[validation_X["RecordID"].isin(ICUType_3_validation_ids)]
ICUType_3_validation_missing = ICUType_3_validation_missing.isna().sum()
ICUType_3_validation_missing

RecordID           0
level_1            0
Time               0
ALP            32026
ALT            32012
AST            32013
Age             2537
Albumin        32150
BUN            30264
Bilirubin      31971
Cholesterol    32604
Creatinine     30259
DiasABP        21425
FiO2           27847
GCS            23874
Gender         31960
Glucose        30258
HCO3           30263
HCT            29764
HR              3661
Height          2537
ICUType        31960
K              30066
Lactate        31386
MAP            21596
MechVent       28247
Mg             30376
NIDiasABP      14230
NIMAP          14571
NISysABP       14224
Na             30195
PaCO2          30147
PaO2           30149
Platelets      30405
RespRate       22147
SaO2           32224
SysABP         21424
Temp           23375
TroponinI      32573
TroponinT      32197
Urine          13144
WBC            30527
Weight         11190
pH             30119
dtype: int64

In [118]:
ICUType_4_validation_ids = validation_X[validation_X["ICUType"] == 4.0]
ICUType_4_validation_ids = ICUType_4_validation_ids[ICUType_4_validation_ids["Gender"] != -1.0]
ICUType_4_validation_ids = ICUType_4_validation_ids[ICUType_4_validation_ids["Time"] == 0.0]
ICUType_4_validation_ids = ICUType_4_validation_ids["RecordID"]
ICUType_4_validation_missing = validation_X[validation_X["RecordID"].isin(ICUType_4_validation_ids)]
ICUType_4_validation_missing = ICUType_4_validation_missing.isna().sum()
ICUType_4_validation_missing

RecordID           0
level_1            0
Time               0
ALP            25187
ALT            25176
AST            25177
Age              991
Albumin        25299
BUN            23733
Bilirubin      25185
Cholesterol    25641
Creatinine     23733
DiasABP         8741
FiO2           20969
GCS            14314
Gender         25145
Glucose        23729
HCO3           23761
HCT            23148
HR              1856
Height           991
ICUType        25145
K              23550
Lactate        24188
MAP             8862
MechVent       20972
Mg             23671
NIDiasABP      15914
NIMAP          16055
NISysABP       15903
Na             23634
PaCO2          22747
PaO2           22755
Platelets      23678
RespRate       19427
SaO2           25173
SysABP          8740
Temp           16413
TroponinI      25634
TroponinT      25449
Urine           6164
WBC            23804
Weight         14060
pH             22711
dtype: int64

In [120]:
more_than_or_equal_to_65_validation_ids = validation_X[validation_X["Age"] >= 65]
more_than_or_equal_to_65_validation_ids = more_than_or_equal_to_65_validation_ids[more_than_or_equal_to_65_validation_ids["Gender"] != -1]
more_than_or_equal_to_65_validation_ids = more_than_or_equal_to_65_validation_ids[more_than_or_equal_to_65_validation_ids["Time"] == 0.0]
more_than_or_equal_to_65_validation_ids = more_than_or_equal_to_65_validation_ids["RecordID"]
more_than_or_equal_to_65_validation_missing = validation_X[validation_X["RecordID"].isin(more_than_or_equal_to_65_validation_ids)]
more_than_or_equal_to_65_validation_missing = more_than_or_equal_to_65_validation_missing.isna().sum()
more_than_or_equal_to_65_validation_missing

RecordID           0
level_1            0
Time               0
ALP            48272
ALT            48260
AST            48259
Age             2815
Albumin        48400
BUN            45450
Bilirubin      48246
Cholesterol    48867
Creatinine     45438
DiasABP        22281
FiO2           40938
GCS            34356
Gender         47940
Glucose        45738
HCO3           45539
HCT            44236
HR              4722
Height          2815
ICUType        47940
K              45327
Lactate        46827
MAP            22409
MechVent       41473
Mg             45522
NIDiasABP      28336
NIMAP          28626
NISysABP       28322
Na             45601
PaCO2          43183
PaO2           43199
Platelets      45328
RespRate       37398
SaO2           46712
SysABP         22278
Temp           30042
TroponinI      48843
TroponinT      48263
Urine          14465
WBC            45653
Weight         22326
pH             42937
dtype: int64

In [121]:
less_than_65_validation_ids = validation_X[validation_X["Age"] < 65]
less_than_65_validation_ids = less_than_65_validation_ids[less_than_65_validation_ids["Gender"] != -1.0]
less_than_65_validation_ids = less_than_65_validation_ids[less_than_65_validation_ids["Time"] == 0.0]
less_than_65_validation_ids = less_than_65_validation_ids["RecordID"]
less_than_65_validation_missing = validation_X[validation_X["RecordID"].isin(less_than_65_validation_ids)]
less_than_65_validation_missing = less_than_65_validation_missing.isna().sum()
less_than_65_validation_missing

RecordID           0
level_1            0
Time               0
ALP            42240
ALT            42216
AST            42218
Age             2618
Albumin        42458
BUN            39921
Bilirubin      42199
Cholesterol    42982
Creatinine     39910
DiasABP        20347
FiO2           36446
GCS            28725
Gender         42159
Glucose        40038
HCO3           39987
HCT            38941
HR              4385
Height          2618
ICUType        42159
K              39714
Lactate        41292
MAP            20538
MechVent       36559
Mg             39999
NIDiasABP      24640
NIMAP          24878
NISysABP       24624
Na             39924
PaCO2          38287
PaO2           38302
Platelets      39780
RespRate       32725
SaO2           41556
SysABP         20346
Temp           27908
TroponinI      43006
TroponinT      42701
Urine          14387
WBC            40085
Weight         20482
pH             38076
dtype: int64

In [83]:
filtered_validation_X = validation_X[(validation_X['Height'] != -1) & (validation_X['Weight'] != -1) & (validation_X['Height'].notna()) & (validation_X['Weight'].notna())] 

In [84]:
filtered_validation_X_metros = filtered_validation_X.copy()
filtered_validation_X_metros["Height"] = filtered_validation_X["Height"]/100
filtered_validation_X_metros["Height"]

960       1.549
2688      1.676
2736      1.803
2754      1.803
2755      1.803
          ...  
574988    1.524
574989    1.524
574990    1.524
574991    1.524
575088    1.727
Name: Height, Length: 25843, dtype: float64

In [85]:
bmi_data_validation = filtered_validation_X_metros
bmi_data_validation["BMI"] = filtered_validation_X_metros["Weight"] / (filtered_validation_X_metros["Height"]**2)
bmi_data_validation["Classificacao"] = bmi_data_validation["BMI"].apply(classify_BMI)
bmi_data_validation.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
960,132588,0,0.0,,,,48.0,,,,...,,,,,,,42.3,,17.6294,Baixo peso
2688,132666,0,0.0,,,,53.0,,81.0,,...,,,,,,,62.0,,22.0721,Peso normal
2736,132669,0,0.0,,,,74.0,,,,...,,,,,,,81.8,,25.162967,Sobrepeso
2754,132669,18,18.0,,,,74.0,,,,...,87.0,37.0,,,60.0,,91.5,,28.14684,Sobrepeso
2755,132669,19,19.0,,,,74.0,,10.0,,...,95.666667,37.1,,,23.0,8.6,91.5,7.41,28.14684,Sobrepeso


In [None]:
classificacao_baixo_peso_ids_validation = bmi_data_validation[bmi_data_validation["Classificacao"] == "Baixo peso"]
classificacao_baixo_peso_ids

RecordID           0
level_1            0
Time               0
ALP              560
ALT              560
AST              560
Age                0
Albumin          567
BUN              530
Bilirubin        560
Cholesterol      573
Creatinine       529
DiasABP          194
FiO2             457
GCS              352
Gender           548
Glucose          530
HCO3             530
HCT              524
HR                12
Height             0
ICUType          548
K                528
Lactate          535
MAP              195
MechVent         461
Mg               532
NIDiasABP        337
NIMAP            337
NISysABP         336
Na               530
PaCO2            516
PaO2             517
Platelets        533
RespRate         417
SaO2             559
SysABP           194
Temp             378
TroponinI        568
TroponinT        558
Urine            200
WBC              539
Weight             0
pH               514
BMI                0
Classificacao      0
dtype: int64

In [87]:
classificacao_peso_normal_validation = bmi_data_validation[bmi_data_validation["Classificacao"] == "Peso normal"]
classificacao_peso_normal_missing_validation = classificacao_peso_normal_validation.isna().sum()
classificacao_peso_normal_missing_validation

RecordID            0
level_1             0
Time                0
ALP              6077
ALT              6074
AST              6075
Age                 0
Albumin          6092
BUN              5735
Bilirubin        6077
Cholesterol      6174
Creatinine       5734
DiasABP          2440
FiO2             5121
GCS              4108
Gender           5877
Glucose          5764
HCO3             5745
HCT              5559
HR                156
Height              0
ICUType          5877
K                5707
Lactate          5906
MAP              2484
MechVent         5158
Mg               5748
NIDiasABP        3455
NIMAP            3491
NISysABP         3453
Na               5746
PaCO2            5430
PaO2             5431
Platelets        5686
RespRate         4927
SaO2             5858
SysABP           2440
Temp             3498
TroponinI        6174
TroponinT        6107
Urine            1761
WBC              5732
Weight              0
pH               5382
BMI                 0
Classifica

In [88]:
classificacao_sobrepeso_validation = bmi_data_validation[bmi_data_validation["Classificacao"] == "Sobrepeso"]
classificacao_sobrepeso_missing_validation = classificacao_sobrepeso_validation.isna().sum()
classificacao_sobrepeso_missing_validation

RecordID            0
level_1             0
Time                0
ALP              8451
ALT              8449
AST              8450
Age                 0
Albumin          8510
BUN              7971
Bilirubin        8447
Cholesterol      8618
Creatinine       7969
DiasABP          2570
FiO2             7018
GCS              6018
Gender           8279
Glucose          8034
HCO3             7995
HCT              7739
HR                189
Height              0
ICUType          8279
K                7954
Lactate          8195
MAP              2570
MechVent         7118
Mg               7974
NIDiasABP        5520
NIMAP            5564
NISysABP         5514
Na               8017
PaCO2            7429
PaO2             7431
Platelets        7942
RespRate         7319
SaO2             8034
SysABP           2569
Temp             4472
TroponinI        8607
TroponinT        8531
Urine            1940
WBC              8007
Weight              0
pH               7355
BMI                 0
Classifica

In [89]:
classificacao_obesidade_grau_1_validation = bmi_data_validation[bmi_data_validation["Classificacao"] == "Obesidade grau 1"]
classificacao_obesidade_grau_1_missing_validation = classificacao_obesidade_grau_1_validation.isna().sum()
classificacao_obesidade_grau_1_missing_validation

RecordID            0
level_1             0
Time                0
ALP              5898
ALT              5899
AST              5898
Age                 0
Albumin          5928
BUN              5582
Bilirubin        5896
Cholesterol      5980
Creatinine       5579
DiasABP          1849
FiO2             4926
GCS              4182
Gender           5797
Glucose          5623
HCO3             5609
HCT              5401
HR                101
Height              0
ICUType          5797
K                5572
Lactate          5740
MAP              1844
MechVent         4966
Mg               5554
NIDiasABP        3796
NIMAP            3805
NISysABP         3795
Na               5605
PaCO2            5187
PaO2             5189
Platelets        5563
RespRate         5275
SaO2             5635
SysABP           1849
Temp             2972
TroponinI        5970
TroponinT        5926
Urine            1137
WBC              5599
Weight              0
pH               5120
BMI                 0
Classifica

In [90]:
classificacao_obesidade_grau_2_validation = bmi_data_validation[bmi_data_validation["Classificacao"] == "Obesidade grau 2"]
classificacao_obesidade_grau_2_missing_validation = classificacao_obesidade_grau_2_validation.isna().sum()
classificacao_obesidade_grau_2_missing_validation

RecordID            0
level_1             0
Time                0
ALP              2096
ALT              2096
AST              2096
Age                 0
Albumin          2096
BUN              1988
Bilirubin        2097
Cholesterol      2118
Creatinine       1986
DiasABP           581
FiO2             1824
GCS              1531
Gender           2057
Glucose          1994
HCO3             1993
HCT              1955
HR                 33
Height              0
ICUType          2057
K                1978
Lactate          2047
MAP               585
MechVent         1846
Mg               1987
NIDiasABP        1331
NIMAP            1352
NISysABP         1331
Na               1995
PaCO2            1849
PaO2             1848
Platelets        1990
RespRate         1892
SaO2             1931
SysABP            581
Temp             1097
TroponinI        2119
TroponinT        2110
Urine             284
WBC              1997
Weight              0
pH               1805
BMI                 0
Classifica

In [91]:
classificacao_obesidade_grau_3_validation = bmi_data_validation[bmi_data_validation["Classificacao"] == "Obesidade grau 3"]
classificacao_obesidade_grau_3_missing_validation = classificacao_obesidade_grau_3_validation.isna().sum()
classificacao_obesidade_grau_3_missing_validation

RecordID            0
level_1             0
Time                0
ALP              1921
ALT              1920
AST              1920
Age                 0
Albumin          1925
BUN              1807
Bilirubin        1915
Cholesterol      1946
Creatinine       1807
DiasABP           569
FiO2             1599
GCS              1360
Gender           1890
Glucose          1820
HCO3             1812
HCT              1762
HR                 35
Height              0
ICUType          1890
K                1798
Lactate          1825
MAP               581
MechVent         1615
Mg               1805
NIDiasABP        1311
NIMAP            1332
NISysABP         1309
Na               1813
PaCO2            1681
PaO2             1681
Platelets        1815
RespRate         1623
SaO2             1858
SysABP            569
Temp             1032
TroponinI        1945
TroponinT        1918
Urine             361
WBC              1829
Weight              0
pH               1666
BMI                 0
Classifica

In [92]:
df_missing_validation = pd.DataFrame(columns=df_columns)
df_missing_transpose_validation = df_missing_validation.T
df_missing_transpose_validation["Female"] = female_gender_missing_rate_validation
df_missing_transpose_validation["Male"] = male_gender_missing_rate_validation
df_missing_transpose_validation["ICUType 1"] = ICUType_1_validation_missing
df_missing_transpose_validation["ICUType 2"] = ICUType_2_validation_missing
df_missing_transpose_validation["ICUType 3"] = ICUType_3_validation_missing
df_missing_transpose_validation["ICUType 4"] = ICUType_4_validation_missing
df_missing_transpose_validation["Age 65+"] = more_than_or_equal_to_65_validation_missing
df_missing_transpose_validation["Age 65-"] = less_than_65_validation_missing
df_missing_transpose_validation["Low Weight"] = classificacao_baixo_peso_missing_validation
df_missing_transpose_validation["Normal Weight"] = classificacao_peso_normal_missing_validation
df_missing_transpose_validation["Overweight"] = classificacao_sobrepeso_missing_validation
df_missing_transpose_validation["Obesity Grade 1"] = classificacao_obesidade_grau_1_missing_validation
df_missing_transpose_validation["Obesity Grade 2"] = classificacao_obesidade_grau_2_missing_validation
df_missing_transpose_validation["Obesity Grade 3"] = classificacao_obesidade_grau_3_missing_validation
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>Original Missing Rate per Variable by demographcs - Validation</h2>"))
df_missing_transpose_validation

Unnamed: 0,Female,Male,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3
RecordID,0,0,0,0,0,0,0,0,0,0,0,0,0,0
level_1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Time,0,0,0,0,0,0,0,0,0,0,0,0,0,0
ALP,793,1040,273,401,646,514,45457,39624,560,6077,8451,5898,2096,1921
ALT,792,1040,273,401,645,514,45445,39600,560,6074,8449,5899,2096,1920
AST,793,1040,274,401,645,514,45444,39602,560,6075,8450,5898,2096,1920
Age,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Albumin,796,1046,278,401,648,516,45585,39842,567,6092,8510,5928,2096,1925
BUN,728,958,244,386,608,449,42635,37305,530,5735,7971,5582,1988,1807
Bilirubin,792,1041,272,402,646,514,45431,39583,560,6077,8447,5896,2097,1915


<h3>Test data</h3>

In [93]:
test_X = physionet2012_dataset['test_X']

In [94]:
female_gender_distribution_test = test_X[test_X['Gender'] == 0.0]
female_gender_missing_rate_test = female_gender_distribution_test.isna().sum()
female_gender_missing_rate_test

RecordID          0
level_1           0
Time              0
ALP             981
ALT             978
AST             979
Age               0
Albumin         984
BUN             891
Bilirubin       982
Cholesterol    1021
Creatinine      891
DiasABP         852
FiO2            832
GCS             655
Gender            0
Glucose         904
HCO3            903
HCT             877
HR              500
Height            0
ICUType           0
K               904
Lactate         903
MAP             857
MechVent        837
Mg              930
NIDiasABP       633
NIMAP           639
NISysABP        632
Na              904
PaCO2           812
PaO2            816
Platelets       877
RespRate        865
SaO2            991
SysABP          852
Temp            631
TroponinI      1023
TroponinT       999
Urine           674
WBC             891
Weight            0
pH              811
dtype: int64

In [95]:
male_gender_distribution_test = test_X[test_X['Gender'] == 1.0]
male_gender_missing_rate_test = male_gender_distribution_test.isna().sum()
male_gender_missing_rate_test

RecordID          0
level_1           0
Time              0
ALP            1308
ALT            1302
AST            1302
Age               0
Albumin        1312
BUN            1198
Bilirubin      1308
Cholesterol    1363
Creatinine     1197
DiasABP        1097
FiO2           1130
GCS             879
Gender            0
Glucose        1219
HCO3           1214
HCT            1183
HR              690
Height            0
ICUType           0
K              1217
Lactate        1218
MAP            1101
MechVent       1136
Mg             1258
NIDiasABP       863
NIMAP           865
NISysABP        863
Na             1216
PaCO2          1038
PaO2           1043
Platelets      1173
RespRate       1183
SaO2           1317
SysABP         1097
Temp            847
TroponinI      1366
TroponinT      1325
Urine           957
WBC            1193
Weight            0
pH             1036
dtype: int64

In [96]:
ICUType_1_test = test_X[test_X['ICUType'] == 1.0]
ICUType_1_test_missing = ICUType_1_test.isna().sum()
ICUType_1_test_missing

RecordID         0
level_1          0
Time             0
ALP            340
ALT            338
AST            338
Age              0
Albumin        340
BUN            313
Bilirubin      345
Cholesterol    353
Creatinine     313
DiasABP        273
FiO2           320
GCS            208
Gender           0
Glucose        314
HCO3           313
HCT            312
HR             149
Height           0
ICUType          0
K              312
Lactate        346
MAP            272
MechVent       326
Mg             335
NIDiasABP      209
NIMAP          210
NISysABP       208
Na             313
PaCO2          326
PaO2           326
Platelets      311
RespRate       269
SaO2           336
SysABP         273
Temp           214
TroponinI      358
TroponinT      339
Urine          231
WBC            314
Weight           0
pH             325
dtype: int64

In [97]:
ICUType_2_test = test_X[test_X['ICUType'] == 2.0]
ICUType_2_test_missing = ICUType_2_test.isna().sum()
ICUType_2_test_missing

RecordID         0
level_1          0
Time             0
ALP            502
ALT            501
AST            501
Age              0
Albumin        502
BUN            480
Bilirubin      501
Cholesterol    507
Creatinine     479
DiasABP        419
FiO2           448
GCS            415
Gender           0
Glucose        489
HCO3           483
HCT            457
HR             382
Height           0
ICUType          0
K              489
Lactate        478
MAP            420
MechVent       433
Mg             486
NIDiasABP      457
NIMAP          457
NISysABP       457
Na             489
PaCO2          209
PaO2           213
Platelets      447
RespRate       495
SaO2           487
SysABP         419
Temp           407
TroponinI      509
TroponinT      503
Urine          415
WBC            466
Weight           0
pH             209
dtype: int64

In [98]:
ICUType_3_test = test_X[test_X['ICUType'] == 3.0]
ICUType_3_test_missing = ICUType_3_test.isna().sum()
ICUType_3_test_missing

RecordID         0
level_1          0
Time             0
ALP            763
ALT            760
AST            761
Age              0
Albumin        768
BUN            703
Bilirubin      762
Cholesterol    810
Creatinine     703
DiasABP        754
FiO2           654
GCS            491
Gender           0
Glucose        704
HCO3           706
HCT            702
HR             347
Height           0
ICUType          0
K              705
Lactate        706
MAP            759
MechVent       669
Mg             731
NIDiasABP      383
NIMAP          388
NISysABP       383
Na             704
PaCO2          750
PaO2           752
Platelets      710
RespRate       681
SaO2           803
SysABP         754
Temp           477
TroponinI      808
TroponinT      781
Urine          548
WBC            710
Weight           0
pH             748
dtype: int64

In [99]:
ICUType_4_test = test_X[test_X['ICUType'] == 4.0]
ICUType_4_test_missing = ICUType_4_test.isna().sum()
ICUType_4_test_missing

RecordID         0
level_1          0
Time             0
ALP            686
ALT            683
AST            683
Age              0
Albumin        688
BUN            595
Bilirubin      684
Cholesterol    716
Creatinine     595
DiasABP        505
FiO2           542
GCS            420
Gender           0
Glucose        618
HCO3           617
HCT            591
HR             312
Height           0
ICUType          0
K              617
Lactate        593
MAP            509
MechVent       547
Mg             638
NIDiasABP      447
NIMAP          449
NISysABP       447
Na             616
PaCO2          567
PaO2           570
Platelets      584
RespRate       604
SaO2           684
SysABP         505
Temp           381
TroponinI      716
TroponinT      703
Urine          439
WBC            596
Weight           0
pH             567
dtype: int64

In [100]:
more_than_or_equal_to_65_test = test_X[test_X["Age"] >= 65]
more_than_or_equal_to_65_test_missing = more_than_or_equal_to_65_test.isna().sum()
more_than_or_equal_to_65_test_missing

RecordID           0
level_1            0
Time               0
ALP            59176
ALT            59153
AST            59150
Age                0
Albumin        59334
BUN            55533
Bilirubin      59141
Cholesterol    59949
Creatinine     55510
DiasABP        25043
FiO2           50094
GCS            40147
Gender         58749
Glucose        55846
HCO3           55638
HCT            54054
HR              2351
Height             0
ICUType        58749
K              55359
Lactate        57538
MAP            25177
MechVent       50603
Mg             55573
NIDiasABP      33553
NIMAP          33868
NISysABP       33528
Na             55714
PaCO2          52491
PaO2           52507
Platelets      55446
RespRate       45031
SaO2           57129
SysABP         25041
Temp           35253
TroponinI      59903
TroponinT      59167
Urine          14577
WBC            55842
Weight         26170
pH             52114
dtype: int64

In [101]:
less_than_65_test = test_X[test_X["Age"] < 65]
less_than_65_test_missing = less_than_65_test.isna().sum()
less_than_65_test_missing

RecordID           0
level_1            0
Time               0
ALP            47547
ALT            47516
AST            47519
Age                0
Albumin        47867
BUN            44719
Bilirubin      47508
Cholesterol    48495
Creatinine     44691
DiasABP        20780
FiO2           40459
GCS            31429
Gender         47508
Glucose        44886
HCO3           44806
HCT            43710
HR              2201
Height             0
ICUType        47508
K              44512
Lactate        46217
MAP            20965
MechVent       40560
Mg             44809
NIDiasABP      27182
NIMAP          27586
NISysABP       27158
Na             44741
PaCO2          42613
PaO2           42631
Platelets      44718
RespRate       36633
SaO2           46659
SysABP         20778
Temp           30139
TroponinI      48533
TroponinT      48230
Urine          13501
WBC            45017
Weight         22062
pH             42371
dtype: int64

In [102]:
filtered_test_X = test_X[(test_X['Height'] != -1) & (test_X['Weight'] != -1) & (test_X['Height'].notna()) & (test_X['Weight'].notna())] 

In [103]:
filtered_test_X_metros = filtered_test_X.copy()
filtered_test_X_metros["Height"] = filtered_test_X["Height"]/100
filtered_test_X_metros["Height"]

144       1.803
145       1.803
146       1.803
147       1.803
148       1.803
          ...  
575321    1.727
575322    1.727
575323    1.727
575325    1.727
575327    1.727
Name: Height, Length: 31372, dtype: float64

In [104]:
bmi_data_test = filtered_test_X_metros
bmi_data_test["BMI"] = filtered_test_X_metros["Weight"] / (filtered_test_X_metros["Height"]**2)
bmi_data_test["Classificacao"] = bmi_data_test["BMI"].apply(classify_BMI)
bmi_data_test.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
144,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,,36.3,,,,11.5,84.6,,26.024291,Sobrepeso
145,132543,1,1.0,,,,68.0,,,,...,,,,,,,84.6,,26.024291,Sobrepeso
146,132543,2,2.0,,,,68.0,,,,...,,,,,,,84.6,,26.024291,Sobrepeso
147,132543,3,3.0,,,,68.0,,,,...,,36.4,,,,,84.6,,26.024291,Sobrepeso
148,132543,4,4.0,,,,68.0,,,,...,,,,,,,84.6,,26.024291,Sobrepeso


In [105]:
classificacao_baixo_peso_test = bmi_data_test[bmi_data_test["Classificacao"] == "Baixo peso"]
classificacao_baixo_peso_missing_test = classificacao_baixo_peso_test.isna().sum()
classificacao_baixo_peso_missing_test

RecordID           0
level_1            0
Time               0
ALP              528
ALT              528
AST              528
Age                0
Albumin          532
BUN              497
Bilirubin        528
Cholesterol      535
Creatinine       497
DiasABP          201
FiO2             442
GCS              347
Gender           509
Glucose          502
HCO3             498
HCT              490
HR                15
Height             0
ICUType          509
K                500
Lactate          510
MAP              202
MechVent         404
Mg               500
NIDiasABP        307
NIMAP            315
NISysABP         307
Na               499
PaCO2            452
PaO2             451
Platelets        501
RespRate         452
SaO2             515
SysABP           201
Temp             361
TroponinI        538
TroponinT        536
Urine            158
WBC              502
Weight             0
pH               451
BMI                0
Classificacao      0
dtype: int64

In [106]:
classificacao_peso_normal_test = bmi_data_test[bmi_data_test["Classificacao"] == "Peso normal"]
classificacao_peso_normal_missing_test = classificacao_peso_normal_test.isna().sum()
classificacao_peso_normal_missing_test

RecordID            0
level_1             0
Time                0
ALP              6625
ALT              6623
AST              6623
Age                 0
Albumin          6650
BUN              6253
Bilirubin        6620
Cholesterol      6753
Creatinine       6254
DiasABP          2846
FiO2             5681
GCS              4571
Gender           6406
Glucose          6282
HCO3             6273
HCT              6093
HR                197
Height              0
ICUType          6406
K                6236
Lactate          6505
MAP              2874
MechVent         5731
Mg               6268
NIDiasABP        3610
NIMAP            3652
NISysABP         3608
Na               6277
PaCO2            5976
PaO2             5973
Platelets        6255
RespRate         5561
SaO2             6457
SysABP           2846
Temp             4062
TroponinI        6747
TroponinT        6683
Urine            1878
WBC              6302
Weight              0
pH               5911
BMI                 0
Classifica

In [107]:
classificacao_sobrepeso_test = bmi_data_test[bmi_data_test["Classificacao"] == "Sobrepeso"]
classificacao_sobrepeso_missing_test = classificacao_sobrepeso_test.isna().sum()
classificacao_sobrepeso_missing_test

RecordID             0
level_1              0
Time                 0
ALP               9995
ALT               9990
AST               9990
Age                  0
Albumin          10033
BUN               9437
Bilirubin         9985
Cholesterol      10121
Creatinine        9433
DiasABP           3194
FiO2              8527
GCS               6948
Gender            9721
Glucose           9495
HCO3              9474
HCT               9147
HR                 213
Height               0
ICUType           9721
K                 9415
Lactate           9785
MAP               3211
MechVent          8532
Mg                9380
NIDiasABP         6257
NIMAP             6296
NISysABP          6250
Na                9480
PaCO2             8829
PaO2              8831
Platelets         9402
RespRate          8490
SaO2              9463
SysABP            3193
Temp              5080
TroponinI        10117
TroponinT        10044
Urine             2150
WBC               9469
Weight               0
pH         

In [108]:
classificacao_obesidade_grau_1_test = bmi_data_test[bmi_data_test["Classificacao"] == "Obesidade grau 1"]
classificacao_obesidade_grau_1_missing_test = classificacao_obesidade_grau_1_test.isna().sum()
classificacao_obesidade_grau_1_missing_test

RecordID            0
level_1             0
Time                0
ALP              6490
ALT              6489
AST              6488
Age                 0
Albumin          6514
BUN              6157
Bilirubin        6486
Cholesterol      6592
Creatinine       6156
DiasABP          1870
FiO2             5561
GCS              4613
Gender           6373
Glucose          6204
HCO3             6172
HCT              6027
HR                127
Height              0
ICUType          6373
K                6138
Lactate          6312
MAP              1903
MechVent         5446
Mg               6129
NIDiasABP        4312
NIMAP            4344
NISysABP         4310
Na               6184
PaCO2            5614
PaO2             5615
Platelets        6157
RespRate         5736
SaO2             6092
SysABP           1870
Temp             3257
TroponinI        6580
TroponinT        6552
Urine            1231
WBC              6201
Weight              0
pH               5543
BMI                 0
Classifica

In [109]:
classificacao_obesidade_grau_2_test = bmi_data_test[bmi_data_test["Classificacao"] == "Obesidade grau 2"]
classificacao_obesidade_grau_2_missing_test = classificacao_obesidade_grau_2_test.isna().sum()
classificacao_obesidade_grau_2_missing_test

RecordID            0
level_1             0
Time                0
ALP              3659
ALT              3660
AST              3660
Age                 0
Albumin          3677
BUN              3445
Bilirubin        3656
Cholesterol      3704
Creatinine       3444
DiasABP          1025
FiO2             2997
GCS              2526
Gender           3604
Glucose          3468
HCO3             3459
HCT              3366
HR                 62
Height              0
ICUType          3604
K                3434
Lactate          3510
MAP               972
MechVent         3050
Mg               3448
NIDiasABP        2463
NIMAP            2478
NISysABP         2463
Na               3464
PaCO2            3180
PaO2             3180
Platelets        3450
RespRate         3276
SaO2             3460
SysABP           1025
Temp             1736
TroponinI        3712
TroponinT        3664
Urine             752
WBC              3480
Weight              0
pH               3151
BMI                 0
Classifica

In [110]:
classificacao_obesidade_grau_3_test = bmi_data_test[bmi_data_test["Classificacao"] == "Obesidade grau 3"]
classificacao_obesidade_grau_3_missing_test = classificacao_obesidade_grau_3_test.isna().sum()
classificacao_obesidade_grau_3_missing_test

RecordID            0
level_1             0
Time                0
ALP              3105
ALT              3104
AST              3104
Age                 0
Albumin          3126
BUN              2942
Bilirubin        3102
Cholesterol      3163
Creatinine       2942
DiasABP          1003
FiO2             2534
GCS              2296
Gender           3074
Glucose          2968
HCO3             2948
HCT              2884
HR                 42
Height              0
ICUType          3074
K                2937
Lactate          3015
MAP               996
MechVent         2577
Mg               2935
NIDiasABP        2098
NIMAP            2105
NISysABP         2095
Na               2955
PaCO2            2673
PaO2             2673
Platelets        2945
RespRate         2745
SaO2             2910
SysABP           1003
Temp             1542
TroponinI        3161
TroponinT        3139
Urine             603
WBC              2966
Weight              0
pH               2648
BMI                 0
Classifica

In [111]:
df_missing_test = pd.DataFrame(columns=df_columns)
df_missing_transpose_test = df_missing_test.T
df_missing_transpose_test ["Female"] = female_gender_missing_rate_test
df_missing_transpose_test ["Male"] = male_gender_missing_rate_test
df_missing_transpose_test ["ICUType 2"] = ICUType_2_test_missing
df_missing_transpose_test ["ICUType 3"] = ICUType_3_test_missing
df_missing_transpose_test ["ICUType 4"] = ICUType_4_test_missing
df_missing_transpose_test["Age 65+"] = more_than_or_equal_to_65_test_missing
df_missing_transpose_test["Age 65-"] = less_than_65_test_missing
df_missing_transpose_test ["Low Weight"] = classificacao_baixo_peso_missing_test
df_missing_transpose_test ["Normal Weight"] = classificacao_peso_normal_missing_test
df_missing_transpose_test ["Overweight"] = classificacao_sobrepeso_missing_test
df_missing_transpose_test ["Obesity Grade 1"] = classificacao_obesidade_grau_1_missing_test
df_missing_transpose_test ["Obesity Grade 2"] = classificacao_obesidade_grau_2_missing_test
df_missing_transpose_test ["Obesity Grade 3"] = classificacao_obesidade_grau_3_missing_test
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>original Missing rate per Variable by demographics - Test</h2>"))
df_missing_transpose_test 

Unnamed: 0,Female,Male,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3
RecordID,0,0,0,0,0,0,0,0,0,0,0,0,0
level_1,0,0,0,0,0,0,0,0,0,0,0,0,0
Time,0,0,0,0,0,0,0,0,0,0,0,0,0
ALP,981,1308,502,763,686,59176,47547,528,6625,9995,6490,3659,3105
ALT,978,1302,501,760,683,59153,47516,528,6623,9990,6489,3660,3104
AST,979,1302,501,761,683,59150,47519,528,6623,9990,6488,3660,3104
Age,0,0,0,0,0,0,0,0,0,0,0,0,0
Albumin,984,1312,502,768,688,59334,47867,532,6650,10033,6514,3677,3126
BUN,891,1198,480,703,595,55533,44719,497,6253,9437,6157,3445,2942
Bilirubin,982,1308,501,762,684,59141,47508,528,6620,9985,6486,3656,3102
