In [1]:
import os
import sys
import pandas as pd
from IPython.display import display, HTML
# Put the parent directory on the import path (once) so modules located
# there can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

<h2>Loading dataset</h2>

In [2]:
from pypots.benchpots.datasets import preprocess_physionet2012
# Load the preprocessed PhysioNet-2012 dataset through the project helper.
# Later cells read the 'train_X' and 'val_X' entries of the returned mapping.
# NOTE(review): `rate=0.1` is presumably the artificial missing rate applied
# by the helper -- confirm against preprocess_physionet2012.
physionet2012_dataset = preprocess_physionet2012(subset="all", rate=0.1)

2024-11-06 16:50:42 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-11-06 16:50:42 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-11-06 16:50:42 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-11-06 16:50:42 [INFO]: Loaded successfully!


<h3>Training data</h3>

<h4>Loading training dataset</h4>

In [3]:
# Training split; the cells below index it as a pandas DataFrame.
train_X = physionet2012_dataset["train_X"]

In [4]:
# RecordIDs of every row whose Gender value is 0.0 (the group this notebook
# labels "female").
female_gender_ids = train_X.loc[train_X["Gender"] == 0.0, "RecordID"]

# Per-column NaN counts over all rows that belong to those records.
female_gender_missing_rate = (
    train_X[train_X["RecordID"].isin(female_gender_ids)]
    .isna()
    .sum()
)
female_gender_missing_rate

RecordID            0
level_1             0
Time                0
ALP            159078
ALT            159010
AST            159017
Age              9951
Albumin        159639
BUN            150093
Bilirubin      158969
Cholesterol    161450
Creatinine     150048
DiasABP         79251
FiO2           136881
GCS            110427
Gender         158343
Glucose        150606
HCO3           150261
HCT            146877
HR              16208
Height           9951
ICUType        158343
K              149407
Lactate        155353
MAP             79697
MechVent       137388
Mg             150255
NIDiasABP       90259
NIMAP           91382
NISysABP        90195
Na             150170
PaCO2          144419
PaO2           144461
Platelets      150227
RespRate       119279
SaO2           155756
SysABP          79243
Temp           104298
TroponinI      161334
TroponinT      159995
Urine           49555
WBC            151024
Weight          73796
pH             143813
dtype: int64

In [5]:
# RecordIDs of every row whose Gender value is 1.0 (the group this notebook
# labels "male").
male_gender_ids = train_X.loc[train_X["Gender"] == 1.0, "RecordID"]

# Per-column NaN counts over all rows that belong to those records.
male_gender_missing_rate = (
    train_X[train_X["RecordID"].isin(male_gender_ids)]
    .isna()
    .sum()
)
male_gender_missing_rate

RecordID            0
level_1             0
Time                0
ALP            202723
ALT            202618
AST            202605
Age             11652
Albumin        203572
BUN            191154
Bilirubin      202568
Cholesterol    205801
Creatinine     191085
DiasABP         90159
FiO2           173106
GCS            140633
Gender         201865
Glucose        192175
HCO3           191533
HCT            186112
HR              20056
Height          11652
ICUType        201865
K              190601
Lactate        197377
MAP             90841
MechVent       174870
Mg             191484
NIDiasABP      122301
NIMAP          123421
NISysABP       122232
Na             191637
PaCO2          180753
PaO2           180794
Platelets      190546
RespRate       161293
SaO2           197139
SysABP          90149
Temp           126194
TroponinI      205755
TroponinT      203981
Urine           63300
WBC            192062
Weight          98634
pH             179441
dtype: int64

In [6]:
# RecordIDs taken from the Time == 0.0 rows with ICUType 1.0, skipping rows
# whose Gender is the -1.0 sentinel (NaN Gender rows are kept, since
# NaN != -1.0 evaluates to True).
ICUType_1_training_ids = train_X.loc[
    (train_X["ICUType"] == 1.0)
    & (train_X["Gender"] != -1.0)
    & (train_X["Time"] == 0.0),
    "RecordID",
]

# Per-column NaN counts over all rows that belong to those records.
ICUType_1_training_missing = (
    train_X[train_X["RecordID"].isin(ICUType_1_training_ids)]
    .isna()
    .sum()
)
ICUType_1_training_missing

RecordID           0
level_1            0
Time               0
ALP            52302
ALT            52255
AST            52253
Age             4467
Albumin        52476
BUN            49330
Bilirubin      52268
Cholesterol    52772
Creatinine     49271
DiasABP        30768
FiO2           47098
GCS            39511
Gender         51982
Glucose        49481
HCO3           49443
HCT            48433
HR              6950
Height          4467
ICUType        51982
K              48721
Lactate        51858
MAP            30842
MechVent       47454
Mg             49343
NIDiasABP      27996
NIMAP          28163
NISysABP       27979
Na             49469
PaCO2          48558
PaO2           48558
Platelets      49256
RespRate       35084
SaO2           50386
SysABP         30767
Temp           36972
TroponinI      52918
TroponinT      51953
Urine          22893
WBC            49632
Weight         28682
pH             48469
dtype: int64

In [7]:
# RecordIDs taken from the Time == 0.0 rows with ICUType 2.0, skipping rows
# whose Gender is the -1.0 sentinel (NaN Gender rows are kept, since
# NaN != -1.0 evaluates to True).
ICUType_2_training_ids = train_X.loc[
    (train_X["ICUType"] == 2.0)
    & (train_X["Gender"] != -1.0)
    & (train_X["Time"] == 0.0),
    "RecordID",
]

# Per-column NaN counts over all rows that belong to those records.
ICUType_2_training_missing = (
    train_X[train_X["RecordID"].isin(ICUType_2_training_ids)]
    .isna()
    .sum()
)
ICUType_2_training_missing

RecordID           0
level_1            0
Time               0
ALP            77782
ALT            77766
AST            77763
Age             2532
Albumin        77958
BUN            73222
Bilirubin      77778
Cholesterol    78313
Creatinine     73214
DiasABP        16939
FiO2           64772
GCS            57653
Gender         76704
Glucose        74660
HCO3           73597
HCT            69769
HR              6372
Height          2532
ICUType        76704
K              74169
Lactate        75616
MAP            16829
MechVent       65205
Mg             73322
NIDiasABP      61286
NIMAP          61426
NISysABP       61250
Na             74361
PaCO2          62302
PaO2           62351
Platelets      71975
RespRate       75150
SaO2           69772
SysABP         16937
Temp           32555
TroponinI      78208
TroponinT      78146
Urine          12111
WBC            73046
Weight         37918
pH             60809
dtype: int64

In [8]:
# RecordIDs taken from the Time == 0.0 rows with ICUType 3.0, skipping rows
# whose Gender is the -1.0 sentinel (NaN Gender rows are kept, since
# NaN != -1.0 evaluates to True).
ICUType_3_training_ids = train_X.loc[
    (train_X["ICUType"] == 3.0)
    & (train_X["Gender"] != -1.0)
    & (train_X["Time"] == 0.0),
    "RecordID",
]

# Per-column NaN counts over all rows that belong to those records.
ICUType_3_training_missing = (
    train_X[train_X["RecordID"].isin(ICUType_3_training_ids)]
    .isna()
    .sum()
)
ICUType_3_training_missing

RecordID            0
level_1             0
Time                0
ALP            130444
ALT            130361
AST            130360
Age              9802
Albumin        131091
BUN            123247
Bilirubin      130219
Cholesterol    133166
Creatinine     123210
DiasABP         85806
FiO2           113842
GCS             97862
Gender         130519
Glucose        123268
HCO3           123230
HCT            121521
HR              14427
Height           9802
ICUType        130519
K              122414
Lactate        128108
MAP             86398
MechVent       115242
Mg             123804
NIDiasABP       57928
NIMAP           59329
NISysABP        57886
Na             123016
PaCO2          123200
PaO2           123196
Platelets      124205
RespRate        92414
SaO2           131660
SysABP          85797
Temp            95107
TroponinI      132979
TroponinT      131536
Urine           52440
WBC            124580
Weight          46979
pH             123056
dtype: int64

In [9]:
# RecordIDs taken from the Time == 0.0 rows with ICUType 4.0, skipping rows
# whose Gender is the -1.0 sentinel (NaN Gender rows are kept, since
# NaN != -1.0 evaluates to True).
ICUType_4_training_ids = train_X.loc[
    (train_X["ICUType"] == 4.0)
    & (train_X["Gender"] != -1.0)
    & (train_X["Time"] == 0.0),
    "RecordID",
]

# Per-column NaN counts over all rows that belong to those records.
ICUType_4_training_missing = (
    train_X[train_X["RecordID"].isin(ICUType_4_training_ids)]
    .isna()
    .sum()
)
ICUType_4_training_missing

RecordID            0
level_1             0
Time                0
ALP            101273
ALT            101246
AST            101246
Age              4802
Albumin        101686
BUN             95448
Bilirubin      101272
Cholesterol    103000
Creatinine      95438
DiasABP         35897
FiO2            84275
GCS             56034
Gender         101003
Glucose         95372
HCO3            95524
HCT             93266
HR               8515
Height           4802
ICUType        101003
K               94704
Lactate         97148
MAP             36469
MechVent        84357
Mg              95270
NIDiasABP       65350
NIMAP           65885
NISysABP        65312
Na              94961
PaCO2           91112
PaO2            91150
Platelets       95337
RespRate        77924
SaO2           101077
SysABP          35891
Temp            65858
TroponinI      102984
TroponinT      102341
Urine           25411
WBC             95828
Weight          58851
pH              90920
dtype: int64

In [10]:
# RecordIDs taken from the Time == 0.0 rows with Age >= 65, skipping rows
# whose Gender is the -1.0 sentinel (NaN Gender rows are kept, since
# NaN != -1.0 evaluates to True).
more_than_or_equal_to_65_train_ids = train_X.loc[
    (train_X["Age"] >= 65)
    & (train_X["Gender"] != -1.0)
    & (train_X["Time"] == 0.0),
    "RecordID",
]

# Per-column NaN counts over all rows that belong to those records.
more_than_or_equal_to_65_train_missing = (
    train_X[train_X["RecordID"].isin(more_than_or_equal_to_65_train_ids)]
    .isna()
    .sum()
)
more_than_or_equal_to_65_train_missing

RecordID            0
level_1             0
Time                0
ALP            197321
ALT            197267
AST            197255
Age             11318
Albumin        197730
BUN            185901
Bilirubin      197195
Cholesterol    199596
Creatinine     185843
DiasABP         91469
FiO2           168805
GCS            138192
Gender         195802
Glucose        186875
HCO3           186212
HCT            181284
HR              19031
Height          11318
ICUType        195802
K              185311
Lactate        192161
MAP             92004
MechVent       170477
Mg             186065
NIDiasABP      115258
NIMAP          116321
NISysABP       115187
Na             186352
PaCO2          176560
PaO2           176592
Platelets      185561
RespRate       151099
SaO2           190766
SysABP          91459
Temp           122817
TroponinI      199416
TroponinT      197196
Urine           58284
WBC            186799
Weight          92771
pH             175465
dtype: int64

In [11]:
# RecordIDs taken from the Time == 0.0 rows with Age < 65, skipping rows
# whose Gender is the -1.0 sentinel (NaN Gender rows are kept, since
# NaN != -1.0 evaluates to True).
less_than_65_train_ids = train_X.loc[
    (train_X["Age"] < 65)
    & (train_X["Gender"] != -1.0)
    & (train_X["Time"] == 0.0),
    "RecordID",
]

# Per-column NaN counts over all rows that belong to those records.
less_than_65_train_missing = (
    train_X[train_X["RecordID"].isin(less_than_65_train_ids)]
    .isna()
    .sum()
)
less_than_65_train_missing


RecordID            0
level_1             0
Time                0
ALP            164480
ALT            164361
AST            164367
Age             10285
Albumin        165481
BUN            155346
Bilirubin      164342
Cholesterol    167655
Creatinine     155290
DiasABP         77941
FiO2           141182
GCS            112868
Gender         164406
Glucose        155906
HCO3           155582
HCT            151705
HR              17233
Height          10285
ICUType        164406
K              154697
Lactate        160569
MAP             78534
MechVent       141781
Mg             155674
NIDiasABP       97302
NIMAP           98482
NISysABP        97240
Na             155455
PaCO2          148612
PaO2           148663
Platelets      155212
RespRate       129473
SaO2           162129
SysABP          77933
Temp           107675
TroponinI      167673
TroponinT      166780
Urine           54571
WBC            156287
Weight          79659
pH             147789
dtype: int64

In [12]:
# Keep only rows where both Height and Weight are present and are not the
# -1 sentinel value.
filtered_train_X = train_X[
    train_X["Height"].notna()
    & train_X["Weight"].notna()
    & (train_X["Height"] != -1)
    & (train_X["Weight"] != -1)
]

In [13]:
def classify_BMI(BMI):
    """Map a body-mass-index value to its weight-class label.

    The bands are contiguous, so every non-missing value receives a label.
    The previous closed-interval checks (e.g. ``18.6 <= BMI <= 24.9``) left
    gaps such as 24.9 < BMI < 25 that silently returned ``None``.  Labels for
    values rounded to one decimal place are unchanged.

    Parameters
    ----------
    BMI : float
        Body-mass index (weight divided by height squared).

    Returns
    -------
    str or None
        One of "Baixo peso", "Peso normal", "Sobrepeso", "Obesidade grau 1",
        "Obesidade grau 2", "Obesidade grau 3"; ``None`` when ``BMI`` is
        missing (NaN/None).
    """
    # Missing values must be caught first: every comparison with NaN is
    # False, which would otherwise fall through to the last band.
    if pd.isna(BMI):
        return None
    # 18.5 itself stays in the lowest band, as in the original thresholds.
    if BMI <= 18.5:
        return "Baixo peso"
    if BMI < 25:
        return "Peso normal"
    if BMI < 30:
        return "Sobrepeso"
    if BMI < 35:
        return "Obesidade grau 1"
    if BMI < 40:
        return "Obesidade grau 2"
    return "Obesidade grau 3"

In [14]:
# New frame whose Height column is the original value divided by 100
# (presumably centimetres -> metres, per the "_metros" suffix).
filtered_train_X_metros = filtered_train_X.assign(
    Height=filtered_train_X["Height"] / 100
)
filtered_train_X_metros["Height"]

144       1.803
145       1.803
146       1.803
147       1.803
148       1.803
          ...  
575035    1.600
575037    1.600
575038    1.600
575039    1.600
575088    1.727
Name: Height, Length: 102976, dtype: float64

In [15]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# the two columns added below also appear on filtered_train_X_metros.
bmi_data_train = filtered_train_X_metros

# BMI = Weight / Height**2, rounded to one decimal place.
bmi_data_train["BMI"] = (
    filtered_train_X_metros["Weight"] / filtered_train_X_metros["Height"].pow(2)
).round(1)

# Label each row with its weight class.
bmi_data_train["Classificacao"] = bmi_data_train["BMI"].apply(classify_BMI)
bmi_data_train.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
144,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,,36.3,,,,11.5,84.6,,26.0,Sobrepeso
145,132543,1,1.0,,,,68.0,,,,...,,,,,,,84.6,,26.0,Sobrepeso
146,132543,2,2.0,,,,68.0,,,,...,,,,,,,84.6,,26.0,Sobrepeso
147,132543,3,3.0,,,,68.0,,,,...,,36.4,,,,,84.6,,26.0,Sobrepeso
148,132543,4,4.0,,,,68.0,,,,...,,,,,,,84.6,,26.0,Sobrepeso


In [16]:
# Collapse to one row per RecordID. GroupBy.first() keeps, for every column,
# the first non-null value found within the record.
first_values_per_record = bmi_data_train.groupby("RecordID").first()
bmi_data_train = first_values_per_record.reset_index()
bmi_data_train

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,,36.3,,,600.0,11.5,84.6,,26.0,Sobrepeso
1,132547,0,0.0,,,,64.0,,,,...,,,,,,,114.0,,35.1,Obesidade grau 2
2,132551,0,0.0,47.0,46.0,82.0,78.0,1.9,81.0,0.3,...,102.75,38.0,3.5,,120.0,16.1,48.4,7.40,18.3,Baixo peso
3,132555,0,0.0,,,,74.0,,19.0,,...,98.00,34.8,,,35.0,9.0,66.1,7.39,21.5,Peso normal
4,132570,0,0.0,19.0,15.0,20.0,84.0,,83.0,0.1,...,,36.6,,,600.0,8.8,102.6,,35.4,Obesidade grau 2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4014,162995,0,0.0,60.0,21.0,20.0,84.0,,93.0,0.4,...,121.00,37.1,0.6,,60.0,17.1,96.5,7.31,28.8,Sobrepeso
4015,163002,0,0.0,,,,53.0,,,,...,,,,,,2.7,68.0,7.27,31.3,Obesidade grau 1
4016,163008,0,0.0,,,,59.0,,24.0,,...,97.00,37.6,,,45.0,6.9,98.5,7.38,34.0,Obesidade grau 1
4017,163016,0,0.0,,27.0,120.0,65.0,,29.0,0.4,...,101.00,38.1,,,75.0,8.0,63.6,7.37,24.8,Peso normal


In [17]:
# Drop records whose Gender is the -1.0 sentinel, then count what remains.
bmi_data_train = bmi_data_train.loc[bmi_data_train["Gender"].ne(-1.0)]
bmi_data_train["RecordID"].count()

4016

In [18]:
# Number of records in each BMI class.
bmi_data_train.loc[:, "Classificacao"].value_counts()

Classificacao
Sobrepeso           1394
Peso normal         1158
Obesidade grau 1     731
Obesidade grau 2     313
Obesidade grau 3     288
Baixo peso           132
Name: count, dtype: int64

In [19]:
# RecordIDs of the records classified as "Baixo peso".
classificacao_baixo_peso_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Baixo peso", "RecordID"
]

# Per-column NaN counts over all training rows that belong to those records.
classificacao_baixo_peso_missing = (
    train_X[train_X["RecordID"].isin(classificacao_baixo_peso_ids)]
    .isna()
    .sum()
)
classificacao_baixo_peso_missing

RecordID          0
level_1           0
Time              0
ALP            6232
ALT            6229
AST            6228
Age             270
Albumin        6260
BUN            5850
Bilirubin      6229
Cholesterol    6327
Creatinine     5848
DiasABP        2378
FiO2           5310
GCS            4368
Gender         6204
Glucose        5874
HCO3           5854
HCT            5684
HR              544
Height          270
ICUType        6204
K              5821
Lactate        5974
MAP            2339
MechVent       5263
Mg             5843
NIDiasABP      4097
NIMAP          4145
NISysABP       4096
Na             5862
PaCO2          5459
PaO2           5457
Platelets      5828
RespRate       5208
SaO2           5966
SysABP         2377
Temp           3765
TroponinI      6314
TroponinT      6271
Urine          1860
WBC            5879
Weight         3157
pH             5425
dtype: int64

In [20]:
# Sanity check: number of distinct RecordIDs in the "Baixo peso" group.
teste = classificacao_baixo_peso_ids.unique()
len(teste)

132

In [21]:
# RecordIDs of the records classified as "Peso normal".
classificacao_normal_peso_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Peso normal", "RecordID"
]

# Per-column NaN counts over all training rows that belong to those records.
classificacao_normal_peso_missing = (
    train_X[train_X["RecordID"].isin(classificacao_normal_peso_ids)]
    .isna()
    .sum()
)
classificacao_normal_peso_missing

RecordID           0
level_1            0
Time               0
ALP            54692
ALT            54669
AST            54669
Age             2207
Albumin        54898
BUN            51526
Bilirubin      54661
Cholesterol    55490
Creatinine     51516
DiasABP        19244
FiO2           46387
GCS            37912
Gender         54426
Glucose        51887
HCO3           51661
HCT            49929
HR              4500
Height          2207
ICUType        54426
K              51428
Lactate        53008
MAP            19384
MechVent       46318
Mg             51514
NIDiasABP      36607
NIMAP          36773
NISysABP       36586
Na             51723
PaCO2          47524
PaO2           47548
Platelets      51142
RespRate       47009
SaO2           52317
SysABP         19243
Temp           30508
TroponinI      55444
TroponinT      55078
Urine          14808
WBC            51662
Weight         26923
pH             47028
dtype: int64

In [22]:
# Sanity check: number of distinct RecordIDs in the "Peso normal" group.
teste = classificacao_normal_peso_ids.unique()
len(teste)

1158

In [23]:
# RecordIDs of the records classified as "Sobrepeso".
classificacao_sobrepeso_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Sobrepeso", "RecordID"
]

# Per-column NaN counts over all training rows that belong to those records.
classificacao_sobrepeso_missing = (
    train_X[train_X["RecordID"].isin(classificacao_sobrepeso_ids)]
    .isna()
    .sum()
)
classificacao_sobrepeso_missing

RecordID           0
level_1            0
Time               0
ALP            65824
ALT            65800
AST            65801
Age             2911
Albumin        66132
BUN            61863
Bilirubin      65787
Cholesterol    66791
Creatinine     61842
DiasABP        21434
FiO2           55202
GCS            46583
Gender         65518
Glucose        62468
HCO3           62043
HCT            59770
HR              5943
Height          2911
ICUType        65518
K              61945
Lactate        63843
MAP            21538
MechVent       55478
Mg             61954
NIDiasABP      46058
NIMAP          46335
NISysABP       46034
Na             62306
PaCO2          56174
PaO2           56194
Platelets      61269
RespRate       57299
SaO2           62103
SysABP         21431
Temp           34914
TroponinI      66759
TroponinT      66310
Urine          16853
WBC            61976
Weight         31065
pH             55504
dtype: int64

In [24]:
# Sanity check: number of distinct RecordIDs in the "Sobrepeso" group.
teste = classificacao_sobrepeso_ids.unique()
len(teste)

1394

In [25]:
# RecordIDs of the records classified as "Obesidade grau 1".
classificacao_obesidade_1_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Obesidade grau 1", "RecordID"
]

# Per-column NaN counts over all training rows that belong to those records.
classificacao_obesidade_1_missing = (
    train_X[train_X["RecordID"].isin(classificacao_obesidade_1_ids)]
    .isna()
    .sum()
)
classificacao_obesidade_1_missing

RecordID           0
level_1            0
Time               0
ALP            34493
ALT            34480
AST            34477
Age             1310
Albumin        34668
BUN            32482
Bilirubin      34477
Cholesterol    35036
Creatinine     32474
DiasABP        10954
FiO2           28770
GCS            24342
Gender         34357
Glucose        32765
HCO3           32561
HCT            31440
HR              2765
Height          1310
ICUType        34357
K              32443
Lactate        33310
MAP            11095
MechVent       28883
Mg             32509
NIDiasABP      24188
NIMAP          24318
NISysABP       24178
Na             32657
PaCO2          29160
PaO2           29165
Platelets      32258
RespRate       30258
SaO2           32605
SysABP         10954
Temp           17826
TroponinI      34983
TroponinT      34734
Urine           8076
WBC            32591
Weight         16558
pH             28860
dtype: int64

In [26]:
# Sanity check: number of distinct RecordIDs in the "Obesidade grau 1" group.
teste = classificacao_obesidade_1_ids.unique()
len(teste)

731

In [27]:
# RecordIDs of the records classified as "Obesidade grau 2".
classificacao_obesidade_2_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Obesidade grau 2", "RecordID"
]

# Per-column NaN counts over all training rows that belong to those records.
classificacao_obesidade_2_missing = (
    train_X[train_X["RecordID"].isin(classificacao_obesidade_2_ids)]
    .isna()
    .sum()
)
classificacao_obesidade_2_missing

RecordID           0
level_1            0
Time               0
ALP            14775
ALT            14773
AST            14773
Age              704
Albumin        14852
BUN            13897
Bilirubin      14763
Cholesterol    14996
Creatinine     13888
DiasABP         4697
FiO2           12297
GCS            10712
Gender         14711
Glucose        14048
HCO3           13938
HCT            13450
HR              1372
Height           704
ICUType        14711
K              13933
Lactate        14256
MAP             4710
MechVent       12445
Mg             13921
NIDiasABP      10598
NIMAP          10663
NISysABP       10596
Na             13992
PaCO2          12502
PaO2           12507
Platelets      13781
RespRate       12897
SaO2           13900
SysABP          4696
Temp            7779
TroponinI      14992
TroponinT      14872
Urine           3993
WBC            13915
Weight          6441
pH             12398
dtype: int64

In [28]:
# Sanity check: number of distinct RecordIDs in the "Obesidade grau 2" group.
teste = classificacao_obesidade_2_ids.unique()
len(teste)

313

In [29]:
# RecordIDs of the records classified as "Obesidade grau 3".
classificacao_obesidade_3_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Obesidade grau 3", "RecordID"
]

# Per-column NaN counts over all training rows that belong to those records.
classificacao_obesidade_3_missing = (
    train_X[train_X["RecordID"].isin(classificacao_obesidade_3_ids)]
    .isna()
    .sum()
)
classificacao_obesidade_3_missing

RecordID           0
level_1            0
Time               0
ALP            13602
ALT            13598
AST            13599
Age              636
Albumin        13657
BUN            12809
Bilirubin      13596
Cholesterol    13797
Creatinine     12809
DiasABP         4927
FiO2           11051
GCS             9888
Gender         13536
Glucose        12913
HCO3           12842
HCT            12530
HR              1217
Height           636
ICUType        13536
K              12791
Lactate        13147
MAP             4908
MechVent       11092
Mg             12820
NIDiasABP       9503
NIMAP           9553
NISysABP        9494
Na             12883
PaCO2          11588
PaO2           11587
Platelets      12804
RespRate       11951
SaO2           12943
SysABP          4927
Temp            7900
TroponinI      13800
TroponinT      13694
Urine           3615
WBC            12891
Weight          5731
pH             11502
dtype: int64

In [30]:
# Sanity check: number of distinct RecordIDs in the "Obesidade grau 3" group.
teste = classificacao_obesidade_3_ids.unique()
len(teste)

288

In [31]:
# Column labels of the training frame; the summary-table cell uses them as
# the row index.
df_columns = train_X.columns
df_columns

Index(['RecordID', 'level_1', 'Time', 'ALP', 'ALT', 'AST', 'Age', 'Albumin',
       'BUN', 'Bilirubin', 'Cholesterol', 'Creatinine', 'DiasABP', 'FiO2',
       'GCS', 'Gender', 'Glucose', 'HCO3', 'HCT', 'HR', 'Height', 'ICUType',
       'K', 'Lactate', 'MAP', 'MechVent', 'Mg', 'NIDiasABP', 'NIMAP',
       'NISysABP', 'Na', 'PaCO2', 'PaO2', 'Platelets', 'RespRate', 'SaO2',
       'SysABP', 'Temp', 'TroponinI', 'TroponinT', 'Urine', 'WBC', 'Weight',
       'pH'],
      dtype='object')

In [32]:
# Empty frame transposed so that every training column becomes a row label.
df_missing = pd.DataFrame(columns=df_columns)
df_missing_transpose = df_missing.T

# One column per demographic group. Each Series holds per-variable NaN
# *counts* (from .isna().sum()), not normalised rates, although the heading
# below says "Rate".
missing_by_group = {
    "Female": female_gender_missing_rate,
    "Male": male_gender_missing_rate,
    "ICUType 1": ICUType_1_training_missing,
    "ICUType 2": ICUType_2_training_missing,
    "ICUType 3": ICUType_3_training_missing,
    "ICUType 4": ICUType_4_training_missing,
    "Age 65+": more_than_or_equal_to_65_train_missing,
    "Age 65-": less_than_65_train_missing,
    "Low Weight": classificacao_baixo_peso_missing,
    "Normal Weight": classificacao_normal_peso_missing,
    "Overweight": classificacao_sobrepeso_missing,
    "Obesity Grade 1": classificacao_obesidade_1_missing,
    "Obesity Grade 2": classificacao_obesidade_2_missing,
    "Obesity Grade 3": classificacao_obesidade_3_missing,
}
for group_label, group_missing in missing_by_group.items():
    df_missing_transpose[group_label] = group_missing

display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>Original Missing Rate per Variable by demographics - Train</h2>"))
df_missing_transpose

Unnamed: 0,Female,Male,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3
RecordID,0,0,0,0,0,0,0,0,0,0,0,0,0,0
level_1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Time,0,0,0,0,0,0,0,0,0,0,0,0,0,0
ALP,159078,202723,52302,77782,130444,101273,197321,164480,6232,54692,65824,34493,14775,13602
ALT,159010,202618,52255,77766,130361,101246,197267,164361,6229,54669,65800,34480,14773,13598
AST,159017,202605,52253,77763,130360,101246,197255,164367,6228,54669,65801,34477,14773,13599
Age,9951,11652,4467,2532,9802,4802,11318,10285,270,2207,2911,1310,704,636
Albumin,159639,203572,52476,77958,131091,101686,197730,165481,6260,54898,66132,34668,14852,13657
BUN,150093,191154,49330,73222,123247,95448,185901,155346,5850,51526,61863,32482,13897,12809
Bilirubin,158969,202568,52268,77778,130219,101272,197195,164342,6229,54661,65787,34477,14763,13596


<h3>Validation data</h3>

In [33]:
# Validation split; the cells below index it as a pandas DataFrame.
validation_X = physionet2012_dataset["val_X"]

In [34]:
# RecordIDs of every validation row whose Gender value is 0.0 (the group
# this notebook labels "female").
female_gender_validation_ids = validation_X.loc[
    validation_X["Gender"] == 0.0, "RecordID"
]

# Per-column NaN counts over all rows that belong to those records.
female_gender_missing_rate_validation = (
    validation_X[validation_X["RecordID"].isin(female_gender_validation_ids)]
    .isna()
    .sum()
)
female_gender_missing_rate_validation

RecordID           0
level_1            0
Time               0
ALP            39788
ALT            39772
AST            39773
Age             2228
Albumin        39934
BUN            37508
Bilirubin      39758
Cholesterol    40346
Creatinine     37498
DiasABP        19769
FiO2           34243
GCS            26948
Gender         39574
Glucose        37653
HCO3           37568
HCT            36640
HR              3645
Height          2228
ICUType        39574
K              37343
Lactate        38776
MAP            19869
MechVent       34368
Mg             37571
NIDiasABP      22158
NIMAP          22376
NISysABP       22147
Na             37519
PaCO2          36128
PaO2           36132
Platelets      37493
RespRate       29277
SaO2           39000
SysABP         19768
Temp           26088
TroponinI      40353
TroponinT      39986
Urine          11921
WBC            37707
Weight         19710
pH             35970
dtype: int64

In [35]:
# RecordIDs of every validation row whose Gender value is 1.0 (the group
# this notebook labels "male").
male_gender_validation_ids = validation_X.loc[
    validation_X["Gender"] == 1.0, "RecordID"
]

# Per-column NaN counts over all rows that belong to those records.
male_gender_missing_rate_validation = (
    validation_X[validation_X["RecordID"].isin(male_gender_validation_ids)]
    .isna()
    .sum()
)
male_gender_missing_rate_validation

RecordID           0
level_1            0
Time               0
ALP            50626
ALT            50611
AST            50616
Age             2975
Albumin        50847
BUN            47791
Bilirubin      50603
Cholesterol    51393
Creatinine     47767
DiasABP        22596
FiO2           43660
GCS            34936
Gender         50431
Glucose        48000
HCO3           47877
HCT            46537
HR              5055
Height          2975
ICUType        50431
K              47579
Lactate        49419
MAP            22746
MechVent       44110
Mg             47842
NIDiasABP      30418
NIMAP          30651
NISysABP       30394
Na             47848
PaCO2          45525
PaO2           45539
Platelets      47636
RespRate       39068
SaO2           49523
SysABP         22595
Temp           32437
TroponinI      51416
TroponinT      50845
Urine          16797
WBC            48037
Weight         24980
pH             45230
dtype: int64

In [36]:
# RecordIDs taken from the Time == 0.0 validation rows with ICUType 1.0,
# skipping rows whose Gender is the -1.0 sentinel (NaN Gender rows are kept,
# since NaN != -1.0 evaluates to True).
ICUType_1_validation_ids = validation_X.loc[
    (validation_X["ICUType"] == 1.0)
    & (validation_X["Gender"] != -1.0)
    & (validation_X["Time"] == 0.0),
    "RecordID",
]

# Per-column NaN counts over all rows that belong to those records.
ICUType_1_validation_missing = (
    validation_X[validation_X["RecordID"].isin(ICUType_1_validation_ids)]
    .isna()
    .sum()
)
ICUType_1_validation_missing

RecordID           0
level_1            0
Time               0
ALP            14333
ALT            14321
AST            14323
Age             1241
Albumin        14402
BUN            13515
Bilirubin      14315
Cholesterol    14443
Creatinine     13493
DiasABP         8799
FiO2           13163
GCS            10705
Gender         14241
Glucose        13566
HCO3           13556
HCT            13251
HR              1845
Height          1241
ICUType        14241
K              13343
Lactate        14255
MAP             8821
MechVent       13310
Mg             13522
NIDiasABP       7331
NIMAP           7352
NISysABP        7329
Na             13555
PaCO2          13485
PaO2           13489
Platelets      13508
RespRate        9020
SaO2           13923
SysABP          8798
Temp           10288
TroponinI      14505
TroponinT      14221
Urine           6317
WBC            13627
Weight          8948
pH             13465
dtype: int64

In [37]:
# RecordIDs taken from the Time == 0.0 validation rows with ICUType 2.0,
# skipping rows whose Gender is the -1.0 sentinel (NaN Gender rows are kept,
# since NaN != -1.0 evaluates to True).
ICUType_2_validation_ids = validation_X.loc[
    (validation_X["ICUType"] == 2.0)
    & (validation_X["Gender"] != -1.0)
    & (validation_X["Time"] == 0.0),
    "RecordID",
]

# Per-column NaN counts over all rows that belong to those records.
ICUType_2_validation_missing = (
    validation_X[validation_X["RecordID"].isin(ICUType_2_validation_ids)]
    .isna()
    .sum()
)
ICUType_2_validation_missing

RecordID           0
level_1            0
Time               0
ALP            17860
ALT            17860
AST            17861
Age              690
Albumin        17922
BUN            16851
Bilirubin      17860
Cholesterol    17995
Creatinine     16851
DiasABP         3958
FiO2           15162
GCS            13261
Gender         17625
Glucose        17153
HCO3           16942
HCT            16067
HR              1556
Height           690
ICUType        17625
K              17041
Lactate        17366
MAP             3901
MechVent       15125
Mg             16888
NIDiasABP      14079
NIMAP          14097
NISysABP       14069
Na             17109
PaCO2          14381
PaO2           14389
Platelets      16544
RespRate       17168
SaO2           16117
SysABP          3958
Temp            7587
TroponinI      17986
TroponinT      17944
Urine           3224
WBC            16800
Weight          8837
pH             14017
dtype: int64

In [38]:
# RecordIDs whose Time == 0 row reports ICUType 3. Rows with Gender == -1 are
# excluded (presumably the "unknown gender" code -- confirm); NaN Gender rows
# still pass the != comparison.
ICUType_3_validation_ids = validation_X.loc[
    (validation_X["ICUType"] == 3.0)
    & (validation_X["Gender"] != -1.0)
    & (validation_X["Time"] == 0.0),
    "RecordID",
]
# NaN count per column over every row belonging to those records.
ICUType_3_validation_missing = (
    validation_X[validation_X["RecordID"].isin(ICUType_3_validation_ids)]
    .isna()
    .sum()
)
ICUType_3_validation_missing

RecordID           0
level_1            0
Time               0
ALP            31219
ALT            31210
AST            31212
Age             2144
Albumin        31363
BUN            29500
Bilirubin      31182
Cholesterol    31877
Creatinine     29490
DiasABP        20985
FiO2           27288
GCS            23477
Gender         31255
Glucose        29512
HCO3           29494
HCT            29036
HR              3198
Height          2144
ICUType        31255
K              29303
Lactate        30655
MAP            21139
MechVent       27606
Mg             29637
NIDiasABP      13216
NIMAP          13502
NISysABP       13205
Na             29417
PaCO2          29457
PaO2           29453
Platelets      29680
RespRate       21651
SaO2           31574
SysABP         20984
Temp           22762
TroponinI      31849
TroponinT      31424
Urine          12659
WBC            29795
Weight         10812
pH             29426
dtype: int64

In [39]:
# RecordIDs whose Time == 0 row reports ICUType 4. Rows with Gender == -1 are
# excluded (presumably the "unknown gender" code -- confirm); NaN Gender rows
# still pass the != comparison.
ICUType_4_validation_ids = validation_X.loc[
    (validation_X["ICUType"] == 4.0)
    & (validation_X["Gender"] != -1.0)
    & (validation_X["Time"] == 0.0),
    "RecordID",
]
# NaN count per column over every row belonging to those records.
ICUType_4_validation_missing = (
    validation_X[validation_X["RecordID"].isin(ICUType_4_validation_ids)]
    .isna()
    .sum()
)
ICUType_4_validation_missing

RecordID           0
level_1            0
Time               0
ALP            27002
ALT            26992
AST            26993
Age             1128
Albumin        27094
BUN            25433
Bilirubin      27004
Cholesterol    27424
Creatinine     25431
DiasABP         8623
FiO2           22290
GCS            14441
Gender         26884
Glucose        25422
HCO3           25453
HCT            24823
HR              2101
Height          1128
ICUType        26884
K              25235
Lactate        25919
MAP             8754
MechVent       22437
Mg             25366
NIDiasABP      17950
NIMAP          18076
NISysABP       17938
Na             25286
PaCO2          24330
PaO2           24340
Platelets      25397
RespRate       20506
SaO2           26909
SysABP          8623
Temp           17888
TroponinI      27429
TroponinT      27242
Urine           6518
WBC            25522
Weight         16093
pH             24292
dtype: int64

In [40]:
# RecordIDs whose Time == 0 row has Age >= 65. Rows with Gender == -1 are
# excluded (presumably the "unknown gender" code -- confirm); NaN Gender rows
# still pass the != comparison.
more_than_or_equal_to_65_validation_ids = validation_X.loc[
    (validation_X["Age"] >= 65)
    & (validation_X["Gender"] != -1)
    & (validation_X["Time"] == 0.0),
    "RecordID",
]
# NaN count per column over every row belonging to those records.
more_than_or_equal_to_65_validation_missing = (
    validation_X[validation_X["RecordID"].isin(more_than_or_equal_to_65_validation_ids)]
    .isna()
    .sum()
)
more_than_or_equal_to_65_validation_missing

RecordID           0
level_1            0
Time               0
ALP            49852
ALT            49835
AST            49837
Age             2678
Albumin        49974
BUN            46920
Bilirubin      49819
Cholesterol    50396
Creatinine     46897
DiasABP        22161
FiO2           42437
GCS            34375
Gender         49444
Glucose        47171
HCO3           47010
HCT            45638
HR              4504
Height          2678
ICUType        49444
K              46752
Lactate        48519
MAP            22280
MechVent       42931
Mg             46964
NIDiasABP      29573
NIMAP          29754
NISysABP       29561
Na             47019
PaCO2          44635
PaO2           44651
Platelets      46820
RespRate       37894
SaO2           48356
SysABP         22160
Temp           31010
TroponinI      50393
TroponinT      49767
Urine          14716
WBC            47164
Weight         24965
pH             44374
dtype: int64

In [41]:
# RecordIDs whose Time == 0 row has Age < 65. Rows with Gender == -1 are
# excluded (presumably the "unknown gender" code -- confirm); NaN Gender rows
# still pass the != comparison.
less_than_65_validation_ids = validation_X.loc[
    (validation_X["Age"] < 65)
    & (validation_X["Gender"] != -1.0)
    & (validation_X["Time"] == 0.0),
    "RecordID",
]
# NaN count per column over every row belonging to those records.
less_than_65_validation_missing = (
    validation_X[validation_X["RecordID"].isin(less_than_65_validation_ids)]
    .isna()
    .sum()
)
less_than_65_validation_missing

RecordID           0
level_1            0
Time               0
ALP            40562
ALT            40548
AST            40552
Age             2525
Albumin        40807
BUN            38379
Bilirubin      40542
Cholesterol    41343
Creatinine     38368
DiasABP        20204
FiO2           35466
GCS            27509
Gender         40561
Glucose        38482
HCO3           38435
HCT            37539
HR              4196
Height          2525
ICUType        40561
K              38170
Lactate        39676
MAP            20335
MechVent       35547
Mg             38449
NIDiasABP      23003
NIMAP          23273
NISysABP       22980
Na             38348
PaCO2          37018
PaO2           37020
Platelets      38309
RespRate       30451
SaO2           40167
SysABP         20203
Temp           27515
TroponinI      41376
TroponinT      41064
Urine          14002
WBC            38580
Weight         19725
pH             36826
dtype: int64

In [42]:
# Keep only rows where both Height and Weight are present and not the -1
# placeholder, so that a BMI can be computed from them.
filtered_validation_X = validation_X[
    validation_X["Height"].ne(-1)
    & validation_X["Weight"].ne(-1)
    & validation_X["Height"].notna()
    & validation_X["Weight"].notna()
]

In [43]:
# New frame with Height divided by 100 (centimetres -> metres, judging by the
# displayed values); filtered_validation_X itself is left untouched.
filtered_validation_X_metros = filtered_validation_X.assign(
    Height=filtered_validation_X["Height"] / 100
)
filtered_validation_X_metros["Height"]

576       1.575
577       1.575
578       1.575
579       1.575
580       1.575
          ...  
574988    1.524
574989    1.524
574990    1.524
574991    1.524
575184    1.727
Name: Height, Length: 23675, dtype: float64

In [49]:
# NOTE: bmi_data_validation is an alias (not a copy) of
# filtered_validation_X_metros, so the two columns added below also appear on
# that frame.
bmi_data_validation = filtered_validation_X_metros
# BMI = weight / height^2, rounded to one decimal place.
bmi_data_validation["BMI"] = (
    bmi_data_validation["Weight"] / bmi_data_validation["Height"].pow(2)
).round(1)
# classify_BMI (defined in an earlier cell) maps each BMI value to its category label.
bmi_data_validation["Classificacao"] = bmi_data_validation["BMI"].apply(classify_BMI)
bmi_data_validation.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
576,132568,0,0.0,,,,66.0,,,,...,,,,,220.0,,84.5,,34.1,Obesidade grau 1
577,132568,1,1.0,,,,66.0,,,,...,,,,,45.0,,84.5,,34.1,Obesidade grau 1
578,132568,2,2.0,,,,66.0,,,,...,,36.1,,,45.0,,84.5,,34.1,Obesidade grau 1
579,132568,3,3.0,,,,66.0,,18.0,,...,,,,,45.0,14.8,84.5,,34.1,Obesidade grau 1
580,132568,4,4.0,,,,66.0,,,,...,,,,,50.0,,84.5,,34.1,Obesidade grau 1


In [50]:
# Collapse to one row per RecordID. GroupBy.first() takes the first non-null
# value of each column independently, so a resulting row can combine values
# from several time steps (e.g. a lab value first measured after Time 0).
bmi_data_validation = bmi_data_validation.groupby("RecordID", as_index=False).first()
bmi_data_validation

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132568,0,0.0,,,,66.0,,18.0,,...,,36.10,,,220.0,14.8,84.5,,34.1,Obesidade grau 1
1,132575,0,0.0,,,,78.0,,18.0,,...,122.000000,37.40,,,38.0,12.5,63.0,7.34,22.4,Peso normal
2,132648,0,0.0,,,,87.0,,,,...,144.000000,37.80,,,1112.5,,66.0,,26.6,Sobrepeso
3,132659,0,0.0,70.0,87.0,132.0,78.0,2.7,17.0,0.8,...,116.500000,36.15,,,40.0,12.5,110.0,7.24,40.4,Obesidade grau 3
4,132685,0,0.0,,,,77.0,,,,...,,,,,,,81.8,,27.4,Sobrepeso
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
963,162893,0,0.0,,,,82.0,,12.0,,...,175.333333,37.70,,,140.0,9.1,99.8,7.52,32.5,Obesidade grau 1
964,163003,0,0.0,124.0,254.0,204.0,36.0,2.6,3.0,30.3,...,,36.30,,,50.0,10.6,57.7,7.47,21.2,Peso normal
965,163007,0,0.0,42.0,30.0,40.0,19.0,2.8,16.0,0.6,...,0.000000,40.55,,1.00,150.0,14.1,114.3,7.36,34.2,Obesidade grau 1
966,163013,0,0.0,82.0,11.0,30.0,74.0,2.5,30.0,1.2,...,118.000000,36.50,,0.03,40.0,9.6,68.6,7.35,29.5,Sobrepeso


In [51]:
# Drop records whose Gender is -1 (presumably "unknown" -- confirm), then show
# how many records remain.
bmi_data_validation = bmi_data_validation.loc[bmi_data_validation["Gender"].ne(-1.0)]
bmi_data_validation["RecordID"].count()

968

In [52]:
# Number of validation records in each BMI category.
bmi_data_validation["Classificacao"].value_counts()

Classificacao
Sobrepeso           323
Peso normal         309
Obesidade grau 1    174
Obesidade grau 2     71
Obesidade grau 3     56
Baixo peso           35
Name: count, dtype: int64

In [53]:
# RecordIDs classified as "Baixo peso" (underweight).
classificacao_baixo_peso_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"] == "Baixo peso", "RecordID"
]
# NaN count per column over every validation row belonging to those records.
classificacao_baixo_peso_missing_validation = (
    validation_X[validation_X["RecordID"].isin(classificacao_baixo_peso_ids_validation)]
    .isna()
    .sum()
)
classificacao_baixo_peso_missing_validation

RecordID          0
level_1           0
Time              0
ALP            1654
ALT            1655
AST            1655
Age              50
Albumin        1653
BUN            1551
Bilirubin      1653
Cholesterol    1677
Creatinine     1551
DiasABP         506
FiO2           1393
GCS            1170
Gender         1645
Glucose        1557
HCO3           1552
HCT            1523
HR               90
Height           50
ICUType        1645
K              1545
Lactate        1591
MAP             511
MechVent       1382
Mg             1547
NIDiasABP      1077
NIMAP          1080
NISysABP       1073
Na             1547
PaCO2          1450
PaO2           1451
Platelets      1555
RespRate       1377
SaO2           1605
SysABP          506
Temp            954
TroponinI      1671
TroponinT      1659
Urine           403
WBC            1566
Weight          869
pH             1435
dtype: int64

In [55]:
# Sanity check: number of distinct RecordIDs in the underweight group.
teste = classificacao_baixo_peso_ids_validation.unique()
len(teste)

35

In [56]:
# RecordIDs classified as "Peso normal" (normal weight).
classificacao_peso_normal_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"] == "Peso normal", "RecordID"
]
# NaN count per column over every validation row belonging to those records.
classificacao_peso_normal_missing_validation = (
    validation_X[validation_X["RecordID"].isin(classificacao_peso_normal_ids_validation)]
    .isna()
    .sum()
)
classificacao_peso_normal_missing_validation

RecordID           0
level_1            0
Time               0
ALP            14614
ALT            14607
AST            14610
Age              630
Albumin        14662
BUN            13723
Bilirubin      14612
Cholesterol    14800
Creatinine     13716
DiasABP         4928
FiO2           12160
GCS             9920
Gender         14523
Glucose        13804
HCO3           13752
HCT            13287
HR              1234
Height           630
ICUType        14523
K              13667
Lactate        14153
MAP             4971
MechVent       12169
Mg             13743
NIDiasABP       9841
NIMAP           9873
NISysABP        9838
Na             13783
PaCO2          12595
PaO2           12602
Platelets      13604
RespRate       12531
SaO2           13990
SysABP          4928
Temp            8336
TroponinI      14797
TroponinT      14696
Urine           3798
WBC            13762
Weight          7440
pH             12463
dtype: int64

In [57]:
# Sanity check: number of distinct RecordIDs in the normal-weight group.
teste = classificacao_peso_normal_ids_validation.unique()
len(teste)

309

In [58]:
# RecordIDs classified as "Sobrepeso" (overweight).
classificacao_sobrepeso_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"] == "Sobrepeso", "RecordID"
]
# NaN count per column over every validation row belonging to those records.
classificacao_sobrepeso_missing_validation = (
    validation_X[validation_X["RecordID"].isin(classificacao_sobrepeso_ids_validation)]
    .isna()
    .sum()
)
classificacao_sobrepeso_missing_validation

RecordID           0
level_1            0
Time               0
ALP            15234
ALT            15232
AST            15231
Age              635
Albumin        15328
BUN            14394
Bilirubin      15223
Cholesterol    15469
Creatinine     14389
DiasABP         5043
FiO2           13000
GCS            10896
Gender         15181
Glucose        14551
HCO3           14454
HCT            13889
HR              1322
Height           635
ICUType        15181
K              14432
Lactate        14842
MAP             5054
MechVent       13102
Mg             14402
NIDiasABP      10507
NIMAP          10534
NISysABP       10501
Na             14499
PaCO2          13167
PaO2           13174
Platelets      14250
RespRate       12773
SaO2           14479
SysABP          5042
Temp            8179
TroponinI      15479
TroponinT      15362
Urine           4170
WBC            14419
Weight          7767
pH             13018
dtype: int64

In [60]:
# Sanity check: number of distinct RecordIDs in the overweight group.
teste = classificacao_sobrepeso_ids_validation.unique()
len(teste)

323

In [61]:
# RecordIDs classified as "Obesidade grau 1" (obesity grade 1).
classificacao_obesidade_1_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"] == "Obesidade grau 1", "RecordID"
]
# NaN count per column over every validation row belonging to those records.
classificacao_obesidade_1_missing_validation = (
    validation_X[validation_X["RecordID"].isin(classificacao_obesidade_1_ids_validation)]
    .isna()
    .sum()
)
classificacao_obesidade_1_missing_validation

RecordID          0
level_1           0
Time              0
ALP            8236
ALT            8233
AST            8236
Age             351
Albumin        8266
BUN            7736
Bilirubin      8230
Cholesterol    8333
Creatinine     7735
DiasABP        2558
FiO2           6943
GCS            5780
Gender         8178
Glucose        7794
HCO3           7766
HCT            7515
HR              692
Height          351
ICUType        8178
K              7727
Lactate        7978
MAP            2605
MechVent       6954
Mg             7753
NIDiasABP      5894
NIMAP          5903
NISysABP       5892
Na             7783
PaCO2          7114
PaO2           7115
Platelets      7692
RespRate       7076
SaO2           7832
SysABP         2558
Temp           4581
TroponinI      8340
TroponinT      8252
Urine          2017
WBC            7781
Weight         3972
pH             7034
dtype: int64

In [62]:
# Sanity check: number of distinct RecordIDs in the obesity-grade-1 group.
teste = classificacao_obesidade_1_ids_validation.unique()
len(teste)

174

In [63]:
# RecordIDs classified as "Obesidade grau 2" (obesity grade 2).
classificacao_obesidade_2_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"] == "Obesidade grau 2", "RecordID"
]
# NaN count per column over every validation row belonging to those records.
classificacao_obesidade_2_missing_validation = (
    validation_X[validation_X["RecordID"].isin(classificacao_obesidade_2_ids_validation)]
    .isna()
    .sum()
)
classificacao_obesidade_2_missing_validation

RecordID          0
level_1           0
Time              0
ALP            3343
ALT            3341
AST            3341
Age             132
Albumin        3363
BUN            3123
Bilirubin      3343
Cholesterol    3403
Creatinine     3120
DiasABP        1183
FiO2           2815
GCS            2342
Gender         3337
Glucose        3141
HCO3           3125
HCT            3005
HR              263
Height          132
ICUType        3337
K              3113
Lactate        3219
MAP            1182
MechVent       2810
Mg             3134
NIDiasABP      2276
NIMAP          2318
NISysABP       2276
Na             3137
PaCO2          2812
PaO2           2812
Platelets      3090
RespRate       2751
SaO2           3144
SysABP         1182
Temp           1755
TroponinI      3404
TroponinT      3360
Urine           857
WBC            3133
Weight         1438
pH             2797
dtype: int64

In [64]:
# Sanity check: number of distinct RecordIDs in the obesity-grade-2 group.
teste = classificacao_obesidade_2_ids_validation.unique()
len(teste)

71

In [65]:
# RecordIDs classified as "Obesidade grau 3" (obesity grade 3).
classificacao_obesidade_3_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"] == "Obesidade grau 3", "RecordID"
]
# NaN count per column over every validation row belonging to those records.
classificacao_obesidade_3_missing_validation = (
    validation_X[validation_X["RecordID"].isin(classificacao_obesidade_3_ids_validation)]
    .isna()
    .sum()
)
classificacao_obesidade_3_missing_validation

RecordID          0
level_1           0
Time              0
ALP            2631
ALT            2631
AST            2630
Age              70
Albumin        2662
BUN            2485
Bilirubin      2630
Cholesterol    2686
Creatinine     2486
DiasABP         900
FiO2           2184
GCS            1879
Gender         2632
Glucose        2508
HCO3           2487
HCT            2423
HR              162
Height           70
ICUType        2632
K              2488
Lactate        2481
MAP             887
MechVent       2218
Mg             2489
NIDiasABP      1877
NIMAP          1879
NISysABP       1875
Na             2500
PaCO2          2219
PaO2           2216
Platelets      2479
RespRate       2266
SaO2           2498
SysABP          900
Temp           1482
TroponinI      2686
TroponinT      2656
Urine           570
WBC            2494
Weight         1303
pH             2204
dtype: int64

In [66]:
# Sanity check: number of distinct RecordIDs in the obesity-grade-3 group.
teste = classificacao_obesidade_3_ids_validation.unique()
len(teste)

56

In [67]:
# Summary table for the validation split: one row per entry of df_columns, one
# column per demographic group. The values are the absolute NaN counts computed
# in the cells above (isna().sum()), not normalised rates.
df_missing_validation = pd.DataFrame(columns=df_columns)
# Transposing the empty frame gives an index of df_columns and no columns yet;
# every Series assigned below is aligned on that index.
df_missing_transpose_validation = df_missing_validation.T.assign(
    **{
        "Female": female_gender_missing_rate_validation,
        "Male": male_gender_missing_rate_validation,
        "ICUType 1": ICUType_1_validation_missing,
        "ICUType 2": ICUType_2_validation_missing,
        "ICUType 3": ICUType_3_validation_missing,
        "ICUType 4": ICUType_4_validation_missing,
        "Age 65+": more_than_or_equal_to_65_validation_missing,
        "Age 65-": less_than_65_validation_missing,
        "Low Weight": classificacao_baixo_peso_missing_validation,
        "Normal Weight": classificacao_peso_normal_missing_validation,
        "Overweight": classificacao_sobrepeso_missing_validation,
        "Obesity Grade 1": classificacao_obesidade_1_missing_validation,
        "Obesity Grade 2": classificacao_obesidade_2_missing_validation,
        "Obesity Grade 3": classificacao_obesidade_3_missing_validation,
    }
)
# Heading typo fixed: "demographcs" -> "demographics".
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>Original Missing Rate per Variable by demographics - Validation</h2>"))
df_missing_transpose_validation

Unnamed: 0,Female,Male,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3
RecordID,0,0,0,0,0,0,0,0,0,0,0,0,0,0
level_1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Time,0,0,0,0,0,0,0,0,0,0,0,0,0,0
ALP,39788,50626,14333,17860,31219,27002,49852,40562,1654,14614,15234,8236,3343,2631
ALT,39772,50611,14321,17860,31210,26992,49835,40548,1655,14607,15232,8233,3341,2631
AST,39773,50616,14323,17861,31212,26993,49837,40552,1655,14610,15231,8236,3341,2630
Age,2228,2975,1241,690,2144,1128,2678,2525,50,630,635,351,132,70
Albumin,39934,50847,14402,17922,31363,27094,49974,40807,1653,14662,15328,8266,3363,2662
BUN,37508,47791,13515,16851,29500,25433,46920,38379,1551,13723,14394,7736,3123,2485
Bilirubin,39758,50603,14315,17860,31182,27004,49819,40542,1653,14612,15223,8230,3343,2630


<h3>Test data</h3>

In [69]:
# Test split from the dict returned by preprocess_physionet2012.
test_X = physionet2012_dataset['test_X']

In [80]:
# Distribution of the Gender codes present in the test split (NaN rows are not counted).
test_X["Gender"].value_counts()

Gender
 1.0    1349
 0.0    1048
-1.0       2
Name: count, dtype: int64

In [72]:
# RecordIDs of rows whose Gender equals 0.0 (the code this notebook treats as female).
female_gender_test_ids = test_X.loc[test_X["Gender"] == 0.0, "RecordID"]
# NaN count per column over every test row belonging to those records.
female_gender_missing_rate_test = (
    test_X[test_X["RecordID"].isin(female_gender_test_ids)]
    .isna()
    .sum()
)
female_gender_missing_rate_test

RecordID           0
level_1            0
Time               0
ALP            49464
ALT            49429
AST            49426
Age             2530
Albumin        49645
BUN            46610
Bilirubin      49432
Cholesterol    50221
Creatinine     46591
DiasABP        22955
FiO2           42354
GCS            33533
Gender         49256
Glucose        46785
HCO3           46679
HCT            45694
HR              4242
Height          2530
ICUType        49256
K              46433
Lactate        48281
MAP            23170
MechVent       42619
Mg             46738
NIDiasABP      28790
NIMAP          29122
NISysABP       28770
Na             46642
PaCO2          44830
PaO2           44835
Platelets      46680
RespRate       36783
SaO2           48313
SysABP         22955
Temp           32219
TroponinI      50187
TroponinT      49762
Urine          14897
WBC            46931
Weight         23248
pH             44590
dtype: int64

In [76]:
# Sanity check: number of distinct female RecordIDs.
len(female_gender_test_ids.unique())

1048

In [73]:
# RecordIDs of rows whose Gender equals 1.0 (the code this notebook treats as male).
male_gender_test_ids = test_X.loc[test_X["Gender"] == 1.0, "RecordID"]
# NaN count per column over every test row belonging to those records.
male_gender_missing_rate_test = (
    test_X[test_X["RecordID"].isin(male_gender_test_ids)]
    .isna()
    .sum()
)
male_gender_missing_rate_test

RecordID           0
level_1            0
Time               0
ALP            63749
ALT            63719
AST            63721
Age             4186
Albumin        63977
BUN            60059
Bilirubin      63730
Cholesterol    64643
Creatinine     60037
DiasABP        28610
FiO2           54326
GCS            44070
Gender         63403
Glucose        60397
HCO3           60182
HCT            58343
HR              7039
Height          4186
ICUType        63403
K              59861
Lactate        61977
MAP            28790
MechVent       54823
Mg             60157
NIDiasABP      39028
NIMAP          39336
NISysABP       39000
Na             60191
PaCO2          56830
PaO2           56844
Platelets      59895
RespRate       50841
SaO2           61997
SysABP         28607
Temp           40134
TroponinI      64630
TroponinT      64041
Urine          20083
WBC            60378
Weight         30712
pH             56418
dtype: int64

In [77]:
# Sanity check: number of distinct male RecordIDs.
len(male_gender_test_ids.unique())

1349

In [81]:
# RecordIDs whose Time == 0 row reports ICUType 1. Rows with Gender == -1 are
# excluded (presumably the "unknown gender" code -- confirm); NaN Gender rows
# still pass the != comparison.
ICUType_1_test_ids = test_X.loc[
    (test_X["ICUType"] == 1.0)
    & (test_X["Gender"] != -1.0)
    & (test_X["Time"] == 0.0),
    "RecordID",
]
# NaN count per column over every row belonging to those records.
ICUType_1_test_missing = (
    test_X[test_X["RecordID"].isin(ICUType_1_test_ids)]
    .isna()
    .sum()
)
ICUType_1_test_missing

RecordID           0
level_1            0
Time               0
ALP            16826
ALT            16810
AST            16812
Age             1360
Albumin        16883
BUN            15822
Bilirubin      16827
Cholesterol    16967
Creatinine     15802
DiasABP        10152
FiO2           15221
GCS            12635
Gender         16732
Glucose        15883
HCO3           15872
HCT            15628
HR              2012
Height          1360
ICUType        16732
K              15612
Lactate        16691
MAP            10193
MechVent       15369
Mg             15839
NIDiasABP       8686
NIMAP           8712
NISysABP        8677
Na             15874
PaCO2          15681
PaO2           15676
Platelets      15871
RespRate       10744
SaO2           16231
SysABP         10152
Temp           12017
TroponinI      17040
TroponinT      16705
Urine           7347
WBC            15998
Weight          9137
pH             15642
dtype: int64

In [82]:
# Sanity check: number of distinct RecordIDs in ICUType 1.
len(ICUType_1_test_ids.unique())

356

In [83]:
# RecordIDs whose Time == 0 row reports ICUType 2. Rows with Gender == -1 are
# excluded (presumably the "unknown gender" code -- confirm); NaN Gender rows
# still pass the != comparison.
ICUType_2_test_ids = test_X.loc[
    (test_X["ICUType"] == 2.0)
    & (test_X["Gender"] != -1.0)
    & (test_X["Time"] == 0.0),
    "RecordID",
]
# NaN count per column over every row belonging to those records.
ICUType_2_test_missing = (
    test_X[test_X["RecordID"].isin(ICUType_2_test_ids)]
    .isna()
    .sum()
)
ICUType_2_test_missing

RecordID           0
level_1            0
Time               0
ALP            24702
ALT            24694
AST            24697
Age              845
Albumin        24755
BUN            23260
Bilirubin      24697
Cholesterol    24859
Creatinine     23250
DiasABP         5182
FiO2           20687
GCS            18175
Gender         24346
Glucose        23721
HCO3           23374
HCT            22140
HR              2058
Height           845
ICUType        24346
K              23566
Lactate        23957
MAP             5187
MechVent       20701
Mg             23343
NIDiasABP      19488
NIMAP          19559
NISysABP       19481
Na             23621
PaCO2          19935
PaO2           19955
Platelets      22862
RespRate       23737
SaO2           22156
SysABP          5181
Temp           10478
TroponinI      24847
TroponinT      24803
Urine           3837
WBC            23177
Weight         11569
pH             19434
dtype: int64

In [84]:
# Sanity check: number of distinct RecordIDs in ICUType 2.
len(ICUType_2_test_ids.unique())

518

In [85]:
# RecordIDs whose Time == 0 row reports ICUType 3. Rows with Gender == -1 are
# excluded (presumably the "unknown gender" code -- confirm); NaN Gender rows
# still pass the != comparison.
ICUType_3_test_ids = test_X.loc[
    (test_X["ICUType"] == 3.0)
    & (test_X["Gender"] != -1.0)
    & (test_X["Time"] == 0.0),
    "RecordID",
]
# NaN count per column over every row belonging to those records.
ICUType_3_test_missing = (
    test_X[test_X["RecordID"].isin(ICUType_3_test_ids)]
    .isna()
    .sum()
)
ICUType_3_test_missing

RecordID           0
level_1            0
Time               0
ALP            39280
ALT            39246
AST            39243
Age             2905
Albumin        39509
BUN            37160
Bilirubin      39229
Cholesterol    40147
Creatinine     37150
DiasABP        25936
FiO2           34082
GCS            29333
Gender         39339
Glucose        37152
HCO3           37144
HCT            36624
HR              4311
Height          2905
ICUType        39339
K              36913
Lactate        38652
MAP            26166
MechVent       34556
Mg             37337
NIDiasABP      17499
NIMAP          17931
NISysABP       17484
Na             37077
PaCO2          37113
PaO2           37111
Platelets      37433
RespRate       27996
SaO2           39688
SysABP         25936
Temp           28667
TroponinI      40065
TroponinT      39617
Urine          15821
WBC            37550
Weight         14271
pH             37051
dtype: int64

In [86]:
# Sanity check: number of distinct RecordIDs in ICUType 3.
len(ICUType_3_test_ids.unique())

837

In [87]:
# RecordIDs whose Time == 0 row reports ICUType 4. Rows with Gender == -1 are
# excluded (presumably the "unknown gender" code -- confirm); NaN Gender rows
# still pass the != comparison.
ICUType_4_test_ids = test_X.loc[
    (test_X["ICUType"] == 4.0)
    & (test_X["Gender"] != -1.0)
    & (test_X["Time"] == 0.0),
    "RecordID",
]
# NaN count per column over every row belonging to those records.
ICUType_4_test_missing = (
    test_X[test_X["RecordID"].isin(ICUType_4_test_ids)]
    .isna()
    .sum()
)
ICUType_4_test_missing

RecordID           0
level_1            0
Time               0
ALP            32405
ALT            32398
AST            32395
Age             1606
Albumin        32475
BUN            30427
Bilirubin      32409
Cholesterol    32891
Creatinine     30426
DiasABP        10295
FiO2           26690
GCS            17460
Gender         32242
Glucose        30426
HCO3           30471
HCT            29645
HR              2900
Height          1606
ICUType        32242
K              30203
Lactate        30958
MAP            10414
MechVent       26816
Mg             30376
NIDiasABP      22145
NIMAP          22256
NISysABP       22128
Na             30261
PaCO2          28931
PaO2           28937
Platelets      30409
RespRate       25147
SaO2           32235
SysABP         10293
Temp           21191
TroponinI      32865
TroponinT      32678
Urine           7975
WBC            30584
Weight         18983
pH             28881
dtype: int64

In [88]:
# Sanity check: number of distinct RecordIDs in ICUType 4.
len(ICUType_4_test_ids.unique())

686

In [89]:
# RecordIDs whose Time == 0 row has Age >= 65. Rows with Gender == -1 are
# excluded (presumably the "unknown gender" code -- confirm); NaN Gender rows
# still pass the != comparison.
more_than_or_equal_to_65_test_ids = test_X.loc[
    (test_X["Age"] >= 65)
    & (test_X["Gender"] != -1.0)
    & (test_X["Time"] == 0.0),
    "RecordID",
]
# NaN count per column over every row belonging to those records.
more_than_or_equal_to_65_test_missing = (
    test_X[test_X["RecordID"].isin(more_than_or_equal_to_65_test_ids)]
    .isna()
    .sum()
)
more_than_or_equal_to_65_test_missing

RecordID           0
level_1            0
Time               0
ALP            63044
ALT            63017
AST            63019
Age             3329
Albumin        63206
BUN            59389
Bilirubin      63024
Cholesterol    63818
Creatinine     59360
DiasABP        28357
FiO2           53799
GCS            43958
Gender         62604
Glucose        59742
HCO3           59499
HCT            57818
HR              5702
Height          3329
ICUType        62604
K              59223
Lactate        61418
MAP            28528
MechVent       54569
Mg             59516
NIDiasABP      36956
NIMAP          37252
NISysABP       36924
Na             59555
PaCO2          56459
PaO2           56474
Platelets      59318
RespRate       48309
SaO2           60961
SysABP         28357
Temp           39228
TroponinI      63775
TroponinT      63083
Urine          18174
WBC            59733
Weight         28766
pH             56078
dtype: int64

In [90]:
# Sanity check: number of distinct RecordIDs aged 65 or more.
len(more_than_or_equal_to_65_test_ids.unique())

1332

In [91]:
# RecordIDs whose Time == 0 row has Age < 65. Rows with Gender == -1 are
# excluded (presumably the "unknown gender" code -- confirm); NaN Gender rows
# still pass the != comparison.
less_than_65_test_ids = test_X.loc[
    (test_X["Age"] < 65)
    & (test_X["Gender"] != -1.0)
    & (test_X["Time"] == 0.0),
    "RecordID",
]
# NaN count per column over every row belonging to those records.
less_than_65_test_missing = (
    test_X[test_X["RecordID"].isin(less_than_65_test_ids)]
    .isna()
    .sum()
)
less_than_65_test_missing

RecordID           0
level_1            0
Time               0
ALP            50169
ALT            50131
AST            50128
Age             3387
Albumin        50416
BUN            47280
Bilirubin      50138
Cholesterol    51046
Creatinine     47268
DiasABP        23208
FiO2           42881
GCS            33645
Gender         50055
Glucose        47440
HCO3           47362
HCT            46219
HR              5579
Height          3387
ICUType        50055
K              47071
Lactate        48840
MAP            23432
MechVent       42873
Mg             47379
NIDiasABP      30862
NIMAP          31206
NISysABP       30846
Na             47278
PaCO2          45201
PaO2           45205
Platelets      47257
RespRate       39315
SaO2           49349
SysABP         23205
Temp           33125
TroponinI      51042
TroponinT      50720
Urine          16806
WBC            47576
Weight         25194
pH             44930
dtype: int64

In [93]:
# Number of distinct RecordIDs selected for the under-65 test group.
len(less_than_65_test_ids.unique())

1065

In [95]:
# Keep only the test rows where Height and Weight are both present (not NaN)
# and different from -1, so a BMI can be computed from them.
height_usable_test = (test_X['Height'] != -1) & test_X['Height'].notna()
weight_usable_test = (test_X['Weight'] != -1) & test_X['Weight'].notna()
filtered_test_X = test_X[height_usable_test & weight_usable_test]

In [96]:
# New frame with Height divided by 100.
# NOTE(review): presumably converts centimetres to metres ("metros") -- confirm units.
filtered_test_X_metros = filtered_test_X.assign(Height=filtered_test_X["Height"] / 100)
filtered_test_X_metros["Height"]

48        1.753
67        1.753
68        1.753
69        1.753
70        1.753
          ...  
575321    1.727
575322    1.727
575323    1.727
575325    1.727
575327    1.727
Name: Height, Length: 32148, dtype: float64

In [97]:
# Plain assignment, not a copy: bmi_data_test and filtered_test_X_metros are the
# same object, so the two columns added below appear on both names.
bmi_data_test = filtered_test_X_metros

# BMI = Weight / Height**2, rounded to one decimal place.
# NOTE(review): assumes Weight is in kg and Height in metres -- confirm.
height_squared_test = bmi_data_test["Height"] ** 2
bmi_data_test["BMI"] = round(bmi_data_test["Weight"] / height_squared_test, 1)

# Label each row's BMI with classify_BMI (defined elsewhere in the notebook).
bmi_data_test["Classificacao"] = bmi_data_test["BMI"].apply(classify_BMI)
bmi_data_test.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
48,132540,0,0.0,,,,76.0,,,,...,,,,,,,76.0,7.45,24.7,Peso normal
67,132540,19,19.0,,,,76.0,,,,...,122.0,37.5,,,50.0,,80.6,,26.2,Sobrepeso
68,132540,20,20.0,,,,76.0,,,,...,107.0,37.4,,,380.0,,80.6,,26.2,Sobrepeso
69,132540,21,21.0,,,,76.0,,,,...,121.0,37.5,,,170.0,,80.6,,26.2,Sobrepeso
70,132540,22,22.0,,,,76.0,,,,...,128.0,37.5,,,130.0,,80.6,,26.2,Sobrepeso


In [98]:
# Collapse to one row per RecordID.
# GroupBy.first() takes the first non-null value of each column independently,
# so a resulting row can combine values from different time steps of a record.
first_values_per_record_test = bmi_data_test.groupby("RecordID").first()

# Turn RecordID back into a regular column.
bmi_data_test = first_values_per_record_test.reset_index()
bmi_data_test

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132540,0,0.0,,,,76.0,,21.0,,...,122.00,37.50,,,50.0,13.3,76.0,7.45,24.7,Peso normal
1,132548,0,0.0,,,,68.0,,32.0,,...,205.00,36.30,0.7,,120.0,6.2,87.0,,32.9,Obesidade grau 1
2,132567,0,0.0,,,,71.0,,9.0,,...,111.50,35.60,,,15.0,9.0,56.0,7.44,22.6,Peso normal
3,132585,0,0.0,,,,40.0,,,,...,90.50,,,,320.0,,84.7,,31.1,Obesidade grau 1
4,132601,0,0.0,,,,74.0,,,,...,,,,,,,75.9,7.39,24.0,Peso normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1276,162944,0,0.0,,,,58.0,,21.0,,...,115.00,35.15,,,400.0,19.6,121.1,7.56,47.3,Obesidade grau 3
1277,162980,0,0.0,,,,88.0,,31.0,,...,133.00,38.00,,,45.0,17.6,76.5,7.38,29.9,Sobrepeso
1278,162991,0,0.0,,,,56.0,,,,...,155.25,,,,,,96.2,,33.7,Obesidade grau 1
1279,162999,0,0.0,,,,70.0,,30.0,,...,0.00,36.30,,,,2.5,68.1,,20.4,Peso normal


In [100]:
# Drop records whose Gender is -1.0, then show the non-null count per column.
# NOTE(review): -1.0 is presumably a "not recorded" sentinel for Gender -- confirm.
gender_not_minus_one_test = bmi_data_test["Gender"] != -1.0
bmi_data_test = bmi_data_test[gender_not_minus_one_test]
bmi_data_test.count()

RecordID         1281
level_1          1281
Time             1281
ALP               303
ALT               313
AST               313
Age              1281
Albumin           262
BUN               887
Bilirubin         307
Cholesterol        55
Creatinine        890
DiasABP           896
FiO2              699
GCS              1050
Gender           1281
Glucose           834
HCO3              864
HCT               910
HR               1112
Height           1281
ICUType          1281
K                 853
Lactate           468
MAP               892
MechVent          699
Mg                846
NIDiasABP         853
NIMAP             847
NISysABP          857
Na                859
PaCO2             835
PaO2              832
Platelets         899
RespRate          193
SaO2              518
SysABP            896
Temp             1070
TroponinI          41
TroponinT         149
Urine            1032
WBC               888
Weight           1281
pH                851
BMI              1281
Classifica

In [101]:
# How many test records fall into each BMI class.
bmi_class_counts_test = bmi_data_test["Classificacao"].value_counts()
bmi_class_counts_test

Classificacao
Sobrepeso           442
Peso normal         394
Obesidade grau 1    234
Obesidade grau 2     94
Obesidade grau 3     89
Baixo peso           28
Name: count, dtype: int64

In [102]:
# RecordIDs of the test records classified as "Baixo peso" (underweight).
classificacao_baixo_peso_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"] == "Baixo peso", "RecordID"
]

# Per-column NaN counts over all test rows of those records.
classificacao_baixo_peso_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_baixo_peso_ids_test)]
    .isna()
    .sum()
)
classificacao_baixo_peso_missing_test

RecordID          0
level_1           0
Time              0
ALP            1331
ALT            1331
AST            1331
Age              52
Albumin        1333
BUN            1260
Bilirubin      1331
Cholesterol    1341
Creatinine     1260
DiasABP         505
FiO2           1103
GCS             901
Gender         1316
Glucose        1266
HCO3           1263
HCT            1244
HR               74
Height           52
ICUType        1316
K              1261
Lactate        1309
MAP             508
MechVent       1122
Mg             1269
NIDiasABP       864
NIMAP           877
NISysABP        864
Na             1266
PaCO2          1196
PaO2           1189
Platelets      1265
RespRate       1168
SaO2           1269
SysABP          505
Temp            725
TroponinI      1339
TroponinT      1331
Urine           366
WBC            1269
Weight          526
pH             1179
dtype: int64

In [103]:
# Number of distinct RecordIDs in the "Baixo peso" (underweight) test group.
len(classificacao_baixo_peso_ids_test.unique())

28

In [104]:
# RecordIDs of the test records classified as "Peso normal" (normal weight).
classificacao_normal_peso_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"] == "Peso normal", "RecordID"
]

# Per-column NaN counts over all test rows of those records.
classificacao_normal_peso_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_normal_peso_ids_test)]
    .isna()
    .sum()
)
classificacao_normal_peso_missing_test

RecordID           0
level_1            0
Time               0
ALP            18633
ALT            18623
AST            18622
Age              820
Albumin        18701
BUN            17540
Bilirubin      18623
Cholesterol    18878
Creatinine     17534
DiasABP         6409
FiO2           15801
GCS            12832
Gender         18518
Glucose        17693
HCO3           17584
HCT            16939
HR              1640
Height           820
ICUType        18518
K              17521
Lactate        18026
MAP             6473
MechVent       15912
Mg             17562
NIDiasABP      12858
NIMAP          12916
NISysABP       12850
Na             17631
PaCO2          16156
PaO2           16166
Platelets      17396
RespRate       15386
SaO2           17673
SysABP          6409
Temp           10313
TroponinI      18868
TroponinT      18730
Urine           5009
WBC            17568
Weight          9033
pH             15974
dtype: int64

In [105]:
# Number of distinct RecordIDs in the "Peso normal" (normal weight) test group.
len(classificacao_normal_peso_ids_test.unique())

394

In [106]:
# RecordIDs of the test records classified as "Sobrepeso" (overweight).
classificacao_sobrepeso_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"] == "Sobrepeso", "RecordID"
]

# Per-column NaN counts over all test rows of those records.
classificacao_sobrepeso_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_sobrepeso_ids_test)]
    .isna()
    .sum()
)
classificacao_sobrepeso_missing_test

RecordID           0
level_1            0
Time               0
ALP            20877
ALT            20855
AST            20857
Age              957
Albumin        20964
BUN            19666
Bilirubin      20872
Cholesterol    21177
Creatinine     19651
DiasABP         6506
FiO2           17803
GCS            14406
Gender         20774
Glucose        19824
HCO3           19717
HCT            18969
HR              1862
Height           957
ICUType        20774
K              19653
Lactate        20265
MAP             6538
MechVent       17706
Mg             19659
NIDiasABP      14623
NIMAP          14676
NISysABP       14612
Na             19778
PaCO2          17956
PaO2           17967
Platelets      19488
RespRate       18048
SaO2           19783
SysABP          6505
Temp           11261
TroponinI      21152
TroponinT      20999
Urine           5380
WBC            19696
Weight         10780
pH             17749
dtype: int64

In [107]:
# Number of distinct RecordIDs in the "Sobrepeso" (overweight) test group.
len(classificacao_sobrepeso_ids_test.unique())

442

In [108]:
# RecordIDs of the test records classified as "Obesidade grau 1" (obesity grade 1).
classificacao_obesidade_1_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"] == "Obesidade grau 1", "RecordID"
]

# Per-column NaN counts over all test rows of those records.
classificacao_obesidade_1_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_obesidade_1_ids_test)]
    .isna()
    .sum()
)
classificacao_obesidade_1_missing_test

RecordID           0
level_1            0
Time               0
ALP            11032
ALT            11026
AST            11027
Age              460
Albumin        11087
BUN            10365
Bilirubin      11023
Cholesterol    11213
Creatinine     10361
DiasABP         3496
FiO2            9097
GCS             7842
Gender         10998
Glucose        10480
HCO3           10409
HCT             9980
HR               988
Height           460
ICUType        10998
K              10381
Lactate        10674
MAP             3523
MechVent        9168
Mg             10425
NIDiasABP       7819
NIMAP           7860
NISysABP        7812
Na             10437
PaCO2           9413
PaO2            9416
Platelets      10291
RespRate        9815
SaO2           10479
SysABP          3496
Temp            6040
TroponinI      11202
TroponinT      11133
Urine           2639
WBC            10400
Weight          5034
pH              9298
dtype: int64

In [109]:
# Number of distinct RecordIDs in the "Obesidade grau 1" test group.
len(classificacao_obesidade_1_ids_test.unique())

234

In [112]:
# RecordIDs of the test records classified as "Obesidade grau 2" (obesity grade 2).
classificacao_obesidade_2_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"] == "Obesidade grau 2", "RecordID"
]

# Per-column NaN counts over all test rows of those records.
classificacao_obesidade_2_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_obesidade_2_ids_test)]
    .isna()
    .sum()
)
classificacao_obesidade_2_missing_test

RecordID          0
level_1           0
Time              0
ALP            4447
ALT            4447
AST            4447
Age             183
Albumin        4469
BUN            4173
Bilirubin      4446
Cholesterol    4503
Creatinine     4172
DiasABP        1420
FiO2           3655
GCS            3116
Gender         4418
Glucose        4213
HCO3           4188
HCT            4057
HR              358
Height          183
ICUType        4418
K              4183
Lactate        4296
MAP            1424
MechVent       3712
Mg             4199
NIDiasABP      3132
NIMAP          3146
NISysABP       3128
Na             4195
PaCO2          3789
PaO2           3789
Platelets      4155
RespRate       3797
SaO2           4203
SysABP         1420
Temp           2293
TroponinI      4504
TroponinT      4462
Urine          1005
WBC            4193
Weight         2114
pH             3754
dtype: int64

In [113]:
# Number of distinct RecordIDs in the "Obesidade grau 2" test group.
len(classificacao_obesidade_2_ids_test.unique())

94

In [114]:
# RecordIDs of the test records classified as "Obesidade grau 3" (obesity grade 3).
classificacao_obesidade_3_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"] == "Obesidade grau 3", "RecordID"
]

# Per-column NaN counts over all test rows of those records.
classificacao_obesidade_3_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_obesidade_3_ids_test)]
    .isna()
    .sum()
)
classificacao_obesidade_3_missing_test

RecordID          0
level_1           0
Time              0
ALP            4194
ALT            4189
AST            4190
Age             148
Albumin        4211
BUN            3934
Bilirubin      4189
Cholesterol    4264
Creatinine     3934
DiasABP        1436
FiO2           3381
GCS            3050
Gender         4183
Glucose        3964
HCO3           3947
HCT            3837
HR              306
Height          148
ICUType        4183
K              3934
Lactate        3994
MAP            1472
MechVent       3398
Mg             3941
NIDiasABP      2875
NIMAP          2910
NISysABP       2875
Na             3957
PaCO2          3497
PaO2           3495
Platelets      3932
RespRate       3766
SaO2           3928
SysABP         1436
Temp           2475
TroponinI      4263
TroponinT      4216
Urine          1089
WBC            3970
Weight         1853
pH             3477
dtype: int64

In [115]:
# Number of distinct RecordIDs in the "Obesidade grau 3" test group.
len(classificacao_obesidade_3_ids_test.unique())

89

In [116]:
# Summary table for the test split: one row per variable (the entries of
# df_columns, defined elsewhere) and one column per demographic group.
df_missing_test = pd.DataFrame(columns=df_columns)
df_missing_transpose_test = df_missing_test.T

# Column label -> per-variable missing-value Series, in display order.
# NOTE(review): the age and BMI Series are raw NaN counts (`.isna().sum()`), not
# rates, even though the heading below says "Missing rate"; the gender and
# ICUType Series are built outside this view and presumably are counts too.
# Counts are not directly comparable across groups of different sizes.
missing_by_group_test = {
    "Female": female_gender_missing_rate_test,
    "Male": male_gender_missing_rate_test,
    "ICUType 2": ICUType_2_test_missing,
    "ICUType 3": ICUType_3_test_missing,
    "ICUType 4": ICUType_4_test_missing,
    "Age 65+": more_than_or_equal_to_65_test_missing,
    "Age 65-": less_than_65_test_missing,
    "Low Weight": classificacao_baixo_peso_missing_test,
    "Normal Weight": classificacao_normal_peso_missing_test,
    "Overweight": classificacao_sobrepeso_missing_test,
    "Obesity Grade 1": classificacao_obesidade_1_missing_test,
    "Obesity Grade 2": classificacao_obesidade_2_missing_test,
    "Obesity Grade 3": classificacao_obesidade_3_missing_test,
}
for group_label, group_missing in missing_by_group_test.items():
    # Each Series is aligned to the table's row index (the variable names).
    df_missing_transpose_test[group_label] = group_missing

display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>original Missing rate per Variable by demographics - Test</h2>"))
df_missing_transpose_test

Unnamed: 0,Female,Male,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3
RecordID,0,0,0,0,0,0,0,0,0,0,0,0,0
level_1,0,0,0,0,0,0,0,0,0,0,0,0,0
Time,0,0,0,0,0,0,0,0,0,0,0,0,0
ALP,49464,63749,24702,39280,32405,63044,50169,1331,18633,20877,11032,4447,4194
ALT,49429,63719,24694,39246,32398,63017,50131,1331,18623,20855,11026,4447,4189
AST,49426,63721,24697,39243,32395,63019,50128,1331,18622,20857,11027,4447,4190
Age,2530,4186,845,2905,1606,3329,3387,52,820,957,460,183,148
Albumin,49645,63977,24755,39509,32475,63206,50416,1333,18701,20964,11087,4469,4211
BUN,46610,60059,23260,37160,30427,59389,47280,1260,17540,19666,10365,4173,3934
Bilirubin,49432,63730,24697,39229,32409,63024,50138,1331,18623,20872,11023,4446,4189
