In [1]:
import os
import sys
import pandas as pd
from IPython.display import display, HTML
# Put the parent directory of the working directory on sys.path (once) so the
# package imported in the next cell can be resolved from there.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

<h2>Loading dataset</h2>

In [2]:
from pypotsModify.benchpots.datasets import preprocess_physionet2012
# Load the preprocessed PhysioNet-2012 data through the project's modified
# benchpots loader. The result is indexed by split name below (e.g. 'train_X').
# NOTE(review): `rate=0.1` is presumably the artificial-missingness rate used
# by the loader -- confirm against pypotsModify.
physionet2012_dataset = preprocess_physionet2012(subset="all", rate=0.1)

2024-11-28 09:24:11 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-11-28 09:24:11 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-11-28 09:24:11 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-11-28 09:24:11 [INFO]: Loaded successfully!


<h3>Training data</h3>

<h4>Loading training dataset</h4>

In [3]:
# Training split. The cells below treat it as a long-format DataFrame with one
# row per (RecordID, Time) pair and one column per clinical variable.
train_X = physionet2012_dataset['train_X']

In [4]:
# RecordIDs of every row whose Gender equals 0.0 (the cohort this notebook
# labels as female).
female_gender_ids = train_X.loc[train_X["Gender"] == 0.0, "RecordID"]
# Per-column NaN counts over all rows belonging to those records.
# Despite the "_rate" suffix, this holds absolute counts, not a ratio.
female_gender_missing_rate = (
    train_X.loc[train_X["RecordID"].isin(female_gender_ids)]
    .isna()
    .sum()
)
female_gender_missing_rate

RecordID            0
level_1             0
Time                0
ALP            159436
ALT            159345
AST            159354
Age              9351
Albumin        160042
BUN            150357
Bilirubin      159307
Cholesterol    161890
Creatinine     150310
DiasABP         77798
FiO2           136874
GCS            109812
Gender         158766
Glucose        150879
HCO3           150546
HCT            147230
HR              15395
Height           9351
ICUType        158766
K              149674
Lactate        155516
MAP             78281
MechVent       137560
Mg             150577
NIDiasABP       90650
NIMAP           91722
NISysABP        90601
Na             150406
PaCO2          144499
PaO2           144524
Platelets      150419
RespRate       120463
SaO2           156022
SysABP          77793
Temp           104488
TroponinI      161817
TroponinT      160355
Urine           48822
WBC            151214
Weight          75002
pH             143862
dtype: int64

In [5]:
# RecordIDs of every row whose Gender equals 1.0 (the cohort this notebook
# labels as male).
male_gender_ids = train_X.loc[train_X["Gender"] == 1.0, "RecordID"]
# Per-column NaN counts over all rows belonging to those records.
# Despite the "_rate" suffix, this holds absolute counts, not a ratio.
male_gender_missing_rate = (
    train_X.loc[train_X["RecordID"].isin(male_gender_ids)]
    .isna()
    .sum()
)
male_gender_missing_rate

RecordID            0
level_1             0
Time                0
ALP            202420
ALT            202330
AST            202326
Age             11939
Albumin        203241
BUN            190813
Bilirubin      202318
Cholesterol    205471
Creatinine     190740
DiasABP         90367
FiO2           173191
GCS            139783
Gender         201536
Glucose        191845
HCO3           191195
HCT            185770
HR              20496
Height          11939
ICUType        201536
K              190240
Lactate        197128
MAP             90961
MechVent       174772
Mg             191146
NIDiasABP      121826
NIMAP          122888
NISysABP       121749
Na             191241
PaCO2          180704
PaO2           180753
Platelets      190217
RespRate       160178
SaO2           197235
SysABP          90356
Temp           126917
TroponinI      205436
TroponinT      203591
Urine           64036
WBC            191785
Weight          99105
pH             179454
dtype: int64

In [6]:
# RecordIDs of every row whose Gender equals -1.0 (the cohort this notebook
# labels as undefined gender).
undefined_gender_ids = train_X.loc[train_X["Gender"] == -1.0, "RecordID"]
# Per-column NaN counts over all rows belonging to those records.
# Despite the "_rate" suffix, this holds absolute counts, not a ratio.
undefined_gender_missing_rate = (
    train_X.loc[train_X["RecordID"].isin(undefined_gender_ids)]
    .isna()
    .sum()
)
undefined_gender_missing_rate

RecordID         0
level_1          0
Time             0
ALP            238
ALT            237
AST            237
Age             94
Albumin        238
BUN            228
Bilirubin      238
Cholesterol    240
Creatinine     228
DiasABP        118
FiO2           213
GCS            195
Gender         235
Glucose        228
HCO3           228
HCT            229
HR             103
Height          94
ICUType        235
K              227
Lactate        239
MAP            123
MechVent       214
Mg             229
NIDiasABP      200
NIMAP          200
NISysABP       200
Na             228
PaCO2          223
PaO2           223
Platelets      229
RespRate       194
SaO2           240
SysABP         118
Temp           193
TroponinI      240
TroponinT      233
Urine          136
WBC            228
Weight         112
pH             223
dtype: int64

In [7]:
# RecordIDs whose Time == 0.0 row reports ICUType == 1.0.
ICUType_1_training_ids = train_X.loc[
    (train_X["ICUType"] == 1.0) & (train_X["Time"] == 0.0),
    "RecordID",
]
# Per-column NaN counts over every row of the selected records.
ICUType_1_training_missing = (
    train_X.loc[train_X["RecordID"].isin(ICUType_1_training_ids)]
    .isna()
    .sum()
)
ICUType_1_training_missing

RecordID           0
level_1            0
Time               0
ALP            52617
ALT            52571
AST            52574
Age             4399
Albumin        52816
BUN            49556
Bilirubin      52586
Cholesterol    53108
Creatinine     49485
DiasABP        31460
FiO2           47566
GCS            39739
Gender         52311
Glucose        49731
HCO3           49693
HCT            48660
HR              6801
Height          4399
ICUType        52311
K              48937
Lactate        52187
MAP            31529
MechVent       47885
Mg             49602
NIDiasABP      27639
NIMAP          27780
NISysABP       27621
Na             49723
PaCO2          48974
PaO2           48972
Platelets      49506
RespRate       34995
SaO2           50771
SysABP         31459
Temp           37272
TroponinI      53242
TroponinT      52280
Urine          22822
WBC            49907
Weight         29760
pH             48876
dtype: int64

In [8]:
# RecordIDs whose Time == 0.0 row reports ICUType == 2.0.
ICUType_2_training_ids = train_X.loc[
    (train_X["ICUType"] == 2.0) & (train_X["Time"] == 0.0),
    "RecordID",
]
# Per-column NaN counts over every row of the selected records.
ICUType_2_training_missing = (
    train_X.loc[train_X["RecordID"].isin(ICUType_2_training_ids)]
    .isna()
    .sum()
)
ICUType_2_training_missing

RecordID           0
level_1            0
Time               0
ALP            76742
ALT            76728
AST            76728
Age             2482
Albumin        76917
BUN            72284
Bilirubin      76740
Cholesterol    77260
Creatinine     72267
DiasABP        16313
FiO2           64155
GCS            56694
Gender         75670
Glucose        73700
HCO3           72661
HCT            68831
HR              6323
Height          2482
ICUType        75670
K              73196
Lactate        74600
MAP            16264
MechVent       64410
Mg             72407
NIDiasABP      60783
NIMAP          60906
NISysABP       60754
Na             73404
PaCO2          61477
PaO2           61526
Platelets      71005
RespRate       73871
SaO2           68906
SysABP         16312
Temp           32286
TroponinI      77186
TroponinT      77087
Urine          12220
WBC            72053
Weight         37485
pH             60003
dtype: int64

In [9]:
# RecordIDs whose Time == 0.0 row reports ICUType == 3.0.
ICUType_3_training_ids = train_X.loc[
    (train_X["ICUType"] == 3.0) & (train_X["Time"] == 0.0),
    "RecordID",
]
# Per-column NaN counts over every row of the selected records.
ICUType_3_training_missing = (
    train_X.loc[train_X["RecordID"].isin(ICUType_3_training_ids)]
    .isna()
    .sum()
)
ICUType_3_training_missing

RecordID            0
level_1             0
Time                0
ALP            129601
ALT            129506
AST            129509
Age              9694
Albumin        130283
BUN            122430
Bilirubin      129391
Cholesterol    132408
Creatinine     122405
DiasABP         85390
FiO2           112905
GCS             97259
Gender         129767
Glucose        122456
HCO3           122393
HCT            120813
HR              14267
Height           9694
ICUType        129767
K              121626
Lactate        127236
MAP             85952
MechVent       114332
Mg             123003
NIDiasABP       57304
NIMAP           58624
NISysABP        57269
Na             122175
PaCO2          122285
PaO2           122282
Platelets      123329
RespRate        92914
SaO2           130955
SysABP          85383
Temp            94461
TroponinI      132226
TroponinT      130692
Urine           52271
WBC            123752
Weight          46864
pH             122134
dtype: int64

In [10]:
# RecordIDs whose Time == 0.0 row reports ICUType == 4.0.
ICUType_4_training_ids = train_X.loc[
    (train_X["ICUType"] == 4.0) & (train_X["Time"] == 0.0),
    "RecordID",
]
# Per-column NaN counts over every row of the selected records.
ICUType_4_training_missing = (
    train_X.loc[train_X["RecordID"].isin(ICUType_4_training_ids)]
    .isna()
    .sum()
)
ICUType_4_training_missing

RecordID            0
level_1             0
Time                0
ALP            103134
ALT            103107
AST            103106
Age              4809
Albumin        103505
BUN             97128
Bilirubin      103146
Cholesterol    104825
Creatinine      97121
DiasABP         35120
FiO2            85652
GCS             56098
Gender         102789
Glucose         97065
HCO3            97222
HCT             94925
HR               8603
Height           4809
ICUType        102789
K               96382
Lactate         98860
MAP             35620
MechVent        85919
Mg              96940
NIDiasABP       66950
NIMAP           67500
NISysABP        66906
Na              96573
PaCO2           92690
PaO2            92720
Platelets       97025
RespRate        79055
SaO2           102865
SysABP          35113
Temp            67579
TroponinI      104839
TroponinT      104120
Urine           25681
WBC             97515
Weight          60110
pH              92526
dtype: int64

In [11]:
# RecordIDs whose Time == 0.0 row reports Age >= 65.
more_than_or_equal_to_65_train_ids = train_X.loc[
    (train_X["Age"] >= 65) & (train_X["Time"] == 0.0),
    "RecordID",
]
# Per-column NaN counts over every row of the selected records.
more_than_or_equal_to_65_train_missing = (
    train_X.loc[train_X["RecordID"].isin(more_than_or_equal_to_65_train_ids)]
    .isna()
    .sum()
)
more_than_or_equal_to_65_train_missing

RecordID            0
level_1             0
Time                0
ALP            197188
ALT            197119
AST            197117
Age             11032
Albumin        197655
BUN            185739
Bilirubin      197077
Cholesterol    199513
Creatinine     185679
DiasABP         90586
FiO2           168763
GCS            137499
Gender         195708
Glucose        186747
HCO3           186080
HCT            180946
HR              18685
Height          11032
ICUType        195708
K              185182
Lactate        191951
MAP             91120
MechVent       170468
Mg             185992
NIDiasABP      114982
NIMAP          115946
NISysABP       114918
Na             186230
PaCO2          176551
PaO2           176584
Platelets      185376
RespRate       150885
SaO2           190845
SysABP          90579
Temp           123054
TroponinI      199384
TroponinT      197077
Urine           58154
WBC            186631
Weight          93463
pH             175473
dtype: int64

In [12]:
# RecordIDs whose Time == 0.0 row reports Age < 65.
less_than_65_train_ids = train_X.loc[
    (train_X["Age"] < 65) & (train_X["Time"] == 0.0),
    "RecordID",
]
# Per-column NaN counts over every row of the selected records.
less_than_65_train_missing = (
    train_X.loc[train_X["RecordID"].isin(less_than_65_train_ids)]
    .isna()
    .sum()
)
less_than_65_train_missing


RecordID            0
level_1             0
Time                0
ALP            164906
ALT            164793
AST            164800
Age             10352
Albumin        165866
BUN            155659
Bilirubin      164786
Cholesterol    168088
Creatinine     155599
DiasABP         77697
FiO2           141515
GCS            112291
Gender         164829
Glucose        156205
HCO3           155889
HCT            152283
HR              17309
Height          10352
ICUType        164829
K              154959
Lactate        160932
MAP             78245
MechVent       142078
Mg             155960
NIDiasABP       97694
NIMAP           98864
NISysABP        97632
Na             155645
PaCO2          148875
PaO2           148916
Platelets      155489
RespRate       129950
SaO2           162652
SysABP          77688
Temp           108544
TroponinI      168109
TroponinT      167102
Urine           54840
WBC            156596
Weight          80756
pH             148066
dtype: int64

In [13]:
# Keep only rows where both Height and Weight are present and not the -1
# placeholder, so a BMI can be computed from each remaining row.
filtered_train_X = train_X.loc[
    train_X["Height"].notna()
    & train_X["Weight"].notna()
    & train_X["Height"].ne(-1)
    & train_X["Weight"].ne(-1)
]

In [14]:
def classify_BMI(BMI):
    """Map a body-mass-index value to its weight-class label.

    The bands are contiguous, so every finite number gets a label. The
    previous closed ranges (e.g. 18.6-24.9 followed by 25-29.9) returned None
    for values that fell between two bands, such as 24.95. Results for values
    rounded to one decimal place are unchanged.

    Parameters
    ----------
    BMI : float
        Body-mass index (weight in kg divided by squared height in metres).

    Returns
    -------
    str or None
        One of "Baixo peso", "Peso normal", "Sobrepeso", "Obesidade grau 1",
        "Obesidade grau 2", "Obesidade grau 3"; None when BMI is NaN.
    """
    # 18.5 itself stays in the lowest band, matching the original boundary.
    if BMI <= 18.5:
        return "Baixo peso"
    if BMI < 25:
        return "Peso normal"
    if BMI < 30:
        return "Sobrepeso"
    if BMI < 35:
        return "Obesidade grau 1"
    if BMI < 40:
        return "Obesidade grau 2"
    if BMI >= 40:
        return "Obesidade grau 3"
    # Only NaN reaches this point: it fails every comparison above.
    return None

In [15]:
# New frame with Height divided by 100 (centimetres -> metres, per the
# variable name); filtered_train_X itself is left untouched.
filtered_train_X_metros = filtered_train_X.assign(
    Height=filtered_train_X["Height"].div(100)
)
filtered_train_X_metros["Height"]

48        1.753
67        1.753
68        1.753
69        1.753
70        1.753
          ...  
575321    1.727
575322    1.727
575323    1.727
575325    1.727
575327    1.727
Name: Height, Length: 101924, dtype: float64

In [16]:
# bmi_data_train is an alias of filtered_train_X_metros (not a copy), so the
# two columns added below show up on both names.
bmi_data_train = filtered_train_X_metros
# BMI = weight / height^2, rounded to one decimal place.
bmi_data_train["BMI"] = (
    bmi_data_train["Weight"].div(bmi_data_train["Height"].pow(2)).round(1)
)
# Label each row with its weight class.
bmi_data_train["Classificacao"] = bmi_data_train["BMI"].apply(classify_BMI)
bmi_data_train.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
48,132540,0,0.0,,,,76.0,,,,...,,,,,,,76.0,7.45,24.7,Peso normal
67,132540,19,19.0,,,,76.0,,,,...,122.0,37.5,,,50.0,,80.6,,26.2,Sobrepeso
68,132540,20,20.0,,,,76.0,,,,...,107.0,37.4,,,380.0,,80.6,,26.2,Sobrepeso
69,132540,21,21.0,,,,76.0,,,,...,121.0,37.5,,,170.0,,80.6,,26.2,Sobrepeso
70,132540,22,22.0,,,,76.0,,,,...,128.0,37.5,,,130.0,,80.6,,26.2,Sobrepeso


In [17]:
# Collapse to one row per RecordID. GroupBy.first() takes the first non-null
# value of each column, so a row can mix values from different time steps.
bmi_data_train = (
    bmi_data_train
    .groupby("RecordID")
    .first()
    .reset_index()
)
bmi_data_train

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132540,0,0.0,,,,76.0,,21.0,,...,122.0,37.5,,,50.0,13.3,76.0,7.45,24.7,Peso normal
1,132555,0,0.0,,,,74.0,,19.0,,...,98.0,34.8,,,35.0,9.0,66.1,7.39,21.5,Peso normal
2,132567,0,0.0,,,,71.0,,9.0,,...,111.5,35.6,,,15.0,9.0,56.0,7.44,22.6,Peso normal
3,132570,0,0.0,19.0,15.0,20.0,84.0,,83.0,0.1,...,,36.6,,,600.0,8.8,102.6,,35.4,Obesidade grau 2
4,132588,0,0.0,,,,48.0,,,,...,,,,,,,42.3,,17.6,Baixo peso
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4035,163003,0,0.0,124.0,254.0,204.0,36.0,2.6,3.0,30.3,...,,36.3,,,50.0,10.6,57.7,7.47,21.2,Peso normal
4036,163008,0,0.0,,,,59.0,,24.0,,...,97.0,37.6,,,45.0,6.9,98.5,7.38,34.0,Obesidade grau 1
4037,163016,0,0.0,,27.0,120.0,65.0,,29.0,0.4,...,101.0,38.1,,,75.0,8.0,63.6,7.37,24.8,Peso normal
4038,163021,0,0.0,,,,72.0,,9.0,,...,,,,,,8.6,62.0,,20.8,Peso normal


In [18]:
# Number of records in each BMI class (bmi_data_train holds one row per
# RecordID at this point).
bmi_data_train["Classificacao"].value_counts()

Classificacao
Sobrepeso           1392
Peso normal         1208
Obesidade grau 1     718
Obesidade grau 2     305
Obesidade grau 3     285
Baixo peso           132
Name: count, dtype: int64

In [19]:
# Note the naming: classificacao_undefined_ids holds the RecordIDs that DO
# have a BMI class. The "undefined" cohort is everything outside that set,
# selected with the negated mask below.
classificacao_undefined_ids = bmi_data_train["RecordID"]
# Per-column NaN counts over all rows of records without a BMI class.
classificacao_undefined_missing = (
    train_X.loc[~train_X["RecordID"].isin(classificacao_undefined_ids)]
    .isna()
    .sum()
)
classificacao_undefined_missing

RecordID            0
level_1             0
Time                0
ALP            171338
ALT            171240
AST            171240
Age             13438
Albumin        171925
BUN            161931
Bilirubin      171201
Cholesterol    174003
Creatinine     161882
DiasABP        104317
FiO2           150105
GCS            115681
Gender         170657
Glucose        161946
HCO3           161987
HCT            159410
HR              19726
Height          13438
ICUType        170657
K              160747
Lactate        168171
MAP            105007
MechVent       151927
Mg             162269
NIDiasABP       81064
NIMAP           82527
NISysABP        81007
Na             161430
PaCO2          161768
PaO2           161795
Platelets      162762
RespRate       116303
SaO2           172334
SysABP         104306
Temp           127198
TroponinI      174004
TroponinT      172138
Urine           63162
WBC            163262
Weight          82223
pH             161556
dtype: int64

In [20]:
# RecordIDs classified as "Baixo peso" (bmi_data_train has one row per record).
classificacao_baixo_peso_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Baixo peso", "RecordID"
]
# Per-column NaN counts over every training row of those records.
classificacao_baixo_peso_missing = (
    train_X.loc[train_X["RecordID"].isin(classificacao_baixo_peso_ids)]
    .isna()
    .sum()
)
classificacao_baixo_peso_missing

RecordID          0
level_1           0
Time              0
ALP            6239
ALT            6237
AST            6236
Age             274
Albumin        6263
BUN            5857
Bilirubin      6235
Cholesterol    6325
Creatinine     5855
DiasABP        2258
FiO2           5238
GCS            4397
Gender         6204
Glucose        5890
HCO3           5865
HCT            5734
HR              499
Height          274
ICUType        6204
K              5844
Lactate        6033
MAP            2236
MechVent       5243
Mg             5863
NIDiasABP      4167
NIMAP          4205
NISysABP       4166
Na             5872
PaCO2          5496
PaO2           5488
Platelets      5863
RespRate       5333
SaO2           5969
SysABP         2258
Temp           3656
TroponinI      6312
TroponinT      6278
Urine          1929
WBC            5902
Weight         3121
pH             5451
dtype: int64

In [21]:
# Sanity check: number of distinct RecordIDs in the "Baixo peso" cohort.
teste = pd.unique(classificacao_baixo_peso_ids)
len(teste)

132

In [22]:
# RecordIDs classified as "Peso normal" (bmi_data_train has one row per record).
classificacao_normal_peso_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Peso normal", "RecordID"
]
# Per-column NaN counts over every training row of those records.
classificacao_normal_peso_missing = (
    train_X.loc[train_X["RecordID"].isin(classificacao_normal_peso_ids)]
    .isna()
    .sum()
)
classificacao_normal_peso_missing

RecordID           0
level_1            0
Time               0
ALP            57061
ALT            57035
AST            57037
Age             2319
Albumin        57267
BUN            53719
Bilirubin      57038
Cholesterol    57874
Creatinine     53703
DiasABP        19861
FiO2           48378
GCS            39350
Gender         56776
Glucose        54135
HCO3           53877
HCT            52021
HR              4798
Height          2319
ICUType        56776
K              53622
Lactate        55280
MAP            20005
MechVent       48434
Mg             53750
NIDiasABP      38508
NIMAP          38693
NISysABP       38489
Na             53946
PaCO2          49480
PaO2           49515
Platelets      53278
RespRate       48306
SaO2           54507
SysABP         19860
Temp           31970
TroponinI      57859
TroponinT      57450
Urine          15453
WBC            53841
Weight         28245
pH             48965
dtype: int64

In [23]:
# Sanity check: number of distinct RecordIDs in the "Peso normal" cohort.
teste = pd.unique(classificacao_normal_peso_ids)
len(teste)

1208

In [24]:
# RecordIDs classified as "Sobrepeso" (bmi_data_train has one row per record).
classificacao_sobrepeso_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Sobrepeso", "RecordID"
]
# Per-column NaN counts over every training row of those records.
classificacao_sobrepeso_missing = (
    train_X.loc[train_X["RecordID"].isin(classificacao_sobrepeso_ids)]
    .isna()
    .sum()
)
classificacao_sobrepeso_missing

RecordID           0
level_1            0
Time               0
ALP            65718
ALT            65682
AST            65682
Age             2892
Albumin        66019
BUN            61813
Bilirubin      65684
Cholesterol    66707
Creatinine     61783
DiasABP        21634
FiO2           55370
GCS            46236
Gender         65424
Glucose        62403
HCO3           62008
HCT            59712
HR              5850
Height          2892
ICUType        65424
K              61883
Lactate        63811
MAP            21749
MechVent       55449
Mg             61870
NIDiasABP      45471
NIMAP          45712
NISysABP       45445
Na             62231
PaCO2          56305
PaO2           56323
Platelets      61253
RespRate       56666
SaO2           62189
SysABP         21632
Temp           35192
TroponinI      66672
TroponinT      66191
Urine          17266
WBC            61936
Weight         32603
pH             55667
dtype: int64

In [25]:
# Sanity check: number of distinct RecordIDs in the "Sobrepeso" cohort.
teste = pd.unique(classificacao_sobrepeso_ids)
len(teste)

1392

In [26]:
# RecordIDs classified as "Obesidade grau 1" (one row per record in
# bmi_data_train).
classificacao_obesidade_1_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Obesidade grau 1", "RecordID"
]
# Per-column NaN counts over every training row of those records.
classificacao_obesidade_1_missing = (
    train_X.loc[train_X["RecordID"].isin(classificacao_obesidade_1_ids)]
    .isna()
    .sum()
)
classificacao_obesidade_1_missing

RecordID           0
level_1            0
Time               0
ALP            33917
ALT            33901
AST            33905
Age             1269
Albumin        34064
BUN            31923
Bilirubin      33897
Cholesterol    34412
Creatinine     31910
DiasABP        10954
FiO2           28247
GCS            24117
Gender         33746
Glucose        32208
HCO3           32011
HCT            30881
HR              2751
Height          1269
ICUType        33746
K              31890
Lactate        32772
MAP            11079
MechVent       28409
Mg             31997
NIDiasABP      23607
NIMAP          23707
NISysABP       23596
Na             32100
PaCO2          28720
PaO2           28728
Platelets      31689
RespRate       29877
SaO2           32052
SysABP         10954
Temp           17944
TroponinI      34375
TroponinT      34121
Urine           8067
WBC            32030
Weight         15656
pH             28403
dtype: int64

In [27]:
# Sanity check: number of distinct RecordIDs in the "Obesidade grau 1" cohort.
teste = pd.unique(classificacao_obesidade_1_ids)
len(teste)

718

In [28]:
# RecordIDs classified as "Obesidade grau 2" (one row per record in
# bmi_data_train).
classificacao_obesidade_2_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Obesidade grau 2", "RecordID"
]
# Per-column NaN counts over every training row of those records.
classificacao_obesidade_2_missing = (
    train_X.loc[train_X["RecordID"].isin(classificacao_obesidade_2_ids)]
    .isna()
    .sum()
)
classificacao_obesidade_2_missing

RecordID           0
level_1            0
Time               0
ALP            14391
ALT            14392
AST            14392
Age              601
Albumin        14466
BUN            13506
Bilirubin      14384
Cholesterol    14621
Creatinine     13496
DiasABP         4647
FiO2           11910
GCS            10220
Gender         14335
Glucose        13622
HCO3           13541
HCT            13124
HR              1219
Height           601
ICUType        14335
K              13521
Lactate        13869
MAP             4669
MechVent       12023
Mg             13552
NIDiasABP      10223
NIMAP          10288
NISysABP       10219
Na             13581
PaCO2          12220
PaO2           12217
Platelets      13400
RespRate       12439
SaO2           13656
SysABP          4645
Temp            7629
TroponinI      14609
TroponinT      14467
Urine           3569
WBC            13532
Weight          6042
pH             12137
dtype: int64

In [29]:
# Sanity check: number of distinct RecordIDs in the "Obesidade grau 2" cohort.
teste = pd.unique(classificacao_obesidade_2_ids)
len(teste)

305

In [30]:
# RecordIDs classified as "Obesidade grau 3" (one row per record in
# bmi_data_train).
classificacao_obesidade_3_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Obesidade grau 3", "RecordID"
]
# Per-column NaN counts over every training row of those records.
classificacao_obesidade_3_missing = (
    train_X.loc[train_X["RecordID"].isin(classificacao_obesidade_3_ids)]
    .isna()
    .sum()
)
classificacao_obesidade_3_missing

RecordID           0
level_1            0
Time               0
ALP            13430
ALT            13425
AST            13425
Age              591
Albumin        13517
BUN            12649
Bilirubin      13424
Cholesterol    13659
Creatinine     12649
DiasABP         4612
FiO2           11030
GCS             9789
Gender         13395
Glucose        12748
HCO3           12680
HCT            12347
HR              1151
Height           591
ICUType        13395
K              12634
Lactate        12947
MAP             4620
MechVent       11061
Mg             12651
NIDiasABP       9636
NIMAP           9678
NISysABP        9628
Na             12715
PaCO2          11437
PaO2           11434
Platelets      12620
RespRate       11911
SaO2           12790
SysABP          4612
Temp            8009
TroponinI      13662
TroponinT      13534
Urine           3548
WBC            12724
Weight          6329
pH             11360
dtype: int64

In [31]:
# Distinct RecordIDs in the "Obesidade grau 3" training group; the number shown
# below is how many unique records that group contains.
teste = pd.unique(classificacao_obesidade_3_ids)
len(teste)

285

In [32]:
# Column labels of the training frame; reused below as the row labels of the
# per-group missing-value summary tables (train and validation).
df_columns = train_X.columns
df_columns

Index(['RecordID', 'level_1', 'Time', 'ALP', 'ALT', 'AST', 'Age', 'Albumin',
       'BUN', 'Bilirubin', 'Cholesterol', 'Creatinine', 'DiasABP', 'FiO2',
       'GCS', 'Gender', 'Glucose', 'HCO3', 'HCT', 'HR', 'Height', 'ICUType',
       'K', 'Lactate', 'MAP', 'MechVent', 'Mg', 'NIDiasABP', 'NIMAP',
       'NISysABP', 'Na', 'PaCO2', 'PaO2', 'Platelets', 'RespRate', 'SaO2',
       'SysABP', 'Temp', 'TroponinI', 'TroponinT', 'Urine', 'WBC', 'Weight',
       'pH'],
      dtype='object')

In [33]:
# Per-group missing-value series for the training split, keyed by the column
# label each group receives in the summary table (dict order = column order).
# NOTE: series such as classificacao_obesidade_3_missing come from
# .isna().sum(), i.e. they are raw NaN counts even though the heading below
# says "Missing Rate".
train_missing_by_group = {
    "Female": female_gender_missing_rate,
    "Male": male_gender_missing_rate,
    "Undefined gender": undefined_gender_missing_rate,
    "ICUType 1": ICUType_1_training_missing,
    "ICUType 2": ICUType_2_training_missing,
    "ICUType 3": ICUType_3_training_missing,
    "ICUType 4": ICUType_4_training_missing,
    "Age 65+": more_than_or_equal_to_65_train_missing,
    "Age 65-": less_than_65_train_missing,
    "Low Weight": classificacao_baixo_peso_missing,
    "Normal Weight": classificacao_normal_peso_missing,
    "Overweight": classificacao_sobrepeso_missing,
    "Obesity Grade 1": classificacao_obesidade_1_missing,
    "Obesity Grade 2": classificacao_obesidade_2_missing,
    "Obesity Grade 3": classificacao_obesidade_3_missing,
    "Undefined classification": classificacao_undefined_missing,
}

# Empty frame with one row per dataset variable; each group is then attached
# as a column, aligned on the variable name.
df_missing = pd.DataFrame(columns=df_columns)
df_missing_transpose = df_missing.T
for group_label, group_missing in train_missing_by_group.items():
    df_missing_transpose[group_label] = group_missing

# The identifier/time rows and the Age and Gender rows are left out of the report.
df_missing_transpose = df_missing_transpose.drop(
    index=["RecordID", "level_1", "Time", "Age", "Gender"]
)
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>Original Missing Rate per Variable by demographics - Train</h2>"))
df_missing_transpose

Unnamed: 0,Female,Male,Undefined gender,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3,Undefined classification
ALP,159436,202420,238,52617,76742,129601,103134,197188,164906,6239,57061,65718,33917,14391,13430,171338
ALT,159345,202330,237,52571,76728,129506,103107,197119,164793,6237,57035,65682,33901,14392,13425,171240
AST,159354,202326,237,52574,76728,129509,103106,197117,164800,6236,57037,65682,33905,14392,13425,171240
Albumin,160042,203241,238,52816,76917,130283,103505,197655,165866,6263,57267,66019,34064,14466,13517,171925
BUN,150357,190813,228,49556,72284,122430,97128,185739,155659,5857,53719,61813,31923,13506,12649,161931
Bilirubin,159307,202318,238,52586,76740,129391,103146,197077,164786,6235,57038,65684,33897,14384,13424,171201
Cholesterol,161890,205471,240,53108,77260,132408,104825,199513,168088,6325,57874,66707,34412,14621,13659,174003
Creatinine,150310,190740,228,49485,72267,122405,97121,185679,155599,5855,53703,61783,31910,13496,12649,161882
DiasABP,77798,90367,118,31460,16313,85390,35120,90586,77697,2258,19861,21634,10954,4647,4612,104317
FiO2,136874,173191,213,47566,64155,112905,85652,168763,141515,5238,48378,55370,28247,11910,11030,150105


<h3>Validation data</h3>

In [34]:
# Validation split ('val_X') of the preprocessed PhysioNet-2012 dataset loaded at the top of the notebook.
validation_X = physionet2012_dataset['val_X']

In [35]:
# RecordIDs of validation rows with Gender == 0.0 (the group reported as
# "Female" in the summary table).
female_gender_validation_ids = validation_X.loc[
    validation_X["Gender"] == 0.0, "RecordID"
]
# Per-column NaN counts over every validation row belonging to those records.
# Despite the "_rate" in the name, the result holds counts (isna().sum()).
female_gender_missing_rate_validation = (
    validation_X.loc[validation_X["RecordID"].isin(female_gender_validation_ids)]
    .isna()
    .sum()
)
female_gender_missing_rate_validation

RecordID           0
level_1            0
Time               0
ALP            38834
ALT            38818
AST            38814
Age             2257
Albumin        38969
BUN            36657
Bilirubin      38806
Cholesterol    39381
Creatinine     36645
DiasABP        18899
FiO2           33076
GCS            26624
Gender         38634
Glucose        36799
HCO3           36711
HCT            35818
HR              3642
Height          2257
ICUType        38634
K              36526
Lactate        37932
MAP            19045
MechVent       33154
Mg             36712
NIDiasABP      22229
NIMAP          22521
NISysABP       22212
Na             36696
PaCO2          35177
PaO2           35197
Platelets      36712
RespRate       29509
SaO2           37916
SysABP         18895
Temp           25028
TroponinI      39348
TroponinT      39061
Urine          11556
WBC            36902
Weight         18392
pH             35018
dtype: int64

In [36]:
# RecordIDs of validation rows with Gender == 1.0 (the group reported as
# "Male" in the summary table).
male_gender_validation_ids = validation_X.loc[
    validation_X["Gender"] == 1.0, "RecordID"
]
# Per-column NaN counts over every validation row belonging to those records.
# Despite the "_rate" in the name, the result holds counts (isna().sum()).
male_gender_missing_rate_validation = (
    validation_X.loc[validation_X["RecordID"].isin(male_gender_validation_ids)]
    .isna()
    .sum()
)
male_gender_missing_rate_validation

RecordID           0
level_1            0
Time               0
ALP            51525
ALT            51501
AST            51500
Age             2979
Albumin        51754
BUN            48610
Bilirubin      51497
Cholesterol    52263
Creatinine     48594
DiasABP        22592
FiO2           43874
GCS            35785
Gender         51277
Glucose        48827
HCO3           48693
HCT            47255
HR              5122
Height          2979
ICUType        51277
K              48398
Lactate        50086
MAP            22690
MechVent       44285
Mg             48610
NIDiasABP      31757
NIMAP          32068
NISysABP       31737
Na             48696
PaCO2          45889
PaO2           45902
Platelets      48451
RespRate       41149
SaO2           50086
SysABP         22590
Temp           32382
TroponinI      52278
TroponinT      51774
Urine          16516
WBC            48821
Weight         24248
pH             45545
dtype: int64

In [37]:
# RecordIDs of validation rows with Gender == -1.0 (the group reported as
# "Undefined gender" in the summary table).
undefined_gender_ids_validation = validation_X.loc[
    validation_X["Gender"] == -1.0, "RecordID"
]
# Per-column NaN counts over every validation row belonging to those records.
# Despite the "_rate" in the name, the result holds counts (isna().sum()).
undefined_gender_missing_rate_validation = (
    validation_X.loc[validation_X["RecordID"].isin(undefined_gender_ids_validation)]
    .isna()
    .sum()
)
undefined_gender_missing_rate_validation

RecordID         0
level_1          0
Time             0
ALP            237
ALT            237
AST            237
Age              6
Albumin        237
BUN            217
Bilirubin      237
Cholesterol    240
Creatinine     217
DiasABP         72
FiO2           216
GCS            167
Gender         235
Glucose        217
HCO3           217
HCT            219
HR              11
Height           6
ICUType        235
K              217
Lactate        213
MAP             73
MechVent       215
Mg             219
NIDiasABP      147
NIMAP          147
NISysABP       147
Na             218
PaCO2          204
PaO2           204
Platelets      219
RespRate       198
SaO2           237
SysABP          72
Temp           142
TroponinI      240
TroponinT      238
Urine           99
WBC            221
Weight         112
pH             199
dtype: int64

In [38]:
# RecordIDs whose Time == 0.0 row carries ICUType == 1.0.
ICUType_1_validation_ids = validation_X.loc[
    (validation_X["ICUType"] == 1.0) & (validation_X["Time"] == 0.0),
    "RecordID",
]
# Per-column NaN counts over all validation rows of those records.
ICUType_1_validation_missing = (
    validation_X.loc[validation_X["RecordID"].isin(ICUType_1_validation_ids)]
    .isna()
    .sum()
)
ICUType_1_validation_missing

RecordID           0
level_1            0
Time               0
ALP            14424
ALT            14410
AST            14409
Age             1106
Albumin        14479
BUN            13613
Bilirubin      14419
Cholesterol    14534
Creatinine     13597
DiasABP         8390
FiO2           13019
GCS            10743
Gender         14335
Glucose        13653
HCO3           13647
HCT            13429
HR              1683
Height          1106
ICUType        14335
K              13449
Lactate        14300
MAP             8430
MechVent       13139
Mg             13600
NIDiasABP       7588
NIMAP           7634
NISysABP        7587
Na             13648
PaCO2          13440
PaO2           13438
Platelets      13631
RespRate        9710
SaO2           13977
SysABP          8389
Temp           10409
TroponinI      14597
TroponinT      14315
Urine           6538
WBC            13734
Weight          7633
pH             13413
dtype: int64

In [39]:
# RecordIDs whose Time == 0.0 row carries ICUType == 2.0.
ICUType_2_validation_ids = validation_X.loc[
    (validation_X["ICUType"] == 2.0) & (validation_X["Time"] == 0.0),
    "RecordID",
]
# Per-column NaN counts over all validation rows of those records.
ICUType_2_validation_missing = (
    validation_X.loc[validation_X["RecordID"].isin(ICUType_2_validation_ids)]
    .isna()
    .sum()
)
ICUType_2_validation_missing

RecordID           0
level_1            0
Time               0
ALP            19453
ALT            19447
AST            19447
Age              660
Albumin        19505
BUN            18293
Bilirubin      19449
Cholesterol    19580
Creatinine     18291
DiasABP         3830
FiO2           16110
GCS            14439
Gender         19176
Glucose        18629
HCO3           18383
HCT            17466
HR              1586
Height           660
ICUType        19176
K              18529
Lactate        18865
MAP             3784
MechVent       16246
Mg             18298
NIDiasABP      15676
NIMAP          15753
NISysABP       15664
Na             18572
PaCO2          15539
PaO2           15554
Platelets      17997
RespRate       18866
SaO2           17331
SysABP          3828
Temp            7927
TroponinI      19558
TroponinT      19531
Urine           2989
WBC            18266
Weight          9045
pH             15132
dtype: int64

In [40]:
# RecordIDs whose Time == 0.0 row carries ICUType == 3.0.
ICUType_3_validation_ids = validation_X.loc[
    (validation_X["ICUType"] == 3.0) & (validation_X["Time"] == 0.0),
    "RecordID",
]
# Per-column NaN counts over all validation rows of those records.
ICUType_3_validation_missing = (
    validation_X.loc[validation_X["RecordID"].isin(ICUType_3_validation_ids)]
    .isna()
    .sum()
)
ICUType_3_validation_missing

RecordID           0
level_1            0
Time               0
ALP            31171
ALT            31159
AST            31155
Age             2366
Albumin        31326
BUN            29501
Bilirubin      31125
Cholesterol    31790
Creatinine     29495
DiasABP        20986
FiO2           27082
GCS            23305
Gender         31161
Glucose        29505
HCO3           29504
HCT            28972
HR              3462
Height          2366
ICUType        31161
K              29307
Lactate        30654
MAP            21118
MechVent       27410
Mg             29642
NIDiasABP      13704
NIMAP          14079
NISysABP       13693
Na             29446
PaCO2          29505
PaO2           29509
Platelets      29726
RespRate       21976
SaO2           31464
SysABP         20984
Temp           22939
TroponinI      31756
TroponinT      31377
Urine          12359
WBC            29790
Weight         11300
pH             29485
dtype: int64

In [41]:
# RecordIDs whose Time == 0.0 row carries ICUType == 4.0.
ICUType_4_validation_ids = validation_X.loc[
    (validation_X["ICUType"] == 4.0) & (validation_X["Time"] == 0.0),
    "RecordID",
]
# Per-column NaN counts over all validation rows of those records.
ICUType_4_validation_missing = (
    validation_X.loc[validation_X["RecordID"].isin(ICUType_4_validation_ids)]
    .isna()
    .sum()
)
ICUType_4_validation_missing

RecordID           0
level_1            0
Time               0
ALP            25548
ALT            25540
AST            25540
Age             1110
Albumin        25650
BUN            24077
Bilirubin      25547
Cholesterol    25980
Creatinine     24073
DiasABP         8357
FiO2           20955
GCS            14089
Gender         25474
Glucose        24056
HCO3           24087
HCT            23425
HR              2044
Height          1110
ICUType        25474
K              23856
Lactate        24412
MAP             8476
MechVent       20859
Mg             24001
NIDiasABP      17165
NIMAP          17270
NISysABP       17152
Na             23944
PaCO2          22786
PaO2           22802
Platelets      24028
RespRate       20304
SaO2           25467
SysABP          8356
Temp           16277
TroponinI      25955
TroponinT      25850
Urine           6285
WBC            24154
Weight         14774
pH             22732
dtype: int64

In [42]:
# RecordIDs whose Time == 0.0 row has Age >= 65. A record whose Age is NaN on
# that row matches neither this group nor the "< 65" one, because comparisons
# against NaN evaluate to False.
more_than_or_equal_to_65_validation_ids = validation_X.loc[
    (validation_X["Age"] >= 65) & (validation_X["Time"] == 0.0),
    "RecordID",
]
# Per-column NaN counts over all validation rows of those records.
more_than_or_equal_to_65_validation_missing = (
    validation_X.loc[
        validation_X["RecordID"].isin(more_than_or_equal_to_65_validation_ids)
    ]
    .isna()
    .sum()
)
more_than_or_equal_to_65_validation_missing

RecordID           0
level_1            0
Time               0
ALP            50677
ALT            50663
AST            50660
Age             2833
Albumin        50785
BUN            47764
Bilirubin      50646
Cholesterol    51250
Creatinine     47743
DiasABP        22557
FiO2           42920
GCS            35344
Gender         50290
Glucose        48022
HCO3           47836
HCT            46563
HR              4710
Height          2833
ICUType        50290
K              47621
Lactate        49341
MAP            22662
MechVent       43336
Mg             47792
NIDiasABP      30511
NIMAP          30794
NISysABP       30493
Na             47895
PaCO2          45071
PaO2           45087
Platelets      47649
RespRate       39629
SaO2           48888
SysABP         22553
Temp           31134
TroponinI      51214
TroponinT      50652
Urine          14703
WBC            48007
Weight         24114
pH             44789
dtype: int64

In [43]:
# RecordIDs whose Time == 0.0 row has Age < 65. A record whose Age is NaN on
# that row matches neither this group nor the ">= 65" one, because comparisons
# against NaN evaluate to False.
less_than_65_validation_ids = validation_X.loc[
    (validation_X["Age"] < 65) & (validation_X["Time"] == 0.0),
    "RecordID",
]
# Per-column NaN counts over all validation rows of those records.
less_than_65_validation_missing = (
    validation_X.loc[validation_X["RecordID"].isin(less_than_65_validation_ids)]
    .isna()
    .sum()
)
less_than_65_validation_missing

RecordID           0
level_1            0
Time               0
ALP            39919
ALT            39893
AST            39891
Age             2409
Albumin        40175
BUN            37720
Bilirubin      39894
Cholesterol    40634
Creatinine     37713
DiasABP        19006
FiO2           34246
GCS            27232
Gender         39856
Glucose        37821
HCO3           37785
HCT            36729
HR              4065
Height          2409
ICUType        39856
K              37520
Lactate        38890
MAP            19146
MechVent       34318
Mg             37749
NIDiasABP      23622
NIMAP          23942
NISysABP       23603
Na             37715
PaCO2          36199
PaO2           36216
Platelets      37733
RespRate       31227
SaO2           39351
SysABP         19004
Temp           26418
TroponinI      40652
TroponinT      40421
Urine          13468
WBC            37937
Weight         18638
pH             35973
dtype: int64

In [44]:
# Keep only the rows where both Height and Weight can feed a BMI computation:
# the value is present (not NaN) and is not -1 (apparently a "not recorded"
# marker - TODO confirm against the dataset preprocessing).
body_measures_validation = validation_X[["Height", "Weight"]]
has_usable_measures_validation = (
    body_measures_validation.ne(-1) & body_measures_validation.notna()
).all(axis=1)
filtered_validation_X = validation_X[has_usable_measures_validation]

In [45]:
# Independent copy of the filtered rows with Height divided by 100
# (presumably centimetres -> metres, as the "_metros" suffix suggests).
filtered_validation_X_metros = filtered_validation_X.assign(
    Height=filtered_validation_X["Height"] / 100
)
filtered_validation_X_metros["Height"]

144       1.803
145       1.803
146       1.803
147       1.803
148       1.803
          ...  
574890    1.829
574891    1.829
574892    1.829
574893    1.829
574894    1.829
Name: Height, Length: 25711, dtype: float64

In [46]:
# NOTE: this is a plain assignment, not a copy - the BMI and Classificacao
# columns added below therefore also show up on filtered_validation_X_metros.
bmi_data_validation = filtered_validation_X_metros
# BMI = Weight / Height**2, rounded to one decimal place.
height_squared_validation = bmi_data_validation["Height"] ** 2
bmi_data_validation["BMI"] = (
    bmi_data_validation["Weight"] / height_squared_validation
).round(1)
# Turn each BMI value into its category label with classify_BMI.
bmi_data_validation["Classificacao"] = bmi_data_validation["BMI"].apply(classify_BMI)
bmi_data_validation.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
144,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,,36.3,,,,11.5,84.6,,26.0,Sobrepeso
145,132543,1,1.0,,,,68.0,,,,...,,,,,,,84.6,,26.0,Sobrepeso
146,132543,2,2.0,,,,68.0,,,,...,,,,,,,84.6,,26.0,Sobrepeso
147,132543,3,3.0,,,,68.0,,,,...,,36.4,,,,,84.6,,26.0,Sobrepeso
148,132543,4,4.0,,,,68.0,,,,...,,,,,,,84.6,,26.0,Sobrepeso


In [47]:
# Collapse to one row per RecordID. GroupBy.first() takes the first non-null
# value of each column within the record, so a column that is NaN on the
# record's first row can still be filled from a later row.
first_values_per_record_validation = bmi_data_validation.groupby("RecordID").first()
# Turn RecordID back into a regular column.
bmi_data_validation = first_values_per_record_validation.reset_index()
bmi_data_validation

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,,36.30,,,600.0,11.5,84.6,,26.0,Sobrepeso
1,132548,0,0.0,,,,68.0,,32.0,,...,205.00,36.30,0.7,,120.0,6.2,87.0,,32.9,Obesidade grau 1
2,132568,0,0.0,,,,66.0,,18.0,,...,,36.10,,,220.0,14.8,84.5,,34.1,Obesidade grau 1
3,132573,0,0.0,,,,77.0,,,,...,,36.90,,,120.0,,90.1,,34.1,Obesidade grau 1
4,132585,0,0.0,,,,40.0,,,,...,90.50,,,,320.0,,84.7,,31.1,Obesidade grau 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
986,162824,0,0.0,,,,80.0,,32.0,,...,,37.90,,,155.0,9.3,80.7,,31.5,Obesidade grau 1
987,162836,0,0.0,,,,76.0,,,,...,158.00,,,,,,75.0,,23.1,Peso normal
988,162942,0,0.0,67.0,61.0,92.0,40.0,3.3,12.0,0.3,...,,36.30,,,600.0,20.6,120.7,7.38,37.1,Obesidade grau 2
989,162991,0,0.0,,,,56.0,,,,...,155.25,,,,,,96.2,,33.7,Obesidade grau 1


In [48]:
# Number of rows per BMI category; bmi_data_validation was reduced to one row
# per RecordID by the groupby above, so these are record counts.
bmi_data_validation["Classificacao"].value_counts()

Classificacao
Sobrepeso           351
Peso normal         300
Obesidade grau 1    181
Obesidade grau 2     72
Obesidade grau 3     60
Baixo peso           27
Name: count, dtype: int64

In [49]:
# NOTE: despite its name, this series holds the RecordIDs that DO have a BMI
# classification; the "undefined" group is obtained by negating the membership
# test below.
classificacao_undefined_ids_validation = bmi_data_validation["RecordID"]
# Per-column NaN counts over the validation rows of records that are absent
# from bmi_data_validation (no usable Height/Weight row, hence no BMI class).
has_bmi_class_validation = validation_X["RecordID"].isin(
    classificacao_undefined_ids_validation
)
classificacao_undefined_missing_validation = (
    validation_X.loc[~has_bmi_class_validation].isna().sum()
)
classificacao_undefined_missing_validation

RecordID           0
level_1            0
Time               0
ALP            43807
ALT            43788
AST            43785
Age             3240
Albumin        43916
BUN            41436
Bilirubin      43778
Cholesterol    44416
Creatinine     41420
DiasABP        26645
FiO2           38065
GCS            29781
Gender         43569
Glucose        41437
HCO3           41471
HCT            40702
HR              4815
Height          3240
ICUType        43569
K              41102
Lactate        42999
MAP            26802
MechVent       38461
Mg             41500
NIDiasABP      21092
NIMAP          21480
NISysABP       21069
Na             41329
PaCO2          41320
PaO2           41332
Platelets      41664
RespRate       30035
SaO2           43957
SysABP         26643
Temp           32546
TroponinI      44407
TroponinT      43957
Urine          16344
WBC            41777
Weight         20895
pH             41254
dtype: int64

In [50]:
# RecordIDs of validation records classified as "Baixo peso" (shown as
# "Low Weight" in the summary table).
classificacao_baixo_peso_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"] == "Baixo peso", "RecordID"
]
# Per-column NaN counts over every validation row of those records.
classificacao_baixo_peso_missing_validation = (
    validation_X.loc[
        validation_X["RecordID"].isin(classificacao_baixo_peso_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_baixo_peso_missing_validation

RecordID          0
level_1           0
Time              0
ALP            1284
ALT            1284
AST            1284
Age              51
Albumin        1281
BUN            1202
Bilirubin      1284
Cholesterol    1294
Creatinine     1202
DiasABP         509
FiO2           1111
GCS             856
Gender         1269
Glucose        1204
HCO3           1203
HCT            1170
HR              101
Height           51
ICUType        1269
K              1198
Lactate        1233
MAP             511
MechVent       1085
Mg             1199
NIDiasABP       805
NIMAP           827
NISysABP        805
Na             1200
PaCO2          1138
PaO2           1137
Platelets      1202
RespRate       1081
SaO2           1231
SysABP          508
Temp            790
TroponinI      1295
TroponinT      1280
Urine           338
WBC            1207
Weight          523
pH             1126
dtype: int64

In [51]:
# Distinct RecordIDs in the "Baixo peso" validation group; the number shown
# below is how many unique records that group contains.
teste = pd.unique(classificacao_baixo_peso_ids_validation)
len(teste)

27

In [52]:
# RecordIDs of validation records classified as "Peso normal" (shown as
# "Normal Weight" in the summary table).
classificacao_peso_normal_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"] == "Peso normal", "RecordID"
]
# Per-column NaN counts over every validation row of those records.
classificacao_peso_normal_missing_validation = (
    validation_X.loc[
        validation_X["RecordID"].isin(classificacao_peso_normal_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_peso_normal_missing_validation

RecordID           0
level_1            0
Time               0
ALP            14175
ALT            14166
AST            14166
Age              604
Albumin        14229
BUN            13364
Bilirubin      14161
Cholesterol    14380
Creatinine     13362
DiasABP         4619
FiO2           11787
GCS             9830
Gender         14100
Glucose        13446
HCO3           13389
HCT            12878
HR              1170
Height           604
ICUType        14100
K              13334
Lactate        13721
MAP             4666
MechVent       11699
Mg             13335
NIDiasABP       9679
NIMAP           9712
NISysABP        9674
Na             13413
PaCO2          12164
PaO2           12171
Platelets      13269
RespRate       12639
SaO2           13515
SysABP          4619
Temp            7710
TroponinI      14368
TroponinT      14265
Urine           3603
WBC            13406
Weight          6865
pH             12037
dtype: int64

In [53]:
# Distinct RecordIDs in the "Peso normal" validation group; the number shown
# below is how many unique records that group contains.
teste = pd.unique(classificacao_peso_normal_ids_validation)
len(teste)

300

In [54]:
# RecordIDs of validation records classified as "Sobrepeso" (shown as
# "Overweight" in the summary table).
classificacao_sobrepeso_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"] == "Sobrepeso", "RecordID"
]
# Per-column NaN counts over every validation row of those records.
classificacao_sobrepeso_missing_validation = (
    validation_X.loc[
        validation_X["RecordID"].isin(classificacao_sobrepeso_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_sobrepeso_missing_validation

RecordID           0
level_1            0
Time               0
ALP            16573
ALT            16568
AST            16569
Age              713
Albumin        16673
BUN            15612
Bilirubin      16570
Cholesterol    16805
Creatinine     15605
DiasABP         4994
FiO2           14077
GCS            11681
Gender         16497
Glucose        15755
HCO3           15650
HCT            15102
HR              1429
Height           713
ICUType        16497
K              15613
Lactate        16092
MAP             5002
MechVent       14149
Mg             15586
NIDiasABP      11942
NIMAP          12008
NISysABP       11935
Na             15709
PaCO2          14235
PaO2           14249
Platelets      15448
RespRate       14268
SaO2           15669
SysABP          4991
Temp            8726
TroponinI      16811
TroponinT      16696
Urine           4287
WBC            15630
Weight          7515
pH             14054
dtype: int64

In [55]:
# Distinct RecordIDs in the "Sobrepeso" validation group; the number shown
# below is how many unique records that group contains.
teste = pd.unique(classificacao_sobrepeso_ids_validation)
len(teste)

351

In [56]:
# RecordIDs of validation records classified as "Obesidade grau 1" (shown as
# "Obesity Grade 1" in the summary table).
classificacao_obesidade_1_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"] == "Obesidade grau 1", "RecordID"
]
# Per-column NaN counts over every validation row of those records.
classificacao_obesidade_1_missing_validation = (
    validation_X.loc[
        validation_X["RecordID"].isin(classificacao_obesidade_1_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_obesidade_1_missing_validation

RecordID          0
level_1           0
Time              0
ALP            8509
ALT            8505
AST            8502
Age             395
Albumin        8580
BUN            8007
Bilirubin      8504
Cholesterol    8668
Creatinine     8005
DiasABP        2808
FiO2           7069
GCS            5930
Gender         8507
Glucose        8073
HCO3           8033
HCT            7746
HR              748
Height          395
ICUType        8507
K              8018
Lactate        8250
MAP            2860
MechVent       7116
Mg             8045
NIDiasABP      6161
NIMAP          6221
NISysABP       6159
Na             8053
PaCO2          7297
PaO2           7296
Platelets      7961
RespRate       7465
SaO2           8093
SysABP         2808
Temp           4554
TroponinI      8669
TroponinT      8597
Urine          2048
WBC            8033
Weight         4333
pH             7224
dtype: int64

In [57]:
# Distinct RecordIDs in the "Obesidade grau 1" validation group; the number
# shown below is how many unique records that group contains.
teste = pd.unique(classificacao_obesidade_1_ids_validation)
len(teste)

181

In [58]:
# RecordIDs of validation records classified as "Obesidade grau 2" (shown as
# "Obesity Grade 2" in the summary table).
classificacao_obesidade_2_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"] == "Obesidade grau 2", "RecordID"
]
# Per-column NaN counts over every validation row of those records.
classificacao_obesidade_2_missing_validation = (
    validation_X.loc[
        validation_X["RecordID"].isin(classificacao_obesidade_2_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_obesidade_2_missing_validation

RecordID          0
level_1           0
Time              0
ALP            3410
ALT            3407
AST            3407
Age             145
Albumin        3430
BUN            3205
Bilirubin      3406
Cholesterol    3448
Creatinine     3203
DiasABP        1003
FiO2           2781
GCS            2455
Gender         3384
Glucose        3252
HCO3           3215
HCT            3088
HR              315
Height          145
ICUType        3384
K              3224
Lactate        3237
MAP            1004
MechVent       2856
Mg             3210
NIDiasABP      2488
NIMAP          2506
NISysABP       2488
Na             3233
PaCO2          2744
PaO2           2746
Platelets      3171
RespRate       2985
SaO2           3122
SysABP         1003
Temp           1705
TroponinI      3446
TroponinT      3429
Urine           866
WBC            3208
Weight         1507
pH             2713
dtype: int64

In [59]:
# Distinct RecordIDs in the "Obesidade grau 2" validation group; the number
# shown below is how many unique records that group contains.
teste = pd.unique(classificacao_obesidade_2_ids_validation)
len(teste)

72

In [60]:
# RecordIDs of validation records classified as "Obesidade grau 3" (shown as
# "Obesity Grade 3" in the summary table).
classificacao_obesidade_3_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"] == "Obesidade grau 3", "RecordID"
]
# Per-column NaN counts over every validation row of those records.
classificacao_obesidade_3_missing_validation = (
    validation_X.loc[
        validation_X["RecordID"].isin(classificacao_obesidade_3_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_obesidade_3_missing_validation

RecordID          0
level_1           0
Time              0
ALP            2838
ALT            2838
AST            2838
Age              94
Albumin        2851
BUN            2658
Bilirubin      2837
Cholesterol    2873
Creatinine     2659
DiasABP         985
FiO2           2276
GCS            2043
Gender         2820
Glucose        2676
HCO3           2660
HCT            2606
HR              197
Height           94
ICUType        2820
K              2652
Lactate        2699
MAP             963
MechVent       2288
Mg             2666
NIDiasABP      1966
NIMAP          1982
NISysABP       1966
Na             2673
PaCO2          2372
PaO2           2372
Platelets      2667
RespRate       2383
SaO2           2652
SysABP          985
Temp           1521
TroponinI      2870
TroponinT      2849
Urine           685
WBC            2683
Weight         1114
pH             2354
dtype: int64

In [61]:
# Distinct RecordIDs in the "Obesidade grau 3" validation group; the number
# shown below is how many unique records that group contains.
teste = pd.unique(classificacao_obesidade_3_ids_validation)
len(teste)

60

In [62]:
# Per-group missing-value series for the validation split, keyed by the column
# label each group receives in the summary table (dict order = column order).
# NOTE: every series here comes from .isna().sum(), i.e. it holds raw NaN
# counts even though the heading below says "Missing Rate".
validation_missing_by_group = {
    "Female": female_gender_missing_rate_validation,
    "Male": male_gender_missing_rate_validation,
    "Undefined gender": undefined_gender_missing_rate_validation,
    "ICUType 1": ICUType_1_validation_missing,
    "ICUType 2": ICUType_2_validation_missing,
    "ICUType 3": ICUType_3_validation_missing,
    "ICUType 4": ICUType_4_validation_missing,
    "Age 65+": more_than_or_equal_to_65_validation_missing,
    "Age 65-": less_than_65_validation_missing,
    "Low Weight": classificacao_baixo_peso_missing_validation,
    "Normal Weight": classificacao_peso_normal_missing_validation,
    "Overweight": classificacao_sobrepeso_missing_validation,
    "Obesity Grade 1": classificacao_obesidade_1_missing_validation,
    "Obesity Grade 2": classificacao_obesidade_2_missing_validation,
    "Obesity Grade 3": classificacao_obesidade_3_missing_validation,
    "Undefined classification": classificacao_undefined_missing_validation,
}

# Empty frame with one row per dataset variable (df_columns are the train_X
# column labels); each group is then attached as a column, aligned on the
# variable name.
df_missing_validation = pd.DataFrame(columns=df_columns)
df_missing_transpose_validation = df_missing_validation.T
for group_label, group_missing in validation_missing_by_group.items():
    df_missing_transpose_validation[group_label] = group_missing

# The identifier/time rows and the Age and Gender rows are left out of the report.
df_missing_transpose_validation = df_missing_transpose_validation.drop(
    index=["RecordID", "level_1", "Time", "Age", "Gender"]
)
# Heading typo fixed: "demographcs" -> "demographics" (matches the Train heading).
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>Original Missing Rate per Variable by demographics - Validation</h2>"))
df_missing_transpose_validation

Unnamed: 0,Female,Male,Undefined gender,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3,Undefined classification
ALP,38834,51525,237,14424,19453,31171,25548,50677,39919,1284,14175,16573,8509,3410,2838,43807
ALT,38818,51501,237,14410,19447,31159,25540,50663,39893,1284,14166,16568,8505,3407,2838,43788
AST,38814,51500,237,14409,19447,31155,25540,50660,39891,1284,14166,16569,8502,3407,2838,43785
Albumin,38969,51754,237,14479,19505,31326,25650,50785,40175,1281,14229,16673,8580,3430,2851,43916
BUN,36657,48610,217,13613,18293,29501,24077,47764,37720,1202,13364,15612,8007,3205,2658,41436
Bilirubin,38806,51497,237,14419,19449,31125,25547,50646,39894,1284,14161,16570,8504,3406,2837,43778
Cholesterol,39381,52263,240,14534,19580,31790,25980,51250,40634,1294,14380,16805,8668,3448,2873,44416
Creatinine,36645,48594,217,13597,18291,29495,24073,47743,37713,1202,13362,15605,8005,3203,2659,41420
DiasABP,18899,22592,72,8390,3830,20986,8357,22557,19006,509,4619,4994,2808,1003,985,26645
FiO2,33076,43874,216,13019,16110,27082,20955,42920,34246,1111,11787,14077,7069,2781,2276,38065


<h3>Test data</h3>

In [63]:
# Pull the test split out of the preprocessed dataset dictionary.
test_X = physionet2012_dataset["test_X"]

In [64]:
# How often each Gender code occurs among the non-null Gender entries of the test split.
test_X.loc[:, "Gender"].value_counts()

Gender
 1.0    1338
 0.0    1059
-1.0       2
Name: count, dtype: int64

In [65]:
# RecordIDs of the test rows whose Gender equals 0.0 (the "female" group here).
female_gender_test_ids = test_X.loc[test_X["Gender"].eq(0.0), "RecordID"]
# Per-column NaN count over every row that belongs to one of those records.
female_gender_missing_rate_test = (
    test_X.loc[test_X["RecordID"].isin(female_gender_test_ids)]
    .isna()
    .sum()
)
female_gender_missing_rate_test

RecordID           0
level_1            0
Time               0
ALP            50060
ALT            50048
AST            50048
Age             3101
Albumin        50207
BUN            47197
Bilirubin      50046
Cholesterol    50746
Creatinine     47182
DiasABP        25278
FiO2           43528
GCS            34472
Gender         49773
Glucose        47366
HCO3           47251
HCT            46163
HR              5058
Height          3101
ICUType        49773
K              46983
Lactate        48962
MAP            25410
MechVent       43661
Mg             47275
NIDiasABP      28328
NIMAP          28637
NISysABP       28299
Na             47229
PaCO2          45701
PaO2           45707
Platelets      47269
RespRate       35367
SaO2           49131
SysABP         25278
Temp           33089
TroponinI      50709
TroponinT      50327
Urine          15995
WBC            47546
Weight         23360
pH             45493
dtype: int64

In [66]:
# Number of distinct RecordIDs in the female group.
len(female_gender_test_ids.unique())

1059

In [67]:
# RecordIDs of the test rows whose Gender equals 1.0 (the "male" group here).
male_gender_test_ids = test_X.loc[test_X["Gender"].eq(1.0), "RecordID"]
# Per-column NaN count over every row that belongs to one of those records.
male_gender_missing_rate_test = (
    test_X.loc[test_X["RecordID"].isin(male_gender_test_ids)]
    .isna()
    .sum()
)
male_gender_missing_rate_test

RecordID           0
level_1            0
Time               0
ALP            63153
ALT            63117
AST            63116
Age             3895
Albumin        63401
BUN            59581
Bilirubin      63086
Cholesterol    64103
Creatinine     59555
DiasABP        28406
FiO2           54027
GCS            44071
Gender         62886
Glucose        59900
HCO3           59704
HCT            57967
HR              6532
Height          3895
ICUType        62886
K              59403
Lactate        61559
MAP            28726
MechVent       54746
Mg             59727
NIDiasABP      38164
NIMAP          38452
NISysABP       38140
Na             59739
PaCO2          56515
PaO2           56522
Platelets      59409
RespRate       49875
SaO2           61338
SysABP         28405
Temp           39466
TroponinI      64087
TroponinT      63502
Urine          19628
WBC            59871
Weight         30973
pH             56090
dtype: int64

In [68]:
# Number of distinct RecordIDs in the male group.
len(male_gender_test_ids.unique())

1338

In [69]:
# RecordIDs of the test rows whose Gender equals -1.0 (treated as undefined gender).
undefined_gender_ids_test = test_X.loc[test_X["Gender"].eq(-1.0), "RecordID"]
# Per-column NaN count over every row that belongs to one of those records.
undefined_gender_missing_rate_test = (
    test_X.loc[test_X["RecordID"].isin(undefined_gender_ids_test)]
    .isna()
    .sum()
)
undefined_gender_missing_rate_test

RecordID        0
level_1         0
Time            0
ALP            96
ALT            96
AST            96
Age            46
Albumin        96
BUN            94
Bilirubin      96
Cholesterol    96
Creatinine     94
DiasABP        96
FiO2           95
GCS            84
Gender         94
Glucose        94
HCO3           94
HCT            94
HR             49
Height         46
ICUType        94
K              94
Lactate        95
MAP            96
MechVent       96
Mg             94
NIDiasABP      52
NIMAP          52
NISysABP       52
Na             94
PaCO2          93
PaO2           93
Platelets      94
RespRate       49
SaO2           96
SysABP         96
Temp           86
TroponinI      95
TroponinT      96
Urine          58
WBC            94
Weight         52
pH             93
dtype: int64

In [70]:
# RecordIDs taken from rows that have ICUType == 1.0 and Time == 0.0.
ICUType_1_test_ids = test_X.loc[
    test_X["ICUType"].eq(1.0) & test_X["Time"].eq(0.0), "RecordID"
]
# Per-column NaN count over every row of the selected records.
ICUType_1_test_missing = (
    test_X.loc[test_X["RecordID"].isin(ICUType_1_test_ids)]
    .isna()
    .sum()
)
ICUType_1_test_missing

RecordID           0
level_1            0
Time               0
ALP            16420
ALT            16405
AST            16405
Age             1563
Albumin        16466
BUN            15498
Bilirubin      16405
Cholesterol    16540
Creatinine     15484
DiasABP         9869
FiO2           14897
GCS            12369
Gender         16309
Glucose        15546
HCO3           15531
HCT            15223
HR              2323
Height          1563
ICUType        16309
K              15290
Lactate        16317
MAP             9897
MechVent       15109
Mg             15502
NIDiasABP       8786
NIMAP           8813
NISysABP        8777
Na             15527
PaCO2          15310
PaO2           15313
Platelets      15498
RespRate       10143
SaO2           15792
SysABP          9869
Temp           11596
TroponinI      16624
TroponinT      16284
Urine           7197
WBC            15616
Weight          9374
pH             15287
dtype: int64

In [71]:
# Number of distinct RecordIDs with ICUType 1.
len(ICUType_1_test_ids.unique())

347

In [72]:
# RecordIDs taken from rows that have ICUType == 2.0 and Time == 0.0.
ICUType_2_test_ids = test_X.loc[
    test_X["ICUType"].eq(2.0) & test_X["Time"].eq(0.0), "RecordID"
]
# Per-column NaN count over every row of the selected records.
ICUType_2_test_missing = (
    test_X.loc[test_X["RecordID"].isin(ICUType_2_test_ids)]
    .isna()
    .sum()
)
ICUType_2_test_missing

RecordID           0
level_1            0
Time               0
ALP            24292
ALT            24288
AST            24289
Age              972
Albumin        24357
BUN            22894
Bilirubin      24289
Cholesterol    24471
Creatinine     22895
DiasABP         6001
FiO2           20496
GCS            18079
Gender         23970
Glucose        23343
HCO3           23007
HCT            21816
HR              2128
Height           972
ICUType        23970
K              23189
Lactate        23616
MAP             5935
MechVent       20517
Mg             22986
NIDiasABP      18508
NIMAP          18537
NISysABP       18496
Na             23254
PaCO2          19740
PaO2           19753
Platelets      22517
RespRate       23462
SaO2           21951
SysABP          6001
Temp           10501
TroponinI      24441
TroponinT      24419
Urine           4055
WBC            22843
Weight         11900
pH             19259
dtype: int64

In [73]:
# Number of distinct RecordIDs with ICUType 2.
len(ICUType_2_test_ids.unique())

510

In [74]:
# RecordIDs taken from rows that have ICUType == 3.0 and Time == 0.0.
ICUType_3_test_ids = test_X.loc[
    test_X["ICUType"].eq(3.0) & test_X["Time"].eq(0.0), "RecordID"
]
# Per-column NaN count over every row of the selected records.
ICUType_3_test_missing = (
    test_X.loc[test_X["RecordID"].isin(ICUType_3_test_ids)]
    .isna()
    .sum()
)
ICUType_3_test_missing

RecordID           0
level_1            0
Time               0
ALP            40551
ALT            40531
AST            40530
Age             2886
Albumin        40733
BUN            38333
Bilirubin      40494
Cholesterol    41376
Creatinine     38307
DiasABP        26565
FiO2           35561
GCS            30412
Gender         40561
Glucose        38328
HCO3           38328
HCT            37756
HR              4313
Height          2886
ICUType        40561
K              38053
Lactate        39882
MAP            26852
MechVent       35997
Mg             38493
NIDiasABP      17896
NIMAP          18320
NISysABP       17874
Na             38246
PaCO2          38315
PaO2           38304
Platelets      38622
RespRate       27462
SaO2           40885
SysABP         26564
Temp           29425
TroponinI      41294
TroponinT      40883
Urine          16474
WBC            38742
Weight         14021
pH             38249
dtype: int64

In [75]:
# Number of distinct RecordIDs with ICUType 3.
len(ICUType_3_test_ids.unique())

863

In [76]:
# RecordIDs taken from rows that have ICUType == 4.0 and Time == 0.0.
ICUType_4_test_ids = test_X.loc[
    test_X["ICUType"].eq(4.0) & test_X["Time"].eq(0.0), "RecordID"
]
# Per-column NaN count over every row of the selected records.
ICUType_4_test_missing = (
    test_X.loc[test_X["RecordID"].isin(ICUType_4_test_ids)]
    .isna()
    .sum()
)
ICUType_4_test_missing

RecordID           0
level_1            0
Time               0
ALP            32046
ALT            32037
AST            32036
Age             1621
Albumin        32148
BUN            30147
Bilirubin      32040
Cholesterol    32558
Creatinine     30145
DiasABP        11345
FiO2           26696
GCS            17767
Gender         31913
Glucose        30143
HCO3           30183
HCT            29429
HR              2875
Height          1621
ICUType        31913
K              29948
Lactate        30801
MAP            11548
MechVent       26880
Mg             30115
NIDiasABP      21354
NIMAP          21471
NISysABP       21344
Na             30035
PaCO2          28944
PaO2           28952
Platelets      30135
RespRate       24224
SaO2           31937
SysABP         11345
Temp           21119
TroponinI      32532
TroponinT      32339
Urine           7955
WBC            30310
Weight         19090
pH             28881
dtype: int64

In [77]:
# Number of distinct RecordIDs with ICUType 4.
len(ICUType_4_test_ids.unique())

679

In [78]:
# RecordIDs taken from rows that have Age >= 65 and Time == 0.0.
more_than_or_equal_to_65_test_ids = test_X.loc[
    test_X["Age"].ge(65) & test_X["Time"].eq(0.0), "RecordID"
]
# Per-column NaN count over every row of the selected records.
more_than_or_equal_to_65_test_missing = (
    test_X.loc[test_X["RecordID"].isin(more_than_or_equal_to_65_test_ids)]
    .isna()
    .sum()
)
more_than_or_equal_to_65_test_missing

RecordID           0
level_1            0
Time               0
ALP            62592
ALT            62576
AST            62573
Age             3512
Albumin        62710
BUN            58935
Bilirubin      62555
Cholesterol    63287
Creatinine     58906
DiasABP        28973
FiO2           53592
GCS            43862
Gender         62087
Glucose        59246
HCO3           59033
HCT            57459
HR              5902
Height          3512
ICUType        62087
K              58709
Lactate        61046
MAP            29161
MechVent       54411
Mg             58989
NIDiasABP      36435
NIMAP          36728
NISysABP       36402
Na             59029
PaCO2          56264
PaO2           56278
Platelets      58903
RespRate       46893
SaO2           60590
SysABP         28973
Temp           39036
TroponinI      63225
TroponinT      62553
Urine          18416
WBC            59288
Weight         29074
pH             55882
dtype: int64

In [79]:
# Number of distinct RecordIDs in the Age >= 65 group.
len(more_than_or_equal_to_65_test_ids.unique())

1321

In [80]:
# RecordIDs taken from rows that have Age < 65 and Time == 0.0.
less_than_65_test_ids = test_X.loc[
    test_X["Age"].lt(65) & test_X["Time"].eq(0.0), "RecordID"
]
# Per-column NaN count over every row of the selected records.
less_than_65_test_missing = (
    test_X.loc[test_X["RecordID"].isin(less_than_65_test_ids)]
    .isna()
    .sum()
)
less_than_65_test_missing

RecordID           0
level_1            0
Time               0
ALP            50717
ALT            50685
AST            50687
Age             3530
Albumin        50994
BUN            47937
Bilirubin      50673
Cholesterol    51658
Creatinine     47925
DiasABP        24807
FiO2           44058
GCS            34765
Gender         50666
Glucose        48114
HCO3           48016
HCT            46765
HR              5737
Height          3530
ICUType        50666
K              47771
Lactate        49570
MAP            25071
MechVent       44092
Mg             48107
NIDiasABP      30109
NIMAP          30413
NISysABP       30089
Na             48033
PaCO2          46045
PaO2           46044
Platelets      47869
RespRate       38398
SaO2           49975
SysABP         24806
Temp           33605
TroponinI      51666
TroponinT      51372
Urine          17265
WBC            48223
Weight         25311
pH             45794
dtype: int64

In [81]:
# Number of distinct RecordIDs in the Age < 65 group.
len(less_than_65_test_ids.unique())

1078

In [82]:
# Keep only rows where Height and Weight are both present and neither is the -1 placeholder.
filtered_test_X = test_X.loc[
    test_X["Height"].ne(-1)
    & test_X["Weight"].ne(-1)
    & test_X["Height"].notna()
    & test_X["Weight"].notna()
]

In [83]:
# Copy of the filtered rows with Height divided by 100
# (presumably centimetres -> metres, as the "_metros" suffix suggests).
filtered_test_X_metros = filtered_test_X.assign(
    Height=filtered_test_X["Height"] / 100
)
filtered_test_X_metros["Height"]

240       1.803
336       1.626
337       1.626
341       1.626
342       1.626
          ...  
574988    1.524
574989    1.524
574990    1.524
574991    1.524
575184    1.727
Name: Height, Length: 31164, dtype: float64

In [84]:
# NOTE: bmi_data_test is another name for filtered_test_X_metros, not a copy,
# so the two columns added below also appear on filtered_test_X_metros.
bmi_data_test = filtered_test_X_metros
# BMI = Weight / Height^2, rounded to one decimal place.
bmi_data_test["BMI"] = (
    bmi_data_test["Weight"] / bmi_data_test["Height"] ** 2
).round(1)
# Map each BMI value to its category label via classify_BMI (defined in an earlier cell).
bmi_data_test["Classificacao"] = bmi_data_test["BMI"].apply(classify_BMI)
bmi_data_test.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
240,132547,0,0.0,,,,64.0,,,,...,,,,,,,114.0,,35.1,Obesidade grau 2
336,132551,0,0.0,47.0,46.0,82.0,78.0,1.9,81.0,0.3,...,102.75,38.0,3.5,,,16.1,48.4,7.4,18.3,Baixo peso
337,132551,1,1.0,,,,78.0,,,,...,114.5,,,,120.0,,48.4,,18.3,Baixo peso
341,132551,5,5.0,,,,78.0,,,,...,104.0,,,,130.0,,48.4,7.29,18.3,Baixo peso
342,132551,6,6.0,,,,78.0,,67.0,,...,141.0,35.6,3.1,,60.0,20.4,48.4,7.25,18.3,Baixo peso


In [85]:
# Collapse to one row per RecordID. GroupBy.first() takes the first non-null
# value of each column within the group, so a row can mix values that came from
# different time steps of the same record.
bmi_data_test = (
    bmi_data_test
    .groupby(by="RecordID")
    .first()
    .reset_index(drop=False)
)
bmi_data_test

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132547,0,0.0,,,,64.0,,,,...,,,,,,,114.0,,35.1,Obesidade grau 2
1,132551,0,0.0,47.0,46.0,82.0,78.0,1.9,81.0,0.3,...,102.75,38.00,3.5,,120.0,16.1,48.4,7.40,18.3,Baixo peso
2,132575,0,0.0,,,,78.0,,18.0,,...,122.00,37.40,,,38.0,12.5,63.0,7.34,22.4,Peso normal
3,132582,0,0.0,,,,84.0,2.6,31.0,,...,,36.30,,,200.0,5.3,82.5,,24.7,Peso normal
4,132597,0,0.0,,,,66.0,,27.0,,...,,36.50,1.2,,,18.6,82.0,,43.6,Obesidade grau 3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1232,162946,0,0.0,,,,65.0,,13.0,,...,96.00,35.25,,,100.0,5.0,97.9,7.37,34.9,Obesidade grau 1
1233,162952,0,0.0,,,,64.0,,,,...,,,,,,,47.7,,16.5,Baixo peso
1234,162971,0,0.0,,,,63.0,,,,...,130.50,35.80,,,0.0,,83.3,,37.1,Obesidade grau 2
1235,163013,0,0.0,82.0,11.0,30.0,74.0,2.5,30.0,1.2,...,118.00,36.50,,0.03,40.0,9.6,68.6,7.35,29.5,Sobrepeso


In [86]:
# Number of records per BMI category.
bmi_data_test.loc[:, "Classificacao"].value_counts()

Classificacao
Sobrepeso           417
Peso normal         354
Obesidade grau 1    240
Obesidade grau 2    102
Obesidade grau 3     88
Baixo peso           36
Name: count, dtype: int64

In [87]:
# Despite its name, this series holds the RecordIDs that DO have a BMI
# classification; the "undefined" group is obtained by negating the isin mask.
classificacao_undefined_ids_test = bmi_data_test["RecordID"]
has_no_classification = ~test_X["RecordID"].isin(classificacao_undefined_ids_test)
# Per-column NaN count over the rows of records without a classification.
classificacao_undefined_missing_test = (
    test_X.loc[has_no_classification]
    .isna()
    .sum()
)
classificacao_undefined_missing_test

RecordID           0
level_1            0
Time               0
ALP            54869
ALT            54841
AST            54838
Age             4463
Albumin        55037
BUN            51883
Bilirubin      54823
Cholesterol    55686
Creatinine     51850
DiasABP        34123
FiO2           48451
GCS            37480
Gender         54614
Glucose        51895
HCO3           51911
HCT            51034
HR              6531
Height          4463
ICUType        54614
K              51520
Lactate        54077
MAP            34456
MechVent       49057
Mg             52008
NIDiasABP      26390
NIMAP          26808
NISysABP       26368
Na             51724
PaCO2          52037
PaO2           52039
Platelets      52186
RespRate       35124
SaO2           55152
SysABP         34122
Temp           40894
TroponinI      55698
TroponinT      55125
Urine          20768
WBC            52348
Weight         26173
pH             51963
dtype: int64

In [88]:
# RecordIDs whose BMI category is "Baixo peso".
classificacao_baixo_peso_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"].eq("Baixo peso"), "RecordID"
]
# Per-column NaN count over every test row of those records.
classificacao_baixo_peso_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_baixo_peso_ids_test)]
    .isna()
    .sum()
)
classificacao_baixo_peso_missing_test

RecordID          0
level_1           0
Time              0
ALP            1694
ALT            1694
AST            1694
Age              47
Albumin        1702
BUN            1602
Bilirubin      1694
Cholesterol    1726
Creatinine     1602
DiasABP         622
FiO2           1457
GCS            1186
Gender         1692
Glucose        1603
HCO3           1601
HCT            1547
HR              108
Height           47
ICUType        1692
K              1585
Lactate        1608
MAP             611
MechVent       1439
Mg             1597
NIDiasABP      1066
NIMAP          1070
NISysABP       1062
Na             1603
PaCO2          1471
PaO2           1472
Platelets      1583
RespRate       1339
SaO2           1640
SysABP          622
Temp            998
TroponinI      1717
TroponinT      1703
Urine           362
WBC            1605
Weight          908
pH             1462
dtype: int64

In [89]:
# Number of distinct RecordIDs in the "Baixo peso" group.
len(classificacao_baixo_peso_ids_test.unique())

36

In [90]:
# RecordIDs whose BMI category is "Peso normal".
classificacao_normal_peso_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"].eq("Peso normal"), "RecordID"
]
# Per-column NaN count over every test row of those records.
classificacao_normal_peso_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_normal_peso_ids_test)]
    .isna()
    .sum()
)
classificacao_normal_peso_missing_test

RecordID           0
level_1            0
Time               0
ALP            16751
ALT            16746
AST            16746
Age              735
Albumin        16813
BUN            15752
Bilirubin      16745
Cholesterol    16962
Creatinine     15747
DiasABP         6111
FiO2           14227
GCS            11523
Gender         16638
Glucose        15848
HCO3           15777
HCT            15300
HR              1409
Height           735
ICUType        16638
K              15705
Lactate        16234
MAP             6167
MechVent       14312
Mg             15780
NIDiasABP      11153
NIMAP          11191
NISysABP       11145
Na             15824
PaCO2          14676
PaO2           14675
Platelets      15640
RespRate       14029
SaO2           16006
SysABP          6111
Temp            9485
TroponinI      16930
TroponinT      16837
Urine           4563
WBC            15791
Weight          8333
pH             14504
dtype: int64

In [91]:
# Number of distinct RecordIDs in the "Peso normal" group.
len(classificacao_normal_peso_ids_test.unique())

354

In [92]:
# RecordIDs whose BMI category is "Sobrepeso".
classificacao_sobrepeso_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"].eq("Sobrepeso"), "RecordID"
]
# Per-column NaN count over every test row of those records.
classificacao_sobrepeso_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_sobrepeso_ids_test)]
    .isna()
    .sum()
)
classificacao_sobrepeso_missing_test

RecordID           0
level_1            0
Time               0
ALP            19691
ALT            19684
AST            19685
Age              898
Albumin        19780
BUN            18542
Bilirubin      19675
Cholesterol    19973
Creatinine     18538
DiasABP         6362
FiO2           16606
GCS            14004
Gender         19599
Glucose        18730
HCO3           18600
HCT            17859
HR              1848
Height           898
ICUType        19599
K              18579
Lactate        19094
MAP             6387
MechVent       16736
Mg             18603
NIDiasABP      13807
NIMAP          13857
NISysABP       13799
Na             18688
PaCO2          16804
PaO2           16810
Platelets      18351
RespRate       17234
SaO2           18554
SysABP          6362
Temp           10474
TroponinI      19955
TroponinT      19832
Urine           4890
WBC            18570
Weight          9506
pH             16597
dtype: int64

In [93]:
# Number of distinct RecordIDs in the "Sobrepeso" group.
len(classificacao_sobrepeso_ids_test.unique())

417

In [94]:
# RecordIDs whose BMI category is "Obesidade grau 1".
classificacao_obesidade_1_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"].eq("Obesidade grau 1"), "RecordID"
]
# Per-column NaN count over every test row of those records.
classificacao_obesidade_1_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_obesidade_1_ids_test)]
    .isna()
    .sum()
)
classificacao_obesidade_1_missing_test

RecordID           0
level_1            0
Time               0
ALP            11335
ALT            11333
AST            11333
Age              457
Albumin        11377
BUN            10653
Bilirubin      11329
Cholesterol    11502
Creatinine     10655
DiasABP         3246
FiO2            9494
GCS             7917
Gender         11280
Glucose        10758
HCO3           10692
HCT            10308
HR               946
Height           457
ICUType        11280
K              10643
Lactate        10940
MAP             3284
MechVent        9480
Mg             10645
NIDiasABP       8133
NIMAP           8153
NISysABP        8127
Na             10724
PaCO2           9670
PaO2            9672
Platelets      10591
RespRate        9807
SaO2           10771
SysABP          3246
Temp            5949
TroponinI      11481
TroponinT      11401
Urine           2617
WBC            10709
Weight          5575
pH              9565
dtype: int64

In [95]:
# Number of distinct RecordIDs in the "Obesidade grau 1" group.
len(classificacao_obesidade_1_ids_test.unique())

240

In [96]:
# RecordIDs whose BMI category is "Obesidade grau 2".
classificacao_obesidade_2_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"].eq("Obesidade grau 2"), "RecordID"
]
# Per-column NaN count over every test row of those records.
classificacao_obesidade_2_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_obesidade_2_ids_test)]
    .isna()
    .sum()
)
classificacao_obesidade_2_missing_test

RecordID          0
level_1           0
Time              0
ALP            4810
ALT            4808
AST            4808
Age             273
Albumin        4833
BUN            4519
Bilirubin      4808
Cholesterol    4881
Creatinine     4518
DiasABP        1650
FiO2           4105
GCS            3532
Gender         4794
Glucose        4565
HCO3           4532
HCT            4339
HR              460
Height          273
ICUType        4794
K              4521
Lactate        4687
MAP            1643
MechVent       4120
Mg             4530
NIDiasABP      3342
NIMAP          3380
NISysABP       3340
Na             4547
PaCO2          4156
PaO2           4162
Platelets      4493
RespRate       4069
SaO2           4515
SysABP         1650
Temp           2514
TroponinI      4893
TroponinT      4844
Urine          1440
WBC            4540
Weight         2446
pH             4116
dtype: int64

In [97]:
# Number of distinct RecordIDs in the "Obesidade grau 2" group.
len(classificacao_obesidade_2_ids_test.unique())

102

In [98]:
# RecordIDs whose BMI category is "Obesidade grau 3".
classificacao_obesidade_3_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"].eq("Obesidade grau 3"), "RecordID"
]
# Per-column NaN count over every test row of those records.
classificacao_obesidade_3_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_obesidade_3_ids_test)]
    .isna()
    .sum()
)
classificacao_obesidade_3_missing_test

RecordID          0
level_1           0
Time              0
ALP            4159
ALT            4155
AST            4156
Age             169
Albumin        4162
BUN            3921
Bilirubin      4154
Cholesterol    4215
Creatinine     3921
DiasABP        1666
FiO2           3310
GCS            2985
Gender         4136
Glucose        3961
HCO3           3936
HCT            3837
HR              337
Height          169
ICUType        4136
K              3927
Lactate        3976
MAP            1684
MechVent       3359
Mg             3933
NIDiasABP      2653
NIMAP          2682
NISysABP       2650
Na             3952
PaCO2          3495
PaO2           3492
Platelets      3928
RespRate       3689
SaO2           3927
SysABP         1666
Temp           2327
TroponinI      4217
TroponinT      4183
Urine          1041
WBC            3948
Weight         1444
pH             3469
dtype: int64

In [99]:
# Number of distinct RecordIDs in the "Obesidade grau 3" group.
len(classificacao_obesidade_3_ids_test.unique())

88

In [100]:
# Summary table for the test split: one row per dataset column, one column per
# demographic subgroup. The values are the `.isna().sum()` results computed in
# the cells above, i.e. NaN counts.
df_missing_test = pd.DataFrame(columns=df_columns)
# Transposing the empty frame turns the names in df_columns into the row index,
# so every series assigned below is aligned on those names.
df_missing_transpose_test = df_missing_test.T

# Column label -> NaN-count series for that subgroup, in display order.
test_missing_by_group = {
    "Female": female_gender_missing_rate_test,
    "Male": male_gender_missing_rate_test,
    "Undefined gender": undefined_gender_missing_rate_test,
    "ICUType 1": ICUType_1_test_missing,
    "ICUType 2": ICUType_2_test_missing,
    "ICUType 3": ICUType_3_test_missing,
    "ICUType 4": ICUType_4_test_missing,
    "Age 65+": more_than_or_equal_to_65_test_missing,
    "Age 65-": less_than_65_test_missing,
    "Low Weight": classificacao_baixo_peso_missing_test,
    "Normal Weight": classificacao_normal_peso_missing_test,
    "Overweight": classificacao_sobrepeso_missing_test,
    "Obesity Grade 1": classificacao_obesidade_1_missing_test,
    "Obesity Grade 2": classificacao_obesidade_2_missing_test,
    "Obesity Grade 3": classificacao_obesidade_3_missing_test,
    "Undefined classification": classificacao_undefined_missing_test,
}
for group_label, missing_counts in test_missing_by_group.items():
    df_missing_transpose_test[group_label] = missing_counts

# Remove the RecordID, level_1, Time, Age and Gender rows in a single call.
df_missing_transpose_test = df_missing_transpose_test.drop(
    index=["RecordID", "level_1", "Time", "Age", "Gender"]
)
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>original Missing rate per Variable by demographics - Test</h2>"))
df_missing_transpose_test

Unnamed: 0,Female,Male,Undefined gender,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3,Undefined classification
ALP,50060,63153,96,16420,24292,40551,32046,62592,50717,1694,16751,19691,11335,4810,4159,54869
ALT,50048,63117,96,16405,24288,40531,32037,62576,50685,1694,16746,19684,11333,4808,4155,54841
AST,50048,63116,96,16405,24289,40530,32036,62573,50687,1694,16746,19685,11333,4808,4156,54838
Albumin,50207,63401,96,16466,24357,40733,32148,62710,50994,1702,16813,19780,11377,4833,4162,55037
BUN,47197,59581,94,15498,22894,38333,30147,58935,47937,1602,15752,18542,10653,4519,3921,51883
Bilirubin,50046,63086,96,16405,24289,40494,32040,62555,50673,1694,16745,19675,11329,4808,4154,54823
Cholesterol,50746,64103,96,16540,24471,41376,32558,63287,51658,1726,16962,19973,11502,4881,4215,55686
Creatinine,47182,59555,94,15484,22895,38307,30145,58906,47925,1602,15747,18538,10655,4518,3921,51850
DiasABP,25278,28406,96,9869,6001,26565,11345,28973,24807,622,6111,6362,3246,1650,1666,34123
FiO2,43528,54027,95,14897,20496,35561,26696,53592,44058,1457,14227,16606,9494,4105,3310,48451
