In [107]:
import os
import sys
import pandas as pd
from IPython.display import display, HTML
# Make the parent directory importable so project-local packages resolve
# when the notebook is run from its own folder.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:  # guard keeps re-runs from appending duplicates
    sys.path.append(module_path)

<h2>Loading dataset</h2>

In [108]:
# Load the PhysioNet 2012 dataset through the project's benchpots preprocessing helper.
from pypots.benchpots.datasets import preprocess_physionet2012
# NOTE(review): `rate=0.1` is presumably the artificial missing rate applied by the
# helper, and `subset="all"` presumably selects every PhysioNet subset — confirm
# against the helper's documentation.
physionet2012_dataset = preprocess_physionet2012(subset="all", rate=0.1)

2024-11-09 17:58:10 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-11-09 17:58:10 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-11-09 17:58:10 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-11-09 17:58:10 [INFO]: Loaded successfully!


<h3>Training data</h3>

<h4>Loading training dataset</h4>

In [109]:
# Training split of the preprocessed dataset. The cells below treat it as a
# long-format DataFrame with `RecordID` and `Time` columns.
train_X = physionet2012_dataset['train_X']

In [110]:
# RecordIDs of the rows whose Gender is coded 0.0
# (female, going by the variable name — confirm against the dataset docs).
female_gender_ids = train_X.loc[train_X["Gender"] == 0.0, "RecordID"]

# Per-column NaN counts over every row of those records.
# Despite the "_rate" suffix these are absolute counts, not proportions.
female_gender_missing_rate = (
    train_X.loc[train_X["RecordID"].isin(female_gender_ids)]
    .isna()
    .sum()
)
female_gender_missing_rate

RecordID            0
level_1             0
Time                0
ALP            158045
ALT            157980
AST            157985
Age              9115
Albumin        158619
BUN            149146
Bilirubin      157941
Cholesterol    160380
Creatinine     149101
DiasABP         78737
FiO2           135923
GCS            109243
Gender         157309
Glucose        149679
HCO3           149355
HCT            145974
HR              15099
Height           9115
ICUType        157309
K              148473
Lactate        154454
MAP             79234
MechVent       136790
Mg             149366
NIDiasABP       89025
NIMAP           90075
NISysABP        88965
Na             149201
PaCO2          143529
PaO2           143542
Platelets      149282
RespRate       117230
SaO2           154557
SysABP          78732
Temp           103348
TroponinI      160277
TroponinT      158938
Urine           48729
WBC            150120
Weight          73759
pH             142861
dtype: int64

In [111]:
# RecordIDs of the rows whose Gender is coded 1.0
# (male, going by the variable name — confirm against the dataset docs).
male_gender_ids = train_X.loc[train_X["Gender"] == 1.0, "RecordID"]

# Per-column NaN counts over every row of those records.
# Despite the "_rate" suffix these are absolute counts, not proportions.
male_gender_missing_rate = (
    train_X.loc[train_X["RecordID"].isin(male_gender_ids)]
    .isna()
    .sum()
)
male_gender_missing_rate

RecordID            0
level_1             0
Time                0
ALP            203843
ALT            203745
AST            203746
Age             12097
Albumin        204621
BUN            192216
Bilirubin      203703
Cholesterol    206762
Creatinine     192134
DiasABP         91400
FiO2           174506
GCS            141008
Gender         202805
Glucose        193221
HCO3           192607
HCT            187157
HR              20685
Height          12097
ICUType        202805
K              191565
Lactate        198715
MAP             91969
MechVent       176195
Mg             192477
NIDiasABP      122331
NIMAP          123501
NISysABP       122253
Na             192631
PaCO2          181937
PaO2           181979
Platelets      191673
RespRate       161194
SaO2           198170
SysABP          91388
Temp           128045
TroponinI      206734
TroponinT      204907
Urine           64205
WBC            193210
Weight         100158
pH             180595
dtype: int64

In [112]:
# RecordIDs of the rows whose Gender is coded -1.0
# (treated as "undefined" here, going by the variable name).
undefined_gender_ids = train_X.loc[train_X["Gender"] == -1.0, "RecordID"]

# Per-column NaN counts over every row of those records.
# Despite the "_rate" suffix these are absolute counts, not proportions.
undefined_gender_missing_rate = (
    train_X.loc[train_X["RecordID"].isin(undefined_gender_ids)]
    .isna()
    .sum()
)
undefined_gender_missing_rate

RecordID         0
level_1          0
Time             0
ALP            429
ALT            429
AST            429
Age             98
Albumin        428
BUN            401
Bilirubin      429
Cholesterol    432
Creatinine     401
DiasABP        218
FiO2           396
GCS            332
Gender         423
Glucose        401
HCO3           401
HCT            403
HR             111
Height          98
ICUType        423
K              401
Lactate        404
MAP            221
MechVent       400
Mg             404
NIDiasABP      285
NIMAP          285
NISysABP       285
Na             402
PaCO2          386
PaO2           386
Platelets      403
RespRate       343
SaO2           429
SysABP         218
Temp           302
TroponinI      431
TroponinT      427
Urine          213
WBC            405
Weight         224
pH             381
dtype: int64

In [113]:
# RecordIDs whose row at Time == 0.0 carries ICUType 1.0.
ICUType_1_training_ids = train_X.loc[
    (train_X["ICUType"] == 1.0) & (train_X["Time"] == 0.0),
    "RecordID",
]

# Per-column NaN counts across all rows (every time step) of those records.
ICUType_1_training_missing = (
    train_X.loc[train_X["RecordID"].isin(ICUType_1_training_ids)]
    .isna()
    .sum()
)
ICUType_1_training_missing

RecordID           0
level_1            0
Time               0
ALP            54450
ALT            54408
AST            54413
Age             4450
Albumin        54631
BUN            51328
Bilirubin      54415
Cholesterol    54904
Creatinine     51255
DiasABP        32848
FiO2           49299
GCS            40906
Gender         54097
Glucose        51497
HCO3           51452
HCT            50446
HR              6864
Height          4450
ICUType        54097
K              50675
Lactate        54059
MAP            32928
MechVent       49769
Mg             51312
NIDiasABP      28199
NIMAP          28337
NISysABP       28175
Na             51464
PaCO2          50789
PaO2           50785
Platelets      51320
RespRate       35091
SaO2           52556
SysABP         32846
Temp           38624
TroponinI      55085
TroponinT      54057
Urine          23544
WBC            51731
Weight         30179
pH             50677
dtype: int64

In [114]:
# RecordIDs whose row at Time == 0.0 carries ICUType 2.0.
ICUType_2_training_ids = train_X.loc[
    (train_X["ICUType"] == 2.0) & (train_X["Time"] == 0.0),
    "RecordID",
]

# Per-column NaN counts across all rows (every time step) of those records.
ICUType_2_training_missing = (
    train_X.loc[train_X["RecordID"].isin(ICUType_2_training_ids)]
    .isna()
    .sum()
)
ICUType_2_training_missing

RecordID           0
level_1            0
Time               0
ALP            79732
ALT            79719
AST            79719
Age             2533
Albumin        79925
BUN            75124
Bilirubin      79730
Cholesterol    80282
Creatinine     75109
DiasABP        17354
FiO2           66657
GCS            59106
Gender         78631
Glucose        76551
HCO3           75528
HCT            71546
HR              6402
Height          2533
ICUType        78631
K              76035
Lactate        77543
MAP            17244
MechVent       67020
Mg             75269
NIDiasABP      62464
NIMAP          62632
NISysABP       62425
Na             76279
PaCO2          63962
PaO2           64005
Platelets      73815
RespRate       76802
SaO2           71543
SysABP         17351
Temp           33826
TroponinI      80205
TroponinT      80117
Urine          12829
WBC            74904
Weight         39107
pH             62393
dtype: int64

In [115]:
# RecordIDs whose row at Time == 0.0 carries ICUType 3.0.
ICUType_3_training_ids = train_X.loc[
    (train_X["ICUType"] == 3.0) & (train_X["Time"] == 0.0),
    "RecordID",
]

# Per-column NaN counts across all rows (every time step) of those records.
ICUType_3_training_missing = (
    train_X.loc[train_X["RecordID"].isin(ICUType_3_training_ids)]
    .isna()
    .sum()
)
ICUType_3_training_missing

RecordID            0
level_1             0
Time                0
ALP            127518
ALT            127433
AST            127435
Age              9592
Albumin        128172
BUN            120553
Bilirubin      127313
Cholesterol    130185
Creatinine     120514
DiasABP         84942
FiO2           111401
GCS             95874
Gender         127605
Glucose        120566
HCO3           120542
HCT            118848
HR              14141
Height           9592
ICUType        127605
K              119721
Lactate        125441
MAP             85563
MechVent       112954
Mg             121092
NIDiasABP       55708
NIMAP           57078
NISysABP        55666
Na             120274
PaCO2          120611
PaO2           120595
Platelets      121524
RespRate        89212
SaO2           128762
SysABP          84938
Temp            92896
TroponinI      129989
TroponinT      128544
Urine           51610
WBC            121909
Weight          46368
pH             120455
dtype: int64

In [116]:
# RecordIDs whose row at Time == 0.0 carries ICUType 4.0.
ICUType_4_training_ids = train_X.loc[
    (train_X["ICUType"] == 4.0) & (train_X["Time"] == 0.0),
    "RecordID",
]

# Per-column NaN counts across all rows (every time step) of those records.
ICUType_4_training_missing = (
    train_X.loc[train_X["RecordID"].isin(ICUType_4_training_ids)]
    .isna()
    .sum()
)
ICUType_4_training_missing

RecordID            0
level_1             0
Time                0
ALP            100617
ALT            100594
AST            100593
Age              4735
Albumin        100940
BUN             94758
Bilirubin      100615
Cholesterol    102203
Creatinine      94758
DiasABP         35211
FiO2            83468
GCS             54697
Gender         100204
Glucose         94687
HCO3            94841
HCT             92694
HR               8488
Height           4735
ICUType        100204
K               94008
Lactate         96530
MAP             35689
MechVent        83642
Mg              94574
NIDiasABP       65270
NIMAP           65814
NISysABP        65237
Na              94217
PaCO2           90490
PaO2            90522
Platelets       94699
RespRate        77662
SaO2           100295
SysABP          35203
Temp            66349
TroponinI      102163
TroponinT      101554
Urine           25164
WBC             95191
Weight          58487
pH              90312
dtype: int64

In [117]:
# RecordIDs whose row at Time == 0.0 reports an Age of 65 or more.
more_than_or_equal_to_65_train_ids = train_X.loc[
    (train_X["Age"] >= 65) & (train_X["Time"] == 0.0),
    "RecordID",
]

# Per-column NaN counts across all rows (every time step) of those records.
more_than_or_equal_to_65_train_missing = (
    train_X.loc[train_X["RecordID"].isin(more_than_or_equal_to_65_train_ids)]
    .isna()
    .sum()
)
more_than_or_equal_to_65_train_missing

RecordID            0
level_1             0
Time                0
ALP            200061
ALT            200010
AST            200009
Age             11134
Albumin        200504
BUN            188548
Bilirubin      199956
Cholesterol    202335
Creatinine     188470
DiasABP         91844
FiO2           170720
GCS            139461
Gender         198481
Glucose        189555
HCO3           188879
HCT            183660
HR              18870
Height          11134
ICUType        198481
K              187917
Lactate        194945
MAP             92437
MechVent       172771
Mg             188692
NIDiasABP      116930
NIMAP          117919
NISysABP       116859
Na             188981
PaCO2          178934
PaO2           178966
Platelets      188165
RespRate       153463
SaO2           193387
SysABP          91835
Temp           124523
TroponinI      202185
TroponinT      199965
Urine           58687
WBC            189494
Weight          94820
pH             177786
dtype: int64

In [118]:
# RecordIDs whose row at Time == 0.0 reports an Age below 65.
less_than_65_train_ids = train_X.loc[
    (train_X["Age"] < 65) & (train_X["Time"] == 0.0),
    "RecordID",
]

# Per-column NaN counts across all rows (every time step) of those records.
less_than_65_train_missing = (
    train_X.loc[train_X["RecordID"].isin(less_than_65_train_ids)]
    .isna()
    .sum()
)
less_than_65_train_missing


RecordID            0
level_1             0
Time                0
ALP            162256
ALT            162144
AST            162151
Age             10176
Albumin        163164
BUN            153215
Bilirubin      162117
Cholesterol    165239
Creatinine     153166
DiasABP         78511
FiO2           140105
GCS            111122
Gender         162056
Glucose        153746
HCO3           153484
HCT            149874
HR              17025
Height          10176
ICUType        162056
K              152522
Lactate        158628
MAP             78987
MechVent       140614
Mg             153555
NIDiasABP       94711
NIMAP           95942
NISysABP        94644
Na             153253
PaCO2          146918
PaO2           146941
Platelets      153193
RespRate       125304
SaO2           159769
SysABP          78503
Temp           107172
TroponinI      165257
TroponinT      164307
Urine           54460
WBC            154241
Weight          79321
pH             146051
dtype: int64

In [119]:
# Keep only the rows where both Height and Weight are present (not NaN)
# and neither carries the -1 placeholder value.
filtered_train_X = train_X.loc[
    train_X[["Height", "Weight"]].notna().all(axis=1)
    & train_X[["Height", "Weight"]].ne(-1).all(axis=1)
]

In [120]:
def classify_BMI(BMI):
    """Map a body-mass-index value to its weight-class label.

    Classes (labels are the Portuguese strings used throughout the notebook):

    * ``BMI <= 18.5``       -> "Baixo peso"
    * ``18.5 < BMI < 25``   -> "Peso normal"
    * ``25 <= BMI < 30``    -> "Sobrepeso"
    * ``30 <= BMI < 35``    -> "Obesidade grau 1"
    * ``35 <= BMI < 40``    -> "Obesidade grau 2"
    * ``BMI >= 40``         -> "Obesidade grau 3"

    The upper bounds are half-open so that the ranges are contiguous: values
    that fall between one-decimal boundaries (e.g. 18.55 or 24.95) get a class
    instead of silently yielding ``None``. Every value that already had a
    label keeps the same label.

    Parameters
    ----------
    BMI : float
        Body mass index.

    Returns
    -------
    str or None
        The class label, or ``None`` when ``BMI`` is NaN.
    """
    if BMI <= 18.5:
        return "Baixo peso"
    elif BMI < 25:
        return "Peso normal"
    elif BMI < 30:
        return "Sobrepeso"
    elif BMI < 35:
        return "Obesidade grau 1"
    elif BMI < 40:
        return "Obesidade grau 2"
    elif BMI >= 40:
        # Explicit comparison (rather than a bare `else`) so that NaN, for which
        # every comparison is False, is not labelled as the highest class.
        return "Obesidade grau 3"
    return None

In [121]:
# New frame with Height divided by 100 (presumably centimetres -> metres, per
# the variable name); filtered_train_X itself is left untouched.
filtered_train_X_metros = filtered_train_X.assign(
    Height=filtered_train_X["Height"] / 100
)
filtered_train_X_metros["Height"]

144       1.803
145       1.803
146       1.803
147       1.803
148       1.803
          ...  
575321    1.727
575322    1.727
575323    1.727
575325    1.727
575327    1.727
Name: Height, Length: 102714, dtype: float64

In [122]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so the
# two columns added below also show up on filtered_train_X_metros.
bmi_data_train = filtered_train_X_metros

# BMI = weight / height^2, rounded to one decimal place.
bmi_data_train["BMI"] = (
    bmi_data_train["Weight"] / bmi_data_train["Height"] ** 2
).round(1)

# Label every row with its BMI class.
bmi_data_train["Classificacao"] = bmi_data_train["BMI"].apply(classify_BMI)
bmi_data_train.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
144,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,,36.3,,,,11.5,84.6,,26.0,Sobrepeso
145,132543,1,1.0,,,,68.0,,,,...,,,,,,,84.6,,26.0,Sobrepeso
146,132543,2,2.0,,,,68.0,,,,...,,,,,,,84.6,,26.0,Sobrepeso
147,132543,3,3.0,,,,68.0,,,,...,,36.4,,,,,84.6,,26.0,Sobrepeso
148,132543,4,4.0,,,,68.0,,,,...,,,,,,,84.6,,26.0,Sobrepeso


In [123]:
# Collapse to one row per RecordID. GroupBy.first() takes the first non-null
# value of each column within the group, not simply the first row.
bmi_data_train = (
    bmi_data_train
    .groupby("RecordID")
    .first()
    .reset_index()
)
bmi_data_train

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,,36.3,,,600.0,11.5,84.6,,26.0,Sobrepeso
1,132547,0,0.0,,,,64.0,,,,...,,,,,,,114.0,,35.1,Obesidade grau 2
2,132548,0,0.0,,,,68.0,,32.0,,...,205.00,36.3,0.7,,120.0,6.2,87.0,,32.9,Obesidade grau 1
3,132551,0,0.0,47.0,46.0,82.0,78.0,1.9,81.0,0.3,...,102.75,38.0,3.5,,120.0,16.1,48.4,7.40,18.3,Baixo peso
4,132555,0,0.0,,,,74.0,,19.0,,...,98.00,34.8,,,35.0,9.0,66.1,7.39,21.5,Peso normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4051,162999,0,0.0,,,,70.0,,30.0,,...,0.00,36.3,,,,2.5,68.1,,20.4,Peso normal
4052,163002,0,0.0,,,,53.0,,,,...,,,,,,2.7,68.0,7.27,31.3,Obesidade grau 1
4053,163008,0,0.0,,,,59.0,,24.0,,...,97.00,37.6,,,45.0,6.9,98.5,7.38,34.0,Obesidade grau 1
4054,163029,0,0.0,,,,61.0,,,,...,,,,,,,85.0,,28.5,Sobrepeso


In [124]:
# Number of rows in each BMI class (bmi_data_train holds one row per RecordID
# after the groupby above, so these are record counts).
bmi_data_train["Classificacao"].value_counts()

Classificacao
Sobrepeso           1387
Peso normal         1194
Obesidade grau 1     734
Obesidade grau 2     331
Obesidade grau 3     282
Baixo peso           128
Name: count, dtype: int64

In [208]:
# Despite its name, this holds the RecordIDs that DO have a BMI class; the
# "undefined" group is obtained by negating the membership test below.
classificacao_undefined_ids = bmi_data_train["RecordID"]

# Per-column NaN counts over all rows of the records absent from bmi_data_train.
classificacao_undefined_missing = (
    train_X.loc[~train_X["RecordID"].isin(classificacao_undefined_ids)]
    .isna()
    .sum()
)
classificacao_undefined_missing

RecordID            0
level_1             0
Time                0
ALP            170770
ALT            170690
AST            170689
Age             13213
Albumin        171276
BUN            161449
Bilirubin      170640
Cholesterol    173224
Creatinine     161380
DiasABP        105442
FiO2           149536
GCS            115361
Gender         169905
Glucose        161451
HCO3           161535
HCT            158914
HR              19540
Height          13213
ICUType        169905
K              160216
Lactate        167722
MAP            106112
MechVent       151394
Mg             161735
NIDiasABP       79963
NIMAP           81503
NISysABP        79907
Na             160922
PaCO2          161306
PaO2           161332
Platelets      162329
RespRate       113821
SaO2           171461
SysABP         105432
Temp           126901
TroponinI      173218
TroponinT      171394
Urine           63939
WBC            162803
Weight          82167
pH             161086
dtype: int64

In [125]:
# RecordIDs classified as "Baixo peso". No Time == 0.0 filter is needed here:
# bmi_data_train was already reduced to one row per RecordID by the groupby.
classificacao_baixo_peso_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Baixo peso", "RecordID"
]

# Per-column NaN counts across all rows of those records in the training frame.
classificacao_baixo_peso_missing = (
    train_X.loc[train_X["RecordID"].isin(classificacao_baixo_peso_ids)]
    .isna()
    .sum()
)
classificacao_baixo_peso_missing

RecordID          0
level_1           0
Time              0
ALP            6052
ALT            6051
AST            6050
Age             214
Albumin        6076
BUN            5691
Bilirubin      6050
Cholesterol    6131
Creatinine     5689
DiasABP        2289
FiO2           5132
GCS            4318
Gender         6016
Glucose        5714
HCO3           5697
HCT            5568
HR              426
Height          214
ICUType        6016
K              5664
Lactate        5847
MAP            2290
MechVent       5123
Mg             5687
NIDiasABP      3885
NIMAP          3931
NISysABP       3881
Na             5695
PaCO2          5327
PaO2           5317
Platelets      5679
RespRate       5156
SaO2           5759
SysABP         2288
Temp           3605
TroponinI      6116
TroponinT      6078
Urine          1751
WBC            5726
Weight         3003
pH             5275
dtype: int64

In [126]:
# Sanity check: number of distinct records in the "Baixo peso" group.
teste = pd.unique(classificacao_baixo_peso_ids)
len(teste)

128

In [127]:
# RecordIDs classified as "Peso normal". No Time == 0.0 filter is needed here:
# bmi_data_train was already reduced to one row per RecordID by the groupby.
classificacao_normal_peso_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Peso normal", "RecordID"
]

# Per-column NaN counts across all rows of those records in the training frame.
classificacao_normal_peso_missing = (
    train_X.loc[train_X["RecordID"].isin(classificacao_normal_peso_ids)]
    .isna()
    .sum()
)
classificacao_normal_peso_missing

RecordID           0
level_1            0
Time               0
ALP            56453
ALT            56425
AST            56429
Age             2426
Albumin        56643
BUN            53170
Bilirubin      56427
Cholesterol    57222
Creatinine     53159
DiasABP        19941
FiO2           47866
GCS            39057
Gender         56118
Glucose        53557
HCO3           53304
HCT            51529
HR              4780
Height          2426
ICUType        56118
K              53051
Lactate        54773
MAP            20126
MechVent       48044
Mg             53178
NIDiasABP      37917
NIMAP          38094
NISysABP       37893
Na             53391
PaCO2          49090
PaO2           49106
Platelets      52790
RespRate       47993
SaO2           53916
SysABP         19940
Temp           31839
TroponinI      57172
TroponinT      56787
Urine          15100
WBC            53315
Weight         27458
pH             48535
dtype: int64

In [128]:
# Sanity check: number of distinct records in the "Peso normal" group.
teste = pd.unique(classificacao_normal_peso_ids)
len(teste)

1194

In [129]:
# RecordIDs classified as "Sobrepeso". No Time == 0.0 filter is needed here:
# bmi_data_train was already reduced to one row per RecordID by the groupby.
classificacao_sobrepeso_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Sobrepeso", "RecordID"
]

# Per-column NaN counts across all rows of those records in the training frame.
classificacao_sobrepeso_missing = (
    train_X.loc[train_X["RecordID"].isin(classificacao_sobrepeso_ids)]
    .isna()
    .sum()
)
classificacao_sobrepeso_missing

RecordID           0
level_1            0
Time               0
ALP            65481
ALT            65450
AST            65453
Age             2903
Albumin        65798
BUN            61642
Bilirubin      65434
Cholesterol    66449
Creatinine     61617
DiasABP        21609
FiO2           55535
GCS            46410
Gender         65189
Glucose        62229
HCO3           61832
HCT            59510
HR              5889
Height          2903
ICUType        65189
K              61711
Lactate        63743
MAP            21650
MechVent       55755
Mg             61703
NIDiasABP      45478
NIMAP          45710
NISysABP       45450
Na             62076
PaCO2          56143
PaO2           56164
Platelets      61046
RespRate       56588
SaO2           61937
SysABP         21605
Temp           34926
TroponinI      66428
TroponinT      65967
Urine          17210
WBC            61786
Weight         32320
pH             55487
dtype: int64

In [130]:
# Sanity check: number of distinct records in the "Sobrepeso" group.
teste = pd.unique(classificacao_sobrepeso_ids)
len(teste)

1387

In [131]:
# RecordIDs classified as "Obesidade grau 1". No Time == 0.0 filter is needed:
# bmi_data_train was already reduced to one row per RecordID by the groupby.
classificacao_obesidade_1_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Obesidade grau 1", "RecordID"
]

# Per-column NaN counts across all rows of those records in the training frame.
classificacao_obesidade_1_missing = (
    train_X.loc[train_X["RecordID"].isin(classificacao_obesidade_1_ids)]
    .isna()
    .sum()
)
classificacao_obesidade_1_missing

RecordID           0
level_1            0
Time               0
ALP            34629
ALT            34616
AST            34617
Age             1390
Albumin        34805
BUN            32618
Bilirubin      34610
Cholesterol    35175
Creatinine     32608
DiasABP        11247
FiO2           29033
GCS            24495
Gender         34498
Glucose        32908
HCO3           32726
HCT            31568
HR              2883
Height          1390
ICUType        34498
K              32603
Lactate        33627
MAP            11380
MechVent       29073
Mg             32678
NIDiasABP      24197
NIMAP          24302
NISysABP       24185
Na             32805
PaCO2          29550
PaO2           29554
Platelets      32414
RespRate       30251
SaO2           32816
SysABP         11247
Temp           18496
TroponinI      35141
TroponinT      34916
Urine           7992
WBC            32758
Weight         17067
pH             29209
dtype: int64

In [132]:
# Sanity check: number of distinct records in the "Obesidade grau 1" group.
teste = pd.unique(classificacao_obesidade_1_ids)
len(teste)

734

In [133]:
# RecordIDs classified as "Obesidade grau 2". No Time == 0.0 filter is needed:
# bmi_data_train was already reduced to one row per RecordID by the groupby.
classificacao_obesidade_2_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Obesidade grau 2", "RecordID"
]

# Per-column NaN counts across all rows of those records in the training frame.
classificacao_obesidade_2_missing = (
    train_X.loc[train_X["RecordID"].isin(classificacao_obesidade_2_ids)]
    .isna()
    .sum()
)
classificacao_obesidade_2_missing

RecordID           0
level_1            0
Time               0
ALP            15633
ALT            15628
AST            15628
Age              644
Albumin        15705
BUN            14684
Bilirubin      15622
Cholesterol    15860
Creatinine     14673
DiasABP         5225
FiO2           12992
GCS            11269
Gender         15557
Glucose        14822
HCO3           14725
HCT            14206
HR              1334
Height           644
ICUType        15557
K              14688
Lactate        15067
MAP             5238
MechVent       13180
Mg             14739
NIDiasABP      10809
NIMAP          10882
NISysABP       10804
Na             14766
PaCO2          13212
PaO2           13215
Platelets      14581
RespRate       13392
SaO2           14673
SysABP          5224
Temp            8233
TroponinI      15854
TroponinT      15711
Urine           3856
WBC            14728
Weight          6578
pH             13097
dtype: int64

In [134]:
# Sanity check: number of distinct records in the "Obesidade grau 2" group.
teste = pd.unique(classificacao_obesidade_2_ids)
len(teste)

331

In [135]:
# RecordIDs classified as "Obesidade grau 3". No Time == 0.0 filter is needed:
# bmi_data_train was already reduced to one row per RecordID by the groupby.
classificacao_obesidade_3_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Obesidade grau 3", "RecordID"
]

# Per-column NaN counts across all rows of those records in the training frame.
classificacao_obesidade_3_missing = (
    train_X.loc[train_X["RecordID"].isin(classificacao_obesidade_3_ids)]
    .isna()
    .sum()
)
classificacao_obesidade_3_missing

RecordID           0
level_1            0
Time               0
ALP            13299
ALT            13294
AST            13294
Age              520
Albumin        13365
BUN            12509
Bilirubin      13290
Cholesterol    13513
Creatinine     12510
DiasABP         4602
FiO2           10731
GCS             9673
Gender         13254
Glucose        12620
HCO3           12544
HCT            12239
HR              1043
Height           520
ICUType        13254
K              12506
Lactate        12794
MAP             4628
MechVent       10816
Mg             12527
NIDiasABP       9392
NIMAP           9439
NISysABP        9383
Na             12579
PaCO2          11224
PaO2           11219
Platelets      12519
RespRate       11566
SaO2           12594
SysABP          4602
Temp            7695
TroponinI      13513
TroponinT      13419
Urine           3299
WBC            12619
Weight          5548
pH             11148
dtype: int64

In [136]:
# Sanity check: number of distinct RecordIDs in the "Obesidade grau 3" group.
teste = pd.unique(classificacao_obesidade_3_ids)
teste.size

282

In [137]:
# Column labels of train_X; reused below as the row index of the summary
# tables (both the training and the validation one).
df_columns = train_X.columns
df_columns

Index(['RecordID', 'level_1', 'Time', 'ALP', 'ALT', 'AST', 'Age', 'Albumin',
       'BUN', 'Bilirubin', 'Cholesterol', 'Creatinine', 'DiasABP', 'FiO2',
       'GCS', 'Gender', 'Glucose', 'HCO3', 'HCT', 'HR', 'Height', 'ICUType',
       'K', 'Lactate', 'MAP', 'MechVent', 'Mg', 'NIDiasABP', 'NIMAP',
       'NISysABP', 'Na', 'PaCO2', 'PaO2', 'Platelets', 'RespRate', 'SaO2',
       'SysABP', 'Temp', 'TroponinI', 'TroponinT', 'Urine', 'WBC', 'Weight',
       'pH'],
      dtype='object')

In [210]:
# Summary table for the training split: one row per variable, one column per
# demographic subgroup. Each input Series is a per-variable missing-value
# summary computed in an earlier cell.
# NOTE(review): the Series built in the visible cells come from
# `.isna().sum()`, i.e. raw NaN counts, so columns of differently sized groups
# are not directly comparable even though the title says "Rate" -- confirm.
df_missing = pd.DataFrame(columns=df_columns)
df_missing_transpose = pd.DataFrame(
    {
        "Female": female_gender_missing_rate,
        "Male": male_gender_missing_rate,
        "Undefined gender": undefined_gender_missing_rate,
        "ICUType 1": ICUType_1_training_missing,
        "ICUType 2": ICUType_2_training_missing,
        "ICUType 3": ICUType_3_training_missing,
        "ICUType 4": ICUType_4_training_missing,
        "Age 65+": more_than_or_equal_to_65_train_missing,
        "Age 65-": less_than_65_train_missing,
        "Low Weight": classificacao_baixo_peso_missing,
        "Normal Weight": classificacao_normal_peso_missing,
        "Overweight": classificacao_sobrepeso_missing,
        "Obesity Grade 1": classificacao_obesidade_1_missing,
        "Obesity Grade 2": classificacao_obesidade_2_missing,
        "Obesity Grade 3": classificacao_obesidade_3_missing,
        "Undefined classification": classificacao_undefined_missing,
    },
    index=df_columns,  # align every Series on the train_X column labels
)
# Identifier/time columns and the grouping attributes Age and Gender are not
# reported as rows.
df_missing_transpose = df_missing_transpose.drop(
    index=["RecordID", "level_1", "Time", "Age", "Gender"]
)
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>Original Missing Rate per Variable by demographics - Train</h2>"))
df_missing_transpose

Unnamed: 0,Female,Male,Undefined gender,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3,Undefined classification
ALP,158045,203843,429,54450,79732,127518,100617,200061,162256,6052,56453,65481,34629,15633,13299,170770
ALT,157980,203745,429,54408,79719,127433,100594,200010,162144,6051,56425,65450,34616,15628,13294,170690
AST,157985,203746,429,54413,79719,127435,100593,200009,162151,6050,56429,65453,34617,15628,13294,170689
Albumin,158619,204621,428,54631,79925,128172,100940,200504,163164,6076,56643,65798,34805,15705,13365,171276
BUN,149146,192216,401,51328,75124,120553,94758,188548,153215,5691,53170,61642,32618,14684,12509,161449
Bilirubin,157941,203703,429,54415,79730,127313,100615,199956,162117,6050,56427,65434,34610,15622,13290,170640
Cholesterol,160380,206762,432,54904,80282,130185,102203,202335,165239,6131,57222,66449,35175,15860,13513,173224
Creatinine,149101,192134,401,51255,75109,120514,94758,188470,153166,5689,53159,61617,32608,14673,12510,161380
DiasABP,78737,91400,218,32848,17354,84942,35211,91844,78511,2289,19941,21609,11247,5225,4602,105442
FiO2,135923,174506,396,49299,66657,111401,83468,170720,140105,5132,47866,55535,29033,12992,10731,149536


<h3>Validation data</h3>

In [139]:
# Validation split of the preprocessed PhysioNet-2012 dataset loaded above.
validation_X = physionet2012_dataset['val_X']

In [140]:
# RecordIDs of validation patients with Gender == 0.0 (the code this notebook
# labels as female).
female_gender_validation_ids = validation_X.loc[
    validation_X["Gender"] == 0.0, "RecordID"
]
# Count NaNs per variable over every validation row of those patients.
female_gender_missing_rate_validation = (
    validation_X.loc[validation_X["RecordID"].isin(female_gender_validation_ids)]
    .isna()
    .sum()
)
female_gender_missing_rate_validation

RecordID           0
level_1            0
Time               0
ALP            40698
ALT            40675
AST            40671
Age             2719
Albumin        40824
BUN            38354
Bilirubin      40669
Cholesterol    41267
Creatinine     38345
DiasABP        19812
FiO2           35208
GCS            27934
Gender         40467
Glucose        38496
HCO3           38398
HCT            37444
HR              4289
Height          2719
ICUType        40467
K              38193
Lactate        39656
MAP            19884
MechVent       35247
Mg             38402
NIDiasABP      23454
NIMAP          23733
NISysABP       23434
Na             38402
PaCO2          36982
PaO2           37005
Platelets      38326
RespRate       30207
SaO2           39933
SysABP         19811
Temp           26863
TroponinI      41244
TroponinT      40879
Urine          12382
WBC            38518
Weight         20029
pH             36826
dtype: int64

In [141]:
# RecordIDs of validation patients with Gender == 1.0 (the code this notebook
# labels as male).
male_gender_validation_ids = validation_X.loc[
    validation_X["Gender"] == 1.0, "RecordID"
]
# Count NaNs per variable over every validation row of those patients.
male_gender_missing_rate_validation = (
    validation_X.loc[validation_X["RecordID"].isin(male_gender_validation_ids)]
    .isna()
    .sum()
)
male_gender_missing_rate_validation

RecordID           0
level_1            0
Time               0
ALP            49801
ALT            49785
AST            49782
Age             2966
Albumin        50045
BUN            46926
Bilirubin      49776
Cholesterol    50600
Creatinine     46909
DiasABP        21950
FiO2           42293
GCS            34780
Gender         49632
Glucose        47159
HCO3           47007
HCT            45668
HR              5078
Height          2966
ICUType        49632
K              46784
Lactate        48415
MAP            22203
MechVent       42808
Mg             47035
NIDiasABP      30557
NIMAP          30781
NISysABP       30540
Na             47021
PaCO2          44486
PaO2           44493
Platelets      46846
RespRate       39867
SaO2           48664
SysABP         21949
Temp           30930
TroponinI      50569
TroponinT      50146
Urine          15962
WBC            47192
Weight         24005
pH             44198
dtype: int64

In [142]:
# RecordIDs of validation patients with Gender == -1.0 (treated here as
# "undefined" gender).
undefined_gender_ids_validation = validation_X.loc[
    validation_X["Gender"] == -1.0, "RecordID"
]
# Count NaNs per variable over every validation row of those patients.
undefined_gender_missing_rate_validation = (
    validation_X.loc[validation_X["RecordID"].isin(undefined_gender_ids_validation)]
    .isna()
    .sum()
)
undefined_gender_missing_rate_validation

RecordID        0
level_1         0
Time            0
ALP            48
ALT            48
AST            48
Age            46
Albumin        48
BUN            47
Bilirubin      48
Cholesterol    48
Creatinine     47
DiasABP        48
FiO2           48
GCS            48
Gender         47
Glucose        47
HCO3           47
HCT            47
HR             48
Height         46
ICUType        47
K              47
Lactate        48
MAP            48
MechVent       48
Mg             47
NIDiasABP      48
NIMAP          48
NISysABP       48
Na             47
PaCO2          48
PaO2           48
Platelets      47
RespRate       48
SaO2           48
SysABP         48
Temp           48
TroponinI      48
TroponinT      48
Urine          48
WBC            47
Weight         47
pH             48
dtype: int64

In [211]:
# RecordIDs of validation patients whose row at Time == 0.0 has ICUType == 1.0.
ICUType_1_validation_ids = validation_X.loc[
    (validation_X["ICUType"] == 1.0) & (validation_X["Time"] == 0.0),
    "RecordID",
]
# Count NaNs per variable over every validation row of those patients.
ICUType_1_validation_missing = (
    validation_X.loc[validation_X["RecordID"].isin(ICUType_1_validation_ids)]
    .isna()
    .sum()
)
ICUType_1_validation_missing

RecordID           0
level_1            0
Time               0
ALP            13919
ALT            13905
AST            13904
Age             1310
Albumin        13953
BUN            13109
Bilirubin      13915
Cholesterol    14026
Creatinine     13097
DiasABP         7777
FiO2           12598
GCS            10545
Gender         13818
Glucose        13138
HCO3           13143
HCT            12852
HR              1943
Height          1310
ICUType        13818
K              12938
Lactate        13761
MAP             7793
MechVent       12648
Mg             13120
NIDiasABP       7942
NIMAP           7972
NISysABP        7941
Na             13147
PaCO2          12903
PaO2           12904
Platelets      13072
RespRate        9242
SaO2           13405
SysABP          7777
Temp            9846
TroponinI      14066
TroponinT      13809
Urine           6205
WBC            13174
Weight          7898
pH             12883
dtype: int64

In [212]:
# RecordIDs of validation patients whose row at Time == 0.0 has ICUType == 2.0.
ICUType_2_validation_ids = validation_X.loc[
    (validation_X["ICUType"] == 2.0) & (validation_X["Time"] == 0.0),
    "RecordID",
]
# Count NaNs per variable over every validation row of those patients.
ICUType_2_validation_missing = (
    validation_X.loc[validation_X["RecordID"].isin(ICUType_2_validation_ids)]
    .isna()
    .sum()
)
ICUType_2_validation_missing

RecordID           0
level_1            0
Time               0
ALP            18101
ALT            18097
AST            18097
Age              758
Albumin        18148
BUN            17033
Bilirubin      18098
Cholesterol    18235
Creatinine     17029
DiasABP         4142
FiO2           15237
GCS            13491
Gender         17860
Glucose        17376
HCO3           17117
HCT            16270
HR              1699
Height           758
ICUType        17860
K              17257
Lactate        17551
MAP             4151
MechVent       15379
Mg             17074
NIDiasABP      14283
NIMAP          14311
NISysABP       14278
Na             17312
PaCO2          14655
PaO2           14666
Platelets      16787
RespRate       17404
SaO2           16370
SysABP          4142
Temp            7660
TroponinI      18210
TroponinT      18183
Urine           2908
WBC            17009
Weight          9049
pH             14300
dtype: int64

In [213]:
# RecordIDs of validation patients whose row at Time == 0.0 has ICUType == 3.0.
ICUType_3_validation_ids = validation_X.loc[
    (validation_X["ICUType"] == 3.0) & (validation_X["Time"] == 0.0),
    "RecordID",
]
# Count NaNs per variable over every validation row of those patients.
ICUType_3_validation_missing = (
    validation_X.loc[validation_X["RecordID"].isin(ICUType_3_validation_ids)]
    .isna()
    .sum()
)
ICUType_3_validation_missing

RecordID           0
level_1            0
Time               0
ALP            32224
ALT            32210
AST            32205
Age             2464
Albumin        32382
BUN            30440
Bilirubin      32171
Cholesterol    32900
Creatinine     30431
DiasABP        21114
FiO2           27923
GCS            24065
Gender         32242
Glucose        30440
HCO3           30421
HCT            29964
HR              3605
Height          2464
ICUType        32242
K              30259
Lactate        31592
MAP            21264
MechVent       28298
Mg             30588
NIDiasABP      14465
NIMAP          14799
NISysABP       14454
Na             30387
PaCO2          30359
PaO2           30364
Platelets      30635
RespRate       23208
SaO2           32563
SysABP         21112
Temp           23557
TroponinI      32844
TroponinT      32495
Urine          12919
WBC            30733
Weight         11576
pH             30330
dtype: int64

In [214]:
# RecordIDs of validation patients whose row at Time == 0.0 has ICUType == 4.0.
ICUType_4_validation_ids = validation_X.loc[
    (validation_X["ICUType"] == 4.0) & (validation_X["Time"] == 0.0),
    "RecordID",
]
# Count NaNs per variable over every validation row of those patients.
ICUType_4_validation_missing = (
    validation_X.loc[validation_X["RecordID"].isin(ICUType_4_validation_ids)]
    .isna()
    .sum()
)
ICUType_4_validation_missing

RecordID           0
level_1            0
Time               0
ALP            26303
ALT            26296
AST            26295
Age             1199
Albumin        26434
BUN            24745
Bilirubin      26309
Cholesterol    26754
Creatinine     24744
DiasABP         8777
FiO2           21791
GCS            14661
Gender         26226
Glucose        24748
HCO3           24771
HCT            24073
HR              2168
Height          1199
ICUType        26226
K              24570
Lactate        25215
MAP             8927
MechVent       21778
Mg             24702
NIDiasABP      17369
NIMAP          17480
NISysABP       17349
Na             24624
PaCO2          23599
PaO2           23612
Platelets      24725
RespRate       20268
SaO2           26307
SysABP          8777
Temp           16778
TroponinI      26741
TroponinT      26586
Urine           6360
WBC            24841
Weight         15558
pH             23559
dtype: int64

In [215]:
# RecordIDs of validation patients whose row at Time == 0.0 has Age >= 65.
more_than_or_equal_to_65_validation_ids = validation_X.loc[
    (validation_X["Age"] >= 65) & (validation_X["Time"] == 0.0),
    "RecordID",
]
# Count NaNs per variable over every validation row of those patients.
more_than_or_equal_to_65_validation_missing = (
    validation_X.loc[
        validation_X["RecordID"].isin(more_than_or_equal_to_65_validation_ids)
    ]
    .isna()
    .sum()
)
more_than_or_equal_to_65_validation_missing

RecordID           0
level_1            0
Time               0
ALP            49193
ALT            49181
AST            49176
Age             2857
Albumin        49298
BUN            46298
Bilirubin      49161
Cholesterol    49784
Creatinine     46281
DiasABP        22924
FiO2           42239
GCS            34606
Gender         48833
Glucose        46537
HCO3           46381
HCT            45139
HR              4735
Height          2857
ICUType        48833
K              46145
Lactate        47869
MAP            23020
MechVent       42731
Mg             46380
NIDiasABP      28831
NIMAP          29099
NISysABP       28809
Na             46427
PaCO2          44259
PaO2           44272
Platelets      46250
RespRate       37037
SaO2           47747
SysABP         22924
Temp           30981
TroponinI      49728
TroponinT      49175
Urine          14655
WBC            46522
Weight         22984
pH             43993
dtype: int64

In [216]:
# RecordIDs of validation patients whose row at Time == 0.0 has Age < 65.
# NOTE(review): a sentinel such as Age == -1 would also satisfy `< 65`;
# confirm that Age never carries the -1 placeholder seen on other columns.
less_than_65_validation_ids = validation_X.loc[
    (validation_X["Age"] < 65) & (validation_X["Time"] == 0.0),
    "RecordID",
]
# Count NaNs per variable over every validation row of those patients.
less_than_65_validation_missing = (
    validation_X.loc[validation_X["RecordID"].isin(less_than_65_validation_ids)]
    .isna()
    .sum()
)
less_than_65_validation_missing

RecordID           0
level_1            0
Time               0
ALP            41354
ALT            41327
AST            41325
Age             2874
Albumin        41619
BUN            39029
Bilirubin      41332
Cholesterol    42131
Creatinine     39020
DiasABP        18886
FiO2           35310
GCS            28156
Gender         41313
Glucose        39165
HCO3           39071
HCT            38020
HR              4680
Height          2874
ICUType        41313
K              38879
Lactate        40250
MAP            19115
MechVent       35372
Mg             39104
NIDiasABP      25228
NIMAP          25463
NISysABP       25213
Na             39043
PaCO2          37257
PaO2           37274
Platelets      38969
RespRate       33085
SaO2           40898
SysABP         18884
Temp           26860
TroponinI      42133
TroponinT      41898
Urine          13737
WBC            39235
Weight         21097
pH             37079
dtype: int64

In [149]:
# Keep only the rows where both Height and Weight are usable for a BMI:
# present (not NaN) and different from the -1 placeholder.
filtered_validation_X = validation_X.loc[
    validation_X["Height"].notna()
    & validation_X["Weight"].notna()
    & validation_X["Height"].ne(-1)
    & validation_X["Weight"].ne(-1)
]

In [150]:
# Independent copy of the filtered rows with Height divided by 100
# (presumably centimetres -> metres, as the variable name suggests).
filtered_validation_X_metros = filtered_validation_X.assign(
    Height=filtered_validation_X["Height"] / 100
)
filtered_validation_X_metros["Height"]

1728      1.524
3120      1.702
3360      1.829
3361      1.829
3362      1.829
          ...  
575034    1.600
575035    1.600
575037    1.600
575038    1.600
575039    1.600
Name: Height, Length: 23770, dtype: float64

In [151]:
# NOTE: this is an alias, not a copy -- the two columns added below also
# appear on filtered_validation_X_metros.
bmi_data_validation = filtered_validation_X_metros
# BMI = Weight / Height**2, rounded to one decimal place.
bmi_data_validation["BMI"] = (
    bmi_data_validation["Weight"]
    .div(bmi_data_validation["Height"].pow(2))
    .round(1)
)
# Label each row with the class returned by classify_BMI (defined in an
# earlier cell).
bmi_data_validation["Classificacao"] = bmi_data_validation["BMI"].apply(classify_BMI)
bmi_data_validation.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
1728,132618,0,0.0,,,,72.0,,,,...,56.0,37.475,,,,,69.1,,29.8,Sobrepeso
3120,132694,0,0.0,,,,61.0,,,,...,,36.1,,,,,165.0,,57.0,Obesidade grau 3
3360,132704,0,0.0,,,,63.0,,,,...,,33.3,,,500.0,,68.9,,20.6,Peso normal
3361,132704,1,1.0,,,,63.0,,,,...,,33.4,,,160.0,,68.9,,20.6,Peso normal
3362,132704,2,2.0,,,,63.0,,8.0,,...,0.0,35.25,,,150.0,7.1,68.9,,20.6,Peso normal


In [152]:
# Collapse to one row per patient. GroupBy.first() takes, for each column, the
# first non-null value within the group, so a row here may combine values from
# several time steps (not simply the patient's first row).
bmi_data_validation = bmi_data_validation.groupby("RecordID", as_index=False).first()
bmi_data_validation

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132618,0,0.0,,,,72.0,,,,...,56.00,37.475,,,,,69.1,,29.8,Sobrepeso
1,132694,0,0.0,,,,61.0,,,,...,,36.100,,,,,165.0,,57.0,Obesidade grau 3
2,132704,0,0.0,57.0,33.0,73.0,63.0,3.1,8.0,0.4,...,0.00,33.300,,,500.0,7.1,68.9,7.32,20.6,Peso normal
3,132744,0,0.0,,,,70.0,,,,...,90.75,37.200,,,24.0,,95.3,7.34,32.9,Obesidade grau 1
4,132797,0,0.0,,,,88.0,,27.0,,...,151.50,38.000,,,20.0,13.8,78.8,7.28,33.9,Obesidade grau 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
981,162971,0,0.0,,,,63.0,,,,...,130.50,35.800,,,0.0,,83.3,,37.1,Obesidade grau 2
982,162983,0,0.0,95.0,369.0,366.0,75.0,3.1,28.0,6.4,...,124.00,35.300,1.2,,80.0,25.0,90.0,7.33,31.1,Obesidade grau 1
983,163003,0,0.0,124.0,254.0,204.0,36.0,2.6,3.0,30.3,...,,36.300,,,50.0,10.6,57.7,7.47,21.2,Peso normal
984,163007,0,0.0,42.0,30.0,40.0,19.0,2.8,16.0,0.6,...,0.00,40.550,,1.0,150.0,14.1,114.3,7.36,34.2,Obesidade grau 1


In [154]:
# Number of validation patients per BMI class (one row per RecordID after the
# groupby above).
bmi_data_validation["Classificacao"].value_counts()

Classificacao
Sobrepeso           354
Peso normal         291
Obesidade grau 1    181
Obesidade grau 3     68
Obesidade grau 2     62
Baixo peso           30
Name: count, dtype: int64

In [217]:
# Despite its name, this Series holds the RecordIDs that DO have a BMI class;
# the "undefined" group is its complement, selected with `~isin` below.
classificacao_undefined_ids_validation = bmi_data_validation["RecordID"]
# Count NaNs per variable over every validation row of patients without a
# BMI classification.
classificacao_undefined_missing_validation = (
    validation_X.loc[
        ~validation_X["RecordID"].isin(classificacao_undefined_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_undefined_missing_validation

RecordID           0
level_1            0
Time               0
ALP            43971
ALT            43949
AST            43946
Age             3740
Albumin        44133
BUN            41604
Bilirubin      43939
Cholesterol    44674
Creatinine     41591
DiasABP        27324
FiO2           38559
GCS            30187
Gender         43804
Glucose        41619
HCO3           41613
HCT            40843
HR              5356
Height          3740
ICUType        43804
K              41335
Lactate        43259
MAP            27563
MechVent       39010
Mg             41719
NIDiasABP      20825
NIMAP          21132
NISysABP       20805
Na             41493
PaCO2          41682
PaO2           41694
Platelets      41789
RespRate       29973
SaO2           44340
SysABP         27323
Temp           32726
TroponinI      44659
TroponinT      44240
Urine          16418
WBC            41918
Weight         20523
pH             41632
dtype: int64

In [155]:
# RecordIDs of validation patients whose BMI class is "Baixo peso".
classificacao_baixo_peso_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"] == "Baixo peso", "RecordID"
]
# Count NaNs per variable over every validation row of those patients.
classificacao_baixo_peso_missing_validation = (
    validation_X.loc[
        validation_X["RecordID"].isin(classificacao_baixo_peso_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_baixo_peso_missing_validation

RecordID          0
level_1           0
Time              0
ALP            1418
ALT            1418
AST            1418
Age              80
Albumin        1421
BUN            1314
Bilirubin      1416
Cholesterol    1439
Creatinine     1314
DiasABP         437
FiO2           1173
GCS             942
Gender         1410
Glucose        1321
HCO3           1316
HCT            1271
HR              140
Height           80
ICUType        1410
K              1313
Lactate        1334
MAP             415
MechVent       1167
Mg             1322
NIDiasABP      1031
NIMAP          1032
NISysABP       1030
Na             1321
PaCO2          1226
PaO2           1228
Platelets      1325
RespRate       1174
SaO2           1384
SysABP          437
Temp            867
TroponinI      1436
TroponinT      1425
Urine           404
WBC            1330
Weight          798
pH             1219
dtype: int64

In [156]:
# Sanity check: number of distinct RecordIDs in the "Baixo peso" group.
teste = pd.unique(classificacao_baixo_peso_ids_validation)
teste.size

30

In [157]:
# RecordIDs of validation patients whose BMI class is "Peso normal".
classificacao_peso_normal_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"] == "Peso normal", "RecordID"
]
# Count NaNs per variable over every validation row of those patients.
classificacao_peso_normal_missing_validation = (
    validation_X.loc[
        validation_X["RecordID"].isin(classificacao_peso_normal_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_peso_normal_missing_validation

RecordID           0
level_1            0
Time               0
ALP            13713
ALT            13707
AST            13705
Age              547
Albumin        13779
BUN            12890
Bilirubin      13705
Cholesterol    13937
Creatinine     12885
DiasABP         4488
FiO2           11584
GCS             9461
Gender         13677
Glucose        12973
HCO3           12923
HCT            12472
HR              1180
Height           547
ICUType        13677
K              12848
Lactate        13209
MAP             4528
MechVent       11502
Mg             12897
NIDiasABP       9577
NIMAP           9611
NISysABP        9575
Na             12940
PaCO2          11834
PaO2           11845
Platelets      12798
RespRate       11891
SaO2           13183
SysABP          4488
Temp            7697
TroponinI      13921
TroponinT      13809
Urine           3696
WBC            12917
Weight          7603
pH             11739
dtype: int64

In [158]:
# Sanity check: number of distinct RecordIDs in the "Peso normal" group.
teste = pd.unique(classificacao_peso_normal_ids_validation)
teste.size

291

In [159]:
# RecordIDs of validation patients whose BMI class is "Sobrepeso".
classificacao_sobrepeso_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"] == "Sobrepeso", "RecordID"
]
# Count NaNs per variable over every validation row of those patients.
classificacao_sobrepeso_missing_validation = (
    validation_X.loc[
        validation_X["RecordID"].isin(classificacao_sobrepeso_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_sobrepeso_missing_validation

RecordID           0
level_1            0
Time               0
ALP            16741
ALT            16735
AST            16734
Age              698
Albumin        16821
BUN            15711
Bilirubin      16736
Cholesterol    16962
Creatinine     15705
DiasABP         5123
FiO2           14087
GCS            11719
Gender         16638
Glucose        15859
HCO3           15761
HCT            15171
HR              1418
Height           698
ICUType        16638
K              15715
Lactate        16216
MAP             5166
MechVent       14210
Mg             15725
NIDiasABP      11822
NIMAP          11902
NISysABP       11814
Na             15820
PaCO2          14367
PaO2           14369
Platelets      15585
RespRate       14165
SaO2           15818
SysABP          5122
Temp            8572
TroponinI      16953
TroponinT      16850
Urine           4040
WBC            15745
Weight          8039
pH             14179
dtype: int64

In [160]:
# Sanity check: number of distinct RecordIDs in the "Sobrepeso" group.
teste = pd.unique(classificacao_sobrepeso_ids_validation)
teste.size

354

In [161]:
# RecordIDs of validation patients whose BMI class is "Obesidade grau 1".
classificacao_obesidade_1_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"] == "Obesidade grau 1", "RecordID"
]
# Count NaNs per variable over every validation row of those patients.
classificacao_obesidade_1_missing_validation = (
    validation_X.loc[
        validation_X["RecordID"].isin(classificacao_obesidade_1_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_obesidade_1_missing_validation

RecordID          0
level_1           0
Time              0
ALP            8551
ALT            8545
AST            8545
Age             309
Albumin        8588
BUN            8038
Bilirubin      8549
Cholesterol    8676
Creatinine     8035
DiasABP        2644
FiO2           7128
GCS            6088
Gender         8507
Glucose        8105
HCO3           8053
HCT            7787
HR              687
Height          309
ICUType        8507
K              8032
Lactate        8235
MAP            2657
MechVent       7183
Mg             8053
NIDiasABP      6171
NIMAP          6222
NISysABP       6168
Na             8086
PaCO2          7241
PaO2           7244
Platelets      7972
RespRate       7338
SaO2           8109
SysABP         2644
Temp           4511
TroponinI      8664
TroponinT      8591
Urine          2014
WBC            8061
Weight         3953
pH             7170
dtype: int64

In [162]:
# Sanity check: number of distinct RecordIDs in the "Obesidade grau 1" group.
teste = pd.unique(classificacao_obesidade_1_ids_validation)
teste.size

181

In [163]:
# RecordIDs of validation patients whose BMI class is "Obesidade grau 2".
classificacao_obesidade_2_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"] == "Obesidade grau 2", "RecordID"
]
# Count NaNs per variable over every validation row of those patients.
classificacao_obesidade_2_missing_validation = (
    validation_X.loc[
        validation_X["RecordID"].isin(classificacao_obesidade_2_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_obesidade_2_missing_validation

RecordID          0
level_1           0
Time              0
ALP            2931
ALT            2933
AST            2933
Age             237
Albumin        2946
BUN            2745
Bilirubin      2928
Cholesterol    2969
Creatinine     2744
DiasABP         903
FiO2           2461
GCS            2072
Gender         2914
Glucose        2773
HCO3           2755
HCT            2666
HR              371
Height          237
ICUType        2914
K              2758
Lactate        2811
MAP             904
MechVent       2455
Mg             2748
NIDiasABP      2268
NIMAP          2284
NISysABP       2267
Na             2765
PaCO2          2476
PaO2           2477
Platelets      2730
RespRate       2588
SaO2           2773
SysABP          903
Temp           1609
TroponinI      2971
TroponinT      2937
Urine           951
WBC            2748
Weight         1490
pH             2471
dtype: int64

In [164]:
# Sanity check: number of distinct RecordIDs in the "Obesidade grau 2" group.
teste = pd.unique(classificacao_obesidade_2_ids_validation)
teste.size

62

In [165]:
# RecordIDs of validation patients whose BMI class is "Obesidade grau 3".
classificacao_obesidade_3_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"] == "Obesidade grau 3", "RecordID"
]
# Count NaNs per variable over every validation row of those patients.
classificacao_obesidade_3_missing_validation = (
    validation_X.loc[
        validation_X["RecordID"].isin(classificacao_obesidade_3_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_obesidade_3_missing_validation

RecordID          0
level_1           0
Time              0
ALP            3222
ALT            3221
AST            3220
Age             120
Albumin        3229
BUN            3025
Bilirubin      3220
Cholesterol    3258
Creatinine     3027
DiasABP         891
FiO2           2557
GCS            2293
Gender         3196
Glucose        3052
HCO3           3031
HCT            2949
HR              263
Height          120
ICUType        3196
K              3023
Lactate        3055
MAP             902
MechVent       2576
Mg             3020
NIDiasABP      2365
NIMAP          2379
NISysABP       2363
Na             3045
PaCO2          2690
PaO2           2689
Platelets      3020
RespRate       2993
SaO2           3038
SysABP          891
Temp           1859
TroponinI      3257
TroponinT      3221
Urine           869
WBC            3038
Weight         1675
pH             2662
dtype: int64

In [166]:
# Sanity check: number of distinct RecordIDs in the "Obesidade grau 3" group.
teste = pd.unique(classificacao_obesidade_3_ids_validation)
teste.size

68

In [218]:
# Summary table for the validation split: one row per variable, one column per
# demographic subgroup. Every input Series was computed above with
# `.isna().sum()`, so the cells hold raw NaN counts.
# NOTE(review): the title says "Rate" but the values are counts; groups of
# different sizes are therefore not directly comparable -- consider
# normalising by each group's row count.
df_missing_validation = pd.DataFrame(columns=df_columns)
df_missing_transpose_validation = pd.DataFrame(
    {
        "Female": female_gender_missing_rate_validation,
        "Male": male_gender_missing_rate_validation,
        "Undefined gender": undefined_gender_missing_rate_validation,
        "ICUType 1": ICUType_1_validation_missing,
        "ICUType 2": ICUType_2_validation_missing,
        "ICUType 3": ICUType_3_validation_missing,
        "ICUType 4": ICUType_4_validation_missing,
        "Age 65+": more_than_or_equal_to_65_validation_missing,
        "Age 65-": less_than_65_validation_missing,
        "Low Weight": classificacao_baixo_peso_missing_validation,
        "Normal Weight": classificacao_peso_normal_missing_validation,
        "Overweight": classificacao_sobrepeso_missing_validation,
        "Obesity Grade 1": classificacao_obesidade_1_missing_validation,
        "Obesity Grade 2": classificacao_obesidade_2_missing_validation,
        "Obesity Grade 3": classificacao_obesidade_3_missing_validation,
        "Undefined classification": classificacao_undefined_missing_validation,
    },
    index=df_columns,  # align every Series on the shared column labels
)
# Identifier/time columns and the grouping attributes Age and Gender are not
# reported as rows.
df_missing_transpose_validation = df_missing_transpose_validation.drop(
    index=["RecordID", "level_1", "Time", "Age", "Gender"]
)
# Title typo fixed ("demographcs" -> "demographics") to match the training table.
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>Original Missing Rate per Variable by demographics - Validation</h2>"))
df_missing_transpose_validation

Unnamed: 0,Female,Male,Undefined gender,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3,Undefined classification
ALP,40698,49801,48,13919,18101,32224,26303,49193,41354,1418,13713,16741,8551,2931,3222,43971
ALT,40675,49785,48,13905,18097,32210,26296,49181,41327,1418,13707,16735,8545,2933,3221,43949
AST,40671,49782,48,13904,18097,32205,26295,49176,41325,1418,13705,16734,8545,2933,3220,43946
Albumin,40824,50045,48,13953,18148,32382,26434,49298,41619,1421,13779,16821,8588,2946,3229,44133
BUN,38354,46926,47,13109,17033,30440,24745,46298,39029,1314,12890,15711,8038,2745,3025,41604
Bilirubin,40669,49776,48,13915,18098,32171,26309,49161,41332,1416,13705,16736,8549,2928,3220,43939
Cholesterol,41267,50600,48,14026,18235,32900,26754,49784,42131,1439,13937,16962,8676,2969,3258,44674
Creatinine,38345,46909,47,13097,17029,30431,24744,46281,39020,1314,12885,15705,8035,2744,3027,41591
DiasABP,19812,21950,48,7777,4142,21114,8777,22924,18886,437,4488,5123,2644,903,891,27324
FiO2,35208,42293,48,12598,15237,27923,21791,42239,35310,1173,11584,14087,7128,2461,2557,38559


<h3>Test data</h3>

In [168]:
# Test split of the dataset loaded at the top of the notebook.
test_X = physionet2012_dataset["test_X"]

In [169]:
# Frequency of each non-null Gender value in the test split, most frequent first.
test_X["Gender"].value_counts(sort=True, ascending=False, dropna=True)

Gender
 1.0    1346
 0.0    1051
-1.0       2
Name: count, dtype: int64

In [170]:
# RecordIDs of the rows whose Gender equals 0.0 (the "Female" column of the
# summary table).
female_gender_test_ids = test_X.loc[test_X["Gender"].eq(0.0), "RecordID"]
# NaN count per column over every row that belongs to one of those records.
female_gender_missing_rate_test = (
    test_X.loc[test_X["RecordID"].isin(female_gender_test_ids)].isna().sum()
)
female_gender_missing_rate_test

RecordID           0
level_1            0
Time               0
ALP            49587
ALT            49556
AST            49560
Age             2875
Albumin        49775
BUN            46711
Bilirubin      49549
Cholesterol    50370
Creatinine     46691
DiasABP        23426
FiO2           42347
GCS            33731
Gender         49397
Glucose        46869
HCO3           46755
HCT            45793
HR              4707
Height          2875
ICUType        49397
K              46517
Lactate        48300
MAP            23618
MechVent       42338
Mg             46796
NIDiasABP      28728
NIMAP          29072
NISysABP       28713
Na             46728
PaCO2          44866
PaO2           44881
Platelets      46792
RespRate       37902
SaO2           48579
SysABP         23423
Temp           32394
TroponinI      50353
TroponinT      49926
Urine          15262
WBC            47024
Weight         22966
pH             44686
dtype: int64

In [171]:
# Distinct RecordIDs in the Gender == 0.0 group (NaN, if present, counts as a value).
female_gender_test_ids.nunique(dropna=False)

1051

In [172]:
# RecordIDs of the rows whose Gender equals 1.0 (the "Male" column of the
# summary table).
male_gender_test_ids = test_X.loc[test_X["Gender"].eq(1.0), "RecordID"]
# NaN count per column over every row that belongs to one of those records.
male_gender_missing_rate_test = (
    test_X.loc[test_X["RecordID"].isin(male_gender_test_ids)].isna().sum()
)
male_gender_missing_rate_test

RecordID           0
level_1            0
Time               0
ALP            63454
ALT            63418
AST            63414
Age             3750
Albumin        63730
BUN            59862
Bilirubin      63422
Cholesterol    64475
Creatinine     59846
DiasABP        28015
FiO2           54293
GCS            43851
Gender         63262
Glucose        60192
HCO3           59978
HCT            58167
HR              6387
Height          3750
ICUType        63262
K              59692
Lactate        61643
MAP            28205
MechVent       54800
Mg             59971
NIDiasABP      38859
NIMAP          39126
NISysABP       38833
Na             60024
PaCO2          56685
PaO2           56705
Platelets      59558
RespRate       50141
SaO2           61825
SysABP         28014
Temp           39790
TroponinI      64498
TroponinT      63814
Urine          20013
WBC            60075
Weight         30163
pH             56296
dtype: int64

In [173]:
# Distinct RecordIDs in the Gender == 1.0 group (NaN, if present, counts as a value).
male_gender_test_ids.nunique(dropna=False)

1346

In [174]:
# RecordIDs of the rows whose Gender equals -1.0 (the "Undefined gender" column
# of the summary table).
undefined_gender_ids_test = test_X.loc[test_X["Gender"].eq(-1.0), "RecordID"]
# NaN count per column over every row that belongs to one of those records.
undefined_gender_missing_rate_test = (
    test_X.loc[test_X["RecordID"].isin(undefined_gender_ids_test)].isna().sum()
)
undefined_gender_missing_rate_test

RecordID        0
level_1         0
Time            0
ALP            94
ALT            93
AST            93
Age             2
Albumin        95
BUN            91
Bilirubin      94
Cholesterol    96
Creatinine     91
DiasABP        20
FiO2           80
GCS            66
Gender         94
Glucose        91
HCO3           91
HCT            92
HR              4
Height          2
ICUType        94
K              90
Lactate        95
MAP            23
MechVent       77
Mg             91
NIDiasABP      66
NIMAP          66
NISysABP       66
Na             91
PaCO2          86
PaO2           86
Platelets      92
RespRate       50
SaO2           96
SysABP         20
Temp           71
TroponinI      96
TroponinT      92
Urine          32
WBC            91
Weight          5
pH             86
dtype: int64

In [219]:
# RecordIDs taken from rows where ICUType is 1.0 and Time is 0.0.
ICUType_1_test_ids = test_X.loc[
    test_X["ICUType"].eq(1.0) & test_X["Time"].eq(0.0), "RecordID"
]
# NaN count per column over every row that belongs to one of those records.
ICUType_1_test_missing = (
    test_X.loc[test_X["RecordID"].isin(ICUType_1_test_ids)].isna().sum()
)
ICUType_1_test_missing

RecordID           0
level_1            0
Time               0
ALP            15092
ALT            15073
AST            15071
Age             1308
Albumin        15177
BUN            14230
Bilirubin      15080
Cholesterol    15252
Creatinine     14214
DiasABP         9094
FiO2           13585
GCS            11400
Gender         15040
Glucose        14295
HCO3           14276
HCT            14014
HR              2000
Height          1308
ICUType        15040
K              14063
Lactate        14984
MAP             9135
MechVent       13716
Mg             14272
NIDiasABP       7872
NIMAP           7918
NISysABP        7869
Na             14287
PaCO2          14032
PaO2           14034
Platelets      14243
RespRate       10515
SaO2           14579
SysABP          9094
Temp           10807
TroponinI      15312
TroponinT      15013
Urine           6808
WBC            14352
Weight          8690
pH             14016
dtype: int64

In [176]:
# Distinct RecordIDs in the ICUType 1 group (NaN, if present, counts as a value).
ICUType_1_test_ids.nunique(dropna=False)

320

In [220]:
# RecordIDs taken from rows where ICUType is 2.0 and Time is 0.0.
ICUType_2_test_ids = test_X.loc[
    test_X["ICUType"].eq(2.0) & test_X["Time"].eq(0.0), "RecordID"
]
# NaN count per column over every row that belongs to one of those records.
ICUType_2_test_missing = (
    test_X.loc[test_X["RecordID"].isin(ICUType_2_test_ids)].isna().sum()
)
ICUType_2_test_missing

RecordID           0
level_1            0
Time               0
ALP            22654
ALT            22647
AST            22648
Age              823
Albumin        22706
BUN            21314
Bilirubin      22650
Cholesterol    22794
Creatinine     21315
DiasABP         4648
FiO2           18867
GCS            16615
Gender         22325
Glucose        21745
HCO3           21406
HCT            20297
HR              1936
Height           823
ICUType        22325
K              21622
Lactate        21987
MAP             4588
MechVent       18774
Mg             21348
NIDiasABP      18220
NIMAP          18253
NISysABP       18211
Na             21639
PaCO2          18139
PaO2           18162
Platelets      20917
RespRate       21993
SaO2           20275
SysABP          4648
Temp            9228
TroponinI      22770
TroponinT      22737
Urine           3527
WBC            21249
Weight         10274
pH             17701
dtype: int64

In [178]:
# Distinct RecordIDs in the ICUType 2 group (NaN, if present, counts as a value).
ICUType_2_test_ids.nunique(dropna=False)

475

In [221]:
# RecordIDs taken from rows where ICUType is 3.0 and Time is 0.0.
ICUType_3_test_ids = test_X.loc[
    test_X["ICUType"].eq(3.0) & test_X["Time"].eq(0.0), "RecordID"
]
# NaN count per column over every row that belongs to one of those records.
ICUType_3_test_missing = (
    test_X.loc[test_X["RecordID"].isin(ICUType_3_test_ids)].isna().sum()
)
ICUType_3_test_missing

RecordID           0
level_1            0
Time               0
ALP            41581
ALT            41553
AST            41554
Age             2890
Albumin        41788
BUN            39271
Bilirubin      41526
Cholesterol    42489
Creatinine     39262
DiasABP        26885
FiO2           36224
GCS            31037
Gender         41642
Glucose        39283
HCO3           39262
HCT            38729
HR              4296
Height          2890
ICUType        41642
K              39006
Lactate        40739
MAP            27095
MechVent       36487
Mg             39458
NIDiasABP      18731
NIMAP          19146
NISysABP       18716
Na             39206
PaCO2          39135
PaO2           39136
Platelets      39518
RespRate       29932
SaO2           41979
SysABP         26881
Temp           30372
TroponinI      42443
TroponinT      41913
Urine          16575
WBC            39642
Weight         14241
pH             39083
dtype: int64

In [180]:
# Distinct RecordIDs in the ICUType 3 group (NaN, if present, counts as a value).
ICUType_3_test_ids.nunique(dropna=False)

884

In [222]:
# RecordIDs taken from rows where ICUType is 4.0 and Time is 0.0.
ICUType_4_test_ids = test_X.loc[
    test_X["ICUType"].eq(4.0) & test_X["Time"].eq(0.0), "RecordID"
]
# NaN count per column over every row that belongs to one of those records.
ICUType_4_test_missing = (
    test_X.loc[test_X["RecordID"].isin(ICUType_4_test_ids)].isna().sum()
)
ICUType_4_test_missing

RecordID           0
level_1            0
Time               0
ALP            33808
ALT            33794
AST            33794
Age             1606
Albumin        33929
BUN            31849
Bilirubin      33809
Cholesterol    34406
Creatinine     31837
DiasABP        10834
FiO2           28044
GCS            18596
Gender         33746
Glucose        31829
HCO3           31880
HCT            31012
HR              2866
Height          1606
ICUType        33746
K              31608
Lactate        32328
MAP            11028
MechVent       28238
Mg             31780
NIDiasABP      22830
NIMAP          22947
NISysABP       22816
Na             31711
PaCO2          30331
PaO2           30340
Platelets      31764
RespRate       25653
SaO2           33667
SysABP         10834
Temp           21848
TroponinI      34422
TroponinT      34169
Urine           8397
WBC            31947
Weight         19929
pH             30268
dtype: int64

In [182]:
# Distinct RecordIDs in the ICUType 4 group (NaN, if present, counts as a value).
ICUType_4_test_ids.nunique(dropna=False)

718

In [223]:
# RecordIDs taken from rows where Age >= 65 and Time is 0.0 (the "Age 65+"
# column of the summary table).
more_than_or_equal_to_65_test_ids = test_X.loc[
    test_X["Age"].ge(65) & test_X["Time"].eq(0.0), "RecordID"
]
# NaN count per column over every row that belongs to one of those records.
more_than_or_equal_to_65_test_missing = (
    test_X.loc[test_X["RecordID"].isin(more_than_or_equal_to_65_test_ids)]
    .isna()
    .sum()
)
more_than_or_equal_to_65_test_missing

RecordID           0
level_1            0
Time               0
ALP            61203
ALT            61167
AST            61165
Age             3386
Albumin        61348
BUN            57592
Bilirubin      61161
Cholesterol    61931
Creatinine     57577
DiasABP        27348
FiO2           52316
GCS            42638
Gender         60771
Glucose        57923
HCO3           57689
HCT            56169
HR              5692
Height          3386
ICUType        60771
K              57450
Lactate        59524
MAP            27486
MechVent       52713
Mg             57701
NIDiasABP      36167
NIMAP          36450
NISysABP       36145
Na             57746
PaCO2          54693
PaO2           54711
Platelets      57513
RespRate       46907
SaO2           59189
SysABP         27346
Temp           37720
TroponinI      61910
TroponinT      61142
Urine          17931
WBC            57910
Weight         28847
pH             54365
dtype: int64

In [184]:
# Distinct RecordIDs in the Age >= 65 group (NaN, if present, counts as a value).
more_than_or_equal_to_65_test_ids.nunique(dropna=False)

1292

In [224]:
# RecordIDs taken from rows where Age < 65 and Time is 0.0 (the "Age 65-"
# column of the summary table).
less_than_65_test_ids = test_X.loc[
    test_X["Age"].lt(65) & test_X["Time"].eq(0.0), "RecordID"
]
# NaN count per column over every row that belongs to one of those records.
less_than_65_test_missing = (
    test_X.loc[test_X["RecordID"].isin(less_than_65_test_ids)].isna().sum()
)
less_than_65_test_missing

RecordID           0
level_1            0
Time               0
ALP            51932
ALT            51900
AST            51902
Age             3241
Albumin        52252
BUN            49072
Bilirubin      51904
Cholesterol    53010
Creatinine     49051
DiasABP        24113
FiO2           44404
GCS            35010
Gender         51982
Glucose        49229
HCO3           49135
HCT            47883
HR              5406
Height          3241
ICUType        51982
K              48849
Lactate        50514
MAP            24360
MechVent       44502
Mg             49157
NIDiasABP      31486
NIMAP          31814
NISysABP       31467
Na             49097
PaCO2          46944
PaO2           46961
Platelets      48929
RespRate       41186
SaO2           51311
SysABP         24111
Temp           34535
TroponinI      53037
TroponinT      52690
Urine          17376
WBC            49280
Weight         24287
pH             46703
dtype: int64

In [186]:
# Distinct RecordIDs in the Age < 65 group (NaN, if present, counts as a value).
less_than_65_test_ids.nunique(dropna=False)

1105

In [187]:
# Keep only rows where both Height and Weight are present and different from -1.
# NOTE(review): -1 looks like a "not recorded" sentinel — confirm against the dataset docs.
filtered_test_X = test_X.loc[
    test_X["Height"].notna()
    & test_X["Height"].ne(-1)
    & test_X["Weight"].notna()
    & test_X["Weight"].ne(-1)
]

In [188]:
# Copy of the filtered rows with Height divided by 100
# (presumably centimetres -> metres, judging by the variable name — confirm).
filtered_test_X_metros = filtered_test_X.assign(
    Height=filtered_test_X["Height"].div(100)
)
filtered_test_X_metros["Height"]

48        1.753
67        1.753
68        1.753
69        1.753
70        1.753
          ...  
574988    1.524
574989    1.524
574990    1.524
574991    1.524
575088    1.727
Name: Height, Length: 32315, dtype: float64

In [189]:
# NOTE: no copy is made here — `bmi_data_test` and `filtered_test_X_metros` are
# the same object, so the two columns added below show up under both names.
bmi_data_test = filtered_test_X_metros
# BMI = Weight / Height**2, rounded to one decimal place.
bmi_data_test["BMI"] = (
    bmi_data_test["Weight"] / bmi_data_test["Height"].pow(2)
).round(1)
# Label every row with the category returned by classify_BMI for its BMI.
bmi_data_test["Classificacao"] = bmi_data_test["BMI"].apply(classify_BMI)
bmi_data_test.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
48,132540,0,0.0,,,,76.0,,,,...,,,,,,,76.0,7.45,24.7,Peso normal
67,132540,19,19.0,,,,76.0,,,,...,122.0,37.5,,,50.0,,80.6,,26.2,Sobrepeso
68,132540,20,20.0,,,,76.0,,,,...,107.0,37.4,,,380.0,,80.6,,26.2,Sobrepeso
69,132540,21,21.0,,,,76.0,,,,...,121.0,37.5,,,170.0,,80.6,,26.2,Sobrepeso
70,132540,22,22.0,,,,76.0,,,,...,128.0,37.5,,,130.0,,80.6,,26.2,Sobrepeso


In [190]:
# Collapse to one row per RecordID. GroupBy.first() takes the first non-null
# value of each column independently, so a row can mix values from different
# time steps; BMI and Classificacao come from the record's first retained row.
bmi_data_test = bmi_data_test.groupby("RecordID", as_index=False).first()
bmi_data_test

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132540,0,0.0,,,,76.0,,21.0,,...,122.0,37.5,,,50.0,13.3,76.0,7.45,24.7,Peso normal
1,132617,0,0.0,,,,77.0,,110.0,,...,,36.4,,,100.0,7.9,75.0,7.55,25.9,Sobrepeso
2,132623,0,0.0,,,,24.0,,,,...,,,,,,,78.0,7.45,23.3,Peso normal
3,132637,0,0.0,,,,78.0,,13.0,,...,99.0,37.0,,,90.0,14.2,56.0,7.39,19.3,Peso normal
4,132658,0,0.0,71.0,9.0,42.0,81.0,,18.0,1.3,...,97.0,38.4,,,90.0,61.3,105.4,7.42,30.7,Obesidade grau 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1221,162942,0,0.0,67.0,61.0,92.0,40.0,3.3,12.0,0.3,...,,36.3,,,600.0,20.6,120.7,7.38,37.1,Obesidade grau 2
1222,162980,0,0.0,,,,88.0,,31.0,,...,133.0,38.0,,,45.0,17.6,76.5,7.38,29.9,Sobrepeso
1223,162995,0,0.0,60.0,21.0,20.0,84.0,,93.0,0.4,...,121.0,37.1,0.6,,60.0,17.1,96.5,7.31,28.8,Sobrepeso
1224,163013,0,0.0,82.0,11.0,30.0,74.0,2.5,30.0,1.2,...,118.0,36.5,,0.03,40.0,9.6,68.6,7.35,29.5,Sobrepeso


In [192]:
# Number of records per BMI category, most frequent first.
bmi_data_test["Classificacao"].value_counts(sort=True, ascending=False, dropna=True)

Classificacao
Sobrepeso           419
Peso normal         377
Obesidade grau 1    224
Obesidade grau 2     86
Obesidade grau 3     83
Baixo peso           37
Name: count, dtype: int64

In [225]:
# NOTE: despite its name, this Series holds the RecordIDs that DO have a BMI
# classification; the "undefined" group is obtained by negating the membership test.
classificacao_undefined_ids_test = bmi_data_test.loc[:, "RecordID"]
# NaN count per column over every row whose record has no BMI classification.
classificacao_undefined_missing_test = (
    test_X.loc[~test_X["RecordID"].isin(classificacao_undefined_ids_test)]
    .isna()
    .sum()
)
classificacao_undefined_missing_test

RecordID           0
level_1            0
Time               0
ALP            55273
ALT            55230
AST            55228
Age             4188
Albumin        55469
BUN            52197
Bilirubin      55223
Cholesterol    56207
Creatinine     52181
DiasABP        32319
FiO2           48526
GCS            37394
Gender         55131
Glucose        52208
HCO3           52221
HCT            51389
HR              6176
Height          4188
ICUType        55131
K              51818
Lactate        54266
MAP            32590
MechVent       49041
Mg             52323
NIDiasABP      27758
NIMAP          28180
NISysABP       27732
Na             52068
PaCO2          52137
PaO2           52140
Platelets      52494
RespRate       37668
SaO2           55642
SysABP         32316
Temp           41011
TroponinI      56232
TroponinT      55586
Urine          19917
WBC            52666
Weight         26601
pH             52055
dtype: int64

In [193]:
# RecordIDs classified as "Baixo peso" (the "Low Weight" column of the summary table).
classificacao_baixo_peso_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"].eq("Baixo peso"), "RecordID"
]
# NaN count per column over every row that belongs to one of those records.
classificacao_baixo_peso_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_baixo_peso_ids_test)]
    .isna()
    .sum()
)
classificacao_baixo_peso_missing_test

RecordID          0
level_1           0
Time              0
ALP            1747
ALT            1746
AST            1746
Age              78
Albumin        1749
BUN            1656
Bilirubin      1747
Cholesterol    1775
Creatinine     1656
DiasABP         663
FiO2           1501
GCS            1179
Gender         1739
Glucose        1662
HCO3           1656
HCT            1612
HR              142
Height           78
ICUType        1739
K              1650
Lactate        1693
MAP             653
MechVent       1477
Mg             1650
NIDiasABP      1122
NIMAP          1139
NISysABP       1122
Na             1659
PaCO2          1552
PaO2           1552
Platelets      1644
RespRate       1423
SaO2           1697
SysABP          663
Temp            972
TroponinI      1772
TroponinT      1758
Urine           474
WBC            1658
Weight          751
pH             1545
dtype: int64

In [194]:
# Distinct RecordIDs in the "Baixo peso" group (NaN, if present, counts as a value).
classificacao_baixo_peso_ids_test.nunique(dropna=False)

37

In [195]:
# RecordIDs classified as "Peso normal" (the "Normal Weight" column of the summary table).
classificacao_normal_peso_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"].eq("Peso normal"), "RecordID"
]
# NaN count per column over every row that belongs to one of those records.
classificacao_normal_peso_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_normal_peso_ids_test)]
    .isna()
    .sum()
)
classificacao_normal_peso_missing_test

RecordID           0
level_1            0
Time               0
ALP            17821
ALT            17815
AST            17815
Age              685
Albumin        17887
BUN            16775
Bilirubin      17812
Cholesterol    18057
Creatinine     16768
DiasABP         6162
FiO2           14942
GCS            12185
Gender         17719
Glucose        16899
HCO3           16816
HCT            16198
HR              1417
Height           685
ICUType        17719
K              16762
Lactate        17253
MAP             6184
MechVent       14899
Mg             16790
NIDiasABP      11846
NIMAP          11891
NISysABP       11840
Na             16852
PaCO2          15396
PaO2           15410
Platelets      16599
RespRate       15090
SaO2           16929
SysABP          6162
Temp            9629
TroponinI      18064
TroponinT      17956
Urine           4823
WBC            16806
Weight          8382
pH             15232
dtype: int64

In [196]:
# Distinct RecordIDs in the "Peso normal" group (NaN, if present, counts as a value).
classificacao_normal_peso_ids_test.nunique(dropna=False)

377

In [197]:
# RecordIDs classified as "Sobrepeso" (the "Overweight" column of the summary table).
classificacao_sobrepeso_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"].eq("Sobrepeso"), "RecordID"
]
# NaN count per column over every row that belongs to one of those records.
classificacao_sobrepeso_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_sobrepeso_ids_test)]
    .isna()
    .sum()
)
classificacao_sobrepeso_missing_test

RecordID           0
level_1            0
Time               0
ALP            19760
ALT            19749
AST            19749
Age              902
Albumin        19853
BUN            18614
Bilirubin      19759
Cholesterol    20074
Creatinine     18604
DiasABP         6258
FiO2           16431
GCS            13792
Gender         19693
Glucose        18800
HCO3           18665
HCT            17992
HR              1820
Height           902
ICUType        19693
K              18649
Lactate        19038
MAP             6322
MechVent       16369
Mg             18631
NIDiasABP      13920
NIMAP          13965
NISysABP       13915
Na             18732
PaCO2          16834
PaO2           16849
Platelets      18421
RespRate       17415
SaO2           18657
SysABP          6258
Temp           10894
TroponinI      20057
TroponinT      19902
Urine           5193
WBC            18605
Weight          9265
pH             16652
dtype: int64

In [198]:
# Distinct RecordIDs in the "Sobrepeso" group (NaN, if present, counts as a value).
classificacao_sobrepeso_ids_test.nunique(dropna=False)

419

In [199]:
# RecordIDs classified as "Obesidade grau 1" (the "Obesity Grade 1" column of the summary table).
classificacao_obesidade_1_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"].eq("Obesidade grau 1"), "RecordID"
]
# NaN count per column over every row that belongs to one of those records.
classificacao_obesidade_1_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_obesidade_1_ids_test)]
    .isna()
    .sum()
)
classificacao_obesidade_1_missing_test

RecordID           0
level_1            0
Time               0
ALP            10581
ALT            10578
AST            10578
Age              422
Albumin        10628
BUN             9927
Bilirubin      10571
Cholesterol    10731
Creatinine      9927
DiasABP         3117
FiO2            8649
GCS             7381
Gender         10528
Glucose        10026
HCO3            9957
HCT             9580
HR               875
Height           422
ICUType        10528
K               9916
Lactate        10100
MAP             3186
MechVent        8749
Mg              9956
NIDiasABP       7533
NIMAP           7557
NISysABP        7529
Na              9986
PaCO2           8896
PaO2            8898
Platelets       9855
RespRate        9560
SaO2            9991
SysABP          3117
Temp            5440
TroponinI      10720
TroponinT      10612
Urine           2726
WBC             9953
Weight          4544
pH              8813
dtype: int64

In [200]:
# Distinct RecordIDs in the "Obesidade grau 1" group (NaN, if present, counts as a value).
classificacao_obesidade_1_ids_test.nunique(dropna=False)

224

In [201]:
# RecordIDs classified as "Obesidade grau 2" (the "Obesity Grade 2" column of the summary table).
classificacao_obesidade_2_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"].eq("Obesidade grau 2"), "RecordID"
]
# NaN count per column over every row that belongs to one of those records.
classificacao_obesidade_2_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_obesidade_2_ids_test)]
    .isna()
    .sum()
)
classificacao_obesidade_2_missing_test

RecordID          0
level_1           0
Time              0
ALP            4047
ALT            4046
AST            4046
Age             138
Albumin        4078
BUN            3801
Bilirubin      4048
Cholesterol    4121
Creatinine     3800
DiasABP        1172
FiO2           3343
GCS            2866
Gender         4042
Glucose        3844
HCO3           3808
HCT            3679
HR              289
Height          138
ICUType        4042
K              3820
Lactate        3915
MAP            1174
MechVent       3364
Mg             3805
NIDiasABP      2976
NIMAP          3008
NISysABP       2976
Na             3830
PaCO2          3432
PaO2           3433
Platelets      3753
RespRate       3513
SaO2           3847
SysABP         1171
Temp           2006
TroponinI      4123
TroponinT      4092
Urine          1068
WBC            3804
Weight         1927
pH             3398
dtype: int64

In [202]:
# Distinct RecordIDs in the "Obesidade grau 2" group (NaN, if present, counts as a value).
classificacao_obesidade_2_ids_test.nunique(dropna=False)

86

In [203]:
# RecordIDs classified as "Obesidade grau 3" (the "Obesity Grade 3" column of the summary table).
classificacao_obesidade_3_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"].eq("Obesidade grau 3"), "RecordID"
]
# NaN count per column over every row that belongs to one of those records.
classificacao_obesidade_3_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_obesidade_3_ids_test)]
    .isna()
    .sum()
)
classificacao_obesidade_3_missing_test

RecordID          0
level_1           0
Time              0
ALP            3906
ALT            3903
AST            3905
Age             214
Albumin        3936
BUN            3694
Bilirubin      3905
Cholesterol    3976
Creatinine     3692
DiasABP        1770
FiO2           3328
GCS            2851
Gender         3901
Glucose        3713
HCO3           3701
HCT            3602
HR              379
Height          214
ICUType        3901
K              3684
Lactate        3773
MAP            1737
MechVent       3316
Mg             3703
NIDiasABP      2498
NIMAP          2524
NISysABP       2498
Na             3716
PaCO2          3390
PaO2           3390
Platelets      3676
RespRate       3424
SaO2           3737
SysABP         1770
Temp           2303
TroponinI      3979
TroponinT      3926
Urine          1106
WBC            3698
Weight         1664
pH             3373
dtype: int64

In [204]:
# Distinct RecordIDs in the "Obesidade grau 3" group (NaN, if present, counts as a value).
classificacao_obesidade_3_ids_test.nunique(dropna=False)

83

In [227]:
# Summary table for the test split: one row per variable, one column per
# demographic group. The frame starts empty with `df_columns` as its index, so
# every Series assigned below is aligned on the variable name.
# NOTE: each Series is an `isna().sum()` result, i.e. a NaN *count* per
# variable, not a rate, despite the "Missing Rate" wording in the title.
df_missing_test = pd.DataFrame(columns=df_columns)
df_missing_transpose_test = df_missing_test.T

# Gender groups
df_missing_transpose_test["Female"] = female_gender_missing_rate_test
df_missing_transpose_test["Male"] = male_gender_missing_rate_test
df_missing_transpose_test["Undefined gender"] = undefined_gender_missing_rate_test

# ICU type groups
df_missing_transpose_test["ICUType 1"] = ICUType_1_test_missing
df_missing_transpose_test["ICUType 2"] = ICUType_2_test_missing
df_missing_transpose_test["ICUType 3"] = ICUType_3_test_missing
df_missing_transpose_test["ICUType 4"] = ICUType_4_test_missing

# Age groups
df_missing_transpose_test["Age 65+"] = more_than_or_equal_to_65_test_missing
df_missing_transpose_test["Age 65-"] = less_than_65_test_missing

# BMI classification groups
df_missing_transpose_test["Low Weight"] = classificacao_baixo_peso_missing_test
df_missing_transpose_test["Normal Weight"] = classificacao_normal_peso_missing_test
df_missing_transpose_test["Overweight"] = classificacao_sobrepeso_missing_test
df_missing_transpose_test["Obesity Grade 1"] = classificacao_obesidade_1_missing_test
df_missing_transpose_test["Obesity Grade 2"] = classificacao_obesidade_2_missing_test
df_missing_transpose_test["Obesity Grade 3"] = classificacao_obesidade_3_missing_test
df_missing_transpose_test["Undefined classification"] = classificacao_undefined_missing_test

# Remove the identifier/time rows and the Age and Gender rows in a single call
# (raises KeyError if any label is absent, exactly like the separate drops did).
df_missing_transpose_test = df_missing_transpose_test.drop(
    index=["RecordID", "level_1", "Time", "Age", "Gender"]
)
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>Original Missing Rate per Variable by demographics - Test</h2>"))
df_missing_transpose_test

Unnamed: 0,Female,Male,Undefined gender,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3,Undefined classification
ALP,49587,63454,94,15092,22654,41581,33808,61203,51932,1747,17821,19760,10581,4047,3906,55273
ALT,49556,63418,93,15073,22647,41553,33794,61167,51900,1746,17815,19749,10578,4046,3903,55230
AST,49560,63414,93,15071,22648,41554,33794,61165,51902,1746,17815,19749,10578,4046,3905,55228
Albumin,49775,63730,95,15177,22706,41788,33929,61348,52252,1749,17887,19853,10628,4078,3936,55469
BUN,46711,59862,91,14230,21314,39271,31849,57592,49072,1656,16775,18614,9927,3801,3694,52197
Bilirubin,49549,63422,94,15080,22650,41526,33809,61161,51904,1747,17812,19759,10571,4048,3905,55223
Cholesterol,50370,64475,96,15252,22794,42489,34406,61931,53010,1775,18057,20074,10731,4121,3976,56207
Creatinine,46691,59846,91,14214,21315,39262,31837,57577,49051,1656,16768,18604,9927,3800,3692,52181
DiasABP,23426,28015,20,9094,4648,26885,10834,27348,24113,663,6162,6258,3117,1172,1770,32319
FiO2,42347,54293,80,13585,18867,36224,28044,52316,44404,1501,14942,16431,8649,3343,3328,48526
