In [1]:
import os
import sys
import pandas as pd
from IPython.display import display, HTML
# Put the parent of the current working directory on sys.path (once), presumably so the
# project-local `pypotsModify` package imported in the next cell can be found.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

<h2>Loading dataset</h2>

In [2]:
# Project-local loader (presumably resolved through the sys.path entry added in the first cell).
from pypotsModify.benchpots.datasets import preprocess_physionet2012
# Load the preprocessed PhysioNet-2012 data; later cells read its 'train_X' entry.
# NOTE(review): `rate=0.1` is presumably the artificial-missingness rate applied by the
# loader -- confirm against pypotsModify before interpreting the NaN counts below.
physionet2012_dataset = preprocess_physionet2012(subset="all", rate=0.1)

2024-11-28 09:01:02 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-11-28 09:01:02 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-11-28 09:01:02 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-11-28 09:01:02 [INFO]: Loaded successfully!


<h3>Training data</h3>

<h4>Loading training dataset</h4>

In [3]:
# Training split: a long-format table with several rows per RecordID (one per Time value),
# as the previews further down show.
train_X = physionet2012_dataset['train_X']

In [4]:
# Per-column NaN counts over every row of the patients with Gender == 0.0
# (the "female" group, going by the variable names).
# Gender is NaN on many rows, so patients are identified from the rows where it is
# recorded and then all of their rows are selected by RecordID.
# Despite the "_missing_rate" name, the result holds absolute counts, not rates.
female_gender_ids = train_X.loc[train_X["Gender"] == 0.0, "RecordID"]
female_gender_missing_rate = (
    train_X[train_X["RecordID"].isin(female_gender_ids)]
    .isna()
    .sum()
)
female_gender_missing_rate

RecordID            0
level_1             0
Time                0
ALP            156273
ALT            156210
AST            156210
Age              9169
Albumin        156862
BUN            147499
Bilirubin      156176
Cholesterol    158670
Creatinine     147465
DiasABP         76034
FiO2           134209
GCS            107698
Gender         155617
Glucose        148060
HCO3           147685
HCT            144229
HR              15087
Height           9169
ICUType        155617
K              146903
Lactate        152589
MAP             76509
MechVent       134697
Mg             147676
NIDiasABP       89091
NIMAP           90110
NISysABP        89027
Na             147580
PaCO2          141449
PaO2           141480
Platelets      147518
RespRate       117228
SaO2           152765
SysABP          76029
Temp           101668
TroponinI      158583
TroponinT      157231
Urine           47738
WBC            148338
Weight          73117
pH             140767
dtype: int64

In [5]:
# Per-column NaN counts over every row of the patients with Gender == 1.0
# (the "male" group, going by the variable names).
# Despite the "_missing_rate" name, the result holds absolute counts, not rates.
male_gender_ids = train_X.loc[train_X["Gender"] == 1.0, "RecordID"]
male_gender_missing_rate = (
    train_X[train_X["RecordID"].isin(male_gender_ids)]
    .isna()
    .sum()
)
male_gender_missing_rate

RecordID            0
level_1             0
Time                0
ALP            205439
ALT            205347
AST            205345
Age             12653
Albumin        206217
BUN            193760
Bilirubin      205323
Cholesterol    208467
Creatinine     193698
DiasABP         92165
FiO2           175922
GCS            142692
Gender         204497
Glucose        194774
HCO3           194132
HCT            188339
HR              21451
Height          12653
ICUType        204497
K              193097
Lactate        199850
MAP             92757
MechVent       177469
Mg             194040
NIDiasABP      124129
NIMAP          125224
NISysABP       124046
Na             194208
PaCO2          183200
PaO2           183246
Platelets      193078
RespRate       163034
SaO2           199828
SysABP          92157
Temp           128516
TroponinI      208476
TroponinT      206607
Urine           65509
WBC            194638
Weight         100300
pH             181875
dtype: int64

In [6]:
# Per-column NaN counts over every row of the patients whose Gender is coded -1.0
# (treated as "undefined" by the variable names).
# Despite the "_missing_rate" name, the result holds absolute counts, not rates.
undefined_gender_ids = train_X.loc[train_X["Gender"] == -1.0, "RecordID"]
undefined_gender_missing_rate = (
    train_X[train_X["RecordID"].isin(undefined_gender_ids)]
    .isna()
    .sum()
)
undefined_gender_missing_rate

RecordID         0
level_1          0
Time             0
ALP            427
ALT            427
AST            427
Age             54
Albumin        428
BUN            401
Bilirubin      427
Cholesterol    432
Creatinine     401
DiasABP        174
FiO2           381
GCS            312
Gender         423
Glucose        401
HCO3           401
HCT            403
HR              65
Height          54
ICUType        423
K              401
Lactate        403
MAP            178
MechVent       381
Mg             403
NIDiasABP      272
NIMAP          272
NISysABP       272
Na             402
PaCO2          379
PaO2           379
Platelets      403
RespRate       343
SaO2           429
SysABP         174
Temp           288
TroponinI      431
TroponinT      427
Urine          177
WBC            404
Weight         179
pH             374
dtype: int64

In [7]:
# Per-column NaN counts for patients whose Time == 0.0 row reports ICUType == 1.0.
# The IDs come from that single row; the counts cover all rows of those patients.
ICUType_1_training_ids = train_X.loc[
    (train_X["ICUType"] == 1.0) & (train_X["Time"] == 0.0),
    "RecordID",
]
ICUType_1_training_missing = (
    train_X[train_X["RecordID"].isin(ICUType_1_training_ids)]
    .isna()
    .sum()
)
ICUType_1_training_missing

RecordID           0
level_1            0
Time               0
ALP            53293
ALT            53243
AST            53245
Age             4573
Albumin        53514
BUN            50312
Bilirubin      53264
Cholesterol    53796
Creatinine     50259
DiasABP        32190
FiO2           48357
GCS            40119
Gender         53016
Glucose        50475
HCO3           50426
HCT            49434
HR              6921
Height          4573
ICUType        53016
K              49695
Lactate        52935
MAP            32251
MechVent       48715
Mg             50315
NIDiasABP      27744
NIMAP          27917
NISysABP       27727
Na             50444
PaCO2          49671
PaO2           49669
Platelets      50269
RespRate       35187
SaO2           51434
SysABP         32188
Temp           37615
TroponinI      53979
TroponinT      52975
Urine          23256
WBC            50673
Weight         29479
pH             49573
dtype: int64

In [8]:
# Per-column NaN counts for patients whose Time == 0.0 row reports ICUType == 2.0.
# The IDs come from that single row; the counts cover all rows of those patients.
ICUType_2_training_ids = train_X.loc[
    (train_X["ICUType"] == 2.0) & (train_X["Time"] == 0.0),
    "RecordID",
]
ICUType_2_training_missing = (
    train_X[train_X["RecordID"].isin(ICUType_2_training_ids)]
    .isna()
    .sum()
)
ICUType_2_training_missing

RecordID           0
level_1            0
Time               0
ALP            78857
ALT            78844
AST            78845
Age             2628
Albumin        79055
BUN            74267
Bilirubin      78856
Cholesterol    79415
Creatinine     74262
DiasABP        17095
FiO2           66010
GCS            58612
Gender         77785
Glucose        75696
HCO3           74649
HCT            70702
HR              6587
Height          2628
ICUType        77785
K              75193
Lactate        76551
MAP            16927
MechVent       66208
Mg             74372
NIDiasABP      62010
NIMAP          62137
NISysABP       61970
Na             75413
PaCO2          63044
PaO2           63092
Platelets      72979
RespRate       76082
SaO2           70634
SysABP         17094
Temp           32984
TroponinI      79326
TroponinT      79242
Urine          12407
WBC            74047
Weight         38033
pH             61482
dtype: int64

In [9]:
# Per-column NaN counts for patients whose Time == 0.0 row reports ICUType == 3.0.
# The IDs come from that single row; the counts cover all rows of those patients.
ICUType_3_training_ids = train_X.loc[
    (train_X["ICUType"] == 3.0) & (train_X["Time"] == 0.0),
    "RecordID",
]
ICUType_3_training_missing = (
    train_X[train_X["RecordID"].isin(ICUType_3_training_ids)]
    .isna()
    .sum()
)
ICUType_3_training_missing

RecordID            0
level_1             0
Time                0
ALP            126682
ALT            126617
AST            126615
Age              9650
Albumin        127267
BUN            119747
Bilirubin      126498
Cholesterol    129341
Creatinine     119718
DiasABP         83788
FiO2           110286
GCS             95198
Gender         126759
Glucose        119771
HCO3           119725
HCT            117948
HR              14222
Height           9650
ICUType        126759
K              118931
Lactate        124422
MAP             84403
MechVent       111657
Mg             120326
NIDiasABP       56008
NIMAP           57321
NISysABP        55964
Na             119492
PaCO2          119449
PaO2           119442
Platelets      120589
RespRate        89899
SaO2           127896
SysABP          83784
Temp            92281
TroponinI      129184
TroponinT      127735
Urine           51639
WBC            120987
Weight          45152
pH             119292
dtype: int64

In [10]:
# Per-column NaN counts for patients whose Time == 0.0 row reports ICUType == 4.0.
# The IDs come from that single row; the counts cover all rows of those patients.
ICUType_4_training_ids = train_X.loc[
    (train_X["ICUType"] == 4.0) & (train_X["Time"] == 0.0),
    "RecordID",
]
ICUType_4_training_missing = (
    train_X[train_X["RecordID"].isin(ICUType_4_training_ids)]
    .isna()
    .sum()
)
ICUType_4_training_missing

RecordID            0
level_1             0
Time                0
ALP            103307
ALT            103280
AST            103277
Age              5025
Albumin        103671
BUN             97334
Bilirubin      103308
Cholesterol    105017
Creatinine      97325
DiasABP         35300
FiO2            85859
GCS             56773
Gender         102977
Glucose         97293
HCO3            97418
HCT             94887
HR               8873
Height           5025
ICUType        102977
K               96582
Lactate         98934
MAP             35863
MechVent        85967
Mg              97106
NIDiasABP       67730
NIMAP           68231
NISysABP        67684
Na              96841
PaCO2           92864
PaO2            92902
Platelets       97162
RespRate        79437
SaO2           103058
SysABP          35294
Temp            67592
TroponinI      105001
TroponinT      104313
Urine           26122
WBC             97673
Weight          60932
pH              92669
dtype: int64

In [11]:
# Per-column NaN counts for patients whose Time == 0.0 row reports Age >= 65.
# Rows with a NaN Age fail the comparison and therefore never contribute an ID.
more_than_or_equal_to_65_train_ids = train_X.loc[
    (train_X["Age"] >= 65) & (train_X["Time"] == 0.0),
    "RecordID",
]
more_than_or_equal_to_65_train_missing = (
    train_X[train_X["RecordID"].isin(more_than_or_equal_to_65_train_ids)]
    .isna()
    .sum()
)
more_than_or_equal_to_65_train_missing

RecordID            0
level_1             0
Time                0
ALP            199482
ALT            199421
AST            199414
Age             11396
Albumin        199947
BUN            188014
Bilirubin      199369
Cholesterol    201901
Creatinine     187961
DiasABP         91471
FiO2           170528
GCS            139774
Gender         198058
Glucose        189086
HCO3           188362
HCT            183025
HR              19122
Height          11396
ICUType        198058
K              187481
Lactate        194221
MAP             91903
MechVent       172244
Mg             188192
NIDiasABP      116789
NIMAP          117776
NISysABP       116710
Na             188522
PaCO2          178229
PaO2           178263
Platelets      187623
RespRate       153139
SaO2           192826
SysABP          91465
Temp           123528
TroponinI      201774
TroponinT      199464
Urine           58976
WBC            188905
Weight          94222
pH             177073
dtype: int64

In [12]:
# Per-column NaN counts for patients whose Time == 0.0 row reports Age < 65.
# Rows with a NaN Age fail the comparison and therefore never contribute an ID.
less_than_65_train_ids = train_X.loc[
    (train_X["Age"] < 65) & (train_X["Time"] == 0.0),
    "RecordID",
]
less_than_65_train_missing = (
    train_X[train_X["RecordID"].isin(less_than_65_train_ids)]
    .isna()
    .sum()
)
less_than_65_train_missing


RecordID            0
level_1             0
Time                0
ALP            162657
ALT            162563
AST            162568
Age             10480
Albumin        163560
BUN            153646
Bilirubin      162557
Cholesterol    165668
Creatinine     153603
DiasABP         76902
FiO2           139984
GCS            110928
Gender         162479
Glucose        154149
HCO3           153856
HCT            149946
HR              17481
Height          10480
ICUType        162479
K              152920
Lactate        158621
MAP             77541
MechVent       140303
Mg             153927
NIDiasABP       96703
NIMAP           97830
NISysABP        96635
Na             153668
PaCO2          146799
PaO2           146842
Platelets      153376
RespRate       127466
SaO2           160196
SysABP          76895
Temp           106944
TroponinI      165716
TroponinT      164801
Urine           54448
WBC            154475
Weight          79374
pH             145943
dtype: int64

In [13]:
# Keep only the rows where Height and Weight are both present and neither equals -1
# (presumably the dataset's "not recorded" marker -- confirm), i.e. rows usable for BMI.
filtered_train_X = train_X[
    train_X["Height"].notna()
    & train_X["Weight"].notna()
    & (train_X["Height"] != -1)
    & (train_X["Weight"] != -1)
]

In [14]:
def classify_BMI(BMI):
    """Map a body-mass index value to its (Portuguese) weight-category label.

    The bands are contiguous, so every real number gets a label:

        BMI <= 18.5        -> "Baixo peso"
        18.5 < BMI < 25    -> "Peso normal"
        25 <= BMI < 30     -> "Sobrepeso"
        30 <= BMI < 35     -> "Obesidade grau 1"
        35 <= BMI < 40     -> "Obesidade grau 2"
        BMI >= 40          -> "Obesidade grau 3"

    For values rounded to one decimal place the result is the same as with the
    previous closed-interval bounds (18.6-24.9, 25-29.9, ...); values that used to
    fall into the gaps between those intervals (e.g. 24.95) are now classified
    instead of silently yielding None.

    Parameters
    ----------
    BMI : float
        Body-mass index.

    Returns
    -------
    str or None
        The category label, or None when BMI is NaN.
    """
    # NaN is the only value that differs from itself; keep returning None for it
    # rather than letting it fall through to the last band.
    if BMI != BMI:
        return None
    if BMI <= 18.5:
        return "Baixo peso"
    if BMI < 25:
        return "Peso normal"
    if BMI < 30:
        return "Sobrepeso"
    if BMI < 35:
        return "Obesidade grau 1"
    if BMI < 40:
        return "Obesidade grau 2"
    return "Obesidade grau 3"

In [15]:
# Work on a copy so filtered_train_X keeps its original Height values, then rescale
# Height by 1/100 (the values suggest centimetres -> metres, as the "_metros" suffix says).
filtered_train_X_metros = filtered_train_X.copy()
filtered_train_X_metros["Height"] = filtered_train_X_metros["Height"] / 100
filtered_train_X_metros["Height"]

48        1.753
67        1.753
68        1.753
69        1.753
70        1.753
          ...  
575035    1.600
575037    1.600
575038    1.600
575039    1.600
575184    1.727
Name: Height, Length: 103441, dtype: float64

In [16]:
# NOTE: this binds a second name to the SAME DataFrame (no copy), so the BMI and
# Classificacao columns added below also appear on filtered_train_X_metros.
bmi_data_train = filtered_train_X_metros
# BMI = Weight / Height**2, rounded to one decimal place before classification.
bmi_data_train["BMI"] = (
    bmi_data_train["Weight"] / bmi_data_train["Height"] ** 2
).round(1)
bmi_data_train["Classificacao"] = bmi_data_train["BMI"].apply(classify_BMI)
bmi_data_train.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
48,132540,0,0.0,,,,76.0,,,,...,,,,,,,76.0,7.45,24.7,Peso normal
67,132540,19,19.0,,,,76.0,,,,...,122.0,37.5,,,50.0,,80.6,,26.2,Sobrepeso
68,132540,20,20.0,,,,76.0,,,,...,107.0,37.4,,,380.0,,80.6,,26.2,Sobrepeso
69,132540,21,21.0,,,,76.0,,,,...,121.0,37.5,,,170.0,,80.6,,26.2,Sobrepeso
70,132540,22,22.0,,,,76.0,,,,...,128.0,37.5,,,130.0,,80.6,,26.2,Sobrepeso


In [17]:
# Collapse the per-time-step rows into one row per patient (RecordID).
# GroupBy.first() takes, for each column independently, the first non-null value in the
# group, so a resulting row can combine measurements from different time steps (visible
# in the output, where columns that are NaN at Time 0 are filled from later rows).
# BMI and Classificacao should be set on every input row (rows without Height/Weight were
# filtered out earlier), so for those two columns this picks each patient's first kept row.
bmi_data_train = bmi_data_train.groupby("RecordID").first().reset_index()
bmi_data_train

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132540,0,0.0,,,,76.0,,21.0,,...,122.00,37.50,,,50.0,13.3,76.0,7.45,24.7,Peso normal
1,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,,36.30,,,600.0,11.5,84.6,,26.0,Sobrepeso
2,132547,0,0.0,,,,64.0,,,,...,,,,,,,114.0,,35.1,Obesidade grau 2
3,132548,0,0.0,,,,68.0,,32.0,,...,205.00,36.30,0.7,,120.0,6.2,87.0,,32.9,Obesidade grau 1
4,132551,0,0.0,47.0,46.0,82.0,78.0,1.9,81.0,0.3,...,102.75,38.00,3.5,,120.0,16.1,48.4,7.40,18.3,Baixo peso
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4066,163007,0,0.0,42.0,30.0,40.0,19.0,2.8,16.0,0.6,...,0.00,40.55,,1.00,150.0,14.1,114.3,7.36,34.2,Obesidade grau 1
4067,163008,0,0.0,,,,59.0,,24.0,,...,97.00,37.60,,,45.0,6.9,98.5,7.38,34.0,Obesidade grau 1
4068,163013,0,0.0,82.0,11.0,30.0,74.0,2.5,30.0,1.2,...,118.00,36.50,,0.03,40.0,9.6,68.6,7.35,29.5,Sobrepeso
4069,163016,0,0.0,,27.0,120.0,65.0,,29.0,0.4,...,101.00,38.10,,,75.0,8.0,63.6,7.37,24.8,Peso normal


In [18]:
# Number of patients per BMI category (bmi_data_train holds one row per RecordID here).
bmi_data_train["Classificacao"].value_counts()

Classificacao
Sobrepeso           1402
Peso normal         1211
Obesidade grau 1     730
Obesidade grau 2     305
Obesidade grau 3     292
Baixo peso           131
Name: count, dtype: int64

In [19]:
# Per-column NaN counts for the patients WITHOUT a BMI classification.
# Careful with the name: classificacao_undefined_ids holds the RecordIDs that DO have a
# classification; the "undefined" group is obtained by negating the membership test.
classificacao_undefined_ids = bmi_data_train["RecordID"]
classificacao_undefined_missing = (
    train_X.loc[~train_X["RecordID"].isin(classificacao_undefined_ids)]
    .isna()
    .sum()
)
classificacao_undefined_missing

RecordID            0
level_1             0
Time                0
ALP            169910
ALT            169832
AST            169825
Age             13795
Albumin        170403
BUN            160740
Bilirubin      169786
Cholesterol    172525
Creatinine     160689
DiasABP        104811
FiO2           149237
GCS            115423
Gender         169200
Glucose        160756
HCO3           160817
HCT            157992
HR              20108
Height          13795
ICUType        169200
K              159536
Lactate        166914
MAP            105616
MechVent       151004
Mg             161068
NIDiasABP       80483
NIMAP           81952
NISysABP        80414
Na             160251
PaCO2          160621
PaO2           160656
Platelets      161508
RespRate       113756
SaO2           170848
SysABP         104802
Temp           126358
TroponinI      172554
TroponinT      170771
Urine           63743
WBC            162006
Weight          81629
pH             160397
dtype: int64

In [20]:
# Per-column NaN counts over all train_X rows of the patients classified "Baixo peso".
# bmi_data_train already has one row per RecordID, so no Time filter is needed.
classificacao_baixo_peso_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Baixo peso", "RecordID"
]
classificacao_baixo_peso_missing = (
    train_X[train_X["RecordID"].isin(classificacao_baixo_peso_ids)]
    .isna()
    .sum()
)
classificacao_baixo_peso_missing

RecordID          0
level_1           0
Time              0
ALP            6196
ALT            6193
AST            6193
Age             199
Albumin        6220
BUN            5838
Bilirubin      6195
Cholesterol    6282
Creatinine     5837
DiasABP        2282
FiO2           5267
GCS            4398
Gender         6157
Glucose        5873
HCO3           5843
HCT            5677
HR              440
Height          199
ICUType        6157
K              5823
Lactate        5970
MAP            2247
MechVent       5255
Mg             5832
NIDiasABP      4006
NIMAP          4042
NISysABP       4001
Na             5857
PaCO2          5416
PaO2           5408
Platelets      5812
RespRate       5287
SaO2           5911
SysABP         2282
Temp           3575
TroponinI      6258
TroponinT      6217
Urine          1775
WBC            5856
Weight         3189
pH             5366
dtype: int64

In [21]:
# Sanity check: number of distinct patients in the "Baixo peso" group.
teste = classificacao_baixo_peso_ids.unique()
len(teste)

131

In [22]:
# Per-column NaN counts over all train_X rows of the patients classified "Peso normal".
# bmi_data_train already has one row per RecordID, so no Time filter is needed.
classificacao_normal_peso_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Peso normal", "RecordID"
]
classificacao_normal_peso_missing = (
    train_X[train_X["RecordID"].isin(classificacao_normal_peso_ids)]
    .isna()
    .sum()
)
classificacao_normal_peso_missing

RecordID           0
level_1            0
Time               0
ALP            57215
ALT            57184
AST            57184
Age             2382
Albumin        57432
BUN            53837
Bilirubin      57186
Cholesterol    58017
Creatinine     53828
DiasABP        20061
FiO2           48354
GCS            39477
Gender         56917
Glucose        54245
HCO3           53982
HCT            52007
HR              4891
Height          2382
ICUType        56917
K              53739
Lactate        55282
MAP            20133
MechVent       48389
Mg             53840
NIDiasABP      38280
NIMAP          38442
NISysABP       38262
Na             54085
PaCO2          49432
PaO2           49460
Platelets      53375
RespRate       48871
SaO2           54580
SysABP         20061
Temp           31798
TroponinI      57989
TroponinT      57567
Urine          15403
WBC            53938
Weight         28476
pH             48879
dtype: int64

In [23]:
# Sanity check: number of distinct patients in the "Peso normal" group.
teste = classificacao_normal_peso_ids.unique()
len(teste)

1211

In [24]:
# Per-column NaN counts over all train_X rows of the patients classified "Sobrepeso".
# bmi_data_train already has one row per RecordID, so no Time filter is needed.
classificacao_sobrepeso_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Sobrepeso", "RecordID"
]
classificacao_sobrepeso_missing = (
    train_X[train_X["RecordID"].isin(classificacao_sobrepeso_ids)]
    .isna()
    .sum()
)
classificacao_sobrepeso_missing

RecordID           0
level_1            0
Time               0
ALP            66181
ALT            66157
AST            66160
Age             3069
Albumin        66509
BUN            62310
Bilirubin      66151
Cholesterol    67160
Creatinine     62288
DiasABP        20952
FiO2           55845
GCS            46777
Gender         65894
Glucose        62912
HCO3           62491
HCT            60169
HR              6108
Height          3069
ICUType        65894
K              62384
Lactate        64273
MAP            21007
MechVent       55951
Mg             62376
NIDiasABP      46801
NIMAP          47020
NISysABP       46774
Na             62746
PaCO2          56511
PaO2           56533
Platelets      61703
RespRate       57190
SaO2           62421
SysABP         20949
Temp           34969
TroponinI      67142
TroponinT      66657
Urine          17267
WBC            62423
Weight         31803
pH             55844
dtype: int64

In [25]:
# Sanity check: number of distinct patients in the "Sobrepeso" group.
teste = classificacao_sobrepeso_ids.unique()
len(teste)

1402

In [26]:
# Per-column NaN counts over all train_X rows of the patients classified "Obesidade grau 1".
# bmi_data_train already has one row per RecordID, so no Time filter is needed.
classificacao_obesidade_1_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Obesidade grau 1", "RecordID"
]
classificacao_obesidade_1_missing = (
    train_X[train_X["RecordID"].isin(classificacao_obesidade_1_ids)]
    .isna()
    .sum()
)
classificacao_obesidade_1_missing

RecordID           0
level_1            0
Time               0
ALP            34484
ALT            34474
AST            34476
Age             1295
Albumin        34627
BUN            32452
Bilirubin      34464
Cholesterol    34978
Creatinine     32442
DiasABP        10877
FiO2           28639
GCS            24286
Gender         34310
Glucose        32716
HCO3           32528
HCT            31387
HR              2742
Height          1295
ICUType        34310
K              32395
Lactate        33293
MAP            11008
MechVent       28682
Mg             32471
NIDiasABP      24148
NIMAP          24257
NISysABP       24129
Na             32601
PaCO2          29235
PaO2           29239
Platelets      32242
RespRate       30649
SaO2           32605
SysABP         10877
Temp           18283
TroponinI      34944
TroponinT      34693
Urine           8197
WBC            32559
Weight         16490
pH             28906
dtype: int64

In [27]:
# Sanity check: number of distinct patients in the "Obesidade grau 1" group.
teste = classificacao_obesidade_1_ids.unique()
len(teste)

730

In [28]:
# Per-column NaN counts over all train_X rows of the patients classified "Obesidade grau 2".
# bmi_data_train already has one row per RecordID, so no Time filter is needed.
classificacao_obesidade_2_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Obesidade grau 2", "RecordID"
]
classificacao_obesidade_2_missing = (
    train_X[train_X["RecordID"].isin(classificacao_obesidade_2_ids)]
    .isna()
    .sum()
)
classificacao_obesidade_2_missing

RecordID           0
level_1            0
Time               0
ALP            14410
ALT            14407
AST            14407
Age              585
Albumin        14481
BUN            13527
Bilirubin      14409
Cholesterol    14613
Creatinine     13523
DiasABP         4499
FiO2           11956
GCS            10295
Gender         14335
Glucose        13687
HCO3           13577
HCT            13097
HR              1215
Height           585
ICUType        14335
K              13584
Lactate        13883
MAP             4529
MechVent       12017
Mg             13570
NIDiasABP      10289
NIMAP          10361
NISysABP       10286
Na             13629
PaCO2          12099
PaO2           12103
Platelets      13413
RespRate       12727
SaO2           13541
SysABP          4498
Temp            7402
TroponinI      14611
TroponinT      14505
Urine           3540
WBC            13556
Weight          6144
pH             11989
dtype: int64

In [29]:
# Sanity check: number of distinct patients in the "Obesidade grau 2" group.
teste = classificacao_obesidade_2_ids.unique()
len(teste)

305

In [30]:
# Per-column NaN counts over all train_X rows of the patients classified "Obesidade grau 3".
# bmi_data_train already has one row per RecordID, so no Time filter is needed.
classificacao_obesidade_3_ids = bmi_data_train.loc[
    bmi_data_train["Classificacao"] == "Obesidade grau 3", "RecordID"
]
classificacao_obesidade_3_missing = (
    train_X[train_X["RecordID"].isin(classificacao_obesidade_3_ids)]
    .isna()
    .sum()
)
classificacao_obesidade_3_missing

RecordID           0
level_1            0
Time               0
ALP            13743
ALT            13737
AST            13737
Age              551
Albumin        13835
BUN            12956
Bilirubin      13735
Cholesterol    13994
Creatinine     12957
DiasABP         4891
FiO2           11214
GCS            10046
Gender         13724
Glucose        13046
HCO3           12980
HCT            12642
HR              1099
Height           551
ICUType        13724
K              12940
Lactate        13227
MAP             4904
MechVent       11249
Mg             12962
NIDiasABP       9485
NIMAP           9532
NISysABP        9479
Na             13021
PaCO2          11714
PaO2           11706
Platelets      12946
RespRate       12125
SaO2           13116
SysABP          4891
Temp            8087
TroponinI      13992
TroponinT      13855
Urine           3499
WBC            13042
Weight          5865
pH             11635
dtype: int64

In [31]:
# Number of distinct RecordIDs in the "Obesidade grau 3" training group.
teste = pd.unique(classificacao_obesidade_3_ids)
len(teste)

292

In [32]:
# Column labels of the training frame. They are reused below as the row index
# of the per-group summary tables (training and validation).
df_columns = train_X.columns
df_columns

Index(['RecordID', 'level_1', 'Time', 'ALP', 'ALT', 'AST', 'Age', 'Albumin',
       'BUN', 'Bilirubin', 'Cholesterol', 'Creatinine', 'DiasABP', 'FiO2',
       'GCS', 'Gender', 'Glucose', 'HCO3', 'HCT', 'HR', 'Height', 'ICUType',
       'K', 'Lactate', 'MAP', 'MechVent', 'Mg', 'NIDiasABP', 'NIMAP',
       'NISysABP', 'Na', 'PaCO2', 'PaO2', 'Platelets', 'RespRate', 'SaO2',
       'SysABP', 'Temp', 'TroponinI', 'TroponinT', 'Urine', 'WBC', 'Weight',
       'pH'],
      dtype='object')

In [33]:
# Summary table for the training split: one row per variable, one column per
# demographic group. Each value is the NaN count produced by `.isna().sum()`
# in the cells above (absolute counts, even though the title says "Rate").
df_missing = pd.DataFrame(columns=df_columns)
df_missing_transpose = df_missing.T.assign(
    **{
        "Female": female_gender_missing_rate,
        "Male": male_gender_missing_rate,
        "Undefined gender": undefined_gender_missing_rate,
        "ICUType 1": ICUType_1_training_missing,
        "ICUType 2": ICUType_2_training_missing,
        "ICUType 3": ICUType_3_training_missing,
        "ICUType 4": ICUType_4_training_missing,
        "Age 65+": more_than_or_equal_to_65_train_missing,
        "Age 65-": less_than_65_train_missing,
        "Low Weight": classificacao_baixo_peso_missing,
        "Normal Weight": classificacao_normal_peso_missing,
        "Overweight": classificacao_sobrepeso_missing,
        "Obesity Grade 1": classificacao_obesidade_1_missing,
        "Obesity Grade 2": classificacao_obesidade_2_missing,
        "Obesity Grade 3": classificacao_obesidade_3_missing,
        "Undefined classification": classificacao_undefined_missing,
    }
)
# Remove the identifier/time rows and the Age and Gender rows from the table.
df_missing_transpose = df_missing_transpose.drop(
    index=["RecordID", "level_1", "Time", "Age", "Gender"]
)
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>Original Missing Rate per Variable by demographics - Train</h2>"))
df_missing_transpose

Unnamed: 0,Female,Male,Undefined gender,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3,Undefined classification
ALP,156273,205439,427,53293,78857,126682,103307,199482,162657,6196,57215,66181,34484,14410,13743,169910
ALT,156210,205347,427,53243,78844,126617,103280,199421,162563,6193,57184,66157,34474,14407,13737,169832
AST,156210,205345,427,53245,78845,126615,103277,199414,162568,6193,57184,66160,34476,14407,13737,169825
Albumin,156862,206217,428,53514,79055,127267,103671,199947,163560,6220,57432,66509,34627,14481,13835,170403
BUN,147499,193760,401,50312,74267,119747,97334,188014,153646,5838,53837,62310,32452,13527,12956,160740
Bilirubin,156176,205323,427,53264,78856,126498,103308,199369,162557,6195,57186,66151,34464,14409,13735,169786
Cholesterol,158670,208467,432,53796,79415,129341,105017,201901,165668,6282,58017,67160,34978,14613,13994,172525
Creatinine,147465,193698,401,50259,74262,119718,97325,187961,153603,5837,53828,62288,32442,13523,12957,160689
DiasABP,76034,92165,174,32190,17095,83788,35300,91471,76902,2282,20061,20952,10877,4499,4891,104811
FiO2,134209,175922,381,48357,66010,110286,85859,170528,139984,5267,48354,55845,28639,11956,11214,149237


<h3>Validation data</h3>

In [34]:
# Validation features, read from the 'val_X' entry of the preprocessed dataset.
validation_X = physionet2012_dataset['val_X']

In [35]:
# RecordIDs of validation rows whose Gender value is 0.0
# (the group this notebook labels "Female").
female_gender_validation_ids = validation_X.loc[
    validation_X["Gender"].eq(0.0), "RecordID"
]
# Per-column NaN counts over every row of those records.
# Despite the "_rate" in the name, this holds absolute counts.
female_gender_missing_rate_validation = (
    validation_X.loc[validation_X["RecordID"].isin(female_gender_validation_ids)]
    .isna()
    .sum()
)
female_gender_missing_rate_validation

RecordID           0
level_1            0
Time               0
ALP            42001
ALT            41975
AST            41979
Age             2278
Albumin        42144
BUN            39591
Bilirubin      41974
Cholesterol    42588
Creatinine     39572
DiasABP        20023
FiO2           35880
GCS            28444
Gender         41783
Glucose        39711
HCO3           39653
HCT            38763
HR              3819
Height          2278
ICUType        41783
K              39362
Lactate        40985
MAP            20104
MechVent       35998
Mg             39648
NIDiasABP      24470
NIMAP          24734
NISysABP       24449
Na             39593
PaCO2          38001
PaO2           38006
Platelets      39641
RespRate       31553
SaO2           41133
SysABP         20022
Temp           27337
TroponinI      42576
TroponinT      42216
Urine          12559
WBC            39853
Weight         20582
pH             37872
dtype: int64

In [36]:
# RecordIDs of validation rows whose Gender value is 1.0
# (the group this notebook labels "Male").
male_gender_validation_ids = validation_X.loc[
    validation_X["Gender"].eq(1.0), "RecordID"
]
# Per-column NaN counts over every row of those records.
# Despite the "_rate" in the name, this holds absolute counts.
male_gender_missing_rate_validation = (
    validation_X.loc[validation_X["RecordID"].isin(male_gender_validation_ids)]
    .isna()
    .sum()
)
male_gender_missing_rate_validation

RecordID           0
level_1            0
Time               0
ALP            48428
ALT            48404
AST            48402
Age             2679
Albumin        48673
BUN            45631
Bilirubin      48393
Cholesterol    49224
Creatinine     45609
DiasABP        21111
FiO2           41422
GCS            33361
Gender         48269
Glucose        45841
HCO3           45720
HCT            44435
HR              4605
Height          2679
ICUType        48269
K              45477
Lactate        47149
MAP            21256
MechVent       41778
Mg             45711
NIDiasABP      29811
NIMAP          30085
NISysABP       29788
Na             45695
PaCO2          43400
PaO2           43409
Platelets      45514
RespRate       37929
SaO2           47280
SysABP         21109
Temp           30434
TroponinI      49174
TroponinT      48695
Urine          15578
WBC            45867
Weight         23550
pH             43126
dtype: int64

In [37]:
# RecordIDs of validation rows whose Gender value is -1.0
# (the group this notebook labels "Undefined gender").
undefined_gender_ids_validation = validation_X.loc[
    validation_X["Gender"].eq(-1.0), "RecordID"
]
# Per-column NaN counts over every row of those records.
# Despite the "_rate" in the name, this holds absolute counts.
undefined_gender_missing_rate_validation = (
    validation_X.loc[validation_X["RecordID"].isin(undefined_gender_ids_validation)]
    .isna()
    .sum()
)
undefined_gender_missing_rate_validation

RecordID        0
level_1         0
Time            0
ALP            96
ALT            95
AST            95
Age            46
Albumin        95
BUN            91
Bilirubin      96
Cholesterol    96
Creatinine     91
DiasABP        64
FiO2           95
GCS            86
Gender         94
Glucose        91
HCO3           91
HCT            92
HR             50
Height         46
ICUType        94
K              90
Lactate        96
MAP            66
MechVent       96
Mg             92
NIDiasABP      79
NIMAP          79
NISysABP       79
Na             91
PaCO2          93
PaO2           93
Platelets      92
RespRate       50
SaO2           96
SysABP         64
Temp           85
TroponinI      96
TroponinT      92
Urine          68
WBC            92
Weight         50
pH             93
dtype: int64

In [38]:
# RecordIDs of validation rows that have ICUType == 1.0 at Time == 0.0.
ICUType_1_validation_ids = validation_X.loc[
    validation_X["ICUType"].eq(1.0) & validation_X["Time"].eq(0.0), "RecordID"
]
# Per-column NaN counts over every row of those records.
ICUType_1_validation_missing = (
    validation_X.loc[validation_X["RecordID"].isin(ICUType_1_validation_ids)]
    .isna()
    .sum()
)
ICUType_1_validation_missing

RecordID           0
level_1            0
Time               0
ALP            13381
ALT            13366
AST            13368
Age             1052
Albumin        13433
BUN            12591
Bilirubin      13373
Cholesterol    13501
Creatinine     12572
DiasABP         7047
FiO2           11985
GCS            10000
Gender         13301
Glucose        12643
HCO3           12635
HCT            12380
HR              1634
Height          1052
ICUType        13301
K              12420
Lactate        13238
MAP             7089
MechVent       12077
Mg             12603
NIDiasABP       7925
NIMAP           7947
NISysABP        7914
Na             12643
PaCO2          12354
PaO2           12354
Platelets      12604
RespRate        8364
SaO2           12846
SysABP          7047
Temp            9608
TroponinI      13546
TroponinT      13304
Urine           5935
WBC            12698
Weight          8028
pH             12330
dtype: int64

In [39]:
# RecordIDs of validation rows that have ICUType == 2.0 at Time == 0.0.
ICUType_2_validation_ids = validation_X.loc[
    validation_X["ICUType"].eq(2.0) & validation_X["Time"].eq(0.0), "RecordID"
]
# Per-column NaN counts over every row of those records.
ICUType_2_validation_missing = (
    validation_X.loc[validation_X["RecordID"].isin(ICUType_2_validation_ids)]
    .isna()
    .sum()
)
ICUType_2_validation_missing

RecordID           0
level_1            0
Time               0
ALP            18132
ALT            18123
AST            18124
Age              622
Albumin        18169
BUN            17081
Bilirubin      18128
Cholesterol    18238
Creatinine     17077
DiasABP         4275
FiO2           15107
GCS            13375
Gender         17860
Glucose        17389
HCO3           17166
HCT            16333
HR              1441
Height           622
ICUType        17860
K              17268
Lactate        17715
MAP             4252
MechVent       15226
Mg             17136
NIDiasABP      14015
NIMAP          14057
NISysABP       14010
Na             17327
PaCO2          14819
PaO2           14829
Platelets      16827
RespRate       17224
SaO2           16443
SysABP          4274
Temp            7888
TroponinI      18211
TroponinT      18194
Urine           3015
WBC            17068
Weight          9237
pH             14516
dtype: int64

In [40]:
# RecordIDs of validation rows that have ICUType == 3.0 at Time == 0.0.
ICUType_3_validation_ids = validation_X.loc[
    validation_X["ICUType"].eq(3.0) & validation_X["Time"].eq(0.0), "RecordID"
]
# Per-column NaN counts over every row of those records.
ICUType_3_validation_missing = (
    validation_X.loc[validation_X["RecordID"].isin(ICUType_3_validation_ids)]
    .isna()
    .sum()
)
ICUType_3_validation_missing

RecordID           0
level_1            0
Time               0
ALP            33454
ALT            33430
AST            33428
Age             2238
Albumin        33649
BUN            31606
Bilirubin      33398
Cholesterol    34179
Creatinine     31591
DiasABP        21657
FiO2           29352
GCS            24796
Gender         33511
Glucose        31606
HCO3           31606
HCT            31128
HR              3366
Height          2238
ICUType        33511
K              31404
Lactate        32861
MAP            21768
MechVent       29540
Mg             31734
NIDiasABP      15064
NIMAP          15444
NISysABP       15050
Na             31531
PaCO2          31579
PaO2           31576
Platelets      31818
RespRate       23808
SaO2           33781
SysABP         21655
Temp           24243
TroponinI      34124
TroponinT      33698
Urine          13366
WBC            31901
Weight         12354
pH             31547
dtype: int64

In [41]:
# RecordIDs of validation rows that have ICUType == 4.0 at Time == 0.0.
ICUType_4_validation_ids = validation_X.loc[
    validation_X["ICUType"].eq(4.0) & validation_X["Time"].eq(0.0), "RecordID"
]
# Per-column NaN counts over every row of those records.
ICUType_4_validation_missing = (
    validation_X.loc[validation_X["RecordID"].isin(ICUType_4_validation_ids)]
    .isna()
    .sum()
)
ICUType_4_validation_missing

RecordID           0
level_1            0
Time               0
ALP            25558
ALT            25555
AST            25556
Age             1091
Albumin        25661
BUN            24035
Bilirubin      25564
Cholesterol    25990
Creatinine     24032
DiasABP         8219
FiO2           20953
GCS            13720
Gender         25474
Glucose        24005
HCO3           24057
HCT            23449
HR              2033
Height          1091
ICUType        25474
K              23837
Lactate        24416
MAP             8317
MechVent       21029
Mg             23978
NIDiasABP      17356
NIMAP          17450
NISysABP       17342
Na             23878
PaCO2          22742
PaO2           22749
Platelets      23998
RespRate       20136
SaO2           25439
SysABP          8219
Temp           16117
TroponinI      25965
TroponinT      25807
Urine           5889
WBC            24145
Weight         14563
pH             22698
dtype: int64

In [42]:
# RecordIDs of validation rows that have Age >= 65 at Time == 0.0.
# Rows with a NaN Age fail the comparison and are not selected.
more_than_or_equal_to_65_validation_ids = validation_X.loc[
    validation_X["Age"].ge(65) & validation_X["Time"].eq(0.0), "RecordID"
]
# Per-column NaN counts over every row of those records.
more_than_or_equal_to_65_validation_missing = (
    validation_X.loc[
        validation_X["RecordID"].isin(more_than_or_equal_to_65_validation_ids)
    ]
    .isna()
    .sum()
)
more_than_or_equal_to_65_validation_missing

RecordID           0
level_1            0
Time               0
ALP            49584
ALT            49565
AST            49564
Age             2483
Albumin        49686
BUN            46639
Bilirubin      49553
Cholesterol    50111
Creatinine     46608
DiasABP        21657
FiO2           42382
GCS            34082
Gender         49162
Glucose        46840
HCO3           46722
HCT            45500
HR              4277
Height          2483
ICUType        49162
K              46432
Lactate        48237
MAP            21789
MechVent       42824
Mg             46684
NIDiasABP      29899
NIMAP          30144
NISysABP       29873
Na             46690
PaCO2          44479
PaO2           44489
Platelets      46586
RespRate       37353
SaO2           48026
SysABP         21656
Temp           31038
TroponinI      50053
TroponinT      49500
Urine          14533
WBC            46924
Weight         23715
pH             44249
dtype: int64

In [43]:
# RecordIDs of validation rows that have Age < 65 at Time == 0.0.
# Rows with a NaN Age fail the comparison and are not selected.
less_than_65_validation_ids = validation_X.loc[
    validation_X["Age"].lt(65) & validation_X["Time"].eq(0.0), "RecordID"
]
# Per-column NaN counts over every row of those records.
less_than_65_validation_missing = (
    validation_X.loc[validation_X["RecordID"].isin(less_than_65_validation_ids)]
    .isna()
    .sum()
)
less_than_65_validation_missing

RecordID           0
level_1            0
Time               0
ALP            40941
ALT            40909
AST            40912
Age             2520
Albumin        41226
BUN            38674
Bilirubin      40910
Cholesterol    41797
Creatinine     38664
DiasABP        19541
FiO2           35015
GCS            27809
Gender         40984
Glucose        38803
HCO3           38742
HCT            37790
HR              4197
Height          2520
ICUType        40984
K              38497
Lactate        39993
MAP            19637
MechVent       35048
Mg             38767
NIDiasABP      24461
NIMAP          24754
NISysABP       24443
Na             38689
PaCO2          37015
PaO2           37019
Platelets      38661
RespRate       32179
SaO2           40483
SysABP         19539
Temp           26818
TroponinI      41793
TroponinT      41503
Urine          13672
WBC            38888
Weight         20467
pH             36842
dtype: int64

In [44]:
# Keep only the rows where both Height and Weight are usable for a BMI:
# neither NaN nor the -1 placeholder.
filtered_validation_X = validation_X[
    validation_X["Height"].ne(-1)
    & validation_X["Weight"].ne(-1)
    & validation_X["Height"].notna()
    & validation_X["Weight"].notna()
]

In [45]:
# Independent copy of the filtered frame with Height divided by 100
# (presumably cm -> m, going by the "_metros" suffix -- confirm units).
filtered_validation_X_metros = filtered_validation_X.assign(
    Height=filtered_validation_X["Height"].div(100)
)
filtered_validation_X_metros["Height"]

432       1.753
433       1.753
434       1.753
435       1.753
436       1.753
          ...  
575321    1.727
575322    1.727
575323    1.727
575325    1.727
575327    1.727
Name: Height, Length: 24992, dtype: float64

In [46]:
# NOTE: this binds a second name to the same frame (no copy), so the two new
# columns below also appear on filtered_validation_X_metros.
bmi_data_validation = filtered_validation_X_metros
# BMI = Weight / Height^2, rounded to one decimal place.
bmi_data_validation["BMI"] = (
    filtered_validation_X_metros["Weight"]
    .div(filtered_validation_X_metros["Height"].pow(2))
    .round(1)
)
# classify_BMI is defined in an earlier cell; it turns each BMI value into a
# category label such as "Peso normal" or "Sobrepeso".
bmi_data_validation["Classificacao"] = bmi_data_validation["BMI"].apply(classify_BMI)
bmi_data_validation.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
432,132555,0,0.0,,,,74.0,,,,...,98.0,34.8,,,35.0,,66.1,7.39,21.5,Peso normal
433,132555,1,1.0,,,,74.0,,19.0,,...,112.0,35.3,,,130.0,9.0,66.1,7.41,21.5,Peso normal
434,132555,2,2.0,,,,74.0,,,,...,104.0,36.05,,,210.0,,66.1,,21.5,Peso normal
435,132555,3,3.0,,,,74.0,,,,...,114.0,36.2,,,120.0,,66.1,,21.5,Peso normal
436,132555,4,4.0,,,,74.0,,,,...,111.0,36.1,,,185.0,,66.1,7.29,21.5,Peso normal


In [47]:
# Collapse to one row per RecordID. GroupBy.first() keeps the first non-null
# value of each column within the group, so a row here can combine values
# taken from different time steps of the same record.
bmi_data_validation = bmi_data_validation.groupby("RecordID", as_index=False).first()
bmi_data_validation

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132555,0,0.0,,,,74.0,,19.0,,...,98.0,34.800,,,35.0,9.0,66.1,7.39,21.5,Peso normal
1,132590,0,0.0,,,,58.0,,,,...,119.0,36.800,,,70.0,,98.0,,27.7,Sobrepeso
2,132599,0,0.0,,,,53.0,,,,...,,37.300,,,350.0,,73.5,,23.3,Peso normal
3,132618,0,0.0,,,,72.0,,,,...,56.0,37.475,,,,,69.1,,29.8,Sobrepeso
4,132622,0,0.0,,,,71.0,,64.0,,...,,37.400,19.0,,80.0,7.2,79.0,,30.9,Obesidade grau 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001,162724,0,0.0,,,,80.0,,,,...,,,,,,,100.0,,32.5,Obesidade grau 1
1002,162787,0,0.0,,,,75.0,,,,...,,,,,,,94.5,,32.6,Obesidade grau 1
1003,162929,0,0.0,,,,63.0,,26.0,,...,113.0,37.000,,,55.0,13.0,100.0,7.41,36.7,Obesidade grau 2
1004,162995,0,0.0,60.0,21.0,20.0,84.0,,93.0,0.4,...,121.0,37.100,0.6,,60.0,17.1,96.5,7.31,28.8,Sobrepeso


In [48]:
# Number of validation records in each BMI category (one row per RecordID at this point).
bmi_data_validation["Classificacao"].value_counts()

Classificacao
Sobrepeso           357
Peso normal         297
Obesidade grau 1    174
Obesidade grau 2     85
Obesidade grau 3     64
Baixo peso           29
Name: count, dtype: int64

In [49]:
# NOTE: despite its name, this Series holds the RecordIDs that DO have a BMI
# classification; the "undefined" group is obtained by negating the
# membership test below.
classificacao_undefined_ids_validation = bmi_data_validation.loc[:, "RecordID"]
# Per-column NaN counts over every row of the records without a classification.
classificacao_undefined_missing_validation = (
    validation_X.loc[
        ~validation_X["RecordID"].isin(classificacao_undefined_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_undefined_missing_validation

RecordID           0
level_1            0
Time               0
ALP            43038
ALT            43015
AST            43013
Age             3003
Albumin        43214
BUN            40631
Bilirubin      43005
Cholesterol    43697
Creatinine     40611
DiasABP        25712
FiO2           37536
GCS            28704
Gender         42864
Glucose        40635
HCO3           40655
HCT            39988
HR              4540
Height          3003
ICUType        42864
K              40329
Lactate        42204
MAP            25836
MechVent       37889
Mg             40694
NIDiasABP      20886
NIMAP          21207
NISysABP       20866
Na             40501
PaCO2          40595
PaO2           40594
Platelets      40863
RespRate       29326
SaO2           43312
SysABP         25710
Temp           31741
TroponinI      43682
TroponinT      43176
Urine          15605
WBC            40986
Weight         20886
pH             40548
dtype: int64

In [50]:
# RecordIDs of the validation records labelled "Baixo peso".
classificacao_baixo_peso_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"].eq("Baixo peso"), "RecordID"
]
# Per-column NaN counts over every validation_X row of those records.
classificacao_baixo_peso_missing_validation = (
    validation_X.loc[
        validation_X["RecordID"].isin(classificacao_baixo_peso_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_baixo_peso_missing_validation

RecordID          0
level_1           0
Time              0
ALP            1369
ALT            1369
AST            1369
Age              57
Albumin        1368
BUN            1282
Bilirubin      1367
Cholesterol    1388
Creatinine     1281
DiasABP         394
FiO2           1123
GCS             862
Gender         1363
Glucose        1285
HCO3           1283
HCT            1272
HR               98
Height           57
ICUType        1363
K              1272
Lactate        1309
MAP             398
MechVent       1094
Mg             1286
NIDiasABP       967
NIMAP           971
NISysABP        967
Na             1278
PaCO2          1180
PaO2           1180
Platelets      1283
RespRate       1234
SaO2           1312
SysABP          394
Temp            771
TroponinI      1389
TroponinT      1385
Urine           355
WBC            1291
Weight          673
pH             1176
dtype: int64

In [51]:
# Number of distinct RecordIDs in the "Baixo peso" validation group.
teste = pd.unique(classificacao_baixo_peso_ids_validation)
len(teste)

29

In [52]:
# RecordIDs of the validation records labelled "Peso normal".
classificacao_peso_normal_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"].eq("Peso normal"), "RecordID"
]
# Per-column NaN counts over every validation_X row of those records.
classificacao_peso_normal_missing_validation = (
    validation_X.loc[
        validation_X["RecordID"].isin(classificacao_peso_normal_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_peso_normal_missing_validation

RecordID           0
level_1            0
Time               0
ALP            14050
ALT            14044
AST            14046
Age              595
Albumin        14092
BUN            13224
Bilirubin      14041
Cholesterol    14233
Creatinine     13218
DiasABP         4732
FiO2           11907
GCS             9672
Gender         13959
Glucose        13283
HCO3           13254
HCT            12837
HR              1126
Height           595
ICUType        13959
K              13159
Lactate        13725
MAP             4803
MechVent       11941
Mg             13229
NIDiasABP       9831
NIMAP           9869
NISysABP        9825
Na             13252
PaCO2          12335
PaO2           12339
Platelets      13142
RespRate       11479
SaO2           13450
SysABP          4731
Temp            8118
TroponinI      14218
TroponinT      14127
Urine           3792
WBC            13272
Weight          6882
pH             12221
dtype: int64

In [53]:
# Number of distinct RecordIDs in the "Peso normal" validation group.
teste = pd.unique(classificacao_peso_normal_ids_validation)
len(teste)

297

In [54]:
# RecordIDs of the validation records labelled "Sobrepeso".
classificacao_sobrepeso_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"].eq("Sobrepeso"), "RecordID"
]
# Per-column NaN counts over every validation_X row of those records.
classificacao_sobrepeso_missing_validation = (
    validation_X.loc[
        validation_X["RecordID"].isin(classificacao_sobrepeso_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_sobrepeso_missing_validation

RecordID           0
level_1            0
Time               0
ALP            16845
ALT            16832
AST            16832
Age              699
Albumin        16935
BUN            15855
Bilirubin      16838
Cholesterol    17111
Creatinine     15845
DiasABP         5668
FiO2           14246
GCS            11736
Gender         16779
Glucose        15979
HCO3           15903
HCT            15261
HR              1421
Height           699
ICUType        16779
K              15832
Lactate        16309
MAP             5671
MechVent       14276
Mg             15848
NIDiasABP      11584
NIMAP          11679
NISysABP       11573
Na             15941
PaCO2          14481
PaO2           14489
Platelets      15701
RespRate       14222
SaO2           16035
SysABP          5668
Temp            9049
TroponinI      17081
TroponinT      16985
Urine           4525
WBC            15871
Weight          8552
pH             14338
dtype: int64

In [55]:
# Number of distinct RecordIDs in the "Sobrepeso" validation group.
teste = pd.unique(classificacao_sobrepeso_ids_validation)
len(teste)

357

In [56]:
# RecordIDs of the validation records labelled "Obesidade grau 1".
classificacao_obesidade_1_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"].eq("Obesidade grau 1"), "RecordID"
]
# Per-column NaN counts over every validation_X row of those records.
classificacao_obesidade_1_missing_validation = (
    validation_X.loc[
        validation_X["RecordID"].isin(classificacao_obesidade_1_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_obesidade_1_missing_validation

RecordID          0
level_1           0
Time              0
ALP            8182
ALT            8176
AST            8177
Age             316
Albumin        8229
BUN            7708
Bilirubin      8179
Cholesterol    8339
Creatinine     7707
DiasABP        2334
FiO2           6837
GCS            5856
Gender         8178
Glucose        7793
HCO3           7739
HCT            7478
HR              680
Height          316
ICUType        8178
K              7734
Lactate        7932
MAP            2371
MechVent       6834
Mg             7763
NIDiasABP      6105
NIMAP          6130
NISysABP       6105
Na             7762
PaCO2          6958
PaO2           6961
Platelets      7660
RespRate       7156
SaO2           7777
SysABP         2334
Temp           4312
TroponinI      8329
TroponinT      8259
Urine          1959
WBC            7745
Weight         3957
pH             6903
dtype: int64

In [57]:
# Number of distinct RecordIDs in the "Obesidade grau 1" validation group.
teste = pd.unique(classificacao_obesidade_1_ids_validation)
len(teste)

174

In [58]:
# RecordIDs of the validation records labelled "Obesidade grau 2".
classificacao_obesidade_2_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"].eq("Obesidade grau 2"), "RecordID"
]
# Per-column NaN counts over every validation_X row of those records.
classificacao_obesidade_2_missing_validation = (
    validation_X.loc[
        validation_X["RecordID"].isin(classificacao_obesidade_2_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_obesidade_2_missing_validation

RecordID          0
level_1           0
Time              0
ALP            4012
ALT            4011
AST            4011
Age             205
Albumin        4039
BUN            3762
Bilirubin      4007
Cholesterol    4072
Creatinine     3760
DiasABP        1154
FiO2           3261
GCS            2830
Gender         3995
Glucose        3786
HCO3           3766
HCT            3637
HR              367
Height          205
ICUType        3995
K              3756
Lactate        3835
MAP            1155
MechVent       3358
Mg             3767
NIDiasABP      2981
NIMAP          3013
NISysABP       2979
Na             3772
PaCO2          3356
PaO2           3355
Platelets      3741
RespRate       3448
SaO2           3766
SysABP         1154
Temp           2091
TroponinI      4076
TroponinT      4022
Urine          1155
WBC            3776
Weight         1934
pH             3333
dtype: int64

In [59]:
# Number of distinct RecordIDs in the "Obesidade grau 2" validation group.
teste = pd.unique(classificacao_obesidade_2_ids_validation)
len(teste)

85

In [60]:
# RecordIDs of the validation records labelled "Obesidade grau 3".
classificacao_obesidade_3_ids_validation = bmi_data_validation.loc[
    bmi_data_validation["Classificacao"].eq("Obesidade grau 3"), "RecordID"
]
# Per-column NaN counts over every validation_X row of those records.
classificacao_obesidade_3_missing_validation = (
    validation_X.loc[
        validation_X["RecordID"].isin(classificacao_obesidade_3_ids_validation)
    ]
    .isna()
    .sum()
)
classificacao_obesidade_3_missing_validation

RecordID          0
level_1           0
Time              0
ALP            3029
ALT            3027
AST            3028
Age             128
Albumin        3035
BUN            2851
Bilirubin      3026
Cholesterol    3068
Creatinine     2850
DiasABP        1204
FiO2           2487
GCS            2231
Gender         3008
Glucose        2882
HCO3           2864
HCT            2817
HR              242
Height          128
ICUType        3008
K              2847
Lactate        2916
MAP            1192
MechVent       2480
Mg             2864
NIDiasABP      2006
NIMAP          2029
NISysABP       2001
Na             2873
PaCO2          2589
PaO2           2590
Platelets      2857
RespRate       2667
SaO2           2857
SysABP         1204
Temp           1774
TroponinI      3071
TroponinT      3049
Urine           814
WBC            2871
Weight         1298
pH             2572
dtype: int64

In [61]:
# Number of distinct RecordIDs in the "Obesidade grau 3" validation group.
teste = pd.unique(classificacao_obesidade_3_ids_validation)
len(teste)

64

In [62]:
# Summary table for the validation split: one row per variable, one column per
# demographic group. Each value is the NaN count produced by `.isna().sum()`
# in the cells above.
# NOTE(review): the title says "Missing Rate" but the values are absolute
# counts, so groups of different sizes are not directly comparable.
df_missing_validation = pd.DataFrame(columns=df_columns)
# Each Series is aligned on the variable names in df_columns; insertion order
# of the mapping fixes the column order of the table.
df_missing_transpose_validation = df_missing_validation.T.assign(
    **{
        "Female": female_gender_missing_rate_validation,
        "Male": male_gender_missing_rate_validation,
        "Undefined gender": undefined_gender_missing_rate_validation,
        "ICUType 1": ICUType_1_validation_missing,
        "ICUType 2": ICUType_2_validation_missing,
        "ICUType 3": ICUType_3_validation_missing,
        "ICUType 4": ICUType_4_validation_missing,
        "Age 65+": more_than_or_equal_to_65_validation_missing,
        "Age 65-": less_than_65_validation_missing,
        "Low Weight": classificacao_baixo_peso_missing_validation,
        "Normal Weight": classificacao_peso_normal_missing_validation,
        "Overweight": classificacao_sobrepeso_missing_validation,
        "Obesity Grade 1": classificacao_obesidade_1_missing_validation,
        "Obesity Grade 2": classificacao_obesidade_2_missing_validation,
        "Obesity Grade 3": classificacao_obesidade_3_missing_validation,
        "Undefined classification": classificacao_undefined_missing_validation,
    }
)
# Remove the identifier/time rows and the Age and Gender rows from the table.
df_missing_transpose_validation = df_missing_transpose_validation.drop(
    index=["RecordID", "level_1", "Time", "Age", "Gender"]
)
# Title typo fixed: "demographcs" -> "demographics" (now matches the training table).
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>Original Missing Rate per Variable by demographics - Validation</h2>"))
df_missing_transpose_validation

Unnamed: 0,Female,Male,Undefined gender,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3,Undefined classification
ALP,42001,48428,96,13381,18132,33454,25558,49584,40941,1369,14050,16845,8182,4012,3029,43038
ALT,41975,48404,95,13366,18123,33430,25555,49565,40909,1369,14044,16832,8176,4011,3027,43015
AST,41979,48402,95,13368,18124,33428,25556,49564,40912,1369,14046,16832,8177,4011,3028,43013
Albumin,42144,48673,95,13433,18169,33649,25661,49686,41226,1368,14092,16935,8229,4039,3035,43214
BUN,39591,45631,91,12591,17081,31606,24035,46639,38674,1282,13224,15855,7708,3762,2851,40631
Bilirubin,41974,48393,96,13373,18128,33398,25564,49553,40910,1367,14041,16838,8179,4007,3026,43005
Cholesterol,42588,49224,96,13501,18238,34179,25990,50111,41797,1388,14233,17111,8339,4072,3068,43697
Creatinine,39572,45609,91,12572,17077,31591,24032,46608,38664,1281,13218,15845,7707,3760,2850,40611
DiasABP,20023,21111,64,7047,4275,21657,8219,21657,19541,394,4732,5668,2334,1154,1204,25712
FiO2,35880,41422,95,11985,15107,29352,20953,42382,35015,1123,11907,14246,6837,3261,2487,37536


<h3>Test data</h3>

In [63]:
# Test split of the preprocessed PhysioNet-2012 dataset loaded at the top of the notebook.
test_X = physionet2012_dataset["test_X"]

In [64]:
# Frequency of each Gender code among rows where Gender is recorded
# (value_counts skips NaN by default).
test_X.loc[:, "Gender"].value_counts()

Gender
 1.0    1339
 0.0    1059
-1.0       1
Name: count, dtype: int64

In [65]:
# RecordIDs taken from the rows where Gender == 0.0.
female_gender_test_ids = test_X.loc[test_X["Gender"] == 0.0, "RecordID"]
# Per-column NaN counts over every row that belongs to those RecordIDs
# (despite the `_rate` suffix these are counts, not proportions).
female_gender_missing_rate_test = (
    test_X.loc[test_X["RecordID"].isin(female_gender_test_ids)]
    .isna()
    .sum()
)
female_gender_missing_rate_test

RecordID           0
level_1            0
Time               0
ALP            50056
ALT            50026
AST            50027
Age             3262
Albumin        50212
BUN            47121
Bilirubin      50009
Cholesterol    50759
Creatinine     47100
DiasABP        25918
FiO2           43389
GCS            34766
Gender         49773
Glucose        47273
HCO3           47170
HCT            46219
HR              5189
Height          3262
ICUType        49773
K              46918
Lactate        48836
MAP            26123
MechVent       43680
Mg             47240
NIDiasABP      27646
NIMAP          28036
NISysABP       27636
Na             47158
PaCO2          45927
PaO2           45942
Platelets      47241
RespRate       36558
SaO2           49171
SysABP         25915
Temp           33600
TroponinI      50715
TroponinT      50296
Urine          16076
WBC            47471
Weight         23055
pH             45734
dtype: int64

In [66]:
# Number of distinct RecordIDs in the Gender == 0.0 group.
len(female_gender_test_ids.unique())

1059

In [67]:
# RecordIDs taken from the rows where Gender == 1.0.
male_gender_test_ids = test_X.loc[test_X["Gender"] == 1.0, "RecordID"]
# Per-column NaN counts over every row that belongs to those RecordIDs
# (despite the `_rate` suffix these are counts, not proportions).
male_gender_missing_rate_test = (
    test_X.loc[test_X["RecordID"].isin(male_gender_test_ids)]
    .isna()
    .sum()
)
male_gender_missing_rate_test

RecordID           0
level_1            0
Time               0
ALP            63231
ALT            63197
AST            63195
Age             3481
Albumin        63506
BUN            59613
Bilirubin      63185
Cholesterol    64146
Creatinine     59582
DiasABP        28089
FiO2           53748
GCS            43586
Gender         62933
Glucose        59957
HCO3           59740
HCT            58218
HR              6094
Height          3481
ICUType        62933
K              59467
Lactate        61774
MAP            28364
MechVent       54556
Mg             59732
NIDiasABP      37807
NIMAP          38099
NISysABP       37792
Na             59773
PaCO2          56508
PaO2           56522
Platelets      59485
RespRate       50239
SaO2           61551
SysABP         28085
Temp           39815
TroponinI      64151
TroponinT      63565
Urine          19093
WBC            59972
Weight         30476
pH             56088
dtype: int64

In [68]:
# Number of distinct RecordIDs in the Gender == 1.0 group.
len(male_gender_test_ids.unique())

1339

In [69]:
# RecordIDs taken from the rows where Gender carries the -1.0 placeholder.
undefined_gender_ids_test = test_X.loc[test_X["Gender"] == -1.0, "RecordID"]
# Per-column NaN counts over every row that belongs to those RecordIDs
# (despite the `_rate` suffix these are counts, not proportions).
undefined_gender_missing_rate_test = (
    test_X.loc[test_X["RecordID"].isin(undefined_gender_ids_test)]
    .isna()
    .sum()
)
undefined_gender_missing_rate_test

RecordID        0
level_1         0
Time            0
ALP            48
ALT            48
AST            48
Age            46
Albumin        48
BUN            47
Bilirubin      48
Cholesterol    48
Creatinine     47
DiasABP        48
FiO2           48
GCS            48
Gender         47
Glucose        47
HCO3           47
HCT            47
HR             48
Height         46
ICUType        47
K              47
Lactate        48
MAP            48
MechVent       48
Mg             47
NIDiasABP      48
NIMAP          48
NISysABP       48
Na             47
PaCO2          48
PaO2           48
Platelets      47
RespRate       48
SaO2           48
SysABP         48
Temp           48
TroponinI      48
TroponinT      48
Urine          48
WBC            47
Weight         47
pH             48
dtype: int64

In [70]:
# RecordIDs whose row at Time == 0.0 reports ICUType == 1.0.
ICUType_1_test_ids = test_X.loc[
    (test_X["ICUType"] == 1.0) & (test_X["Time"] == 0.0), "RecordID"
]
# Per-column NaN counts over every row that belongs to those RecordIDs.
ICUType_1_test_missing = (
    test_X.loc[test_X["RecordID"].isin(ICUType_1_test_ids)]
    .isna()
    .sum()
)
ICUType_1_test_missing

RecordID           0
level_1            0
Time               0
ALP            16787
ALT            16777
AST            16775
Age             1443
Albumin        16814
BUN            15764
Bilirubin      16773
Cholesterol    16885
Creatinine     15735
DiasABP        10482
FiO2           15140
GCS            12732
Gender         16638
Glucose        15812
HCO3           15810
HCT            15498
HR              2252
Height          1443
ICUType        16638
K              15561
Lactate        16631
MAP            10516
MechVent       15341
Mg             15786
NIDiasABP       8344
NIMAP           8363
NISysABP        8344
Na             15811
PaCO2          15699
PaO2           15700
Platelets      15762
RespRate       11297
SaO2           16260
SysABP         10482
Temp           12054
TroponinI      16938
TroponinT      16600
Urine           7366
WBC            15886
Weight          9260
pH             15673
dtype: int64

In [71]:
# Number of distinct RecordIDs in the ICUType 1 group.
len(ICUType_1_test_ids.unique())

354

In [72]:
# RecordIDs whose row at Time == 0.0 reports ICUType == 2.0.
ICUType_2_test_ids = test_X.loc[
    (test_X["ICUType"] == 2.0) & (test_X["Time"] == 0.0), "RecordID"
]
# Per-column NaN counts over every row that belongs to those RecordIDs.
ICUType_2_test_missing = (
    test_X.loc[test_X["RecordID"].isin(ICUType_2_test_ids)]
    .isna()
    .sum()
)
ICUType_2_test_missing

RecordID           0
level_1            0
Time               0
ALP            23498
ALT            23496
AST            23495
Age              864
Albumin        23555
BUN            22123
Bilirubin      23494
Cholesterol    23658
Creatinine     22114
DiasABP         4774
FiO2           19644
GCS            17225
Gender         23171
Glucose        22587
HCO3           22236
HCT            21078
HR              2009
Height           864
ICUType        23171
K              22453
Lactate        22815
MAP             4804
MechVent       19739
Mg             22183
NIDiasABP      18942
NIMAP          19002
NISysABP       18934
Na             22490
PaCO2          18893
PaO2           18912
Platelets      21713
RespRate       22893
SaO2           21111
SysABP          4773
Temp            9842
TroponinI      23648
TroponinT      23601
Urine           3842
WBC            22047
Weight         11160
pH             18396
dtype: int64

In [73]:
# Number of distinct RecordIDs in the ICUType 2 group.
len(ICUType_2_test_ids.unique())

493

In [74]:
# RecordIDs whose row at Time == 0.0 reports ICUType == 3.0.
ICUType_3_test_ids = test_X.loc[
    (test_X["ICUType"] == 3.0) & (test_X["Time"] == 0.0), "RecordID"
]
# Per-column NaN counts over every row that belongs to those RecordIDs.
ICUType_3_test_missing = (
    test_X.loc[test_X["RecordID"].isin(ICUType_3_test_ids)]
    .isna()
    .sum()
)
ICUType_3_test_missing

RecordID           0
level_1            0
Time               0
ALP            41187
ALT            41149
AST            41151
Age             3058
Albumin        41426
BUN            38911
Bilirubin      41114
Cholesterol    42054
Creatinine     38898
DiasABP        27496
FiO2           35910
GCS            30982
Gender         41219
Glucose        38912
HCO3           38894
HCT            38465
HR              4454
Height          3058
ICUType        41219
K              38651
Lactate        40489
MAP            27751
MechVent       36542
Mg             39078
NIDiasABP      17832
NIMAP          18258
NISysABP       17822
Na             38844
PaCO2          39077
PaO2           39077
Platelets      39270
RespRate       28645
SaO2           41627
SysABP         27492
Temp           30301
TroponinI      41968
TroponinT      41519
Urine          16099
WBC            39396
Weight         14679
pH             39029
dtype: int64

In [75]:
# Number of distinct RecordIDs in the ICUType 3 group.
len(ICUType_3_test_ids.unique())

877

In [76]:
# RecordIDs whose row at Time == 0.0 reports ICUType == 4.0.
ICUType_4_test_ids = test_X.loc[
    (test_X["ICUType"] == 4.0) & (test_X["Time"] == 0.0), "RecordID"
]
# Per-column NaN counts over every row that belongs to those RecordIDs.
ICUType_4_test_missing = (
    test_X.loc[test_X["RecordID"].isin(ICUType_4_test_ids)]
    .isna()
    .sum()
)
ICUType_4_test_missing

RecordID           0
level_1            0
Time               0
ALP            31863
ALT            31849
AST            31849
Age             1424
Albumin        31971
BUN            29983
Bilirubin      31861
Cholesterol    32356
Creatinine     29982
DiasABP        11303
FiO2           26491
GCS            17461
Gender         31725
Glucose        29966
HCO3           30017
HCT            29443
HR              2616
Height          1424
ICUType        31725
K              29767
Lactate        30723
MAP            11464
MechVent       26662
Mg             29972
NIDiasABP      20383
NIMAP          20560
NISysABP       20376
Na             29833
PaCO2          28814
PaO2           28823
Platelets      30028
RespRate       24010
SaO2           31772
SysABP         11301
Temp           21266
TroponinI      32360
TroponinT      32189
Urine           7910
WBC            30161
Weight         18479
pH             28772
dtype: int64

In [77]:
# Number of distinct RecordIDs in the ICUType 4 group.
len(ICUType_4_test_ids.unique())

675

In [78]:
# RecordIDs whose row at Time == 0.0 reports Age >= 65.
more_than_or_equal_to_65_test_ids = test_X.loc[
    (test_X["Age"] >= 65) & (test_X["Time"] == 0.0), "RecordID"
]
# Per-column NaN counts over every row that belongs to those RecordIDs.
more_than_or_equal_to_65_test_missing = (
    test_X.loc[test_X["RecordID"].isin(more_than_or_equal_to_65_test_ids)]
    .isna()
    .sum()
)
more_than_or_equal_to_65_test_missing

RecordID           0
level_1            0
Time               0
ALP            61391
ALT            61372
AST            61372
Age             3498
Albumin        61517
BUN            57785
Bilirubin      61356
Cholesterol    62038
Creatinine     57759
DiasABP        28988
FiO2           52365
GCS            42849
Gender         60865
Glucose        58089
HCO3           57865
HCT            56443
HR              5898
Height          3498
ICUType        60865
K              57599
Lactate        59880
MAP            29251
MechVent       53147
Mg             57897
NIDiasABP      35240
NIMAP          35548
NISysABP       35230
Na             57942
PaCO2          55178
PaO2           55197
Platelets      57719
RespRate       46915
SaO2           59471
SysABP         28984
Temp           38658
TroponinI      61996
TroponinT      61318
Urine          17764
WBC            58097
Weight         28714
pH             54822
dtype: int64

In [79]:
# Number of distinct RecordIDs in the Age >= 65 group.
len(more_than_or_equal_to_65_test_ids.unique())

1295

In [80]:
# RecordIDs whose row at Time == 0.0 reports Age < 65.
less_than_65_test_ids = test_X.loc[
    (test_X["Age"] < 65) & (test_X["Time"] == 0.0), "RecordID"
]
# Per-column NaN counts over every row that belongs to those RecordIDs.
less_than_65_test_missing = (
    test_X.loc[test_X["RecordID"].isin(less_than_65_test_ids)]
    .isna()
    .sum()
)
less_than_65_test_missing

RecordID           0
level_1            0
Time               0
ALP            51944
ALT            51899
AST            51898
Age             3291
Albumin        52249
BUN            48996
Bilirubin      51886
Cholesterol    52915
Creatinine     48970
DiasABP        25067
FiO2           44820
GCS            35551
Gender         51888
Glucose        49188
HCO3           49092
HCT            48041
HR              5433
Height          3291
ICUType        51888
K              48833
Lactate        50778
MAP            25284
MechVent       45137
Mg             49122
NIDiasABP      30261
NIMAP          30635
NISysABP       30246
Na             49036
PaCO2          47305
PaO2           47315
Platelets      49054
RespRate       39930
SaO2           51299
SysABP         25064
Temp           34805
TroponinI      52918
TroponinT      52591
Urine          17453
WBC            49393
Weight         24864
pH             47048
dtype: int64

In [81]:
# Number of distinct RecordIDs in the Age < 65 group.
len(less_than_65_test_ids.unique())

1104

In [82]:
# Keep only rows where both Height and Weight are usable: present (not NaN)
# and not equal to the -1 placeholder.
test_height_usable = test_X["Height"].notna() & test_X["Height"].ne(-1)
test_weight_usable = test_X["Weight"].notna() & test_X["Weight"].ne(-1)
filtered_test_X = test_X[test_height_usable & test_weight_usable]

In [83]:
# New frame with Height divided by 100 (the BMI formula in the next cell squares it);
# `assign` returns a copy, so `filtered_test_X` itself is left untouched.
filtered_test_X_metros = filtered_test_X.assign(
    Height=filtered_test_X["Height"] / 100
)
filtered_test_X_metros["Height"]

720       1.676
738       1.676
739       1.676
740       1.676
741       1.676
          ...  
574460    1.930
574461    1.930
574462    1.930
574463    1.930
575088    1.727
Name: Height, Length: 30366, dtype: float64

In [84]:
# `bmi_data_test` is an alias of `filtered_test_X_metros` (no copy is made), so
# the two columns added below also show up on `filtered_test_X_metros`.
bmi_data_test = filtered_test_X_metros
# BMI = Weight / Height**2, rounded to one decimal place.
# NOTE(review): classification is applied to the rounded value, which may shift
# borderline cases across a category threshold -- confirm against classify_BMI.
bmi_data_test["BMI"] = (
    bmi_data_test["Weight"] / bmi_data_test["Height"].pow(2)
).round(1)
bmi_data_test["Classificacao"] = bmi_data_test["BMI"].apply(classify_BMI)
bmi_data_test.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
720,132575,0,0.0,,,,78.0,,,,...,,,,,,12.5,63.0,7.34,22.4,Peso normal
738,132575,18,18.0,,,,78.0,,,,...,122.0,37.4,,,38.0,,72.4,,25.8,Sobrepeso
739,132575,19,19.0,,,,78.0,,,,...,121.0,37.3,,,45.0,,72.4,,25.8,Sobrepeso
740,132575,20,20.0,,,,78.0,,,,...,97.5,37.3,,,30.0,,72.4,,25.8,Sobrepeso
741,132575,21,21.0,,,,78.0,,,,...,90.0,37.2,,,20.0,,72.4,,25.8,Sobrepeso


In [85]:
# Collapse to one row per RecordID. GroupBy.first() takes, column by column,
# the first non-null value in each group, so the values in one output row may
# originate from different time steps of the same stay.
bmi_first_per_record = bmi_data_test.groupby("RecordID").first()
bmi_data_test = bmi_first_per_record.reset_index()
bmi_data_test

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132575,0,0.0,,,,78.0,,18.0,,...,122.0,37.4,,,38.0,12.5,63.0,7.34,22.4,Peso normal
1,132597,0,0.0,,,,66.0,,27.0,,...,,36.5,1.2,,,18.6,82.0,,43.6,Obesidade grau 3
2,132601,0,0.0,,,,74.0,,,,...,,,,,,,75.9,7.39,24.0,Peso normal
3,132637,0,0.0,,,,78.0,,13.0,,...,99.0,37.0,,,90.0,14.2,56.0,7.39,19.3,Peso normal
4,132658,0,0.0,71.0,9.0,42.0,81.0,,18.0,1.3,...,97.0,38.4,,,90.0,61.3,105.4,7.42,30.7,Obesidade grau 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1186,162889,0,0.0,,,,64.0,,15.0,,...,125.0,37.0,,,160.0,9.3,89.2,7.42,26.7,Sobrepeso
1187,162933,0,0.0,,,,77.0,,,,...,109.0,,,,90.0,,65.9,7.43,26.6,Sobrepeso
1188,162983,0,0.0,95.0,369.0,366.0,75.0,3.1,28.0,6.4,...,124.0,35.3,1.2,,80.0,25.0,90.0,7.33,31.1,Obesidade grau 1
1189,162987,0,0.0,,,,57.0,,,,...,92.0,36.4,,,380.0,,83.0,7.34,22.3,Peso normal


In [86]:
# Number of RecordIDs per BMI classification label (one row per RecordID at this point).
bmi_data_test.loc[:, "Classificacao"].value_counts()

Classificacao
Sobrepeso           401
Peso normal         354
Obesidade grau 1    235
Obesidade grau 2     89
Obesidade grau 3     77
Baixo peso           35
Name: count, dtype: int64

In [87]:
# Despite its name, this Series holds the RecordIDs that DO have a BMI
# classification; the "undefined" group is its complement, selected with `~`.
classificacao_undefined_ids_test = bmi_data_test["RecordID"]
# Per-column NaN counts over every row whose RecordID has no BMI classification.
classificacao_undefined_missing_test = (
    test_X.loc[~test_X["RecordID"].isin(classificacao_undefined_ids_test)]
    .isna()
    .sum()
)
classificacao_undefined_missing_test

RecordID           0
level_1            0
Time               0
ALP            57066
ALT            57022
AST            57025
Age             4343
Albumin        57261
BUN            53879
Bilirubin      57011
Cholesterol    57883
Creatinine     53852
DiasABP        34562
FiO2           49848
GCS            38815
Gender         56776
Glucose        53887
HCO3           53897
HCT            53166
HR              6424
Height          4343
ICUType        56776
K              53504
Lactate        56129
MAP            34813
MechVent       50552
Mg             54015
NIDiasABP      27177
NIMAP          27656
NISysABP       27164
Na             53731
PaCO2          53909
PaO2           53916
Platelets      54241
RespRate       38380
SaO2           57283
SysABP         34559
Temp           42539
TroponinI      57873
TroponinT      57273
Urine          20926
WBC            54395
Weight         26776
pH             53828
dtype: int64

In [88]:
# RecordIDs classified as "Baixo peso".
classificacao_baixo_peso_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"] == "Baixo peso", "RecordID"
]
# Per-column NaN counts over every test row that belongs to those RecordIDs.
classificacao_baixo_peso_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_baixo_peso_ids_test)]
    .isna()
    .sum()
)
classificacao_baixo_peso_missing_test

RecordID          0
level_1           0
Time              0
ALP            1652
ALT            1653
AST            1652
Age             116
Albumin        1658
BUN            1541
Bilirubin      1651
Cholesterol    1675
Creatinine     1541
DiasABP         713
FiO2           1416
GCS            1179
Gender         1645
Glucose        1539
HCO3           1543
HCT            1502
HR              170
Height          116
ICUType        1645
K              1532
Lactate        1595
MAP             713
MechVent       1418
Mg             1541
NIDiasABP      1065
NIMAP          1089
NISysABP       1065
Na             1540
PaCO2          1509
PaO2           1509
Platelets      1553
RespRate       1232
SaO2           1617
SysABP          712
Temp           1098
TroponinI      1677
TroponinT      1659
Urine           499
WBC            1567
Weight          690
pH             1497
dtype: int64

In [89]:
# Number of distinct RecordIDs classified as "Baixo peso".
len(classificacao_baixo_peso_ids_test.unique())

35

In [90]:
# RecordIDs classified as "Peso normal".
classificacao_normal_peso_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"] == "Peso normal", "RecordID"
]
# Per-column NaN counts over every test row that belongs to those RecordIDs.
classificacao_normal_peso_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_normal_peso_ids_test)]
    .isna()
    .sum()
)
classificacao_normal_peso_missing_test

RecordID           0
level_1            0
Time               0
ALP            16722
ALT            16719
AST            16719
Age              681
Albumin        16785
BUN            15774
Bilirubin      16717
Cholesterol    16966
Creatinine     15766
DiasABP         5798
FiO2           14131
GCS            11554
Gender         16638
Glucose        15901
HCO3           15807
HCT            15355
HR              1360
Height           681
ICUType        16638
K              15763
Lactate        16228
MAP             5902
MechVent       14115
Mg             15796
NIDiasABP      11229
NIMAP          11285
NISysABP       11221
Na             15846
PaCO2          14553
PaO2           14562
Platelets      15670
RespRate       14624
SaO2           15998
SysABP          5798
Temp            9249
TroponinI      16950
TroponinT      16858
Urine           4424
WBC            15828
Weight          8085
pH             14406
dtype: int64

In [91]:
# Number of distinct RecordIDs classified as "Peso normal".
len(classificacao_normal_peso_ids_test.unique())

354

In [92]:
# RecordIDs classified as "Sobrepeso".
classificacao_sobrepeso_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"] == "Sobrepeso", "RecordID"
]
# Per-column NaN counts over every test row that belongs to those RecordIDs.
classificacao_sobrepeso_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_sobrepeso_ids_test)]
    .isna()
    .sum()
)
classificacao_sobrepeso_missing_test

RecordID           0
level_1            0
Time               0
ALP            18956
ALT            18945
AST            18944
Age              735
Albumin        19028
BUN            17802
Bilirubin      18940
Cholesterol    19214
Creatinine     17793
DiasABP         6370
FiO2           15962
GCS            13408
Gender         18847
Glucose        17997
HCO3           17864
HCT            17243
HR              1598
Height           735
ICUType        18847
K              17859
Lactate        18415
MAP             6460
MechVent       16107
Mg             17835
NIDiasABP      12835
NIMAP          12878
NISysABP       12832
Na             17941
PaCO2          16352
PaO2           16360
Platelets      17648
RespRate       16756
SaO2           17956
SysABP          6368
Temp           10374
TroponinI      19215
TroponinT      19077
Urine           4651
WBC            17842
Weight          9269
pH             16136
dtype: int64

In [93]:
# Number of distinct RecordIDs classified as "Sobrepeso".
len(classificacao_sobrepeso_ids_test.unique())

401

In [94]:
# RecordIDs classified as "Obesidade grau 1".
classificacao_obesidade_1_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"] == "Obesidade grau 1", "RecordID"
]
# Per-column NaN counts over every test row that belongs to those RecordIDs.
classificacao_obesidade_1_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_obesidade_1_ids_test)]
    .isna()
    .sum()
)
classificacao_obesidade_1_missing_test

RecordID           0
level_1            0
Time               0
ALP            11095
ALT            11089
AST            11087
Age              510
Albumin        11165
BUN            10423
Bilirubin      11087
Cholesterol    11265
Creatinine     10421
DiasABP         3797
FiO2            9334
GCS             7822
Gender         11045
Glucose        10530
HCO3           10469
HCT            10070
HR              1023
Height           510
ICUType        11045
K              10422
Lactate        10737
MAP             3844
MechVent        9489
Mg             10453
NIDiasABP       7648
NIMAP           7694
NISysABP        7648
Na             10514
PaCO2           9494
PaO2            9496
Platelets      10339
RespRate        9344
SaO2           10534
SysABP          3797
Temp            5852
TroponinI      11252
TroponinT      11167
Urine           2576
WBC            10468
Weight          5117
pH              9383
dtype: int64

In [95]:
# Number of distinct RecordIDs classified as "Obesidade grau 1".
len(classificacao_obesidade_1_ids_test.unique())

235

In [96]:
# RecordIDs classified as "Obesidade grau 2".
classificacao_obesidade_2_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"] == "Obesidade grau 2", "RecordID"
]
# Per-column NaN counts over every test row that belongs to those RecordIDs.
classificacao_obesidade_2_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_obesidade_2_ids_test)]
    .isna()
    .sum()
)
classificacao_obesidade_2_missing_test

RecordID          0
level_1           0
Time              0
ALP            4189
ALT            4189
AST            4189
Age             229
Albumin        4209
BUN            3941
Bilirubin      4182
Cholesterol    4265
Creatinine     3934
DiasABP        1647
FiO2           3579
GCS            3082
Gender         4183
Glucose        3966
HCO3           3945
HCT            3817
HR              412
Height          229
ICUType        4183
K              3926
Lactate        4075
MAP            1632
MechVent       3624
Mg             3955
NIDiasABP      2783
NIMAP          2800
NISysABP       2782
Na             3960
PaCO2          3665
PaO2           3667
Platelets      3910
RespRate       3318
SaO2           3986
SysABP         1646
Temp           2355
TroponinI      4261
TroponinT      4213
Urine          1180
WBC            3948
Weight         1917
pH             3644
dtype: int64

In [97]:
# Number of distinct RecordIDs classified as "Obesidade grau 2".
len(classificacao_obesidade_2_ids_test.unique())

89

In [98]:
# RecordIDs classified as "Obesidade grau 3".
classificacao_obesidade_3_ids_test = bmi_data_test.loc[
    bmi_data_test["Classificacao"] == "Obesidade grau 3", "RecordID"
]
# Per-column NaN counts over every test row that belongs to those RecordIDs.
classificacao_obesidade_3_missing_test = (
    test_X.loc[test_X["RecordID"].isin(classificacao_obesidade_3_ids_test)]
    .isna()
    .sum()
)
classificacao_obesidade_3_missing_test

RecordID          0
level_1           0
Time              0
ALP            3655
ALT            3654
AST            3654
Age             175
Albumin        3660
BUN            3421
Bilirubin      3654
Cholesterol    3685
Creatinine     3422
DiasABP        1168
FiO2           2915
GCS            2540
Gender         3619
Glucose        3457
HCO3           3432
HCT            3331
HR              344
Height          175
ICUType        3619
K              3426
Lactate        3479
MAP            1171
MechVent       2979
Mg             3424
NIDiasABP      2764
NIMAP          2781
NISysABP       2764
Na             3446
PaCO2          3001
PaO2           3002
Platelets      3412
RespRate       3191
SaO2           3396
SysABP         1168
Temp           1996
TroponinI      3686
TroponinT      3662
Urine           961
WBC            3442
Weight         1724
pH             2976
dtype: int64

In [99]:
# Number of distinct RecordIDs classified as "Obesidade grau 3".
len(classificacao_obesidade_3_ids_test.unique())

77

In [100]:
# Summary table for the test split: one row per variable, one column per
# demographic group, each cell taken from that group's per-variable NaN counts
# (`isna().sum()` in the cells above).
# NOTE(review): the title says "Missing Rate", but the values are raw counts,
# so groups of different sizes are not directly comparable -- confirm intent.
test_missing_by_group = {
    "Female": female_gender_missing_rate_test,
    "Male": male_gender_missing_rate_test,
    "Undefined gender": undefined_gender_missing_rate_test,
    "ICUType 1": ICUType_1_test_missing,
    "ICUType 2": ICUType_2_test_missing,
    "ICUType 3": ICUType_3_test_missing,
    "ICUType 4": ICUType_4_test_missing,
    "Age 65+": more_than_or_equal_to_65_test_missing,
    "Age 65-": less_than_65_test_missing,
    "Low Weight": classificacao_baixo_peso_missing_test,
    "Normal Weight": classificacao_normal_peso_missing_test,
    "Overweight": classificacao_sobrepeso_missing_test,
    "Obesity Grade 1": classificacao_obesidade_1_missing_test,
    "Obesity Grade 2": classificacao_obesidade_2_missing_test,
    "Obesity Grade 3": classificacao_obesidade_3_missing_test,
    "Undefined classification": classificacao_undefined_missing_test,
}

# Empty frame whose index is the variable list; each assignment below aligns
# the group's Series on that index.
df_missing_test = pd.DataFrame(columns=df_columns)
df_missing_transpose_test = df_missing_test.T
for group_label, group_missing in test_missing_by_group.items():
    df_missing_transpose_test[group_label] = group_missing

# Identifier/time columns and the Age/Gender rows are not reported in the table.
df_missing_transpose_test = df_missing_transpose_test.drop(
    ["RecordID", "level_1", "Time", "Age", "Gender"], axis=0
)
# Title capitalisation aligned with the validation table's title.
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>Original Missing Rate per Variable by demographics - Test</h2>"))
df_missing_transpose_test

Unnamed: 0,Female,Male,Undefined gender,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3,Undefined classification
ALP,50056,63231,48,16787,23498,41187,31863,61391,51944,1652,16722,18956,11095,4189,3655,57066
ALT,50026,63197,48,16777,23496,41149,31849,61372,51899,1653,16719,18945,11089,4189,3654,57022
AST,50027,63195,48,16775,23495,41151,31849,61372,51898,1652,16719,18944,11087,4189,3654,57025
Albumin,50212,63506,48,16814,23555,41426,31971,61517,52249,1658,16785,19028,11165,4209,3660,57261
BUN,47121,59613,47,15764,22123,38911,29983,57785,48996,1541,15774,17802,10423,3941,3421,53879
Bilirubin,50009,63185,48,16773,23494,41114,31861,61356,51886,1651,16717,18940,11087,4182,3654,57011
Cholesterol,50759,64146,48,16885,23658,42054,32356,62038,52915,1675,16966,19214,11265,4265,3685,57883
Creatinine,47100,59582,47,15735,22114,38898,29982,57759,48970,1541,15766,17793,10421,3934,3422,53852
DiasABP,25918,28089,48,10482,4774,27496,11303,28988,25067,713,5798,6370,3797,1647,1168,34562
FiO2,43389,53748,48,15140,19644,35910,26491,52365,44820,1416,14131,15962,9334,3579,2915,49848
