In [2]:
import os
import sys
import pandas as pd
from IPython.display import display, HTML
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

<h2>Loading dataset</h2>

In [3]:
from pypotsModify.benchpots.datasets import preprocess_physionet2012
physionet2012_dataset = preprocess_physionet2012(subset="all", rate=0.1)

2024-11-28 23:37:47 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-11-28 23:37:47 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-11-28 23:37:47 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-11-28 23:37:47 [INFO]: Loaded successfully!


<h3>Training data</h3>

<h4>Loading training dataset</h4>

In [5]:
train_X = physionet2012_dataset['train_X']

In [6]:
female_gender_ids = train_X[train_X["Gender"] == 0.0]
female_gender_ids = female_gender_ids["RecordID"]
female_gender_missing_rate = train_X[train_X["RecordID"].isin(female_gender_ids)]
female_gender_missing_rate = female_gender_missing_rate.isna().sum()
female_gender_missing_rate

RecordID            0
level_1             0
Time                0
ALP            157139
ALT            157069
AST            157070
Age              9305
Albumin        157729
BUN            148262
Bilirubin      157033
Cholesterol    159472
Creatinine     148212
DiasABP         77793
FiO2           134887
GCS            108162
Gender         156416
Glucose        148806
HCO3           148443
HCT            144993
HR              15371
Height           9305
ICUType        156416
K              147623
Lactate        153435
MAP             78372
MechVent       135310
Mg             148511
NIDiasABP       88837
NIMAP           89889
NISysABP        88779
Na             148344
PaCO2          142459
PaO2           142477
Platelets      148376
RespRate       117105
SaO2           153646
SysABP          77787
Temp           102704
TroponinI      159380
TroponinT      158066
Urine           49123
WBC            149171
Weight          74139
pH             141836
dtype: int64

In [7]:
female_gender_measurements_training = train_X[train_X["RecordID"].isin(female_gender_ids)]
female_gender_measurements_training = female_gender_measurements_training.count()
female_gender_measurements_training

RecordID       159744
level_1        159744
Time           159744
ALP              2605
ALT              2675
AST              2674
Age            150439
Albumin          2015
BUN             11482
Bilirubin        2711
Cholesterol       272
Creatinine      11532
DiasABP         81951
FiO2            24857
GCS             51582
Gender           3328
Glucose         10938
HCO3            11301
HCT             14751
HR             144373
Height         150439
ICUType          3328
K               12121
Lactate          6309
MAP             81372
MechVent        24434
Mg              11233
NIDiasABP       70907
NIMAP           69855
NISysABP        70965
Na              11400
PaCO2           17285
PaO2            17267
Platelets       11368
RespRate        42639
SaO2             6098
SysABP          81957
Temp            57040
TroponinI         364
TroponinT        1678
Urine          110621
WBC             10573
Weight          85605
pH              17908
dtype: int64

In [8]:
male_gender_ids = train_X[train_X["Gender"] == 1.0]
male_gender_ids = male_gender_ids["RecordID"]
male_gender_missing_rate = train_X[train_X["RecordID"].isin(male_gender_ids)]
male_gender_missing_rate = male_gender_missing_rate.isna().sum()
male_gender_missing_rate

RecordID            0
level_1             0
Time                0
ALP            204737
ALT            204637
AST            204635
Age             12214
Albumin        205609
BUN            193084
Bilirubin      204615
Cholesterol    207904
Creatinine     193024
DiasABP         91082
FiO2           175082
GCS            142178
Gender         203933
Glucose        194079
HCO3           193463
HCT            187838
HR              20755
Height          12214
ICUType        203933
K              192423
Lactate        199298
MAP             91648
MechVent       176694
Mg             193381
NIDiasABP      124237
NIMAP          125417
NISysABP       124153
Na             193538
PaCO2          182886
PaO2           182928
Platelets      192466
RespRate       161952
SaO2           199361
SysABP          91072
Temp           127835
TroponinI      207925
TroponinT      205987
Urine           64736
WBC            194003
Weight          99905
pH             181589
dtype: int64

In [10]:
male_gender_measurements_training = train_X[train_X["RecordID"].isin(male_gender_ids)]
male_gender_measurements_training  = male_gender_measurements_training.count()
male_gender_measurements_training 

RecordID       208272
level_1        208272
Time           208272
ALP              3535
ALT              3635
AST              3637
Age            196058
Albumin          2663
BUN             15188
Bilirubin        3657
Cholesterol       368
Creatinine      15248
DiasABP        117190
FiO2            33190
GCS             66094
Gender           4339
Glucose         14193
HCO3            14809
HCT             20434
HR             187517
Height         196058
ICUType          4339
K               15849
Lactate          8974
MAP            116624
MechVent        31578
Mg              14891
NIDiasABP       84035
NIMAP           82855
NISysABP        84119
Na              14734
PaCO2           25386
PaO2            25344
Platelets       15806
RespRate        46320
SaO2             8911
SysABP         117200
Temp            80437
TroponinI         347
TroponinT        2285
Urine          143536
WBC             14269
Weight         108367
pH              26683
dtype: int64

In [11]:
undefined_gender_ids = train_X[train_X["Gender"] == -1.0]
undefined_gender_ids = undefined_gender_ids["RecordID"]
undefined_gender_missing_rate = train_X[train_X["RecordID"].isin(undefined_gender_ids)]
undefined_gender_missing_rate = undefined_gender_missing_rate.isna().sum()
undefined_gender_missing_rate

RecordID         0
level_1          0
Time             0
ALP            189
ALT            189
AST            189
Age              6
Albumin        191
BUN            180
Bilirubin      189
Cholesterol    192
Creatinine     180
DiasABP         66
FiO2           176
GCS            119
Gender         188
Glucose        181
HCO3           180
HCT            182
HR               9
Height           6
ICUType        188
K              181
Lactate        190
MAP             68
MechVent       166
Mg             181
NIDiasABP      101
NIMAP          101
NISysABP       101
Na             181
PaCO2          183
PaO2           183
Platelets      182
RespRate       150
SaO2           191
SysABP          66
Temp           147
TroponinI      192
TroponinT      192
Urine           87
WBC            181
Weight          65
pH             182
dtype: int64

In [14]:
undefined_gender_measurements_training = train_X[train_X["RecordID"].isin(undefined_gender_ids)]
undefined_gender_measurements_training = undefined_gender_measurements_training.count()
undefined_gender_measurements_training

RecordID       192
level_1        192
Time           192
ALP              3
ALT              3
AST              3
Age            186
Albumin          1
BUN             12
Bilirubin        3
Cholesterol      0
Creatinine      12
DiasABP        126
FiO2            16
GCS             73
Gender           4
Glucose         11
HCO3            12
HCT             10
HR             183
Height         186
ICUType          4
K               11
Lactate          2
MAP            124
MechVent        26
Mg              11
NIDiasABP       91
NIMAP           91
NISysABP        91
Na              11
PaCO2            9
PaO2             9
Platelets       10
RespRate        42
SaO2             1
SysABP         126
Temp            45
TroponinI        0
TroponinT        0
Urine          105
WBC             11
Weight         127
pH              10
dtype: int64

In [16]:
ICUType_1_training_ids = train_X[train_X['ICUType'] == 1.0]
ICUType_1_training_ids = ICUType_1_training_ids[ICUType_1_training_ids["Time"] == 0.0]
ICUType_1_training_ids = ICUType_1_training_ids["RecordID"]
ICUType_1_training = train_X[train_X["RecordID"].isin(ICUType_1_training_ids)]
ICUType_1_training_missing = ICUType_1_training.isna().sum()
ICUType_1_training_missing

RecordID           0
level_1            0
Time               0
ALP            53008
ALT            52953
AST            52952
Age             4316
Albumin        53210
BUN            49927
Bilirubin      52971
Cholesterol    53481
Creatinine     49866
DiasABP        31670
FiO2           47637
GCS            39903
Gender         52687
Glucose        50098
HCO3           50063
HCT            49101
HR              6680
Height          4316
ICUType        52687
K              49298
Lactate        52525
MAP            31771
MechVent       48033
Mg             49942
NIDiasABP      27702
NIMAP          27843
NISysABP       27684
Na             50076
PaCO2          49346
PaO2           49344
Platelets      49960
RespRate       35513
SaO2           51101
SysABP         31669
Temp           37707
TroponinI      53641
TroponinT      52634
Urine          23292
WBC            50335
Weight         29479
pH             49251
dtype: int64

In [17]:
ICUType_1_measurements_training = ICUType_1_training.count() 
ICUType_1_measurements_training

RecordID       53808
level_1        53808
Time           53808
ALP              800
ALT              855
AST              856
Age            49492
Albumin          598
BUN             3881
Bilirubin        837
Cholesterol      327
Creatinine      3942
DiasABP        22138
FiO2            6171
GCS            13905
Gender          1121
Glucose         3710
HCO3            3745
HCT             4707
HR             47128
Height         49492
ICUType         1121
K               4510
Lactate         1283
MAP            22037
MechVent        5775
Mg              3866
NIDiasABP      26106
NIMAP          25965
NISysABP       26124
Na              3732
PaCO2           4462
PaO2            4464
Platelets       3848
RespRate       18295
SaO2            2707
SysABP         22139
Temp           16101
TroponinI        167
TroponinT       1174
Urine          30516
WBC             3473
Weight         24329
pH              4557
dtype: int64

In [18]:
ICUType_2_training_ids = train_X[train_X['ICUType'] == 2.0]
ICUType_2_training_ids = ICUType_2_training_ids[ICUType_2_training_ids["Time"] == 0.0]
ICUType_2_training_ids = ICUType_2_training_ids["RecordID"]
ICUType_2_training = train_X[train_X["RecordID"].isin(ICUType_2_training_ids)]
ICUType_2_training_missing = ICUType_2_training.isna().sum()
ICUType_2_training_missing

RecordID           0
level_1            0
Time               0
ALP            78109
ALT            78093
AST            78093
Age             2995
Albumin        78291
BUN            73558
Bilirubin      78101
Cholesterol    78650
Creatinine     73541
DiasABP        17015
FiO2           65295
GCS            57842
Gender         77033
Glucose        74972
HCO3           73942
HCT            70065
HR              6837
Height          2995
ICUType        77033
K              74467
Lactate        75918
MAP            16971
MechVent       65530
Mg             73693
NIDiasABP      61794
NIMAP          61919
NISysABP       61753
Na             74686
PaCO2          62826
PaO2           62872
Platelets      72291
RespRate       75467
SaO2           70159
SysABP         17013
Temp           32794
TroponinI      78584
TroponinT      78471
Urine          12956
WBC            73351
Weight         37944
pH             61347
dtype: int64

In [20]:
ICUType_2_measurements_training = ICUType_2_training.count() 
ICUType_2_measurements_training

RecordID       78672
level_1        78672
Time           78672
ALP              563
ALT              579
AST              579
Age            75677
Albumin          381
BUN             5114
Bilirubin        571
Cholesterol       22
Creatinine      5131
DiasABP        61657
FiO2           13377
GCS            20830
Gender          1639
Glucose         3700
HCO3            4730
HCT             8607
HR             71835
Height         75677
ICUType         1639
K               4205
Lactate         2754
MAP            61701
MechVent       13142
Mg              4979
NIDiasABP      16878
NIMAP          16753
NISysABP       16919
Na              3986
PaCO2          15846
PaO2           15800
Platelets       6381
RespRate        3205
SaO2            8513
SysABP         61659
Temp           45878
TroponinI         88
TroponinT        201
Urine          65716
WBC             5321
Weight         40728
pH             17325
dtype: int64

In [21]:
ICUType_3_training_ids = train_X[train_X['ICUType'] == 3.0]
ICUType_3_training_ids = ICUType_3_training_ids[ICUType_3_training_ids["Time"] == 0.0]
ICUType_3_training_ids = ICUType_3_training_ids["RecordID"]
ICUType_3_training = train_X[train_X["RecordID"].isin(ICUType_3_training_ids)]
ICUType_3_training_missing = ICUType_3_training.isna().sum()
ICUType_3_training_missing

RecordID            0
level_1             0
Time                0
ALP            128116
ALT            128042
AST            128043
Age              9357
Albumin        128787
BUN            121177
Bilirubin      127925
Cholesterol    130803
Creatinine     121148
DiasABP         84975
FiO2           111946
GCS             96074
Gender         128216
Glucose        121192
HCO3           121131
HCT            119208
HR              13944
Height           9357
ICUType        128216
K              120329
Lactate        125847
MAP             85584
MechVent       113219
Mg             121725
NIDiasABP       56137
NIMAP           57614
NISysABP        56092
Na             120902
PaCO2          120969
PaO2           120950
Platelets      121973
RespRate        89507
SaO2           129348
SysABP          84968
Temp            93284
TroponinI      130638
TroponinT      129182
Urine           52077
WBC            122347
Weight          46164
pH             120807
dtype: int64

In [22]:
ICUType_3_measurements_training = ICUType_3_training.count() 
ICUType_3_measurements_training

RecordID       130944
level_1        130944
Time           130944
ALP              2828
ALT              2902
AST              2901
Age            121587
Albumin          2157
BUN              9767
Bilirubin        3019
Cholesterol       141
Creatinine       9796
DiasABP         45969
FiO2            18998
GCS             34870
Gender           2728
Glucose          9752
HCO3             9813
HCT             11736
HR             117000
Height         121587
ICUType          2728
K               10615
Lactate          5097
MAP             45360
MechVent        17725
Mg               9219
NIDiasABP       74807
NIMAP           73330
NISysABP        74852
Na              10042
PaCO2            9975
PaO2             9994
Platelets        8971
RespRate        41437
SaO2             1596
SysABP          45976
Temp            37660
TroponinI         306
TroponinT        1762
Urine           78867
WBC              8597
Weight          84780
pH              10137
dtype: int64

In [23]:
ICUType_4_training_ids = train_X[train_X['ICUType'] == 4.0]
ICUType_4_training_ids = ICUType_4_training_ids[ICUType_4_training_ids["Time"] == 0.0]
ICUType_4_training_ids = ICUType_4_training_ids["RecordID"]
ICUType_4_training = train_X[train_X["RecordID"].isin(ICUType_4_training_ids)]
ICUType_4_training_missing = ICUType_4_training.isna().sum()
ICUType_4_training_missing

RecordID            0
level_1             0
Time                0
ALP            102832
ALT            102807
AST            102806
Age              4857
Albumin        103241
BUN             96864
Bilirubin      102840
Cholesterol    104634
Creatinine      96861
DiasABP         35281
FiO2            85267
GCS             56640
Gender         102601
Glucose         96804
HCO3            96950
HCT             94639
HR               8674
Height           4857
ICUType        102601
K               96133
Lactate         98633
MAP             35762
MechVent        85388
Mg              96713
NIDiasABP       67542
NIMAP           68031
NISysABP        67504
Na              96399
PaCO2           92387
PaO2            92422
Platelets       96800
RespRate        78720
SaO2           102590
SysABP          35275
Temp            66901
TroponinI      104634
TroponinT      103958
Urine           25621
WBC             97322
Weight          60522
pH              92202
dtype: int64

In [24]:
ICUType_4_measurements_training = ICUType_4_training.count() 
ICUType_4_measurements_training 

RecordID       104784
level_1        104784
Time           104784
ALP              1952
ALT              1977
AST              1978
Age             99927
Albumin          1543
BUN              7920
Bilirubin        1944
Cholesterol       150
Creatinine       7923
DiasABP         69503
FiO2            19517
GCS             48144
Gender           2183
Glucose          7980
HCO3             7834
HCT             10145
HR              96110
Height          99927
ICUType          2183
K                8651
Lactate          6151
MAP             69022
MechVent        19396
Mg               8071
NIDiasABP       37242
NIMAP           36753
NISysABP        37280
Na               8385
PaCO2           12397
PaO2            12362
Platelets        7984
RespRate        26064
SaO2             2194
SysABP          69509
Temp            37883
TroponinI         150
TroponinT         826
Urine           79163
WBC              7462
Weight          44262
pH              12582
dtype: int64

In [27]:
more_than_or_equal_to_65_train_ids = train_X[train_X["Age"] >= 65]
more_than_or_equal_to_65_train_ids = more_than_or_equal_to_65_train_ids[more_than_or_equal_to_65_train_ids["Time"] == 0.0]
more_than_or_equal_to_65_train_ids = more_than_or_equal_to_65_train_ids["RecordID"]
more_than_or_equal_to_65_train = train_X[train_X["RecordID"].isin(more_than_or_equal_to_65_train_ids)]
more_than_or_equal_to_65_train_missing = more_than_or_equal_to_65_train.isna().sum()
more_than_or_equal_to_65_train_missing

RecordID            0
level_1             0
Time                0
ALP            198842
ALT            198776
AST            198767
Age             11039
Albumin        199296
BUN            187352
Bilirubin      198720
Cholesterol    201170
Creatinine     187289
DiasABP         91170
FiO2           169665
GCS            138779
Gender         197353
Glucose        188371
HCO3           187663
HCT            182485
HR              18681
Height          11039
ICUType        197353
K              186737
Lactate        193618
MAP             91761
MechVent       171392
Mg             187563
NIDiasABP      116172
NIMAP          117236
NISysABP       116098
Na             187807
PaCO2          177968
PaO2           178009
Platelets      186992
RespRate       151858
SaO2           192251
SysABP          91162
Temp           123180
TroponinI      201044
TroponinT      198819
Urine           58986
WBC            188267
Weight          93903
pH             176870
dtype: int64

In [28]:
age_65_and_above_measurements_training = more_than_or_equal_to_65_train.count()
age_65_and_above_measurements_training

RecordID       201552
level_1        201552
Time           201552
ALP              2710
ALT              2776
AST              2785
Age            190513
Albumin          2256
BUN             14200
Bilirubin        2832
Cholesterol       382
Creatinine      14263
DiasABP        110382
FiO2            31887
GCS             62773
Gender           4199
Glucose         13181
HCO3            13889
HCT             19067
HR             182871
Height         190513
ICUType          4199
K               14815
Lactate          7934
MAP            109791
MechVent        30160
Mg              13989
NIDiasABP       85380
NIMAP           84316
NISysABP        85454
Na              13745
PaCO2           23584
PaO2            23543
Platelets       14560
RespRate        49694
SaO2             9301
SysABP         110390
Temp            78372
TroponinI         508
TroponinT        2733
Urine          142566
WBC             13285
Weight         107649
pH              24682
dtype: int64

In [30]:
less_than_65_train_ids = train_X[train_X["Age"] < 65]
less_than_65_train_ids = less_than_65_train_ids[less_than_65_train_ids["Time"] == 0.0]
less_than_65_train_ids = less_than_65_train_ids["RecordID"]
less_than_65_train = train_X[train_X["RecordID"].isin(less_than_65_train_ids)]
less_than_65_train_missing = less_than_65_train.isna().sum()
less_than_65_train_missing


RecordID            0
level_1             0
Time                0
ALP            163223
ALT            163119
AST            163127
Age             10486
Albumin        164233
BUN            154174
Bilirubin      163117
Cholesterol    166398
Creatinine     154127
DiasABP         77771
FiO2           140480
GCS            111680
Gender         163184
Glucose        154695
HCO3           154423
HCT            150528
HR              17454
Height          10486
ICUType        163184
K              153490
Lactate        159305
MAP             78327
MechVent       140778
Mg             154510
NIDiasABP       97003
NIMAP           98171
NISysABP        96935
Na             154256
PaCO2          147560
PaO2           147579
Platelets      154032
RespRate       127349
SaO2           160947
SysABP          77763
Temp           107506
TroponinI      166453
TroponinT      165426
Urine           54960
WBC            155088
Weight          80206
pH             146737
dtype: int64

In [31]:
age_under_65_measurements_training  = less_than_65_train_missing = less_than_65_train.isna().sum()
less_than_65_train_missing .count()
age_under_65_measurements_training 

RecordID            0
level_1             0
Time                0
ALP            163223
ALT            163119
AST            163127
Age             10486
Albumin        164233
BUN            154174
Bilirubin      163117
Cholesterol    166398
Creatinine     154127
DiasABP         77771
FiO2           140480
GCS            111680
Gender         163184
Glucose        154695
HCO3           154423
HCT            150528
HR              17454
Height          10486
ICUType        163184
K              153490
Lactate        159305
MAP             78327
MechVent       140778
Mg             154510
NIDiasABP       97003
NIMAP           98171
NISysABP        96935
Na             154256
PaCO2          147560
PaO2           147579
Platelets      154032
RespRate       127349
SaO2           160947
SysABP          77763
Temp           107506
TroponinI      166453
TroponinT      165426
Urine           54960
WBC            155088
Weight          80206
pH             146737
dtype: int64

In [117]:
filtered_train_X = train_X[(train_X['Height'] != -1) & (train_X['Weight'] != -1) & (train_X['Height'].notna()) & (train_X['Weight'].notna())] 

In [118]:
def classify_BMI(BMI):
    if BMI <= 18.5:
        return "Baixo peso"
    elif BMI >= 18.6 and BMI <= 24.9:
        return "Peso normal"
    elif BMI >= 25 and BMI <= 29.9:
        return "Sobrepeso"
    elif BMI >= 30 and BMI <= 34.9:
        return "Obesidade grau 1"
    elif BMI >= 35 and BMI <= 39.9:
        return "Obesidade grau 2"
    elif BMI >= 40:
        return "Obesidade grau 3"

In [119]:
filtered_train_X_metros = filtered_train_X.copy()
filtered_train_X_metros["Height"] = filtered_train_X["Height"]/100
filtered_train_X_metros["Height"]

144       1.803
145       1.803
146       1.803
147       1.803
148       1.803
          ...  
575321    1.727
575322    1.727
575323    1.727
575325    1.727
575327    1.727
Name: Height, Length: 101297, dtype: float64

In [120]:
bmi_data_train = filtered_train_X_metros
bmi_data_train["BMI"] = round(filtered_train_X_metros["Weight"] / (filtered_train_X_metros["Height"]**2), 1)
bmi_data_train["Classificacao"] = bmi_data_train["BMI"].apply(classify_BMI)
bmi_data_train.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
144,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,,36.3,,,,11.5,84.6,,26.0,Sobrepeso
145,132543,1,1.0,,,,68.0,,,,...,,,,,,,84.6,,26.0,Sobrepeso
146,132543,2,2.0,,,,68.0,,,,...,,,,,,,84.6,,26.0,Sobrepeso
147,132543,3,3.0,,,,68.0,,,,...,,36.4,,,,,84.6,,26.0,Sobrepeso
148,132543,4,4.0,,,,68.0,,,,...,,,,,,,84.6,,26.0,Sobrepeso


In [121]:
bmi_data_train = bmi_data_train.groupby("RecordID").first().reset_index()
bmi_data_train

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,,36.3,,,600.0,11.5,84.6,,26.0,Sobrepeso
1,132548,0,0.0,,,,68.0,,32.0,,...,205.00,36.3,0.7,,120.0,6.2,87.0,,32.9,Obesidade grau 1
2,132551,0,0.0,47.0,46.0,82.0,78.0,1.9,81.0,0.3,...,102.75,38.0,3.5,,120.0,16.1,48.4,7.40,18.3,Baixo peso
3,132568,0,0.0,,,,66.0,,18.0,,...,,36.1,,,220.0,14.8,84.5,,34.1,Obesidade grau 1
4,132570,0,0.0,19.0,15.0,20.0,84.0,,83.0,0.1,...,,36.6,,,600.0,8.8,102.6,,35.4,Obesidade grau 2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4030,163008,0,0.0,,,,59.0,,24.0,,...,97.00,37.6,,,45.0,6.9,98.5,7.38,34.0,Obesidade grau 1
4031,163013,0,0.0,82.0,11.0,30.0,74.0,2.5,30.0,1.2,...,118.00,36.5,,0.03,40.0,9.6,68.6,7.35,29.5,Sobrepeso
4032,163021,0,0.0,,,,72.0,,9.0,,...,,,,,,8.6,62.0,,20.8,Peso normal
4033,163029,0,0.0,,,,61.0,,,,...,,,,,,,85.0,,28.5,Sobrepeso


In [122]:
bmi_data_train["Classificacao"].value_counts()

Classificacao
Sobrepeso           1374
Peso normal         1207
Obesidade grau 1     753
Obesidade grau 3     291
Obesidade grau 2     280
Baixo peso           130
Name: count, dtype: int64

In [123]:
classificacao_undefined_ids = bmi_data_train["RecordID"]
classificacao_undefined_missing = train_X[~train_X["RecordID"].isin(classificacao_undefined_ids)]
classificacao_undefined_missing = classificacao_undefined_missing.isna().sum()
classificacao_undefined_missing

RecordID            0
level_1             0
Time                0
ALP            171667
ALT            171578
AST            171575
Age             13815
Albumin        172206
BUN            162293
Bilirubin      171546
Cholesterol    174243
Creatinine     162237
DiasABP        103807
FiO2           150907
GCS            116091
Gender         170892
Glucose        162317
HCO3           162370
HCT            159801
HR              20171
Height          13815
ICUType        170892
K              161076
Lactate        168572
MAP            104492
MechVent       152473
Mg             162584
NIDiasABP       83061
NIMAP           84438
NISysABP        82990
Na             161788
PaCO2          162215
PaO2           162246
Platelets      163180
RespRate       114987
SaO2           172474
SysABP         103797
Temp           127743
TroponinI      174217
TroponinT      172385
Urine           64440
WBC            163689
Weight          82708
pH             161989
dtype: int64

In [124]:
classificacao_baixo_peso_ids = bmi_data_train[bmi_data_train["Classificacao"] == "Baixo peso"]
#classificacao_baixo_peso_ids = classificacao_baixo_peso_ids[classificacao_baixo_peso_ids["Time"] == 0.0]
classificacao_baixo_peso_ids = classificacao_baixo_peso_ids["RecordID"]
classificacao_baixo_peso_missing = train_X[train_X["RecordID"].isin(classificacao_baixo_peso_ids)]
classificacao_baixo_peso_missing = classificacao_baixo_peso_missing.isna().sum()
classificacao_baixo_peso_missing

RecordID          0
level_1           0
Time              0
ALP            6153
ALT            6151
AST            6150
Age             257
Albumin        6169
BUN            5783
Bilirubin      6151
Cholesterol    6232
Creatinine     5781
DiasABP        2276
FiO2           5194
GCS            4289
Gender         6110
Glucose        5802
HCO3           5786
HCT            5676
HR              469
Height          257
ICUType        6110
K              5757
Lactate        5921
MAP            2256
MechVent       5165
Mg             5779
NIDiasABP      4016
NIMAP          4073
NISysABP       4016
Na             5799
PaCO2          5367
PaO2           5362
Platelets      5795
RespRate       5279
SaO2           5878
SysABP         2276
Temp           3611
TroponinI      6209
TroponinT      6174
Urine          1734
WBC            5831
Weight         2994
pH             5330
dtype: int64

In [125]:
teste = classificacao_baixo_peso_ids.unique()
teste.size

130

In [126]:
classificacao_normal_peso_ids = bmi_data_train[bmi_data_train["Classificacao"] == "Peso normal"]
#classificacao_normal_peso_ids = classificacao_normal_peso_ids[classificacao_normal_peso_ids["Time"] == 0.0]
classificacao_normal_peso_ids = classificacao_normal_peso_ids["RecordID"]
classificacao_normal_peso_missing = train_X[train_X["RecordID"].isin(classificacao_normal_peso_ids)]
classificacao_normal_peso_missing = classificacao_normal_peso_missing.isna().sum()
classificacao_normal_peso_missing

RecordID           0
level_1            0
Time               0
ALP            57062
ALT            57037
AST            57040
Age             2393
Albumin        57261
BUN            53754
Bilirubin      57041
Cholesterol    57834
Creatinine     53738
DiasABP        20209
FiO2           48274
GCS            39266
Gender         56729
Glucose        54122
HCO3           53891
HCT            52032
HR              4809
Height          2393
ICUType        56729
K              53629
Lactate        55278
MAP            20369
MechVent       48315
Mg             53755
NIDiasABP      38235
NIMAP          38388
NISysABP       38213
Na             53965
PaCO2          49558
PaO2           49582
Platelets      53368
RespRate       48447
SaO2           54538
SysABP         20209
Temp           32330
TroponinI      57791
TroponinT      57377
Urine          15538
WBC            53885
Weight         28238
pH             49048
dtype: int64

In [127]:
teste = classificacao_normal_peso_ids.unique()
teste.size

1207

In [128]:
classificacao_sobrepeso_ids = bmi_data_train[bmi_data_train["Classificacao"] == "Sobrepeso"]
#classificacao_sobrepeso_ids = classificacao_sobrepeso_ids[classificacao_sobrepeso_ids["Time"] == 0.0]
classificacao_sobrepeso_ids = classificacao_sobrepeso_ids["RecordID"]
classificacao_sobrepeso_missing = train_X[train_X["RecordID"].isin(classificacao_sobrepeso_ids)]
classificacao_sobrepeso_missing = classificacao_sobrepeso_missing.isna().sum()
classificacao_sobrepeso_missing

RecordID           0
level_1            0
Time               0
ALP            64892
ALT            64859
AST            64864
Age             2793
Albumin        65198
BUN            61029
Bilirubin      64861
Cholesterol    65840
Creatinine     61002
DiasABP        20561
FiO2           54853
GCS            45623
Gender         64578
Glucose        61598
HCO3           61203
HCT            59035
HR              5663
Height          2793
ICUType        64578
K              61101
Lactate        62985
MAP            20666
MechVent       54981
Mg             61090
NIDiasABP      45696
NIMAP          45940
NISysABP       45669
Na             61430
PaCO2          55467
PaO2           55486
Platelets      60514
RespRate       56152
SaO2           61189
SysABP         20558
Temp           34082
TroponinI      65782
TroponinT      65346
Urine          16837
WBC            61180
Weight         31906
pH             54828
dtype: int64

In [129]:
teste = classificacao_sobrepeso_ids.unique()
teste.size

1374

In [130]:
classificacao_obesidade_1_ids = bmi_data_train[bmi_data_train["Classificacao"] == "Obesidade grau 1"]
#classificacao_obesidade_1_ids = classificacao_obesidade_1_ids[classificacao_obesidade_1_ids["Time"] == 0.0]
classificacao_obesidade_1_ids = classificacao_obesidade_1_ids["RecordID"]
classificacao_obesidade_1_missing = train_X[train_X["RecordID"].isin(classificacao_obesidade_1_ids)]
classificacao_obesidade_1_missing = classificacao_obesidade_1_missing.isna().sum()
classificacao_obesidade_1_missing

RecordID           0
level_1            0
Time               0
ALP            35592
ALT            35577
AST            35582
Age             1421
Albumin        35724
BUN            33464
Bilirubin      35574
Cholesterol    36085
Creatinine     33451
DiasABP        11129
FiO2           29713
GCS            25026
Gender         35391
Glucose        33735
HCO3           33562
HCT            32366
HR              2938
Height          1421
ICUType        35391
K              33444
Lactate        34373
MAP            11295
MechVent       29827
Mg             33536
NIDiasABP      25312
NIMAP          25444
NISysABP       25306
Na             33640
PaCO2          30261
PaO2           30266
Platelets      33240
RespRate       31211
SaO2           33663
SysABP         11129
Temp           18861
TroponinI      36033
TroponinT      35791
Urine           8560
WBC            33584
Weight         17214
pH             29933
dtype: int64

In [131]:
teste = classificacao_obesidade_1_ids.unique()
teste.size

753

In [132]:
classificacao_obesidade_2_ids = bmi_data_train[bmi_data_train["Classificacao"] == "Obesidade grau 2"]
#classificacao_obesidade_2_ids = classificacao_obesidade_2_ids[classificacao_obesidade_2_ids["Time"] == 0.0]
classificacao_obesidade_2_ids = classificacao_obesidade_2_ids["RecordID"]
classificacao_obesidade_2_missing = train_X[train_X["RecordID"].isin(classificacao_obesidade_2_ids)]
classificacao_obesidade_2_missing = classificacao_obesidade_2_missing.isna().sum()
classificacao_obesidade_2_missing

RecordID           0
level_1            0
Time               0
ALP            13229
ALT            13228
AST            13228
Age              624
Albumin        13301
BUN            12420
Bilirubin      13223
Cholesterol    13414
Creatinine     12410
DiasABP         4323
FiO2           10991
GCS             9453
Gender         13160
Glucose        12539
HCO3           12455
HCT            12044
HR              1162
Height           624
ICUType        13160
K              12428
Lactate        12783
MAP             4319
MechVent       11117
Mg             12443
NIDiasABP       9341
NIMAP           9414
NISysABP        9337
Na             12498
PaCO2          11257
PaO2           11257
Platelets      12353
RespRate       11754
SaO2           12468
SysABP          4321
Temp            6912
TroponinI      13419
TroponinT      13285
Urine           3344
WBC            12457
Weight          5772
pH             11162
dtype: int64

In [133]:
teste = classificacao_obesidade_2_ids.unique()
teste.size

280

In [134]:
classificacao_obesidade_3_ids = bmi_data_train[bmi_data_train["Classificacao"] == "Obesidade grau 3"]
#classificacao_obesidade_3_ids = classificacao_obesidade_3_ids[classificacao_obesidade_3_ids["Time"] == 0.0]
classificacao_obesidade_3_ids = classificacao_obesidade_3_ids["RecordID"]
classificacao_obesidade_3_missing = train_X[train_X["RecordID"].isin(classificacao_obesidade_3_ids)]
classificacao_obesidade_3_missing = classificacao_obesidade_3_missing.isna().sum()
classificacao_obesidade_3_missing

RecordID           0
level_1            0
Time               0
ALP            13713
ALT            13707
AST            13707
Age              623
Albumin        13804
BUN            12899
Bilirubin      13706
Cholesterol    13942
Creatinine     12898
DiasABP         4871
FiO2           11033
GCS             9982
Gender         13677
Glucose        13003
HCO3           12933
HCT            12617
HR              1192
Height           623
ICUType        13677
K              12880
Lactate        13125
MAP             4854
MechVent       11078
Mg             12910
NIDiasABP       9658
NIMAP           9723
NISysABP        9650
Na             12975
PaCO2          11616
PaO2           11614
Platelets      12904
RespRate       12300
SaO2           13028
SysABP          4871
Temp            8026
TroponinI      13941
TroponinT      13813
Urine           3482
WBC            12993
Weight          6259
pH             11547
dtype: int64

In [135]:
teste = classificacao_obesidade_3_ids.unique()
teste.size

291

In [136]:
df_columns = train_X.columns
df_columns

Index(['RecordID', 'level_1', 'Time', 'ALP', 'ALT', 'AST', 'Age', 'Albumin',
       'BUN', 'Bilirubin', 'Cholesterol', 'Creatinine', 'DiasABP', 'FiO2',
       'GCS', 'Gender', 'Glucose', 'HCO3', 'HCT', 'HR', 'Height', 'ICUType',
       'K', 'Lactate', 'MAP', 'MechVent', 'Mg', 'NIDiasABP', 'NIMAP',
       'NISysABP', 'Na', 'PaCO2', 'PaO2', 'Platelets', 'RespRate', 'SaO2',
       'SysABP', 'Temp', 'TroponinI', 'TroponinT', 'Urine', 'WBC', 'Weight',
       'pH'],
      dtype='object')

In [137]:
df_missing = pd.DataFrame(columns=df_columns)
df_missing_transpose = df_missing.T
df_missing_transpose["Female"] = female_gender_missing_rate
df_missing_transpose["Male"] = male_gender_missing_rate
df_missing_transpose["Undefined gender"] = undefined_gender_missing_rate
df_missing_transpose["ICUType 1"] = ICUType_1_training_missing
df_missing_transpose["ICUType 2"] = ICUType_2_training_missing
df_missing_transpose["ICUType 3"] = ICUType_3_training_missing
df_missing_transpose["ICUType 4"] = ICUType_4_training_missing
df_missing_transpose["Age 65+"] = more_than_or_equal_to_65_train_missing
df_missing_transpose["Age 65-"] = less_than_65_train_missing
df_missing_transpose["Low Weight"] = classificacao_baixo_peso_missing
df_missing_transpose["Normal Weight"] = classificacao_normal_peso_missing
df_missing_transpose["Overweight"] = classificacao_sobrepeso_missing
df_missing_transpose["Obesity Grade 1"] = classificacao_obesidade_1_missing
df_missing_transpose["Obesity Grade 2"] = classificacao_obesidade_2_missing
df_missing_transpose["Obesity Grade 3"] = classificacao_obesidade_3_missing
df_missing_transpose["Undefined classification"] = classificacao_undefined_missing
df_missing_transpose = df_missing_transpose.drop("RecordID", axis=0)
df_missing_transpose = df_missing_transpose.drop("level_1", axis=0)
df_missing_transpose = df_missing_transpose.drop("Time", axis=0)
df_missing_transpose = df_missing_transpose.drop("Age", axis=0)
df_missing_transpose = df_missing_transpose.drop("Gender", axis=0)
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>Original Missing Rate per Variable by demographics - Train</h2>"))
df_missing_transpose

Unnamed: 0,Female,Male,Undefined gender,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3,Undefined classification
ALP,158173,203849,286,55237,76703,126967,103401,199457,162851,6153,57062,64892,35592,13229,13713,171667
ALT,158094,203758,285,55187,76687,126890,103373,199389,162748,6151,57037,64859,35577,13228,13707,171578
AST,158100,203761,285,55192,76688,126892,103374,199390,162756,6150,57040,64864,35582,13228,13707,171575
Albumin,158740,204638,285,55452,76887,127562,103762,199876,163787,6169,57261,65198,35724,13301,13804,172206
BUN,149098,192275,269,52092,72231,119959,97360,187885,153757,5783,53754,61029,33464,12420,12899,162293
Bilirubin,158067,203749,286,55206,76700,126785,103411,199337,162765,6151,57041,64861,35574,13223,13706,171546
Cholesterol,160438,206864,288,55725,77216,129613,105036,201717,165873,6232,57834,65840,36085,13414,13942,174243
Creatinine,149057,192191,269,52025,72214,119925,97353,187812,153705,5781,53738,61002,33451,12410,12898,162237
DiasABP,76086,90920,170,32283,16594,83314,34985,90306,76870,2276,20209,20561,11129,4323,4871,103807
FiO2,135795,174907,263,49795,64327,110847,85996,170678,140287,5194,48274,54853,29713,10991,11033,150907


<h3>Validation data</h3>

In [138]:
validation_X = physionet2012_dataset['val_X']

In [139]:
female_gender_validation_ids = validation_X[validation_X["Gender"] == 0.0]
female_gender_validation_ids = female_gender_validation_ids["RecordID"]
female_gender_missing_rate_validation = validation_X[validation_X["RecordID"].isin(female_gender_validation_ids)]
female_gender_missing_rate_validation = female_gender_missing_rate_validation.isna().sum()
female_gender_missing_rate_validation

RecordID           0
level_1            0
Time               0
ALP            39880
ALT            39859
AST            39858
Age             2487
Albumin        40005
BUN            37661
Bilirubin      39845
Cholesterol    40501
Creatinine     37641
DiasABP        20758
FiO2           34366
GCS            27942
Gender         39715
Glucose        37806
HCO3           37705
HCT            36908
HR              4074
Height          2487
ICUType        39715
K              37497
Lactate        38918
MAP            20946
MechVent       34582
Mg             37765
NIDiasABP      21833
NIMAP          22119
NISysABP       21812
Na             37667
PaCO2          36312
PaO2           36325
Platelets      37677
RespRate       29363
SaO2           39138
SysABP         20755
Temp           26846
TroponinI      40509
TroponinT      40122
Urine          12457
WBC            37916
Weight         18136
pH             36157
dtype: int64

In [140]:
male_gender_validation_ids = validation_X[validation_X["Gender"] == 1.0]
male_gender_validation_ids = male_gender_validation_ids["RecordID"]
male_gender_missing_rate_validation = validation_X[validation_X["RecordID"].isin(male_gender_validation_ids)]
male_gender_missing_rate_validation = male_gender_missing_rate_validation.isna().sum()
male_gender_missing_rate_validation

RecordID           0
level_1            0
Time               0
ALP            50553
ALT            50528
AST            50524
Age             3082
Albumin        50778
BUN            47622
Bilirubin      50506
Cholesterol    51314
Creatinine     47607
DiasABP        22892
FiO2           43325
GCS            35166
Gender         50337
Glucose        47921
HCO3           47720
HCT            46207
HR              5196
Height          3082
ICUType        50337
K              47512
Lactate        49334
MAP            23094
MechVent       43634
Mg             47737
NIDiasABP      30305
NIMAP          30608
NISysABP       30288
Na             47733
PaCO2          45095
PaO2           45103
Platelets      47361
RespRate       39973
SaO2           49118
SysABP         22891
Temp           31498
TroponinI      51322
TroponinT      50876
Urine          16053
WBC            47774
Weight         24097
pH             44755
dtype: int64

In [141]:
undefined_gender_ids_validation = validation_X[validation_X["Gender"] == -1.0]
undefined_gender_ids_validation = undefined_gender_ids_validation["RecordID"]
undefined_gender_missing_rate_validation = validation_X[validation_X["RecordID"].isin(undefined_gender_ids_validation)]
undefined_gender_missing_rate_validation = undefined_gender_missing_rate_validation.isna().sum()
undefined_gender_missing_rate_validation

RecordID        0
level_1         0
Time            0
ALP            96
ALT            96
AST            96
Age            46
Albumin        95
BUN            90
Bilirubin      96
Cholesterol    96
Creatinine     90
DiasABP        50
FiO2           85
GCS            81
Gender         94
Glucose        90
HCO3           90
HCT            90
HR             51
Height         46
ICUType        94
K              90
Lactate        96
MAP            52
MechVent       89
Mg             91
NIDiasABP      86
NIMAP          86
NISysABP       86
Na             90
PaCO2          89
PaO2           89
Platelets      90
RespRate       96
SaO2           96
SysABP         50
Temp           74
TroponinI      96
TroponinT      93
Urine          56
WBC            90
Weight         60
pH             89
dtype: int64

In [142]:
ICUType_1_validation_ids = validation_X[validation_X["ICUType"] == 1.0]
ICUType_1_validation_ids = ICUType_1_validation_ids[ICUType_1_validation_ids["Time"] == 0.0]
ICUType_1_validation_ids = ICUType_1_validation_ids["RecordID"]
ICUType_1_validation_missing = validation_X[validation_X["RecordID"].isin(ICUType_1_validation_ids)]
ICUType_1_validation_missing = ICUType_1_validation_missing.isna().sum()
ICUType_1_validation_missing

RecordID           0
level_1            0
Time               0
ALP            13056
ALT            13046
AST            13044
Age             1036
Albumin        13107
BUN            12316
Bilirubin      13044
Cholesterol    13165
Creatinine     12301
DiasABP         8175
FiO2           11931
GCS             9814
Gender         12972
Glucose        12369
HCO3           12354
HCT            12095
HR              1615
Height          1036
ICUType        12972
K              12145
Lactate        12978
MAP             8196
MechVent       12054
Mg             12308
NIDiasABP       6459
NIMAP           6489
NISysABP        6450
Na             12363
PaCO2          12278
PaO2           12278
Platelets      12283
RespRate        8255
SaO2           12660
SysABP          8175
Temp            9262
TroponinI      13218
TroponinT      12970
Urine           5716
WBC            12392
Weight          7190
pH             12254
dtype: int64

In [143]:
ICUType_2_validation_ids = validation_X[validation_X["ICUType"] == 2.0]
ICUType_2_validation_ids = ICUType_2_validation_ids[ICUType_2_validation_ids["Time"] == 0.0]
ICUType_2_validation_ids = ICUType_2_validation_ids["RecordID"]
ICUType_2_validation_missing = validation_X[validation_X["RecordID"].isin(ICUType_2_validation_ids)]
ICUType_2_validation_missing = ICUType_2_validation_missing.isna().sum()
ICUType_2_validation_missing

RecordID           0
level_1            0
Time               0
ALP            19447
ALT            19444
AST            19445
Age              616
Albumin        19489
BUN            18315
Bilirubin      19442
Cholesterol    19578
Creatinine     18312
DiasABP         4147
FiO2           16176
GCS            14404
Gender         19176
Glucose        18695
HCO3           18411
HCT            17364
HR              1588
Height           616
ICUType        19176
K              18582
Lactate        18813
MAP             4131
MechVent       16286
Mg             18388
NIDiasABP      15243
NIMAP          15275
NISysABP       15234
Na             18612
PaCO2          15398
PaO2           15413
Platelets      17921
RespRate       18758
SaO2           17328
SysABP          4147
Temp            8218
TroponinI      19574
TroponinT      19537
Urine           2958
WBC            18223
Weight          9685
pH             15003
dtype: int64

In [144]:
ICUType_3_validation_ids = validation_X[validation_X["ICUType"] == 3.0]
ICUType_3_validation_ids = ICUType_3_validation_ids[ICUType_3_validation_ids["Time"] == 0.0]
ICUType_3_validation_ids = ICUType_3_validation_ids["RecordID"]
ICUType_3_validation_missing = validation_X[validation_X["RecordID"].isin(ICUType_3_validation_ids)]
ICUType_3_validation_missing = ICUType_3_validation_missing.isna().sum()
ICUType_3_validation_missing

RecordID           0
level_1            0
Time               0
ALP            33716
ALT            33688
AST            33687
Age             2821
Albumin        33888
BUN            31879
Bilirubin      33646
Cholesterol    34437
Creatinine     31864
DiasABP        23066
FiO2           29546
GCS            25529
Gender         33746
Glucose        31885
HCO3           31866
HCT            31433
HR              4030
Height          2821
ICUType        33746
K              31676
Lactate        33175
MAP            23225
MechVent       29985
Mg             32048
NIDiasABP      14576
NIMAP          14947
NISysABP       14571
Na             31788
PaCO2          31933
PaO2           31936
Platelets      32118
RespRate       23825
SaO2           34042
SysABP         23063
Temp           24994
TroponinI      34394
TroponinT      34042
Urine          14045
WBC            32215
Weight         11346
pH             31901
dtype: int64

In [145]:
ICUType_4_validation_ids = validation_X[validation_X["ICUType"] == 4.0]
ICUType_4_validation_ids = ICUType_4_validation_ids[ICUType_4_validation_ids["Time"] == 0.0]
ICUType_4_validation_ids = ICUType_4_validation_ids["RecordID"]
ICUType_4_validation_missing = validation_X[validation_X["RecordID"].isin(ICUType_4_validation_ids)]
ICUType_4_validation_missing = ICUType_4_validation_missing.isna().sum()
ICUType_4_validation_missing

RecordID           0
level_1            0
Time               0
ALP            24310
ALT            24305
AST            24302
Age             1142
Albumin        24394
BUN            22863
Bilirubin      24315
Cholesterol    24731
Creatinine     22861
DiasABP         8312
FiO2           20123
GCS            13442
Gender         24252
Glucose        22868
HCO3           22884
HCT            22313
HR              2088
Height          1142
ICUType        24252
K              22696
Lactate        23382
MAP             8540
MechVent       19980
Mg             22849
NIDiasABP      15946
NIMAP          16102
NISysABP       15931
Na             22727
PaCO2          21887
PaO2           21890
Platelets      22806
RespRate       18594
SaO2           24322
SysABP          8311
Temp           15944
TroponinI      24741
TroponinT      24542
Urine           5847
WBC            22950
Weight         14072
pH             21843
dtype: int64

In [146]:
more_than_or_equal_to_65_validation_ids = validation_X[validation_X["Age"] >= 65]
more_than_or_equal_to_65_validation_ids = more_than_or_equal_to_65_validation_ids[more_than_or_equal_to_65_validation_ids["Time"] == 0.0]
more_than_or_equal_to_65_validation_ids = more_than_or_equal_to_65_validation_ids["RecordID"]
more_than_or_equal_to_65_validation_missing = validation_X[validation_X["RecordID"].isin(more_than_or_equal_to_65_validation_ids)]
more_than_or_equal_to_65_validation_missing = more_than_or_equal_to_65_validation_missing.isna().sum()
more_than_or_equal_to_65_validation_missing

RecordID           0
level_1            0
Time               0
ALP            49697
ALT            49685
AST            49684
Age             2908
Albumin        49837
BUN            46834
Bilirubin      49670
Cholesterol    50303
Creatinine     46813
DiasABP        24103
FiO2           42583
GCS            35045
Gender         49350
Glucose        47149
HCO3           46921
HCT            45566
HR              4814
Height          2908
ICUType        49350
K              46739
Lactate        48479
MAP            24225
MechVent       43171
Mg             46961
NIDiasABP      28178
NIMAP          28447
NISysABP       28156
Na             46981
PaCO2          44596
PaO2           44610
Platelets      46659
RespRate       37471
SaO2           48074
SysABP         24101
Temp           31263
TroponinI      50302
TroponinT      49719
Urine          14529
WBC            47024
Weight         22608
pH             44300
dtype: int64

In [147]:
less_than_65_validation_ids = validation_X[validation_X["Age"] < 65]
less_than_65_validation_ids = less_than_65_validation_ids[less_than_65_validation_ids["Time"] == 0.0]
less_than_65_validation_ids = less_than_65_validation_ids["RecordID"]
less_than_65_validation_missing = validation_X[validation_X["RecordID"].isin(less_than_65_validation_ids)]
less_than_65_validation_missing = less_than_65_validation_missing.isna().sum()
less_than_65_validation_missing

RecordID           0
level_1            0
Time               0
ALP            40832
ALT            40798
AST            40794
Age             2707
Albumin        41041
BUN            38539
Bilirubin      40777
Cholesterol    41608
Creatinine     38525
DiasABP        19597
FiO2           35193
GCS            28144
Gender         40796
Glucose        38668
HCO3           38594
HCT            37639
HR              4507
Height          2707
ICUType        40796
K              38360
Lactate        39869
MAP            19867
MechVent       35134
Mg             38632
NIDiasABP      24046
NIMAP          24366
NISysABP       24030
Na             38509
PaCO2          36900
PaO2           36907
Platelets      38469
RespRate       31961
SaO2           40278
SysABP         19595
Temp           27155
TroponinI      41625
TroponinT      41372
Urine          14037
WBC            38756
Weight         19685
pH             36701
dtype: int64

In [148]:
filtered_validation_X = validation_X[(validation_X['Height'] != -1) & (validation_X['Weight'] != -1) & (validation_X['Height'].notna()) & (validation_X['Weight'].notna())] 

In [149]:
filtered_validation_X_metros = filtered_validation_X.copy()
filtered_validation_X_metros["Height"] = filtered_validation_X["Height"]/100
filtered_validation_X_metros["Height"]

48        1.753
67        1.753
68        1.753
69        1.753
70        1.753
          ...  
575034    1.600
575035    1.600
575037    1.600
575038    1.600
575039    1.600
Name: Height, Length: 25076, dtype: float64

In [150]:
bmi_data_validation = filtered_validation_X_metros
bmi_data_validation["BMI"] = round(filtered_validation_X_metros["Weight"] / (filtered_validation_X_metros["Height"]**2), 1)
bmi_data_validation["Classificacao"] = bmi_data_validation["BMI"].apply(classify_BMI)
bmi_data_validation.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
48,132540,0,0.0,,,,76.0,,,,...,,,,,,,76.0,7.45,24.7,Peso normal
67,132540,19,19.0,,,,76.0,,,,...,122.0,37.5,,,50.0,,80.6,,26.2,Sobrepeso
68,132540,20,20.0,,,,76.0,,,,...,107.0,37.4,,,380.0,,80.6,,26.2,Sobrepeso
69,132540,21,21.0,,,,76.0,,,,...,121.0,37.5,,,170.0,,80.6,,26.2,Sobrepeso
70,132540,22,22.0,,,,76.0,,,,...,128.0,37.5,,,130.0,,80.6,,26.2,Sobrepeso


In [151]:
bmi_data_validation = bmi_data_validation.groupby("RecordID").first().reset_index()
bmi_data_validation

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132540,0,0.0,,,,76.0,,21.0,,...,122.0,37.5,,,50.0,13.3,76.0,7.45,24.7,Peso normal
1,132547,0,0.0,,,,64.0,,,,...,,,,,,,114.0,,35.1,Obesidade grau 2
2,132555,0,0.0,,,,74.0,,19.0,,...,98.0,34.8,,,35.0,9.0,66.1,7.39,21.5,Peso normal
3,132575,0,0.0,,,,78.0,,18.0,,...,122.0,37.4,,,38.0,12.5,63.0,7.34,22.4,Peso normal
4,132588,0,0.0,,,,48.0,,,,...,,,,,,,42.3,,17.6,Baixo peso
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
975,162926,0,0.0,,,,83.0,,18.0,,...,117.0,36.9,,,60.0,12.3,104.5,7.34,35.0,Obesidade grau 2
976,162942,0,0.0,67.0,61.0,92.0,40.0,3.3,12.0,0.3,...,,36.3,,,600.0,20.6,120.7,7.38,37.1,Obesidade grau 2
977,162952,0,0.0,,,,64.0,,,,...,,,,,,,47.7,,16.5,Baixo peso
978,162983,0,0.0,95.0,369.0,366.0,75.0,3.1,28.0,6.4,...,124.0,35.3,1.2,,80.0,25.0,90.0,7.33,31.1,Obesidade grau 1


In [152]:
bmi_data_validation["Classificacao"].value_counts()

Classificacao
Sobrepeso           326
Peso normal         299
Obesidade grau 1    181
Obesidade grau 2     84
Obesidade grau 3     60
Baixo peso           30
Name: count, dtype: int64

In [153]:
classificacao_undefined_ids_validation = bmi_data_validation["RecordID"]
classificacao_undefined_missing_validation = validation_X[~validation_X["RecordID"].isin(classificacao_undefined_ids_validation)]
classificacao_undefined_missing_validation = classificacao_undefined_missing_validation.isna().sum()
classificacao_undefined_missing_validation

RecordID           0
level_1            0
Time               0
ALP            44265
ALT            44239
AST            44235
Age             3566
Albumin        44398
BUN            41819
Bilirubin      44218
Cholesterol    44950
Creatinine     41796
DiasABP        28025
FiO2           38879
GCS            30328
Gender         44086
Glucose        41827
HCO3           41831
HCT            41167
HR              5172
Height          3566
ICUType        44086
K              41532
Lactate        43524
MAP            28284
MechVent       39303
Mg             41937
NIDiasABP      20562
NIMAP          21021
NISysABP       20547
Na             41672
PaCO2          41930
PaO2           41939
Platelets      42057
RespRate       29758
SaO2           44509
SysABP         28023
Temp           33064
TroponinI      44977
TroponinT      44473
Urine          16586
WBC            42170
Weight         20329
pH             41879
dtype: int64

In [154]:
classificacao_baixo_peso_ids_validation = bmi_data_validation[bmi_data_validation["Classificacao"] == "Baixo peso"]
classificacao_baixo_peso_ids_validation = classificacao_baixo_peso_ids_validation["RecordID"]
classificacao_baixo_peso_missing_validation = validation_X[validation_X["RecordID"].isin(classificacao_baixo_peso_ids_validation)]
classificacao_baixo_peso_missing_validation = classificacao_baixo_peso_missing_validation.isna().sum()
classificacao_baixo_peso_missing_validation

RecordID          0
level_1           0
Time              0
ALP            1410
ALT            1410
AST            1410
Age              42
Albumin        1417
BUN            1325
Bilirubin      1409
Cholesterol    1437
Creatinine     1326
DiasABP         514
FiO2           1223
GCS            1024
Gender         1410
Glucose        1332
HCO3           1329
HCT            1280
HR              117
Height           42
ICUType        1410
K              1320
Lactate        1349
MAP             517
MechVent       1206
Mg             1328
NIDiasABP       888
NIMAP           892
NISysABP        887
Na             1329
PaCO2          1272
PaO2           1273
Platelets      1321
RespRate       1084
SaO2           1398
SysABP          513
Temp            835
TroponinI      1436
TroponinT      1427
Urine           435
WBC            1332
Weight          655
pH             1262
dtype: int64

In [155]:
teste = classificacao_baixo_peso_ids_validation.unique()
teste.size

30

In [156]:
classificacao_peso_normal_ids_validation = bmi_data_validation[bmi_data_validation["Classificacao"] == "Peso normal"]
classificacao_peso_normal_ids_validation = classificacao_peso_normal_ids_validation["RecordID"]
classificacao_peso_normal_missing_validation = validation_X[validation_X["RecordID"].isin(classificacao_peso_normal_ids_validation)]
classificacao_peso_normal_missing_validation = classificacao_peso_normal_missing_validation.isna().sum()
classificacao_peso_normal_missing_validation

RecordID           0
level_1            0
Time               0
ALP            14122
ALT            14116
AST            14116
Age              531
Albumin        14179
BUN            13277
Bilirubin      14115
Cholesterol    14327
Creatinine     13275
DiasABP         4404
FiO2           11993
GCS             9849
Gender         14053
Glucose        13396
HCO3           13319
HCT            12855
HR              1130
Height           531
ICUType        14053
K              13278
Lactate        13736
MAP             4453
MechVent       11886
Mg             13322
NIDiasABP       9958
NIMAP           9982
NISysABP        9953
Na             13338
PaCO2          12187
PaO2           12192
Platelets      13139
RespRate       12218
SaO2           13424
SysABP          4404
Temp            7637
TroponinI      14320
TroponinT      14243
Urine           3738
WBC            13303
Weight          6754
pH             12028
dtype: int64

In [157]:
teste = classificacao_peso_normal_ids_validation.unique()
teste.size

299

In [158]:
classificacao_sobrepeso_ids_validation = bmi_data_validation[bmi_data_validation["Classificacao"] == "Sobrepeso"]
classificacao_sobrepeso_ids_validation = classificacao_sobrepeso_ids_validation["RecordID"]
classificacao_sobrepeso_missing_validation = validation_X[validation_X["RecordID"].isin(classificacao_sobrepeso_ids_validation)]
classificacao_sobrepeso_missing_validation = classificacao_sobrepeso_missing_validation.isna().sum()
classificacao_sobrepeso_missing_validation

RecordID           0
level_1            0
Time               0
ALP            15394
ALT            15390
AST            15388
Age              777
Albumin        15475
BUN            14522
Bilirubin      15385
Cholesterol    15623
Creatinine     14511
DiasABP         5868
FiO2           13010
GCS            10893
Gender         15322
Glucose        14668
HCO3           14564
HCT            13935
HR              1527
Height           777
ICUType        15322
K              14520
Lactate        14901
MAP             5888
MechVent       13143
Mg             14557
NIDiasABP       9930
NIMAP           9983
NISysABP        9922
Na             14617
PaCO2          13250
PaO2           13257
Platelets      14290
RespRate       12815
SaO2           14607
SysABP          5867
Temp            8888
TroponinI      15621
TroponinT      15510
Urine           4101
WBC            14497
Weight          7667
pH             13109
dtype: int64

In [159]:
teste = classificacao_sobrepeso_ids_validation.unique()
teste.size

326

In [160]:
classificacao_obesidade_1_ids_validation = bmi_data_validation[bmi_data_validation["Classificacao"] == "Obesidade grau 1"]
classificacao_obesidade_1_ids_validation = classificacao_obesidade_1_ids_validation["RecordID"]
classificacao_obesidade_1_missing_validation = validation_X[validation_X["RecordID"].isin(classificacao_obesidade_1_ids_validation)]
classificacao_obesidade_1_missing_validation = classificacao_obesidade_1_missing_validation.isna().sum()
classificacao_obesidade_1_missing_validation

RecordID          0
level_1           0
Time              0
ALP            8555
ALT            8551
AST            8550
Age             384
Albumin        8599
BUN            8052
Bilirubin      8545
Cholesterol    8677
Creatinine     8051
DiasABP        2674
FiO2           7130
GCS            6157
Gender         8507
Glucose        8148
HCO3           8080
HCT            7805
HR              786
Height          384
ICUType        8507
K              8050
Lactate        8324
MAP            2709
MechVent       7123
Mg             8066
NIDiasABP      6093
NIMAP          6120
NISysABP       6085
Na             8108
PaCO2          7202
PaO2           7203
Platelets      8005
RespRate       7703
SaO2           8030
SysABP         2674
Temp           4316
TroponinI      8677
TroponinT      8605
Urine          1937
WBC            8096
Weight         3973
pH             7118
dtype: int64

In [161]:
teste = classificacao_obesidade_1_ids_validation.unique()
teste.size

181

In [162]:
classificacao_obesidade_2_ids_validation = bmi_data_validation[bmi_data_validation["Classificacao"] == "Obesidade grau 2"]
classificacao_obesidade_2_ids_validation = classificacao_obesidade_2_ids_validation["RecordID"]
classificacao_obesidade_2_missing_validation = validation_X[validation_X["RecordID"].isin(classificacao_obesidade_2_ids_validation)]
classificacao_obesidade_2_missing_validation = classificacao_obesidade_2_missing_validation.isna().sum()
classificacao_obesidade_2_missing_validation

RecordID          0
level_1           0
Time              0
ALP            3953
ALT            3949
AST            3950
Age             216
Albumin        3975
BUN            3710
Bilirubin      3949
Cholesterol    4023
Creatinine     3710
DiasABP        1214
FiO2           3301
GCS            2905
Gender         3948
Glucose        3754
HCO3           3717
HCT            3579
HR              383
Height          216
ICUType        3948
K              3730
Lactate        3788
MAP            1231
MechVent       3329
Mg             3727
NIDiasABP      2874
NIMAP          2889
NISysABP       2873
Na             3737
PaCO2          3297
PaO2           3300
Platelets      3666
RespRate       3294
SaO2           3713
SysABP         1214
Temp           2035
TroponinI      4022
TroponinT      3988
Urine          1038
WBC            3711
Weight         1719
pH             3271
dtype: int64

In [163]:
teste = classificacao_obesidade_2_ids_validation.unique()
teste.size

84

In [164]:
classificacao_obesidade_3_ids_validation = bmi_data_validation[bmi_data_validation["Classificacao"] == "Obesidade grau 3"]
classificacao_obesidade_3_ids_validation = classificacao_obesidade_3_ids_validation["RecordID"]
classificacao_obesidade_3_missing_validation = validation_X[validation_X["RecordID"].isin(classificacao_obesidade_3_ids_validation)]
classificacao_obesidade_3_missing_validation = classificacao_obesidade_3_missing_validation.isna().sum()
classificacao_obesidade_3_missing_validation

RecordID          0
level_1           0
Time              0
ALP            2830
ALT            2828
AST            2829
Age              99
Albumin        2835
BUN            2668
Bilirubin      2826
Cholesterol    2874
Creatinine     2669
DiasABP        1001
FiO2           2240
GCS            2033
Gender         2820
Glucose        2692
HCO3           2675
HCT            2584
HR              206
Height           99
ICUType        2820
K              2669
Lactate        2726
MAP            1010
MechVent       2315
Mg             2656
NIDiasABP      1919
NIMAP          1926
NISysABP       1919
Na             2689
PaCO2          2358
PaO2           2353
Platelets      2650
RespRate       2560
SaO2           2671
SysABP         1001
Temp           1643
TroponinI      2874
TroponinT      2845
Urine           731
WBC            2671
Weight         1196
pH             2334
dtype: int64

In [165]:
teste = classificacao_obesidade_3_ids_validation.unique()
teste.size

60

In [166]:
df_missing_validation = pd.DataFrame(columns=df_columns)
df_missing_transpose_validation = df_missing_validation.T
df_missing_transpose_validation["Female"] = female_gender_missing_rate_validation
df_missing_transpose_validation["Male"] = male_gender_missing_rate_validation
df_missing_transpose_validation["Undefined gender"] = undefined_gender_missing_rate_validation
df_missing_transpose_validation["ICUType 1"] = ICUType_1_validation_missing
df_missing_transpose_validation["ICUType 2"] = ICUType_2_validation_missing
df_missing_transpose_validation["ICUType 3"] = ICUType_3_validation_missing
df_missing_transpose_validation["ICUType 4"] = ICUType_4_validation_missing
df_missing_transpose_validation["Age 65+"] = more_than_or_equal_to_65_validation_missing
df_missing_transpose_validation["Age 65-"] = less_than_65_validation_missing
df_missing_transpose_validation["Low Weight"] = classificacao_baixo_peso_missing_validation
df_missing_transpose_validation["Normal Weight"] = classificacao_peso_normal_missing_validation
df_missing_transpose_validation["Overweight"] = classificacao_sobrepeso_missing_validation
df_missing_transpose_validation["Obesity Grade 1"] = classificacao_obesidade_1_missing_validation
df_missing_transpose_validation["Obesity Grade 2"] = classificacao_obesidade_2_missing_validation
df_missing_transpose_validation["Obesity Grade 3"] = classificacao_obesidade_3_missing_validation
df_missing_transpose_validation["Undefined classification"] = classificacao_undefined_missing_validation
df_missing_transpose_validation = df_missing_transpose_validation.drop("RecordID", axis=0)
df_missing_transpose_validation = df_missing_transpose_validation.drop("level_1", axis=0)
df_missing_transpose_validation = df_missing_transpose_validation.drop("Time", axis=0)
df_missing_transpose_validation = df_missing_transpose_validation.drop("Age", axis=0)
df_missing_transpose_validation = df_missing_transpose_validation.drop("Gender", axis=0)
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>Original Missing Rate per Variable by demographcs - Validation</h2>"))
df_missing_transpose_validation

Unnamed: 0,Female,Male,Undefined gender,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3,Undefined classification
ALP,39880,50553,96,13056,19447,33716,24310,49697,40832,1410,14122,15394,8555,3953,2830,44265
ALT,39859,50528,96,13046,19444,33688,24305,49685,40798,1410,14116,15390,8551,3949,2828,44239
AST,39858,50524,96,13044,19445,33687,24302,49684,40794,1410,14116,15388,8550,3950,2829,44235
Albumin,40005,50778,95,13107,19489,33888,24394,49837,41041,1417,14179,15475,8599,3975,2835,44398
BUN,37661,47622,90,12316,18315,31879,22863,46834,38539,1325,13277,14522,8052,3710,2668,41819
Bilirubin,39845,50506,96,13044,19442,33646,24315,49670,40777,1409,14115,15385,8545,3949,2826,44218
Cholesterol,40501,51314,96,13165,19578,34437,24731,50303,41608,1437,14327,15623,8677,4023,2874,44950
Creatinine,37641,47607,90,12301,18312,31864,22861,46813,38525,1326,13275,14511,8051,3710,2669,41796
DiasABP,20758,22892,50,8175,4147,23066,8312,24103,19597,514,4404,5868,2674,1214,1001,28025
FiO2,34366,43325,85,11931,16176,29546,20123,42583,35193,1223,11993,13010,7130,3301,2240,38879


<h3>Test data</h3>

In [167]:
test_X = physionet2012_dataset['test_X']

In [168]:
test_X["Gender"].value_counts()

Gender
 1.0    1329
 0.0    1066
-1.0       4
Name: count, dtype: int64

In [169]:
female_gender_test_ids = test_X[test_X['Gender'] == 0.0]
female_gender_test_ids = female_gender_test_ids["RecordID"]
female_gender_missing_rate_test = test_X[test_X["RecordID"].isin(female_gender_test_ids)]
female_gender_missing_rate_test = female_gender_missing_rate_test.isna().sum()
female_gender_missing_rate_test

RecordID           0
level_1            0
Time               0
ALP            50277
ALT            50258
AST            50258
Age             2732
Albumin        50473
BUN            47452
Bilirubin      50247
Cholesterol    51078
Creatinine     47439
DiasABP        25131
FiO2           43317
GCS            34195
Gender         50102
Glucose        47583
HCO3           47502
HCT            46324
HR              4638
Height          2732
ICUType        50102
K              47222
Lactate        49202
MAP            25301
MechVent       43649
Mg             47495
NIDiasABP      27861
NIMAP          28188
NISysABP       27847
Na             47482
PaCO2          45856
PaO2           45867
Platelets      47402
RespRate       36878
SaO2           49415
SysABP         25131
Temp           32762
TroponinI      51071
TroponinT      50668
Urine          15175
WBC            47697
Weight         23577
pH             45649
dtype: int64

In [170]:
female_gender_test_ids.unique().size

1066

In [171]:
male_gender_test_ids = test_X[test_X['Gender'] == 1.0]
male_gender_test_ids = male_gender_test_ids["RecordID"]
male_gender_missing_rate_test = test_X[test_X["RecordID"].isin(male_gender_test_ids)]
male_gender_missing_rate_test = male_gender_missing_rate_test.isna().sum()
male_gender_missing_rate_test

RecordID           0
level_1            0
Time               0
ALP            62696
ALT            62662
AST            62657
Age             3389
Albumin        62980
BUN            59107
Bilirubin      62646
Cholesterol    63659
Creatinine     59091
DiasABP        27553
FiO2           52860
GCS            43760
Gender         62463
Glucose        59458
HCO3           59242
HCT            57463
HR              6036
Height          3389
ICUType        62463
K              58945
Lactate        60953
MAP            27693
MechVent       53627
Mg             59223
NIDiasABP      37848
NIMAP          38165
NISysABP       37822
Na             59299
PaCO2          55729
PaO2           55745
Platelets      58953
RespRate       50392
SaO2           61105
SysABP         27549
Temp           38899
TroponinI      63668
TroponinT      63055
Urine          19083
WBC            59405
Weight         30330
pH             55308
dtype: int64

In [172]:
male_gender_test_ids.unique().size

1329

In [173]:
undefined_gender_ids_test = test_X[test_X["Gender"] == -1.0]
undefined_gender_ids_test = undefined_gender_ids_test["RecordID"]
undefined_gender_missing_rate_test = test_X[test_X["RecordID"].isin(undefined_gender_ids_test)]
undefined_gender_missing_rate_test = undefined_gender_missing_rate_test.isna().sum()
undefined_gender_missing_rate_test

RecordID         0
level_1          0
Time             0
ALP            189
ALT            189
AST            189
Age              6
Albumin        191
BUN            180
Bilirubin      189
Cholesterol    192
Creatinine     180
DiasABP         66
FiO2           176
GCS            119
Gender         188
Glucose        181
HCO3           180
HCT            182
HR               9
Height           6
ICUType        188
K              181
Lactate        190
MAP             68
MechVent       166
Mg             181
NIDiasABP      101
NIMAP          101
NISysABP       101
Na             181
PaCO2          183
PaO2           183
Platelets      182
RespRate       150
SaO2           191
SysABP          66
Temp           147
TroponinI      192
TroponinT      192
Urine           87
WBC            181
Weight          65
pH             182
dtype: int64

In [174]:
ICUType_1_test_ids = test_X[test_X["ICUType"] == 1.0]
ICUType_1_test_ids = ICUType_1_test_ids[ICUType_1_test_ids["Time"] == 0.0]
ICUType_1_test_ids = ICUType_1_test_ids["RecordID"]
ICUType_1_test_missing = test_X[test_X["RecordID"].isin(ICUType_1_test_ids)]
ICUType_1_test_missing = ICUType_1_test_missing.isna().sum()
ICUType_1_test_missing

RecordID           0
level_1            0
Time               0
ALP            15168
ALT            15153
AST            15152
Age             1283
Albumin        15202
BUN            14259
Bilirubin      15160
Cholesterol    15292
Creatinine     14240
DiasABP         9261
FiO2           13756
GCS            11459
Gender         15087
Glucose        14292
HCO3           14289
HCT            14023
HR              1954
Height          1283
ICUType        15087
K              14060
Lactate        15050
MAP             9291
MechVent       13843
Mg             14301
NIDiasABP       7773
NIMAP           7824
NISysABP        7772
Na             14290
PaCO2          14163
PaO2           14164
Platelets      14251
RespRate        9554
SaO2           14724
SysABP          9261
Temp           10882
TroponinI      15354
TroponinT      15042
Urine           6854
WBC            14384
Weight          8942
pH             14139
dtype: int64

In [175]:
ICUType_1_test_ids.unique().size

321

In [176]:
ICUType_2_test_ids = test_X[test_X["ICUType"] == 2.0]
ICUType_2_test_ids = ICUType_2_test_ids[ICUType_2_test_ids["Time"] == 0.0]
ICUType_2_test_ids = ICUType_2_test_ids["RecordID"]
ICUType_2_test_missing = test_X[test_X["RecordID"].isin(ICUType_2_test_ids)]
ICUType_2_test_missing = ICUType_2_test_missing.isna().sum()
ICUType_2_test_missing

RecordID           0
level_1            0
Time               0
ALP            24337
ALT            24332
AST            24331
Age              800
Albumin        24403
BUN            22925
Bilirubin      24336
Cholesterol    24517
Creatinine     22927
DiasABP         5403
FiO2           20258
GCS            18058
Gender         24017
Glucose        23384
HCO3           23047
HCT            21845
HR              2061
Height           800
ICUType        24017
K              23212
Lactate        23643
MAP             5333
MechVent       20483
Mg             22929
NIDiasABP      19196
NIMAP          19218
NISysABP       19186
Na             23296
PaCO2          19511
PaO2           19533
Platelets      22539
RespRate       23517
SaO2           21893
SysABP          5401
Temp           10270
TroponinI      24495
TroponinT      24469
Urine           3670
WBC            22874
Weight         11691
pH             19019
dtype: int64

In [177]:
ICUType_2_test_ids.unique().size

511

In [178]:
ICUType_3_test_ids = test_X[test_X["ICUType"] == 3.0]
ICUType_3_test_ids = ICUType_3_test_ids[ICUType_3_test_ids["Time"] == 0.0]
ICUType_3_test_ids = ICUType_3_test_ids["RecordID"]
ICUType_3_test_missing = test_X[test_X["RecordID"].isin(ICUType_3_test_ids)]
ICUType_3_test_missing = ICUType_3_test_missing.isna().sum()
ICUType_3_test_missing

RecordID           0
level_1            0
Time               0
ALP            40640
ALT            40618
AST            40615
Age             2683
Albumin        40892
BUN            38426
Bilirubin      40579
Cholesterol    41524
Creatinine     38418
DiasABP        26561
FiO2           35155
GCS            30359
Gender         40702
Glucose        38434
HCO3           38426
HCT            37722
HR              4102
Height          2683
ICUType        40702
K              38149
Lactate        39934
MAP            26787
MechVent       35797
Mg             38590
NIDiasABP      17872
NIMAP          18295
NISysABP       17859
Na             38356
PaCO2          38342
PaO2           38327
Platelets      38678
RespRate       28814
SaO2           41077
SysABP         26560
Temp           29208
TroponinI      41481
TroponinT      41008
Urine          16008
WBC            38796
Weight         15052
pH             38282
dtype: int64

In [179]:
ICUType_3_test_ids.unique().size

866

In [180]:
ICUType_4_test_ids = test_X[test_X["ICUType"] == 4.0]
ICUType_4_test_ids = ICUType_4_test_ids[ICUType_4_test_ids["Time"] == 0.0]
ICUType_4_test_ids = ICUType_4_test_ids["RecordID"]
ICUType_4_test_missing = test_X[test_X["RecordID"].isin(ICUType_4_test_ids)]
ICUType_4_test_missing = ICUType_4_test_missing.isna().sum()
ICUType_4_test_missing

RecordID           0
level_1            0
Time               0
ALP            33017
ALT            33006
AST            33006
Age             1361
Albumin        33147
BUN            31129
Bilirubin      33007
Cholesterol    33596
Creatinine     31125
DiasABP        11525
FiO2           27184
GCS            18198
Gender         32947
Glucose        31112
HCO3           31162
HCT            30379
HR              2566
Height          1361
ICUType        32947
K              30927
Lactate        31718
MAP            11651
MechVent       27319
Mg             31079
NIDiasABP      20969
NIMAP          21117
NISysABP       20953
Na             31020
PaCO2          29752
PaO2           29771
Platelets      31069
RespRate       25535
SaO2           33017
SysABP         11524
Temp           21448
TroponinI      33601
TroponinT      33396
Urine           7813
WBC            31229
Weight         18287
pH             29699
dtype: int64

In [181]:
ICUType_4_test_ids.unique().size

701

In [182]:
more_than_or_equal_to_65_test_ids = test_X[test_X["Age"] >= 65]
more_than_or_equal_to_65_test_ids = more_than_or_equal_to_65_test_ids[more_than_or_equal_to_65_test_ids["Time"] == 0.0]
more_than_or_equal_to_65_test_ids = more_than_or_equal_to_65_test_ids["RecordID"]
more_than_or_equal_to_65_test_missing = test_X[test_X["RecordID"].isin(more_than_or_equal_to_65_test_ids)]
more_than_or_equal_to_65_test_missing = more_than_or_equal_to_65_test_missing.isna().sum()
more_than_or_equal_to_65_test_missing

RecordID           0
level_1            0
Time               0
ALP            61303
ALT            61284
AST            61276
Age             3149
Albumin        61437
BUN            57719
Bilirubin      61271
Cholesterol    62030
Creatinine     57703
DiasABP        27707
FiO2           52014
GCS            42493
Gender         60865
Glucose        58008
HCO3           57822
HCT            56158
HR              5548
Height          3149
ICUType        60865
K              57510
Lactate        59611
MAP            27879
MechVent       52663
Mg             57801
NIDiasABP      35810
NIMAP          36110
NISysABP       35793
Na             57889
PaCO2          54825
PaO2           54833
Platelets      57586
RespRate       46914
SaO2           59522
SysABP         27706
Temp           37962
TroponinI      62007
TroponinT      61281
Urine          17440
WBC            58000
Weight         29064
pH             54459
dtype: int64

In [183]:
more_than_or_equal_to_65_test_ids.unique().size

1295

In [184]:
less_than_65_test_ids = test_X[test_X["Age"] < 65]
less_than_65_test_ids = less_than_65_test_ids[less_than_65_test_ids["Time"] == 0.0]
less_than_65_test_ids = less_than_65_test_ids["RecordID"]
less_than_65_test_missing = test_X[test_X["RecordID"].isin(less_than_65_test_ids)]
less_than_65_test_missing = less_than_65_test_missing.isna().sum()
less_than_65_test_missing

RecordID           0
level_1            0
Time               0
ALP            51859
ALT            51825
AST            51828
Age             2978
Albumin        52207
BUN            49020
Bilirubin      51811
Cholesterol    52899
Creatinine     49007
DiasABP        25043
FiO2           44339
GCS            35581
Gender         51888
Glucose        49214
HCO3           49102
HCT            47811
HR              5135
Height          2978
ICUType        51888
K              48838
Lactate        50734
MAP            25183
MechVent       44779
Mg             49098
NIDiasABP      30000
NIMAP          30344
NISysABP       29977
Na             49073
PaCO2          46943
PaO2           46962
Platelets      48951
RespRate       40506
SaO2           51189
SysABP         25040
Temp           33846
TroponinI      52924
TroponinT      52634
Urine          16905
WBC            49283
Weight         24908
pH             46680
dtype: int64

In [185]:
less_than_65_test_ids.unique().size

1104

In [186]:
filtered_test_X = test_X[(test_X['Height'] != -1) & (test_X['Weight'] != -1) & (test_X['Height'].notna()) & (test_X['Weight'].notna())] 

In [187]:
filtered_test_X_metros = filtered_test_X.copy()
filtered_test_X_metros["Height"] = filtered_test_X["Height"]/100
filtered_test_X_metros["Height"]

528       1.575
549       1.575
550       1.575
551       1.575
552       1.575
          ...  
573884    1.600
573885    1.600
573886    1.600
573887    1.600
574752    1.473
Name: Height, Length: 32426, dtype: float64

In [188]:
bmi_data_test = filtered_test_X_metros
bmi_data_test["BMI"] = round(filtered_test_X_metros["Weight"] / (filtered_test_X_metros["Height"]**2), 1)
bmi_data_test["Classificacao"] = bmi_data_test["BMI"].apply(classify_BMI)
bmi_data_test.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
528,132567,0,0.0,,,,71.0,,,,...,111.5,35.6,,,,,56.0,7.44,22.6,Peso normal
549,132567,21,21.0,,,,71.0,,,,...,110.0,37.5,,,15.0,,55.8,,22.5,Peso normal
550,132567,22,22.0,,,,71.0,,,,...,106.0,37.6,,,20.0,,55.8,,22.5,Peso normal
551,132567,23,23.0,,,,71.0,,,,...,129.0,37.7,,,30.0,,55.8,,22.5,Peso normal
552,132567,24,24.0,,,,71.0,,,,...,94.0,37.8,,,20.0,,55.8,,22.5,Peso normal


In [189]:
bmi_data_test = bmi_data_test.groupby("RecordID").first().reset_index()
bmi_data_test

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH,BMI,Classificacao
0,132567,0,0.0,,,,71.0,,9.0,,...,111.500000,35.600000,,,15.0,9.0,56.0,7.44,22.6,Peso normal
1,132573,0,0.0,,,,77.0,,,,...,,36.900000,,,120.0,,90.1,,34.1,Obesidade grau 1
2,132602,0,0.0,,,,80.0,,,,...,,37.300000,,,150.0,,70.0,,21.5,Peso normal
3,132614,0,0.0,,,,77.0,,,,...,,,,,,,59.0,,22.3,Peso normal
4,132622,0,0.0,,,,71.0,,64.0,,...,,37.400000,19.0,,80.0,7.2,79.0,,30.9,Obesidade grau 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1248,162899,0,0.0,,,,64.0,,11.0,,...,91.666667,36.966667,,,50.0,10.1,74.7,7.39,24.3,Peso normal
1249,162912,0,0.0,34.0,17.0,24.0,63.0,2.3,6.0,0.4,...,123.133333,35.000000,,,370.0,5.4,80.0,7.03,25.3,Sobrepeso
1250,162929,0,0.0,,,,63.0,,26.0,,...,113.000000,37.000000,,,55.0,13.0,100.0,7.41,36.7,Obesidade grau 2
1251,162944,0,0.0,,,,58.0,,21.0,,...,115.000000,35.150000,,,400.0,19.6,121.1,7.56,47.3,Obesidade grau 3


In [190]:
bmi_data_test["Classificacao"].value_counts()

Classificacao
Sobrepeso           460
Peso normal         356
Obesidade grau 1    205
Obesidade grau 2    115
Obesidade grau 3     82
Baixo peso           35
Name: count, dtype: int64

In [191]:
classificacao_undefined_ids_test = bmi_data_test["RecordID"]
classificacao_undefined_missing_test = test_X[~test_X["RecordID"].isin(classificacao_undefined_ids_test)]
classificacao_undefined_missing_test = classificacao_undefined_missing_test.isna().sum()
classificacao_undefined_missing_test

RecordID           0
level_1            0
Time               0
ALP            54082
ALT            54052
AST            54053
Age             3760
Albumin        54274
BUN            51138
Bilirubin      54038
Cholesterol    54912
Creatinine     51119
DiasABP        33253
FiO2           46835
GCS            36523
Gender         53862
Glucose        51134
HCO3           51168
HCT            50178
HR              5729
Height          3760
ICUType        53862
K              50761
Lactate        53151
MAP            33489
MechVent       47669
Mg             51256
NIDiasABP      24923
NIMAP          25356
NISysABP       24907
Na             51023
PaCO2          50980
PaO2           50981
Platelets      51375
RespRate       36717
SaO2           54460
SysABP         33251
Temp           39831
TroponinI      54915
TroponinT      54362
Urine          19248
WBC            51528
Weight         26254
pH             50905
dtype: int64

In [192]:
classificacao_baixo_peso_ids_test = bmi_data_test[bmi_data_test["Classificacao"] == "Baixo peso"]
classificacao_baixo_peso_ids_test = classificacao_baixo_peso_ids_test["RecordID"]
classificacao_baixo_peso_missing_test = test_X[test_X["RecordID"].isin(classificacao_baixo_peso_ids_test)]
classificacao_baixo_peso_missing_test = classificacao_baixo_peso_missing_test.isna().sum()
classificacao_baixo_peso_missing_test

RecordID          0
level_1           0
Time              0
ALP            1654
ALT            1654
AST            1654
Age              73
Albumin        1660
BUN            1553
Bilirubin      1653
Cholesterol    1676
Creatinine     1552
DiasABP         599
FiO2           1389
GCS            1126
Gender         1645
Glucose        1563
HCO3           1554
HCT            1495
HR              122
Height           73
ICUType        1645
K              1550
Lactate        1604
MAP             585
MechVent       1396
Mg             1552
NIDiasABP      1134
NIMAP          1137
NISysABP       1130
Na             1547
PaCO2          1466
PaO2           1462
Platelets      1532
RespRate       1390
SaO2           1564
SysABP          599
Temp            998
TroponinI      1679
TroponinT      1660
Urine           460
WBC            1551
Weight          903
pH             1447
dtype: int64

In [193]:
classificacao_baixo_peso_ids_test.unique().size

35

In [194]:
classificacao_normal_peso_ids_test = bmi_data_test[bmi_data_test["Classificacao"] == "Peso normal"]
classificacao_normal_peso_ids_test = classificacao_normal_peso_ids_test["RecordID"]
classificacao_normal_peso_missing_test = test_X[test_X["RecordID"].isin(classificacao_normal_peso_ids_test)]
classificacao_normal_peso_missing_test = classificacao_normal_peso_missing_test.isna().sum()
classificacao_normal_peso_missing_test

RecordID           0
level_1            0
Time               0
ALP            16803
ALT            16794
AST            16793
Age              734
Albumin        16869
BUN            15804
Bilirubin      16788
Cholesterol    17055
Creatinine     15799
DiasABP         5978
FiO2           14125
GCS            11588
Gender         16732
Glucose        15911
HCO3           15833
HCT            15312
HR              1438
Height           734
ICUType        16732
K              15754
Lactate        16221
MAP             6016
MechVent       14244
Mg             15788
NIDiasABP      11147
NIMAP          11226
NISysABP       11142
Na             15880
PaCO2          14575
PaO2           14587
Platelets      15680
RespRate       14309
SaO2           16066
SysABP          5977
Temp            9198
TroponinI      17046
TroponinT      16932
Urine           4343
WBC            15850
Weight          8451
pH             14430
dtype: int64

In [195]:
classificacao_normal_peso_ids_test.unique().size

356

In [196]:
classificacao_sobrepeso_ids_test = bmi_data_test[bmi_data_test["Classificacao"] == "Sobrepeso"]
classificacao_sobrepeso_ids_test = classificacao_sobrepeso_ids_test["RecordID"]
classificacao_sobrepeso_missing_test = test_X[test_X["RecordID"].isin(classificacao_sobrepeso_ids_test)]
classificacao_sobrepeso_missing_test = classificacao_sobrepeso_missing_test.isna().sum()
classificacao_sobrepeso_missing_test

RecordID           0
level_1            0
Time               0
ALP            21696
ALT            21685
AST            21684
Age              933
Albumin        21799
BUN            20416
Bilirubin      21683
Cholesterol    22022
Creatinine     20413
DiasABP         6561
FiO2           18190
GCS            15405
Gender         21620
Glucose        20622
HCO3           20491
HCT            19703
HR              1937
Height           933
ICUType        21620
K              20454
Lactate        21111
MAP             6584
MechVent       18210
Mg             20412
NIDiasABP      15594
NIMAP          15654
NISysABP       15588
Na             20581
PaCO2          18627
PaO2           18639
Platelets      20248
RespRate       19201
SaO2           20616
SysABP          6560
Temp           11422
TroponinI      22035
TroponinT      21863
Urine           5505
WBC            20459
Weight         10051
pH             18381
dtype: int64

In [197]:
classificacao_sobrepeso_ids_test.unique().size

460

In [198]:
classificacao_obesidade_1_ids_test = bmi_data_test[bmi_data_test["Classificacao"] == "Obesidade grau 1"]
classificacao_obesidade_1_ids_test = classificacao_obesidade_1_ids_test["RecordID"]
classificacao_obesidade_1_missing_test = test_X[test_X["RecordID"].isin(classificacao_obesidade_1_ids_test)]
classificacao_obesidade_1_missing_test = classificacao_obesidade_1_missing_test.isna().sum()
classificacao_obesidade_1_missing_test

RecordID          0
level_1           0
Time              0
ALP            9614
ALT            9611
AST            9608
Age             316
Albumin        9698
BUN            9067
Bilirubin      9611
Cholesterol    9820
Creatinine     9068
DiasABP        3205
FiO2           7967
GCS            6781
Gender         9635
Glucose        9156
HCO3           9094
HCT            8764
HR              721
Height          316
ICUType        9635
K              9057
Lactate        9265
MAP            3219
MechVent       8055
Mg             9085
NIDiasABP      6496
NIMAP          6517
NISysABP       6491
Na             9129
PaCO2          8224
PaO2           8227
Platelets      8996
RespRate       8235
SaO2           9223
SysABP         3205
Temp           5270
TroponinI      9815
TroponinT      9723
Urine          2235
WBC            9092
Weight         4377
pH             8141
dtype: int64

In [199]:
classificacao_obesidade_1_ids_test.unique().size

205

In [200]:
classificacao_obesidade_2_ids_test = bmi_data_test[bmi_data_test["Classificacao"] == "Obesidade grau 2"]
classificacao_obesidade_2_ids_test = classificacao_obesidade_2_ids_test["RecordID"]
classificacao_obesidade_2_missing_test = test_X[test_X["RecordID"].isin(classificacao_obesidade_2_ids_test)]
classificacao_obesidade_2_missing_test = classificacao_obesidade_2_missing_test.isna().sum()
classificacao_obesidade_2_missing_test

RecordID          0
level_1           0
Time              0
ALP            5429
ALT            5430
AST            5429
Age             179
Albumin        5453
BUN            5100
Bilirubin      5426
Cholesterol    5513
Creatinine     5097
DiasABP        1763
FiO2           4504
GCS            3849
Gender         5405
Glucose        5146
HCO3           5116
HCT            4928
HR              449
Height          179
ICUType        5405
K              5108
Lactate        5222
MAP            1766
MechVent       4553
Mg             5122
NIDiasABP      3838
NIMAP          3871
NISysABP       3837
Na             5126
PaCO2          4566
PaO2           4568
Platelets      5045
RespRate       4445
SaO2           5112
SysABP         1763
Temp           2901
TroponinI      5507
TroponinT      5467
Urine          1493
WBC            5112
Weight         2504
pH             4533
dtype: int64

In [201]:
classificacao_obesidade_2_ids_test.unique().size

115

In [202]:
classificacao_obesidade_3_ids_test = bmi_data_test[bmi_data_test["Classificacao"] == "Obesidade grau 3"]
classificacao_obesidade_3_ids_test = classificacao_obesidade_3_ids_test["RecordID"]
classificacao_obesidade_3_missing_test = test_X[test_X["RecordID"].isin(classificacao_obesidade_3_ids_test)]
classificacao_obesidade_3_missing_test = classificacao_obesidade_3_missing_test.isna().sum()
classificacao_obesidade_3_missing_test

RecordID          0
level_1           0
Time              0
ALP            3884
ALT            3883
AST            3883
Age             132
Albumin        3891
BUN            3661
Bilirubin      3883
Cholesterol    3931
Creatinine     3662
DiasABP        1391
FiO2           3343
GCS            2802
Gender         3854
Glucose        3690
HCO3           3668
HCT            3589
HR              287
Height          132
ICUType        3854
K              3664
Lactate        3771
MAP            1403
MechVent       3315
Mg             3684
NIDiasABP      2678
NIMAP          2693
NISysABP       2675
Na             3676
PaCO2          3330
PaO2           3331
Platelets      3661
RespRate       3123
SaO2           3670
SysABP         1391
Temp           2188
TroponinI      3934
TroponinT      3908
Urine          1061
WBC            3691
Weight         1432
pH             3302
dtype: int64

In [203]:
classificacao_obesidade_3_ids_test.unique().size

82

In [204]:
df_missing_test = pd.DataFrame(columns=df_columns)
df_missing_transpose_test = df_missing_test.T
df_missing_transpose_test ["Female"] = female_gender_missing_rate_test
df_missing_transpose_test ["Male"] = male_gender_missing_rate_test
df_missing_transpose_test["Undefined gender"] = undefined_gender_missing_rate_test
df_missing_transpose_test["ICUType 1"] = ICUType_1_test_missing
df_missing_transpose_test ["ICUType 2"] = ICUType_2_test_missing
df_missing_transpose_test ["ICUType 3"] = ICUType_3_test_missing
df_missing_transpose_test ["ICUType 4"] = ICUType_4_test_missing
df_missing_transpose_test["Age 65+"] = more_than_or_equal_to_65_test_missing
df_missing_transpose_test["Age 65-"] = less_than_65_test_missing
df_missing_transpose_test ["Low Weight"] = classificacao_baixo_peso_missing_test
df_missing_transpose_test ["Normal Weight"] = classificacao_normal_peso_missing_test
df_missing_transpose_test ["Overweight"] = classificacao_sobrepeso_missing_test
df_missing_transpose_test ["Obesity Grade 1"] = classificacao_obesidade_1_missing_test
df_missing_transpose_test ["Obesity Grade 2"] = classificacao_obesidade_2_missing_test
df_missing_transpose_test ["Obesity Grade 3"] = classificacao_obesidade_3_missing_test
df_missing_transpose_test["Undefined classification"] = classificacao_undefined_missing_test
df_missing_transpose_test = df_missing_transpose_test.drop("RecordID", axis=0)
df_missing_transpose_test = df_missing_transpose_test.drop("level_1", axis=0)
df_missing_transpose_test = df_missing_transpose_test.drop("Time", axis=0)
df_missing_transpose_test = df_missing_transpose_test.drop("Age", axis=0)
df_missing_transpose_test = df_missing_transpose_test.drop("Gender", axis=0)
display(HTML("<h2 style='text-align: center; font-size: 24px; font-weight: bold;'>original Missing rate per Variable by demographics - Test</h2>"))
df_missing_transpose_test 

Unnamed: 0,Female,Male,Undefined gender,ICUType 1,ICUType 2,ICUType 3,ICUType 4,Age 65+,Age 65-,Low Weight,Normal Weight,Overweight,Obesity Grade 1,Obesity Grade 2,Obesity Grade 3,Undefined classification
ALP,50277,62696,189,15168,24337,40640,33017,61303,51859,1654,16803,21696,9614,5429,3884,54082
ALT,50258,62662,189,15153,24332,40618,33006,61284,51825,1654,16794,21685,9611,5430,3883,54052
AST,50258,62657,189,15152,24331,40615,33006,61276,51828,1654,16793,21684,9608,5429,3883,54053
Albumin,50473,62980,191,15202,24403,40892,33147,61437,52207,1660,16869,21799,9698,5453,3891,54274
BUN,47452,59107,180,14259,22925,38426,31129,57719,49020,1553,15804,20416,9067,5100,3661,51138
Bilirubin,50247,62646,189,15160,24336,40579,33007,61271,51811,1653,16788,21683,9611,5426,3883,54038
Cholesterol,51078,63659,192,15292,24517,41524,33596,62030,52899,1676,17055,22022,9820,5513,3931,54912
Creatinine,47439,59091,180,14240,22927,38418,31125,57703,49007,1552,15799,20413,9068,5097,3662,51119
DiasABP,25131,27553,66,9261,5403,26561,11525,27707,25043,599,5978,6561,3205,1763,1391,33253
FiO2,43317,52860,176,13756,20258,35155,27184,52014,44339,1389,14125,18190,7967,4504,3343,46835
