In [4]:
import os
import sys
import pandas as pd
from IPython.display import display, HTML
# Add the parent directory to the import path (once) so that modules located
# there can be imported from this notebook.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

<h1>Loading the dataset</h1>

In [5]:
# NOTE(review): `pypotsModify` looks like a project-local, modified copy of
# PyPOTS/BenchPOTS — confirm where it lives and how it differs from upstream.
from pypotsModify.benchpots.datasets import preprocess_physionet2012
# Preprocess PhysioNet 2012 with subset="all"; the result is indexed below with
# the keys 'train_X', 'val_X' and 'test_X'.
# NOTE(review): `rate=0.1` is presumably the artificial-missingness rate — confirm.
physionet2012_dataset = preprocess_physionet2012(subset="all", rate=0.1)

2024-11-28 08:53:05 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-11-28 08:53:05 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-11-28 08:53:05 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-11-28 08:53:05 [INFO]: Loaded successfully!


<h2>Training data</h2>

<h3>Loading training dataset</h3>

In [6]:
# Training split; used below as a DataFrame with one row per (RecordID, Time)
# and columns such as Gender, ICUType, Age, Height and Weight.
train_X = physionet2012_dataset['train_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>

In [7]:
# Percentage of training rows per Gender code, rounded to one decimal.
# 0: female, or 1: male
distribution_gender_training = (
    train_X['Gender']
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
)

In [8]:
# Make sure the -1 label ("Undefined gender" in the summary table) exists so it
# can be read by label later, even when no row in this split carries it.
# The check is on the label itself rather than on "exactly three categories":
# a count-based guard would overwrite a real -1 share with 0 whenever some
# other category happened to be missing.
if -1 not in distribution_gender_training.index:
    distribution_gender_training.loc[-1] = 0

<h3>Divided into subgroups by ICUType and showing the percentage</h3>

In [9]:
# Percentage of training rows per ICUType code, rounded to one decimal.
distribution_ICUType_training = (
    train_X['ICUType']
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
)

<h3>Filtering one measurement per patient</h3>

In [10]:
# Keep only the rows recorded at Time == 0, which gives one row per record
# (the count of RecordID equals the count of rows in the output below).
uniques_train_per_variable = train_X.loc[train_X["Time"].eq(0.0)]
uniques_train_per_variable

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,Cholesterol,Creatinine,DiasABP,FiO2,GCS,Gender,Glucose,HCO3,HCT,HR,Height,ICUType,K,Lactate,MAP,MechVent,Mg,NIDiasABP,NIMAP,NISysABP,Na,PaCO2,PaO2,Platelets,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132539,0,0.0,,,,54.0,,,,,,,,15.0,0.0,,,,75.000000,-1.0,4.0,,,,,,61.500000,91.665000,152.000000,,,,,19.0,,,35.35,,,480.0,,-1.0,
48,132540,0,0.0,,,,76.0,,,,,,,,,1.0,,,,,175.3,2.0,,,,,,,,,,34.0,344.0,,,,,,,,,,76.0,7.45
96,132541,0,0.0,,,,44.0,,,,,,,,7.0,0.0,,,,89.000000,-1.0,3.0,,,,,,83.500000,99.650000,132.000000,,,,,,,,37.80,,,100.0,,56.7,
240,132547,0,0.0,,,,64.0,,,,,,,,,1.0,,,,,180.3,1.0,,,,,,,,,,,,,,,,,,,,,114.0,
288,132548,0,0.0,,,,68.0,,,,,,,,15.0,0.0,,,,73.000000,162.6,3.0,,,,,,83.500000,116.500000,182.500000,,,,,14.5,,,36.30,,,,,87.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
575040,163017,0,0.0,,,,64.0,,,,,,,,15.0,0.0,,,,101.000000,-1.0,3.0,,,,,,71.666667,101.676667,161.666667,,,,,23.0,,,37.00,,,200.0,,84.5,
575184,163029,0,0.0,,,,61.0,,,,,,,,,0.0,,,,,172.7,4.0,,,,,,,,,,,,,,,,,,,,,85.0,
575232,163033,0,0.0,,,,51.0,,,,,,,,,0.0,,,,,-1.0,3.0,,,,,,,,,,,,,,,,,,,,,208.0,
575280,163034,0,0.0,,,,60.0,,,,,,,,,0.0,,,,,172.7,4.0,,2.1,,,,,,,,37.0,167.0,,,98.0,,,,,,,85.0,7.43


In [11]:
# Non-null count per column among the Time == 0 training rows.
uniques_train_per_variable.count()

RecordID       7671
level_1        7671
Time           7671
ALP             346
ALT             354
AST             353
Age            7671
Albumin         304
BUN             914
Bilirubin       343
Cholesterol      37
Creatinine      914
DiasABP        1429
FiO2           1424
GCS            2670
Gender         7671
Glucose         805
HCO3            825
HCT            1026
HR             3802
Height         7671
ICUType        7671
K               823
Lactate         910
MAP            1381
MechVent       1345
Mg              629
NIDiasABP      2830
NIMAP          2815
NISysABP       2832
Na              815
PaCO2          1754
PaO2           1726
Platelets      1040
RespRate       1139
SaO2            264
SysABP         1430
Temp           2876
TroponinI        30
TroponinT       202
Urine          2352
WBC             953
Weight         7671
pH             1767
dtype: int64

<h3>Showing total occurrences of age</h3>

In [12]:
# Number of Time == 0 training rows with an Age value.
# The original note read "60% of 11988 (training set)"; going by the recorded
# outputs of the three splits (7671 + 1918 + 2399 = 11988) the training share
# is about 64%.
uniques_train_per_variable["Age"].count()

7671

<h3>Dividing age into subgroup of people aged 65+</h3>

In [13]:
# Rows of the Time == 0 training frame with Age >= 65, and their share (in %,
# one decimal) of all rows that have an Age value.
more_than_or_equal_to_65_training = uniques_train_per_variable.loc[
    uniques_train_per_variable['Age'].ge(65)
]
percentage_group_one_train = round(
    more_than_or_equal_to_65_training["Age"].count()
    / uniques_train_per_variable["Age"].count()
    * 100,
    1,
)

<h3>Dividing age into subgroup of people under 65 years of age</h3>

In [14]:
# Rows of the Time == 0 training frame with Age < 65, and their share (in %,
# one decimal) of all rows that have an Age value.
less_than_65 = uniques_train_per_variable.loc[
    uniques_train_per_variable['Age'].lt(65)
]
percentage_group_two_train = round(
    less_than_65["Age"].count()
    / uniques_train_per_variable["Age"].count()
    * 100,
    1,
)

<h3>Filtering only one height and one weight per patient</h3>

<h3>Calculate the BMI</h3>

In [15]:
def classify_BMI(BMI):
    """Map a body-mass-index value to its (Portuguese) weight-class label.

    Labels and ranges:
        "Baixo peso"        (underweight)      BMI <= 18.5
        "Peso normal"       (normal weight)    18.5 < BMI < 25
        "Sobrepeso"         (overweight)       25 <= BMI < 30
        "Obesidade grau 1"  (obesity grade 1)  30 <= BMI < 35
        "Obesidade grau 2"  (obesity grade 2)  35 <= BMI < 40
        "Obesidade grau 3"  (obesity grade 3)  BMI >= 40

    The ranges are contiguous, so values that are not rounded to one decimal
    (e.g. 24.95) are classified instead of falling between two closed
    intervals and yielding None. Every value that the earlier closed-interval
    version classified keeps the same label.

    Parameters:
        BMI: numeric BMI value.

    Returns:
        The label string, or None when BMI is NaN.
    """
    # NaN is the only value that differs from itself; keep returning None for
    # it rather than letting it fall through to the last category.
    if BMI != BMI:
        return None
    if BMI <= 18.5:
        return "Baixo peso"
    if BMI < 25:
        return "Peso normal"
    if BMI < 30:
        return "Sobrepeso"
    if BMI < 35:
        return "Obesidade grau 1"
    if BMI < 40:
        return "Obesidade grau 2"
    return "Obesidade grau 3"

In [16]:
# Number of distinct RecordID values in the training split; used as the
# denominator of the BMI-class percentages.
# Counted directly with nunique() instead of building a per-record frame with
# groupby().first() and then rebinding the same name from a DataFrame to a
# scalar.
train_count = train_X["RecordID"].nunique()
train_count

7671

In [17]:
# Keep rows whose Height and Weight are both present and different from -1
# (-1 appears to be a missing-value marker — confirm), then collapse to one row
# per RecordID; groupby().first() takes the first non-null value of each column.
filtered_train = (
    train_X[
        train_X['Height'].ne(-1)
        & train_X['Weight'].ne(-1)
        & train_X['Height'].notna()
        & train_X['Weight'].notna()
    ]
    .groupby("RecordID")
    .first()
    .reset_index()
)
filtered_train_ids = filtered_train["RecordID"]

In [18]:
# Number of training records that have a usable Height and Weight.
filtered_train_ids.count()

4034

In [19]:
# Records whose RecordID is not in filtered_train_ids, i.e. those for which no
# BMI can be computed; one row per record.
undefined_train = (
    train_X.loc[~train_X["RecordID"].isin(filtered_train_ids)]
    .groupby("RecordID")
    .first()
    .reset_index()
)
undefined_train_ids = undefined_train["RecordID"]

In [20]:
# Number of training records without a usable Height/Weight pair.
undefined_train_ids.count()

3637

In [21]:
# New frame with Height divided by 100 (presumably centimetres -> metres, as
# the "_metros" suffix suggests — confirm); filtered_train itself is untouched.
filtered_train_metros = filtered_train.assign(
    Height=filtered_train["Height"] / 100
)

In [22]:
# BMI = Weight / Height**2, rounded to one decimal.
filtered_train_metros["BMI"] = (
    filtered_train_metros["Weight"]
    .div(filtered_train_metros["Height"].pow(2))
    .round(1)
)

In [23]:
# Label every record with its BMI class.
filtered_train_metros["Classification"] = filtered_train_metros["BMI"].map(classify_BMI)

In [24]:
# Share (in %, one decimal) of train_count classified "Baixo peso" (low weight).
baixo_peso_train_percentage = round(
    filtered_train_metros.loc[
        filtered_train_metros["Classification"].eq("Baixo peso"), "RecordID"
    ].count()
    / train_count
    * 100,
    1,
)
baixo_peso_train_percentage

1.7

In [25]:
# Share (in %, one decimal) of train_count classified "Peso normal" (normal weight).
peso_normal_train_percentage = round(
    filtered_train_metros.loc[
        filtered_train_metros["Classification"].eq("Peso normal"), "RecordID"
    ].count()
    / train_count
    * 100,
    1,
)
peso_normal_train_percentage

16.1

In [26]:
# Share (in %, one decimal) of train_count classified "Sobrepeso" (overweight).
sobrepeso_train_percentage = round(
    filtered_train_metros.loc[
        filtered_train_metros["Classification"].eq("Sobrepeso"), "RecordID"
    ].count()
    / train_count
    * 100,
    1,
)
sobrepeso_train_percentage

18.0

In [27]:
# Share (in %, one decimal) of train_count classified "Obesidade grau 1" (obesity grade 1).
obesidade_1_train_percentege = round(
    filtered_train_metros.loc[
        filtered_train_metros["Classification"].eq("Obesidade grau 1"), "RecordID"
    ].count()
    / train_count
    * 100,
    1,
)
obesidade_1_train_percentege

9.4

In [28]:
# Share (in %, one decimal) of train_count classified "Obesidade grau 2" (obesity grade 2).
obesidade_2_train_percentege = round(
    filtered_train_metros.loc[
        filtered_train_metros["Classification"].eq("Obesidade grau 2"), "RecordID"
    ].count()
    / train_count
    * 100,
    1,
)
obesidade_2_train_percentege

3.9

In [29]:
# Share (in %, one decimal) of train_count classified "Obesidade grau 3" (obesity grade 3).
obesidade_3_train_percentege = round(
    filtered_train_metros.loc[
        filtered_train_metros["Classification"].eq("Obesidade grau 3"), "RecordID"
    ].count()
    / train_count
    * 100,
    1,
)
obesidade_3_train_percentege

3.5

In [30]:
# Share (in %, one decimal) of train_count for which no BMI could be computed.
undefined_train_percentege = round(
    undefined_train.loc[:, "RecordID"].count() / train_count * 100,
    1,
)
undefined_train_percentege

47.4

<h2>Validation data</h2>

<h3>Loading validation dataset</h3>

In [31]:
# Validation split (stored under the 'val_X' key); used below as a DataFrame
# with one row per (RecordID, Time).
validation_X = physionet2012_dataset['val_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>

In [32]:
# Percentage of validation rows per Gender code, rounded to one decimal.
# 0: female, or 1: male
distribution_gender_validation = (
    validation_X['Gender']
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
)

In [33]:
# Make sure the -1 label ("Undefined gender" in the summary table) exists so it
# can be read by label later, even when no row in this split carries it.
# The check is on the label itself rather than on "exactly three categories":
# a count-based guard would overwrite a real -1 share with 0 whenever some
# other category happened to be missing.
if -1 not in distribution_gender_validation.index:
    distribution_gender_validation.loc[-1] = 0

<h3>Divided into subgroups by ICUType and showing the percentage</h3>

In [34]:
# Percentage of validation rows per ICUType code, rounded to one decimal.
distribution_ICUType_validation = (
    validation_X['ICUType']
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
)

<h3>Filtering only one age per patient</h3>

In [35]:
# Keep only the validation rows recorded at Time == 0 (one row per record in
# the output below).
uniques_validation_per_variable = validation_X.loc[validation_X["Time"].eq(0.0)]
uniques_validation_per_variable

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,Cholesterol,Creatinine,DiasABP,FiO2,GCS,Gender,Glucose,HCO3,HCT,HR,Height,ICUType,K,Lactate,MAP,MechVent,Mg,NIDiasABP,NIMAP,NISysABP,Na,PaCO2,PaO2,Platelets,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
192,132545,0,0.0,,,,88.0,,,,,,,,,0.0,,,,,-1.0,3.0,,,,,,,,,,,,,,,,,,,,,-1.0,
528,132567,0,0.0,,,,71.0,,,,,,58.00,1.0,,0.0,,,,84.000000,157.5,2.0,,,79.00,1.0,,,,,,35.0,245.0,,,,111.50,35.6,,,,,56.0,7.44
576,132568,0,0.0,,,,66.0,,,,,,,0.7,,0.0,,,,87.333333,157.5,3.0,,,,,,58.333333,71.666667,115.333333,,,,,,,,,,,220.0,,84.5,
1392,132602,0,0.0,,,,80.0,,,,,,,,15.0,1.0,,,,67.000000,180.3,3.0,,,,,,72.000000,86.000000,130.000000,,,,,,,,37.3,,,150.0,,70.0,
1536,132612,0,0.0,,,,52.0,,,,,,,1.0,11.0,1.0,,,,102.000000,-1.0,4.0,,,,1.0,,43.000000,59.670000,93.000000,,,,,,,,35.3,,,,,109.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
573600,162929,0,0.0,,,,63.0,,,,,,,,,1.0,,,,,165.1,2.0,,,,,,,,,,42.0,460.0,,,,,,,,,,100.0,7.41
573888,162946,0,0.0,,,,65.0,,,,,,,,,1.0,,,,,167.6,2.0,,,,,,,,,,36.0,226.0,,,,,,,,,,97.9,7.37
574272,162980,0,0.0,,,,88.0,,,,,,,,,1.0,,,,,160.0,2.0,,,,,,,,,,35.0,300.0,,,,,,,,,,76.5,7.38
574320,162981,0,0.0,,,,90.0,,49.0,,,1.0,,,,0.0,177.0,28.0,21.3,,-1.0,3.0,4.9,,,,2.0,,,,140.0,,,151.0,,,,,,0.02,,7.7,49.5,


<h3>Showing total occurrences of age</h3>

In [36]:
# Number of Time == 0 validation rows with an Age value.
uniques_validation_per_variable["Age"].count()

1918

<h3>Dividing age into subgroup of people aged 65+</h3>

In [37]:
# Rows of the Time == 0 validation frame with Age >= 65, and their share (in %,
# one decimal) of all rows that have an Age value.
more_than_or_equal_to_65_validation = uniques_validation_per_variable.loc[
    uniques_validation_per_variable['Age'].ge(65)
]
percentage_group_one_validation = round(
    more_than_or_equal_to_65_validation["Age"].count()
    / uniques_validation_per_variable["Age"].count()
    * 100,
    1,
)

<h3>Dividing age into subgroup of people under 65 years of age</h3>

In [38]:
# Rows of the Time == 0 validation frame with Age < 65, and their share (in %,
# one decimal) of all rows that have an Age value.
less_than_65 = uniques_validation_per_variable.loc[
    uniques_validation_per_variable['Age'].lt(65)
]
percentage_group_two_validation = round(
    less_than_65["Age"].count()
    / uniques_validation_per_variable["Age"].count()
    * 100,
    1,
)

<h3>Filtering only one height and one weight per patient</h3>

In [39]:
# Number of distinct RecordID values in the validation split; used as the
# denominator of the BMI-class percentages.
# Counted directly with nunique() instead of building a per-record frame with
# groupby().first() and then rebinding the same name from a DataFrame to a
# scalar.
validation_count = validation_X["RecordID"].nunique()
validation_count

1918

In [40]:
# Keep rows whose Height and Weight are both present and different from -1
# (-1 appears to be a missing-value marker — confirm), then collapse to one row
# per RecordID; groupby().first() takes the first non-null value of each column.
filtered_validation = (
    validation_X[
        validation_X['Height'].ne(-1)
        & validation_X['Weight'].ne(-1)
        & validation_X['Height'].notna()
        & validation_X['Weight'].notna()
    ]
    .groupby("RecordID")
    .first()
    .reset_index()
)
filtered_validation_ids = filtered_validation["RecordID"]

In [41]:
# Number of validation records that have a usable Height and Weight.
filtered_validation_ids.count()

1011

In [42]:
# Records whose RecordID is not in filtered_validation_ids, i.e. those for
# which no BMI can be computed; one row per record.
undefined_validation = (
    validation_X.loc[~validation_X["RecordID"].isin(filtered_validation_ids)]
    .groupby("RecordID")
    .first()
    .reset_index()
)
undefined_validation_ids = undefined_validation["RecordID"]

In [43]:
# Number of validation records without a usable Height/Weight pair.
undefined_validation_ids.count()

907

In [44]:
# New frame with Height divided by 100 (presumably centimetres -> metres, as
# the "_metros" suffix suggests — confirm); filtered_validation is untouched.
filtered_validation_metros = filtered_validation.assign(
    Height=filtered_validation["Height"] / 100
)

In [45]:
# BMI = Weight / Height**2, rounded to one decimal.
filtered_validation_metros["BMI"] = (
    filtered_validation_metros["Weight"]
    .div(filtered_validation_metros["Height"].pow(2))
    .round(1)
)

In [46]:
# Label every record with its BMI class.
filtered_validation_metros["Classification"] = filtered_validation_metros["BMI"].map(classify_BMI)

In [47]:
# Number of validation records in each BMI class.
filtered_validation_metros["Classification"].value_counts()

Classification
Sobrepeso           351
Peso normal         284
Obesidade grau 1    202
Obesidade grau 3     73
Obesidade grau 2     67
Baixo peso           34
Name: count, dtype: int64

In [48]:
# Share (in %, one decimal) of validation_count classified "Baixo peso" (low weight).
baixo_peso_validation_percentage = round(
    filtered_validation_metros.loc[
        filtered_validation_metros["Classification"].eq("Baixo peso"), "RecordID"
    ].count()
    / validation_count
    * 100,
    1,
)
baixo_peso_validation_percentage

1.8

In [49]:
# Share (in %, one decimal) of validation_count classified "Peso normal" (normal weight).
peso_normal_validation_percentage = round(
    filtered_validation_metros.loc[
        filtered_validation_metros["Classification"].eq("Peso normal"), "RecordID"
    ].count()
    / validation_count
    * 100,
    1,
)
peso_normal_validation_percentage

14.8

In [50]:
# Share (in %, one decimal) of validation_count classified "Sobrepeso" (overweight).
sobrepeso_validation_percentage = round(
    filtered_validation_metros.loc[
        filtered_validation_metros["Classification"].eq("Sobrepeso"), "RecordID"
    ].count()
    / validation_count
    * 100,
    1,
)
sobrepeso_validation_percentage

18.3

In [51]:
# Share (in %, one decimal) of validation_count classified "Obesidade grau 1" (obesity grade 1).
obesidade_1_validation_percentege = round(
    filtered_validation_metros.loc[
        filtered_validation_metros["Classification"].eq("Obesidade grau 1"), "RecordID"
    ].count()
    / validation_count
    * 100,
    1,
)
obesidade_1_validation_percentege

10.5

In [52]:
# Share (in %, one decimal) of validation_count classified "Obesidade grau 2" (obesity grade 2).
obesidade_2_validation_percentege = round(
    filtered_validation_metros.loc[
        filtered_validation_metros["Classification"].eq("Obesidade grau 2"), "RecordID"
    ].count()
    / validation_count
    * 100,
    1,
)
obesidade_2_validation_percentege

3.5

In [53]:
# Share (in %, one decimal) of validation_count classified "Obesidade grau 3" (obesity grade 3).
obesidade_3_validation_percentege = round(
    filtered_validation_metros.loc[
        filtered_validation_metros["Classification"].eq("Obesidade grau 3"), "RecordID"
    ].count()
    / validation_count
    * 100,
    1,
)
obesidade_3_validation_percentege

3.8

In [54]:
# Share (in %, one decimal) of validation_count for which no BMI could be computed.
undefined_validation_percentege = round(
    undefined_validation.loc[:, "RecordID"].count() / validation_count * 100,
    1,
)
undefined_validation_percentege

47.3

<h2>Test data</h2>

In [55]:
# Test split; used below as a DataFrame with one row per (RecordID, Time).
test_X = physionet2012_dataset['test_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>


In [56]:
# Percentage of test rows per Gender code, rounded to one decimal.
# 0: female, or 1: male
distribution_gender_test = (
    test_X['Gender']
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
)

In [57]:
# Make sure the -1 label ("Undefined gender" in the summary table) exists so it
# can be read by label later, even when no row in this split carries it.
# The check is on the label itself rather than on "exactly three categories":
# a count-based guard would overwrite a real -1 share with 0 whenever some
# other category happened to be missing.
if -1 not in distribution_gender_test.index:
    distribution_gender_test.loc[-1] = 0

<h3>Divided into subgroups by ICUType and showing the percentage</h3>


In [58]:
# Percentage of test rows per ICUType code, rounded to one decimal.
distribution_ICUType_test = (
    test_X['ICUType']
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
)

<h3>Filtering only one age per patient</h3>


In [59]:
# One row per RecordID: groupby().first() keeps the first non-null value of
# each column within a record. (The training and validation sections select
# the Time == 0 row instead.)
uniques_test_per_variable = (
    test_X
    .groupby(by='RecordID')
    .first()
    .reset_index(drop=False)
)
uniques_test_per_variable.head(5)

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,Cholesterol,Creatinine,DiasABP,FiO2,GCS,Gender,Glucose,HCO3,HCT,HR,Height,ICUType,K,Lactate,MAP,MechVent,Mg,NIDiasABP,NIMAP,NISysABP,Na,PaCO2,PaO2,Platelets,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,,0.9,,,15.0,1.0,129.0,28.0,41.3,76.333333,180.3,3.0,4.0,,,,2.1,65.0,85.89,127.666667,140.0,,,391.0,18.0,,,36.3,,,600.0,11.5,84.6,
1,132575,0,0.0,,,,78.0,,21.0,,,1.0,56.0,1.0,3.0,1.0,90.0,22.0,26.0,84.4,167.6,2.0,4.7,1.5,76.0,1.0,3.0,,,,137.0,43.0,262.5,121.0,,99.0,111.6,34.72,,,210.0,12.5,63.0,7.34
2,132590,0,0.0,,,,58.0,,18.0,,,0.8,60.4,1.0,3.0,1.0,213.0,21.0,34.6,115.8,188.0,2.0,3.7,4.0,76.2,1.0,2.4,55.0,75.67,117.0,140.0,46.0,328.0,244.0,,98.0,110.0,36.8,,,440.0,22.2,98.0,7.38
3,132591,0,0.0,,,,81.0,,27.0,,,1.4,,0.8,15.0,1.0,140.0,22.0,31.7,65.666667,-1.0,3.0,4.6,1.6,,,2.2,67.333333,83.89,117.0,136.0,,,321.0,27.0,,,37.0,,0.15,30.0,17.1,63.7,
4,132610,0,0.0,,,,72.0,,68.0,,,1.0,45.5,,15.0,1.0,118.0,24.0,20.6,75.0,172.9,3.0,4.2,,63.0,,1.7,42.0,58.33,91.0,148.0,,,116.0,13.0,,103.5,37.2,,,,4.9,72.26,


<h3>Showing total occurrences of age</h3>

In [60]:
# Number of test records with an Age value.
uniques_test_per_variable["Age"].count()

2399

<h3>Dividing age into subgroup of people aged 65+</h3>


In [61]:
# Test records with Age >= 65, and their share (in %, one decimal) of all
# records that have an Age value.
more_than_or_equal_to_65_test = uniques_test_per_variable.loc[
    uniques_test_per_variable['Age'].ge(65)
]
percentage_group_one_test = round(
    more_than_or_equal_to_65_test["Age"].count()
    / uniques_test_per_variable["Age"].count()
    * 100,
    1,
)

<h3>Dividing age into subgroup of people under 65 years of age</h3>


In [62]:
# Test records with Age < 65, and their share (in %, one decimal) of all
# records that have an Age value.
less_than_65_test = uniques_test_per_variable.loc[
    uniques_test_per_variable['Age'].lt(65)
]
percentage_group_two_test = round(
    less_than_65_test["Age"].count()
    / uniques_test_per_variable["Age"].count()
    * 100,
    1,
)

<h3>Filtering only one height and one weight per patient</h3>


In [63]:
# Number of distinct RecordID values in the test split; used as the
# denominator of the BMI-class percentages.
# Counted directly with nunique() instead of building a per-record frame with
# groupby().first() and then rebinding the same name from a DataFrame to a
# scalar.
test_count = test_X["RecordID"].nunique()
test_count

2399

In [64]:
# Keep rows whose Height and Weight are both present and different from -1
# (-1 appears to be a missing-value marker — confirm), then collapse to one row
# per RecordID; groupby().first() takes the first non-null value of each column.
filtered_test = (
    test_X[
        test_X['Height'].ne(-1)
        & test_X['Weight'].ne(-1)
        & test_X['Height'].notna()
        & test_X['Weight'].notna()
    ]
    .groupby("RecordID")
    .first()
    .reset_index()
)
filtered_test_ids = filtered_test["RecordID"]

In [65]:
# Number of test records that have a usable Height and Weight.
filtered_test_ids.count()

1223

In [66]:
# Records whose RecordID is not in filtered_test_ids, i.e. those for which no
# BMI can be computed; one row per record.
undefined_test = (
    test_X.loc[~test_X["RecordID"].isin(filtered_test_ids)]
    .groupby("RecordID")
    .first()
    .reset_index()
)
undefined_test_ids = undefined_test["RecordID"]

In [67]:
# Number of test records without a usable Height/Weight pair.
undefined_test_ids.count()

1176

In [68]:
# New frame with Height divided by 100 (presumably centimetres -> metres, as
# the "_metros" suffix suggests — confirm); filtered_test itself is untouched.
filtered_test_metros = filtered_test.assign(
    Height=filtered_test["Height"] / 100
)

In [69]:
# BMI = Weight / Height**2, rounded to one decimal.
filtered_test_metros["BMI"] = (
    filtered_test_metros["Weight"]
    .div(filtered_test_metros["Height"].pow(2))
    .round(1)
)

In [70]:
# Label every record with its BMI class.
filtered_test_metros["Classification"] = filtered_test_metros["BMI"].map(classify_BMI)

In [71]:
# Number of test records in each BMI class.
filtered_test_metros["Classification"].value_counts()

Classification
Sobrepeso           428
Peso normal         341
Obesidade grau 1    218
Obesidade grau 2    110
Obesidade grau 3     92
Baixo peso           34
Name: count, dtype: int64

In [72]:
# Share (in %, one decimal) of test_count classified "Baixo peso" (low weight).
baixo_peso_test_percentage = round(
    filtered_test_metros.loc[
        filtered_test_metros["Classification"].eq("Baixo peso"), "RecordID"
    ].count()
    / test_count
    * 100,
    1,
)
baixo_peso_test_percentage

1.4

In [73]:
# Share (in %, one decimal) of test_count classified "Peso normal" (normal weight).
peso_normal_test_percentage = round(
    filtered_test_metros.loc[
        filtered_test_metros["Classification"].eq("Peso normal"), "RecordID"
    ].count()
    / test_count
    * 100,
    1,
)
peso_normal_test_percentage

14.2

In [74]:
# Share (in %, one decimal) of test_count classified "Sobrepeso" (overweight).
sobrepeso_test_percentage = round(
    filtered_test_metros.loc[
        filtered_test_metros["Classification"].eq("Sobrepeso"), "RecordID"
    ].count()
    / test_count
    * 100,
    1,
)
sobrepeso_test_percentage

17.8

In [75]:
# Share (in %, one decimal) of test_count classified "Obesidade grau 1" (obesity grade 1).
obesidade_1_test_percentege = round(
    filtered_test_metros.loc[
        filtered_test_metros["Classification"].eq("Obesidade grau 1"), "RecordID"
    ].count()
    / test_count
    * 100,
    1,
)
obesidade_1_test_percentege

9.1

In [76]:
# Share (in %, one decimal) of test_count classified "Obesidade grau 2" (obesity grade 2).
obesidade_2_test_percentege = round(
    filtered_test_metros.loc[
        filtered_test_metros["Classification"].eq("Obesidade grau 2"), "RecordID"
    ].count()
    / test_count
    * 100,
    1,
)
obesidade_2_test_percentege

4.6

In [77]:
# Share (in %, one decimal) of test_count classified "Obesidade grau 3" (obesity grade 3).
obesidade_3_test_percentege = round(
    filtered_test_metros.loc[
        filtered_test_metros["Classification"].eq("Obesidade grau 3"), "RecordID"
    ].count()
    / test_count
    * 100,
    1,
)
obesidade_3_test_percentege

3.8

In [78]:
# Share (in %, one decimal) of test_count for which no BMI could be computed.
undefined_test_percentege = round(
    undefined_test.loc[:, "RecordID"].count() / test_count * 100,
    1,
)
undefined_test_percentege

49.0

<h3>Collecting the percentages of all subgroups (gender, ICU type, age, BMI classification)</h3>

In [79]:
# Row labels of the summary table, in display order:
# gender, ICU type, age band, BMI class.
subgroups = [
    "Female",
    "Male",
    "Undefined gender",
    "ICUType 1",
    "ICUType 2",
    "ICUType 3",
    "ICUType 4",
    "Age 65+",
    "Age 65-",
    "Low Weight",
    "Normal Weight",
    "Overweight",
    "Obesity Grade 1",
    "Obesity Grade 2",
    "Obesity Grade 3",
    "Undefined_classification",
]

df_subgroups = pd.DataFrame({"Subgroups": subgroups})

In [80]:
# Training percentages in the row order of the summary table: gender codes
# 0, 1, -1; ICUType 1-4; age >= 65 and < 65; the six BMI classes; undefined BMI.
train_subgroups = [
    distribution_gender_training[0],
    distribution_gender_training[1],
    distribution_gender_training[-1],
    distribution_ICUType_training[1],
    distribution_ICUType_training[2],
    distribution_ICUType_training[3],
    distribution_ICUType_training[4],
    percentage_group_one_train,
    percentage_group_two_train,
    baixo_peso_train_percentage,
    peso_normal_train_percentage,
    sobrepeso_train_percentage,
    obesidade_1_train_percentege,
    obesidade_2_train_percentege,
    obesidade_3_train_percentege,
    undefined_train_percentege,
]

df_train_subgroups = pd.DataFrame({"train": train_subgroups})

In [81]:
# Validation percentages in the row order of the summary table: gender codes
# 0, 1, -1; ICUType 1-4; age >= 65 and < 65; the six BMI classes; undefined BMI.
validation_subgroups = [
    distribution_gender_validation[0],
    distribution_gender_validation[1],
    distribution_gender_validation[-1],
    distribution_ICUType_validation[1],
    distribution_ICUType_validation[2],
    distribution_ICUType_validation[3],
    distribution_ICUType_validation[4],
    percentage_group_one_validation,
    percentage_group_two_validation,
    baixo_peso_validation_percentage,
    peso_normal_validation_percentage,
    sobrepeso_validation_percentage,
    obesidade_1_validation_percentege,
    obesidade_2_validation_percentege,
    obesidade_3_validation_percentege,
    undefined_validation_percentege,
]

# The column used to be labelled "train" (copy-paste slip); it holds the
# validation values, so label it accordingly.
df_validation_subgroups = pd.DataFrame(validation_subgroups, columns=["validation"])

In [82]:
# Test percentages in the row order of the summary table: gender codes
# 0, 1, -1; ICUType 1-4; age >= 65 and < 65; the six BMI classes; undefined BMI.
test_subgroups = [
    distribution_gender_test[0],
    distribution_gender_test[1],
    distribution_gender_test[-1],
    distribution_ICUType_test[1],
    distribution_ICUType_test[2],
    distribution_ICUType_test[3],
    distribution_ICUType_test[4],
    percentage_group_one_test,
    percentage_group_two_test,
    baixo_peso_test_percentage,
    peso_normal_test_percentage,
    sobrepeso_test_percentage,
    obesidade_1_test_percentege,
    obesidade_2_test_percentege,
    obesidade_3_test_percentege,
    undefined_test_percentege,
]

# The column used to be labelled "train" (copy-paste slip); it holds the test
# values, so label it accordingly.
df_test_subgroups = pd.DataFrame(test_subgroups, columns=["test"])

<h2>Table 3</h2>

In [83]:
# Put the subgroup labels and the train / validation / test percentages side by
# side (the four frames share the same default row index), then name the columns.
table_descriptive_statistics = pd.concat(
    [df_subgroups, df_train_subgroups, df_validation_subgroups, df_test_subgroups],
    axis=1,
)
table_descriptive_statistics.columns = ['Subgroups', 'Train', 'Validation', 'Test']

display(HTML("<h2 style=' font-size: 24px; font-weight: bold;'>Descriptive statistics stratified by demographics</h2>"))
table_descriptive_statistics

Unnamed: 0,Subgroups,Train,Validation,Test
0,Female,43.5,44.6,44.4
1,Male,56.4,55.3,55.5
2,Undefined gender,0.1,0.2,0.1
3,ICUType 1,14.6,16.7,13.5
4,ICUType 2,21.2,19.6,22.0
5,ICUType 3,35.5,35.0,37.2
6,ICUType 4,28.7,28.7,27.3
7,Age 65+,54.5,55.8,54.3
8,Age 65-,45.5,44.2,45.7
9,Low Weight,1.7,1.8,1.4
