In [1]:
import os
import sys
import pandas as pd
from IPython.display import display, HTML
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

<h1>Loading database</h1>

In [2]:
from pypots.benchpots.datasets import preprocess_physionet2012
physionet2012_dataset = preprocess_physionet2012(subset="all", rate=0.1)

2024-11-11 14:18:36 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-11-11 14:18:36 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-11-11 14:18:36 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-11-11 14:18:36 [INFO]: Loaded successfully!


<h2>Training data</h2>

<h3>Loading training dataset</h3>

In [3]:
train_X = physionet2012_dataset['train_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>

In [4]:
distribution_gender_training = train_X['Gender'].value_counts(normalize=True) * 100
distribution_gender_training =  round(distribution_gender_training, 1) # 0: female, or 1: male

<h3>Divided into subgroups by ICUType and showing the percentage</h3>

In [5]:
distribution_ICUType_training = train_X['ICUType'].value_counts(normalize=True) * 100
distribution_ICUType_training =  round(distribution_ICUType_training, 1)

<h3>Filtering one measurement per patient</h3>

In [6]:
uniques_train_per_variable = train_X[train_X["Time"] == 0.0]
uniques_train_per_variable

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132539,0,0.0,,,,54.0,,,,...,19.0,,,35.35,,,480.0,,-1.0,
48,132540,0,0.0,,,,76.0,,,,...,,,,,,,,,76.0,7.45
96,132541,0,0.0,,,,44.0,,,,...,,,,37.80,,,100.0,,56.7,
144,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,18.0,,,36.30,,,,11.5,84.6,
192,132545,0,0.0,,,,88.0,,,,...,,,,,,,,,-1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
575088,163021,0,0.0,,,,72.0,,9.0,,...,,,,,,,,8.6,62.0,
575184,163029,0,0.0,,,,61.0,,,,...,,,,,,,,,85.0,
575232,163033,0,0.0,,,,51.0,,,,...,,,,,,,,,208.0,
575280,163034,0,0.0,,,,60.0,,,,...,,98.0,,,,,,,85.0,7.43


In [7]:
uniques_train_per_variable.count()

RecordID       7671
level_1        7671
Time           7671
ALP             370
ALT             378
AST             379
Age            7671
Albumin         314
BUN             953
Bilirubin       370
Cholesterol      39
Creatinine      954
DiasABP        1414
FiO2           1422
GCS            2736
Gender         7671
Glucose         839
HCO3            861
HCT            1035
HR             3898
Height         7671
ICUType        7671
K               856
Lactate         877
MAP            1372
MechVent       1343
Mg              666
NIDiasABP      2935
NIMAP          2915
NISysABP       2939
Na              849
PaCO2          1775
PaO2           1743
Platelets      1054
RespRate       1177
SaO2            270
SysABP         1415
Temp           2922
TroponinI        30
TroponinT       184
Urine          2410
WBC             965
Weight         7671
pH             1794
dtype: int64

<h3>Showing total occurrences of age</h3>

In [8]:
# 60% de 11988 (training set)
uniques_train_per_variable["Age"].count()

7671

<h3>Dividing age into subgroup of people aged 65+</h3>

In [9]:
more_than_or_equal_to_65_training = uniques_train_per_variable[uniques_train_per_variable['Age'] >= 65]
percentage_group_one_train = (more_than_or_equal_to_65_training["Age"].count() / uniques_train_per_variable["Age"].count())*100
percentage_group_one_train =  round(percentage_group_one_train, 1)

<h3>Dividing age into subgroup of people under 65 years of age</h3>

In [10]:
less_than_65 = uniques_train_per_variable[uniques_train_per_variable['Age'] < 65]
percentage_group_two_train = (less_than_65["Age"].count() / uniques_train_per_variable["Age"].count()) * 100
percentage_group_two_train =  round(percentage_group_two_train, 1)

<h3>Filtering only one height and one weight per patient</h3>

<h3>Calculate the BMI</h3>

In [11]:
def classify_BMI(BMI):
    if BMI <= 18.5:
        return "Baixo peso"
    elif BMI >= 18.6 and BMI <= 24.9:
        return "Peso normal"
    elif BMI >= 25 and BMI <= 29.9:
        return "Sobrepeso"
    elif BMI >= 30 and BMI <= 34.9:
        return "Obesidade grau 1"
    elif BMI >= 35 and BMI <= 39.9:
        return "Obesidade grau 2"
    elif BMI >= 40:
        return "Obesidade grau 3"

In [12]:
train_count = train_X.groupby("RecordID").first().reset_index()
train_count = train_count["RecordID"].count()
train_count

7671

In [13]:
filtered_train = train_X[(train_X['Height'] != -1) & (train_X['Weight'] != -1) & (train_X['Height'].notna()) & (train_X['Weight'].notna())]
filtered_train = filtered_train.groupby("RecordID").first().reset_index()
filtered_train_ids = filtered_train["RecordID"]

In [14]:
filtered_train_ids.count()

3990

In [15]:
undefined_train = train_X[~train_X["RecordID"].isin(filtered_train_ids)]
undefined_train = undefined_train.groupby("RecordID").first().reset_index()
undefined_train_ids = undefined_train["RecordID"]

In [16]:
undefined_train_ids.count()

3681

In [17]:
filtered_train_metros = filtered_train.copy()
filtered_train_metros["Height"] = filtered_train_metros["Height"]/100

In [18]:
filtered_train_metros["BMI"] = round(filtered_train_metros["Weight"]/(filtered_train_metros["Height"]**2),1)

In [19]:
filtered_train_metros["Classification"] = filtered_train_metros["BMI"].apply(classify_BMI)

In [20]:
baixo_peso_train_percentage = filtered_train_metros[filtered_train_metros["Classification"] == "Baixo peso"]
baixo_peso_train_percentage = round((baixo_peso_train_percentage["RecordID"].count()/train_count)*100,1)
baixo_peso_train_percentage

1.7

In [21]:
peso_normal_train_percentage = filtered_train_metros[filtered_train_metros["Classification"] == "Peso normal"]
peso_normal_train_percentage = round((peso_normal_train_percentage["RecordID"].count()/train_count)*100,1)
peso_normal_train_percentage

15.7

In [22]:
sobrepeso_train_percentage = filtered_train_metros[filtered_train_metros["Classification"]=="Sobrepeso"]
sobrepeso_train_percentage = round((sobrepeso_train_percentage["RecordID"].count()/train_count)*100,1)
sobrepeso_train_percentage

17.9

In [23]:
obesidade_1_train_percentege = filtered_train_metros[filtered_train_metros["Classification"] == "Obesidade grau 1"]
obesidade_1_train_percentege = round((obesidade_1_train_percentege["RecordID"].count()/train_count)*100,1)
obesidade_1_train_percentege

9.2

In [24]:
obesidade_2_train_percentege = filtered_train_metros[filtered_train_metros["Classification"] == "Obesidade grau 2"]
obesidade_2_train_percentege = round((obesidade_2_train_percentege["RecordID"].count()/train_count)*100,1)
obesidade_2_train_percentege

3.9

In [25]:
obesidade_3_train_percentege = filtered_train_metros[filtered_train_metros["Classification"] == "Obesidade grau 3"]
obesidade_3_train_percentege = round((obesidade_3_train_percentege["RecordID"].count()/train_count)*100,1)
obesidade_3_train_percentege

3.6

In [29]:
undefined_train_percentege = round((undefined_train["RecordID"].count()/train_count)*100,1)
undefined_train_percentege

48.0

<h2>Validation data</h2>

<h3>Loading validation dataset </h3>

In [33]:
validation_X = physionet2012_dataset['val_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>

In [34]:
distribution_gender_validation = validation_X['Gender'].value_counts(normalize=True) * 100
distribution_gender_validation =  round(distribution_gender_validation, 1) # 0: female, or 1: male

<h3>Divided into subgroups by ICUType and showing the percentage</h3>

In [32]:
distribution_ICUType_validation = validation_X['ICUType'].value_counts(normalize=True) * 100
distribution_ICUType_validation = round(distribution_ICUType_validation, 1)

<h3>Filtering only one age per patient</h3>

In [35]:
uniques_validation_per_variable = validation_X[validation_X["Time"] == 0.0]
uniques_validation_per_variable

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
384,132554,0,0.0,,,,64.0,,,,...,,,,,,,,,60.7,
432,132555,0,0.0,,,,74.0,,,,...,,,98.0,34.80,,,35.0,,66.1,7.39
528,132567,0,0.0,,,,71.0,,,,...,,,111.5,35.60,,,,,56.0,7.44
576,132568,0,0.0,,,,66.0,,,,...,,,,,,,220.0,,84.5,
816,132582,0,0.0,,,,84.0,,,,...,,,,,,,,,82.5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574608,162995,0,0.0,60.0,21.0,20.0,84.0,,93.0,0.4,...,,98.0,121.0,37.10,0.6,,,17.1,96.5,7.31
574704,162999,0,0.0,,,,70.0,,30.0,,...,,,0.0,36.30,,,,2.5,68.1,
574848,163007,0,0.0,,,,19.0,,,,...,,,0.0,40.55,,,150.0,,114.3,
574896,163008,0,0.0,,,,59.0,,,,...,,,,,,,,,98.5,


<h3>Showing total occurrences of age</h3>

In [36]:
uniques_validation_per_variable["Age"].count()

1918

<h3>Dividing age into subgroup of people aged 65+</h3>

In [37]:
more_than_or_equal_to_65_validation = uniques_validation_per_variable[uniques_validation_per_variable['Age'] >= 65]
percentage_group_one_validation = (more_than_or_equal_to_65_validation["Age"].count() / uniques_validation_per_variable["Age"].count())*100
percentage_group_one_validation = round(percentage_group_one_validation, 1)

<h3>Dividing age into subgroup of people under 65 years of age</h3>

In [38]:
less_than_65 = uniques_validation_per_variable[uniques_validation_per_variable['Age'] < 65]
percentage_group_two_validation = (less_than_65["Age"].count() / uniques_validation_per_variable["Age"].count()) * 100
percentage_group_two_validation =  round(percentage_group_two_validation, 1)

<h3>Filtering only one height and one weight per patient</h3>

In [41]:
validation_count = validation_X.groupby("RecordID").first().reset_index()
validation_count = validation_count["RecordID"].count()
validation_count

1918

In [43]:
filtered_validation = validation_X[(validation_X['Height'] != -1) & (validation_X['Weight'] != -1) & (validation_X['Height'].notna()) & (validation_X['Weight'].notna())]
filtered_validation = filtered_validation.groupby("RecordID").first().reset_index()
filtered_validation_ids = filtered_validation["RecordID"]

In [44]:
filtered_validation_ids.count()

1002

In [45]:
undefined_validation = validation_X[~validation_X["RecordID"].isin(filtered_validation_ids)]
undefined_validation = undefined_validation.groupby("RecordID").first().reset_index()
undefined_validation_ids = undefined_validation["RecordID"]

In [46]:
undefined_validation_ids.count()

916

In [48]:
filtered_validation_metros = filtered_validation.copy()
filtered_validation_metros["Height"] = filtered_validation_metros["Height"]/100

In [50]:
filtered_validation_metros["BMI"] = round(filtered_validation_metros["Weight"]/(filtered_validation_metros["Height"]**2),1)

In [51]:
filtered_validation_metros["Classification"] = filtered_validation_metros["BMI"].apply(classify_BMI)

In [52]:
filtered_validation_metros["Classification"].value_counts()

Classification
Sobrepeso           337
Peso normal         287
Obesidade grau 1    203
Obesidade grau 3     74
Obesidade grau 2     73
Baixo peso           28
Name: count, dtype: int64

In [53]:
baixo_peso_validation_percentage = filtered_validation_metros[filtered_validation_metros["Classification"] == "Baixo peso"]
baixo_peso_validation_percentage = round((baixo_peso_validation_percentage["RecordID"].count()/validation_count)*100,1)
baixo_peso_validation_percentage

1.5

In [54]:
peso_normal_validation_percentage = filtered_validation_metros[filtered_validation_metros["Classification"] == "Peso normal"]
peso_normal_validation_percentage = round((peso_normal_validation_percentage["RecordID"].count()/validation_count)*100,1)
peso_normal_validation_percentage

15.0

In [55]:
sobrepeso_validation_percentage = filtered_validation_metros[filtered_validation_metros["Classification"]=="Sobrepeso"]
sobrepeso_validation_percentage = round((sobrepeso_validation_percentage["RecordID"].count()/validation_count)*100,1)
sobrepeso_validation_percentage

17.6

In [56]:
obesidade_1_validation_percentege = filtered_validation_metros[filtered_validation_metros["Classification"] == "Obesidade grau 1"]
obesidade_1_validation_percentege = round((obesidade_1_validation_percentege["RecordID"].count()/validation_count)*100,1)
obesidade_1_validation_percentege

10.6

In [57]:
obesidade_2_validation_percentege = filtered_validation_metros[filtered_validation_metros["Classification"] == "Obesidade grau 2"]
obesidade_2_validation_percentege = round((obesidade_2_validation_percentege["RecordID"].count()/validation_count)*100,1)
obesidade_2_validation_percentege

3.8

In [58]:
obesidade_3_validation_percentege = filtered_validation_metros[filtered_validation_metros["Classification"] == "Obesidade grau 3"]
obesidade_3_validation_percentege = round((obesidade_3_validation_percentege["RecordID"].count()/validation_count)*100,1)
obesidade_3_validation_percentege

3.9

In [59]:
undefined_validation_percentege = round((undefined_validation["RecordID"].count()/validation_count)*100,1)
undefined_validation_percentege

47.8

<h2>Test data</h2>

In [123]:
test_X = physionet2012_dataset['test_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>


In [124]:
distribution_gender_test = test_X['Gender'].value_counts(normalize=True) * 100
distribution_gender_test = round(distribution_gender_test, 1) # 0: female, or 1: male

<h3>Divided into subgroups by ICUType and showing the percentage</h3>


In [125]:
distribution_ICUType_test= test_X['ICUType'].value_counts(normalize=True) * 100
distribution_ICUType_test = round(distribution_ICUType_test, 1)

<h3>Filtering only one age per patient</h3>


In [126]:
uniques_test_per_variable = test_X.groupby('RecordID').first().reset_index()
uniques_test_per_variable.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132540,0,0.0,,,,76.0,,16.0,,...,,99.0,103.0,34.88,,,316.666667,7.4,76.0,7.45
1,132551,0,0.0,47.0,46.0,82.0,78.0,1.9,81.0,0.3,...,,97.0,102.75,38.0,3.5,,120.0,16.1,48.4,7.4
2,132555,0,0.0,,,,74.0,,19.0,,...,,99.0,98.0,34.8,,,35.0,9.0,66.1,7.39
3,132570,0,0.0,19.0,15.0,20.0,84.0,,83.0,0.1,...,16.5,98.0,,36.6,,,600.0,8.8,102.6,
4,132577,0,0.0,,,,65.0,,36.0,,...,24.0,96.0,145.0,38.8,,,80.0,9.4,66.3,7.34


<h3>Showing total occurrences of age</h3>

In [127]:
uniques_test_per_variable["Age"].count()

2399

<h3>Dividing age into subgroup of people aged 65+</h3>


In [128]:
more_than_or_equal_to_65_test = uniques_test_per_variable[uniques_test_per_variable['Age'] >= 65]
percentage_group_one_test = (more_than_or_equal_to_65_test["Age"].count() / uniques_test_per_variable["Age"].count())*100
percentage_group_one_test = round(percentage_group_one_test, 1)

<h3>Dividing age into subgroup of people under 65 years of age</h3>


In [129]:
less_than_65_test = uniques_test_per_variable[uniques_test_per_variable['Age'] < 65]
percentage_group_two_test = (less_than_65_test["Age"].count() / uniques_test_per_variable["Age"].count()) * 100
percentage_group_two_test = round(percentage_group_two_test, 1)

<h3>Filtering only one height and one weight per patient</h3>


In [130]:
filtered_test_X = test_X[(test_X['Height'] != -1) & (test_X['Weight'] != -1) & (test_X['Height'].notna()) & (test_X['Weight'].notna())] 

In [131]:
filtered_uniques_test = filtered_test_X.groupby('RecordID').first().reset_index() 
filtered_uniques_test.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132540,0,0.0,,,,76.0,,21.0,,...,,93.0,122.0,37.5,,,50.0,13.3,76.0,7.45
1,132551,0,0.0,47.0,46.0,82.0,78.0,1.9,81.0,0.3,...,,97.0,102.75,38.0,3.5,,120.0,16.1,48.4,7.4
2,132555,0,0.0,,,,74.0,,19.0,,...,,99.0,98.0,34.8,,,35.0,9.0,66.1,7.39
3,132570,0,0.0,19.0,15.0,20.0,84.0,,83.0,0.1,...,16.5,98.0,,36.6,,,600.0,8.8,102.6,
4,132618,0,0.0,,,,72.0,,,,...,,,56.0,37.475,,,,,69.1,


<h3>Calculate the BMI</h3>


In [132]:
filtered_uniques_test['Height'] = filtered_uniques_test['Height'] / 100 # Converting Height from cm to meters

In [133]:
bmi_data_test = pd.DataFrame(columns=['RecordID', 'Height', 'Weight', 'BMI', 'Classificacao'])
bmi_data_test["RecordID"] = filtered_uniques_test["RecordID"]
bmi_data_test["Height"] = filtered_uniques_test["Height"]
bmi_data_test["Weight"] = filtered_uniques_test["Weight"]
bmi_data_test["BMI"] = filtered_uniques_test["Weight"] / (filtered_uniques_test["Height"] ** 2)
bmi_data_test["Classificacao"] = bmi_data_test["BMI"].apply(classify_BMI)
bmi_data_test.head()

Unnamed: 0,RecordID,Height,Weight,BMI,Classificacao
0,132540,1.753,76.0,24.73146,Peso normal
1,132551,1.626,48.4,18.306456,Baixo peso
2,132555,1.753,66.1,21.509862,Peso normal
3,132570,1.702,102.6,35.418344,Obesidade grau 2
4,132618,1.524,69.1,29.751448,Sobrepeso


<h3>Percentage of BMI classification groups</h3>

In [134]:
percentage_bmi_test = bmi_data_test["Classificacao"].value_counts(normalize=True) * 100
percentage_bmi_test = round(percentage_bmi_test, 1)


In [135]:
subgroups = [("Female"), ("Male"),("Undefined gender") ,("ICUType 1"), ("ICUType 2"), ("ICUType 3"), ("ICUType 4"),
    ("Age 65+"), ("Age 65-"), ("Baixo Peso"), ("Peso normal"), ("Sobrepeso"),
    ("Obesidade grau 1"), ("Obesidade grau 2"), ("Obesidade grau 3")
]

df_subgroups = pd.DataFrame(subgroups, columns=["Subgroups"])

In [136]:
train_subgroups = [
    (distribution_gender_training[0]), (distribution_gender_training[1]),(distribution_gender_training[-1]) ,(distribution_ICUType_training[1]), (distribution_ICUType_training[2]),
    (distribution_ICUType_training[3]), (distribution_ICUType_training[4]), (percentage_group_one_train), (percentage_group_two_train),
    (percentage_bmi_train["Baixo peso"]), (percentage_bmi_train["Peso normal"]), (percentage_bmi_train["Sobrepeso"]), (percentage_bmi_train["Obesidade grau 1"]),
    (percentage_bmi_train["Obesidade grau 2"]), (percentage_bmi_train["Obesidade grau 3"])
]

df_train_subgroups = pd.DataFrame(train_subgroups, columns=["train"])

In [137]:
validation_subgroups = [
    (distribution_gender_validation[0]), (distribution_gender_validation[1]),(distribution_gender_validation[-1]) ,(distribution_ICUType_validation[1]), (distribution_ICUType_validation[2]),
    (distribution_ICUType_validation[3]), (distribution_ICUType_validation[4]), (percentage_group_one_validation), (percentage_group_two_validation),
    (percentage_bmi_validation["Baixo peso"]), (percentage_bmi_validation["Peso normal"]), (percentage_bmi_validation["Sobrepeso"]), (percentage_bmi_validation["Obesidade grau 1"]),
    (percentage_bmi_validation["Obesidade grau 2"]), (percentage_bmi_validation["Obesidade grau 3"])
]

df_validation_subgroups = pd.DataFrame(validation_subgroups, columns=["train"])

In [138]:
test_subgroups = [
    (distribution_gender_test[0]), (distribution_gender_test[1]),(distribution_gender_test[-1]) ,(distribution_ICUType_test[1]), (distribution_ICUType_test[2]),
    (distribution_ICUType_test[3]), (distribution_ICUType_test[4]), (percentage_group_one_test), (percentage_group_two_test),
    (percentage_bmi_test["Baixo peso"]), (percentage_bmi_test["Peso normal"]), (percentage_bmi_test["Sobrepeso"]), (percentage_bmi_test["Obesidade grau 1"]),
    (percentage_bmi_test["Obesidade grau 2"]), (percentage_bmi_test["Obesidade grau 3"])
]

df_test_subgroups = pd.DataFrame(test_subgroups, columns=["train"])

<h2>Table 3</h2>

In [139]:
table_descriptive_statistics = pd.DataFrame(columns=['Subgroups', 'Train', 'Validation', 'Test'])
table_descriptive_statistics["Subgroups"] = df_subgroups
table_descriptive_statistics["Train"] = df_train_subgroups
table_descriptive_statistics["Validation"] = df_validation_subgroups
table_descriptive_statistics["Test"] = df_test_subgroups

display(HTML("<h2 style=' font-size: 24px; font-weight: bold;'>Descriptive statistics stratified by demographics</h2>"))
table_descriptive_statistics

Unnamed: 0,Subgroups,Train,Validation,Test
0,Female,44.0,44.1,43.2
1,Male,55.9,55.8,56.7
2,Undefined gender,0.1,0.1,0.1
3,ICUType 1,14.5,14.6,15.5
4,ICUType 2,21.0,21.1,21.5
5,ICUType 3,35.8,36.1,35.3
6,ICUType 4,28.7,28.2,27.8
7,Age 65+,54.8,53.8,54.9
8,Age 65-,45.2,46.2,45.1
9,Baixo Peso,2.9,1.8,3.9
