In [1]:
import os
import sys
import pandas as pd
from IPython.display import display, HTML
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

<h1>Loading database</h1>

In [2]:
from pypots.benchpots.datasets import preprocess_physionet2012
physionet2012_dataset = preprocess_physionet2012(subset="all", rate=0.1)

2024-10-31 13:19:24 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-10-31 13:19:24 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-10-31 13:19:24 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-10-31 13:19:24 [INFO]: Loaded successfully!


<h2>Training data</h2>

<h3>Loading training dataset</h3>

In [3]:
train_X = physionet2012_dataset['train_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>

In [60]:
distribution_gender_training = train_X['Gender'].value_counts(normalize=True) * 100
distribution_gender_training =  round(distribution_gender_training, 1) # 0: female, or 1: male

<h3>Divided into subgroups by ICUType and showing the percentage</h3>

In [61]:
distribution_ICUType_training = train_X['ICUType'].value_counts(normalize=True) * 100
distribution_ICUType_training =  round(distribution_ICUType_training, 1)

<h3>Filtering one measurement per patient</h3>

In [6]:
uniques_train_per_variable = train_X.groupby('RecordID').first().reset_index()
uniques_train_per_variable.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132540,0,0.0,,,,76.0,,16.0,,...,,99.0,103.0,34.88,,,316.666667,7.4,76.0,7.45
1,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,18.0,,,36.3,,,600.0,11.5,84.6,
2,132545,0,0.0,,,,88.0,3.3,45.0,,...,24.0,,,37.8,,,140.0,3.8,-1.0,
3,132547,0,0.0,101.0,45.0,47.0,64.0,,15.0,0.4,...,,96.0,141.0,35.8,1.3,,1200.0,24.0,114.0,7.29
4,132551,0,0.0,47.0,46.0,82.0,78.0,1.9,81.0,0.3,...,,97.0,102.75,38.0,3.5,,120.0,16.1,48.4,7.4


In [7]:
uniques_train_per_variable.count()

RecordID       7671
level_1        7671
Time           7671
ALP            3217
ALT            3295
AST            3295
Age            7671
Albumin        3069
BUN            7557
Bilirubin      3298
Cholesterol     627
Creatinine     7557
DiasABP        5416
FiO2           5235
GCS            7548
Gender         7671
Glucose        7491
HCO3           7540
HCT            7553
HR             7548
Height         7671
ICUType        7671
K              7518
Lactate        4240
MAP            5402
MechVent       4868
Mg             7482
NIDiasABP      6681
NIMAP          6665
NISysABP       6699
Na             7537
PaCO2          5833
PaO2           5834
Platelets      7548
RespRate       2103
SaO2           3472
SysABP         5416
Temp           7548
TroponinI       372
TroponinT      1730
Urine          7467
WBC            7538
Weight         7671
pH             5869
dtype: int64

<h3>Showing total occurrences of age</h3>

In [8]:
# 60% de 11988 (training set)
uniques_train_per_variable["Age"].count()

7671

<h3>Dividing age into subgroup of people aged 65+</h3>

In [62]:
more_than_or_equal_to_65_training = uniques_train_per_variable[uniques_train_per_variable['Age'] >= 65]
percentage_group_one_train = (more_than_or_equal_to_65_training["Age"].count() / uniques_train_per_variable["Age"].count())*100
percentage_group_one_train =  round(percentage_group_one_train, 1)

<h3>Dividing age into subgroup of people under 65 years of age</h3>

In [63]:
less_than_65 = uniques_train_per_variable[uniques_train_per_variable['Age'] < 65]
percentage_group_two_train = (less_than_65["Age"].count() / uniques_train_per_variable["Age"].count()) * 100
percentage_group_two_train =  round(percentage_group_two_train, 1)

<h3>Filtering only one height and one weight per patient</h3>

In [15]:
filtered_train_X = train_X[(train_X['Height'] != -1) & (train_X['Weight'] != -1) & (train_X['Height'].notna()) & (train_X['Weight'].notna())] 

In [16]:
filtered_uniques_train = filtered_train_X.groupby('RecordID').first().reset_index() 
filtered_uniques_train.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132540,0,0.0,,,,76.0,,21.0,,...,,93.0,122.0,37.5,,,50.0,13.3,76.0,7.45
1,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,18.0,,,36.3,,,600.0,11.5,84.6,
2,132547,0,0.0,,,,64.0,,,,...,,,,,,,,,114.0,
3,132551,0,0.0,47.0,46.0,82.0,78.0,1.9,81.0,0.3,...,,97.0,102.75,38.0,3.5,,120.0,16.1,48.4,7.4
4,132555,0,0.0,,,,74.0,,19.0,,...,,99.0,98.0,34.8,,,35.0,9.0,66.1,7.39


<h3>Calculate the BMI</h3>

In [17]:
def classify_BMI(BMI):
    if BMI < 18.5:
        return "Baixo peso"
    elif BMI >= 18.6 and BMI <= 24.9:
        return "Peso normal"
    elif BMI >= 25 and BMI <= 29.9:
        return "Sobrepeso"
    elif BMI >= 30 and BMI <= 34.9:
        return "Obesidade grau 1"
    elif BMI >= 35 and BMI <= 39.9:
        return "Obesidade grau 2"
    elif BMI >= 40:
        return "Obesidade grau 3"

In [18]:
filtered_uniques_train['Height'] = filtered_uniques_train['Height'] / 100 # Converting Height from cm to meters

In [19]:
bmi_data_train = pd.DataFrame(columns=['RecordID', 'Height', 'Weight', 'BMI', 'Classificacao'])
bmi_data_train["RecordID"] = filtered_uniques_train["RecordID"]
bmi_data_train["Height"] = filtered_uniques_train["Height"]
bmi_data_train["Weight"] = filtered_uniques_train["Weight"]
bmi_data_train["BMI"] = filtered_uniques_train["Weight"] / (filtered_uniques_train["Height"] ** 2)
bmi_data_train["Classificacao"] = bmi_data_train["BMI"].apply(classify_BMI)
bmi_data_train.head()

Unnamed: 0,RecordID,Height,Weight,BMI,Classificacao
0,132540,1.753,76.0,24.73146,Peso normal
1,132543,1.803,84.6,26.024291,Sobrepeso
2,132547,1.803,114.0,35.068194,Obesidade grau 2
3,132551,1.626,48.4,18.306456,Baixo peso
4,132555,1.753,66.1,21.509862,Peso normal


<h3>Percentage of BMI classification groups</h3>

In [20]:
percentage_bmi_train= bmi_data_train["Classificacao"].value_counts(normalize=True) * 100

In [64]:
percentage_bmi_train =  round(percentage_bmi_train, 1)

<h2>Validation data</h2>

<h3>Loading validation dataset </h3>

In [22]:
validation_X = physionet2012_dataset['val_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>

In [65]:
distribution_gender_validation = validation_X['Gender'].value_counts(normalize=True) * 100
distribution_gender_validation =  round(distribution_gender_validation, 1) # 0: female, or 1: male

<h3>Divided into subgroups by ICUType and showing the percentage</h3>

In [66]:
distribution_ICUType_validation = validation_X['ICUType'].value_counts(normalize=True) * 100
distribution_ICUType_validation = round(distribution_ICUType_validation, 1)

<h3>Filtering only one age per patient</h3>

In [28]:
uniques_validation_per_variable = validation_X.groupby('RecordID').first().reset_index()
uniques_validation_per_variable.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132539,0,0.0,,,,54.0,,13.0,,...,19.0,,,35.35,,,480.0,11.2,-1.0,
1,132556,0,0.0,402.0,36.0,47.0,64.0,2.7,64.0,0.1,...,10.0,,,36.7,,,200.0,21.8,65.0,
2,132568,0,0.0,,,,66.0,,18.0,,...,,,,36.1,,,220.0,14.8,84.5,
3,132575,0,0.0,,,,78.0,,21.0,,...,,99.0,111.6,34.72,,,210.0,12.5,63.0,7.34
4,132588,0,0.0,202.0,58.0,102.0,48.0,2.0,7.0,6.8,...,17.0,,,38.4,,,,7.0,42.3,


<h3>Showing total occurrences of age</h3>

In [29]:
uniques_validation_per_variable["Age"].count()

1918

<h3>Dividing age into subgroup of people aged 65+</h3>

In [67]:
more_than_or_equal_to_65_validation = uniques_validation_per_variable[uniques_validation_per_variable['Age'] >= 65]
percentage_group_one_validation = (more_than_or_equal_to_65_validation["Age"].count() / uniques_validation_per_variable["Age"].count())*100
percentage_group_one_validation = round(percentage_group_one_validation, 1)

<h3>Dividing age into subgroup of people under 65 years of age</h3>

In [68]:
less_than_65 = uniques_validation_per_variable[uniques_validation_per_variable['Age'] < 65]
percentage_group_two_validation = (less_than_65["Age"].count() / uniques_validation_per_variable["Age"].count()) * 100
percentage_group_two_validation =  round(percentage_group_two_validation, 1)

<h3>Filtering only one height and one weight per patient</h3>

In [32]:
filtered_validation_X = validation_X[(validation_X['Height'] != -1) & (validation_X['Weight'] != -1) & (validation_X['Height'].notna()) & (validation_X['Weight'].notna())] 

In [33]:
filtered_uniques_validation = filtered_validation_X.groupby('RecordID').first().reset_index() 
filtered_uniques_validation.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132568,0,0.0,,,,66.0,,18.0,,...,,,,36.1,,,220.0,14.8,84.5,
1,132575,0,0.0,,,,78.0,,18.0,,...,,96.0,122.0,37.4,,,38.0,12.5,63.0,7.34
2,132588,0,0.0,,,,48.0,,,,...,,,,,,,,,42.3,
3,132590,0,0.0,,,,58.0,,,,...,,,119.0,36.8,,,70.0,,98.0,
4,132614,0,0.0,,,,77.0,,,,...,,,,,,,,,59.0,


In [34]:
filtered_uniques_validation['Height'] = filtered_uniques_validation['Height'] / 100 # Converting Height from cm to meters

In [35]:
bmi_data_validation = pd.DataFrame(columns=['RecordID', 'Height', 'Weight', 'BMI', 'Classificacao'])
bmi_data_validation["RecordID"] = filtered_uniques_validation["RecordID"]
bmi_data_validation["Height"] = filtered_uniques_validation["Height"]
bmi_data_validation["Weight"] = filtered_uniques_validation["Weight"]
bmi_data_validation["BMI"] = filtered_uniques_validation["Weight"] / (filtered_uniques_validation["Height"] ** 2)
bmi_data_validation["Classificacao"] = bmi_data_validation["BMI"].apply(classify_BMI)
bmi_data_validation.head()

Unnamed: 0,RecordID,Height,Weight,BMI,Classificacao
0,132568,1.575,84.5,34.063996,Obesidade grau 1
1,132575,1.676,63.0,22.428102,Peso normal
2,132588,1.549,42.3,17.6294,Baixo peso
3,132590,1.88,98.0,27.727478,Sobrepeso
4,132614,1.626,59.0,22.315721,Peso normal


<h3>Percentage of BMI classification groups</h3>

In [36]:
percentage_bmi_validation = bmi_data_validation["Classificacao"].value_counts(normalize=True) * 100

In [69]:
percentage_bmi_validation = round(percentage_bmi_validation, 1)

<h2>Test data</h2>

In [39]:
test_X = physionet2012_dataset['test_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>


In [70]:
distribution_gender_test = test_X['Gender'].value_counts(normalize=True) * 100
distribution_gender_test = round(distribution_gender_test, 1) # 0: female, or 1: male

<h3>Divided into subgroups by ICUType and showing the percentage</h3>


In [71]:
distribution_ICUType_test= test_X['ICUType'].value_counts(normalize=True) * 100
distribution_ICUType_test = round(distribution_ICUType_test, 1)

<h3>Filtering only one age per patient</h3>


In [44]:
uniques_test_per_variable = test_X.groupby('RecordID').first().reset_index()
uniques_test_per_variable.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132541,0,0.0,127.0,91.0,235.0,44.0,2.7,8.0,3.0,...,,95.0,148.0,37.8,,,100.0,4.2,56.7,7.51
1,132548,0,0.0,,,,68.0,,32.0,,...,14.5,,205.0,36.3,0.7,,120.0,6.2,87.0,
2,132567,0,0.0,,,,71.0,,9.0,,...,,98.0,111.5,35.6,,,1003.333333,13.7,56.0,7.44
3,132570,0,0.0,19.0,15.0,20.0,84.0,,83.0,0.1,...,16.5,98.0,,36.6,,,600.0,8.8,102.6,
4,132585,0,0.0,,,,40.0,,10.0,,...,,99.0,101.5,35.7,,,416.666667,13.0,84.7,7.11


<h3>Showing total occurrences of age</h3>

In [45]:
uniques_test_per_variable["Age"].count()

2399

<h3>Dividing age into subgroup of people aged 65+</h3>


In [72]:
more_than_or_equal_to_65_test = uniques_test_per_variable[uniques_test_per_variable['Age'] >= 65]
percentage_group_one_test = (more_than_or_equal_to_65_test["Age"].count() / uniques_test_per_variable["Age"].count())*100
percentage_group_one_test = round(percentage_group_one_test, 1)

<h3>Dividing age into subgroup of people under 65 years of age</h3>


In [73]:
less_than_65_test = uniques_test_per_variable[uniques_test_per_variable['Age'] < 65]
percentage_group_two_test = (less_than_65_test["Age"].count() / uniques_test_per_variable["Age"].count()) * 100
percentage_group_two_test = round(percentage_group_two_test, 1)

<h3>Filtering only one height and one weight per patient</h3>


In [48]:
filtered_test_X = test_X[(test_X['Height'] != -1) & (test_X['Weight'] != -1) & (test_X['Height'].notna()) & (test_X['Weight'].notna())] 

In [49]:
filtered_uniques_test = filtered_test_X.groupby('RecordID').first().reset_index() 
filtered_uniques_test.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132548,0,0.0,,,,68.0,,32.0,,...,14.5,,205.0,36.3,0.7,,120.0,6.2,87.0,
1,132567,0,0.0,,,,71.0,,9.0,,...,,98.0,111.5,35.6,,,15.0,9.0,56.0,7.44
2,132570,0,0.0,19.0,15.0,20.0,84.0,,83.0,0.1,...,16.5,98.0,,36.6,,,600.0,8.8,102.6,
3,132585,0,0.0,,,,40.0,,,,...,,,90.5,,,,320.0,,84.7,
4,132602,0,0.0,,,,80.0,,,,...,,,,37.3,,,150.0,,70.0,


<h3>Calculate the BMI</h3>


In [50]:
filtered_uniques_test['Height'] = filtered_uniques_test['Height'] / 100 # Converting Height from cm to meters

In [51]:
bmi_data_test = pd.DataFrame(columns=['RecordID', 'Height', 'Weight', 'BMI', 'Classificacao'])
bmi_data_test["RecordID"] = filtered_uniques_test["RecordID"]
bmi_data_test["Height"] = filtered_uniques_test["Height"]
bmi_data_test["Weight"] = filtered_uniques_test["Weight"]
bmi_data_test["BMI"] = filtered_uniques_test["Weight"] / (filtered_uniques_test["Height"] ** 2)
bmi_data_test["Classificacao"] = bmi_data_test["BMI"].apply(classify_BMI)
bmi_data_test.head()

Unnamed: 0,RecordID,Height,Weight,BMI,Classificacao
0,132548,1.626,87.0,32.906233,Obesidade grau 1
1,132567,1.575,56.0,22.574956,Peso normal
2,132570,1.702,102.6,35.418344,Obesidade grau 2
3,132585,1.651,84.7,31.073435,Obesidade grau 1
4,132602,1.803,70.0,21.533101,Peso normal


<h3>Percentage of BMI classification groups</h3>

In [74]:
percentage_bmi_test = bmi_data_test["Classificacao"].value_counts(normalize=True) * 100
percentage_bmi_test = round(percentage_bmi_test, 1)


In [75]:
subgroups = [("Female"), ("Male"), ("ICUType 1"), ("ICUType 2"), ("ICUType 3"), ("ICUType 4"),
    ("Age 65+"), ("Age 65-"), ("Baixo Peso"), ("Peso normal"), ("Sobrepeso"),
    ("Obesidade grau 1"), ("Obesidade grau 2"), ("Obesidade grau 3")
]

df_subgroups = pd.DataFrame(subgroups, columns=["Subgroups"])

In [76]:
train_subgroups = [
    (distribution_gender_training[0]), (distribution_gender_training[1]), (distribution_ICUType_training[1]), (distribution_ICUType_training[2]),
    (distribution_ICUType_training[3]), (distribution_ICUType_training[4]), (percentage_group_one_train), (percentage_group_two_train),
    (percentage_bmi_train["Baixo peso"]), (percentage_bmi_train["Peso normal"]), (percentage_bmi_train["Sobrepeso"]), (percentage_bmi_train["Obesidade grau 1"]),
    (percentage_bmi_train["Obesidade grau 2"]), (percentage_bmi_train["Obesidade grau 3"])
]

df_train_subgroups = pd.DataFrame(train_subgroups, columns=["train"])

In [77]:
validation_subgroups = [
    (distribution_gender_validation[0]), (distribution_gender_validation[1]), (distribution_ICUType_validation[1]), (distribution_ICUType_validation[2]),
    (distribution_ICUType_validation[3]), (distribution_ICUType_validation[4]), (percentage_group_one_validation), (percentage_group_two_validation),
    (percentage_bmi_validation["Baixo peso"]), (percentage_bmi_validation["Peso normal"]), (percentage_bmi_validation["Sobrepeso"]), (percentage_bmi_validation["Obesidade grau 1"]),
    (percentage_bmi_validation["Obesidade grau 2"]), (percentage_bmi_validation["Obesidade grau 3"])
]

df_validation_subgroups = pd.DataFrame(validation_subgroups, columns=["train"])

In [78]:
test_subgroups = [
    (distribution_gender_test[0]), (distribution_gender_test[1]), (distribution_ICUType_test[1]), (distribution_ICUType_test[2]),
    (distribution_ICUType_test[3]), (distribution_ICUType_test[4]), (percentage_group_one_test), (percentage_group_two_test),
    (percentage_bmi_test["Baixo peso"]), (percentage_bmi_test["Peso normal"]), (percentage_bmi_test["Sobrepeso"]), (percentage_bmi_test["Obesidade grau 1"]),
    (percentage_bmi_test["Obesidade grau 2"]), (percentage_bmi_test["Obesidade grau 3"])
]

df_test_subgroups = pd.DataFrame(test_subgroups, columns=["train"])

<h2>Table 3</h2>

In [79]:
table_descriptive_statistics = pd.DataFrame(columns=['Subgroups', 'Train', 'Validation', 'Test'])
table_descriptive_statistics["Subgroups"] = df_subgroups
table_descriptive_statistics["Train"] = df_train_subgroups
table_descriptive_statistics["Validation"] = df_validation_subgroups
table_descriptive_statistics["Test"] = df_test_subgroups

display(HTML("<h2 style=' font-size: 24px; font-weight: bold;'>Descriptive statistics stratified by demographics</h2>"))
table_descriptive_statistics

Unnamed: 0,Subgroups,Train,Validation,Test
0,Female,43.6,45.2,43.6
1,Male,56.3,54.8,56.2
2,ICUType 1,14.9,14.1,14.6
3,ICUType 2,20.7,22.9,20.9
4,ICUType 3,35.9,33.4,37.3
5,ICUType 4,28.5,29.7,27.3
6,Age 65+,55.1,54.8,53.1
7,Age 65-,44.9,45.2,46.9
8,Baixo Peso,3.1,1.8,3.4
9,Peso normal,29.8,30.1,28.7
