In [1]:
import os
import sys
import pandas as pd
from IPython.display import display, HTML
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

<h1>Loading database</h1>

In [2]:
from pypots.benchpots.datasets import preprocess_physionet2012
physionet2012_dataset = preprocess_physionet2012(subset="all", rate=0.1)

2024-10-31 10:13:45 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-10-31 10:13:45 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-10-31 10:13:45 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-10-31 10:13:45 [INFO]: Loaded successfully!


<h2>Training data</h2>

<h3>Loading training dataset</h3>

In [3]:
train_X = physionet2012_dataset['train_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>

In [4]:
distribution_gender_training = train_X['Gender'].value_counts(normalize=True) * 100
distribution_gender_training # 0: female, or 1: male

Gender
 1.0    55.768479
 0.0    44.101160
-1.0     0.130361
Name: proportion, dtype: float64

<h3>Divided into subgroups by ICUType and showing the percentage</h3>

In [5]:
distribution_ICUType_training = train_X['ICUType'].value_counts(normalize=True) * 100
distribution_ICUType_training

ICUType
3.0    35.953591
4.0    28.588189
2.0    20.922957
1.0    14.535263
Name: proportion, dtype: float64

<h3>Filtering one measurement per patient</h3>

In [6]:
uniques_train_per_variable = train_X.groupby('RecordID').first().reset_index()
uniques_train_per_variable.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132539,0,0.0,,,,54.0,,13.0,,...,19.0,,,35.35,,,480.0,11.2,-1.0,
1,132540,0,0.0,,,,76.0,,16.0,,...,,99.0,103.0,34.88,,,316.666667,7.4,76.0,7.45
2,132541,0,0.0,127.0,91.0,235.0,44.0,2.7,8.0,3.0,...,,95.0,148.0,37.8,,,100.0,4.2,56.7,7.51
3,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,18.0,,,36.3,,,600.0,11.5,84.6,
4,132545,0,0.0,,,,88.0,3.3,45.0,,...,24.0,,,37.8,,,140.0,3.8,-1.0,


In [7]:
uniques_train_per_variable.count()

RecordID       7671
level_1        7671
Time           7671
ALP            3262
ALT            3338
AST            3340
Age            7671
Albumin        3125
BUN            7560
Bilirubin      3328
Cholesterol     610
Creatinine     7560
DiasABP        5397
FiO2           5164
GCS            7550
Gender         7671
Glucose        7481
HCO3           7541
HCT            7551
HR             7550
Height         7671
ICUType        7671
K              7514
Lactate        4179
MAP            5385
MechVent       4801
Mg             7491
NIDiasABP      6692
NIMAP          6680
NISysABP       6708
Na             7538
PaCO2          5783
PaO2           5783
Platelets      7550
RespRate       2146
SaO2           3397
SysABP         5397
Temp           7550
TroponinI       360
TroponinT      1638
Urine          7468
WBC            7540
Weight         7671
pH             5823
dtype: int64

<h3>Showing total occurrences of age</h3>

In [8]:
# 60% de 11988 (training set)
uniques_train_per_variable["Age"].count()

7671

<h3>Dividing age into subgroup of people aged 65+</h3>

In [9]:
more_than_or_equal_to_65_training = uniques_train_per_variable[uniques_train_per_variable['Age'] >= 65]
percentage_group_one_train = (more_than_or_equal_to_65_training["Age"].count() / uniques_train_per_variable["Age"].count())*100
percentage_group_one_train

54.53004823360709

<h3>Dividing age into subgroup of people under 65 years of age</h3>

In [10]:
less_than_65 = uniques_train_per_variable[uniques_train_per_variable['Age'] < 65]
percentage_group_two_train = (less_than_65["Age"].count() / uniques_train_per_variable["Age"].count()) * 100
percentage_group_two_train

45.46995176639291

<h3>Filtering only one height and one weight per patient</h3>

In [11]:
filtered_train_X = train_X[(train_X['Height'] != -1) & (train_X['Weight'] != -1) & (train_X['Height'].notna()) & (train_X['Weight'].notna())] 

In [12]:
filtered_uniques_train = filtered_train_X.groupby('RecordID').first().reset_index() 
filtered_uniques_train.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132540,0,0.0,,,,76.0,,21.0,,...,,93.0,122.0,37.5,,,50.0,13.3,76.0,7.45
1,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,18.0,,,36.3,,,600.0,11.5,84.6,
2,132547,0,0.0,,,,64.0,,,,...,,,,,,,,,114.0,
3,132548,0,0.0,,,,68.0,,32.0,,...,14.5,,205.0,36.3,0.7,,120.0,6.2,87.0,
4,132555,0,0.0,,,,74.0,,19.0,,...,,99.0,98.0,34.8,,,35.0,9.0,66.1,7.39


<h3>Calculate the BMI</h3>

In [13]:
def classify_BMI(BMI):
    if BMI < 18.5:
        return "Baixo peso"
    elif BMI >= 18.6 and BMI <= 24.9:
        return "Peso normal"
    elif BMI >= 25 and BMI <= 29.9:
        return "Sobrepeso"
    elif BMI >= 30 and BMI <= 34.9:
        return "Obesidade grau 1"
    elif BMI >= 35 and BMI <= 39.9:
        return "Obesidade grau 2"
    elif BMI >= 40:
        return "Obesidade grau 3"

In [14]:
filtered_uniques_train['Height'] = filtered_uniques_train['Height'] / 100 # Converting Height from cm to meters

In [15]:
bmi_data_train = pd.DataFrame(columns=['RecordID', 'Height', 'Weight', 'BMI', 'Classificacao'])
bmi_data_train["RecordID"] = filtered_uniques_train["RecordID"]
bmi_data_train["Height"] = filtered_uniques_train["Height"]
bmi_data_train["Weight"] = filtered_uniques_train["Weight"]
bmi_data_train["BMI"] = filtered_uniques_train["Weight"] / (filtered_uniques_train["Height"] ** 2)
bmi_data_train["Classificacao"] = bmi_data_train["BMI"].apply(classify_BMI)
bmi_data_train.head()

Unnamed: 0,RecordID,Height,Weight,BMI,Classificacao
0,132540,1.753,76.0,24.73146,Peso normal
1,132543,1.803,84.6,26.024291,Sobrepeso
2,132547,1.803,114.0,35.068194,Obesidade grau 2
3,132548,1.626,87.0,32.906233,Obesidade grau 1
4,132555,1.753,66.1,21.509862,Peso normal


<h3>Percentage of BMI classification groups</h3>

In [16]:
percentage_bmi_train= bmi_data_train["Classificacao"].value_counts(normalize=True) * 100

In [17]:
percentage_bmi_train

Classificacao
Sobrepeso           34.170082
Peso normal         29.994877
Obesidade grau 1    18.442623
Obesidade grau 2     7.300205
Obesidade grau 3     7.018443
Baixo peso           3.073770
Name: proportion, dtype: float64

<h2>Validation data</h2>

<h3>Loading validation dataset </h3>

In [18]:
validation_X = physionet2012_dataset['val_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>

In [19]:
distribution_gender_validation = validation_X['Gender'].value_counts(normalize=True) * 100
distribution_gender_validation # 0: female, or 1: male

Gender
 1.0    56.100104
 0.0    43.847758
-1.0     0.052138
Name: proportion, dtype: float64

<h3>Divided into subgroups by ICUType and showing the percentage</h3>

In [20]:
distribution_ICUType_validation = validation_X['ICUType'].value_counts(normalize=True) * 100
distribution_ICUType_validation

ICUType
3.0    36.757039
4.0    27.737226
2.0    20.750782
1.0    14.754953
Name: proportion, dtype: float64

<h3>Filtering only one age per patient</h3>

In [21]:
uniques_validation_per_variable = validation_X.groupby('RecordID').first().reset_index()
uniques_validation_per_variable.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132551,0,0.0,47.0,46.0,82.0,78.0,1.9,81.0,0.3,...,,97.0,102.75,38.0,3.5,,120.0,16.1,48.4,7.4
1,132556,0,0.0,402.0,36.0,47.0,64.0,2.7,64.0,0.1,...,10.0,,,36.7,,,200.0,21.8,65.0,
2,132592,0,0.0,,,,35.0,,68.0,,...,22.0,,,36.6,,0.15,120.0,15.3,71.8,
3,132601,0,0.0,,,,74.0,,10.0,,...,,99.0,127.0,36.2,,,442.5,16.4,75.9,7.39
4,132617,0,0.0,,,,77.0,,111.0,,...,,97.0,,36.4,,,100.0,7.9,75.0,7.55


<h3>Showing total occurrences of age</h3>

In [22]:
uniques_validation_per_variable["Age"].count()

1918

<h3>Dividing age into subgroup of people aged 65+</h3>

In [23]:
more_than_or_equal_to_65_validation = uniques_validation_per_variable[uniques_validation_per_variable['Age'] >= 65]
percentage_group_one_validation = (more_than_or_equal_to_65_validation["Age"].count() / uniques_validation_per_variable["Age"].count())*100
percentage_group_one_validation

53.910323253388945

<h3>Dividing age into subgroup of people under 65 years of age</h3>

In [24]:
less_than_65 = uniques_validation_per_variable[uniques_validation_per_variable['Age'] < 65]
percentage_group_two_validation = (less_than_65["Age"].count() / uniques_validation_per_variable["Age"].count()) * 100
percentage_group_two_validation

46.089676746611055

<h3>Filtering only one height and one weight per patient</h3>

In [25]:
filtered_validation_X = validation_X[(validation_X['Height'] != -1) & (validation_X['Weight'] != -1) & (validation_X['Height'].notna()) & (validation_X['Weight'].notna())] 

In [26]:
filtered_uniques_validation = filtered_validation_X.groupby('RecordID').first().reset_index() 
filtered_uniques_validation.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132551,0,0.0,47.0,46.0,82.0,78.0,1.9,81.0,0.3,...,,97.0,102.75,38.0,3.5,,120.0,16.1,48.4,7.4
1,132601,0,0.0,,,,74.0,,,,...,,,,,,,,,75.9,7.39
2,132617,0,0.0,,,,77.0,,110.0,,...,,97.0,,36.4,,,100.0,7.9,75.0,7.55
3,132653,0,0.0,,,,78.0,2.7,31.0,,...,,98.0,127.0,36.15,,,145.0,14.8,91.3,7.44
4,132659,0,0.0,70.0,87.0,132.0,78.0,2.7,17.0,0.8,...,,98.0,116.5,36.15,,,40.0,12.5,110.0,7.24


In [27]:
filtered_uniques_validation['Height'] = filtered_uniques_validation['Height'] / 100 # Converting Height from cm to meters

In [28]:
bmi_data_validation = pd.DataFrame(columns=['RecordID', 'Height', 'Weight', 'BMI', 'Classificacao'])
bmi_data_validation["RecordID"] = filtered_uniques_validation["RecordID"]
bmi_data_validation["Height"] = filtered_uniques_validation["Height"]
bmi_data_validation["Weight"] = filtered_uniques_validation["Weight"]
bmi_data_validation["BMI"] = filtered_uniques_validation["Weight"] / (filtered_uniques_validation["Height"] ** 2)
bmi_data_validation["Classificacao"] = bmi_data_validation["BMI"].apply(classify_BMI)
bmi_data_validation.head()

Unnamed: 0,RecordID,Height,Weight,BMI,Classificacao
0,132551,1.626,48.4,18.306456,Baixo peso
1,132601,1.778,75.9,24.009232,Peso normal
2,132617,1.702,75.0,25.890602,Sobrepeso
3,132653,1.778,91.3,28.88067,Sobrepeso
4,132659,1.651,110.0,40.35511,Obesidade grau 3


<h3>Percentage of BMI classification groups</h3>

In [29]:
percentage_bmi_validation = bmi_data_validation["Classificacao"].value_counts(normalize=True) * 100

<h2>Test data</h2>

In [30]:
test_X = physionet2012_dataset['test_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>


In [31]:
distribution_gender_test = test_X['Gender'].value_counts(normalize=True) * 100
distribution_gender_test # 0: female, or 1: male

Gender
 1.0    56.815340
 0.0    43.142976
-1.0     0.041684
Name: proportion, dtype: float64

<h3>Divided into subgroups by ICUType and showing the percentage</h3>


In [32]:
distribution_ICUType_test= test_X['ICUType'].value_counts(normalize=True) * 100
distribution_ICUType_test

ICUType
3.0    34.347645
4.0    28.470196
2.0    21.884118
1.0    15.298041
Name: proportion, dtype: float64

<h3>Filtering only one age per patient</h3>


In [33]:
uniques_test_per_variable = test_X.groupby('RecordID').first().reset_index()
uniques_test_per_variable.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132599,0,0.0,124.0,14.0,20.0,53.0,2.0,33.0,1.3,...,,98.0,135.0,37.3,,0.02,350.0,14.2,73.5,7.48
1,132615,0,0.0,81.0,32.0,28.0,46.0,2.6,22.0,0.3,...,,,0.0,36.4,,,80.0,22.1,88.6,7.34
2,132639,0,0.0,,,,73.0,,28.0,,...,,98.0,105.666667,35.2,,,90.0,13.0,96.3,7.36
3,132644,0,0.0,,,,71.0,,40.0,,...,,96.0,142.0,37.2,,,150.0,10.2,64.5,7.42
4,132658,0,0.0,71.0,9.0,42.0,81.0,,11.0,1.3,...,,98.0,127.0,36.7,,,360.0,48.3,105.4,7.42


<h3>Showing total occurrences of age</h3>

In [34]:
uniques_test_per_variable["Age"].count()

2399

<h3>Dividing age into subgroup of people aged 65+</h3>


In [35]:
more_than_or_equal_to_65_test = uniques_test_per_variable[uniques_test_per_variable['Age'] >= 65]
percentage_group_one_test = (more_than_or_equal_to_65_test["Age"].count() / uniques_test_per_variable["Age"].count())*100
percentage_group_one_test

55.77323884952064

<h3>Dividing age into subgroup of people under 65 years of age</h3>


In [36]:
less_than_65_test = uniques_test_per_variable[uniques_test_per_variable['Age'] < 65]
percentage_group_two_test = (less_than_65_test["Age"].count() / uniques_test_per_variable["Age"].count()) * 100
percentage_group_two_test

44.22676115047936

<h3>Filtering only one height and one weight per patient</h3>


In [37]:
filtered_test_X = test_X[(test_X['Height'] != -1) & (test_X['Weight'] != -1) & (test_X['Height'].notna()) & (test_X['Weight'].notna())] 

In [38]:
filtered_uniques_test = filtered_test_X.groupby('RecordID').first().reset_index() 
filtered_uniques_test.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132599,0,0.0,,,,53.0,,,,...,,,,37.3,,,350.0,,73.5,
1,132615,0,0.0,81.0,32.0,28.0,46.0,2.6,22.0,0.3,...,,,0.0,36.4,,,80.0,22.1,88.6,7.34
2,132639,0,0.0,,,,73.0,,18.0,,...,,95.0,110.0,36.9,,,30.0,10.2,96.3,7.36
3,132644,0,0.0,,,,71.0,,,,...,,,,,,,,,64.5,
4,132658,0,0.0,71.0,9.0,42.0,81.0,,18.0,1.3,...,,96.0,97.0,38.4,,,90.0,61.3,105.4,7.42


<h3>Calculate the BMI</h3>


In [39]:
filtered_uniques_test['Height'] = filtered_uniques_test['Height'] / 100 # Converting Height from cm to meters

In [40]:
bmi_data_test = pd.DataFrame(columns=['RecordID', 'Height', 'Weight', 'BMI', 'Classificacao'])
bmi_data_test["RecordID"] = filtered_uniques_test["RecordID"]
bmi_data_test["Height"] = filtered_uniques_test["Height"]
bmi_data_test["Weight"] = filtered_uniques_test["Weight"]
bmi_data_test["BMI"] = filtered_uniques_test["Weight"] / (filtered_uniques_test["Height"] ** 2)
bmi_data_test["Classificacao"] = bmi_data_test["BMI"].apply(classify_BMI)
bmi_data_test.head()

Unnamed: 0,RecordID,Height,Weight,BMI,Classificacao
0,132599,1.778,73.5,23.250047,Peso normal
1,132615,1.524,88.6,38.147299,Obesidade grau 2
2,132639,1.803,96.3,29.623395,Sobrepeso
3,132644,1.499,64.5,28.704927,Sobrepeso
4,132658,1.854,105.4,30.66346,Obesidade grau 1


<h3>Percentage of BMI classification groups</h3>

In [41]:
percentage_bmi_test = bmi_data_test["Classificacao"].value_counts(normalize=True) * 100


In [42]:
subgroups = [("Female"), ("Male"), ("ICUType 1"), ("ICUType 2"), ("ICUType 3"), ("ICUType 4"),
    ("Age 65+"), ("Age 65-"), ("Baixo Peso"), ("Peso normal"), ("Sobrepeso"),
    ("Obesidade grau 1"), ("Obesidade grau 2"), ("Obesidade grau 3")
]

df_subgroups = pd.DataFrame(subgroups, columns=["Subgroups"])

In [43]:
train_subgroups = [
    (distribution_gender_training[0]), (distribution_gender_training[1]), (distribution_ICUType_training[1]), (distribution_ICUType_training[2]),
    (distribution_ICUType_training[3]), (distribution_ICUType_training[4]), (percentage_group_one_train), (percentage_group_two_train),
    (percentage_bmi_train["Baixo peso"]), (percentage_bmi_train["Peso normal"]), (percentage_bmi_train["Sobrepeso"]), (percentage_bmi_train["Obesidade grau 1"]),
    (percentage_bmi_train["Obesidade grau 2"]), (percentage_bmi_train["Obesidade grau 3"])
]

df_train_subgroups = pd.DataFrame(train_subgroups, columns=["train"])

In [44]:
validation_subgroups = [
    (distribution_gender_validation[0]), (distribution_gender_validation[1]), (distribution_ICUType_validation[1]), (distribution_ICUType_validation[2]),
    (distribution_ICUType_validation[3]), (distribution_ICUType_validation[4]), (percentage_group_one_validation), (percentage_group_two_validation),
    (percentage_bmi_validation["Baixo peso"]), (percentage_bmi_validation["Peso normal"]), (percentage_bmi_validation["Sobrepeso"]), (percentage_bmi_validation["Obesidade grau 1"]),
    (percentage_bmi_validation["Obesidade grau 2"]), (percentage_bmi_validation["Obesidade grau 3"])
]

df_validation_subgroups = pd.DataFrame(validation_subgroups, columns=["train"])

In [45]:
test_subgroups = [
    (distribution_gender_test[0]), (distribution_gender_test[1]), (distribution_ICUType_test[1]), (distribution_ICUType_test[2]),
    (distribution_ICUType_test[3]), (distribution_ICUType_test[4]), (percentage_group_one_test), (percentage_group_two_test),
    (percentage_bmi_test["Baixo peso"]), (percentage_bmi_test["Peso normal"]), (percentage_bmi_test["Sobrepeso"]), (percentage_bmi_test["Obesidade grau 1"]),
    (percentage_bmi_test["Obesidade grau 2"]), (percentage_bmi_test["Obesidade grau 3"])
]

df_test_subgroups = pd.DataFrame(test_subgroups, columns=["train"])

In [48]:
table_descriptive_statistics = pd.DataFrame(columns=['Subgroups', 'Train', 'Validation', 'Test'])
table_descriptive_statistics["Subgroups"] = df_subgroups
table_descriptive_statistics["Train"] = df_train_subgroups
table_descriptive_statistics["Validation"] = df_validation_subgroups
table_descriptive_statistics["Test"] = df_test_subgroups

display(HTML("<h2 style=' font-size: 24px; font-weight: bold;'>Descriptive statistics stratified by demographics</h2>"))
table_descriptive_statistics

Unnamed: 0,Subgroups,Train,Validation,Test
0,Female,44.10116,43.847758,43.142976
1,Male,55.768479,56.100104,56.81534
2,ICUType 1,14.535263,14.754953,15.298041
3,ICUType 2,20.922957,20.750782,21.884118
4,ICUType 3,35.953591,36.757039,34.347645
5,ICUType 4,28.588189,27.737226,28.470196
6,Age 65+,54.530048,53.910323,55.773239
7,Age 65-,45.469952,46.089677,44.226761
8,Baixo Peso,3.07377,3.402062,2.281668
9,Peso normal,29.994877,28.247423,29.661684
