In [1]:
import os
import sys
import pandas as pd
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

<h1>Loading database</h1>

In [2]:
from pypots.benchpots.datasets import preprocess_physionet2012
physionet2012_dataset = preprocess_physionet2012(subset="all", rate=0.1)

2024-10-30 23:39:42 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-10-30 23:39:42 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-10-30 23:39:42 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-10-30 23:39:42 [INFO]: Loaded successfully!


<h2>Training data</h2>

<h3>Loading training dataset</h3>

In [3]:
train_X = physionet2012_dataset['train_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>

In [4]:
distribution_gender_training = train_X['Gender'].value_counts(normalize=True) * 100
distribution_gender_training # 0: female, or 1: male

Gender
 1.0    56.276887
 0.0    43.631860
-1.0     0.091253
Name: proportion, dtype: float64

<h3>Divided into subgroups by ICUType and showing the percentage</h3>

In [5]:
distribution_ICUType_training = train_X['ICUType'].value_counts(normalize=True) * 100
distribution_ICUType_training

ICUType
3.0    36.070916
4.0    28.105853
2.0    21.092426
1.0    14.730804
Name: proportion, dtype: float64

<h3>Filtering one measurement per patient</h3>

In [6]:
uniques_train_per_variable = train_X.groupby('RecordID').first().reset_index()
uniques_train_per_variable.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132541,0,0.0,127.0,91.0,235.0,44.0,2.7,8.0,3.0,...,,95.0,148.0,37.8,,,100.0,4.2,56.7,7.51
1,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,18.0,,,36.3,,,600.0,11.5,84.6,
2,132545,0,0.0,,,,88.0,3.3,45.0,,...,24.0,,,37.8,,,140.0,3.8,-1.0,
3,132548,0,0.0,,,,68.0,,32.0,,...,14.5,,205.0,36.3,0.7,,120.0,6.2,87.0,
4,132551,0,0.0,47.0,46.0,82.0,78.0,1.9,81.0,0.3,...,,97.0,102.75,38.0,3.5,,120.0,16.1,48.4,7.4


In [7]:
uniques_train_per_variable.count()

RecordID       7671
level_1        7671
Time           7671
ALP            3238
ALT            3318
AST            3318
Age            7671
Albumin        3119
BUN            7553
Bilirubin      3324
Cholesterol     606
Creatinine     7553
DiasABP        5373
FiO2           5195
GCS            7568
Gender         7671
Glucose        7475
HCO3           7536
HCT            7547
HR             7568
Height         7671
ICUType        7671
K              7517
Lactate        4187
MAP            5359
MechVent       4846
Mg             7482
NIDiasABP      6704
NIMAP          6689
NISysABP       6721
Na             7535
PaCO2          5751
PaO2           5752
Platelets      7540
RespRate       2147
SaO2           3438
SysABP         5373
Temp           7568
TroponinI       349
TroponinT      1687
Urine          7485
WBC            7530
Weight         7671
pH             5793
dtype: int64

<h3>Showing total occurrences of age</h3>

In [8]:
# 60% de 11988 (training set)
uniques_train_per_variable["Age"].count()

7671

<h3>Dividing age into subgroup of people aged 65+</h3>

In [9]:
more_than_or_equal_to_65_training = uniques_train_per_variable[uniques_train_per_variable['Age'] >= 65]
percentage_group_one_train = (more_than_or_equal_to_65_training["Age"].count() / uniques_train_per_variable["Age"].count())*100
percentage_group_one_train

54.64737322383001

<h3>Dividing age into subgroup of people under 65 years of age</h3>

In [None]:
less_than_65 = uniques_train_per_variable[uniques_train_per_variable['Age'] < 65]
percentage_group_two_train = (less_than_65["Age"].count() / uniques_train_per_variable["Age"].count()) * 100
percentage_group_two_train

45.35262677616999

<h3>Filtering only one height and one weight per patient</h3>

In [11]:
filtered_train_X = train_X[(train_X['Height'] != -1) & (train_X['Weight'] != -1) & (train_X['Height'].notna()) & (train_X['Weight'].notna())] 

In [12]:
filtered_uniques_train = filtered_train_X.groupby('RecordID').first().reset_index() 
filtered_uniques_train.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,18.0,,,36.3,,,600.0,11.5,84.6,
1,132548,0,0.0,,,,68.0,,32.0,,...,14.5,,205.0,36.3,0.7,,120.0,6.2,87.0,
2,132551,0,0.0,47.0,46.0,82.0,78.0,1.9,81.0,0.3,...,,97.0,102.75,38.0,3.5,,120.0,16.1,48.4,7.4
3,132555,0,0.0,,,,74.0,,19.0,,...,,99.0,98.0,34.8,,,35.0,9.0,66.1,7.39
4,132567,0,0.0,,,,71.0,,9.0,,...,,98.0,111.5,35.6,,,15.0,9.0,56.0,7.44


<h3>Calculate the BMI</h3>

In [13]:
def classify_BMI(BMI):
    if BMI < 18.5:
        return "Baixo peso"
    elif BMI >= 18.6 and BMI <= 24.9:
        return "Peso normal"
    elif BMI >= 25 and BMI <= 29.9:
        return "Sobrepeso"
    elif BMI >= 30 and BMI <= 34.9:
        return "Obesidade grau 1"
    elif BMI >= 35 and BMI <= 39.9:
        return "Obesidade grau 2"
    elif BMI >= 40:
        return "Obesidade grau 3"

In [14]:
filtered_uniques_train['Height'] = filtered_uniques_train['Height'] / 100 # Converting Height from cm to meters

In [None]:
bmi_data_train = pd.DataFrame(columns=['RecordID', 'Height', 'Weight', 'BMI', 'Classificacao'])
bmi_data_train["RecordID"] = filtered_uniques_train["RecordID"]
bmi_data_train["Height"] = filtered_uniques_train["Height"]
bmi_data_train["Weight"] = filtered_uniques_train["Weight"]
bmi_data_train["BMI"] = filtered_uniques_train["Weight"] / (filtered_uniques_train["Height"] ** 2)
bmi_data_train["Classificacao"] = bmi_data_train["BMI"].apply(classify_BMI)
bmi_data_train.head()

Unnamed: 0,RecordID,Height,Weight,BMI,Classificacao
0,132543,1.803,84.6,26.024291,Sobrepeso
1,132548,1.626,87.0,32.906233,Obesidade grau 1
2,132551,1.626,48.4,18.306456,Baixo peso
3,132555,1.753,66.1,21.509862,Peso normal
4,132567,1.575,56.0,22.574956,Peso normal


<h3>Percentage of BMI classification groups</h3>

In [17]:
percentage_bmi_train= bmi_data_train["Classificacao"].value_counts(normalize=True) * 100

In [18]:
percentage_bmi_train

Classificacao
Sobrepeso           34.301131
Peso normal         29.213772
Obesidade grau 1    18.936280
Obesidade grau 2     7.399794
Obesidade grau 3     7.297020
Baixo peso           2.852004
Name: proportion, dtype: float64

<h2>Validation data</h2>

<h3>Loading validation dataset </h3>

In [19]:
validation_X = physionet2012_dataset['val_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>

In [20]:
distribution_gender_validation = validation_X['Gender'].value_counts(normalize=True) * 100
distribution_gender_validation # 0: female, or 1: male

Gender
 1.0    54.640250
 0.0    45.307612
-1.0     0.052138
Name: proportion, dtype: float64

<h3>Divided into subgroups by ICUType and showing the percentage</h3>

In [21]:
distribution_ICUType_validation = validation_X['ICUType'].value_counts(normalize=True) * 100
distribution_ICUType_validation

ICUType
3.0    36.704901
4.0    27.528676
2.0    21.480709
1.0    14.285714
Name: proportion, dtype: float64

<h3>Filtering only one age per patient</h3>

In [24]:
uniques_validation_per_variable = validation_X.groupby('RecordID').first().reset_index()
uniques_validation_per_variable.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132588,0,0.0,202.0,58.0,102.0,48.0,2.0,7.0,6.8,...,17.0,,,38.4,,,,7.0,42.3,
1,132601,0,0.0,,,,74.0,,10.0,,...,,99.0,127.0,36.2,,,442.5,16.4,75.9,7.39
2,132614,0,0.0,,,,77.0,,49.0,,...,,,0.0,37.3,,,50.0,17.9,59.0,7.46
3,132615,0,0.0,81.0,32.0,28.0,46.0,2.6,22.0,0.3,...,,,0.0,36.4,,,80.0,22.1,88.6,7.34
4,132632,0,0.0,140.0,18.0,11.0,49.0,3.4,121.0,0.1,...,,,127.0,36.5,,,120.0,8.0,162.2,7.21


<h3>Showing total occurrences of age</h3>

In [43]:
uniques_validation_per_variable["Age"].count()

1918

<h3>Dividing age into subgroup of people aged 65+</h3>

In [27]:
more_than_or_equal_to_65_validation = uniques_validation_per_variable[uniques_validation_per_variable['Age'] >= 65]
percentage_group_one_validation = (more_than_or_equal_to_65_validation["Age"].count() / uniques_validation_per_variable["Age"].count())*100
percentage_group_one_validation

54.74452554744526

<h3>Dividing age into subgroup of people under 65 years of age</h3>

In [None]:
less_than_65 = uniques_validation_per_variable[uniques_validation_per_variable['Age'] < 65]
percentage_group_two_validation = (less_than_65["Age"].count() / uniques_validation_per_variable["Age"].count()) * 100
percentage_group_two_validation

45.25547445255474

<h3>Filtering only one height and one weight per patient</h3>

In [31]:
filtered_validation_X = validation_X[(validation_X['Height'] != -1) & (validation_X['Weight'] != -1) & (validation_X['Height'].notna()) & (validation_X['Weight'].notna())] 

In [33]:
filtered_uniques_validation = filtered_validation_X.groupby('RecordID').first().reset_index() 
filtered_uniques_validation.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132588,0,0.0,,,,48.0,,,,...,,,,,,,,,42.3,
1,132601,0,0.0,,,,74.0,,,,...,,,,,,,,,75.9,7.39
2,132614,0,0.0,,,,77.0,,,,...,,,,,,,,,59.0,
3,132615,0,0.0,81.0,32.0,28.0,46.0,2.6,22.0,0.3,...,,,0.0,36.4,,,80.0,22.1,88.6,7.34
4,132648,0,0.0,,,,87.0,,,,...,,,144.0,37.8,,,1112.5,,66.0,


In [34]:
filtered_uniques_validation['Height'] = filtered_uniques_validation['Height'] / 100 # Converting Height from cm to meters

In [35]:
bmi_data_validation = pd.DataFrame(columns=['RecordID', 'Height', 'Weight', 'BMI', 'Classificacao'])
bmi_data_validation["RecordID"] = filtered_uniques_validation["RecordID"]
bmi_data_validation["Height"] = filtered_uniques_validation["Height"]
bmi_data_validation["Weight"] = filtered_uniques_validation["Weight"]
bmi_data_validation["BMI"] = filtered_uniques_validation["Weight"] / (filtered_uniques_validation["Height"] ** 2)
bmi_data_validation["Classificacao"] = bmi_data_validation["BMI"].apply(classify_BMI)
bmi_data_validation.head()

Unnamed: 0,RecordID,Height,Weight,BMI,Classificacao
0,132588,1.549,42.3,17.6294,Baixo peso
1,132601,1.778,75.9,24.009232,Peso normal
2,132614,1.626,59.0,22.315721,Peso normal
3,132615,1.524,88.6,38.147299,Obesidade grau 2
4,132648,1.575,66.0,26.606198,Sobrepeso


<h3>Percentage of BMI classification groups</h3>

In [36]:
percentage_bmi_validation = bmi_data_validation["Classificacao"].value_counts(normalize=True) * 100

<h2>Test data</h2>

In [37]:
test_X = physionet2012_dataset['test_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>


In [38]:
distribution_gender_test = test_X['Gender'].value_counts(normalize=True) * 100
distribution_gender_test # 0: female, or 1: male

Gender
 1.0    56.356815
 0.0    43.476449
-1.0     0.166736
Name: proportion, dtype: float64

<h3>Divided into subgroups by ICUType and showing the percentage</h3>


In [39]:
distribution_ICUType_test= test_X['ICUType'].value_counts(normalize=True) * 100
distribution_ICUType_test

ICUType
3.0    34.014173
4.0    30.179241
2.0    20.758649
1.0    15.047937
Name: proportion, dtype: float64

<h3>Filtering only one age per patient</h3>


In [40]:
uniques_test_per_variable = test_X.groupby('RecordID').first().reset_index()
uniques_test_per_variable.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132539,0,0.0,,,,54.0,,13.0,,...,19.0,,,35.35,,,480.0,11.2,-1.0,
1,132540,0,0.0,,,,76.0,,16.0,,...,,99.0,103.0,34.88,,,316.666667,7.4,76.0,7.45
2,132547,0,0.0,101.0,45.0,47.0,64.0,,15.0,0.4,...,,96.0,141.0,35.8,1.3,,1200.0,24.0,114.0,7.29
3,132554,0,0.0,,,,64.0,,21.0,,...,44.0,,,37.3,,,300.0,15.2,60.7,
4,132556,0,0.0,402.0,36.0,47.0,64.0,2.7,64.0,0.1,...,10.0,,,36.7,,,200.0,21.8,65.0,


<h3>Showing total occurrences of age</h3>

In [41]:
uniques_test_per_variable["Age"].count()

2399

<h3>Dividing age into subgroup of people aged 65+</h3>


In [None]:
more_than_or_equal_to_65_test = uniques_test_per_variable[uniques_test_per_variable['Age'] >= 65]
percentage_group_one_test = (more_than_or_equal_to_65_test["Age"].count() / uniques_test_per_variable["Age"].count())*100
percentage_group_one_test

54.7311379741559

<h3>Dividing age into subgroup of people under 65 years of age</h3>


In [None]:
less_than_65_test = uniques_test_per_variable[uniques_test_per_variable['Age'] < 65]
percentage_group_two_test = (less_than_65_test["Age"].count() / uniques_test_per_variable["Age"].count()) * 100
percentage_group_two_test

45.2688620258441

<h3>Filtering only one height and one weight per patient</h3>


In [49]:
filtered_test_X = test_X[(test_X['Height'] != -1) & (test_X['Weight'] != -1) & (test_X['Height'].notna()) & (test_X['Weight'].notna())] 

In [52]:
filtered_uniques_test = filtered_test_X.groupby('RecordID').first().reset_index() 
filtered_uniques_test.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132540,0,0.0,,,,76.0,,21.0,,...,,93.0,122.0,37.5,,,50.0,13.3,76.0,7.45
1,132547,0,0.0,,,,64.0,,,,...,,,,,,,,,114.0,
2,132575,0,0.0,,,,78.0,,18.0,,...,,96.0,122.0,37.4,,,38.0,12.5,63.0,7.34
3,132590,0,0.0,,,,58.0,,,,...,,,119.0,36.8,,,70.0,,98.0,
4,132602,0,0.0,,,,80.0,,,,...,,,,37.3,,,150.0,,70.0,


<h3>Calculate the BMI</h3>


In [53]:
filtered_uniques_test['Height'] = filtered_uniques_test['Height'] / 100 # Converting Height from cm to meters

In [None]:
bmi_data_test = pd.DataFrame(columns=['RecordID', 'Height', 'Weight', 'BMI', 'Classificacao'])
bmi_data_test["RecordID"] = filtered_uniques_test["RecordID"]
bmi_data_test["Height"] = filtered_uniques_test["Height"]
bmi_data_test["Weight"] = filtered_uniques_test["Weight"]
bmi_data_test["BMI"] = filtered_uniques_test["Weight"] / (filtered_uniques_test["Height"] ** 2)
bmi_data_test["Classificacao"] = bmi_data_test["BMI"].apply(classify_BMI)
bmi_data_test.head()

Unnamed: 0,RecordID,Height,Weight,BMI,Classificacao
0,132540,1.753,76.0,24.73146,Peso normal
1,132547,1.803,114.0,35.068194,Obesidade grau 2
2,132575,1.676,63.0,22.428102,Peso normal
3,132590,1.88,98.0,27.727478,Sobrepeso
4,132602,1.803,70.0,21.533101,Peso normal


<h3>Percentage of BMI classification groups</h3>

In [55]:
percentage_bmi_test = bmi_data_test["Classificacao"].value_counts(normalize=True) * 100


In [56]:
subgroups = [("Female"), ("Male"), ("ICUType 1"), ("ICUType 2"), ("ICUType 3"), ("ICUType 4"),
    ("Age 65+"), ("Age 65-"), ("Baixo Peso"), ("Peso normal"), ("Sobrepeso"),
    ("Obesidade grau 1"), ("Obesidade grau 2"), ("Obesidade grau 3")
]

df_subgroups = pd.DataFrame(subgroups, columns=["Subgroups"])

In [58]:
train_subgroups = [
    (distribution_gender_training[0]), (distribution_gender_training[1]), (distribution_ICUType_training[1]), (distribution_ICUType_training[2]),
    (distribution_ICUType_training[3]), (distribution_ICUType_training[4]), (percentage_group_one_train), (percentage_group_two_train),
    (percentage_bmi_train["Baixo peso"]), (percentage_bmi_train["Peso normal"]), (percentage_bmi_train["Sobrepeso"]), (percentage_bmi_train["Obesidade grau 1"]),
    (percentage_bmi_train["Obesidade grau 2"]), (percentage_bmi_train["Obesidade grau 3"])
]

df_train_subgroups = pd.DataFrame(train_subgroups, columns=["train"])

In [59]:
validation_subgroups = [
    (distribution_gender_validation[0]), (distribution_gender_validation[1]), (distribution_ICUType_validation[1]), (distribution_ICUType_validation[2]),
    (distribution_ICUType_validation[3]), (distribution_ICUType_validation[4]), (percentage_group_one_validation), (percentage_group_two_validation),
    (percentage_bmi_validation["Baixo peso"]), (percentage_bmi_validation["Peso normal"]), (percentage_bmi_validation["Sobrepeso"]), (percentage_bmi_validation["Obesidade grau 1"]),
    (percentage_bmi_validation["Obesidade grau 2"]), (percentage_bmi_validation["Obesidade grau 3"])
]

df_validation_subgroups = pd.DataFrame(validation_subgroups, columns=["train"])

In [60]:
test_subgroups = [
    (distribution_gender_test[0]), (distribution_gender_test[1]), (distribution_ICUType_test[1]), (distribution_ICUType_test[2]),
    (distribution_ICUType_test[3]), (distribution_ICUType_test[4]), (percentage_group_one_test), (percentage_group_two_test),
    (percentage_bmi_test["Baixo peso"]), (percentage_bmi_test["Peso normal"]), (percentage_bmi_test["Sobrepeso"]), (percentage_bmi_test["Obesidade grau 1"]),
    (percentage_bmi_test["Obesidade grau 2"]), (percentage_bmi_test["Obesidade grau 3"])
]

df_test_subgroups = pd.DataFrame(test_subgroups, columns=["train"])

In [61]:
table1 = pd.DataFrame(columns=['Subgroups', 'train', 'validation', 'test'])
table1["Subgroups"] = df_subgroups
table1["train"] = df_train_subgroups
table1["validation"] = df_validation_subgroups
table1["test"] = df_test_subgroups



In [62]:
table1

Unnamed: 0,Subgroups,train,validation,test
0,Female,43.63186,45.307612,43.476449
1,Male,56.276887,54.64025,56.356815
2,ICUType 1,14.730804,14.285714,15.047937
3,ICUType 2,21.092426,21.480709,20.758649
4,ICUType 3,36.070916,36.704901,34.014173
5,ICUType 4,28.105853,27.528676,30.179241
6,Age 65+,54.647373,54.744526,54.731138
7,Age 65-,45.352627,45.255474,45.268862
8,Baixo Peso,2.852004,2.649657,3.56564
9,Peso normal,29.213772,30.029441,30.713128
