In [1]:
import os
import sys
import pandas as pd
from IPython.display import display, HTML
# Put the parent directory on the import path (once), presumably so that
# packages living one level above this notebook can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

<h1>Loading database</h1>

In [2]:
from pypots.benchpots.datasets import preprocess_physionet2012
# Load the PhysioNet-2012 data; the returned mapping is read further down through
# its 'train_X', 'val_X' and 'test_X' keys.
# NOTE(review): `rate=0.1` presumably is an artificial missing rate applied by the
# preprocessing - confirm it does not affect the descriptive statistics below.
physionet2012_dataset = preprocess_physionet2012(subset="all", rate=0.1)

2024-11-11 17:28:09 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-11-11 17:28:09 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-11-11 17:28:09 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-11-11 17:28:10 [INFO]: Loaded successfully!


<h2>Training data</h2>

<h3>Loading training dataset</h3>

In [3]:
# Training split of the loaded dataset (used below as a pandas DataFrame).
train_X = physionet2012_dataset['train_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>

In [4]:
# Percentage of training rows per Gender code, rounded to one decimal place.
# Codes: 0 = female, 1 = male.
distribution_gender_training = (
    train_X["Gender"]
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
)

In [5]:
# The summary table built later looks up the gender codes 0 (female), 1 (male)
# and -1 (undefined). Any code that does not occur in this split gets a 0 % entry.
# Each label is checked explicitly: the previous "count() != 3" test overwrote an
# existing -1 share with 0 whenever a *different* code was the missing one.
for gender_code in (0, 1, -1):
    if gender_code not in distribution_gender_training.index:
        distribution_gender_training.loc[gender_code] = 0

<h3>Divided into subgroups by ICUType and showing the percentage</h3>

In [6]:
# Percentage of training rows per ICUType value, rounded to one decimal place.
distribution_ICUType_training = (
    train_X["ICUType"]
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
)

<h3>Keeping one row per patient (the first time step, Time == 0)</h3>

In [7]:
# Keep only the rows recorded at Time == 0 of the training split.
uniques_train_per_variable = train_X.loc[train_X["Time"].eq(0.0)]
uniques_train_per_variable

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132539,0,0.0,,,,54.0,,,,...,19.0,,,35.35,,,480.0,,-1.0,
48,132540,0,0.0,,,,76.0,,,,...,,,,,,,,,76.0,7.45
144,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,18.0,,,36.30,,,,11.5,84.6,
192,132545,0,0.0,,,,88.0,,,,...,,,,,,,,,-1.0,
240,132547,0,0.0,,,,64.0,,,,...,,,,,,,,,114.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
575040,163017,0,0.0,,,,64.0,,,,...,23.0,,,37.00,,,200.0,,84.5,
575088,163021,0,0.0,,,,72.0,,9.0,,...,,,,,,,,8.6,62.0,
575184,163029,0,0.0,,,,61.0,,,,...,,,,,,,,,85.0,
575232,163033,0,0.0,,,,51.0,,,,...,,,,,,,,,208.0,


In [8]:
uniques_train_per_variable.count()

RecordID       7671
level_1        7671
Time           7671
ALP             330
ALT             342
AST             341
Age            7671
Albumin         291
BUN             888
Bilirubin       325
Cholesterol      40
Creatinine      889
DiasABP        1415
FiO2           1443
GCS            2740
Gender         7671
Glucose         786
HCO3            808
HCT             995
HR             3844
Height         7671
ICUType        7671
K               807
Lactate         871
MAP            1367
MechVent       1373
Mg              637
NIDiasABP      2853
NIMAP          2828
NISysABP       2855
Na              795
PaCO2          1750
PaO2           1721
Platelets      1015
RespRate       1155
SaO2            263
SysABP         1416
Temp           2909
TroponinI        29
TroponinT       183
Urine          2377
WBC             922
Weight         7671
pH             1770
dtype: int64

<h3>Showing total occurrences of age</h3>

In [9]:
# About 64% of the 11988 patients (7671 train + 1918 validation + 2399 test) are in the training set.
uniques_train_per_variable["Age"].count()

7671

<h3>Dividing age into subgroup of people aged 65+</h3>

In [10]:
# Rows of the Time == 0 training frame with Age >= 65, and their share (in %,
# one decimal place) of all rows that have an Age value.
more_than_or_equal_to_65_training = uniques_train_per_variable.loc[
    uniques_train_per_variable["Age"].ge(65)
]
percentage_group_one_train = round(
    (
        more_than_or_equal_to_65_training["Age"].count()
        / uniques_train_per_variable["Age"].count()
    )
    * 100,
    1,
)

<h3>Dividing age into subgroup of people under 65 years of age</h3>

In [11]:
# Rows of the Time == 0 training frame with Age < 65, and their share (in %,
# one decimal place) of all rows that have an Age value.
less_than_65 = uniques_train_per_variable.loc[
    uniques_train_per_variable["Age"].lt(65)
]
percentage_group_two_train = round(
    (
        less_than_65["Age"].count()
        / uniques_train_per_variable["Age"].count()
    )
    * 100,
    1,
)

<h3>Filtering only one height and one weight per patient</h3>

<h3>Calculate the BMI</h3>

In [12]:
def classify_BMI(BMI):
    """Map a body-mass index to its weight-class label.

    The bands are contiguous, so every numeric BMI receives a label even when
    the value has not been rounded to one decimal place. (The previous closed
    bands, e.g. 18.6-24.9 and 25-29.9, left gaps such as 24.9 < BMI < 25 that
    fell through and returned ``None``.) Every value that was labelled before
    keeps the same label.

    Parameters
    ----------
    BMI : float
        Body-mass index in kg/m^2.

    Returns
    -------
    str or None
        "Baixo peso" (BMI <= 18.5), "Peso normal" (18.5 < BMI < 25),
        "Sobrepeso" (25 <= BMI < 30), "Obesidade grau 1" (30 <= BMI < 35),
        "Obesidade grau 2" (35 <= BMI < 40) or "Obesidade grau 3" (BMI >= 40);
        ``None`` when ``BMI`` is NaN.
    """
    # NOTE(review): 18.5 itself stays in "Baixo peso" as in the original code; the
    # usual WHO convention counts 18.5 as normal weight - confirm the intent.
    if BMI <= 18.5:
        return "Baixo peso"
    if BMI < 25:
        return "Peso normal"
    if BMI < 30:
        return "Sobrepeso"
    if BMI < 35:
        return "Obesidade grau 1"
    if BMI < 40:
        return "Obesidade grau 2"
    if BMI >= 40:
        return "Obesidade grau 3"
    # Only reached when every comparison above is False, i.e. BMI is NaN.
    return None

In [13]:
# Number of distinct RecordID groups (patients) in the training split.
train_count = (
    train_X.groupby("RecordID")
    .first()
    .reset_index()["RecordID"]
    .count()
)
train_count

7671

In [14]:
# Keep rows where Height and Weight are both present and different from -1,
# then collapse to one row per RecordID (groupby().first() takes the first
# non-null entry of each column within the group).
filtered_train = (
    train_X.loc[
        train_X["Height"].ne(-1)
        & train_X["Weight"].ne(-1)
        & train_X["Height"].notna()
        & train_X["Weight"].notna()
    ]
    .groupby("RecordID")
    .first()
    .reset_index()
)
filtered_train_ids = filtered_train["RecordID"]

In [15]:
filtered_train_ids.count()

4042

In [16]:
# Patients whose RecordID is not among filtered_train_ids, i.e. those for whom
# no BMI can be computed; one row per RecordID.
undefined_train = (
    train_X.loc[~train_X["RecordID"].isin(filtered_train_ids)]
    .groupby("RecordID")
    .first()
    .reset_index()
)
undefined_train_ids = undefined_train["RecordID"]

In [17]:
undefined_train_ids.count()

3629

In [18]:
# New frame with Height divided by 100 (presumably centimetres -> metres);
# filtered_train itself is left untouched.
filtered_train_metros = filtered_train.assign(Height=filtered_train["Height"] / 100)

In [19]:
# BMI = Weight / Height^2, rounded to one decimal place.
filtered_train_metros["BMI"] = (
    filtered_train_metros["Weight"] / filtered_train_metros["Height"].pow(2)
).round(1)

In [20]:
# Label every training patient's BMI with its weight class via classify_BMI.
filtered_train_metros["Classification"] = filtered_train_metros["BMI"].apply(classify_BMI)

In [21]:
# Patients classified "Baixo peso" as a percentage of train_count (one decimal).
baixo_peso_train_percentage = round(
    (
        filtered_train_metros.loc[
            filtered_train_metros["Classification"].eq("Baixo peso"), "RecordID"
        ].count()
        / train_count
    )
    * 100,
    1,
)
baixo_peso_train_percentage

1.8

In [22]:
# Patients classified "Peso normal" as a percentage of train_count (one decimal).
peso_normal_train_percentage = round(
    (
        filtered_train_metros.loc[
            filtered_train_metros["Classification"].eq("Peso normal"), "RecordID"
        ].count()
        / train_count
    )
    * 100,
    1,
)
peso_normal_train_percentage

16.1

In [23]:
# Patients classified "Sobrepeso" as a percentage of train_count (one decimal).
sobrepeso_train_percentage = round(
    (
        filtered_train_metros.loc[
            filtered_train_metros["Classification"].eq("Sobrepeso"), "RecordID"
        ].count()
        / train_count
    )
    * 100,
    1,
)
sobrepeso_train_percentage

17.5

In [24]:
# Patients classified "Obesidade grau 1" as a percentage of train_count (one decimal).
obesidade_1_train_percentege = round(
    (
        filtered_train_metros.loc[
            filtered_train_metros["Classification"].eq("Obesidade grau 1"), "RecordID"
        ].count()
        / train_count
    )
    * 100,
    1,
)
obesidade_1_train_percentege

9.8

In [25]:
# Patients classified "Obesidade grau 2" as a percentage of train_count (one decimal).
obesidade_2_train_percentege = round(
    (
        filtered_train_metros.loc[
            filtered_train_metros["Classification"].eq("Obesidade grau 2"), "RecordID"
        ].count()
        / train_count
    )
    * 100,
    1,
)
obesidade_2_train_percentege

4.2

In [26]:
# Patients classified "Obesidade grau 3" as a percentage of train_count (one decimal).
obesidade_3_train_percentege = round(
    (
        filtered_train_metros.loc[
            filtered_train_metros["Classification"].eq("Obesidade grau 3"), "RecordID"
        ].count()
        / train_count
    )
    * 100,
    1,
)
obesidade_3_train_percentege

3.4

In [27]:
# Patients without a computable BMI as a percentage of train_count (one decimal).
undefined_train_percentege = round(
    (undefined_train["RecordID"].count() / train_count) * 100,
    1,
)
undefined_train_percentege

47.3

<h2>Validation data</h2>

<h3>Loading validation dataset </h3>

In [28]:
# Validation split of the loaded dataset (used below as a pandas DataFrame).
validation_X = physionet2012_dataset['val_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>

In [29]:
# Percentage of validation rows per Gender code, rounded to one decimal place.
# Codes: 0 = female, 1 = male.
distribution_gender_validation = (
    validation_X["Gender"]
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
)

In [30]:
# The summary table built later looks up the gender codes 0 (female), 1 (male)
# and -1 (undefined). Any code that does not occur in this split gets a 0 % entry.
# Each label is checked explicitly: the previous "count() != 3" test overwrote an
# existing -1 share with 0 whenever a *different* code was the missing one.
for gender_code in (0, 1, -1):
    if gender_code not in distribution_gender_validation.index:
        distribution_gender_validation.loc[gender_code] = 0

<h3>Divided into subgroups by ICUType and showing the percentage</h3>

In [31]:
# Percentage of validation rows per ICUType value, rounded to one decimal place.
distribution_ICUType_validation = (
    validation_X["ICUType"]
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
)

<h3>Filtering only one age per patient</h3>

In [32]:
# Keep only the rows recorded at Time == 0 of the validation split.
uniques_validation_per_variable = validation_X.loc[validation_X["Time"].eq(0.0)]
uniques_validation_per_variable

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
768,132577,0,0.0,,,,65.0,,36.0,,...,,,,,,,,,66.3,
816,132582,0,0.0,,,,84.0,,,,...,,,,,,,,,82.5,
1008,132590,0,0.0,,,,58.0,,,,...,,,,,,,,,98.0,
1632,132615,0,0.0,,,,46.0,,,,...,,,0.0,36.4,,,80.0,,88.6,
1680,132617,0,0.0,,,,77.0,,,,...,,,,,,,,,75.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574032,162955,0,0.0,,,,69.0,,,,...,,,,,,,,,78.0,
574224,162976,0,0.0,,,,60.0,,,,...,,,,,,,,,90.0,
574416,162987,0,0.0,,,,57.0,,,,...,,,92.0,,,,380.0,,83.0,7.34
574944,163013,0,0.0,,,,74.0,,,,...,,,,36.5,,,,,68.6,


<h3>Showing total occurrences of age</h3>

In [33]:
uniques_validation_per_variable["Age"].count()

1918

<h3>Dividing age into subgroup of people aged 65+</h3>

In [34]:
# Rows of the Time == 0 validation frame with Age >= 65, and their share (in %,
# one decimal place) of all rows that have an Age value.
more_than_or_equal_to_65_validation = uniques_validation_per_variable.loc[
    uniques_validation_per_variable["Age"].ge(65)
]
percentage_group_one_validation = round(
    (
        more_than_or_equal_to_65_validation["Age"].count()
        / uniques_validation_per_variable["Age"].count()
    )
    * 100,
    1,
)

<h3>Dividing age into subgroup of people under 65 years of age</h3>

In [35]:
# Rows of the Time == 0 validation frame with Age < 65, and their share (in %,
# one decimal place) of all rows that have an Age value.
less_than_65 = uniques_validation_per_variable.loc[
    uniques_validation_per_variable["Age"].lt(65)
]
percentage_group_two_validation = round(
    (
        less_than_65["Age"].count()
        / uniques_validation_per_variable["Age"].count()
    )
    * 100,
    1,
)

<h3>Filtering only one height and one weight per patient</h3>

In [36]:
# Number of distinct RecordID groups (patients) in the validation split.
validation_count = (
    validation_X.groupby("RecordID")
    .first()
    .reset_index()["RecordID"]
    .count()
)
validation_count

1918

In [37]:
# Keep rows where Height and Weight are both present and different from -1,
# then collapse to one row per RecordID (groupby().first() takes the first
# non-null entry of each column within the group).
filtered_validation = (
    validation_X.loc[
        validation_X["Height"].ne(-1)
        & validation_X["Weight"].ne(-1)
        & validation_X["Height"].notna()
        & validation_X["Weight"].notna()
    ]
    .groupby("RecordID")
    .first()
    .reset_index()
)
filtered_validation_ids = filtered_validation["RecordID"]

In [38]:
filtered_validation_ids.count()

992

In [39]:
# Patients whose RecordID is not among filtered_validation_ids, i.e. those for
# whom no BMI can be computed; one row per RecordID.
undefined_validation = (
    validation_X.loc[~validation_X["RecordID"].isin(filtered_validation_ids)]
    .groupby("RecordID")
    .first()
    .reset_index()
)
undefined_validation_ids = undefined_validation["RecordID"]

In [40]:
undefined_validation_ids.count()

926

In [41]:
# New frame with Height divided by 100 (presumably centimetres -> metres);
# filtered_validation itself is left untouched.
filtered_validation_metros = filtered_validation.assign(
    Height=filtered_validation["Height"] / 100
)

In [42]:
# BMI = Weight / Height^2, rounded to one decimal place.
filtered_validation_metros["BMI"] = (
    filtered_validation_metros["Weight"] / filtered_validation_metros["Height"].pow(2)
).round(1)

In [43]:
# Label every validation patient's BMI with its weight class via classify_BMI.
filtered_validation_metros["Classification"] = filtered_validation_metros["BMI"].apply(classify_BMI)

In [44]:
filtered_validation_metros["Classification"].value_counts()

Classification
Sobrepeso           366
Peso normal         284
Obesidade grau 1    170
Obesidade grau 3     83
Obesidade grau 2     64
Baixo peso           25
Name: count, dtype: int64

In [45]:
# Patients classified "Baixo peso" as a percentage of validation_count (one decimal).
baixo_peso_validation_percentage = round(
    (
        filtered_validation_metros.loc[
            filtered_validation_metros["Classification"].eq("Baixo peso"), "RecordID"
        ].count()
        / validation_count
    )
    * 100,
    1,
)
baixo_peso_validation_percentage

1.3

In [46]:
# Patients classified "Peso normal" as a percentage of validation_count (one decimal).
peso_normal_validation_percentage = round(
    (
        filtered_validation_metros.loc[
            filtered_validation_metros["Classification"].eq("Peso normal"), "RecordID"
        ].count()
        / validation_count
    )
    * 100,
    1,
)
peso_normal_validation_percentage

14.8

In [47]:
# Patients classified "Sobrepeso" as a percentage of validation_count (one decimal).
sobrepeso_validation_percentage = round(
    (
        filtered_validation_metros.loc[
            filtered_validation_metros["Classification"].eq("Sobrepeso"), "RecordID"
        ].count()
        / validation_count
    )
    * 100,
    1,
)
sobrepeso_validation_percentage

19.1

In [48]:
# Patients classified "Obesidade grau 1" as a percentage of validation_count (one decimal).
obesidade_1_validation_percentege = round(
    (
        filtered_validation_metros.loc[
            filtered_validation_metros["Classification"].eq("Obesidade grau 1"), "RecordID"
        ].count()
        / validation_count
    )
    * 100,
    1,
)
obesidade_1_validation_percentege

8.9

In [49]:
# Patients classified "Obesidade grau 2" as a percentage of validation_count (one decimal).
obesidade_2_validation_percentege = round(
    (
        filtered_validation_metros.loc[
            filtered_validation_metros["Classification"].eq("Obesidade grau 2"), "RecordID"
        ].count()
        / validation_count
    )
    * 100,
    1,
)
obesidade_2_validation_percentege

3.3

In [50]:
# Patients classified "Obesidade grau 3" as a percentage of validation_count (one decimal).
obesidade_3_validation_percentege = round(
    (
        filtered_validation_metros.loc[
            filtered_validation_metros["Classification"].eq("Obesidade grau 3"), "RecordID"
        ].count()
        / validation_count
    )
    * 100,
    1,
)
obesidade_3_validation_percentege

4.3

In [51]:
# Patients without a computable BMI as a percentage of validation_count (one decimal).
undefined_validation_percentege = round(
    (undefined_validation["RecordID"].count() / validation_count) * 100,
    1,
)
undefined_validation_percentege

48.3

<h2>Test data</h2>

In [52]:
# Test split of the loaded dataset (used below as a pandas DataFrame).
test_X = physionet2012_dataset['test_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>


In [53]:
# Percentage of test rows per Gender code, rounded to one decimal place.
# Codes: 0 = female, 1 = male.
distribution_gender_test = (
    test_X["Gender"]
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
)

In [54]:
# The summary table built later looks up the gender codes 0 (female), 1 (male)
# and -1 (undefined). Any code that does not occur in this split gets a 0 % entry.
# Each label is checked explicitly: the previous "count() != 3" test overwrote an
# existing -1 share with 0 whenever a *different* code was the missing one.
for gender_code in (0, 1, -1):
    if gender_code not in distribution_gender_test.index:
        distribution_gender_test.loc[gender_code] = 0

<h3>Divided into subgroups by ICUType and showing the percentage</h3>


In [55]:
# Percentage of test rows per ICUType value, rounded to one decimal place.
distribution_ICUType_test = (
    test_X["ICUType"]
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
)

<h3>Filtering only one age per patient</h3>


In [56]:
# One row per RecordID of the test split.
# NOTE(review): groupby().first() takes the first non-null entry of each column,
# whereas the training/validation sections select the Time == 0 rows; the Age
# statistics below should be unaffected, but the other columns differ - confirm
# the inconsistency is intended.
uniques_test_per_variable = (
    test_X.groupby("RecordID")
    .first()
    .reset_index()
)
uniques_test_per_variable.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132541,0,0.0,127.0,91.0,235.0,44.0,2.7,8.0,3.0,...,,95.0,148.0,37.8,,,100.0,4.2,56.7,7.51
1,132548,0,0.0,,,,68.0,,32.0,,...,14.5,,205.0,36.3,0.7,,120.0,6.2,87.0,
2,132570,0,0.0,19.0,15.0,20.0,84.0,,83.0,0.1,...,16.5,98.0,,36.6,,,600.0,8.8,102.6,
3,132584,0,0.0,48.0,10.0,13.0,78.0,2.2,16.0,0.9,...,,,6.333333,36.3,,0.03,50.0,31.7,72.8,7.1
4,132599,0,0.0,124.0,14.0,20.0,53.0,2.0,33.0,1.3,...,,98.0,135.0,37.3,,0.02,350.0,14.2,73.5,7.48


<h3>Showing total occurrences of age</h3>

In [57]:
uniques_test_per_variable["Age"].count()

2399

<h3>Dividing age into subgroup of people aged 65+</h3>


In [58]:
# Per-patient test rows with Age >= 65, and their share (in %, one decimal
# place) of all rows that have an Age value.
more_than_or_equal_to_65_test = uniques_test_per_variable.loc[
    uniques_test_per_variable["Age"].ge(65)
]
percentage_group_one_test = round(
    (
        more_than_or_equal_to_65_test["Age"].count()
        / uniques_test_per_variable["Age"].count()
    )
    * 100,
    1,
)

<h3>Dividing age into subgroup of people under 65 years of age</h3>


In [59]:
# Per-patient test rows with Age < 65, and their share (in %, one decimal
# place) of all rows that have an Age value.
less_than_65_test = uniques_test_per_variable.loc[
    uniques_test_per_variable["Age"].lt(65)
]
percentage_group_two_test = round(
    (
        less_than_65_test["Age"].count()
        / uniques_test_per_variable["Age"].count()
    )
    * 100,
    1,
)

<h3>Filtering only one height and one weight per patient</h3>


In [60]:
# Number of distinct RecordID groups (patients) in the test split.
test_count = (
    test_X.groupby("RecordID")
    .first()
    .reset_index()["RecordID"]
    .count()
)
test_count

2399

In [61]:
# Keep rows where Height and Weight are both present and different from -1,
# then collapse to one row per RecordID (groupby().first() takes the first
# non-null entry of each column within the group).
filtered_test = (
    test_X.loc[
        test_X["Height"].ne(-1)
        & test_X["Weight"].ne(-1)
        & test_X["Height"].notna()
        & test_X["Weight"].notna()
    ]
    .groupby("RecordID")
    .first()
    .reset_index()
)
filtered_test_ids = filtered_test["RecordID"]

In [62]:
filtered_test_ids.count()

1234

In [63]:
# Patients whose RecordID is not among filtered_test_ids, i.e. those for whom
# no BMI can be computed; one row per RecordID.
undefined_test = (
    test_X.loc[~test_X["RecordID"].isin(filtered_test_ids)]
    .groupby("RecordID")
    .first()
    .reset_index()
)
undefined_test_ids = undefined_test["RecordID"]

In [64]:
undefined_test_ids.count()

1165

In [65]:
# New frame with Height divided by 100 (presumably centimetres -> metres);
# filtered_test itself is left untouched.
filtered_test_metros = filtered_test.assign(Height=filtered_test["Height"] / 100)

In [66]:
# BMI = Weight / Height^2, rounded to one decimal place.
filtered_test_metros["BMI"] = (
    filtered_test_metros["Weight"] / filtered_test_metros["Height"].pow(2)
).round(1)

In [67]:
# Label every test patient's BMI with its weight class via classify_BMI.
filtered_test_metros["Classification"] = filtered_test_metros["BMI"].apply(classify_BMI)

In [68]:
filtered_test_metros["Classification"].value_counts()

Classification
Sobrepeso           455
Peso normal         343
Obesidade grau 1    220
Obesidade grau 2     92
Obesidade grau 3     91
Baixo peso           33
Name: count, dtype: int64

In [69]:
# Patients classified "Baixo peso" as a percentage of test_count (one decimal).
baixo_peso_test_percentage = round(
    (
        filtered_test_metros.loc[
            filtered_test_metros["Classification"].eq("Baixo peso"), "RecordID"
        ].count()
        / test_count
    )
    * 100,
    1,
)
baixo_peso_test_percentage

1.4

In [70]:
# Patients classified "Peso normal" as a percentage of test_count (one decimal).
peso_normal_test_percentage = round(
    (
        filtered_test_metros.loc[
            filtered_test_metros["Classification"].eq("Peso normal"), "RecordID"
        ].count()
        / test_count
    )
    * 100,
    1,
)
peso_normal_test_percentage

14.3

In [71]:
# Patients classified "Sobrepeso" as a percentage of test_count (one decimal).
sobrepeso_test_percentage = round(
    (
        filtered_test_metros.loc[
            filtered_test_metros["Classification"].eq("Sobrepeso"), "RecordID"
        ].count()
        / test_count
    )
    * 100,
    1,
)
sobrepeso_test_percentage

19.0

In [72]:
# Patients classified "Obesidade grau 1" as a percentage of test_count (one decimal).
obesidade_1_test_percentege = round(
    (
        filtered_test_metros.loc[
            filtered_test_metros["Classification"].eq("Obesidade grau 1"), "RecordID"
        ].count()
        / test_count
    )
    * 100,
    1,
)
obesidade_1_test_percentege

9.2

In [73]:
# Patients classified "Obesidade grau 2" as a percentage of test_count (one decimal).
obesidade_2_test_percentege = round(
    (
        filtered_test_metros.loc[
            filtered_test_metros["Classification"].eq("Obesidade grau 2"), "RecordID"
        ].count()
        / test_count
    )
    * 100,
    1,
)
obesidade_2_test_percentege

3.8

In [74]:
# Patients classified "Obesidade grau 3" as a percentage of test_count (one decimal).
obesidade_3_test_percentege = round(
    (
        filtered_test_metros.loc[
            filtered_test_metros["Classification"].eq("Obesidade grau 3"), "RecordID"
        ].count()
        / test_count
    )
    * 100,
    1,
)
obesidade_3_test_percentege

3.8

In [75]:
# Patients without a computable BMI as a percentage of test_count (one decimal).
undefined_test_percentege = round(
    (undefined_test["RecordID"].count() / test_count) * 100,
    1,
)
undefined_test_percentege

48.6

<h3>Percentages of all subgroups (gender, ICU type, age and BMI classification)</h3>

In [76]:
# Row labels of the summary table, in the same order as the per-split value
# lists: gender, ICU type, age group, BMI classification.
# NOTE(review): "Baixo Peso" is capitalised differently from the "Baixo peso"
# label returned by classify_BMI; it is only used for display here.
subgroups = [
    "Female",
    "Male",
    "Undefined gender",
    "ICUType 1",
    "ICUType 2",
    "ICUType 3",
    "ICUType 4",
    "Age 65+",
    "Age 65-",
    "Baixo Peso",
    "Peso normal",
    "Sobrepeso",
    "Obesidade grau 1",
    "Obesidade grau 2",
    "Obesidade grau 3",
    "Undefined_classification",
]

df_subgroups = pd.DataFrame({"Subgroups": subgroups})

In [77]:
# Training percentages in the order of the subgroup labels: gender codes
# (0, 1, -1), ICU types 1-4, the two age groups, then the BMI classes.
train_subgroups = [
    *(distribution_gender_training[code] for code in (0, 1, -1)),
    *(distribution_ICUType_training[icu_type] for icu_type in (1, 2, 3, 4)),
    percentage_group_one_train,
    percentage_group_two_train,
    baixo_peso_train_percentage,
    peso_normal_train_percentage,
    sobrepeso_train_percentage,
    obesidade_1_train_percentege,
    obesidade_2_train_percentege,
    obesidade_3_train_percentege,
    undefined_train_percentege,
]

df_train_subgroups = pd.DataFrame({"train": train_subgroups})

In [78]:
# Validation percentages in the order of the subgroup labels: gender codes
# (0, 1, -1), ICU types 1-4, the two age groups, then the BMI classes.
validation_subgroups = [
    distribution_gender_validation[0],
    distribution_gender_validation[1],
    distribution_gender_validation[-1],
    distribution_ICUType_validation[1],
    distribution_ICUType_validation[2],
    distribution_ICUType_validation[3],
    distribution_ICUType_validation[4],
    percentage_group_one_validation,
    percentage_group_two_validation,
    baixo_peso_validation_percentage,
    peso_normal_validation_percentage,
    sobrepeso_validation_percentage,
    obesidade_1_validation_percentege,
    obesidade_2_validation_percentege,
    obesidade_3_validation_percentege,
    undefined_validation_percentege,
]

# The column was previously mislabelled "train" (copy-paste from the training
# cell); the single-column frame is assigned as a whole later, so renaming the
# column does not affect that assignment.
df_validation_subgroups = pd.DataFrame(validation_subgroups, columns=["validation"])

In [79]:
# Test percentages in the order of the subgroup labels: gender codes
# (0, 1, -1), ICU types 1-4, the two age groups, then the BMI classes.
test_subgroups = [
    distribution_gender_test[0],
    distribution_gender_test[1],
    distribution_gender_test[-1],
    distribution_ICUType_test[1],
    distribution_ICUType_test[2],
    distribution_ICUType_test[3],
    distribution_ICUType_test[4],
    percentage_group_one_test,
    percentage_group_two_test,
    baixo_peso_test_percentage,
    peso_normal_test_percentage,
    sobrepeso_test_percentage,
    obesidade_1_test_percentege,
    obesidade_2_test_percentege,
    obesidade_3_test_percentege,
    undefined_test_percentege,
]

# The column was previously mislabelled "train" (copy-paste from the training
# cell); the single-column frame is assigned as a whole later, so renaming the
# column does not affect that assignment.
df_test_subgroups = pd.DataFrame(test_subgroups, columns=["test"])

<h2>Table 3</h2>

In [80]:
# Assemble the summary table: one row per subgroup, one column per split.
# Each single-column frame is assigned to its target column as a whole, so the
# source frames' own column names play no role.
table_descriptive_statistics = pd.DataFrame(
    columns=['Subgroups', 'Train', 'Validation', 'Test']
).assign(
    Subgroups=df_subgroups,
    Train=df_train_subgroups,
    Validation=df_validation_subgroups,
    Test=df_test_subgroups,
)

# Render a title above the table, then show the table as the cell output.
display(HTML("<h2 style=' font-size: 24px; font-weight: bold;'>Descriptive statistics stratified by demographics</h2>"))
table_descriptive_statistics

Unnamed: 0,Subgroups,Train,Validation,Test
0,Female,44.3,43.0,43.1
1,Male,55.6,56.9,56.7
2,Undefined gender,0.1,0.1,0.1
3,ICUType 1,14.8,13.3,15.4
4,ICUType 2,21.1,21.8,20.4
5,ICUType 3,35.6,36.7,35.6
6,ICUType 4,28.5,28.2,28.6
7,Age 65+,55.4,53.8,53.1
8,Age 65-,44.6,46.2,46.9
9,Baixo Peso,1.8,1.3,1.4
