In [2]:
import os
import sys
import pandas as pd
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

<h1>Loading database</h1>

In [3]:
from pypots.benchpots.datasets import preprocess_physionet2012
physionet2012_dataset = preprocess_physionet2012(subset="all", rate=0.1)

2024-10-31 08:52:20 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-10-31 08:52:20 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-10-31 08:52:20 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-10-31 08:52:20 [INFO]: Loaded successfully!


<h2>Training data</h2>

<h3>Loading training dataset</h3>

In [4]:
train_X = physionet2012_dataset['train_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>

In [5]:
distribution_gender_training = train_X['Gender'].value_counts(normalize=True) * 100
distribution_gender_training # 0: female, or 1: male

Gender
 1.0    56.055273
 0.0    43.827402
-1.0     0.117325
Name: proportion, dtype: float64

<h3>Divided into subgroups by ICUType and showing the percentage</h3>

In [6]:
distribution_ICUType_training = train_X['ICUType'].value_counts(normalize=True) * 100
distribution_ICUType_training

ICUType
3.0    36.018772
4.0    28.483900
2.0    21.157607
1.0    14.339721
Name: proportion, dtype: float64

<h3>Filtering one measurement per patient</h3>

In [7]:
uniques_train_per_variable = train_X.groupby('RecordID').first().reset_index()
uniques_train_per_variable.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132540,0,0.0,,,,76.0,,16.0,,...,,99.0,103.0,34.88,,,316.666667,7.4,76.0,7.45
1,132541,0,0.0,127.0,91.0,235.0,44.0,2.7,8.0,3.0,...,,95.0,148.0,37.8,,,100.0,4.2,56.7,7.51
2,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,18.0,,,36.3,,,600.0,11.5,84.6,
3,132545,0,0.0,,,,88.0,3.3,45.0,,...,24.0,,,37.8,,,140.0,3.8,-1.0,
4,132547,0,0.0,101.0,45.0,47.0,64.0,,15.0,0.4,...,,96.0,141.0,35.8,1.3,,1200.0,24.0,114.0,7.29


In [8]:
uniques_train_per_variable.count()

RecordID       7671
level_1        7671
Time           7671
ALP            3238
ALT            3312
AST            3316
Age            7671
Albumin        3071
BUN            7556
Bilirubin      3302
Cholesterol     602
Creatinine     7556
DiasABP        5356
FiO2           5167
GCS            7566
Gender         7671
Glucose        7487
HCO3           7541
HCT            7554
HR             7566
Height         7671
ICUType        7671
K              7516
Lactate        4178
MAP            5341
MechVent       4837
Mg             7493
NIDiasABP      6762
NIMAP          6749
NISysABP       6778
Na             7540
PaCO2          5756
PaO2           5757
Platelets      7548
RespRate       2144
SaO2           3441
SysABP         5356
Temp           7566
TroponinI       373
TroponinT      1674
Urine          7483
WBC            7535
Weight         7671
pH             5787
dtype: int64

<h3>Showing total occurrences of age</h3>

In [9]:
# 60% de 11988 (training set)
uniques_train_per_variable["Age"].count()

7671

<h3>Dividing age into subgroup of people aged 65+</h3>

In [10]:
more_than_or_equal_to_65_training = uniques_train_per_variable[uniques_train_per_variable['Age'] >= 65]
percentage_group_one_train = (more_than_or_equal_to_65_training["Age"].count() / uniques_train_per_variable["Age"].count())*100
percentage_group_one_train

54.569156563681396

<h3>Dividing age into subgroup of people under 65 years of age</h3>

In [11]:
less_than_65 = uniques_train_per_variable[uniques_train_per_variable['Age'] < 65]
percentage_group_two_train = (less_than_65["Age"].count() / uniques_train_per_variable["Age"].count()) * 100
percentage_group_two_train

45.430843436318604

<h3>Filtering only one height and one weight per patient</h3>

In [12]:
filtered_train_X = train_X[(train_X['Height'] != -1) & (train_X['Weight'] != -1) & (train_X['Height'].notna()) & (train_X['Weight'].notna())] 

In [13]:
filtered_uniques_train = filtered_train_X.groupby('RecordID').first().reset_index() 
filtered_uniques_train.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132540,0,0.0,,,,76.0,,21.0,,...,,93.0,122.0,37.5,,,50.0,13.3,76.0,7.45
1,132543,0,0.0,105.0,12.0,15.0,68.0,4.4,23.0,0.2,...,18.0,,,36.3,,,600.0,11.5,84.6,
2,132547,0,0.0,,,,64.0,,,,...,,,,,,,,,114.0,
3,132548,0,0.0,,,,68.0,,32.0,,...,14.5,,205.0,36.3,0.7,,120.0,6.2,87.0,
4,132551,0,0.0,47.0,46.0,82.0,78.0,1.9,81.0,0.3,...,,97.0,102.75,38.0,3.5,,120.0,16.1,48.4,7.4


<h3>Calculate the BMI</h3>

In [14]:
def classify_BMI(BMI):
    if BMI < 18.5:
        return "Baixo peso"
    elif BMI >= 18.6 and BMI <= 24.9:
        return "Peso normal"
    elif BMI >= 25 and BMI <= 29.9:
        return "Sobrepeso"
    elif BMI >= 30 and BMI <= 34.9:
        return "Obesidade grau 1"
    elif BMI >= 35 and BMI <= 39.9:
        return "Obesidade grau 2"
    elif BMI >= 40:
        return "Obesidade grau 3"

In [15]:
filtered_uniques_train['Height'] = filtered_uniques_train['Height'] / 100 # Converting Height from cm to meters

In [16]:
bmi_data_train = pd.DataFrame(columns=['RecordID', 'Height', 'Weight', 'BMI', 'Classificacao'])
bmi_data_train["RecordID"] = filtered_uniques_train["RecordID"]
bmi_data_train["Height"] = filtered_uniques_train["Height"]
bmi_data_train["Weight"] = filtered_uniques_train["Weight"]
bmi_data_train["BMI"] = filtered_uniques_train["Weight"] / (filtered_uniques_train["Height"] ** 2)
bmi_data_train["Classificacao"] = bmi_data_train["BMI"].apply(classify_BMI)
bmi_data_train.head()

Unnamed: 0,RecordID,Height,Weight,BMI,Classificacao
0,132540,1.753,76.0,24.73146,Peso normal
1,132543,1.803,84.6,26.024291,Sobrepeso
2,132547,1.803,114.0,35.068194,Obesidade grau 2
3,132548,1.626,87.0,32.906233,Obesidade grau 1
4,132551,1.626,48.4,18.306456,Baixo peso


<h3>Percentage of BMI classification groups</h3>

In [17]:
percentage_bmi_train= bmi_data_train["Classificacao"].value_counts(normalize=True) * 100

In [18]:
percentage_bmi_train

Classificacao
Sobrepeso           34.088595
Peso normal         29.735234
Obesidade grau 1    18.457230
Obesidade grau 2     7.688391
Obesidade grau 3     6.975560
Baixo peso           3.054990
Name: proportion, dtype: float64

<h2>Validation data</h2>

<h3>Loading validation dataset </h3>

In [19]:
validation_X = physionet2012_dataset['val_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>

In [20]:
distribution_gender_validation = validation_X['Gender'].value_counts(normalize=True) * 100
distribution_gender_validation # 0: female, or 1: male

Gender
 1.0    56.465068
 0.0    43.430657
-1.0     0.104275
Name: proportion, dtype: float64

<h3>Divided into subgroups by ICUType and showing the percentage</h3>

In [21]:
distribution_ICUType_validation = validation_X['ICUType'].value_counts(normalize=True) * 100
distribution_ICUType_validation

ICUType
3.0    34.932221
4.0    28.154327
2.0    21.637122
1.0    15.276330
Name: proportion, dtype: float64

<h3>Filtering only one age per patient</h3>

In [22]:
uniques_validation_per_variable = validation_X.groupby('RecordID').first().reset_index()
uniques_validation_per_variable.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132575,0,0.0,,,,78.0,,21.0,,...,,99.0,111.6,34.72,,,210.0,12.5,63.0,7.34
1,132595,0,0.0,,,,26.0,,9.0,,...,,,,,,,,2.7,-1.0,7.37
2,132599,0,0.0,124.0,14.0,20.0,53.0,2.0,33.0,1.3,...,,98.0,135.0,37.3,,0.02,350.0,14.2,73.5,7.48
3,132605,0,0.0,,,,90.0,,23.0,,...,,,,36.3,,,50.0,10.0,55.0,7.43
4,132612,0,0.0,,,,52.0,,21.0,,...,,,94.0,35.3,,,220.0,14.1,109.0,7.29


<h3>Showing total occurrences of age</h3>

In [23]:
uniques_validation_per_variable["Age"].count()

1918

<h3>Dividing age into subgroup of people aged 65+</h3>

In [24]:
more_than_or_equal_to_65_validation = uniques_validation_per_variable[uniques_validation_per_variable['Age'] >= 65]
percentage_group_one_validation = (more_than_or_equal_to_65_validation["Age"].count() / uniques_validation_per_variable["Age"].count())*100
percentage_group_one_validation

54.3274244004171

<h3>Dividing age into subgroup of people under 65 years of age</h3>

In [25]:
less_than_65 = uniques_validation_per_variable[uniques_validation_per_variable['Age'] < 65]
percentage_group_two_validation = (less_than_65["Age"].count() / uniques_validation_per_variable["Age"].count()) * 100
percentage_group_two_validation

45.6725755995829

<h3>Filtering only one height and one weight per patient</h3>

In [26]:
filtered_validation_X = validation_X[(validation_X['Height'] != -1) & (validation_X['Weight'] != -1) & (validation_X['Height'].notna()) & (validation_X['Weight'].notna())] 

In [27]:
filtered_uniques_validation = filtered_validation_X.groupby('RecordID').first().reset_index() 
filtered_uniques_validation.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132575,0,0.0,,,,78.0,,18.0,,...,,96.0,122.0,37.4,,,38.0,12.5,63.0,7.34
1,132599,0,0.0,,,,53.0,,,,...,,,,37.3,,,350.0,,73.5,
2,132653,0,0.0,,,,78.0,2.7,31.0,,...,,98.0,127.0,36.15,,,145.0,14.8,91.3,7.44
3,132669,0,0.0,,,,74.0,,10.0,,...,,98.0,87.0,37.0,,,60.0,8.6,81.8,7.41
4,132704,0,0.0,57.0,33.0,73.0,63.0,3.1,8.0,0.4,...,,,0.0,33.3,,,500.0,7.1,68.9,7.32


In [28]:
filtered_uniques_validation['Height'] = filtered_uniques_validation['Height'] / 100 # Converting Height from cm to meters

In [29]:
bmi_data_validation = pd.DataFrame(columns=['RecordID', 'Height', 'Weight', 'BMI', 'Classificacao'])
bmi_data_validation["RecordID"] = filtered_uniques_validation["RecordID"]
bmi_data_validation["Height"] = filtered_uniques_validation["Height"]
bmi_data_validation["Weight"] = filtered_uniques_validation["Weight"]
bmi_data_validation["BMI"] = filtered_uniques_validation["Weight"] / (filtered_uniques_validation["Height"] ** 2)
bmi_data_validation["Classificacao"] = bmi_data_validation["BMI"].apply(classify_BMI)
bmi_data_validation.head()

Unnamed: 0,RecordID,Height,Weight,BMI,Classificacao
0,132575,1.676,63.0,22.428102,Peso normal
1,132599,1.778,73.5,23.250047,Peso normal
2,132653,1.778,91.3,28.88067,Sobrepeso
3,132669,1.803,81.8,25.162967,Sobrepeso
4,132704,1.829,68.9,20.596423,Peso normal


<h3>Percentage of BMI classification groups</h3>

In [30]:
percentage_bmi_validation = bmi_data_validation["Classificacao"].value_counts(normalize=True) * 100

<h2>Test data</h2>

In [31]:
test_X = physionet2012_dataset['test_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>


In [32]:
distribution_gender_test = test_X['Gender'].value_counts(normalize=True) * 100
distribution_gender_test # 0: female, or 1: male

Gender
 1.0    55.606503
 0.0    44.351813
-1.0     0.041684
Name: proportion, dtype: float64

<h3>Divided into subgroups by ICUType and showing the percentage</h3>


In [33]:
distribution_ICUType_test= test_X['ICUType'].value_counts(normalize=True) * 100
distribution_ICUType_test

ICUType
3.0    35.598166
4.0    28.470196
2.0    20.425177
1.0    15.506461
Name: proportion, dtype: float64

<h3>Filtering only one age per patient</h3>


In [34]:
uniques_test_per_variable = test_X.groupby('RecordID').first().reset_index()
uniques_test_per_variable.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132539,0,0.0,,,,54.0,,13.0,,...,19.0,,,35.35,,,480.0,11.2,-1.0,
1,132602,0,0.0,,,,80.0,,30.0,,...,,,,37.3,,,150.0,12.6,70.0,7.49
2,132622,0,0.0,149.0,23.0,33.0,71.0,2.7,71.0,0.9,...,28.0,,,38.3,8.8,,250.0,7.2,79.0,
3,132623,0,0.0,,,,24.0,,11.0,,...,,99.0,123.0,35.1,,,1003.5,11.4,78.0,7.45
4,132635,0,0.0,,,,66.0,,18.0,,...,,95.0,131.5,37.6,,,385.0,9.1,70.1,7.48


<h3>Showing total occurrences of age</h3>

In [35]:
uniques_test_per_variable["Age"].count()

2399

<h3>Dividing age into subgroup of people aged 65+</h3>


In [36]:
more_than_or_equal_to_65_test = uniques_test_per_variable[uniques_test_per_variable['Age'] >= 65]
percentage_group_one_test = (more_than_or_equal_to_65_test["Age"].count() / uniques_test_per_variable["Age"].count())*100
percentage_group_one_test

55.31471446436015

<h3>Dividing age into subgroup of people under 65 years of age</h3>


In [37]:
less_than_65_test = uniques_test_per_variable[uniques_test_per_variable['Age'] < 65]
percentage_group_two_test = (less_than_65_test["Age"].count() / uniques_test_per_variable["Age"].count()) * 100
percentage_group_two_test

44.68528553563985

<h3>Filtering only one height and one weight per patient</h3>


In [38]:
filtered_test_X = test_X[(test_X['Height'] != -1) & (test_X['Weight'] != -1) & (test_X['Height'].notna()) & (test_X['Weight'].notna())] 

In [39]:
filtered_uniques_test = filtered_test_X.groupby('RecordID').first().reset_index() 
filtered_uniques_test.head()

Unnamed: 0,RecordID,level_1,Time,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,132602,0,0.0,,,,80.0,,,,...,,,,37.3,,,150.0,,70.0,
1,132622,0,0.0,,,,71.0,,64.0,,...,28.0,,,37.4,19.0,,80.0,7.2,79.0,
2,132623,0,0.0,,,,24.0,,,,...,,,,,,,,,78.0,7.45
3,132635,0,0.0,,,,66.0,,23.0,,...,,97.0,98.5,37.55,,,20.0,7.7,70.1,7.48
4,132637,0,0.0,,,,78.0,,13.0,,...,,98.0,99.0,37.0,,,90.0,14.2,56.0,7.39


<h3>Calculate the BMI</h3>


In [40]:
filtered_uniques_test['Height'] = filtered_uniques_test['Height'] / 100 # Converting Height from cm to meters

In [41]:
bmi_data_test = pd.DataFrame(columns=['RecordID', 'Height', 'Weight', 'BMI', 'Classificacao'])
bmi_data_test["RecordID"] = filtered_uniques_test["RecordID"]
bmi_data_test["Height"] = filtered_uniques_test["Height"]
bmi_data_test["Weight"] = filtered_uniques_test["Weight"]
bmi_data_test["BMI"] = filtered_uniques_test["Weight"] / (filtered_uniques_test["Height"] ** 2)
bmi_data_test["Classificacao"] = bmi_data_test["BMI"].apply(classify_BMI)
bmi_data_test.head()

Unnamed: 0,RecordID,Height,Weight,BMI,Classificacao
0,132602,1.803,70.0,21.533101,Peso normal
1,132622,1.6,79.0,30.859375,Obesidade grau 1
2,132623,1.829,78.0,23.316706,Peso normal
3,132635,1.676,70.1,24.955713,
4,132637,1.702,56.0,19.33165,Peso normal


<h3>Percentage of BMI classification groups</h3>

In [42]:
percentage_bmi_test = bmi_data_test["Classificacao"].value_counts(normalize=True) * 100


In [43]:
subgroups = [("Female"), ("Male"), ("ICUType 1"), ("ICUType 2"), ("ICUType 3"), ("ICUType 4"),
    ("Age 65+"), ("Age 65-"), ("Baixo Peso"), ("Peso normal"), ("Sobrepeso"),
    ("Obesidade grau 1"), ("Obesidade grau 2"), ("Obesidade grau 3")
]

df_subgroups = pd.DataFrame(subgroups, columns=["Subgroups"])

In [44]:
train_subgroups = [
    (distribution_gender_training[0]), (distribution_gender_training[1]), (distribution_ICUType_training[1]), (distribution_ICUType_training[2]),
    (distribution_ICUType_training[3]), (distribution_ICUType_training[4]), (percentage_group_one_train), (percentage_group_two_train),
    (percentage_bmi_train["Baixo peso"]), (percentage_bmi_train["Peso normal"]), (percentage_bmi_train["Sobrepeso"]), (percentage_bmi_train["Obesidade grau 1"]),
    (percentage_bmi_train["Obesidade grau 2"]), (percentage_bmi_train["Obesidade grau 3"])
]

df_train_subgroups = pd.DataFrame(train_subgroups, columns=["train"])

In [45]:
validation_subgroups = [
    (distribution_gender_validation[0]), (distribution_gender_validation[1]), (distribution_ICUType_validation[1]), (distribution_ICUType_validation[2]),
    (distribution_ICUType_validation[3]), (distribution_ICUType_validation[4]), (percentage_group_one_validation), (percentage_group_two_validation),
    (percentage_bmi_validation["Baixo peso"]), (percentage_bmi_validation["Peso normal"]), (percentage_bmi_validation["Sobrepeso"]), (percentage_bmi_validation["Obesidade grau 1"]),
    (percentage_bmi_validation["Obesidade grau 2"]), (percentage_bmi_validation["Obesidade grau 3"])
]

df_validation_subgroups = pd.DataFrame(validation_subgroups, columns=["train"])

In [46]:
test_subgroups = [
    (distribution_gender_test[0]), (distribution_gender_test[1]), (distribution_ICUType_test[1]), (distribution_ICUType_test[2]),
    (distribution_ICUType_test[3]), (distribution_ICUType_test[4]), (percentage_group_one_test), (percentage_group_two_test),
    (percentage_bmi_test["Baixo peso"]), (percentage_bmi_test["Peso normal"]), (percentage_bmi_test["Sobrepeso"]), (percentage_bmi_test["Obesidade grau 1"]),
    (percentage_bmi_test["Obesidade grau 2"]), (percentage_bmi_test["Obesidade grau 3"])
]

df_test_subgroups = pd.DataFrame(test_subgroups, columns=["train"])

In [47]:
table1 = pd.DataFrame(columns=['Subgroups', 'train', 'validation', 'test'])
table1["Subgroups"] = df_subgroups
table1["train"] = df_train_subgroups
table1["validation"] = df_validation_subgroups
table1["test"] = df_test_subgroups



In [48]:
table1

Unnamed: 0,Subgroups,train,validation,test
0,Female,43.827402,43.430657,44.351813
1,Male,56.055273,56.465068,55.606503
2,ICUType 1,14.339721,15.27633,15.506461
3,ICUType 2,21.157607,21.637122,20.425177
4,ICUType 3,36.018772,34.932221,35.598166
5,ICUType 4,28.4839,28.154327,28.470196
6,Age 65+,54.569157,54.327424,55.314714
7,Age 65-,45.430843,45.672576,44.685286
8,Baixo Peso,3.05499,2.683897,2.890173
9,Peso normal,29.735234,28.33002,30.470685


In [52]:
with open ('table_tex', 'w') as f:
    f.write(table1.to_latex(index=False))    
print(table1.to_latex)    

<bound method NDFrame.to_latex of            Subgroups      train  validation       test
0             Female  43.827402   43.430657  44.351813
1               Male  56.055273   56.465068  55.606503
2          ICUType 1  14.339721   15.276330  15.506461
3          ICUType 2  21.157607   21.637122  20.425177
4          ICUType 3  36.018772   34.932221  35.598166
5          ICUType 4  28.483900   28.154327  28.470196
6            Age 65+  54.569157   54.327424  55.314714
7            Age 65-  45.430843   45.672576  44.685286
8         Baixo Peso   3.054990    2.683897   2.890173
9        Peso normal  29.735234   28.330020  30.470685
10         Sobrepeso  34.088595   34.890656  35.094963
11  Obesidade grau 1  18.457230   18.687873  17.093311
12  Obesidade grau 2   7.688391    7.952286   7.762180
13  Obesidade grau 3   6.975560    7.455268   6.688687>
