In [3]:
import os
import sys
import pandas as pd
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

<h1>Loading database</h1>

In [4]:
from pypots.benchpots.datasets import preprocess_physionet2012
physionet2012_dataset = preprocess_physionet2012(subset="all", rate=0.1)

2024-10-30 20:56:29 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-10-30 20:56:29 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-10-30 20:56:29 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-10-30 20:56:29 [INFO]: Loaded successfully!


<h2>Training data</h2>

<h3>Loading training dataset</h3>

In [5]:
train_X = physionet2012_dataset['train_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>

In [6]:
distribution_gender_training = train_X['Gender'].value_counts(normalize=True) * 100
distribution_gender_training # 0: female, or 1: male

Gender
 1.0    55.520793
 0.0    44.400991
-1.0     0.078217
Name: proportion, dtype: float64

<h3>Divided into subgroups by ICUType and showing the percentage</h3>

In [7]:
distribution_ICUType_training = train_X['ICUType'].value_counts(normalize=True) * 100
distribution_ICUType_training

ICUType
3.0    36.149133
4.0    28.066745
2.0    20.714379
1.0    15.069743
Name: proportion, dtype: float64

<h3>Filtering only one age per patient</h3>

In [8]:
age_uniques_train = train_X.groupby('RecordID')['Age'].first().reset_index()
age_uniques_train.head()

Unnamed: 0,RecordID,Age
0,132539,54.0
1,132540,76.0
2,132541,44.0
3,132543,68.0
4,132545,88.0


<h3>Showing total occurrences of age</h3>

In [9]:
# 60% de 11988 (training set)
age_uniques_train.count()

RecordID    7671
Age         7671
dtype: int64

In [10]:
#first_values = train_X['Age'][::48] #first value of "Age" at every 48th index
#total_quantity_of_age_occurrences = first_values.count()
#total_quantity_of_age_occurrences

In [11]:
#first_values.head()

<h3>Dividing age into subgroup of people aged 65+</h3>

In [12]:
more_than_or_equal_to_65_training = age_uniques_train[age_uniques_train['Age'] >= 65]
# more_than_or_equal_to_65 = train_X[train_X['Age'] >= 65] 
#distribution_age_group_one = more_than_or_equal_to_65['Age'].count() 
#distribution_age_group_one
# age_uniques_more_than_or_equal_to_65 = more_than_or_equal_to_65.groupby('RecordID')['Age'].first().reset_index()
percentage_group_one_train = (more_than_or_equal_to_65_training.count() / age_uniques_train.count())*100
percentage_group_one_train

RecordID    54.686482
Age         54.686482
dtype: float64

<h3>Dividing age into subgroup of people under 65 years of age</h3>

In [13]:
less_than_65 = age_uniques_train[age_uniques_train['Age'] < 65]
# less_than_65 = train_X[train_X['Age'] < 65] 
#distribution_age_group_two = less_than_65['Age'].value_counts(normalize=True) * 100
#distribution_age_group_two

# age_uniques_less_than_65 = less_than_65.groupby('RecordID')['Age'].first().reset_index()
percentage_group_two_train = (less_than_65.count() / age_uniques_train.count()) * 100
percentage_group_two_train

RecordID    45.313518
Age         45.313518
dtype: float64

<h3>Filtering only one height and one weight per patient</h3>

In [14]:
filtered_train_X = train_X[(train_X['Height'] != -1) & (train_X['Weight'] != -1) & (train_X['Height'].notna()) & (train_X['Weight'].notna())] 

In [15]:
height_uniques_train = filtered_train_X.groupby('RecordID')['Height'].first().reset_index() 
height_uniques_train.head()

Unnamed: 0,RecordID,Height
0,132540,175.3
1,132543,180.3
2,132547,180.3
3,132548,162.6
4,132551,162.6


In [16]:
weight_uniques_train = filtered_train_X.groupby('RecordID')['Weight'].first().reset_index() 
weight_uniques_train.head()

Unnamed: 0,RecordID,Weight
0,132540,76.0
1,132543,84.6
2,132547,114.0
3,132548,87.0
4,132551,48.4


<h3>Calculate the BMI</h3>

In [17]:
def classify_BMI(BMI):
    if BMI < 18.5:
        return "Baixo peso"
    elif BMI >= 18.6 and BMI <= 24.9:
        return "Peso normal"
    elif BMI >= 25 and BMI <= 29.9:
        return "Sobrepeso"
    elif BMI >= 30 and BMI <= 34.9:
        return "Obesidade grau 1"
    elif BMI >= 35 and BMI <= 39.9:
        return "Obesidade grau 2"
    elif BMI >= 40:
        return "Obesidade grau 3"

In [18]:
height_uniques_train['Height'] = height_uniques_train['Height'] / 100 # Converting Height from cm to meters

In [19]:
bmi_data_train = pd.DataFrame(columns=['RecordID', 'Height', 'Weight', 'BMI', 'Classificacao'])
bmi_data_train["RecordID"] = height_uniques_train["RecordID"]
bmi_data_train["Height"] = height_uniques_train["Height"]
bmi_data_train["Weight"] = weight_uniques_train["Weight"]
bmi_data_train["BMI"] = weight_uniques_train["Weight"] / (height_uniques_train["Height"] ** 2)
bmi_data_train["Classificacao"] = bmi_data_train["BMI"].apply(classify_BMI)
bmi_data_train.head()
# Creating a new DataFrame to store relevant data
#bmi_data = filtered_train_X[['RecordID', 'Height', 'Weight', 'BMI']].reset_index(drop=True)

#bmi_data = pd.concat([height_uniques_train['RecordID'], height_uniques_train['Height'], weight_uniques_train['Weight']], sort=True)
#bmi_data = pd.merge(height_uniques_train[['RecordID', 'Height']], weight_uniques_train[['RecordID', 'Weight']], on='RecordID')
#bmi_data.head()
# bmi_data['BMI'] = weight_uniques_train['Weight'] / (height_uniques_train['Height'] ** 2)


Unnamed: 0,RecordID,Height,Weight,BMI,Classificacao
0,132540,1.753,76.0,24.73146,Peso normal
1,132543,1.803,84.6,26.024291,Sobrepeso
2,132547,1.803,114.0,35.068194,Obesidade grau 2
3,132548,1.626,87.0,32.906233,Obesidade grau 1
4,132551,1.626,48.4,18.306456,Baixo peso


<h3>Percentage of BMI classification groups</h3>

In [124]:
percentage_bmi_train= bmi_data_train["Classificacao"].value_counts(normalize=True) * 100

In [130]:
percentage_bmi_train

Classificacao
Sobrepeso           33.682596
Peso normal         30.053667
Obesidade grau 1    18.374649
Obesidade grau 2     7.794531
Obesidade grau 3     6.900077
Baixo peso           3.194480
Name: proportion, dtype: float64

<h2>Validation data</h2>

<h3>Loading validation dataset </h3>

In [21]:
validation_X = physionet2012_dataset['val_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>

In [22]:
distribution_gender_validation = validation_X['Gender'].value_counts(normalize=True) * 100
distribution_gender_validation # 0: female, or 1: male

Gender
 1.0    56.882169
 0.0    43.013556
-1.0     0.104275
Name: proportion, dtype: float64

<h3>Divided into subgroups by ICUType and showing the percentage</h3>

In [23]:
distribution_ICUType_validation = validation_X['ICUType'].value_counts(normalize=True) * 100
distribution_ICUType_validation

ICUType
3.0    34.567258
4.0    29.822732
2.0    21.584984
1.0    14.025026
Name: proportion, dtype: float64

<h3>Filtering only one age per patient</h3>

In [24]:
age_uniques_validation = validation_X.groupby('RecordID')['Age'].first().reset_index()
age_uniques_validation.head()

Unnamed: 0,RecordID,Age
0,132554,64.0
1,132556,64.0
2,132575,78.0
3,132577,65.0
4,132622,71.0


<h3>Showing total occurrences of age</h3>

In [25]:
age_uniques_validation.count()

RecordID    1918
Age         1918
dtype: int64

<h3>Dividing age into subgroup of people aged 65+</h3>

In [26]:
more_than_or_equal_to_65_validation = age_uniques_validation[age_uniques_validation['Age'] >= 65]
percentage_group_one_validation = (more_than_or_equal_to_65_validation.count() / age_uniques_validation.count())*100
percentage_group_one_validation

RecordID    54.744526
Age         54.744526
dtype: float64

<h3>Dividing age into subgroup of people under 65 years of age</h3>

In [148]:
less_than_65 = age_uniques_validation[age_uniques_validation['Age'] < 65]
# less_than_65 = train_X[train_X['Age'] < 65] 
#distribution_age_group_two = less_than_65['Age'].value_counts(normalize=True) * 100
#distribution_age_group_two

# age_uniques_less_than_65 = less_than_65.groupby('RecordID')['Age'].first().reset_index()
percentage_group_two_validation = (less_than_65.count() / age_uniques_validation.count()) * 100
percentage_group_two_validation

RecordID    45.255474
Age         45.255474
dtype: float64

<h3>Filtering only one height and one weight per patient</h3>

In [27]:
filtered_validation_X = validation_X[(validation_X['Height'] != -1) & (validation_X['Weight'] != -1) & (validation_X['Height'].notna()) & (validation_X['Weight'].notna())] 

In [28]:
height_uniques_validation = filtered_validation_X.groupby('RecordID')['Height'].first().reset_index() 
height_uniques_validation.head()

Unnamed: 0,RecordID,Height
0,132575,167.6
1,132622,160.0
2,132662,172.7
3,132685,172.7
4,132781,177.8


In [29]:
weight_uniques_validation = filtered_validation_X.groupby('RecordID')['Weight'].first().reset_index() 
weight_uniques_validation.head()

Unnamed: 0,RecordID,Weight
0,132575,63.0
1,132622,79.0
2,132662,77.0
3,132685,81.8
4,132781,96.0


In [30]:
height_uniques_validation['Height'] = height_uniques_validation['Height'] / 100 # Converting Height from cm to meters

In [31]:
bmi_data_validation = pd.DataFrame(columns=['RecordID', 'Height', 'Weight', 'BMI', 'Classificacao'])
bmi_data_validation["RecordID"] = height_uniques_validation["RecordID"]
bmi_data_validation["Height"] = height_uniques_validation["Height"]
bmi_data_validation["Weight"] = weight_uniques_validation["Weight"]
bmi_data_validation["BMI"] = weight_uniques_validation["Weight"] / (height_uniques_validation["Height"] ** 2)
bmi_data_validation["Classificacao"] = bmi_data_validation["BMI"].apply(classify_BMI)
bmi_data_validation.head()

Unnamed: 0,RecordID,Height,Weight,BMI,Classificacao
0,132575,1.676,63.0,22.428102,Peso normal
1,132622,1.6,79.0,30.859375,Obesidade grau 1
2,132662,1.727,77.0,25.817016,Sobrepeso
3,132685,1.727,81.8,27.426389,Sobrepeso
4,132781,1.778,96.0,30.367408,Obesidade grau 1


<h3>Percentage of BMI classification groups</h3>

In [None]:
percentage_bmi_validation = bmi_data_validation["Classificacao"].value_counts(normalize=True) * 100

<h2>Test data</h2>

In [33]:
test_X = physionet2012_dataset['test_X']

<h3>Divided into subgroups by gender and showing the percentage</h3>


In [34]:
distribution_gender_test = test_X['Gender'].value_counts(normalize=True) * 100
distribution_gender_test # 0: female, or 1: male

Gender
 1.0    56.982076
 0.0    42.851188
-1.0     0.166736
Name: proportion, dtype: float64

<h3>Divided into subgroups by ICUType and showing the percentage</h3>


In [35]:
distribution_ICUType_test= test_X['ICUType'].value_counts(normalize=True) * 100
distribution_ICUType_test

ICUType
3.0    35.473114
4.0    28.470196
2.0    21.884118
1.0    14.172572
Name: proportion, dtype: float64

<h3>Filtering only one age per patient</h3>


In [36]:
age_uniques_test = test_X.groupby('RecordID')['Age'].first().reset_index()
age_uniques_test.head()

Unnamed: 0,RecordID,Age
0,132570,84.0
1,132585,40.0
2,132599,53.0
3,132610,72.0
4,132639,73.0


<h3>Showing total occurrences of age</h3>

In [37]:
age_uniques_test.count()

RecordID    2399
Age         2399
dtype: int64

<h3>Dividing age into subgroup of people aged 65+</h3>


In [38]:
more_than_or_equal_to_65_test = age_uniques_test[age_uniques_test['Age'] >= 65]
# more_than_or_equal_to_65 = train_X[train_X['Age'] >= 65] 
#distribution_age_group_one = more_than_or_equal_to_65['Age'].count() 
#distribution_age_group_one
# age_uniques_more_than_or_equal_to_65 = more_than_or_equal_to_65.groupby('RecordID')['Age'].first().reset_index()
percentage_group_one_test = (more_than_or_equal_to_65_test.count() / age_uniques_test.count())*100
percentage_group_one_test

RecordID    54.606086
Age         54.606086
dtype: float64

<h3>Dividing age into subgroup of people under 65 years of age</h3>


In [39]:
less_than_65_test = age_uniques_test[age_uniques_test['Age'] < 65]
# less_than_65 = train_X[train_X['Age'] < 65] 
#distribution_age_group_two = less_than_65['Age'].value_counts(normalize=True) * 100
#distribution_age_group_two

# age_uniques_less_than_65 = less_than_65.groupby('RecordID')['Age'].first().reset_index()
percentage_group_two_test = (less_than_65_test.count() / age_uniques_test.count()) * 100
percentage_group_two_test

RecordID    45.393914
Age         45.393914
dtype: float64

<h3>Filtering only one height and one weight per patient</h3>


In [40]:
filtered_test_X = test_X[(test_X['Height'] != -1) & (test_X['Weight'] != -1) & (test_X['Height'].notna()) & (test_X['Weight'].notna())] 

In [41]:
height_uniques_test = filtered_test_X.groupby('RecordID')['Height'].first().reset_index() 
height_uniques_test.head()

Unnamed: 0,RecordID,Height
0,132570,170.2
1,132585,165.1
2,132599,177.8
3,132610,172.9
4,132639,180.3


In [42]:
weight_uniques_test = filtered_test_X.groupby('RecordID')['Weight'].first().reset_index() 
weight_uniques_test.head()

Unnamed: 0,RecordID,Weight
0,132570,102.6
1,132585,84.7
2,132599,73.5
3,132610,72.26
4,132639,96.3


<h3>Calculate the BMI</h3>


In [43]:
height_uniques_test['Height'] = height_uniques_test['Height'] / 100 # Converting Height from cm to meters

In [44]:
bmi_data_test = pd.DataFrame(columns=['RecordID', 'Height', 'Weight', 'BMI', 'Classificacao'])
bmi_data_test["RecordID"] = height_uniques_test["RecordID"]
bmi_data_test["Height"] = height_uniques_test["Height"]
bmi_data_test["Weight"] = weight_uniques_test["Weight"]
bmi_data_test["BMI"] = weight_uniques_test["Weight"] / (height_uniques_test["Height"] ** 2)
bmi_data_test["Classificacao"] = bmi_data_test["BMI"].apply(classify_BMI)
bmi_data_test.head()
# Creating a new DataFrame to store relevant data
#bmi_data = filtered_train_X[['RecordID', 'Height', 'Weight', 'BMI']].reset_index(drop=True)

#bmi_data = pd.concat([height_uniques_train['RecordID'], height_uniques_train['Height'], weight_uniques_train['Weight']], sort=True)
#bmi_data = pd.merge(height_uniques_train[['RecordID', 'Height']], weight_uniques_train[['RecordID', 'Weight']], on='RecordID')
#bmi_data.head()
# bmi_data['BMI'] = weight_uniques_train['Weight'] / (height_uniques_train['Height'] ** 2)


Unnamed: 0,RecordID,Height,Weight,BMI,Classificacao
0,132570,1.702,102.6,35.418344,Obesidade grau 2
1,132585,1.651,84.7,31.073435,Obesidade grau 1
2,132599,1.778,73.5,23.250047,Peso normal
3,132610,1.729,72.26,24.171743,Peso normal
4,132639,1.803,96.3,29.623395,Sobrepeso


<h3>Percentage of BMI classification groups</h3>

In [141]:
percentage_bmi_test = bmi_data_test["Classificacao"].value_counts(normalize=True) * 100


In [92]:
subgroups = [("Female"), ("Male"), ("ICUType 1"), ("ICUType 2"), ("ICUType 3"), ("ICUType 4"),
    ("Age 65+"), ("Age 65-"), ("Baixo Peso"), ("Peso normal"), ("Sobrepeso"),
    ("Obesidade grau 1"), ("Obesidade grau 2"), ("Obesidade grau 3")
]

df_subgroups = pd.DataFrame(subgroups, columns=["Subgroups"])

In [138]:
train_subgroups = [
    (distribution_gender_training[0]), (distribution_gender_training[1]), (distribution_ICUType_training[1]), (distribution_ICUType_training[2]),
    (distribution_ICUType_training[3]), (distribution_ICUType_training[4]), (percentage_group_one_train["Age"]), (percentage_group_two_train["Age"]),
    (percentage_bmi_train["Baixo peso"]), (percentage_bmi_train["Peso normal"]), (percentage_bmi_train["Sobrepeso"]), (percentage_bmi_train["Obesidade grau 1"]),
    (percentage_bmi_train["Obesidade grau 2"]), (percentage_bmi_train["Obesidade grau 3"])
]

df_train_subgroups = pd.DataFrame(train_subgroups, columns=["train"])

In [151]:
validation_subgroups = [
    (distribution_gender_validation[0]), (distribution_gender_validation[1]), (distribution_ICUType_validation[1]), (distribution_ICUType_validation[2]),
    (distribution_ICUType_validation[3]), (distribution_ICUType_validation[4]), (percentage_group_one_validation["Age"]), (percentage_group_two_validation["Age"]),
    (percentage_bmi_validation["Baixo peso"]), (percentage_bmi_validation["Peso normal"]), (percentage_bmi_validation["Sobrepeso"]), (percentage_bmi_validation["Obesidade grau 1"]),
    (percentage_bmi_validation["Obesidade grau 2"]), (percentage_bmi_validation["Obesidade grau 3"])
]

df_validation_subgroups = pd.DataFrame(validation_subgroups, columns=["train"])

In [152]:
test_subgroups = [
    (distribution_gender_test[0]), (distribution_gender_test[1]), (distribution_ICUType_test[1]), (distribution_ICUType_test[2]),
    (distribution_ICUType_test[3]), (distribution_ICUType_test[4]), (percentage_group_one_test["Age"]), (percentage_group_two_test["Age"]),
    (percentage_bmi_test["Baixo peso"]), (percentage_bmi_test["Peso normal"]), (percentage_bmi_test["Sobrepeso"]), (percentage_bmi_test["Obesidade grau 1"]),
    (percentage_bmi_test["Obesidade grau 2"]), (percentage_bmi_test["Obesidade grau 3"])
]

df_test_subgroups = pd.DataFrame(test_subgroups, columns=["train"])

In [154]:
table1 = pd.DataFrame(columns=['Subgroups', 'train', 'validation', 'test'])
table1["Subgroups"] = df_subgroups
table1["train"] = df_train_subgroups
table1["validation"] = df_validation_subgroups
table1["test"] = df_test_subgroups



In [155]:
table1

Unnamed: 0,Subgroups,train,validation,test
0,Female,44.400991,43.013556,42.851188
1,Male,55.520793,56.882169,56.982076
2,ICUType 1,15.069743,14.025026,14.172572
3,ICUType 2,20.714379,21.584984,21.884118
4,ICUType 3,36.149133,34.567258,35.473114
5,ICUType 4,28.066745,29.822732,28.470196
6,Age 65+,54.686482,54.744526,54.606086
7,Age 65-,45.313518,45.255474,45.393914
8,Baixo Peso,3.19448,2.507523,2.591093
9,Peso normal,30.053667,29.889669,28.178138
