In [3]:
import os
import sys
# Put the parent of the current working directory on the import path
# (presumably so the project's local packages can be imported — confirm).
module_path = os.path.abspath(os.pardir)
# Only append once so re-running the cell does not grow sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

<h1>Loading the dataset</h1>

In [4]:
# Load PhysioNet 2012 through the project's preprocessing helper. Later cells
# read the 'train_X', 'val_X' and 'test_X' entries of the returned object.
# NOTE(review): `rate=0.1` is presumably the artificial missing rate — confirm
# against the helper's signature.
from pypots.benchpots.datasets import preprocess_physionet2012
physionet2012_dataset = preprocess_physionet2012(subset="all", rate=0.1)

2024-10-30 10:40:10 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-10-30 10:40:10 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-10-30 10:40:10 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-10-30 10:40:10 [INFO]: Loaded successfully!


<h1>Training data</h1>

<h3>Loading the training set</h3>

In [5]:
# Training split. Later cells select columns by name and group by 'RecordID',
# so it is used as a pandas DataFrame.
train_X = physionet2012_dataset['train_X']

<h3>Percentage breakdown by gender</h3>

In [6]:
# Share of training rows per Gender code, in percent.
# Codes: 0 = female, 1 = male. The recorded output also contains -1
# (presumably "unknown / not recorded" — confirm against the dataset docs).
distribution_gender_training = train_X['Gender'].value_counts(normalize=True).mul(100)
distribution_gender_training

Gender
 1.0    55.846695
 0.0    44.035980
-1.0     0.117325
Name: proportion, dtype: float64

<h3>Percentage breakdown by ICU type</h3>

In [7]:
# Share of training rows per ICUType code, in percent.
distribution_ICUType_training = train_X['ICUType'].value_counts(normalize=True).mul(100)
distribution_ICUType_training

ICUType
3.0    35.614653
4.0    28.640334
2.0    21.287968
1.0    14.457046
Name: proportion, dtype: float64

<h3>Keeping a single age value per patient</h3>

In [8]:
# Collapse the training rows to one row per patient: take the first Age value
# found for each RecordID and turn RecordID back into a regular column.
age_uniques_train = (
    train_X
    .groupby('RecordID')['Age']
    .first()
    .reset_index()
)
age_uniques_train.head()

Unnamed: 0,RecordID,Age
0,132540,76.0
1,132541,44.0
2,132543,68.0
3,132548,68.0
4,132554,64.0


<h3>Counting patients with an age value</h3>

In [9]:
# Non-null count per column of the one-row-per-patient table, i.e. the number
# of training patients. With the recorded outputs (7671 train + 1918 validation
# + 2399 test = 11988 patients) the training split is ~64% of the total, not 60%.
age_uniques_train.count()

RecordID    7671
Age         7671
dtype: int64

In [None]:
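# Superseded by the per-RecordID groupby that builds `age_uniques_train`; kept for reference only.
# This earlier attempt relies on every patient occupying exactly 48 consecutive rows.
#first_values = train_X['Age'][::48] #first value of "Age" at every 48th index
#total_quantity_of_age_occurrences = first_values.count()
#total_quantity_of_age_occurrences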

7671

In [None]:
#first_values.head()

0      54.0
96     44.0
144    68.0
192    88.0
240    64.0
Name: Age, dtype: float64

<h3>Age subgroup: patients aged 65 and over</h3>

In [10]:
# Patients aged 65 or older, selected from the one-row-per-patient age table
# so that each patient is counted once.
more_than_or_equal_to_65_training = age_uniques_train.loc[age_uniques_train['Age'].ge(65)]

# Percentage of training patients in this subgroup. count() works per column,
# so the result has one (identical) entry for RecordID and one for Age.
percentage_group_one = (
    more_than_or_equal_to_65_training.count()
    .div(age_uniques_train.count())
    .mul(100)
)
percentage_group_one

RecordID    54.712554
Age         54.712554
dtype: float64

<h3>Age subgroup: patients under 65</h3>

In [11]:
# Patients younger than 65, selected from the one-row-per-patient age table
# so that each patient is counted once.
less_than_65 = age_uniques_train.loc[age_uniques_train['Age'].lt(65)]

# Percentage of training patients in this subgroup. count() works per column,
# so the result has one (identical) entry for RecordID and one for Age.
percentage_group_two = (
    less_than_65.count()
    .div(age_uniques_train.count())
    .mul(100)
)
percentage_group_two

RecordID    45.287446
Age         45.287446
dtype: float64

<h3>Keeping a single height and weight value per patient</h3>

In [24]:
# Keep only rows where Height and Weight are both usable in the same row:
# neither equal to the -1 placeholder nor NaN. The notna() checks are needed
# because NaN compares as "not equal to -1".
filtered_train_X = train_X[
    train_X['Height'].ne(-1)
    & train_X['Weight'].ne(-1)
    & train_X['Height'].notna()
    & train_X['Weight'].notna()
]

In [25]:
# One Height per patient: the first value among that RecordID's rows that
# passed the Height/Weight validity filter.
height_uniques_train = (
    filtered_train_X
    .groupby('RecordID')['Height']
    .first()
    .reset_index()
)

height_uniques_train.head()

Unnamed: 0,RecordID,Height
0,132540,175.3
1,132543,180.3
2,132548,162.6
3,132567,157.5
4,132568,157.5


In [26]:
# One Weight per patient: the first value among that RecordID's rows that
# passed the Height/Weight validity filter.
weight_uniques_train = (
    filtered_train_X
    .groupby('RecordID')['Weight']
    .first()
    .reset_index()
)

weight_uniques_train.head()

Unnamed: 0,RecordID,Weight
0,132540,76.0
1,132543,84.6
2,132548,87.0
3,132567,56.0
4,132568,84.5


<h1>Validation data</h1>

In [11]:
# Validation split, read from the same loaded dataset object as the training split.
validation_X = physionet2012_dataset['val_X']

In [12]:
# One Age per validation patient (first value per RecordID); the per-column
# count then gives the number of patients in the validation split.
age_uniques_validation = (
    validation_X
    .groupby('RecordID')['Age']
    .first()
    .reset_index()
)
age_uniques_validation.count()

RecordID    1918
Age         1918
dtype: int64

<h1>Test data</h1>

In [14]:
# Test split, read from the same loaded dataset object as the training split.
test_X = physionet2012_dataset['test_X']

In [15]:
# One Age per test patient (first value per RecordID); the per-column count
# then gives the number of patients in the test split.
age_uniques_test = (
    test_X
    .groupby('RecordID')['Age']
    .first()
    .reset_index()
)
age_uniques_test.count()

RecordID    2399
Age         2399
dtype: int64