# 📑 Tutorials for PyPOTS Classification Models

## 📀 Preparing the **PhysioNet-2012** dataset for this tutorial

In [1]:
from pypots.data.generating import gene_physionet2012
from pypots.utils.random import set_random_seed
from global_config import RANDOM_SEED

# Fix the random seed (the log below reports it is set for numpy and pytorch) so the run is repeatable.
set_random_seed(RANDOM_SEED)

# Load the PhysioNet-2012 dataset with artificially_missing_rate=0.1, i.e. additional
# artificially-missing values for evaluation are generated here -- they are NOT disabled.
# The keys printed below accordingly include 'val_X_ori', 'test_X_ori' and 'test_X_indicating_mask'.
# NOTE(review): presumably passing artificially_missing_rate=0 turns this off -- confirm against the PyPOTS docs.
physionet2012_dataset = gene_physionet2012(artificially_missing_rate=0.1)

# Take a look at the generated PhysioNet-2012 dataset: you'll find that everything has been prepared for you,
# e.g. data splitting, normalization, additional artificially-missing values for evaluation, etc.
print(physionet2012_dataset.keys())

2024-03-17 22:49:34 [INFO]: Have set the random seed as 16 for numpy and pytorch.
2024-03-17 22:49:34 [INFO]: Loading the dataset physionet_2012 with TSDB (https://github.com/WenjieDu/Time_Series_Data_Beans)...
2024-03-17 22:49:34 [INFO]: Starting preprocessing physionet_2012...
2024-03-17 22:49:34 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-03-17 22:49:34 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-03-17 22:49:34 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-03-17 22:49:34 [INFO]: Loaded successfully!


dict_keys(['n_classes', 'n_steps', 'n_features', 'train_X', 'train_y', 'train_ICUType', 'val_X', 'val_y', 'val_ICUType', 'test_X', 'test_y', 'test_ICUType', 'scaler', 'val_X_ori', 'test_X_ori', 'test_X_indicating_mask'])


In [2]:
# Build one {"X": features, "y": labels} dict per split (training / validating / testing)
# from the generated PhysioNet-2012 dataset. These dicts are what the models' fit() and
# predict() calls receive later in this notebook.

dataset_for_training = dict(
    X=physionet2012_dataset["train_X"],
    y=physionet2012_dataset["train_y"],
)

dataset_for_validating = dict(
    X=physionet2012_dataset["val_X"],
    y=physionet2012_dataset["val_y"],
)

dataset_for_testing = dict(
    X=physionet2012_dataset["test_X"],
    y=physionet2012_dataset["test_y"],
)

## 🚀 An example of **Raindrop** for classification

In [3]:
from pypots.optim import Adam
from pypots.classification import Raindrop

# Initialize the Raindrop classifier. The sequence length, feature count and class count
# are read from the generated PhysioNet-2012 dataset dict.
raindrop = Raindrop(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    n_classes=physionet2012_dataset["n_classes"],
    n_layers=2,
    # d_model is set to 4x the number of features
    d_model=physionet2012_dataset["n_features"] * 4,
    d_ffn=256,
    n_heads=2,
    dropout=0.3,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to stop the training early if the validating loss doesn't decrease for 3 epochs.
    # You can leave it to default as None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initialize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.DataLoader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your data loading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it to default as None, PyPOTS will automatically assign the best device for you.
    # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have
    # multiple CUDA devices, or even run in parallel on a list of devices such as ['cuda:0', 'cuda:1']
    device=None,  
    # set the path for saving tensorboard and trained model files
    saving_path="../tutorial_results/classification/raindrop",
    model_saving_strategy="best", # only save the best model after training has finished.
                                  # You can also set it as "better" to save a model each time it performs better during training.
)

2024-03-17 22:49:51 [INFO]: Using the given device: cuda
2024-03-17 22:49:51 [INFO]: Model files will be saved to tutorial_results/classification/raindrop/20240317_T224951
2024-03-17 22:49:51 [INFO]: Tensorboard file will be saved to tutorial_results/classification/raindrop/20240317_T224951/tensorboard
2024-03-17 22:49:52 [INFO]: Raindrop initialized with the given hyperparameters, the number of trainable parameters: 1,415,006


In [4]:
# Fit Raindrop on the training split. The validating split is passed along so that
# the best model can be selected for the testing stage that follows.
raindrop.fit(
    train_set=dataset_for_training,
    val_set=dataset_for_validating,
)

2024-03-17 22:50:23 [INFO]: Epoch 001 - training loss: 0.3865, validating loss: 0.3650
2024-03-17 22:50:47 [INFO]: Epoch 002 - training loss: 0.3357, validating loss: 0.3486
2024-03-17 22:51:10 [INFO]: Epoch 003 - training loss: 0.3231, validating loss: 0.3519
2024-03-17 22:51:33 [INFO]: Epoch 004 - training loss: 0.3168, validating loss: 0.3371
2024-03-17 22:51:55 [INFO]: Epoch 005 - training loss: 0.3094, validating loss: 0.3402
2024-03-17 22:52:17 [INFO]: Epoch 006 - training loss: 0.3067, validating loss: 0.3556
2024-03-17 22:52:39 [INFO]: Epoch 007 - training loss: 0.2993, validating loss: 0.3314
2024-03-17 22:53:07 [INFO]: Epoch 008 - training loss: 0.2898, validating loss: 0.3399
2024-03-17 22:53:35 [INFO]: Epoch 009 - training loss: 0.2822, validating loss: 0.3308
2024-03-17 22:53:59 [INFO]: Epoch 010 - training loss: 0.2759, validating loss: 0.3503
2024-03-17 22:53:59 [INFO]: Finished training.
2024-03-17 22:53:59 [INFO]: Saved the model to tutorial_results/classification/raindrop/20240317_T224951/Raindrop.pypots

In [5]:
# the testing stage: run the trained Raindrop model on the testing set
raindrop_results = raindrop.predict(dataset_for_testing)
# keep the "classification" entry of the returned results; it is scored against the test labels in the next cell
raindrop_prediction = raindrop_results["classification"]

In [6]:
from pypots.utils.metrics import calc_binary_classification_metrics

# Score Raindrop's predictions against the test-set labels, then print every reported
# binary classification metric on its own line.
metrics = calc_binary_classification_metrics(raindrop_prediction, dataset_for_testing["y"])
print(
    "Testing classification metrics: \n",
    f'ROC_AUC: {metrics["roc_auc"]}, \n',
    f'PR_AUC: {metrics["pr_auc"]},\n',
    f'F1: {metrics["f1"]},\n',
    f'Precision: {metrics["precision"]},\n',
    f'Recall: {metrics["recall"]},\n',
    sep="",  # the pieces already carry their own newlines; join them without extra spaces
)

Testing classification metrics: 
ROC_AUC: 0.8110672840564797, 
PR_AUC: 0.4184412610743794,
F1: 0.41390205371248023,
Precision: 0.447098976109215,
Recall: 0.38529411764705884,


## 🚀 An example of **BRITS** for classification

In [7]:
from pypots.optim import Adam
from pypots.classification import BRITS

# Initialize the BRITS classifier. The sequence length, feature count and class count
# are read from the generated PhysioNet-2012 dataset dict.
brits = BRITS(
    n_steps=physionet2012_dataset['n_steps'], 
    n_features=physionet2012_dataset['n_features'], 
    n_classes=physionet2012_dataset["n_classes"],
    rnn_hidden_size=256,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to stop the training early if the validating loss doesn't decrease for 3 epochs.
    # You can leave it to default as None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initialize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.DataLoader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your data loading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it to default as None, PyPOTS will automatically assign the best device for you.
    # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have
    # multiple CUDA devices, or even run in parallel on a list of devices such as ['cuda:0', 'cuda:1']
    device=None,  
    # set the path for saving tensorboard and trained model files
    saving_path="../tutorial_results/classification/brits",
    # only save the best model after training has finished.
    # You can also set it as "better" to save a model each time it performs better during training.
    model_saving_strategy="best",
)

2024-03-17 22:54:03 [INFO]: Using the given device: cuda
2024-03-17 22:54:03 [INFO]: Model files will be saved to tutorial_results/classification/brits/20240317_T225403
2024-03-17 22:54:03 [INFO]: Tensorboard file will be saved to tutorial_results/classification/brits/20240317_T225403/tensorboard
2024-03-17 22:54:03 [INFO]: BRITS initialized with the given hyperparameters, the number of trainable parameters: 730,612


In [8]:
# Fit BRITS on the training split. The validating split is passed along so that
# the best model can be selected for the testing stage that follows.
brits.fit(
    train_set=dataset_for_training,
    val_set=dataset_for_validating,
)

2024-03-17 22:55:09 [INFO]: Epoch 001 - training loss: 0.9035, validating loss: 0.8116
2024-03-17 22:56:04 [INFO]: Epoch 002 - training loss: 0.7604, validating loss: 0.7751
2024-03-17 22:57:10 [INFO]: Epoch 003 - training loss: 0.7164, validating loss: 0.7471
2024-03-17 22:58:17 [INFO]: Epoch 004 - training loss: 0.6902, validating loss: 0.7711
2024-03-17 22:59:23 [INFO]: Epoch 005 - training loss: 0.6782, validating loss: 0.7286
2024-03-17 23:00:31 [INFO]: Epoch 006 - training loss: 0.6547, validating loss: 0.7265
2024-03-17 23:01:57 [INFO]: Epoch 007 - training loss: 0.6419, validating loss: 0.7295
2024-03-17 23:03:49 [INFO]: Epoch 008 - training loss: 0.6221, validating loss: 0.7234
2024-03-17 23:05:36 [INFO]: Epoch 009 - training loss: 0.6007, validating loss: 0.7549
2024-03-17 23:07:16 [INFO]: Epoch 010 - training loss: 0.5942, validating loss: 0.7689
2024-03-17 23:07:16 [INFO]: Finished training.
2024-03-17 23:07:16 [INFO]: Saved the model to tutorial_results/classification/brits/20240317_T225403/BRITS.pypots

In [9]:
# the testing stage: run the trained BRITS model on the testing set
brits_results = brits.predict(dataset_for_testing)
# keep the "classification" entry of the returned results; it is scored against the test labels in the next cell
brits_prediction = brits_results["classification"]

In [10]:
from pypots.utils.metrics import calc_binary_classification_metrics

# Score BRITS's predictions against the test-set labels, then print every reported
# binary classification metric on its own line.
metrics = calc_binary_classification_metrics(brits_prediction, dataset_for_testing["y"])
print(
    "Testing classification metrics: \n",
    f'ROC_AUC: {metrics["roc_auc"]}, \n',
    f'PR_AUC: {metrics["pr_auc"]},\n',
    f'F1: {metrics["f1"]},\n',
    f'Precision: {metrics["precision"]},\n',
    f'Recall: {metrics["recall"]},\n',
    sep="",  # the pieces already carry their own newlines; join them without extra spaces
)

Testing classification metrics: 
ROC_AUC: 0.811434573829532, 
PR_AUC: 0.40568596808086554,
F1: 0.3706293706293706,
Precision: 0.45689655172413796,
Recall: 0.31176470588235294,


## 🚀 An example of **GRUD** for classification

In [11]:
from pypots.optim import Adam
from pypots.classification import GRUD

# Initialize the GRU-D classifier. The sequence length, feature count and class count
# are read from the generated PhysioNet-2012 dataset dict.
grud = GRUD(
    n_steps=physionet2012_dataset['n_steps'], 
    n_features=physionet2012_dataset['n_features'], 
    n_classes=physionet2012_dataset["n_classes"],
    rnn_hidden_size=32, 
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to stop the training early if the validating loss doesn't decrease for 3 epochs.
    # You can leave it to default as None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initialize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.DataLoader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your data loading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it to default as None, PyPOTS will automatically assign the best device for you.
    # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have
    # multiple CUDA devices, or even run in parallel on a list of devices such as ['cuda:0', 'cuda:1']
    device=None,  
    # set the path for saving tensorboard and trained model files
    saving_path="../tutorial_results/classification/grud",
    # only save the best model after training has finished.
    # You can also set it as "better" to save a model each time it performs better during training.
    model_saving_strategy="best",
)

2024-03-17 23:07:31 [INFO]: Using the given device: cuda
2024-03-17 23:07:31 [INFO]: Model files will be saved to tutorial_results/classification/grud/20240317_T230731
2024-03-17 23:07:31 [INFO]: Tensorboard file will be saved to tutorial_results/classification/grud/20240317_T230731/tensorboard
2024-03-17 23:07:31 [INFO]: GRUD initialized with the given hyperparameters, the number of trainable parameters: 16,128


In [12]:
# Fit GRU-D on the training split. The validating split is passed along so that
# the best model can be selected for the testing stage that follows.
grud.fit(
    train_set=dataset_for_training,
    val_set=dataset_for_validating,
)

2024-03-17 23:07:55 [INFO]: Epoch 001 - training loss: 0.3332, validating loss: 0.3188
2024-03-17 23:08:15 [INFO]: Epoch 002 - training loss: 0.2950, validating loss: 0.3212
2024-03-17 23:08:35 [INFO]: Epoch 003 - training loss: 0.2855, validating loss: 0.3182
2024-03-17 23:08:56 [INFO]: Epoch 004 - training loss: 0.2779, validating loss: 0.3223
2024-03-17 23:09:18 [INFO]: Epoch 005 - training loss: 0.2711, validating loss: 0.3155
2024-03-17 23:09:39 [INFO]: Epoch 006 - training loss: 0.2619, validating loss: 0.3330
2024-03-17 23:10:00 [INFO]: Epoch 007 - training loss: 0.2571, validating loss: 0.3182
2024-03-17 23:10:21 [INFO]: Epoch 008 - training loss: 0.2509, validating loss: 0.3229
2024-03-17 23:10:21 [INFO]: Exceeded the training patience. Terminating the training procedure...
2024-03-17 23:10:21 [INFO]: Finished training.
2024-03-17 23:10:21 [INFO]: Saved the model to tutorial_results/classification/grud/20240317_T230731/GRUD.pypots


In [13]:
# the testing stage: run the trained GRU-D model on the testing set
grud_results = grud.predict(dataset_for_testing)
# keep the "classification" entry of the returned results; it is scored against the test labels in the next cell
grud_prediction = grud_results["classification"]

In [14]:
from pypots.utils.metrics import calc_binary_classification_metrics

# Score GRU-D's predictions against the test-set labels, then print every reported
# binary classification metric on its own line.
metrics = calc_binary_classification_metrics(grud_prediction, dataset_for_testing["y"])
print(
    "Testing classification metrics: \n",
    f'ROC_AUC: {metrics["roc_auc"]}, \n',
    f'PR_AUC: {metrics["pr_auc"]},\n',
    f'F1: {metrics["f1"]},\n',
    f'Precision: {metrics["precision"]},\n',
    f'Recall: {metrics["recall"]},\n',
    sep="",  # the pieces already carry their own newlines; join them without extra spaces
)

Testing classification metrics: 
ROC_AUC: 0.8327902589607271, 
PR_AUC: 0.48160561802559804,
F1: 0.41064638783269963,
Precision: 0.5806451612903226,
Recall: 0.3176470588235294,
