# 📑 Tutorials for PyPOTS Clustering Models

## 📀 Preparing the **PhysioNet-2012** dataset for this tutorial

In [1]:
from pypots.data.generating import gene_physionet2012
from pypots.utils.random import set_random_seed
from global_config import RANDOM_SEED

# Fix the random seed before anything else so that the results of this tutorial can be reproduced.
set_random_seed(RANDOM_SEED)

# Load the PhysioNet-2012 dataset. Setting artificially_missing_rate=0 disables the extra
# artificially-missing values that would otherwise be created for evaluation.
physionet2012_dataset = gene_physionet2012(artificially_missing_rate=0)

# Take a look at the generated PhysioNet-2012 dataset: the printed keys show what has been prepared for you,
# i.e. the train/val/test splits (X, y, ICUType), the dataset dimensions, and the scaler used for normalization.
print(physionet2012_dataset.keys())


2024-03-17 22:33:58 [INFO]: Have set the random seed as 16 for numpy and pytorch.
2024-03-17 22:33:58 [INFO]: Loading the dataset physionet_2012 with TSDB (https://github.com/WenjieDu/Time_Series_Data_Beans)...
2024-03-17 22:33:58 [INFO]: Starting preprocessing physionet_2012...
2024-03-17 22:33:58 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-03-17 22:33:58 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-03-17 22:33:58 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-03-17 22:33:58 [INFO]: Loaded successfully!


dict_keys(['n_classes', 'n_steps', 'n_features', 'train_X', 'train_y', 'train_ICUType', 'val_X', 'val_y', 'val_ICUType', 'test_X', 'test_y', 'test_ICUType', 'scaler'])


In [2]:
# Assemble the datasets for training and testing.
import numpy as np

# A separate validation set isn't needed in this tutorial, so the original training and validation
# splits are joined along the first axis (axis=0) into one larger training set.
dataset_for_training = {
    key: np.concatenate(
        [physionet2012_dataset[f"train_{key}"], physionet2012_dataset[f"val_{key}"]],
        axis=0,
    )
    for key in ("X", "y")
}

# The test split is taken as is.
dataset_for_testing = {key: physionet2012_dataset[f"test_{key}"] for key in ("X", "y")}


## 🚀 An example of **CRLI** for clustering

In [9]:
from pypots.optim import Adam
from pypots.clustering import CRLI

# Initialize the CRLI model. The data dimensions and the number of clusters are taken from the loaded dataset.
crli = CRLI(
    n_steps=physionet2012_dataset["n_steps"],
    n_features=physionet2012_dataset["n_features"],
    n_clusters=physionet2012_dataset["n_classes"],
    n_generator_layers=2,
    rnn_hidden_size=256,
    rnn_cell_type="GRU",
    decoder_fcn_output_dims=[256, 128],  # the output dimensions of the layers in the decoder FCN.
    # Here it means there are 3 layers. Leaving it to the default None will result in
    # the FCN having only one layer.
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to stop the training early if the monitored loss doesn't decrease for 3 epochs.
    # NOTE: fit() is called without a validation set in this tutorial, so the monitored loss is presumably
    # the training loss rather than a validation loss -- confirm against the PyPOTS documentation.
    # You can leave it to the default None to disable early stopping.
    patience=3,
    # give the optimizers. Different from torch.optim.Optimizer, you don't have to specify the model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave them to default. It will initialize an Adam optimizer with lr=0.001.
    # CRLI takes two of them, G_optimizer and D_optimizer (presumably for its generator and discriminator,
    # whose losses are logged separately during training).
    G_optimizer=Adam(lr=1e-3),
    D_optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
    # Leaving it to the default 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think data loading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it to the default None, PyPOTS will automatically assign the best device for you.
    # Set it to 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple
    # CUDA devices, or even train in parallel on several of them with ['cuda:0', 'cuda:1']
    device=None,  
    # set the path for saving tensorboard and trained model files 
    saving_path="../tutorial_results/clustering/crli",
    # only save the best model after the training has finished.
    # You can also set it to "better" to save every model that performs better than the previous ones during training.
    model_saving_strategy="best",
)


2024-03-17 23:02:01 [INFO]: Using the given device: cuda
2024-03-17 23:02:01 [INFO]: Model files will be saved to tutorial_results/clustering/crli/20240317_T230201
2024-03-17 23:02:01 [INFO]: Tensorboard file will be saved to tutorial_results/clustering/crli/20240317_T230201/tensorboard
2024-03-17 23:02:01 [INFO]: CRLI initialized with the given hyperparameters, the number of trainable parameters: 1,546,820


In [10]:
# Train the model on the training set. Only train_set is passed here: the original validation split
# was merged into the training set when the datasets were assembled, so no validation set is used during training.
crli.fit(train_set=dataset_for_training)


2024-03-17 23:03:19 [INFO]: Epoch 001 - generator training loss: 3.1392, discriminator training loss: 0.3933
2024-03-17 23:04:37 [INFO]: Epoch 002 - generator training loss: 3.2296, discriminator training loss: 0.3710
2024-03-17 23:05:56 [INFO]: Epoch 003 - generator training loss: 3.2216, discriminator training loss: 0.3632
2024-03-17 23:07:15 [INFO]: Epoch 004 - generator training loss: 3.2012, discriminator training loss: 0.3592
2024-03-17 23:07:15 [INFO]: Exceeded the training patience. Terminating the training procedure...
2024-03-17 23:07:15 [INFO]: Finished training.
2024-03-17 23:07:15 [INFO]: Saved the model to tutorial_results/clustering/crli/20240317_T230201/CRLI.pypots


In [11]:
# The testing stage: run the trained model on the test set.
# The clustering result is stored under the "clustering" key of the object returned by predict().
crli_results = crli.predict(dataset_for_testing)
crli_prediction = crli_results["clustering"]

In [12]:
from pypots.utils.metrics import calc_rand_index, calc_cluster_purity

# Score CRLI's clustering result against the ground-truth labels of the test set
# with two clustering metrics: RI from calc_rand_index and CP from calc_cluster_purity.
ground_truth_labels = dataset_for_testing["y"]
RI = calc_rand_index(crli_prediction, ground_truth_labels)
CP = calc_cluster_purity(crli_prediction, ground_truth_labels)

print(
    "Testing clustering metrics: \n"
    f'RI: {RI}, \n'
    f'CP: {CP}\n'
)


Testing clustering metrics: 
RI: 0.6352735191995277, 
CP: 0.8582151793160967


## 🚀 An example of **VaDER** for clustering

In [16]:
from pypots.optim import Adam
from pypots.clustering import VaDER

# Initialize the VaDER model. The data dimensions and the number of clusters are taken from the loaded dataset.
vader = VaDER(
    n_steps=physionet2012_dataset["n_steps"],
    n_features=physionet2012_dataset["n_features"],
    n_clusters=physionet2012_dataset["n_classes"],
    rnn_hidden_size=128,
    d_mu_stddev=2,
    # NOTE(review): presumably the number of pretraining epochs run before the main training -- confirm in the PyPOTS docs
    pretrain_epochs=20,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to stop the training early if the monitored loss doesn't decrease for 3 epochs.
    # NOTE: fit() is called without a validation set in this tutorial, so the monitored loss is presumably
    # the training loss rather than a validation loss -- confirm against the PyPOTS documentation.
    # You can leave it to the default None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify the model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initialize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
    # Leaving it to the default 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think data loading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it to the default None, PyPOTS will automatically assign the best device for you.
    # Set it to 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple
    # CUDA devices, or even train in parallel on several of them with ['cuda:0', 'cuda:1']
    device=None,  
    # set the path for saving tensorboard and trained model files 
    saving_path="../tutorial_results/clustering/vader",
    # only save the best model after the training has finished.
    # You can also set it to "better" to save every model that performs better than the previous ones during training.
    model_saving_strategy="best",
)


2024-03-17 23:26:23 [INFO]: Using the given device: cuda
2024-03-17 23:26:23 [INFO]: Model files will be saved to tutorial_results/clustering/vader/20240317_T232623
2024-03-17 23:26:23 [INFO]: Tensorboard file will be saved to tutorial_results/clustering/vader/20240317_T232623/tensorboard
2024-03-17 23:26:23 [INFO]: VaDER initialized with the given hyperparameters, the number of trainable parameters: 293,644


In [17]:
# Train the model on the training set. Only train_set is passed here: the original validation split
# was merged into the training set when the datasets were assembled, so no validation set is used during training.
vader.fit(train_set=dataset_for_training)


2024-03-17 23:41:56 [INFO]: Epoch 001 - training loss: 0.6455
2024-03-17 23:42:31 [INFO]: Epoch 002 - training loss: 0.3395
2024-03-17 23:43:05 [INFO]: Epoch 003 - training loss: 0.3255
2024-03-17 23:43:48 [INFO]: Epoch 004 - training loss: 0.3286
2024-03-17 23:44:39 [INFO]: Epoch 005 - training loss: 0.3266
2024-03-17 23:45:36 [INFO]: Epoch 006 - training loss: 0.3116
2024-03-17 23:46:30 [INFO]: Epoch 007 - training loss: 0.3303
2024-03-17 23:47:19 [INFO]: Epoch 008 - training loss: 0.3111
2024-03-17 23:48:08 [INFO]: Epoch 009 - training loss: 0.3211
2024-03-17 23:48:57 [INFO]: Epoch 010 - training loss: 0.3170
2024-03-17 23:48:57 [INFO]: Finished training.
2024-03-17 23:48:57 [INFO]: Saved the model to tutorial_results/clustering/vader/20240317_T232623/VaDER.pypots


In [18]:
# The testing stage: run the trained model on the test set.
# The clustering result is stored under the "clustering" key of the object returned by predict().
vader_results = vader.predict(dataset_for_testing)
vader_prediction = vader_results["clustering"]

In [19]:
from pypots.utils.metrics import calc_rand_index, calc_cluster_purity

# Score VaDER's clustering result against the ground-truth labels of the test set
# with two clustering metrics: RI from calc_rand_index and CP from calc_cluster_purity.
ground_truth_labels = dataset_for_testing["y"]
RI = calc_rand_index(vader_prediction, ground_truth_labels)
CP = calc_cluster_purity(vader_prediction, ground_truth_labels)

print(
    "Testing clustering metrics: \n"
    f'RI: {RI}, \n'
    f'CP: {CP},\n'
)


Testing clustering metrics: 
RI: 0.7565347009032349, 
CP: 0.8582151793160967,
