# 📑 Tutorials for PyPOTS Imputation Models

## 📀 Preparing the **PhysioNet-2012** dataset for this tutorial

In [1]:
from pypots.data.generating import gene_physionet2012
from pypots.utils.random import set_random_seed
from global_config import RANDOM_SEED

# Fix the random seed before anything else so that the run is repeatable.
set_random_seed(RANDOM_SEED)

# Generate the PhysioNet-2012 dataset with an artificially-missing rate of 0.1.
physionet2012_dataset = gene_physionet2012(
    artificially_missing_rate=0.1,
)

# Inspect the keys of the generated dataset. Everything has already been prepared for you:
# data splitting, normalization, additional artificially-missing values for evaluation, etc.
print(physionet2012_dataset.keys())


2024-03-17 22:30:38 [INFO]: Have set the random seed as 16 for numpy and pytorch.
2024-03-17 22:30:38 [INFO]: Loading the dataset physionet_2012 with TSDB (https://github.com/WenjieDu/Time_Series_Data_Beans)...
2024-03-17 22:30:38 [INFO]: Starting preprocessing physionet_2012...
2024-03-17 22:30:38 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-03-17 22:30:38 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-03-17 22:30:38 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-03-17 22:30:39 [INFO]: Loaded successfully!


dict_keys(['n_classes', 'n_steps', 'n_features', 'train_X', 'train_y', 'train_ICUType', 'val_X', 'val_y', 'val_ICUType', 'test_X', 'test_y', 'test_ICUType', 'scaler', 'val_X_ori', 'test_X_ori', 'test_X_indicating_mask'])


In [2]:
# Assemble the dictionaries used for training, validating, and testing in the cells below.

# Training only needs the training samples.
dataset_for_training = dict(X=physionet2012_dataset["train_X"])

# Validation carries both the validation samples and their "X_ori" counterpart
# (presumably the ground truth used to compute the validating loss -- confirm against the PyPOTS docs).
dataset_for_validating = dict(
    X=physionet2012_dataset["val_X"],
    X_ori=physionet2012_dataset["val_X_ori"],
)

# Testing only passes the test samples; the test ground truth is read from physionet2012_dataset
# directly when the error is calculated.
dataset_for_testing = dict(X=physionet2012_dataset["test_X"])


## 🚀 An example of **SAITS** for imputation

In [3]:
from pypots.optim import Adam
from pypots.imputation import SAITS

# Build the SAITS imputation model
saits = SAITS(
    # --- data dimensions, taken from the prepared dataset ---
    n_steps=physionet2012_dataset["n_steps"],
    n_features=physionet2012_dataset["n_features"],
    # --- network architecture ---
    n_layers=2,
    d_model=256,
    d_ffn=128,
    n_heads=4,
    d_k=64,
    d_v=64,
    dropout=0.1,
    attn_dropout=0.1,
    # True enables the diagonal attention mask; otherwise the original self-attention mechanism is applied
    diagonal_attention_mask=True,
    # ORT_weight and MIT_weight are the weight values of the two training tasks. Adjust them to make the
    # SAITS model focus more on one task. Usually you can just leave them at the default values, i.e. 1.
    ORT_weight=1,
    MIT_weight=1,
    # --- training schedule ---
    batch_size=32,
    # epochs=10 keeps this demo quick; set it to 100 or more for better performance
    epochs=10,
    # patience=3 stops the training early if the validating loss doesn't decrease for 3 epochs.
    # Leave it at the default None to disable early stopping.
    patience=3,
    # The optimizer. Unlike torch.optim.Optimizer, pypots.optim.Optimizer does not need the model's parameters
    # at initialization. This argument can also be left at its default, which initializes an Adam optimizer
    # with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # --- runtime ---
    # num_workers is for torch.utils.data.DataLoader: the number of subprocesses used for data loading.
    # The default 0 loads the data in the main process, i.e. without subprocesses.
    # Increase it if you think data loading is a bottleneck for your training speed.
    num_workers=0,
    # Leave device at the default None and PyPOTS will automatically assign the best device for you.
    # Set it to 'cpu' if you don't have CUDA devices, to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices,
    # or to a list such as ['cuda:0', 'cuda:1'] to run on several devices in parallel.
    device=None,
    # --- saving ---
    # path for saving the tensorboard and trained-model files
    saving_path="tutorial_results/imputation/saits",
    # "best" saves only the best model after the training has finished.
    # "better" saves every model that performs better than the previous ones during training.
    model_saving_strategy="best",
)


2024-03-17 22:30:56 [INFO]: Using the given device: cuda
2024-03-17 22:30:56 [INFO]: Model files will be saved to tutorial_results/imputation/saits/20240317_T223056
2024-03-17 22:30:56 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/saits/20240317_T223056/tensorboard
2024-03-17 22:30:56 [INFO]: SAITS initialized with the given hyperparameters, the number of trainable parameters: 1,378,358


In [4]:
# Train SAITS on the training set. The validating set is used to select the best model,
# which is then tested in the next step.
saits.fit(
    train_set=dataset_for_training,
    val_set=dataset_for_validating,
)


2024-03-17 22:31:04 [INFO]: Epoch 001 - training loss: 0.7161, validating loss: 0.3840
2024-03-17 22:31:11 [INFO]: Epoch 002 - training loss: 0.5172, validating loss: 0.3559
2024-03-17 22:31:18 [INFO]: Epoch 003 - training loss: 0.4633, validating loss: 0.3153
2024-03-17 22:31:25 [INFO]: Epoch 004 - training loss: 0.4269, validating loss: 0.3069
2024-03-17 22:31:33 [INFO]: Epoch 005 - training loss: 0.3975, validating loss: 0.2906
2024-03-17 22:31:40 [INFO]: Epoch 006 - training loss: 0.3745, validating loss: 0.2811
2024-03-17 22:31:47 [INFO]: Epoch 007 - training loss: 0.3600, validating loss: 0.2766
2024-03-17 22:31:54 [INFO]: Epoch 008 - training loss: 0.3526, validating loss: 0.2721
2024-03-17 22:32:00 [INFO]: Epoch 009 - training loss: 0.3424, validating loss: 0.2675
2024-03-17 22:32:06 [INFO]: Epoch 010 - training loss: 0.3361, validating loss: 0.2696
2024-03-17 22:32:06 [INFO]: Finished training.
2024-03-17 22:32:06 [INFO]: Saved the model to tutorial_results/imputation/saits/20

In [5]:
# Testing stage: impute both the originally-missing and the artificially-missing values in the test set.
# The result returned by predict() holds the imputed data under the "imputation" key.
saits_results = saits.predict(dataset_for_testing)
saits_imputation = saits_results["imputation"]


In [6]:
from pypots.utils.metrics import calc_mae

# Mean absolute error of the SAITS imputation against the ground truth (test_X_ori),
# evaluated on the artificially-missing values marked by test_X_indicating_mask.
testing_mae = calc_mae(
    saits_imputation,
    physionet2012_dataset["test_X_ori"],
    physionet2012_dataset["test_X_indicating_mask"],
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


Testing mean absolute error: 0.2312


## 🚀 An example of **Transformer** for imputation

In [7]:
from pypots.optim import Adam
from pypots.imputation import Transformer

# Build the Transformer imputation model
transformer = Transformer(
    # --- data dimensions, taken from the prepared dataset ---
    n_steps=physionet2012_dataset["n_steps"],
    n_features=physionet2012_dataset["n_features"],
    # --- network architecture ---
    n_layers=6,
    d_model=512,
    d_ffn=256,
    n_heads=4,
    d_k=128,
    d_v=128,
    dropout=0.1,
    attn_dropout=0,
    # ORT_weight and MIT_weight are the weight values of the two training tasks. Adjust them to make the
    # Transformer model focus more on one task. Usually you can just leave them at the default values, i.e. 1.
    ORT_weight=1,
    MIT_weight=1,
    # --- training schedule ---
    batch_size=32,
    # epochs=10 keeps this demo quick; set it to 100 or more for better performance
    epochs=10,
    # patience=3 stops the training early if the validating loss doesn't decrease for 3 epochs.
    # Leave it at the default None to disable early stopping.
    patience=3,
    # The optimizer. Unlike torch.optim.Optimizer, pypots.optim.Optimizer does not need the model's parameters
    # at initialization. This argument can also be left at its default, which initializes an Adam optimizer
    # with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # --- runtime ---
    # num_workers is for torch.utils.data.DataLoader: the number of subprocesses used for data loading.
    # The default 0 loads the data in the main process, i.e. without subprocesses.
    # Increase it if you think data loading is a bottleneck for your training speed.
    num_workers=0,
    # Leave device at the default None and PyPOTS will automatically assign the best device for you.
    # Set it to 'cpu' if you don't have CUDA devices, to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices,
    # or to a list such as ['cuda:0', 'cuda:1'] to run on several devices in parallel.
    device=None,
    # --- saving ---
    # path for saving the tensorboard and trained-model files
    saving_path="tutorial_results/imputation/transformer",
    # "best" saves only the best model after the training has finished.
    # "better" saves every model that performs better than the previous ones during training.
    model_saving_strategy="best",
)


2024-03-17 22:32:07 [INFO]: Using the given device: cuda
2024-03-17 22:32:07 [INFO]: Model files will be saved to tutorial_results/imputation/transformer/20240317_T223207
2024-03-17 22:32:07 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/transformer/20240317_T223207/tensorboard
2024-03-17 22:32:07 [INFO]: Transformer initialized with the given hyperparameters, the number of trainable parameters: 7,938,597


In [8]:
# Train the Transformer on the training set. The validating set is used to select the best model,
# which is then tested in the next step.
transformer.fit(
    train_set=dataset_for_training,
    val_set=dataset_for_validating,
)


2024-03-17 22:32:14 [INFO]: Epoch 001 - training loss: 1.4377, validating loss: 1.0124
2024-03-17 22:32:22 [INFO]: Epoch 002 - training loss: 1.3912, validating loss: 0.9928
2024-03-17 22:32:29 [INFO]: Epoch 003 - training loss: 1.3825, validating loss: 0.9913
2024-03-17 22:32:37 [INFO]: Epoch 004 - training loss: 1.3709, validating loss: 0.9883
2024-03-17 22:32:44 [INFO]: Epoch 005 - training loss: 1.3676, validating loss: 0.9881
2024-03-17 22:32:52 [INFO]: Epoch 006 - training loss: 1.3662, validating loss: 0.9919
2024-03-17 22:33:00 [INFO]: Epoch 007 - training loss: 1.3650, validating loss: 0.9865
2024-03-17 22:33:07 [INFO]: Epoch 008 - training loss: 1.3643, validating loss: 0.9902
2024-03-17 22:33:15 [INFO]: Epoch 009 - training loss: 1.3643, validating loss: 0.9955
2024-03-17 22:33:22 [INFO]: Epoch 010 - training loss: 1.3632, validating loss: 0.9996
2024-03-17 22:33:22 [INFO]: Exceeded the training patience. Terminating the training procedure...
2024-03-17 22:33:22 [INFO]: Fini

In [9]:
# Testing stage: impute both the originally-missing and the artificially-missing values in the test set.
# The result returned by predict() holds the imputed data under the "imputation" key.
transformer_results = transformer.predict(dataset_for_testing)
transformer_imputation = transformer_results["imputation"]

In [10]:
from pypots.utils.metrics import calc_mae

# Mean absolute error of the Transformer imputation against the ground truth (test_X_ori),
# evaluated on the artificially-missing values marked by test_X_indicating_mask.
testing_mae = calc_mae(
    transformer_imputation,
    physionet2012_dataset["test_X_ori"],
    physionet2012_dataset["test_X_indicating_mask"],
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


Testing mean absolute error: 0.6858


## 🚀 An example of **TimesNet** for imputation

In [11]:
from pypots.optim import Adam
from pypots.imputation import TimesNet

# Build the TimesNet imputation model
timesnet = TimesNet(
    # --- data dimensions, taken from the prepared dataset ---
    n_steps=physionet2012_dataset["n_steps"],
    n_features=physionet2012_dataset["n_features"],
    # --- network architecture ---
    n_layers=1,
    top_k=1,
    d_model=128,
    d_ffn=512,
    n_kernels=5,
    dropout=0.5,
    apply_nonstationary_norm=False,
    # --- training schedule ---
    batch_size=32,
    # epochs=10 keeps this demo quick; set it to 100 or more for better performance
    epochs=10,
    # patience=3 stops the training early if the validating loss doesn't decrease for 3 epochs.
    # Leave it at the default None to disable early stopping.
    patience=3,
    # The optimizer. Unlike torch.optim.Optimizer, pypots.optim.Optimizer does not need the model's parameters
    # at initialization. This argument can also be left at its default, which initializes an Adam optimizer
    # with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # --- runtime ---
    # num_workers is for torch.utils.data.DataLoader: the number of subprocesses used for data loading.
    # The default 0 loads the data in the main process, i.e. without subprocesses.
    # Increase it if you think data loading is a bottleneck for your training speed.
    num_workers=0,
    # Leave device at the default None and PyPOTS will automatically assign the best device for you.
    # Set it to 'cpu' if you don't have CUDA devices, to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices,
    # or to a list such as ['cuda:0', 'cuda:1'] to run on several devices in parallel.
    device=None,
    # --- saving ---
    # path for saving the tensorboard and trained-model files
    saving_path="tutorial_results/imputation/timesnet",
    # "best" saves only the best model after the training has finished.
    # "better" saves every model that performs better than the previous ones during training.
    model_saving_strategy="best",
)

2024-03-17 22:33:22 [INFO]: Using the given device: cuda
2024-03-17 22:33:22 [INFO]: Model files will be saved to tutorial_results/imputation/timesnet/20240317_T223322
2024-03-17 22:33:22 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/timesnet/20240317_T223322/tensorboard
2024-03-17 22:33:23 [INFO]: TimesNet initialized with the given hyperparameters, the number of trainable parameters: 21,649,317


In [12]:
# Train TimesNet on the training set. The validating set is used to select the best model,
# which is then tested in the next step.
timesnet.fit(
    train_set=dataset_for_training,
    val_set=dataset_for_validating,
)


2024-03-17 22:33:29 [INFO]: Epoch 001 - training loss: 0.4754, validating loss: 0.3634
2024-03-17 22:33:34 [INFO]: Epoch 002 - training loss: 0.4249, validating loss: 0.3489
2024-03-17 22:33:39 [INFO]: Epoch 003 - training loss: 0.4562, validating loss: 0.3402
2024-03-17 22:33:45 [INFO]: Epoch 004 - training loss: 0.4208, validating loss: 0.3371
2024-03-17 22:33:50 [INFO]: Epoch 005 - training loss: 0.4197, validating loss: 0.3344
2024-03-17 22:33:55 [INFO]: Epoch 006 - training loss: 0.4246, validating loss: 0.3308
2024-03-17 22:34:00 [INFO]: Epoch 007 - training loss: 0.4657, validating loss: 0.3287
2024-03-17 22:34:05 [INFO]: Epoch 008 - training loss: 0.3869, validating loss: 0.3204
2024-03-17 22:34:10 [INFO]: Epoch 009 - training loss: 0.3711, validating loss: 0.3170
2024-03-17 22:34:15 [INFO]: Epoch 010 - training loss: 0.3610, validating loss: 0.3175
2024-03-17 22:34:15 [INFO]: Finished training.
2024-03-17 22:34:15 [INFO]: Saved the model to tutorial_results/imputation/timesnet

In [13]:
# Testing stage: impute both the originally-missing and the artificially-missing values in the test set.
# The result returned by predict() holds the imputed data under the "imputation" key.
timesnet_results = timesnet.predict(dataset_for_testing)
timesnet_imputation = timesnet_results["imputation"]

In [14]:
from pypots.utils.metrics import calc_mae

# Mean absolute error of the TimesNet imputation against the ground truth (test_X_ori),
# evaluated on the artificially-missing values marked by test_X_indicating_mask.
testing_mae = calc_mae(
    timesnet_imputation,
    physionet2012_dataset["test_X_ori"],
    physionet2012_dataset["test_X_indicating_mask"],
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


Testing mean absolute error: 0.3291


## 🚀 An example of **CSDI** for imputation

In [15]:
from pypots.optim import Adam
from pypots.imputation import CSDI

# Build the CSDI imputation model
csdi = CSDI(
    # --- data dimensions, taken from the prepared dataset ---
    n_features=physionet2012_dataset["n_features"],
    # --- network architecture ---
    n_layers=6,
    n_heads=2,
    n_channels=128,
    d_time_embedding=64,
    d_feature_embedding=32,
    d_diffusion_embedding=128,
    target_strategy="random",
    n_diffusion_steps=50,
    # --- training schedule ---
    batch_size=32,
    # epochs=10 keeps this demo quick; set it to 100 or more for better performance
    epochs=10,
    # patience=3 stops the training early if the validating loss doesn't decrease for 3 epochs.
    # Leave it at the default None to disable early stopping.
    patience=3,
    # The optimizer. Unlike torch.optim.Optimizer, pypots.optim.Optimizer does not need the model's parameters
    # at initialization. This argument can also be left at its default, which initializes an Adam optimizer
    # with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # --- runtime ---
    # num_workers is for torch.utils.data.DataLoader: the number of subprocesses used for data loading.
    # The default 0 loads the data in the main process, i.e. without subprocesses.
    # Increase it if you think data loading is a bottleneck for your training speed.
    num_workers=0,
    # Leave device at the default None and PyPOTS will automatically assign the best device for you.
    # Set it to 'cpu' if you don't have CUDA devices, to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices,
    # or to a list such as ['cuda:0', 'cuda:1'] to run on several devices in parallel.
    device=None,
    # --- saving ---
    # path for saving the tensorboard and trained-model files
    saving_path="tutorial_results/imputation/csdi",
    # "best" saves only the best model after the training has finished.
    # "better" saves every model that performs better than the previous ones during training.
    model_saving_strategy="best",
)

2024-03-17 22:34:15 [INFO]: Using the given device: cuda:1
2024-03-17 22:34:15 [INFO]: Model files will be saved to tutorial_results/imputation/csdi/20240317_T223415
2024-03-17 22:34:15 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/csdi/20240317_T223415/tensorboard
2024-03-17 22:34:15 [INFO]: CSDI initialized with the given hyperparameters, the number of trainable parameters: 1,694,753


In [16]:
# Train CSDI on the training set. The validating set is used to select the best model,
# which is then tested in the next step.
csdi.fit(
    train_set=dataset_for_training,
    val_set=dataset_for_validating,
)


2024-03-17 22:36:03 [INFO]: Epoch 001 - training loss: 0.3336, validating loss: 0.2451
2024-03-17 22:37:50 [INFO]: Epoch 002 - training loss: 0.2654, validating loss: 0.2087
2024-03-17 22:39:37 [INFO]: Epoch 003 - training loss: 0.2497, validating loss: 0.2071
2024-03-17 22:41:23 [INFO]: Epoch 004 - training loss: 0.2442, validating loss: 0.1966
2024-03-17 22:43:09 [INFO]: Epoch 005 - training loss: 0.2317, validating loss: 0.1922
2024-03-17 22:44:55 [INFO]: Epoch 006 - training loss: 0.2231, validating loss: 0.1884
2024-03-17 22:46:41 [INFO]: Epoch 007 - training loss: 0.2322, validating loss: 0.1922
2024-03-17 22:48:29 [INFO]: Epoch 008 - training loss: 0.2357, validating loss: 0.1889
2024-03-17 22:50:15 [INFO]: Epoch 009 - training loss: 0.2298, validating loss: 0.1825
2024-03-17 22:52:05 [INFO]: Epoch 010 - training loss: 0.2251, validating loss: 0.1968
2024-03-17 22:52:05 [INFO]: Finished training.
2024-03-17 22:52:05 [INFO]: Saved the model to tutorial_results/imputation/csdi/202

In [18]:
# Testing stage: impute both the originally-missing and the artificially-missing values in the test set.

# CSDI has an argument, n_sampling_times, to control the number of sampling times during inference;
# here every test sample is sampled twice.
csdi_results = csdi.predict(dataset_for_testing, n_sampling_times=2)
csdi_imputation = csdi_results["imputation"]

# The printed shape shows the multiple samplings stacked along axis 1.
print(f"The shape of csdi_imputation is {csdi_imputation.shape}")

# For error calculation, take the mean over the sampling axis (axis=1) so that
# each data sample is left with a single imputation.
mean_csdi_imputation = csdi_imputation.mean(axis=1)

The shape of csdi_imputation is (2398, 2, 48, 37)


In [19]:
from pypots.utils.metrics import calc_mae

# Mean absolute error of the sampling-averaged CSDI imputation against the ground truth (test_X_ori),
# evaluated on the artificially-missing values marked by test_X_indicating_mask.
testing_mae = calc_mae(
    mean_csdi_imputation,
    physionet2012_dataset["test_X_ori"],
    physionet2012_dataset["test_X_indicating_mask"],
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


Testing mean absolute error: 0.2976


## 🚀 An example of **US-GAN** for imputation

In [20]:
from pypots.optim import Adam
from pypots.imputation import USGAN

# Build the US-GAN imputation model
us_gan = USGAN(
    # --- data dimensions, taken from the prepared dataset ---
    n_steps=physionet2012_dataset["n_steps"],
    n_features=physionet2012_dataset["n_features"],
    # --- network architecture ---
    rnn_hidden_size=256,
    lambda_mse=1,
    dropout=0.1,
    G_steps=1,
    D_steps=1,
    # --- training schedule ---
    batch_size=32,
    # epochs=10 keeps this demo quick; set it to 100 or more for better performance
    epochs=10,
    # patience=3 stops the training early if the validating loss doesn't decrease for 3 epochs.
    # Leave it at the default None to disable early stopping.
    patience=3,
    # The optimizers (US-GAN takes two: G_optimizer and D_optimizer). Unlike torch.optim.Optimizer,
    # pypots.optim.Optimizer does not need the model's parameters at initialization. These arguments can also
    # be left at their defaults, which initialize an Adam optimizer with lr=0.001.
    G_optimizer=Adam(lr=1e-3),
    D_optimizer=Adam(lr=1e-3),
    # --- runtime ---
    # num_workers is for torch.utils.data.DataLoader: the number of subprocesses used for data loading.
    # The default 0 loads the data in the main process, i.e. without subprocesses.
    # Increase it if you think data loading is a bottleneck for your training speed.
    num_workers=0,
    # Leave device at the default None and PyPOTS will automatically assign the best device for you.
    # Set it to 'cpu' if you don't have CUDA devices, to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices,
    # or to a list such as ['cuda:0', 'cuda:1'] to run on several devices in parallel.
    device=None,
    # --- saving ---
    # path for saving the tensorboard and trained-model files
    saving_path="tutorial_results/imputation/us_gan",
    # "best" saves only the best model after the training has finished.
    # "better" saves every model that performs better than the previous ones during training.
    model_saving_strategy="best",
)


2024-03-17 22:59:38 [INFO]: Using the given device: cuda
2024-03-17 22:59:38 [INFO]: Model files will be saved to tutorial_results/imputation/us_gan/20240317_T225938
2024-03-17 22:59:38 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/us_gan/20240317_T225938/tensorboard
2024-03-17 22:59:38 [INFO]: USGAN initialized with the given hyperparameters, the number of trainable parameters: 1,258,517


In [21]:
# Train US-GAN on the training set. The validating set is used to select the best model,
# which is then tested in the next step.
us_gan.fit(
    train_set=dataset_for_training,
    val_set=dataset_for_validating,
)


2024-03-17 23:02:11 [INFO]: Epoch 001 - generator training loss: 4.0480, discriminator training loss: 0.1877, validating loss: 0.3738
2024-03-17 23:05:01 [INFO]: Epoch 002 - generator training loss: 4.8115, discriminator training loss: 0.1211, validating loss: 0.3330
2024-03-17 23:07:45 [INFO]: Epoch 003 - generator training loss: 5.2930, discriminator training loss: 0.0932, validating loss: 0.3181
2024-03-17 23:10:28 [INFO]: Epoch 004 - generator training loss: 5.6615, discriminator training loss: 0.0778, validating loss: 0.3131
2024-03-17 23:12:48 [INFO]: Epoch 005 - generator training loss: 5.9637, discriminator training loss: 0.0678, validating loss: 0.3101
2024-03-17 23:14:55 [INFO]: Epoch 006 - generator training loss: 6.2180, discriminator training loss: 0.0607, validating loss: 0.3069
2024-03-17 23:17:00 [INFO]: Epoch 007 - generator training loss: 6.4427, discriminator training loss: 0.0554, validating loss: 0.3068
2024-03-17 23:19:03 [INFO]: Epoch 008 - generator training los

In [22]:
# Testing stage: impute both the originally-missing and the artificially-missing values in the test set.
# The result returned by predict() holds the imputed data under the "imputation" key.
us_gan_results = us_gan.predict(dataset_for_testing)
us_gan_imputation = us_gan_results["imputation"]

In [23]:
from pypots.utils.metrics import calc_mae

# Mean absolute error of the US-GAN imputation against the ground truth (test_X_ori),
# evaluated on the artificially-missing values marked by test_X_indicating_mask.
testing_mae = calc_mae(
    us_gan_imputation,
    physionet2012_dataset["test_X_ori"],
    physionet2012_dataset["test_X_indicating_mask"],
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


Testing mean absolute error: 0.2526


## 🚀 An example of **GP-VAE** for imputation

In [24]:
from pypots.optim import Adam
from pypots.imputation import GPVAE

# Build the GP-VAE imputation model
gp_vae = GPVAE(
    # --- data dimensions, taken from the prepared dataset ---
    n_steps=physionet2012_dataset["n_steps"],
    n_features=physionet2012_dataset["n_features"],
    # --- network architecture and model hyperparameters ---
    latent_size=37,
    encoder_sizes=(128, 128),
    decoder_sizes=(256, 256),
    kernel="cauchy",
    beta=0.2,
    M=1,
    K=1,
    sigma=1.005,
    length_scale=7.0,
    kernel_scales=1,
    window_size=24,
    # --- training schedule ---
    batch_size=32,
    # epochs=10 keeps this demo quick; set it to 100 or more for better performance
    epochs=10,
    # patience=3 stops the training early if the validating loss doesn't decrease for 3 epochs.
    # Leave it at the default None to disable early stopping.
    patience=3,
    # The optimizer. Unlike torch.optim.Optimizer, pypots.optim.Optimizer does not need the model's parameters
    # at initialization. This argument can also be left at its default, which initializes an Adam optimizer
    # with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # --- runtime ---
    # num_workers is for torch.utils.data.DataLoader: the number of subprocesses used for data loading.
    # The default 0 loads the data in the main process, i.e. without subprocesses.
    # Increase it if you think data loading is a bottleneck for your training speed.
    num_workers=0,
    # Leave device at the default None and PyPOTS will automatically assign the best device for you.
    # Set it to 'cpu' if you don't have CUDA devices, to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices,
    # or to a list such as ['cuda:0', 'cuda:1'] to run on several devices in parallel.
    device=None,
    # --- saving ---
    # path for saving the tensorboard and trained-model files
    saving_path="tutorial_results/imputation/gp_vae",
    # "best" saves only the best model after the training has finished.
    # "better" saves every model that performs better than the previous ones during training.
    model_saving_strategy="best",
)


2024-03-17 23:23:01 [INFO]: Using the given device: cuda
2024-03-17 23:23:01 [INFO]: Model files will be saved to tutorial_results/imputation/gp_vae/20240317_T232301
2024-03-17 23:23:01 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/gp_vae/20240317_T232301/tensorboard
2024-03-17 23:23:01 [INFO]: GPVAE initialized with the given hyperparameters, the number of trainable parameters: 229,652


In [25]:
# Train GP-VAE on the training set. The validating set is used to select the best model,
# which is then tested in the next step.
gp_vae.fit(
    train_set=dataset_for_training,
    val_set=dataset_for_validating,
)


2024-03-17 23:23:10 [INFO]: Epoch 001 - training loss: 25965.8314, validating loss: 0.6320
2024-03-17 23:23:19 [INFO]: Epoch 002 - training loss: 22875.1126, validating loss: 0.5981
2024-03-17 23:23:27 [INFO]: Epoch 003 - training loss: 22840.8942, validating loss: 0.5821
2024-03-17 23:23:36 [INFO]: Epoch 004 - training loss: 22833.0289, validating loss: 0.5662
2024-03-17 23:23:44 [INFO]: Epoch 005 - training loss: 22829.6722, validating loss: 0.5608
2024-03-17 23:23:52 [INFO]: Epoch 006 - training loss: 22818.5064, validating loss: 0.5363
2024-03-17 23:24:00 [INFO]: Epoch 007 - training loss: 22818.0259, validating loss: 0.5461
2024-03-17 23:24:08 [INFO]: Epoch 008 - training loss: 22822.0127, validating loss: 0.5420
2024-03-17 23:24:15 [INFO]: Epoch 009 - training loss: 22811.4080, validating loss: 0.5116
2024-03-17 23:24:23 [INFO]: Epoch 010 - training loss: 22806.8758, validating loss: 0.5019
2024-03-17 23:24:23 [INFO]: Finished training.
2024-03-17 23:24:23 [INFO]: Saved the model

In [26]:
# Testing stage: impute both the originally-missing and the artificially-missing values in the test set.

# GP-VAE has an argument, n_sampling_times, to control the number of sampling times during inference;
# here every test sample is sampled twice.
gp_vae_results = gp_vae.predict(dataset_for_testing, n_sampling_times=2)
gp_vae_imputation = gp_vae_results["imputation"]

# The printed shape shows the multiple samplings stacked along axis 1.
print(f"The shape of gp_vae_imputation is {gp_vae_imputation.shape}")

# For error calculation, take the mean over the sampling axis (axis=1) so that
# each data sample is left with a single imputation.
mean_gp_vae_imputation = gp_vae_imputation.mean(axis=1)


The shape of gp_vae_imputation is (2398, 2, 48, 37)


In [27]:
from pypots.utils.metrics import calc_mae

# Mean absolute error of the sampling-averaged GP-VAE imputation against the ground truth (test_X_ori),
# evaluated on the artificially-missing values marked by test_X_indicating_mask.
testing_mae = calc_mae(
    mean_gp_vae_imputation,
    physionet2012_dataset["test_X_ori"],
    physionet2012_dataset["test_X_indicating_mask"],
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


Testing mean absolute error: 0.4546


## 🚀 An example of **BRITS** for imputation

In [28]:
from pypots.optim import Adam
from pypots.imputation import BRITS

# Build the BRITS imputation model
brits = BRITS(
    # --- data dimensions, taken from the prepared dataset ---
    n_steps=physionet2012_dataset["n_steps"],
    n_features=physionet2012_dataset["n_features"],
    # --- network architecture ---
    rnn_hidden_size=128,
    # --- training schedule ---
    batch_size=32,
    # epochs=10 keeps this demo quick; set it to 100 or more for better performance
    epochs=10,
    # patience=3 stops the training early if the validating loss doesn't decrease for 3 epochs.
    # Leave it at the default None to disable early stopping.
    patience=3,
    # The optimizer. Unlike torch.optim.Optimizer, pypots.optim.Optimizer does not need the model's parameters
    # at initialization. This argument can also be left at its default, which initializes an Adam optimizer
    # with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # --- runtime ---
    # num_workers is for torch.utils.data.DataLoader: the number of subprocesses used for data loading.
    # The default 0 loads the data in the main process, i.e. without subprocesses.
    # Increase it if you think data loading is a bottleneck for your training speed.
    num_workers=0,
    # Leave device at the default None and PyPOTS will automatically assign the best device for you.
    # Set it to 'cpu' if you don't have CUDA devices, to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices,
    # or to a list such as ['cuda:0', 'cuda:1'] to run on several devices in parallel.
    device=None,
    # --- saving ---
    # path for saving the tensorboard and trained-model files
    saving_path="tutorial_results/imputation/brits",
    # "best" saves only the best model after the training has finished.
    # "better" saves every model that performs better than the previous ones during training.
    model_saving_strategy="best",
)


2024-03-17 23:24:24 [INFO]: Using the given device: cuda
2024-03-17 23:24:24 [INFO]: Model files will be saved to tutorial_results/imputation/brits/20240317_T232424
2024-03-17 23:24:24 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/brits/20240317_T232424/tensorboard
2024-03-17 23:24:24 [INFO]: BRITS initialized with the given hyperparameters, the number of trainable parameters: 239,344


In [29]:
# Train BRITS on the training set. The validating set is used to select the best model,
# which is then tested in the next step.
brits.fit(
    train_set=dataset_for_training,
    val_set=dataset_for_validating,
)


2024-03-17 23:25:39 [INFO]: Epoch 001 - training loss: 0.9376, validating loss: 0.3887
2024-03-17 23:26:47 [INFO]: Epoch 002 - training loss: 0.7274, validating loss: 0.3412
2024-03-17 23:28:08 [INFO]: Epoch 003 - training loss: 0.6775, validating loss: 0.3270
2024-03-17 23:29:30 [INFO]: Epoch 004 - training loss: 0.6541, validating loss: 0.3195
2024-03-17 23:30:51 [INFO]: Epoch 005 - training loss: 0.6397, validating loss: 0.3167
2024-03-17 23:32:12 [INFO]: Epoch 006 - training loss: 0.6287, validating loss: 0.3151
2024-03-17 23:33:33 [INFO]: Epoch 007 - training loss: 0.6197, validating loss: 0.3157
2024-03-17 23:34:56 [INFO]: Epoch 008 - training loss: 0.6122, validating loss: 0.3143
2024-03-17 23:36:18 [INFO]: Epoch 009 - training loss: 0.6060, validating loss: 0.3165
2024-03-17 23:37:41 [INFO]: Epoch 010 - training loss: 0.6007, validating loss: 0.3184
2024-03-17 23:37:41 [INFO]: Finished training.
2024-03-17 23:37:41 [INFO]: Saved the model to tutorial_results/imputation/brits/20

In [30]:
# Testing stage: impute both the originally-missing and the artificially-missing values in the test set.
# The result returned by predict() holds the imputed data under the "imputation" key.
brits_results = brits.predict(dataset_for_testing)
brits_imputation = brits_results["imputation"]

In [31]:
from pypots.utils.metrics import calc_mae

# Score BRITS: mean absolute error of its imputation against the ground truth
# (the artificially-missing values held out for evaluation)
testing_mae = calc_mae(
    brits_imputation,
    physionet2012_dataset["test_X_ori"],
    physionet2012_dataset["test_X_indicating_mask"],
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


Testing mean absolute error: 0.2580


## 🚀 An example of **M-RNN** for imputation

In [32]:
from pypots.optim import Adam
from pypots.imputation import MRNN

# initialize the M-RNN model
mrnn = MRNN(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    rnn_hidden_size=128,
    batch_size=32,
    # here we set epochs=10 for a quick demo; you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to stop the training early if the validating loss doesn't decrease for 3 epochs.
    # You can leave it at its default, None, to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify the model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initialize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.DataLoader. It's the number of subprocesses to use for data loading.
    # Leaving it at its default, 0, means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your data loading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it at its default, None, and PyPOTS will automatically assign the best device for you.
    # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple
    # CUDA devices, or even to a list such as ['cuda:0', 'cuda:1'] to train in parallel on several devices
    device=None,
    # set the path for saving tensorboard and trained model files
    saving_path="tutorial_results/imputation/mrnn",
    # only save the best model after training has finished.
    # You can also set it as "better" to save every model that performs better than the previous ones during training.
    model_saving_strategy="best",
)


2024-03-17 23:37:57 [INFO]: No given device, using default device: cuda
2024-03-17 23:37:57 [INFO]: Model files will be saved to tutorial_results/imputation/mrnn/20240317_T233757
2024-03-17 23:37:57 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/mrnn/20240317_T233757/tensorboard
2024-03-17 23:37:57 [INFO]: MRNN initialized with the given hyperparameters, the number of trainable parameters: 107,951


In [33]:
# train M-RNN on the training set; the validating set is passed as well so that the best model
# can be selected for testing in the next step
mrnn.fit(train_set=dataset_for_training, val_set=dataset_for_validating)


2024-03-17 23:38:30 [INFO]: Epoch 001 - training loss: 0.7285, validating loss: 0.9317
2024-03-17 23:38:48 [INFO]: Epoch 002 - training loss: 0.5254, validating loss: 0.8918
2024-03-17 23:39:04 [INFO]: Epoch 003 - training loss: 0.4947, validating loss: 0.8756
2024-03-17 23:39:25 [INFO]: Epoch 004 - training loss: 0.4621, validating loss: 0.8681
2024-03-17 23:39:43 [INFO]: Epoch 005 - training loss: 0.4602, validating loss: 0.8628
2024-03-17 23:40:04 [INFO]: Epoch 006 - training loss: 0.4483, validating loss: 0.8644
2024-03-17 23:40:23 [INFO]: Epoch 007 - training loss: 0.4413, validating loss: 0.8632
2024-03-17 23:40:43 [INFO]: Epoch 008 - training loss: 0.4290, validating loss: 0.8687
2024-03-17 23:40:43 [INFO]: Exceeded the training patience. Terminating the training procedure...
2024-03-17 23:40:43 [INFO]: Finished training.
2024-03-17 23:40:43 [INFO]: Saved the model to tutorial_results/imputation/mrnn/20240317_T233757/MRNN.pypots


In [34]:
# the testing stage: impute both the originally-missing and the artificially-missing values in the test set
mrnn_results = mrnn.predict(dataset_for_testing)
# the imputed data is read from the "imputation" entry of the returned results
mrnn_imputation = mrnn_results["imputation"]

In [35]:
from pypots.utils.metrics import calc_mae

# calculate the mean absolute error between the M-RNN imputation and the ground truth;
# the indicating mask is passed so that, presumably, only the artificially-missing values
# are scored -- confirm against the calc_mae documentation
testing_mae = calc_mae(
    mrnn_imputation,
    physionet2012_dataset['test_X_ori'],
    physionet2012_dataset['test_X_indicating_mask'],
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


Testing mean absolute error: 0.6825


## 🚀 An example of **LOCF** for imputation

In [36]:
from pypots.imputation import LOCF

# initialize the LOCF imputer; no hyperparameters are passed here, so it is created with its defaults
locf = LOCF()


2024-03-17 23:40:47 [INFO]: No given device, using default device: cuda


In [37]:
# LOCF doesn't need to be trained, so you can skip this step and call predict() directly
# (as done in the next cell). fit() is called here only to mirror the workflow of the models above.
# NOTE(review): presumably fit() is a no-op for LOCF -- confirm against the PyPOTS documentation.

locf.fit(train_set=dataset_for_training, val_set=dataset_for_validating)




In [38]:
# the testing stage: impute both the originally-missing and the artificially-missing values in the test set
locf_results = locf.predict(dataset_for_testing)
# the imputed data is read from the "imputation" entry of the returned results
locf_imputation = locf_results["imputation"]

In [39]:
from pypots.utils.metrics import calc_mae

# calculate the mean absolute error between the LOCF imputation and the ground truth;
# the indicating mask is passed so that, presumably, only the artificially-missing values
# are scored -- confirm against the calc_mae documentation
testing_mae = calc_mae(
    locf_imputation,
    physionet2012_dataset['test_X_ori'],
    physionet2012_dataset['test_X_indicating_mask'],
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


Testing mean absolute error: 0.4091
