# 📑 Tutorials for PyPOTS Imputation Models

## 📀 Preparing the **PhysioNet-2012** dataset for this tutorial

In [1]:
from pypots.data.generating import gene_physionet2012

# Load the PhysioNet-2012 dataset
physionet2012_dataset = gene_physionet2012(
    artificially_missing_rate=0.1,
)

# Inspect the generated PhysioNet-2012 dataset: everything has already been prepared for you
# (data splitting, normalization, additional artificially-missing values for evaluation, etc.).
print(physionet2012_dataset.keys())


  from .autonotebook import tqdm as notebook_tqdm
2023-05-17 00:01:28 [INFO]: Loading the dataset physionet_2012 with TSDB (https://github.com/WenjieDu/Time_Series_Database)...
2023-05-17 00:01:28 [INFO]: Starting preprocessing physionet_2012...


Dataset physionet_2012 has already been downloaded. Processing directly...
Dataset physionet_2012 has already been cached. Loading from cache directly...
Loaded successfully!
dict_keys(['n_classes', 'n_steps', 'n_features', 'train_X', 'train_y', 'val_X', 'val_y', 'test_X', 'test_y', 'scaler', 'test_X_intact', 'test_X_indicating_mask', 'val_X_intact', 'val_X_indicating_mask'])


In [2]:
# Build the dictionaries used for training, validating, and testing.

# training: only the input series
dataset_for_training = dict(X=physionet2012_dataset['train_X'])

# validating: the input series plus the intact series and the indicating mask
dataset_for_validating = dict(
    X=physionet2012_dataset['val_X'],
    X_intact=physionet2012_dataset['val_X_intact'],
    indicating_mask=physionet2012_dataset['val_X_indicating_mask'],
)

# testing: only the input series
dataset_for_testing = dict(X=physionet2012_dataset['test_X'])


## 🚀 An example of **SAITS** for imputation

In [3]:
from pypots.optim import Adam
from pypots.imputation import SAITS

# initialize the SAITS model; n_steps and n_features are taken from the generated dataset
saits = SAITS(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    n_layers=2,
    d_model=256,
    d_inner=128,
    n_heads=4,
    d_k=64,
    d_v=64,
    dropout=0.1,
    attn_dropout=0.1,
    diagonal_attention_mask=True,  # otherwise the original self-attention mechanism will be applied
    # you can adjust the values of ORT_weight and MIT_weight to make the SAITS model focus more on one task.
    # Usually you can just leave them at the default values, i.e. 1.
    ORT_weight=1,
    MIT_weight=1,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to early stop the training if the validating loss doesn't decrease for 3 epochs.
    # You can leave it at its default of None to disable early stopping.
    patience=3,
    # specify the optimizer. Unlike torch.optim.Optimizer, you don't have to pass the model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initialize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.DataLoader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your data loading is a bottleneck to your model training speed
    num_workers=0,
    # here the device is explicitly set to 'cpu'. If you leave this argument to its default, PyPOTS will
    # automatically assign the best device for you. Set it to 'cpu' if you don't have CUDA devices.
    # You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices.
    device='cpu',
    # set the path for saving tensorboard and trained model files
    saving_path="tutorial_results/imputation/saits",
    # only save the best model after training has finished.
    # You can also set it to "better" to save the model whenever it performs better during training.
    model_saving_strategy="best",
)


2023-05-17 00:01:40 [INFO]: the trained model will be saved to tutorial_results/imputation/saits/20230517_T000140
2023-05-17 00:01:40 [INFO]: the tensorboard file will be saved to tutorial_results/imputation/saits/20230517_T000140/tensorboard
2023-05-17 00:01:40 [INFO]: Model initialized successfully with the number of trainable parameters: 1,378,358


In [4]:
# fit on the training set; the validating set is used to select the best model for testing in the next step
saits.fit(
    train_set=dataset_for_training,
    val_set=dataset_for_validating,
)


2023-05-17 00:02:05 [INFO]: epoch 0: training loss 0.7151, validating loss 0.3206
2023-05-17 00:02:31 [INFO]: epoch 1: training loss 0.5164, validating loss 0.3038
2023-05-17 00:02:55 [INFO]: epoch 2: training loss 0.4577, validating loss 0.2784
2023-05-17 00:03:21 [INFO]: epoch 3: training loss 0.4181, validating loss 0.2638
2023-05-17 00:03:46 [INFO]: epoch 4: training loss 0.3894, validating loss 0.2535
2023-05-17 00:04:11 [INFO]: epoch 5: training loss 0.3730, validating loss 0.2445
2023-05-17 00:04:36 [INFO]: epoch 6: training loss 0.3579, validating loss 0.2425
2023-05-17 00:05:00 [INFO]: epoch 7: training loss 0.3490, validating loss 0.2413
2023-05-17 00:05:24 [INFO]: epoch 8: training loss 0.3399, validating loss 0.2347
2023-05-17 00:05:49 [INFO]: epoch 9: training loss 0.3331, validating loss 0.2330
2023-05-17 00:05:49 [INFO]: Finished training.
2023-05-17 00:05:49 [INFO]: Saved the model to tutorial_results/imputation/saits/20230517_T000140/SAITS.pypots.


In [5]:
# the testing stage, impute the originally-missing values and artificially-missing values in the test set.
# dataset_for_testing only holds "X" (test_X); the result is scored against test_X_intact in the next cell.
saits_imputation = saits.impute(dataset_for_testing)


In [6]:
from pypots.utils.metrics import cal_mae

# score the SAITS imputation: mean absolute error on the ground truth (artificially-missing values)
testing_mae = cal_mae(
    saits_imputation,
    physionet2012_dataset['test_X_intact'],
    physionet2012_dataset['test_X_indicating_mask'],
)
print("Testing mean absolute error: %.4f" % testing_mae)


Testing mean absolute error: 0.2344


## 🚀 An example of **Transformer** for imputation

In [7]:
from pypots.optim import Adam
from pypots.imputation import Transformer

# initialize the Transformer model; n_steps and n_features are taken from the generated dataset
transformer = Transformer(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    n_layers=6,
    d_model=512,
    d_inner=256,
    n_heads=4,
    d_k=128,
    d_v=128,
    dropout=0.1,
    attn_dropout=0,
    # you can adjust the values of ORT_weight and MIT_weight to make the Transformer model focus more on one task.
    # Usually you can just leave them at the default values, i.e. 1.
    ORT_weight=1,
    MIT_weight=1,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to early stop the training if the validating loss doesn't decrease for 3 epochs.
    # You can leave it at its default of None to disable early stopping.
    patience=3,
    # specify the optimizer. Unlike torch.optim.Optimizer, you don't have to pass the model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initialize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.DataLoader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your data loading is a bottleneck to your model training speed
    num_workers=0,
    # here the device is explicitly set to 'cpu'. If you leave this argument to its default, PyPOTS will
    # automatically assign the best device for you. Set it to 'cpu' if you don't have CUDA devices.
    # You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices.
    device='cpu',
    # set the path for saving tensorboard and trained model files
    saving_path="tutorial_results/imputation/transformer",
    # only save the best model after training has finished.
    # You can also set it to "better" to save the model whenever it performs better during training.
    model_saving_strategy="best",
)


2023-05-17 00:05:51 [INFO]: the trained model will be saved to tutorial_results/imputation/transformer/20230517_T000551
2023-05-17 00:05:51 [INFO]: the tensorboard file will be saved to tutorial_results/imputation/transformer/20230517_T000551/tensorboard
2023-05-17 00:05:51 [INFO]: Model initialized successfully with the number of trainable parameters: 7,938,597


In [8]:
# fit on the training set; the validating set is used to select the best model for testing in the next step
transformer.fit(
    train_set=dataset_for_training,
    val_set=dataset_for_validating,
)


2023-05-17 00:06:50 [INFO]: epoch 0: training loss 0.8075, validating loss 0.5036
2023-05-17 00:07:48 [INFO]: epoch 1: training loss 0.6279, validating loss 0.4808
2023-05-17 00:08:41 [INFO]: epoch 2: training loss 0.5863, validating loss 0.4693
2023-05-17 00:09:33 [INFO]: epoch 3: training loss 0.5682, validating loss 0.4655
2023-05-17 00:10:23 [INFO]: epoch 4: training loss 0.5607, validating loss 0.4644
2023-05-17 00:11:14 [INFO]: epoch 5: training loss 0.5546, validating loss 0.4632
2023-05-17 00:12:03 [INFO]: epoch 6: training loss 0.5503, validating loss 0.4617
2023-05-17 00:12:53 [INFO]: epoch 7: training loss 0.5479, validating loss 0.4604
2023-05-17 00:13:42 [INFO]: epoch 8: training loss 0.5463, validating loss 0.4605
2023-05-17 00:14:32 [INFO]: epoch 9: training loss 0.5426, validating loss 0.4583
2023-05-17 00:14:32 [INFO]: Finished training.
2023-05-17 00:14:32 [INFO]: Saved the model to tutorial_results/imputation/transformer/20230517_T000551/Transformer.pypots.


In [9]:
# the testing stage, impute the originally-missing values and artificially-missing values in the test set.
# dataset_for_testing only holds "X" (test_X); the result is scored against test_X_intact in the next cell.
transformer_imputation = transformer.impute(dataset_for_testing)


In [10]:
from pypots.utils.metrics import cal_mae

# score the Transformer imputation: mean absolute error on the ground truth (artificially-missing values)
testing_mae = cal_mae(
    transformer_imputation,
    physionet2012_dataset['test_X_intact'],
    physionet2012_dataset['test_X_indicating_mask'],
)
print("Testing mean absolute error: %.4f" % testing_mae)


Testing mean absolute error: 0.4655


## 🚀 An example of **BRITS** for imputation

In [11]:
from pypots.optim import Adam
from pypots.imputation import BRITS

# initialize the BRITS model; n_steps and n_features are taken from the generated dataset
brits = BRITS(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    rnn_hidden_size=128,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to early stop the training if the validating loss doesn't decrease for 3 epochs.
    # You can leave it at its default of None to disable early stopping.
    patience=3,
    # specify the optimizer. Unlike torch.optim.Optimizer, you don't have to pass the model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initialize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.DataLoader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your data loading is a bottleneck to your model training speed
    num_workers=0,
    # here the device is explicitly set to 'cpu'. If you leave this argument to its default, PyPOTS will
    # automatically assign the best device for you. Set it to 'cpu' if you don't have CUDA devices.
    # You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices.
    device='cpu',
    # set the path for saving tensorboard and trained model files
    saving_path="tutorial_results/imputation/brits",
    # only save the best model after training has finished.
    # You can also set it to "better" to save the model whenever it performs better during training.
    model_saving_strategy="best",
)


2023-05-17 00:14:36 [INFO]: the trained model will be saved to tutorial_results/imputation/brits/20230517_T001436
2023-05-17 00:14:36 [INFO]: the tensorboard file will be saved to tutorial_results/imputation/brits/20230517_T001436/tensorboard
2023-05-17 00:14:36 [INFO]: Model initialized successfully with the number of trainable parameters: 239,344


In [12]:
# fit on the training set; the validating set is used to select the best model for testing in the next step
brits.fit(
    train_set=dataset_for_training,
    val_set=dataset_for_validating,
)


2023-05-17 00:15:04 [INFO]: epoch 0: training loss 0.9379, validating loss 0.3447
2023-05-17 00:15:25 [INFO]: epoch 1: training loss 0.7300, validating loss 0.3031
2023-05-17 00:15:45 [INFO]: epoch 2: training loss 0.6804, validating loss 0.2873
2023-05-17 00:16:04 [INFO]: epoch 3: training loss 0.6574, validating loss 0.2789
2023-05-17 00:16:24 [INFO]: epoch 4: training loss 0.6424, validating loss 0.2716
2023-05-17 00:16:44 [INFO]: epoch 5: training loss 0.6310, validating loss 0.2673
2023-05-17 00:17:04 [INFO]: epoch 6: training loss 0.6217, validating loss 0.2649
2023-05-17 00:17:25 [INFO]: epoch 7: training loss 0.6141, validating loss 0.2629
2023-05-17 00:17:45 [INFO]: epoch 8: training loss 0.6075, validating loss 0.2598
2023-05-17 00:18:06 [INFO]: epoch 9: training loss 0.6028, validating loss 0.2585
2023-05-17 00:18:06 [INFO]: Finished training.
2023-05-17 00:18:06 [INFO]: Saved the model to tutorial_results/imputation/brits/20230517_T001436/BRITS.pypots.


In [13]:
# the testing stage, impute the originally-missing values and artificially-missing values in the test set.
# dataset_for_testing only holds "X" (test_X); the result is scored against test_X_intact in the next cell.
brits_imputation = brits.impute(dataset_for_testing)


In [14]:
from pypots.utils.metrics import cal_mae

# score the BRITS imputation: mean absolute error on the ground truth (artificially-missing values)
testing_mae = cal_mae(
    brits_imputation,
    physionet2012_dataset['test_X_intact'],
    physionet2012_dataset['test_X_indicating_mask'],
)
print("Testing mean absolute error: %.4f" % testing_mae)


Testing mean absolute error: 0.2576


## 🚀 An example of **M-RNN** for imputation

In [None]:
from pypots.optim import Adam
from pypots.imputation import MRNN
# NOTE(review): cal_mae is not used in this cell; the evaluation cell further down imports it again.
from pypots.utils.metrics import cal_mae

# initialize the M-RNN model; n_steps and n_features are taken from the generated dataset
mrnn = MRNN(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    rnn_hidden_size=128,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to early stop the training if the validating loss doesn't decrease for 3 epochs.
    # You can leave it at its default of None to disable early stopping.
    patience=3,
    # specify the optimizer. Unlike torch.optim.Optimizer, you don't have to pass the model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initialize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.DataLoader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your data loading is a bottleneck to your model training speed
    num_workers=0,
    # Set it to None to use the default device (will use CPU if you don't have CUDA devices).
    # You can also set it to 'cpu' or 'cuda' explicitly, or ['cuda:0', 'cuda:1'] if you have multiple CUDA devices.
    device=None,
    # set the path for saving tensorboard and trained model files
    saving_path="tutorial_results/imputation/mrnn",
    # only save the best model after training has finished.
    # You can also set it to "better" to save the model whenever it performs better during training.
    model_saving_strategy="best",
)


In [12]:
# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
# NOTE(review): the log saved under this cell is a copy of the BRITS training log (it even reports saving
# .../brits/.../BRITS.pypots) — re-run this cell to refresh the output for M-RNN.
mrnn.fit(train_set=dataset_for_training, val_set=dataset_for_validating)


2023-05-17 00:15:04 [INFO]: epoch 0: training loss 0.9379, validating loss 0.3447
2023-05-17 00:15:25 [INFO]: epoch 1: training loss 0.7300, validating loss 0.3031
2023-05-17 00:15:45 [INFO]: epoch 2: training loss 0.6804, validating loss 0.2873
2023-05-17 00:16:04 [INFO]: epoch 3: training loss 0.6574, validating loss 0.2789
2023-05-17 00:16:24 [INFO]: epoch 4: training loss 0.6424, validating loss 0.2716
2023-05-17 00:16:44 [INFO]: epoch 5: training loss 0.6310, validating loss 0.2673
2023-05-17 00:17:04 [INFO]: epoch 6: training loss 0.6217, validating loss 0.2649
2023-05-17 00:17:25 [INFO]: epoch 7: training loss 0.6141, validating loss 0.2629
2023-05-17 00:17:45 [INFO]: epoch 8: training loss 0.6075, validating loss 0.2598
2023-05-17 00:18:06 [INFO]: epoch 9: training loss 0.6028, validating loss 0.2585
2023-05-17 00:18:06 [INFO]: Finished training.
2023-05-17 00:18:06 [INFO]: Saved the model to tutorial_results/imputation/brits/20230517_T001436/BRITS.pypots.


In [13]:
# the testing stage, impute the originally-missing values and artificially-missing values in the test set.
# dataset_for_testing only holds "X" (test_X); the result is scored against test_X_intact in the next cell.
mrnn_imputation = mrnn.impute(dataset_for_testing)


In [14]:
from pypots.utils.metrics import cal_mae

# score the M-RNN imputation: mean absolute error on the ground truth (artificially-missing values)
# NOTE(review): the result saved under this cell (0.2576) is identical to the BRITS result shown earlier
# and is presumably stale — re-run to confirm.
testing_mae = cal_mae(
    mrnn_imputation,
    physionet2012_dataset['test_X_intact'],
    physionet2012_dataset['test_X_indicating_mask'],
)
print("Testing mean absolute error: %.4f" % testing_mae)


Testing mean absolute error: 0.2576


## 🚀 An example of **LOCF** for imputation

In [15]:
from pypots.imputation import LOCF

# initialize the model
# `nan` sets the value used to impute data missing at the beginning of a sequence,
# i.e. values that cannot be imputed with the LOCF mechanism
locf = LOCF(nan=0)


In [16]:
# LOCF doesn't need to be trained; the imputation itself is done by impute() in the next cell.
# NOTE(review): this fit() call is therefore presumably redundant and kept only to mirror the trainable
# models above — confirm against the LOCF documentation and drop it if so.

locf.fit(train_set=dataset_for_training, val_set=dataset_for_validating)




In [17]:
# the testing stage, impute the originally-missing values and artificially-missing values in the test set.
# dataset_for_testing only holds "X" (test_X); the result is scored against test_X_intact in the next cell.
locf_imputation = locf.impute(dataset_for_testing)


In [18]:
from pypots.utils.metrics import cal_mae

# score the LOCF imputation: mean absolute error on the ground truth (artificially-missing values)
testing_mae = cal_mae(
    locf_imputation,
    physionet2012_dataset['test_X_intact'],
    physionet2012_dataset['test_X_indicating_mask'],
)
print("Testing mean absolute error: %.4f" % testing_mae)


Testing mean absolute error: 0.4110
