In [1]:
from transformers import PatchTSMixerConfig, PatchTSMixerForPretraining, Trainer, TrainingArguments
import torch
from torch.utils.data import random_split
from torch.utils.data import Dataset
import os
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


### Data prep

Generate and prepare dummy data to test the pretraining.

In [2]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

class Dataset_ETT_minute(Dataset):
    def __init__(self, root_path='/dccstor/dnn_forecasting/FM/data/ETDataset/ETT-small/', split='train', size=None,
                 features='M', data_path='ETTm1.csv',
                 target='OT', scale=True, timeenc=0, freq='t',
                 use_time_features=False
                 ):
        # size [seq_len, label_len, pred_len]
        # info
        if size == None:
            self.seq_len = 24 * 4 * 4
            self.label_len = 24 * 4
            self.pred_len = 24 * 4
        else:
            self.seq_len = size[0]
            self.label_len = size[1]
            self.pred_len = size[2]
        # init
        assert split in ['train', 'test', 'val']
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[split]

        self.features = features
        self.target = target
        self.scale = scale
        self.timeenc = timeenc
        self.freq = freq
        self.use_time_features = use_time_features

        self.root_path = root_path
        self.data_path = data_path
        self.__read_data__()

    def __read_data__(self):
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path,
                                          self.data_path))

        border1s = [0, 12 * 30 * 24 * 4 - self.seq_len, 12 * 30 * 24 * 4 + 4 * 30 * 24 * 4 - self.seq_len]
        border2s = [12 * 30 * 24 * 4, 12 * 30 * 24 * 4 + 4 * 30 * 24 * 4, 12 * 30 * 24 * 4 + 8 * 30 * 24 * 4]
        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]

        if self.features == 'M' or self.features == 'MS':
            cols_data = df_raw.columns[1:]
            df_data = df_raw[cols_data]
        elif self.features == 'S':
            df_data = df_raw[[self.target]]

        if self.scale:
            train_data = df_data[border1s[0]:border2s[0]]
            self.scaler.fit(train_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values

        df_stamp = df_raw[['date']][border1:border2]
        df_stamp['date'] = pd.to_datetime(df_stamp.date)
        if self.timeenc == 0:
            df_stamp['month'] = df_stamp.date.apply(lambda row: row.month, 1)
            df_stamp['day'] = df_stamp.date.apply(lambda row: row.day, 1)
            df_stamp['weekday'] = df_stamp.date.apply(lambda row: row.weekday(), 1)
            df_stamp['hour'] = df_stamp.date.apply(lambda row: row.hour, 1)
            df_stamp['minute'] = df_stamp.date.apply(lambda row: row.minute, 1)
            df_stamp['minute'] = df_stamp.minute.map(lambda x: x // 15)
            data_stamp = df_stamp.drop(['date'], 1).values
        elif self.timeenc == 1:
            data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq)
            data_stamp = data_stamp.transpose(1, 0)

        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp

    def __getitem__(self, index):
        s_begin = index
        s_end = s_begin + self.seq_len
        r_begin = s_end - self.label_len
        r_end = r_begin + self.label_len + self.pred_len

        seq_x = self.data_x[s_begin:s_end]
        seq_y = self.data_y[r_begin:r_end]
        seq_x_mark = self.data_stamp[s_begin:s_end]
        seq_y_mark = self.data_stamp[r_begin:r_end]

#         if self.use_time_features: return _torch(seq_x, seq_y, seq_x_mark, seq_y_mark)
#         else: return _torch(seq_x, seq_y)
        return {"context_values": torch.Tensor(seq_x), "target_values": torch.Tensor(seq_y)}

    def __len__(self):
        return len(self.data_x) - self.seq_len - self.pred_len + 1

    def inverse_transform(self, data):
        return self.scaler.inverse_transform(data)

In [3]:
xx=pd.read_csv("/dccstor/dnn_forecasting/FM/data/ETDataset/ETT-small/ETTm1.csv")
xx

Unnamed: 0,date,HUFL,HULL,MUFL,MULL,LUFL,LULL,OT
0,2016-07-01 00:00:00,5.827,2.009,1.599,0.462,4.203,1.340,30.531000
1,2016-07-01 00:15:00,5.760,2.076,1.492,0.426,4.264,1.401,30.459999
2,2016-07-01 00:30:00,5.760,1.942,1.492,0.391,4.234,1.310,30.038000
3,2016-07-01 00:45:00,5.760,1.942,1.492,0.426,4.234,1.310,27.013000
4,2016-07-01 01:00:00,5.693,2.076,1.492,0.426,4.142,1.371,27.787001
...,...,...,...,...,...,...,...,...
69675,2018-06-26 18:45:00,9.310,3.550,5.437,1.670,3.868,1.462,9.567000
69676,2018-06-26 19:00:00,10.114,3.550,6.183,1.564,3.716,1.462,9.567000
69677,2018-06-26 19:15:00,10.784,3.349,7.000,1.635,3.746,1.432,9.426000
69678,2018-06-26 19:30:00,11.655,3.617,7.533,1.706,4.173,1.523,9.426000


In [4]:
SEQ_LEN = 512
FORECAST_LEN = 96
SIZE = [SEQ_LEN, 0, FORECAST_LEN]

In [5]:
dset_train = Dataset_ETT_minute(split="train", size=SIZE)
dset_val = Dataset_ETT_minute(split="val", size=SIZE)
dset_test = Dataset_ETT_minute(split="test", size=SIZE)

  data_stamp = df_stamp.drop(['date'], 1).values
  data_stamp = df_stamp.drop(['date'], 1).values
  data_stamp = df_stamp.drop(['date'], 1).values


In [6]:
dd=dset_val.__getitem__(0)
dd["context_values"].shape, dd["target_values"].shape

(torch.Size([512, 7]), torch.Size([96, 7]))

In [7]:
n_features = 7
seq_len = SEQ_LEN
patch_len = 16
stride = patch_len//2

In [8]:
# num_patches should be (no need to specify)
num_patches = seq_len//patch_len
num_patches

32

### Pretrain the model with HF trainer

In [9]:
config = PatchTSMixerConfig(
    in_channels=n_features,
    seq_len=seq_len,
    patch_len=patch_len,
    stride=stride,
    num_features=32,
    num_layers=3,
    dropout=0.7,
    mode="common_channel",
    revin=True,
    expansion_factor=2,
    head_dropout=0.7,
)

In [10]:
model = PatchTSMixerForPretraining(config)

Define the training arguments and train the model.

In [11]:
training_args = TrainingArguments(
        output_dir='./checkpoint',
        overwrite_output_dir=True,
        learning_rate=0.001,
        num_train_epochs=100,
        do_eval=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=256,
        per_device_eval_batch_size=1024,
        report_to='tensorboard',
        save_strategy='epoch',
        logging_strategy="epoch",
        save_total_limit=3,
        logging_dir='./logs',  # Make sure to specify a logging directory
        log_level="info",  # Explicitly set the logging level
        
#         load_best_model_at_end=True
#         no_cuda=True,
#         use_mps_device = False,
    )

In [12]:
# import evaluate
# metric = evaluate.load("mse")
# def compute_metrics(eval_pred):
#     print("here")
#     logits, labels = eval_pred
#     loss = metric(logits, labels)
#     print("val mse =", loss)
#     return {"mse": loss}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
#     compute_metrics=compute_metrics
)

In [13]:
trainer.train()

***** Running training *****
  Num examples = 33,953
  Num Epochs = 100
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 13,300
  Number of trainable parameters = 77,491
The following columns in the training set don't have a corresponding argument in `PatchTSMixerForPretraining.forward` and have been ignored: target_values. If target_values are not expected by `PatchTSMixerForPretraining.forward`,  you can safely ignore this message.


Epoch,Training Loss,Validation Loss
1,0.6288,0.390499
2,0.4737,0.341388
3,0.4511,0.31098
4,0.4358,0.299069
5,0.4271,0.286348
6,0.4202,0.274971
7,0.4138,0.266568
8,0.4092,0.259803
9,0.4064,0.258976
10,0.4037,0.261824


***** Running Evaluation *****
  Num examples = 11425
  Batch size = 1024
The following columns in the evaluation set don't have a corresponding argument in `PatchTSMixerForPretraining.forward` and have been ignored: target_values. If target_values are not expected by `PatchTSMixerForPretraining.forward`,  you can safely ignore this message.
Saving model checkpoint to ./checkpoint/checkpoint-133
Configuration saved in ./checkpoint/checkpoint-133/config.json
Model weights saved in ./checkpoint/checkpoint-133/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 11425
  Batch size = 1024
The following columns in the evaluation set don't have a corresponding argument in `PatchTSMixerForPretraining.forward` and have been ignored: target_values. If target_values are not expected by `PatchTSMixerForPretraining.forward`,  you can safely ignore this message.
Saving model checkpoint to ./checkpoint/checkpoint-266
Configuration saved in ./checkpoint/checkpoint-266/config.json
Model w

TrainOutput(global_step=13300, training_loss=0.3848540278843471, metrics={'train_runtime': 536.9971, 'train_samples_per_second': 6322.753, 'train_steps_per_second': 24.767, 'total_flos': 5657814055219200.0, 'train_loss': 0.3848540278843471, 'epoch': 100.0})

In [14]:
out = trainer.evaluate(dset_val)
out

***** Running Evaluation *****
  Num examples = 11425
  Batch size = 1024
The following columns in the evaluation set don't have a corresponding argument in `PatchTSMixerForPretraining.forward` and have been ignored: target_values. If target_values are not expected by `PatchTSMixerForPretraining.forward`,  you can safely ignore this message.


{'eval_loss': 0.27754929661750793,
 'eval_runtime': 0.5696,
 'eval_samples_per_second': 20058.763,
 'eval_steps_per_second': 21.068,
 'epoch': 100.0}

In [15]:
out = trainer.evaluate(dset_test)
out

***** Running Evaluation *****
  Num examples = 11425
  Batch size = 1024
The following columns in the evaluation set don't have a corresponding argument in `PatchTSMixerForPretraining.forward` and have been ignored: target_values. If target_values are not expected by `PatchTSMixerForPretraining.forward`,  you can safely ignore this message.


{'eval_loss': 0.3110661804676056,
 'eval_runtime': 0.6789,
 'eval_samples_per_second': 16829.267,
 'eval_steps_per_second': 17.676,
 'epoch': 100.0}

In [16]:
! mkdir -p pytest_data/ettm1

In [17]:
trainer.save_model('pytest_data/ettm1/patchtsmixer_pretrained_ettm1')

Saving model checkpoint to pytest_data/ettm1/patchtsmixer_pretrained_ettm1
Configuration saved in pytest_data/ettm1/patchtsmixer_pretrained_ettm1/config.json
Model weights saved in pytest_data/ettm1/patchtsmixer_pretrained_ettm1/pytorch_model.bin


# Use the pretrained model to finetune for a forecasting task
## TODO: Loading the backbone weights

In [18]:
from transformers.models.patchtsmixer.modeling_patchtsmixer import PatchTSMixerForForecasting

config.update({"forecast_len": FORECAST_LEN})
finetune_model = PatchTSMixerForForecasting.from_pretrained("pytest_data/ettm1/patchtsmixer_pretrained_ettm1", config=config)


loading weights file pytest_data/ettm1/patchtsmixer_pretrained_ettm1/pytorch_model.bin
Some weights of the model checkpoint at pytest_data/ettm1/patchtsmixer_pretrained_ettm1 were not used when initializing PatchTSMixerForForecasting: ['head.head.base_pt_block.1.weight', 'head.head.base_pt_block.1.bias']
- This IS expected if you are initializing PatchTSMixerForForecasting from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing PatchTSMixerForForecasting from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of PatchTSMixerForForecasting were not initialized from the model checkpoint at pytest_data/ettm1/patchtsmixer_pretrained_ettm1 and are newly initialized: ['head.head.base_forecast_block.1.bias'

In [19]:
finetune_args = TrainingArguments(
        output_dir='./checkpoint_ftune',
        overwrite_output_dir=True,
        learning_rate=0.0001,
        num_train_epochs=100,
        do_eval=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=1024,
        report_to='tensorboard',
        save_strategy='epoch',
        logging_strategy="epoch",
        save_total_limit=3,
        logging_dir='./logs_ftune',  # Make sure to specify a logging directory
        load_best_model_at_end=True,  # Load the best model when training ends
        metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
        greater_is_better=False,  # For loss
    )

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices


In [20]:
from transformers import EarlyStoppingCallback

# Create the early stopping callback
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=10,  # Number of epochs with no improvement after which to stop
    early_stopping_threshold=0.0001,  # Minimum improvement required to consider as improvement
)

finetune_trainer = Trainer(
    model=finetune_model,
    args=finetune_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
    callbacks=[early_stopping_callback]
)

In [21]:
finetune_trainer.train()

***** Running training *****
  Num examples = 33,953
  Num Epochs = 100
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 424,500
  Number of trainable parameters = 270,595


Epoch,Training Loss,Validation Loss
1,0.3251,0.443362
2,0.3005,0.442807
3,0.2967,0.436477
4,0.295,0.438686
5,0.2942,0.747019
6,0.293,0.443275
7,0.2926,0.545229
8,0.2915,0.779547
9,0.2904,0.566062
10,0.2903,0.614435


***** Running Evaluation *****
  Num examples = 11425
  Batch size = 1024
Saving model checkpoint to ./checkpoint_ftune/checkpoint-4245
Configuration saved in ./checkpoint_ftune/checkpoint-4245/config.json
Model weights saved in ./checkpoint_ftune/checkpoint-4245/pytorch_model.bin
Deleting older checkpoint [checkpoint_ftune/checkpoint-3024] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 11425
  Batch size = 1024
Saving model checkpoint to ./checkpoint_ftune/checkpoint-8490
Configuration saved in ./checkpoint_ftune/checkpoint-8490/config.json
Model weights saved in ./checkpoint_ftune/checkpoint-8490/pytorch_model.bin
Deleting older checkpoint [checkpoint_ftune/checkpoint-4032] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 11425
  Batch size = 1024
Saving model checkpoint to ./checkpoint_ftune/checkpoint-12735
Configuration saved in ./checkpoint_ftune/checkpoint-12735/config.json
Model weights saved in ./checkpoint_ftune/checkp

TrainOutput(global_step=55185, training_loss=0.295167497678264, metrics={'train_runtime': 747.3511, 'train_samples_per_second': 4543.112, 'train_steps_per_second': 568.006, 'total_flos': 2568387364408320.0, 'train_loss': 0.295167497678264, 'epoch': 13.0})

In [22]:
finetune_trainer.evaluate(dset_test)

***** Running Evaluation *****
  Num examples = 11425
  Batch size = 1024


{'eval_loss': 0.33519473671913147,
 'eval_runtime': 0.5992,
 'eval_samples_per_second': 19067.15,
 'eval_steps_per_second': 20.027,
 'epoch': 13.0}