In [9]:
import os
import glob
import gc

import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader

from transformers import (
    PatchTSTConfig, PatchTSTForPrediction,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
from datasets import Dataset

In [10]:
data = "coin"
output_dir = "./saved_models"
LOG_DIR = os.path.join('logstf', data)
os.makedirs(LOG_DIR, exist_ok=True)
loss_name = "mse"
learning_rate = 5e-5

In [11]:
## target domain
target_X = pd.read_csv(f"../data/{data}/train_input_7.csv").iloc[:, 1:].values.astype(np.float32)

np.random.seed(2)
random_indices1 = np.random.choice(pd.read_csv("../data/M4_train.csv").iloc[:, (1):].index,
                                   size=target_X.shape[0] * 20, replace=True)

X_data = pd.read_csv("../data/M4_train.csv").iloc[:, 1 + (24 * 0):].loc[random_indices1].values.astype(np.float32)
y_data = pd.read_csv("../data/M4_test.csv").iloc[:, 1:].loc[random_indices1].values.astype(np.float32)

In [12]:
X_data.shape[1]

168

In [13]:
TSTconfig = PatchTSTConfig(
    num_input_channels = 1,
    context_length = X_data.shape[1],
    prediction_length = y_data.shape[1],

    patch_length = 24,
    patch_stride = 24,
    d_model = 256,
    num_attention_heads = 8,
    num_hidden_layers = 8,
    ffn_dim = 1024,
    dropout = 0.2,
    head_dropout = 0.2,
    pooling_type = None,
    channel_attention = False,
    scaling = "std",
    loss = loss_name,
    pre_norm = True,
    do_mask_input = False
)

In [14]:
model = PatchTSTForPrediction(TSTconfig)

In [19]:
np.random.seed(42)
select = np.random.choice(len(X_data), size=len(X_data), replace=True)
X_bootstrap = X_data[select]
y_bootstrap = y_data[select]

val_split_index = int(len(X_bootstrap) * 0.8)
X_train, X_valid = X_bootstrap[:val_split_index], X_bootstrap[val_split_index:]
y_train, y_valid = y_bootstrap[:val_split_index], y_bootstrap[val_split_index:]

In [20]:
def create_hf_dataset(x, y):
    x_list = [s[..., np.newaxis] for s in x]    ## (N, 168) -> (N, 168, 1)
    y_list = [s[..., np.newaxis] for s in y]    ## (N, 24) -> (N, 24, 1)

    data_dict = {
        "past_values": x_list,
        "future_values": y_list
    }

    return Dataset.from_dict(data_dict)

In [21]:
train_dataset = create_hf_dataset(X_train, y_train)
test_dataset = create_hf_dataset(X_valid, y_valid)

In [114]:
training_args = TrainingArguments(
    output_dir = output_dir,
    overwrite_output_dir = True,
    learning_rate = learning_rate,
    num_train_epochs = 2000,
    do_eval = True,
    eval_strategy = "epoch",
    per_device_train_batch_size = 256,
    per_device_eval_batch_size = 256,
    dataloader_num_workers = 16,
    save_strategy = "epoch",
    logging_strategy = "epoch",
    save_total_limit = 1,
    load_best_model_at_end = True,
    metric_for_best_model = "eval_loss",
    greater_is_better = False,
    label_names = ["future_values"]
)

early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience = 15,
    early_stopping_threshold = 0.001
)

In [115]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    callbacks = [early_stopping_callback]
)

In [116]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1712831.1304,1196681.25
2,1043972.6087,806034.5625
3,898464.2609,724837.5625
4,805465.5652,679294.5
5,745685.7391,633064.0625
6,719852.6087,606726.75
7,707913.1739,635188.6875
8,657008.2609,604339.8125
9,578306.6957,627974.0625
10,527039.7391,545414.8125


TrainOutput(global_step=3818, training_loss=310956.45272393926, metrics={'train_runtime': 272.7735, 'train_samples_per_second': 84817.633, 'train_steps_per_second': 337.276, 'total_flos': 6164372611934208.0, 'train_loss': 310956.45272393926, 'epoch': 83.0})

In [117]:
# 2. test_dataset으로 DataLoader 생성
# (test_dataset은 'past_values'와 'future_values'를 포함하는 Hf Dataset)
test_dataset.set_format(type='torch', columns=['past_values', 'future_values'])
test_loader = DataLoader(test_dataset, batch_size=64)

unscaled_preds = []
unscaled_labels = []

with torch.no_grad():
    for batch in test_loader:
        # 모델의 forward pass 실행
        outputs = trainer.model(
            past_values=batch['past_values'].to("cuda:0")
        )

        if isinstance(outputs.prediction_outputs, tuple):
            unscaled_preds.append(outputs.prediction_outputs[0])
        else:
            unscaled_preds.append(outputs.prediction_outputs) # (튜플이 아닌 경우 대비)

        unscaled_labels.append(batch['future_values'])

In [118]:
yyhat = torch.concat(unscaled_preds).to("cpu")
yy = torch.concat(unscaled_labels)

print(f"test RMSE: {torch.sqrt(mseLoss(yyhat, yy))}")
print(f"test MAE: {maeLoss(yyhat, yy)}")
print(f"test SMAPE: {smape(yy, yyhat)}")

test RMSE: 582.0230712890625
test MAE: 248.49415588378906
test SMAPE: 6.983912944793701


In [None]:
## 로그 파일 저장: train/test loss
log_data = trainer.state.log_history
df = pd.DataFrame(log_data)

df_train = df[df['loss'].notna()][['epoch', 'loss']]
df_eval = df[df['eval_loss'].notna()][['epoch', 'eval_loss']]

final_df = pd.merge(df_train, df_eval, on="epoch", how="outer").assign(epoch = lambda _df: _df.epoch.astype(int))

In [14]:
final_df.to_csv(os.path.join(LOG_DIR, f"pretrain_{loss_name}_model{2}.csv"), index = False)

In [24]:
trainer.metric_for_best_model

AttributeError: 'Trainer' object has no attribute 'metric_for_best_model'

In [121]:
torch.save(trainer.model.state_dict(), os.path.join(output_dir, f"model_{loss_name}_{1}.pth"))

In [15]:
## model name setting
checkpoint_folder = glob.glob(os.path.join(output_dir, "checkpoint-*"))[0]
os.rename(checkpoint_folder, os.path.join(output_dir, f"model_{loss_name}_{2}"))

In [16]:
torch.cuda.empty_cache()
gc.collect()

585

In [25]:
mseLoss = torch.nn.MSELoss()
maeLoss = torch.nn.L1Loss()

In [76]:
best_model_path = os.path.join(output_dir, f"model_{loss_name}_{1}")
best_model = PatchTSTForPrediction.from_pretrained(best_model_path)
best_model.eval()

# 2. test_dataset으로 DataLoader 생성
# (test_dataset은 'past_values'와 'future_values'를 포함하는 Hf Dataset)
test_dataset.set_format(type='torch', columns=['past_values', 'future_values'])
test_loader = DataLoader(test_dataset, batch_size=64)

unscaled_preds = []
unscaled_labels = []

with torch.no_grad():
    for batch in test_loader:
        # 모델의 forward pass 실행
        outputs = best_model(
            past_values=batch['past_values']
        )

        if isinstance(outputs.prediction_outputs, tuple):
            unscaled_preds.append(outputs.prediction_outputs[0])
        else:
            unscaled_preds.append(outputs.prediction_outputs) # (튜플이 아닌 경우 대비)

        unscaled_labels.append(batch['future_values'])

In [77]:
yyhat = torch.concat(unscaled_preds)
yy = torch.concat(unscaled_labels)

In [23]:
def smape(yy, yyhat):
    numerator = 100*abs(yy - yyhat)
    denominator = (abs(yy) + abs(yyhat))/2
    smape = torch.mean(numerator / denominator)
    return smape

In [79]:
print(f"test RMSE: {torch.sqrt(mseLoss(yyhat, yy))}")
print(f"test MAE: {maeLoss(yyhat, yy)}")
print(f"test SMAPE: {smape(yy, yyhat)}")

test RMSE: 538.7234497070312
test MAE: 242.28775024414062
test SMAPE: 6.946401119232178


In [32]:
torch.cuda.empty_cache()
gc.collect()

2364

In [None]:
model.load_state_dict(torch.load(os.path.join(output_dir, f"model_{loss_name}_{1}.pth"), weights_only = True))