In [1]:
import os
import gc

import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader

from transformers import (
    PatchTSTConfig, PatchTSTForPrediction,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
from datasets import Dataset

2025-11-02 14:17:21.982089: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [157]:
data = "coin"
output_dir = "./pretrained/MAE"
logging_dir = "./logs/MAE"
loss = "mae"
learning_rate = 5e-6

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [107]:
## target domain
target_X = pd.read_csv(f"../data/{data}/train_input_7.csv").iloc[:, 1:].values.astype(np.float32)
target_y = pd.read_csv(f"../data/{data}/train_output_7.csv").iloc[:, 1:].values.astype(np.float32)

target_X_val = target_X[-round(target_X.shape[0] * 0.2):, :].astype(np.float32)
target_y_val = target_y[-round(target_y.shape[0] * 0.2):].astype(np.float32)
target_X = target_X[:-round(target_X.shape[0] * 0.2), :].astype(np.float32)
target_y = target_y[:-round(target_y.shape[0] * 0.2)].astype(np.float32)

test_X  = pd.read_csv(f"../data/{data}/val_input_7.csv").iloc[:, 1:].values.astype(np.float32)
test_y  = pd.read_csv(f"../data/{data}/val_output_7.csv").iloc[:, 1:].values.astype(np.float32)

np.random.seed(2)
random_indices1 = np.random.choice(pd.read_csv("../data/M4_train.csv").iloc[:, (1):].index,
                                   size=target_X.shape[0] * 20, replace=True)

X_data = pd.read_csv("../data/M4_train.csv").iloc[:, 1 + (24 * 0):].loc[random_indices1].values.astype(np.float32)
y_data = pd.read_csv("../data/M4_test.csv").iloc[:, 1:].loc[random_indices1].values.astype(np.float32)

In [4]:
def create_hf_dataset(x, y):
    x_list = [s[..., np.newaxis] for s in x]    ## (N, 168) -> (N, 168, 1)
    y_list = [s[..., np.newaxis] for s in y]    ## (N, 24) -> (N, 24, 1)

    data_dict = {
        "past_values": x_list,
        "future_values": y_list
    }

    return Dataset.from_dict(data_dict)

In [16]:
np.random.seed(42)
select = np.random.choice(len(X_data), size=len(X_data), replace=True)
X_bootstrap = X_data[select]
y_bootstrap = y_data[select]

val_split_index = int(len(X_bootstrap) * 0.8)
X_train, X_valid = X_bootstrap[:val_split_index], X_bootstrap[val_split_index:]
y_train, y_valid = y_bootstrap[:val_split_index], y_bootstrap[val_split_index:]

train_dataset = create_hf_dataset(X_train, y_train)
test_dataset = create_hf_dataset(X_valid, y_valid)

In [5]:
best_model_path = "./pretrained/checkpoint-12851" # 베스트 모델 경로
best_model = PatchTSTForPrediction.from_pretrained(best_model_path)

In [8]:
pretrained_body = best_model.model
config = best_model.config

In [75]:
N_PATCHES = config.patch_length
D_MODEL = config.d_model
T_OUT = target_y.shape[1]
C_NEW = 128

In [169]:
train_dataset = create_hf_dataset(target_X, target_y)
val_dataset = create_hf_dataset(target_X_val, target_y_val)
test_dataset = create_hf_dataset(test_X, test_y)

train_dataset.set_format(type='torch', columns=['past_values', 'future_values'])
val_dataset.set_format(type='torch', columns=['past_values', 'future_values'])
test_dataset.set_format(type='torch', columns=['past_values', 'future_values'])

train_loader = DataLoader(train_dataset, batch_size = 8)
val_loader = DataLoader(val_dataset, batch_size = 64)
test_loader = DataLoader(test_dataset, batch_size = 64)

In [None]:
for batch in val_dataset:
    break

In [148]:
batch["past_values"].shape  ## (B, t, 1)

torch.Size([64, 168, 1])

In [149]:
best_model.model(past_values = batch["past_values"].to("cuda:0")).last_hidden_state.shape    ## (B, 1, 10, 128)

torch.Size([64, 1, 10, 128])

In [150]:
class TransferModel(torch.nn.Module):
    def __init__(self, body, t_out):
        super().__init__()
        self.body = body
        self.t_out = t_out
        body_out_features = 1280 
        self.c_new = 128 
        
        self.flatten = torch.nn.Flatten(start_dim=1, end_dim=-1) 
        self.adapter = torch.nn.Linear(body_out_features, self.t_out * self.c_new)  ## Dense(128)

        self.head = torch.nn.Sequential(
            torch.nn.Dropout(0.2),
            torch.nn.Linear(128, 64),
            torch.nn.Dropout(0.2),
            torch.nn.Linear(64, 1) 
        )

    def forward(self, x):
        features = self.body(past_values=x).last_hidden_state
        flat_feat = self.flatten(features)
        adapted_feat = self.adapter(flat_feat)
        head_input = adapted_feat.view(-1, self.t_out, self.c_new)  ## (B, 24, 128)
        output = self.head(head_input)

        return output

In [None]:
model_instance = TransferModel(pretrained_body, 24).to(device)

In [None]:
model_instance(batch["past_values"].to(device)).shape

torch.Size([64, 24, 1])

In [121]:
optimizer = torch.optim.Adam(model_instance.parameters(), lr=1e-6)
loss_fn = torch.nn.MSELoss()

In [155]:
len(train_loader)

73

In [163]:
batch.keys()

dict_keys(['past_values', 'future_values'])

In [167]:
EPOCHS = 2000
PATIENCE = 10
best_val_loss = np.inf
patience_counter = 0
BEST_MODEL_PATH = "./result/best_transfer_model.pth"

model_instance.to(device)

for epoch in range(EPOCHS):
    model_instance.train()
    total_train_loss = 0
    
    for batch in train_loader:
        X_batch, y_batch = batch["past_values"].to(device), batch["future_values"].to(device)
        
        optimizer.zero_grad()
        outputs = model_instance(X_batch)
        loss = loss_fn(outputs, y_batch)
        loss.backward()
        optimizer.step()
        
        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    # evaluation
    model_instance.eval()
    total_val_loss = 0
    
    with torch.no_grad():
        for val_batch in val_loader:
            X_val_batch, y_val_batch = val_batch["past_values"].to(device), val_batch["future_values"].to(device)
            
            val_outputs = model_instance(X_val_batch)
            val_loss = loss_fn(val_outputs, y_val_batch)
            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    
    print(f"Epoch {epoch+1}/{EPOCHS} - Train Loss: {avg_train_loss:.6f} - Val Loss: {avg_val_loss:.6f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        # Keras의 restore_best_weights=True (모델 저장)
        torch.save(model_instance.state_dict(), BEST_MODEL_PATH) 
        print(f"  > Val loss 개선. 베스트 모델 저장. (Loss: {best_val_loss:.6f})")
        patience_counter = 0 # 인내심 초기화
    else:
        patience_counter += 1
        print(f"  > Val loss 개선 없음. (Patience: {patience_counter}/{PATIENCE})")
    
    if patience_counter >= PATIENCE:
        print(f"--- 조기 종료 (Epoch {epoch+1}) ---")
        break

print("--- 훈련 종료 ---")
# Keras의 restore_best_weights (베스트 모델 불러오기)
model_instance.load_state_dict(torch.load(BEST_MODEL_PATH))
print(f"가장 좋았던 모델(Val Loss: {best_val_loss:.6f})을 로드했습니다.")

Epoch 1/2000 - Train Loss: 26625.612097 - Val Loss: 4458.620117
  > Val loss 개선. 베스트 모델 저장. (Loss: 4458.620117)
Epoch 2/2000 - Train Loss: 26624.784263 - Val Loss: 4458.211751
  > Val loss 개선. 베스트 모델 저장. (Loss: 4458.211751)
Epoch 3/2000 - Train Loss: 26623.466446 - Val Loss: 4457.773926
  > Val loss 개선. 베스트 모델 저장. (Loss: 4457.773926)
Epoch 4/2000 - Train Loss: 26622.312547 - Val Loss: 4457.316732
  > Val loss 개선. 베스트 모델 저장. (Loss: 4457.316732)
Epoch 5/2000 - Train Loss: 26620.800912 - Val Loss: 4456.836507
  > Val loss 개선. 베스트 모델 저장. (Loss: 4456.836507)
Epoch 6/2000 - Train Loss: 26619.376666 - Val Loss: 4456.332113
  > Val loss 개선. 베스트 모델 저장. (Loss: 4456.332113)
Epoch 7/2000 - Train Loss: 26618.360663 - Val Loss: 4455.821045
  > Val loss 개선. 베스트 모델 저장. (Loss: 4455.821045)
Epoch 8/2000 - Train Loss: 26617.004679 - Val Loss: 4455.268555
  > Val loss 개선. 베스트 모델 저장. (Loss: 4455.268555)
Epoch 9/2000 - Train Loss: 26614.999913 - Val Loss: 4454.708659
  > Val loss 개선. 베스트 모델 저장. (Loss: 4454.

In [177]:
model_instance(batch["past_values"].to(device)).shape

torch.Size([64, 24, 1])

In [178]:
preds = []

for batch in test_loader:
    preds.append(model_instance(batch["past_values"].to(device)))

In [None]:
torch.tensor(test_y)

tensor([[63.7900, 63.6200, 64.2500,  ..., 61.8000, 62.0900, 62.1600],
        [61.2900, 62.5900, 61.7500,  ..., 61.1500, 61.0500, 61.3800],
        [60.7600, 60.8100, 61.4700,  ..., 59.1000, 59.0200, 59.1000],
        ...,
        [69.9900, 69.9200, 70.2000,  ..., 70.5500, 70.9400, 70.8000],
        [70.9300, 70.8000, 71.2400,  ..., 71.4700, 71.5500, 71.3000],
        [71.0000, 70.8500, 71.0100,  ...,  0.0000,  0.0000,  0.0000]])

In [203]:
torch.mean(abs(torch.tensor(test_y) - torch.concat(preds, axis = 0).squeeze().data.to('cpu')))

tensor(21.8484)

In [207]:
torch.sqrt(torch.mean((torch.tensor(test_y) - torch.concat(preds, axis = 0).squeeze().data.to('cpu'))**2))

tensor(27.0861)

In [205]:
test_X.mean()

np.float32(75.789345)

> `27.0861, 21.8484`가 나옴. 처음보다야 낫지만, 썩 좋지는 않네.