In [1]:
import os
import gc

import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import PatchTSTForPrediction
from datasets import Dataset

2025-11-03 22:24:32.043738: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
data = "coin"

output_dir = "saved_models"
log_dir = os.path.join('logstf', data)

loss_name = "mae"

num_train_epochs = 300
model_num = 1
model_path = "./saved_models"
learning_rate = 5e-6

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [31]:
## target domain
target_X = pd.read_csv(f"../data/{data}/train_input_7.csv").iloc[:, 1:].values.astype(np.float32)
target_y = pd.read_csv(f"../data/{data}/train_output_7.csv").iloc[:, 1:].values.astype(np.float32)

target_X_val = target_X[-round(target_X.shape[0] * 0.2):, :].astype(np.float32)
target_y_val = target_y[-round(target_y.shape[0] * 0.2):].astype(np.float32)
target_X = target_X[:-round(target_X.shape[0] * 0.2), :].astype(np.float32)
target_y = target_y[:-round(target_y.shape[0] * 0.2)].astype(np.float32)

test_X  = pd.read_csv(f"../data/{data}/val_input_7.csv").iloc[:, 1:].values.astype(np.float32)
test_y  = pd.read_csv(f"../data/{data}/val_output_7.csv").iloc[:, 1:].values.astype(np.float32)

In [32]:
def array_to_dataset(X, y):
    X, y = torch.tensor(X), torch.tensor(y)
    X = X.reshape(-1, X.shape[1], 1)
    y = y.reshape(-1, y.shape[1], 1)

    dataset = torch.utils.data.TensorDataset(X, y)

    return dataset

train_dataset = array_to_dataset(target_X, target_y)
val_dataset = array_to_dataset(target_X_val, target_y_val)
test_dataset = array_to_dataset(test_X, test_y)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size = 8, shuffle = True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size = 64)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size = 64)

In [33]:
for k in range(1, model_num+1):
    current_path = os.path.join(model_path, f"model_{loss_name}_{k}.pth")

    backbone_model = PatchTSTForPrediction.from_pretrained(os.path.join(model_path, "PatchTSTBackbone")).to(device)
    backbone_model.load_state_dict(torch.load(current_path))

In [34]:
class PositionalEncoding(torch.nn.Module):
    def __init__(self, d_model, dropout = 0.05, max_len = 5000, **kwargs):
        super().__init__()
        self.dropout = torch.nn.Dropout(p = dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2)) * (-np.log(10000.0) / d_model)

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(torch.tensor(position * div_term))
        pe[:, 1::2] = torch.cos(torch.tensor(position * div_term))
        pe = pe.unsqueeze(0)    ## (1, max_len, d_model)
        
        self.register_buffer("pe", pe)  ## 불변값. 학습되지 않음. tf.constant

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

In [35]:
class TransformerHead(torch.nn.Module):
    def __init__(self, d_model, nlayers, nhead, dropout, iw, ow, input_dim):
        super().__init__()
        
        self.input_adapter = torch.nn.Sequential(
            torch.nn.Linear(input_dim, d_model // 2),
            torch.nn.ReLU(),
            torch.nn.Linear(d_model // 2, d_model),
            torch.nn.ReLU()
        )

        self.pos_encoding = PositionalEncoding(d_model, dropout)    ## patch만 position encoding

        self.layers = torch.nn.ModuleList([
            torch.nn.ModuleDict({
                "attn": torch.nn.MultiheadAttention(embed_dim = d_model, num_heads = nhead, dropout = dropout, batch_first = True),
                "norm1": torch.nn.LayerNorm(d_model, eps = 1e-6),
                "ffn1": torch.nn.Linear(d_model, d_model),
                "relu": torch.nn.ReLU(),
                "ffn2": torch.nn.Linear(d_model, d_model),
                "norm2": torch.nn.LayerNorm(d_model, eps = 1e-6)
            }) for _ in range(nlayers)
        ])

        self.outlayer = torch.nn.Sequential(
            torch.nn.Linear(d_model, d_model // 2),         ## (B, 7, 42)
            torch.nn.ReLU(),
            torch.nn.Flatten(),                             ## (B, 7*42 = 294). squeeze 역할
            torch.nn.Linear(iw * (d_model // 2), 128),      ## (B, 128)
            torch.nn.ReLU(),
            torch.nn.Linear(128, ow) ## (B, 24)
        )

    def forward(self, patchTSToutput):
        x = patchTSToutput.last_hidden_state.squeeze(1) ## (B, 7, 256)
        x = self.input_adapter(x)                       ## (B, 7, 84)
        x = self.pos_encoding(x)                        ## (B, 7, 84)

        for layer in self.layers:
            attn_output, _ = layer["attn"](x, x, x)
            x = layer["norm1"](x + attn_output)

            ffn_output = layer["relu"](layer["ffn1"](x))
            ffn_output = layer["ffn2"](ffn_output)
            x = layer["norm2"](x + ffn_output)

        outputs = self.outlayer(x)
        outputs = outputs.unsqueeze(2)

        return outputs

In [36]:
model_instance = torch.nn.Sequential(
    backbone_model.model,
    TransformerHead(
        d_model = 84, nlayers = 4, nhead = 4, dropout = 0.2,
        iw = 7, ow = target_y.shape[1], input_dim = 256
    )
).to(device)

  pe[:, 0::2] = torch.sin(torch.tensor(position * div_term))
  pe[:, 1::2] = torch.cos(torch.tensor(position * div_term))


In [37]:
optimizer = torch.optim.AdamW(model_instance.parameters(), lr = learning_rate)
log_data = []

if loss_name == "mse":
    loss_fn = torch.nn.MSELoss()
elif loss_name == "mae":
    loss_fn = torch.nn.L1Loss()
# elif loss_name

## early stopping
PATIENCE = 10
best_val_loss = np.inf
patience_counter = 0

for epoc in range(num_train_epochs):
    model_instance.train()

    total_train_loss = 0

    for X, y in train_dataloader:
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        yhat = model_instance(X)
        loss = loss_fn(yhat, y)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
    
    avg_train_loss = total_train_loss / len(train_dataloader)

    model_instance.eval()

    with torch.no_grad():
        yys = []
        yyhats = []

        for XX, yy in val_dataloader:
            XX = XX.to(device)
            yys.append(yy.to(device))
            yyhats.append(model_instance(XX))

        yyhat = torch.concat(yyhats)
        yy = torch.concat(yys)

        val_loss = loss_fn(yyhat, yy)

    print(f"Epoch {epoc+1}/{num_train_epochs} | Train Loss: {avg_train_loss:.6f}\t\t Val Loss: {val_loss:.6f}")

    log_data.append({"epoch": epoc, "loss": avg_train_loss, "eval_loss": val_loss.item()})

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_state_dict = model_instance.state_dict()   ## 저장 없이 결과물만 산출...
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= PATIENCE:
        break

Epoch 1/300 | Train Loss: 26687.898170		 Val Loss: 4864.021973
Epoch 2/300 | Train Loss: 26601.788260		 Val Loss: 4851.244629
Epoch 3/300 | Train Loss: 26571.177106		 Val Loss: 4835.811523
Epoch 4/300 | Train Loss: 26639.496040		 Val Loss: 4817.316895
Epoch 5/300 | Train Loss: 26908.564105		 Val Loss: 4795.330078
Epoch 6/300 | Train Loss: 26547.277544		 Val Loss: 4769.195312
Epoch 7/300 | Train Loss: 26243.979964		 Val Loss: 4736.920898
Epoch 8/300 | Train Loss: 26354.680236		 Val Loss: 4700.019043
Epoch 9/300 | Train Loss: 26221.689721		 Val Loss: 4659.393066
Epoch 10/300 | Train Loss: 26100.688075		 Val Loss: 4615.312988
Epoch 11/300 | Train Loss: 26321.821236		 Val Loss: 4568.777344
Epoch 12/300 | Train Loss: 25816.500375		 Val Loss: 4518.379395
Epoch 13/300 | Train Loss: 25804.163889		 Val Loss: 4464.913574
Epoch 14/300 | Train Loss: 25733.976589		 Val Loss: 4408.053223
Epoch 15/300 | Train Loss: 25769.363830		 Val Loss: 4346.791504
Epoch 16/300 | Train Loss: 25325.826881		 Val Los

In [40]:
model_instance.eval()

with torch.no_grad():
    yys = []
    yyhats = []

    for XX, yy in test_dataloader:
        XX = XX.to(device)
        yys.append(yy.to(device))
        yyhats.append(model_instance(XX))

    yyhat = torch.concat(yyhats)
    yy = torch.concat(yys)

    test_loss = loss_fn(yyhat, yy)

In [41]:
mseLoss = torch.nn.MSELoss()
maeLoss = torch.nn.L1Loss()

def smape(yy, yyhat):
    numerator = 100*abs(yy - yyhat)
    denominator = (abs(yy) + abs(yyhat))/2
    smape = torch.mean(numerator / denominator)
    return smape

print(f"test RMSE: {torch.sqrt(mseLoss(yyhat, yy))}")
print(f"test MAE: {maeLoss(yyhat, yy)}")
print(f"test SMAPE: {smape(yy, yyhat)}")

test RMSE: 23.462501525878906
test MAE: 19.19106101989746
test SMAPE: 23.6146297454834


> 망함. 헤드 아키텍쳐가 너무 복잡한 것 같은데? 헤드를 거의 새로 학습시키는 수준...
>
> 그냥 기존 아키텍쳐를 최대한 활용할 수 있는 게 가장 좋다.