In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader

from transformers import (
    PatchTSTConfig, PatchTSTForPrediction,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
from datasets import Dataset

2025-11-02 11:55:59.521546: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data = "coin"
output_dir = "./pretrained/MAE"
logging_dir = "./logs/MAE"
loss = "mae"
learning_rate = 5e-6

In [3]:
## target domain
target_X = pd.read_csv(f"../data/{data}/train_input_7.csv").iloc[:, 1:].values.astype(np.float32)
target_y = pd.read_csv(f"../data/{data}/train_output_7.csv").iloc[:, 1:].values.astype(np.float32)

X_train = target_X[:-round(target_X.shape[0] * 0.2), :].astype(np.float32)  ## ??? 잘못된 것 아님?
y_train = target_y[:-round(target_y.shape[0] * 0.2)].astype(np.float32)
target_X_val = target_X[-round(target_X.shape[0] * 0.2):, :].astype(np.float32)
target_y_val = target_y[-round(target_y.shape[0] * 0.2):].astype(np.float32)

test_X  = pd.read_csv(f"../data/{data}/val_input_7.csv").iloc[:, 1:].values.astype(np.float32)
test_y  = pd.read_csv(f"../data/{data}/val_output_7.csv").iloc[:, 1:].values.astype(np.float32)

np.random.seed(2)
random_indices1 = np.random.choice(pd.read_csv("../data/M4_train.csv").iloc[:, (1):].index,
                                   size=target_X.shape[0] * 20, replace=True)

X_data = pd.read_csv("../data/M4_train.csv").iloc[:, 1 + (24 * 0):].loc[random_indices1].values.astype(np.float32)
y_data = pd.read_csv("../data/M4_test.csv").iloc[:, 1:].loc[random_indices1].values.astype(np.float32)

In [4]:
X_data.shape

(14460, 168)

In [5]:
TSTconfig = PatchTSTConfig(
    num_input_channels = 1,
    context_length = X_data.shape[1],
    prediction_length = y_data.shape[1],

    patch_length = 16,
    patch_stride = 16,
    d_model = 128,
    num_attention_heads = 16,
    num_hidden_layers = 3,
    ffn_dim = 256,
    dropout = 0.2,
    head_dropout = 0.2,
    pooling_type = None,
    channel_attention = False,
    scaling = "std",
    loss = loss,
    pre_norm = True,
    do_mask_input = False
)

In [6]:
model = PatchTSTForPrediction(TSTconfig)

In [7]:
model.model

PatchTSTModel(
  (scaler): PatchTSTScaler(
    (scaler): PatchTSTStdScaler()
  )
  (patchifier): PatchTSTPatchify()
  (masking): Identity()
  (encoder): PatchTSTEncoder(
    (embedder): PatchTSTEmbedding(
      (input_embedding): Linear(in_features=16, out_features=128, bias=True)
    )
    (positional_encoder): PatchTSTPositionalEncoding(
      (positional_dropout): Identity()
    )
    (layers): ModuleList(
      (0-2): 3 x PatchTSTEncoderLayer(
        (self_attn): PatchTSTAttention(
          (k_proj): Linear(in_features=128, out_features=128, bias=True)
          (v_proj): Linear(in_features=128, out_features=128, bias=True)
          (q_proj): Linear(in_features=128, out_features=128, bias=True)
          (out_proj): Linear(in_features=128, out_features=128, bias=True)
        )
        (dropout_path1): Identity()
        (norm_sublayer1): PatchTSTBatchNorm(
          (batchnorm): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (

In [5]:
np.random.seed(42)
select = np.random.choice(len(X_data), size=len(X_data), replace=True)
X_bootstrap = X_data[select]
y_bootstrap = y_data[select]

val_split_index = int(len(X_bootstrap) * 0.8)
X_train, X_valid = X_bootstrap[:val_split_index], X_bootstrap[val_split_index:]
y_train, y_valid = y_bootstrap[:val_split_index], y_bootstrap[val_split_index:]

In [6]:
def create_hf_dataset(x, y):
    x_list = [s[..., np.newaxis] for s in x]    ## (N, 168) -> (N, 168, 1)
    y_list = [s[..., np.newaxis] for s in y]    ## (N, 24) -> (N, 24, 1)

    data_dict = {
        "past_values": x_list,
        "future_values": y_list
    }

    return Dataset.from_dict(data_dict)

In [7]:
train_dataset = create_hf_dataset(X_train, y_train)
test_dataset = create_hf_dataset(X_valid, y_valid)

In [11]:
training_args = TrainingArguments(
    output_dir = output_dir,
    overwrite_output_dir = True,
    learning_rate = learning_rate,
    num_train_epochs = 100,
    do_eval = True,
    eval_strategy = "epoch",
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 64,
    dataloader_num_workers = 16,
    save_strategy = "epoch",
    logging_strategy = "epoch",
    save_total_limit = 3,
    logging_dir = logging_dir,
    load_best_model_at_end = True,
    metric_for_best_model = "eval_loss",
    greater_is_better = False,
    label_names = ["future_values"]
)

early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience = 10,
    early_stopping_threshold = 0.001
)

In [12]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    callbacks = [early_stopping_callback]
)

In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,8.1051,7.908236
2,7.8259,7.580915
3,7.5221,7.357135
4,7.3916,7.273916
5,7.3372,7.225021
6,7.3043,7.193517
7,7.276,7.163679
8,7.2495,7.132695
9,7.2294,7.111053
10,7.2106,7.091843


TrainOutput(global_step=18100, training_loss=7.0613434613475485, metrics={'train_runtime': 418.3653, 'train_samples_per_second': 2765.047, 'train_steps_per_second': 43.264, 'total_flos': 575014074163200.0, 'train_loss': 7.0613434613475485, 'epoch': 100.0})

In [21]:
# 1. 훈련된 베스트 모델 로드
best_model_path = "./pretrained/MAE/checkpoint-17376" # 베스트 모델 경로
best_model = PatchTSTForPrediction.from_pretrained(best_model_path)
best_model.eval()

# 2. test_dataset으로 DataLoader 생성
# (test_dataset은 'past_values'와 'future_values'를 포함하는 Hf Dataset)
test_dataset.set_format(type='torch', columns=['past_values', 'future_values'])
test_loader = DataLoader(test_dataset, batch_size=64)

total_real_mae = 0
total_samples = 0

print("--- '실제 MAE' (Unscaled) 계산 시작 ---")

with torch.no_grad():
    for batch in test_loader:
        # 모델의 forward pass 실행
        outputs = best_model(
            past_values=batch['past_values'],
            # [참고] 'future_values'를 전달하지 않아도 예측은 가능합니다.
            # (전달하면 outputs.loss도 계산해줌)
        )

        if isinstance(outputs.prediction_outputs, tuple):
            unscaled_preds = outputs.prediction_outputs[0]
        else:
            unscaled_preds = outputs.prediction_outputs # (튜플이 아닌 경우 대비)

        unscaled_labels = batch['future_values']
        
        #    (배치 전체의 평균 MAE)
        real_mae = torch.abs(unscaled_preds - unscaled_labels).mean()
        
        # (정확한 계산을 위해 배치 크기 가중 평균)
        total_real_mae += real_mae.item() * len(batch['future_values'])
        total_samples += len(batch['future_values'])

final_real_mae = total_real_mae / total_samples
print(f"--- 훈련된 모델의 '실제 MAE' (Unscaled) ---")
print(f"Final Real MAE: {final_real_mae}")

--- '실제 MAE' (Unscaled) 계산 시작 ---
--- 훈련된 모델의 '실제 MAE' (Unscaled) ---
Final Real MAE: 4368.487526203752


In [22]:
from tbparse import SummaryReader

2025-11-01 22:57:34.252156: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [34]:
# 로그 파일이 있는 폴더 경로
log_dir = "./logs/events.out.tfevents.1761999330.cd0dd4fe3564.737145.0"

# 1. 로그 폴더를 읽습니다.
reader = SummaryReader(log_dir)

# 2. 스칼라 값(loss 등)을 DataFrame으로 변환합니다.
df_scalars = reader.scalars

In [8]:
# 훈련된 베스트 모델 로드
best_model_path = "./pretrained/checkpoint-12851" # 베스트 모델 경로
best_model = PatchTSTForPrediction.from_pretrained(best_model_path)
best_model.eval()

# 2. test_dataset으로 DataLoader 생성
# (test_dataset은 'past_values'와 'future_values'를 포함하는 Hf Dataset)
test_dataset.set_format(type='torch', columns=['past_values', 'future_values'])
test_loader = DataLoader(test_dataset, batch_size=64)

total_real_mae = 0
total_samples = 0

with torch.no_grad():
    for batch in test_loader:
        # 모델의 forward pass 실행
        outputs = best_model(
            past_values=batch['past_values']   ## attribute
        )

        if isinstance(outputs.prediction_outputs, tuple):
            unscaled_preds = outputs.prediction_outputs[0]
        else:
            unscaled_preds = outputs.prediction_outputs # (튜플이 아닌 경우 대비)

        unscaled_labels = batch['future_values']    ## label
        
        #    (배치 전체의 평균 MAE)
        real_mae = torch.abs(unscaled_preds - unscaled_labels).mean()
        
        # (정확한 계산을 위해 배치 크기 가중 평균)
        total_real_mae += real_mae.item() * len(batch['future_values'])
        total_samples += len(batch['future_values'])

final_real_mae = total_real_mae / total_samples
print(f"--- 훈련된 모델의 '실제 MAE' (Unscaled) ---")
print(f"Final Real MAE: {final_real_mae}")

--- 훈련된 모델의 '실제 MAE' (Unscaled) ---
Final Real MAE: 360.81136456037126


In [55]:
total_real_mse = 0
total_samples = 0
loss_fn = torch.nn.MSELoss()

with torch.no_grad():
    for batch in test_loader:
        # 모델의 forward pass 실행
        outputs = best_model(
            past_values=batch['past_values']   ## attribute
        )

        if isinstance(outputs.prediction_outputs, tuple):
            unscaled_preds = outputs.prediction_outputs[0]
        else:
            unscaled_preds = outputs.prediction_outputs # (튜플이 아닌 경우 대비)

        unscaled_labels = batch['future_values']    ## label
        
        real_mse = loss_fn(unscaled_preds, unscaled_labels)
        
        # (정확한 계산을 위해 배치 크기 가중 평균)
        total_real_mae += real_mse.item() * len(batch['future_values'])
        total_samples += len(batch['future_values'])

final_real_mae = total_real_mae / total_samples
print(f"--- 훈련된 모델의 '실제 MSE' (Unscaled) ---")
print(f"Final Real MSE: {final_real_mae}")

--- 훈련된 모델의 '실제 MSE' (Unscaled) ---
Final Real MSE: 697501.4553393184


In [None]:
final_real_mae**0.5

835.1655257129082

In [52]:
unscaled_preds = outputs.prediction_outputs
loss = torch.nn.MSELoss()(unscaled_preds, unscaled_labels)

In [53]:
loss

tensor(222041.7812, grad_fn=<MseLossBackward0>)

In [None]:
for batch in test_loader:
    outputs = best_model(past_values = batch["past_values"])

{'past_values': tensor([[[3942.0000],
          [3878.0000],
          [3961.3301],
          ...,
          [2803.2300],
          [2882.3799],
          [2873.1399]],
 
         [[1000.0000],
          [ 996.0000],
          [ 990.0000],
          ...,
          [1483.0000],
          [1470.0000],
          [1478.0000]],
 
         [[1090.0000],
          [1810.0000],
          [1710.0000],
          ...,
          [4280.0000],
          [4160.0000],
          [3980.0000]],
 
         ...,
 
         [[6195.3130],
          [5702.5293],
          [5317.4561],
          ...,
          [6554.5898],
          [6141.6689],
          [5777.8862]],
 
         [[8210.8125],
          [8264.2178],
          [8222.5205],
          ...,
          [6681.6230],
          [6582.5327],
          [6502.2646]],
 
         [[4167.0000],
          [4561.0000],
          [4574.0000],
          ...,
          [7551.0000],
          [7693.0000],
          [8011.0000]]]),
 'future_values': tensor([[[2969.

In [11]:
best_model(past_values = test_dataset["past_values"])

TypeError: ones_like(): argument 'input' (position 1) must be Tensor, not Column

In [20]:
y_data.mean()

np.float32(4472.5435)

In [None]:
pretrained_layers = base_loaded.layers[:-1]
pretrained_model = Model(inputs=base_loaded.input, outputs=pretrained_layers[-1].output)

inputs = Input(shape=(target_X.shape[1], 1))
flat_inp = layers.Reshape((target_X.shape[1],), name='transfer_flatten')(inputs) 
pretrained_output = pretrained_model(flat_inp)                                   
pretrained_output_reshaped = Reshape((target_y.shape[1], -1))(pretrained_output) 

x = Dense(128, activation='linear')(pretrained_output_reshaped)
x = Dropout(0.2)(x)
x = Dense(64, activation='linear')(x)
x = Dropout(0.2)(x)
out = Dense(1, activation='linear')(x)  # (B, T_out, 1)

model_instance = Model(inputs=inputs, outputs=out)
model_instance.compile(optimizer=Adam(learning_rate=lr_), loss=lossf)

early_stop = EarlyStopping(monitor='val_loss', patience=pt, verbose=0, restore_best_weights=True)
csv_log   = CSVLogger(os.path.join(LOG_DIR, f'transfer_{str(lossf)}_lr{lr_}_run{i}.csv'))

history = model_instance.fit(
    target_X.reshape(-1, target_X.shape[1], 1), target_y.reshape(-1, target_y.shape[1], 1),
    batch_size=batch_size_, epochs=epochs_, verbose=0,
    callbacks=[early_stop, csv_log, PrintValLossEveryN(1)],
    validation_data=(target_X_val.reshape(-1, target_X_val.shape[1], 1),
                        target_y_val.reshape(-1, target_y_val.shape[1], 1))
)

pred_val = model_instance.predict(target_X_val.reshape(-1, target_X_val.shape[1], 1), verbose=0).reshape(-1, target_y.shape[1])
pred_test = model_instance.predict(test_X.reshape(-1, test_X.shape[1], 1), verbose=0).reshape(-1, target_y.shape[1])

model_pred_val.append(pred_val)
model_pred_test.append(pred_test)
history_mapes_G.append(history)
print(f"######################################################## Loaded & fine-tuned (FC) model {i}")
del model_instance
del base_loaded
del pretrained_model
tf.keras.backend.clear_session()
import gc; gc.collect()