In [1]:
!pip install git+https://github.com/IBM/tsfm.git
!pip install transformers pandas scikit-learn matplotlib


Collecting git+https://github.com/IBM/tsfm.git
  Cloning https://github.com/IBM/tsfm.git to /tmp/pip-req-build-y8qinsmv
  Running command git clone --filter=blob:none --quiet https://github.com/IBM/tsfm.git /tmp/pip-req-build-y8qinsmv
  Resolved https://github.com/IBM/tsfm.git to commit 02dc9dc7e672abfc3ec6d421deedaff696de81ad
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pandas>=2.2.0 (from tsfm_public==0.1.1.dev3+g02dc9dc)
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from tsfm_public==0.1.1.dev3+g02dc9dc)
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m28.1 MB/s[0m 

In [2]:
!pip install transformers



In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_squared_log_error
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from transformers import PatchTSTConfig, PatchTSTForPrediction, Trainer, TrainingArguments, EarlyStoppingCallback
from tsfm_public.toolkit.dataset import ForecastDFDataset
from tsfm_public.toolkit.time_series_preprocessor import TimeSeriesPreprocessor
from tsfm_public.toolkit.util import select_by_index

In [6]:
from transformers import set_seed
# Set seed for reproducibility
set_seed(2023)

# Đọc dữ liệu từ file CSV
data = pd.read_csv('/content/gold_price_2018_2024.csv')

# Chuyển đổi cột Date thành định dạng datetime
data['Date'] = pd.to_datetime(data['Date'])
timestamp_column = 'Date'
forecast_columns = ['Value (USD per troy ounce)']

# Chia dữ liệu thành tập huấn luyện và kiểm tra theo tỷ lệ 7-3
train_size = int(len(data) * 0.7)
test_size = len(data) - train_size
train, test = train_test_split(data, test_size=test_size, shuffle=False)

In [7]:


# Define parameters
context_length = 512
forecast_horizon = 96
patch_length = 16
num_workers = 4
batch_size = 64

# Chia dữ liệu thành tập huấn luyện và kiểm tra theo tỷ lệ 7-3
train_size = int(len(data) * 0.7)
test_size = len(data) - train_size
train, test = train_test_split(data, test_size=test_size, shuffle=False)

# Chuẩn bị dữ liệu
time_series_preprocessor = TimeSeriesPreprocessor(
    timestamp_column=timestamp_column,
    id_columns=[],
    input_columns=forecast_columns,
    output_columns=forecast_columns,
    scaling=True,
)

train_data = select_by_index(train, start_index=0, end_index=len(train))
test_data = select_by_index(test, start_index=0, end_index=len(test))

time_series_preprocessor = time_series_preprocessor.train(train_data)

train_dataset = ForecastDFDataset(
    time_series_preprocessor.preprocess(train_data),
    id_columns=[],
    timestamp_column=timestamp_column,
    target_columns=forecast_columns,
    context_length=context_length,
    prediction_length=forecast_horizon,
)

test_dataset = ForecastDFDataset(
    time_series_preprocessor.preprocess(test_data),
    id_columns=[],
    timestamp_column=timestamp_column,
    target_columns=forecast_columns,
    context_length=context_length,
    prediction_length=forecast_horizon,
)

In [None]:
# Inspect the signature of the ForecastDFDataset class
import inspect
print(inspect.signature(ForecastDFDataset))


(data: pandas.core.frame.DataFrame, id_columns: List[str] = [], timestamp_column: Optional[str] = None, target_columns: List[str] = [], observable_columns: List[str] = [], control_columns: List[str] = [], conditional_columns: List[str] = [], static_categorical_columns: List[str] = [], context_length: int = 1, prediction_length: int = 1, num_workers: int = 1, frequency_token: Optional[int] = None, autoregressive_modeling: bool = True)


In [13]:

# Cấu hình mô hình PatchTST
config = PatchTSTConfig(
    num_input_channels=len(forecast_columns),
    context_length=context_length,
    patch_length=patch_length,
    patch_stride=patch_length,
    prediction_length=forecast_horizon,
    random_mask_ratio=0.4,
    d_model=128,
    num_attention_heads=16,
    num_hidden_layers=3,
    ffn_dim=256,
    dropout=0.2,
    head_dropout=0.2,
    pooling_type=None,
    channel_attention=False,
    scaling="std",
    loss="mse",
    pre_norm=True,
    norm_type="batchnorm",
)

model = PatchTSTForPrediction(config)


Train model


In [20]:


# Hàm tính toán các chỉ số đánh giá
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    mse = mean_squared_error(labels, predictions)
    rmse = np.sqrt(mse)
    mape = mean_absolute_percentage_error(labels, predictions)
    return {
        'eval_loss': mse,
        'rmse': rmse,
        'mape': mape
    }

# Thiết lập các tham số huấn luyện
training_args = TrainingArguments(
    output_dir="./checkpoint/patchtst/gold_price/output/",
    overwrite_output_dir=True,
    num_train_epochs=100,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=num_workers,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=3,
    logging_dir="./checkpoint/patchtst/gold_price/logs/",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=10,
    early_stopping_threshold=0.0001,
)

# Huấn luyện mô hình
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # Sử dụng tập kiểm tra làm tập đánh giá
    # compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
)

trainer.train()

# Đánh giá mô hình trên tập kiểm tra
results = trainer.evaluate(test_dataset)
print("Test results:", results)

# Hàm vẽ biểu đồ kết quả
def plot_results(dates, actual, train, predict, title):
    plt.figure(figsize=(14, 8))
    plt.plot(dates, actual, label='Actual Price', color='red')
    plt.plot(dates[:len(train)], train, label='Train Price', color='green')
    plt.plot(dates[len(train):len(train)+len(predict)], predict, label='Predicted Price', color='blue')
    plt.title(title)
    plt.xlabel('Date')
    plt.ylabel('Price (USD)')
    plt.legend()
    plt.tight_layout()
    plt.show()

# Vẽ biểu đồ kết quả
predicted_values = model.predict(test_dataset)
plot_results(data['Date'], data['Value (USD per troy ounce)'], train['Value (USD per troy ounce)'], predicted_values, 'Gold Price Prediction')


  self.pid = os.fork()


Epoch,Training Loss,Validation Loss
1,0.3383,No log


  self.pid = os.fork()
early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


KeyError: 'eval_loss'

In [None]:
results = trainer.evaluate(test_dataset)
print("Test result:")
print(results)



  self.pid = os.fork()


Test result:
{'eval_loss': 0.06905169785022736, 'eval_runtime': 0.6211, 'eval_samples_per_second': 1.61, 'eval_steps_per_second': 1.61, 'epoch': 16.0}


In [24]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from transformers import PatchTSTConfig, PatchTSTForPrediction, Trainer, TrainingArguments, EarlyStoppingCallback, DefaultDataCollator
from tsfm_public.toolkit.dataset import ForecastDFDataset
from tsfm_public.toolkit.time_series_preprocessor import TimeSeriesPreprocessor
from tsfm_public.toolkit.util import select_by_index

# Đọc dữ liệu từ file CSV
data = pd.read_csv('/content/gold_price_2018_2024.csv')

# Chuyển đổi cột Date thành định dạng datetime
data['Date'] = pd.to_datetime(data['Date'])
timestamp_column = 'Date'
forecast_columns = ['Value (USD per troy ounce)']

# Chia dữ liệu thành tập huấn luyện và kiểm tra theo tỷ lệ 7-3
train_size = int(len(data) * 0.7)
test_size = len(data) - train_size
train, test = train_test_split(data, test_size=test_size, shuffle=False)

# Chuyển đổi dữ liệu thành định dạng thích hợp cho mô hình PatchTST
context_length = 512
forecast_horizon = 96
patch_length = 16
num_workers = 4
batch_size = 64

# Chuẩn bị dữ liệu
time_series_preprocessor = TimeSeriesPreprocessor(
    timestamp_column=timestamp_column,
    id_columns=[],
    input_columns=forecast_columns,
    output_columns=forecast_columns,
    scaling=True,
)

train_data = select_by_index(train, start_index=0, end_index=len(train))
test_data = select_by_index(test, start_index=0, end_index=len(test))

time_series_preprocessor = time_series_preprocessor.train(train_data)

train_dataset = ForecastDFDataset(
    time_series_preprocessor.preprocess(train_data),
    id_columns=[],
    timestamp_column=timestamp_column,
    target_columns=forecast_columns,
    context_length=context_length,
    prediction_length=forecast_horizon,
)

test_dataset = ForecastDFDataset(
    time_series_preprocessor.preprocess(test_data),
    id_columns=[],
    timestamp_column=timestamp_column,
    target_columns=forecast_columns,
    context_length=context_length,
    prediction_length=forecast_horizon,
)

# Cấu hình mô hình PatchTST
config = PatchTSTConfig(
    num_input_channels=len(forecast_columns),
    context_length=context_length,
    patch_length=patch_length,
    patch_stride=patch_length,
    prediction_length=forecast_horizon,
    random_mask_ratio=0.4,
    d_model=128,
    num_attention_heads=16,
    num_hidden_layers=3,
    ffn_dim=256,
    dropout=0.2,
    head_dropout=0.2,
    pooling_type=None,
    channel_attention=False,
    scaling="std",
    loss="mse",
    pre_norm=True,
    norm_type="batchnorm",
)

model = PatchTSTForPrediction(config)

# Hàm tính toán các chỉ số đánh giá
def compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions, eval_pred.label_ids
    predictions = predictions.reshape(-1, len(forecast_columns))
    labels = labels.reshape(-1, len(forecast_columns))
    mse = mean_squared_error(labels, predictions)
    rmse = np.sqrt(mse)
    mape = mean_absolute_percentage_error(labels, predictions)
    return {
        'eval_loss': mse,
        'rmse': rmse,
        'mape': mape
    }

# Custom data collator
class CustomDataCollator(DefaultDataCollator):
    def __call__(self, features):
        batch = super().__call__(features)
        if "labels" in batch:
            batch["labels"] = batch["labels"].view(-1, len(forecast_columns))
        return batch

data_collator = CustomDataCollator()

# Thiết lập các tham số huấn luyện
training_args = TrainingArguments(
    output_dir="./checkpoint/patchtst/gold_price/output/",
    overwrite_output_dir=True,
    num_train_epochs=100,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=num_workers,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=3,
    logging_dir="./checkpoint/patchtst/gold_price/logs/",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    label_names=["labels"],
)

early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=10,
    early_stopping_threshold=0.0001,
)

# Huấn luyện mô hình
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
)

trainer.train()

# Đánh giá mô hình trên tập kiểm tra
results = trainer.evaluate(test_dataset)
print("Test results:", results)

# Hàm vẽ biểu đồ kết quả
def plot_results(dates, actual, train, predict, title):
    plt.figure(figsize=(14, 8))
    plt.plot(dates, actual, label='Actual Price', color='red')
    plt.plot(dates[:len(train)], train, label='Train Price', color='green')
    plt.plot(dates[len(train):len(train)+len(predict)], predict, label='Predicted Price', color='blue')
    plt.title(title)
    plt.xlabel('Date')
    plt.ylabel('Price (USD)')
    plt.legend()
    plt.tight_layout()
    plt.show()

# Vẽ biểu đồ kết quả
predicted_values = model.predict(test_dataset)
plot_results(data['Date'], data['Value (USD per troy ounce)'], train['Value (USD per troy ounce)'], predicted_values, 'Gold Price Prediction')


  self.pid = os.fork()


Epoch,Training Loss,Validation Loss
1,0.6116,No log


  self.pid = os.fork()
early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


KeyError: 'eval_loss'

In [None]:
# Save the model
save_dir = "patchtst/gold_price/model/pretrain/"
os.makedirs(save_dir, exist_ok=True)
trainer.save_model(save_dir)

In [None]:
# Load the dataset
dataset_path = '/content/gold_price_2018_2024.csv'
data = pd.read_csv(dataset_path, parse_dates=['Date'])

# Redefine parameters
timestamp_column = 'Date'
forecast_columns = ['Value (USD per troy ounce)']
context_length = 512
forecast_horizon = 96

# Split data into train and test sets (70% train, 30% test)
num_train = int(len(data) * 0.7)
test_start_index = num_train
test_end_index = len(data) - context_length

train_data = select_by_index(data, id_columns=[], start_index=0, end_index=num_train - context_length)
test_data = select_by_index(data, id_columns=[], start_index=test_start_index, end_index=test_end_index)

# Preprocess the data
time_series_preprocessor = TimeSeriesPreprocessor(
    timestamp_column=timestamp_column,
    id_columns=[],
    input_columns=forecast_columns,
    output_columns=forecast_columns,
    scaling=True,
)

# Train the scaler on the training data
time_series_preprocessor = time_series_preprocessor.train(train_data)

# Preprocess the test data
test_data_preprocessed = time_series_preprocessor.preprocess(test_data)

# Create ForecastDFDataset for test data
test_dataset = ForecastDFDataset(
    test_data_preprocessed,
    id_columns=[],
    timestamp_column=timestamp_column,
    target_columns=forecast_columns,
    context_length=context_length,
    prediction_length=forecast_horizon,
)
