In [1]:
# Cell 1: Setup and Imports

import os
import sys

# Enable the CPU fallback for operators that the MPS backend does not implement.
# PyTorch selects the fallback when the library is loaded, so this variable has
# to be exported BEFORE `import torch`; setting it afterwards has no effect on a
# fresh kernel.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

import pandas as pd
import torch

# --- Add the project root to the import path ---
# `path` is the (un-normalized) parent of the working directory and is reused by
# the following cells to build data file paths; `project_root` is its absolute
# form and is appended to sys.path once so that `src.*` imports resolve.
curr_dir = os.getcwd()
path = os.path.join(curr_dir, "..")
project_root = os.path.abspath(path)
if project_root not in sys.path:
    sys.path.append(project_root)

# Fetch data: BTC + ETH (USDT pairs)

In [2]:
from src.data.fecther_factory import FetcherFactory
import dotenv
from src.data.saver_factory import SaverFactory

# Load variables from the local .env file into the process environment so the
# Binance credentials can be read with os.getenv below.
dotenv.load_dotenv()

api_key = os.getenv("API-Key")
api_secret = os.getenv("Secret-Key")

fetcher = FetcherFactory.create_data_fetcher("binance", api_key=api_key, api_secret=api_secret)

# Both pairs are requested with the same candle interval and start date.
fetch_kwargs = {"interval": "1d", "start_str": "2025-01-01"}
data_btc = fetcher.fetch_data(symbol="BTCUSDT", **fetch_kwargs)
data_eth = fetcher.fetch_data(symbol="ETHUSDT", **fetch_kwargs)

saver = SaverFactory.create_data_saver("csv")

# Write one CSV per symbol under <path>/data/raw/day (`path` comes from cell 1).
raw_day_dir = f"{path}/data/raw/day"
for symbol, frame in (("BTCUSDT", data_btc), ("ETHUSDT", data_eth)):
    saver.save_data(frame, file_path=f"{raw_day_dir}/{symbol}.csv")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['timestamp'] = pd.to_datetime(df['timestamp'], unit="ms")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['timestamp'] = pd.to_datetime(df['timestamp'], unit="ms")


# Load data, engineer features, and normalize

In [3]:
from src.data.loader.data_loader_service import DataLoaderService
from src.data.loader.csv_loader import CSVLoader
from src.features.feature_engineer import FeatureEngineer
from src.utils.normalizer import Normalizer

# Load the raw daily CSVs written by the previous cell (`path` is defined in
# cell 1, `saver` in cell 2).
repository_data = CSVLoader(file_path=f"{path}/data/raw/day")
data_loader = DataLoaderService(repository_data)
raw_data = data_loader.load_data()

# Feature engineering: lagged closes, EMAs, volatility, RSI and datetime parts.
fe = FeatureEngineer(
    lags=[1, 7],
    emas=[10, 20],
    add_volatility=True,
    add_rsi=True,
    add_datetime=True,
)
data_fe = fe.transform(raw_data)

processed_dir = f"{path}/data/processed"
saver.save_data(data_fe, file_path=f"{processed_dir}/data.csv")

# Only these columns are handed to the normalizer; the names have to match the
# lags/emas configured above.
numeric_cols = [
    "open", "high", "low", "close", "volume",
    "close_lag_1", "close_lag_7",
    "ema_10", "ema_20",
]

# NOTE(review): the normalizer is fitted on the full frame before any
# train/test split is made -- confirm this is intended, since statistics from
# the evaluation period would otherwise leak into training.
normalizer = Normalizer("standard", per_symbol=True, columns=numeric_cols)
data_normalized = normalizer.fit_transform(data_fe)

print(data_normalized)

saver.save_data(data_normalized, file_path=f"{processed_dir}/data_normalized.csv")

                open      high       low     close    volume   symbol  \
timestamp                                                               
2025-01-14  0.470736  0.498260  0.568333  0.564715 -0.744934  BTCUSDT   
2025-01-15  0.576382  0.750704  0.642522  0.833228 -0.486008  BTCUSDT   
2025-01-16  0.847186  0.735632  0.739161  0.662687 -0.465569  BTCUSDT   
2025-01-17  0.675182  0.811140  0.790559  0.859510 -0.329400  BTCUSDT   
2025-01-18  0.873717  0.774706  0.692186  0.662282  0.003757  BTCUSDT   
...              ...       ...       ...       ...       ...      ...   
2025-08-30  1.936577  1.846000  1.951122  1.929413 -0.842851  ETHUSDT   
2025-08-31  1.952786  1.942358  2.092181  1.950964 -0.795989  ETHUSDT   
2025-09-01  1.974522  1.934648  1.894187  1.859042 -0.342998  ETHUSDT   
2025-09-02  1.881812  1.847360  1.951916  1.873307 -0.311958  ETHUSDT   
2025-09-03  1.896187  1.933253  1.982931  2.036561 -0.546996  ETHUSDT   

            sentiment_score  close_lag_1  close_la

In [4]:
# NOTE(review): this whole cell is a disabled draft for building a
# pytorch_forecasting TimeSeriesDataSet; nothing below is executed.
# Before re-enabling it, check the following against the earlier cells:
#   - `df` is never assigned in the visible cells (they produce `data_fe` and
#     `data_normalized`).
#   - The feature names listed here (close_lag_2, close_lag_3, close_lag_24,
#     EMA_12, EMA_26, SMA_50, ...) differ from the columns normalized earlier
#     (close_lag_1, close_lag_7, ema_10, ema_20) -- TODO confirm which exist.
#   - "hour" is listed as a known real, but the data is fetched with
#     interval="1d".

# import pandas as pd
# from pytorch_forecasting import TimeSeriesDataSet

# # reset the index so that timestamp becomes a regular column again
# df = df.reset_index()

# # build time_idx from timestamp (number of days since the first timestamp)
# df["time_idx"] = (df["timestamp"] - df["timestamp"].min()).dt.days

# # determine the cutoff for the training set
# training_cutoff = df["time_idx"].max() - 30  # hold out the last 30 days for testing

# # training dataset
# training = TimeSeriesDataSet(
#     df[lambda x: x.time_idx <= training_cutoff],
#     time_idx="time_idx",
#     target="close",                 # target to predict
#     group_ids=["symbol"],           # each coin is one group
#     min_encoder_length=24,          # number of look-back steps
#     max_encoder_length=48,          # encoder context
#     min_prediction_length=1,
#     max_prediction_length=30,       # predict the next 30 steps
#     time_varying_known_reals=["hour", "dayofweek", "weekofyear"],
#     time_varying_unknown_reals=[
#         "open", "high", "low", "close", "volume",
#         "sentiment_score", "close_lag_1", "close_lag_2",
#         "close_lag_3", "close_lag_24", "EMA_12", "EMA_26",
#         "SMA_50", "RSI_14", "BBM_20_2", "BBP_20_2",
#         "MACD_12_26_9", "MACDs_12_26_9", "MACDh_12_26_9"
#     ],
#     target_normalizer=None,  # could use GroupNormalizer when there are multiple symbols
#     add_relative_time_idx=True,
#     add_target_scales=True,
#     add_encoder_length=True,
# )

# # validation dataset
# validation = TimeSeriesDataSet.from_dataset(training, df, min_prediction_idx=training_cutoff + 1)

# # create the DataLoaders
# from torch.utils.data import DataLoader

# batch_size = 64
# train_dataloader = DataLoader(training, batch_size=batch_size, shuffle=True, num_workers=4)
# val_dataloader = DataLoader(validation, batch_size=batch_size, shuffle=False, num_workers=4)
