In [1]:
import sys

sys.path.append("..")

import torch
import pandas as pd
from datasets import Dataset, load_dataset
from transformers import AutoConfig
from trl import GRPOConfig

from src.price_process import PriceProcessor
from src.model import Qwen2ForAction
from src.trainer import ActionGRPOTrainer
from src.rewards import compute_yield_policy1

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


## data

In [2]:
# Read the three GBK-encoded CSV inputs in one pass: the 000300 PE file,
# the 000300 price file and the 10-year bond yield file (in that order).
pe_df, index_df, bond_df = [
    pd.read_csv(csv_path, encoding="gbk")
    for csv_path in (
        "../data/000300_pe.csv",
        "../data/000300_price.csv",
        "../data/10yr_bond_yield.csv",
    )
]

In [3]:
# Parse the `tradeDate` column of every frame into pandas datetimes so the
# frames can be filtered and merged on real dates.
pe_df, index_df, bond_df = (
    frame.assign(tradeDate=pd.to_datetime(frame["tradeDate"]))
    for frame in (pe_df, index_df, bond_df)
)

In [4]:
# Attach the PE columns to the price rows; a left join keeps every price row
# even when no PE record exists for that trade date.
index_df = pd.merge(index_df, pe_df, how="left", on="tradeDate")

In [5]:
# Keep only rows dated 2013-01-01 or later, then display the remaining row count.
index_df = index_df.loc[index_df["tradeDate"] >= "2013-01-01", :]
len(index_df)

2974

In [6]:
# Apply the same 2013-01-01 cut-off to the bond yield rows and display the count.
bond_df = bond_df.loc[bond_df["tradeDate"] >= "2013-01-01", :]
len(bond_df)

3031

In [7]:
# Combine index rows with bond rows on trade date. The left join keeps every
# index row; bond rows whose date has no index row are dropped.
overall_df = pd.merge(index_df, bond_df, how="left", on="tradeDate")

In [8]:
# Window sizes passed to PriceProcessor (project class from src.price_process).
prompt_len = 8
max_len = 16
# NOTE(review): the first list looks like the feature columns and the second
# like the price columns — confirm against PriceProcessor's signature.
pp = PriceProcessor(overall_df, ["EPValue", "yield"],
                    ["openIndex", "closeIndex"],
                    prompt_len=prompt_len,
                    max_len=max_len)

In [16]:
# Build the rolling dataset with a window of `max_len` and a hard-coded step of 13.
# NOTE(review): presumably `step` is the offset between consecutive windows and
# 13 was chosen empirically — the reason is not stated, confirm.
dataset = pp.rolling(window=max_len, step=13)

In [17]:
# Rename the raw index columns to `open_price` / `close_price`.
dataset = dataset.rename_columns({
    "openIndex": "open_price",
    "closeIndex": "close_price"
})
# Tensor formatting is currently disabled; re-enable it if a later step needs
# torch tensors from the dataset.
# dataset.set_format("pt")

In [15]:
# Request torch formatting explicitly so `.shape` works on a fresh kernel: the
# in-place `dataset.set_format("pt")` call is commented out, so without this the
# column would not come back as a tensor.
# NOTE(review): assumes `dataset` is a Hugging Face `datasets.Dataset` — confirm.
dataset.with_format("pt")["close_price"].shape

torch.Size([228, 8])

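## model

> Note: the execution counts restart below (`In [2]`), so these cells appear to come from a separate kernel run (TODO confirm). They load a pre-built parquet dataset and redefine `dataset`, `prompt_len` and `max_len` instead of reusing the values from the data section.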

In [2]:
# Load the pre-built parquet file; a single data file is exposed as the
# "train" split, which is requested directly here.
dataset = load_dataset(
    "parquet",
    data_files="../data/p1_32_512.parquet",
    split="train",
)

In [3]:
# Hold out 10% of the samples as the "test" split; the fixed seed keeps the
# split reproducible across runs.
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)

In [4]:
# Start from the published Qwen/Qwen2.5-0.5B configuration (configuration only;
# this call does not load model weights).
config = AutoConfig.from_pretrained("Qwen/Qwen2.5-0.5B")

In [5]:
# Sequence budget for training: prompt length and total length. The trainer
# config uses `max_len - prompt_len` as the completion length.
prompt_len, max_len = 32, 512

In [6]:
# Override the base configuration in place: a 2-entry vocabulary and 12 hidden
# layers. The commented-out keys are overrides that are currently NOT applied.
config.update({
    # "hidden_size": 2,
    "vocab_size": 2,
    "num_hidden_layers": 12,
    # "num_generations": 16,
    # "max_length": max_len,
    # "num_attention_heads": 1,
})

In [7]:
# Build the action model from the customised config and place it on the GPU
# when one is available; fall back to CPU so the cell does not raise on a host
# without CUDA.
model = Qwen2ForAction(config).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.


In [8]:
# GRPO training arguments, grouped by concern.
trainer_config = GRPOConfig(
    # --- generation / GRPO settings ---
    use_vllm=False,
    num_generations=16,
    num_iterations=1,
    temperature=1,
    beta=0.0,
    max_prompt_length=prompt_len,
    # The completion fills whatever is left of the total length after the prompt.
    max_completion_length=max_len - prompt_len,
    # --- batching and schedule ---
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- logging, evaluation and checkpointing cadence ---
    logging_strategy="steps",
    logging_steps=1,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=1,
    # --- best-model selection: reload the checkpoint with the highest eval_reward ---
    metric_for_best_model="eval_reward",
    greater_is_better=True,
    load_best_model_at_end=True,
)

In [9]:
# Wire the model, the train/test splits, the training arguments and the reward
# function into the project's GRPO trainer.
trainer = ActionGRPOTrainer(
    model,
    train_dataset=dataset_split["train"],
    eval_dataset=dataset_split["test"],
    args=trainer_config,
    reward_funcs=compute_yield_policy1,
)

In [10]:
# Run training; the returned TrainOutput is displayed as the cell output.
trainer.train()

Step,Training Loss,Validation Loss
50,0.0,1e-06
100,-0.0,0.0
150,-0.0,0.0
200,0.0,1e-06
250,-0.0,1e-06


TrainOutput(global_step=270, training_loss=1.0087902199346282e-07, metrics={'train_runtime': 142.62, 'train_samples_per_second': 1.893, 'train_steps_per_second': 1.893, 'total_flos': 0.0, 'train_loss': 1.0087902199346282e-07})

In [11]:
# Best value recorded for the model-selection metric (configured as
# "eval_reward" in the trainer arguments).
trainer.state.best_metric

0.02358661045230204