In [1]:
import os
import sys

sys.path.append("..")

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions.categorical import Categorical
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoConfig,
    Qwen2Model,
    Qwen2PreTrainedModel,
)
from transformers.modeling_outputs import CausalLMOutput
# from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer
from trl import GRPOTrainer
from trl.trainer.utils import selective_log_softmax

from src.price_process import PriceProcessor
from src.model import Qwen2ForAction

## Model setup

In [2]:
# Load the pretrained Qwen2.5-0.5B tokenizer and causal-LM checkpoint.
# NOTE(review): `model` is rebound to a Qwen2ForAction built from `config` a few
# cells below, so this checkpoint load looks unused beyond this cell — confirm
# before removing it.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Start from the published Qwen2.5-0.5B configuration; it is customised in the next cell.
config = AutoConfig.from_pretrained("Qwen/Qwen2.5-0.5B")

In [5]:
# Overrides layered on top of the stock config.
# vocab_size=2 shrinks the token embedding to two entries (see the printed
# model: Embedding(2, 896)); prompt_length is read again when slicing the
# sampled actions further down.
config.update(
    dict(
        vocab_size=2,
        num_generations=16,
        prompt_length=5,
        max_length=2048,
    ))

In [6]:
# Build the action model from the customised config, move it to the GPU,
# and display its module tree.
model = Qwen2ForAction(config)
model = model.to("cuda")
model

Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.


Qwen2ForAction(
  (embed): Linear(in_features=2, out_features=896, bias=True)
  (model): Qwen2Model(
    (embed_tokens): Embedding(2, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNo

## Load data and sample actions

In [7]:
# Read the three raw CSV inputs (all GBK-encoded) in one pass:
# P/E data, index prices, and the 10-year bond yield.
pe_df, index_df, bond_df = (
    pd.read_csv(csv_path, encoding="gbk")
    for csv_path in (
        "../data/000300_pe.csv",
        "../data/000300_price.csv",
        "../data/10yr_bond_yield.csv",
    )
)

In [8]:
# Parse the shared "tradeDate" column to datetimes on every frame so the
# later merges and date filters compare like with like.
for frame in (pe_df, index_df, bond_df):
    frame["tradeDate"] = pd.to_datetime(frame["tradeDate"])

In [9]:
# Left join: keep every index row and attach the P/E columns for the same trade date.
# Note this rebinds `index_df`, so re-running the cell merges the P/E columns again.
index_df = index_df.merge(pe_df, how="left", on="tradeDate")

In [10]:
# Keep index rows dated 2013-01-01 or later and show how many remain.
index_from_2013 = index_df["tradeDate"] >= "2013-01-01"
index_df = index_df.loc[index_from_2013]
len(index_df)

2974

In [11]:
# Keep bond-yield rows dated 2013-01-01 or later and show how many remain.
bond_from_2013 = bond_df["tradeDate"] >= "2013-01-01"
bond_df = bond_df.loc[bond_from_2013]
len(bond_df)

3031

In [12]:
# Left join on trade date: every index row is kept; bond rows whose date has no
# matching index row are dropped (the bond frame has more rows than the index frame).
overall_df = index_df.merge(bond_df, how="left", on="tradeDate")

In [13]:
# Wrap the merged frame in the project's PriceProcessor.
# NOTE(review): judging by the next cell's output, the second argument names the
# columns that become the model features ("input_ids") and `extra_cols` are
# returned alongside them unchanged — confirm against src/price_process.py.
pp = PriceProcessor(overall_df, ["EPValue", "yield"],
                    extra_cols=["openIndex", "closeIndex"])

In [14]:
# Build one sample window and display it.
# The displayed result is a dict with 10 entries per column (matching max_len=10);
# later cells read its "input_ids", "attention_mask", "action_mask",
# "openIndex" and "closeIndex" entries.
# NOTE(review): the exact meaning of start_time / prompt_len / pad is defined in
# PriceProcessor, which is not visible here — confirm there.
data = pp(start_time="2025-02-01", prompt_len=5, max_len=10, pad=True)
data

{'tradeDate': [1738713600.0,
  1738800000.0,
  1738886400.0,
  1739145600.0,
  1739232000.0,
  1739318400.0,
  1739404800.0,
  1739491200.0,
  1739750400.0,
  1739836800.0],
 'EPValue': [0.080972,
  0.080192,
  0.079428,
  0.079239,
  0.079239,
  0.078616,
  0.078802,
  0.078247,
  0.078125,
  0.07776],
 'yield': [1.63,
  1.614,
  1.606,
  1.623,
  1.632,
  1.632,
  1.634,
  1.657,
  1.677,
  1.702],
 'openIndex': [3844.6998,
  3789.6086,
  3844.0574,
  3898.1023,
  3905.1594,
  3875.9596,
  3913.9044,
  3900.8274,
  3954.4147,
  3942.5761],
 'closeIndex': [3795.0848,
  3842.8314,
  3892.7028,
  3901.0618,
  3883.1352,
  3919.8617,
  3905.1398,
  3939.0085,
  3947.3983,
  3912.7829],
 'input_ids': tensor([[0.0810, 1.6300],
         [0.0802, 1.6140],
         [0.0794, 1.6060],
         [0.0792, 1.6230],
         [0.0792, 1.6320],
         [0.0786, 1.6320],
         [0.0788, 1.6340],
         [0.0782, 1.6570],
         [0.0781, 1.6770],
         [0.0778, 1.7020]]),
 'attention_mask': ten

In [23]:
# Add a leading batch dimension to the features and mask, move both to the
# GPU, and run a single forward pass.
input_ids, attention_mask = (
    data[field].unsqueeze(0).to("cuda")
    for field in ("input_ids", "attention_mask")
)
output = model(input_ids, attention_mask)

In [24]:
# Keep only the trailing `logits_to_keep` positions: drop the final logit
# first, then take the tail of both the logits and the inputs.
# Note: this rebinds `input_ids` to the sliced tensor.
logits_to_keep = 5
logits = output.logits[:, :-1, :][:, -logits_to_keep:]
input_ids = input_ids[:, -logits_to_keep:]

In [25]:
input_ids

tensor([[[0.0786, 1.6320],
         [0.0788, 1.6340],
         [0.0782, 1.6570],
         [0.0781, 1.6770],
         [0.0778, 1.7020]]], device='cuda:0')

In [17]:
# Build a categorical distribution over the model output and draw 5 independent
# action samples for every position (result has a leading sample dimension of 5).
# NOTE(review): the first positional argument of Categorical is `probs`, not
# `logits`, so `output.logits` is treated as probabilities here. Confirm that
# Qwen2ForAction already returns probabilities; otherwise this should be
# `Categorical(logits=output.logits)`.
dist = Categorical(output.logits)
actions = dist.sample(torch.Size([5]))
actions

tensor([[[1, 1, 0, 1, 0, 0, 1, 0, 0, 0]],

        [[0, 0, 0, 0, 0, 0, 1, 0, 0, 1]],

        [[1, 1, 0, 0, 0, 0, 0, 1, 0, 0]],

        [[0, 0, 0, 0, 0, 1, 0, 1, 0, 1]],

        [[0, 0, 0, 1, 1, 0, 1, 0, 0, 1]]], device='cuda:0')

In [18]:
# Log-probability of each sampled action under `dist`; same shape as `actions`.
log_probs = dist.log_prob(actions)
log_probs

tensor([[[-1.2236, -1.2240, -0.3481, -1.2241, -0.3482, -0.3482, -1.2239,
          -0.3484, -0.3486, -0.3488]],

        [[-0.3484, -0.3482, -0.3481, -0.3482, -0.3482, -0.3482, -1.2239,
          -0.3484, -0.3486, -1.2227]],

        [[-1.2236, -1.2240, -0.3481, -0.3482, -0.3482, -0.3482, -0.3483,
          -1.2235, -0.3486, -0.3488]],

        [[-0.3484, -0.3482, -0.3481, -0.3482, -0.3482, -1.2239, -0.3483,
          -1.2235, -0.3486, -1.2227]],

        [[-0.3484, -0.3482, -0.3481, -1.2241, -1.2239, -0.3482, -1.2239,
          -0.3484, -0.3486, -1.2227]]], device='cuda:0',
       grad_fn=<SqueezeBackward1>)

## Reward computation

In [19]:
# Swap the first two axes so the batch dimension leads and the sample
# dimension comes second. Both names are rebound, so re-running this cell
# swaps the axes back.
actions, log_probs = (
    sampled.permute(1, 0, 2) for sampled in (actions, log_probs)
)

In [20]:
actions

tensor([[[1, 1, 0, 1, 0, 0, 1, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 1, 0, 0, 1],
         [1, 1, 0, 0, 0, 0, 0, 1, 0, 0],
         [0, 0, 0, 0, 0, 1, 0, 1, 0, 1],
         [0, 0, 0, 1, 1, 0, 1, 0, 0, 1]]], device='cuda:0')

In [21]:
# Fetch the per-position action mask, add a batch dimension and move it to the GPU.
# In the displayed sample the mask is 1 on positions 4..8 and 0 elsewhere.
# NOTE(review): presumably 1 marks positions whose action is actually traded — confirm
# against PriceProcessor.
action_mask = data["action_mask"].unsqueeze(0).to("cuda")
action_mask

tensor([[0., 0., 0., 0., 1., 1., 1., 1., 1., 0.]], device='cuda:0')

In [72]:
actions[0]

tensor([[-1, -1, -1, -1,  0,  0,  1,  0,  0, -1],
        [-1, -1, -1, -1,  0,  0,  1,  0,  0, -1],
        [-1, -1, -1, -1,  0,  0,  0,  1,  0, -1],
        [-1, -1, -1, -1,  0,  1,  0,  1,  0, -1],
        [-1, -1, -1, -1,  1,  0,  1,  0,  0, -1]], device='cuda:0')

In [75]:
# Display only: keep the entries of the first batch element where action_mask is 1
# and regroup them into one row per sampled trajectory.
actions[0].masked_select(action_mask[0] == 1).view(actions.size(1), -1)

tensor([[0, 0, 1, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0],
        [0, 1, 0, 1, 0],
        [1, 0, 1, 0, 0]], device='cuda:0')

In [22]:
# Overwrite, in place, every action at a position where the mask is 0 with the
# sentinel -1; the mask gets an extra axis so it broadcasts across samples.
masked_out = action_mask.unsqueeze(1) == 0
actions.masked_fill_(masked_out, -1)
actions

tensor([[[-1, -1, -1, -1,  0,  0,  1,  0,  0, -1],
         [-1, -1, -1, -1,  0,  0,  1,  0,  0, -1],
         [-1, -1, -1, -1,  0,  0,  0,  1,  0, -1],
         [-1, -1, -1, -1,  0,  1,  0,  1,  0, -1],
         [-1, -1, -1, -1,  1,  0,  1,  0,  0, -1]]], device='cuda:0')

In [24]:
# Slice out positions prompt_length-1 .. second-to-last along the sequence axis.
# With prompt_length=5 and a length-10 window this is positions 4..8, i.e. exactly
# the positions where the displayed action_mask is 1.
valid_actions = actions[:, :, config.prompt_length - 1:-1]

In [25]:
valid_actions

tensor([[[0, 0, 1, 0, 0],
         [0, 0, 1, 0, 0],
         [0, 0, 0, 1, 0],
         [0, 1, 0, 1, 0],
         [1, 0, 1, 0, 0]]], device='cuda:0')

In [42]:
# Toy example of run detection: pad the 0/1 indicator with a zero on each side
# and difference it, so +1 marks the start of a run of 1s and -1 marks the
# first position after the run.
a = torch.tensor([0, 1, 1, 1, 0, 1, 0])
is_one = (a == 1).to(torch.int32)
zero_pad = torch.tensor([0])
diff = torch.diff(is_one, prepend=zero_pad, append=zero_pad)
diff

tensor([ 0,  1,  0,  0, -1,  1, -1,  0])

In [45]:
torch.where(diff == -1)[0]

tensor([4, 6])

In [63]:
def compute_yield(
        actions: torch.Tensor,  # (N, S)
        open_price: torch.Tensor,  # (S)
        close_price: torch.Tensor,  # (S)
        slippage: float = 0.01,
        stamps: float = 0.0,
        service_fee: float = 1.3e-4,
        assets: float = 2e5):
    """Simulate each action sequence as a long-only strategy and return its yield.

    A maximal run of consecutive 1s in a row of ``actions`` is one holding
    period: the position is opened at the open price of the first step of the
    run and closed at the open price of the first step after the run. A run
    that lasts until the final step is valued at the close price of that
    final step, with no slippage or fees applied. Any value other than 1
    (including a -1 padding sentinel) counts as "not holding".

    Args:
        actions: Integer tensor of shape (N, S), one row per trajectory.
        open_price: Open prices indexed by step; a tensor or any indexable
            sequence of floats with at least S entries.
        close_price: Close prices indexed by step, same layout as
            ``open_price``.
        slippage: Absolute price penalty added when buying and subtracted
            when selling.
        stamps: Proportional charge applied on sells only.
        service_fee: Proportional charge applied on both buys and sells.
        assets: Starting cash for every trajectory.

    Returns:
        Float tensor of shape (N,) on ``actions.device`` holding
        ``(final_assets - assets) / assets`` per trajectory.

    Raises:
        ValueError: If ``actions`` is not 2-D.
    """
    if actions.dim() != 2:
        raise ValueError("actions must be 2-D (num_generations, span_len), "
                         f"got shape {tuple(actions.shape)}")

    num_generations, span_len = actions.shape

    # Pad the holding indicator with a zero on both sides before differencing:
    # +1 then marks the step where a run starts and -1 the step right after it
    # ends (which equals span_len when the run reaches the last step).
    # The padding uses the same integer dtype as the indicator so the result
    # stays integral.
    holding = actions.eq(1).to(torch.int64)
    edge = holding.new_zeros((num_generations, 1))
    diff = torch.diff(holding, dim=1, prepend=edge, append=edge)

    yields = torch.zeros(num_generations, device=actions.device)

    for i in range(num_generations):

        total_assets = assets

        # The zero padding guarantees every start has a matching end.
        start = torch.where(diff[i] == 1)[0]
        end = torch.where(diff[i] == -1)[0]

        for start_idx, end_idx in zip(start.tolist(), end.tolist()):
            # Buy as many whole units as the cash allows; keep the remainder.
            bid_rate = (open_price[start_idx] + slippage) * (1 + service_fee)
            shares = total_assets // bid_rate
            total_assets = total_assets % bid_rate

            if end_idx < span_len:
                ask_rate = (open_price[end_idx] -
                            slippage) * (1 - service_fee - stamps)
            else:
                # Still holding after the last step: mark to market at the
                # close of the last step of the action window. Indexing from
                # the front keeps this aligned with the open_price lookups
                # even when the price series is longer than the window.
                ask_rate = close_price[span_len - 1]

            total_assets += shares * ask_rate

        yields[i] = (total_assets - assets) / assets

    return yields

In [64]:
# Score the five sampled trajectories of the first batch element.
# valid_actions covers window positions 4..8 while the price slices cover the last
# five positions (5..9), so the action taken at position t is priced at position t+1.
# NOTE(review): presumably intentional (decide on day t, trade at the next open) — confirm.
compute_yield(
    valid_actions[0],
    data["openIndex"][-5:],
    data["closeIndex"][-5:],
)

tensor([ 0.0134,  0.0134, -0.0032, -0.0068,  0.0228], device='cuda:0')

In [65]:
valid_actions

tensor([[[0, 0, 1, 0, 0],
         [0, 0, 1, 0, 0],
         [0, 0, 0, 1, 0],
         [0, 1, 0, 1, 0],
         [1, 0, 1, 0, 0]]], device='cuda:0')

In [66]:
data["openIndex"][-5:]

[3875.9596, 3913.9044, 3900.8274, 3954.4147, 3942.5761]

In [50]:
data

{'tradeDate': [1738713600.0,
  1738800000.0,
  1738886400.0,
  1739145600.0,
  1739232000.0,
  1739318400.0,
  1739404800.0,
  1739491200.0,
  1739750400.0,
  1739836800.0],
 'EPValue': [0.080972,
  0.080192,
  0.079428,
  0.079239,
  0.079239,
  0.078616,
  0.078802,
  0.078247,
  0.078125,
  0.07776],
 'yield': [1.63,
  1.614,
  1.606,
  1.623,
  1.632,
  1.632,
  1.634,
  1.657,
  1.677,
  1.702],
 'openIndex': [3844.6998,
  3789.6086,
  3844.0574,
  3898.1023,
  3905.1594,
  3875.9596,
  3913.9044,
  3900.8274,
  3954.4147,
  3942.5761],
 'closeIndex': [3795.0848,
  3842.8314,
  3892.7028,
  3901.0618,
  3883.1352,
  3919.8617,
  3905.1398,
  3939.0085,
  3947.3983,
  3912.7829],
 'input_ids': tensor([[0.0810, 1.6300],
         [0.0802, 1.6140],
         [0.0794, 1.6060],
         [0.0792, 1.6230],
         [0.0792, 1.6320],
         [0.0786, 1.6320],
         [0.0788, 1.6340],
         [0.0782, 1.6570],
         [0.0781, 1.6770],
         [0.0778, 1.7020]]),
 'attention_mask': ten

## GRPO trainer

In [None]:
from typing import Any, Union

from accelerate.utils.operations import gather
from trl import GRPOTrainer
from trl.extras.profiling import profiling_context


class ActionGRPOTrainer(GRPOTrainer):
    """GRPO trainer whose "completions" are discrete actions sampled from the model.

    Instead of scoring generated text, each step samples ``num_generations``
    action sequences from the categorical distribution defined by the model's
    per-position logits, hands those actions to the reward functions, and
    turns the group-normalised rewards into advantages.

    NOTE(review): ``_get_per_token_logps`` returns a ``(log_probs, actions)``
    tuple. Any inherited code path that calls it and expects a single tensor
    (for example the loss computation) has to be adapted — verify against the
    installed TRL version.
    """

    def _get_per_token_logps(self, model, input_ids, attention_mask,
                             logits_to_keep, actions=None):
        """Sample actions (or score given ones) under ``model``.

        Args:
            model: Model called with ``input_ids``, ``attention_mask`` and
                ``logits_to_keep``; its output must expose ``.logits``.
            input_ids: Batched model inputs, first dimension B.
            attention_mask: Mask passed through to the model.
            logits_to_keep: Number of trailing positions S to score.
            actions: Optional tensor of shape (B * N, S) with previously
                sampled actions. When given, these actions are scored instead
                of drawing new ones, so log-probabilities from different
                models refer to the same actions.

        Returns:
            Tuple ``(log_probs, actions)``, both of shape (B * N, S), where
            N is ``self.num_generations`` when sampling.
        """
        logits = model(input_ids=input_ids,
                       attention_mask=attention_mask,
                       logits_to_keep=logits_to_keep + 1).logits
        # Drop the last position, then keep the trailing `logits_to_keep` steps.
        logits = logits[:, :-1, :]
        logits = logits[:, -logits_to_keep:]
        logits = logits / self.temperature

        # Same distribution as softmax + Categorical(probs), but normalised in
        # log space, which is the numerically safer route.
        dist = Categorical(logits=logits)

        if actions is None:
            sampled = dist.sample(torch.Size([self.num_generations]))
        else:
            # (B * N, S) -> (N, B, S) so it broadcasts against the (B, S) batch
            # shape of `dist`.
            sampled = actions.reshape(logits.size(0), -1,
                                      logits_to_keep).permute(1, 0, 2)
        log_probs = dist.log_prob(sampled)  # (N, B, S)

        # (B * N, S). `reshape` rather than `view`: the permuted tensors are
        # not contiguous once both B and N exceed 1, and `view` would raise.
        actions = sampled.permute(1, 0, 2).reshape(-1, logits_to_keep)
        log_probs = log_probs.permute(1, 0, 2).reshape(-1, logits_to_keep)

        return log_probs, actions

    def _generate_and_score_completions(
        self, inputs: dict[str, Union[torch.Tensor, Any]]
    ) -> dict[str, Union[torch.Tensor, Any]]:
        """Sample actions for a batch, score them, and compute advantages.

        Args:
            inputs: Dict of batched columns. Must contain ``prompt_ids``,
                ``prompt_mask``, ``completion_ids`` and ``completion_mask``;
                every column whose key does not start with ``"prompt"`` is
                also forwarded to the reward functions as a keyword argument.

        Returns:
            Dict with the de-duplicated prompt/completion tensors, the
            sampled ``actions``, ``old_per_token_logps`` and
            ``ref_per_token_logps`` (each possibly ``None``), and the
            per-sample ``advantages`` for this process.

        Raises:
            NotImplementedError: If a reward function is an ``nn.Module``.
        """
        device = self.accelerator.device

        # Take every num_generations-th row, i.e. one row per group.
        prompt_ids = inputs["prompt_ids"][::self.num_generations]
        prompt_mask = inputs["prompt_mask"][::self.num_generations]
        completion_ids = inputs["completion_ids"][::self.num_generations]
        completion_mask = inputs["completion_mask"][::self.num_generations]

        if self.max_prompt_length is not None:
            prompt_ids = prompt_ids[:, -self.max_prompt_length:]
            prompt_mask = prompt_mask[:, -self.max_prompt_length:]

        prompt_completion_ids = torch.concat([prompt_ids, completion_ids],
                                             dim=1)
        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
        logits_to_keep = completion_ids.size(1)

        with torch.no_grad():
            # This call is what samples the actions, so it always runs. When
            # num_iterations <= 1 only the actions are kept and the old
            # log-probs are discarded.
            old_per_token_logps, actions = self._get_per_token_logps(
                self.model, prompt_completion_ids, attention_mask,
                logits_to_keep)
            if self.num_iterations <= 1:
                old_per_token_logps = None

            # The reference log-probs must be evaluated on the actions the
            # policy just sampled, not on a fresh draw from the reference.
            if self.beta == 0.0:
                ref_per_token_logps = None
            elif self.ref_model is not None:
                ref_per_token_logps, _ = self._get_per_token_logps(
                    self.ref_model,
                    prompt_completion_ids,
                    attention_mask,
                    logits_to_keep,
                    actions=actions)
            else:
                with self.accelerator.unwrap_model(
                        self.model).disable_adapter():
                    ref_per_token_logps, _ = self._get_per_token_logps(
                        self.model,
                        prompt_completion_ids,
                        attention_mask,
                        logits_to_keep,
                        actions=actions)

        rewards_per_func = torch.zeros(len(actions),
                                       len(self.reward_funcs),
                                       device=device)  # (B * N, 1)
        for i, (reward_func, reward_processing_class) in enumerate(
                zip(self.reward_funcs, self.reward_processing_classes)):
            if isinstance(
                    reward_func, nn.Module
            ):  # Module instead of PretrainedModel for compat with compiled models
                reward_func_name = f"reward {reward_func.config._name_or_path.split('/')[-1]}"
            else:
                reward_func_name = reward_func.__name__
            with profiling_context(self, reward_func_name):
                if isinstance(
                        reward_func, nn.Module
                ):  # Module instead of PretrainedModel for compat with compiled models
                    raise NotImplementedError()
                else:
                    # `inputs` is a dict of batched columns (it is indexed by
                    # key above), so forward every non-"prompt" column as-is.
                    reward_kwargs = {
                        key: value
                        for key, value in inputs.items()
                        if not key.startswith("prompt")
                    }
                    output_reward_func = reward_func(
                        actions=actions,
                        num_generations=self.num_generations,
                        **reward_kwargs,
                    )
                    # Convert None values to NaN
                    output_reward_func = [
                        reward if reward is not None else torch.nan
                        for reward in output_reward_func
                    ]

                    rewards_per_func[:, i] = torch.tensor(output_reward_func,
                                                          dtype=torch.float32,
                                                          device=device)

        # Gather the reward per function: this part is crucial, because the rewards are normalized per group and the
        # completions may be distributed across processes
        rewards_per_func = gather(rewards_per_func)  # (B * N, 1)

        # Apply weights to each reward function's output and sum
        rewards = (rewards_per_func *
                   self.reward_weights.to(device).unsqueeze(0)).nansum(dim=1)

        # Compute grouped-wise rewards
        mean_grouped_rewards = rewards.view(-1,
                                            self.num_generations).mean(dim=1)
        std_grouped_rewards = rewards.view(-1, self.num_generations).std(dim=1)

        # Normalize the rewards to compute the advantages
        mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(
            self.num_generations, dim=0)
        std_grouped_rewards = std_grouped_rewards.repeat_interleave(
            self.num_generations, dim=0)
        advantages = rewards - mean_grouped_rewards
        if self.args.scale_rewards:
            advantages = advantages / (std_grouped_rewards + 1e-4)

        # Slice to keep only the local part of the data
        process_slice = slice(
            self.accelerator.process_index * len(actions),
            (self.accelerator.process_index + 1) * len(actions),
        )
        advantages = advantages[process_slice]

        # Log the metrics
        mode = "eval" if self.control.should_evaluate else "train"

        if mode == "train":
            self._total_train_tokens += self.accelerator.gather_for_metrics(
                attention_mask.sum()).sum().item()
        self._metrics[mode]["num_tokens"] = [self._total_train_tokens]

        completion_length = self.accelerator.gather_for_metrics(
            completion_mask.sum(1)).float().mean().item()
        self._metrics[mode]["completion_length"].append(completion_length)

        # Calculate mean reward per function, but only for samples where the function was applied
        for i, reward_func in enumerate(self.reward_funcs):
            if isinstance(
                    reward_func, nn.Module
            ):  # Module instead of PretrainedModel for compat with compiled models
                reward_func_name = reward_func.config._name_or_path.split(
                    "/")[-1]
            else:
                reward_func_name = reward_func.__name__
            # Only calculate mean for samples where this reward function was applied (non-NaN values)
            mean_rewards = torch.nanmean(rewards_per_func[:, i]).item()
            self._metrics[mode][f"rewards/{reward_func_name}"].append(
                mean_rewards)
        self._metrics[mode]["reward"].append(rewards.mean().item())
        self._metrics[mode]["reward_std"].append(
            std_grouped_rewards.mean().item())

        return {
            "prompt_ids": prompt_ids,
            "prompt_mask": prompt_mask,
            "completion_ids": completion_ids,
            "completion_mask": completion_mask,
            # Sampled actions the advantages refer to; needed to re-score the
            # same actions when computing the loss.
            "actions": actions,
            "old_per_token_logps": old_per_token_logps,
            "ref_per_token_logps": ref_per_token_logps,
            "advantages": advantages,
        }


In [9]:
# Scratch tensor for the two experiments below. Note this rebinds `a`, which the
# earlier run-detection demo also uses.
a = torch.tensor([0, 1, 1, 1, 0, 1, 1, 0, 0])

In [10]:
# Unpadded difference: one element shorter than `a`, so a run of 1s that starts at
# index 0 or ends at the last index would leave no +1 / -1 marker.
a.diff()

tensor([ 1,  0,  0, -1,  1,  0, -1,  0])

In [5]:
# Element at row 1, column 2 of the 3x3 view, i.e. flat index 5 of `a`.
a.view(3, 3)[1, 2]

tensor(1)