In [10]:
import torch
import gymnasium as gym
import numpy as np
from transformers import DecisionTransformerModel
import gymnasium as gym
import os
import pandas as pd
from finrl.main import check_and_make_directories
from finrl.config import INDICATORS, TRAINED_MODEL_DIR, RESULTS_DIR
from finrl.meta.env_stock_trading.env_stocktrading import StockTradingEnv
from stable_baselines3 import PPO
from finrl.agents.stablebaselines3.models import DRLAgent


check_and_make_directories([TRAINED_MODEL_DIR])

In [11]:
device = torch.device('cpu')
torch.cuda.current_device()

0

In [12]:
train = pd.read_csv('data/train.csv')
train = train.set_index(train.columns[0])
train.index.names =['']

trade = pd.read_csv('data/trade.csv')
trade = trade.set_index(trade.columns[0])
trade.index.names = ['']

In [13]:
stock_dimension = len(trade.tic.unique())
state_space = 1 + 2*stock_dimension + len(INDICATORS)*stock_dimension
print(f"Stock Dimension: {stock_dimension}, State Space: {state_space}")

Stock Dimension: 29, State Space: 291


In [14]:
buy_cost_list = sell_cost_list = [0.005] * stock_dimension
num_stock_shares = [0] * stock_dimension

env_kwargs = {
    "hmax": 100,
    "initial_amount": 1000000,
    "num_stock_shares": num_stock_shares,
    "buy_cost_pct": buy_cost_list,
    "sell_cost_pct": sell_cost_list,
    "state_space": state_space,
    "stock_dim": stock_dimension,
    "tech_indicator_list": INDICATORS,
    "action_space": stock_dimension,
    "reward_scaling": 1e-4
}

e_trade_gym = StockTradingEnv(df=trade, **env_kwargs)
env, obs = e_trade_gym.get_sb_env()

Here `scale=1000.0` is the normalization factor used training to scale return-to-go. The same value used in the data collator (`DecisionTransformerGymCollator`). It also ensures consistent scaling between training and evalution.

In [None]:
scale = 1000.0

Target performance goal for the Decision Transformer (3.6 normalized units). Represents a desired cumulative return of 3600, and the DT will condition its action on achieving this target return. This is the key advantage of Decision Transformers - you can specify different performance goals without retraining.

In [None]:
TARGET_RETURN = 3600 / scale

We have the state space dimension, extract the observation space size from the trading environment, and ensure compatbility between the model and environment.

In [None]:
state_dim = env.observation_space.shape[0]

Here we have the action space dimension, which extracts the action space size from the trading enviroment. This matches the number of stocks being traded.

In [None]:
act_dim = env.action_space.shape[0]

In [16]:
state_dim, act_dim

(291, 29)

In [17]:
from datasets import load_from_disk

dataset = load_from_disk('data/dataset/')

state_mean = dataset['state_mean']
state_std = dataset['state_std']

state_mean = state_mean[:state_dim]
state_std = state_std[:state_dim]

state_mean = torch.Tensor(state_mean).to(device)
state_std = torch.Tensor(state_std).to(device)

In [18]:
# Create the decision transformer model
model = DecisionTransformerModel.from_pretrained('trained_models')
model = model.to(device)

print(list(model.encoder.parameters()))

[Parameter containing:
tensor([[ 2.4312e-02, -1.1555e-02,  3.6968e-04, -1.1414e-03,  9.4329e-03,
         -3.2814e-03,  4.4173e-03, -1.3393e-03,  9.3231e-04, -1.9283e-02,
         -2.3111e-02, -9.4715e-03,  3.0378e-02, -1.1570e-02,  2.2246e-02,
         -2.5608e-02, -9.2927e-03,  2.6140e-02, -2.3734e-02, -2.1007e-02,
         -1.2606e-02,  1.9693e-02,  2.7130e-03, -1.4290e-02,  1.5797e-02,
          7.5824e-03, -2.6650e-02, -7.8964e-03, -1.7305e-02,  1.8385e-02,
         -3.0966e-03,  2.9283e-03,  1.0971e-02, -1.6951e-02,  6.4305e-03,
         -2.6300e-02, -6.6441e-03,  2.1342e-04,  1.3612e-02, -1.0341e-02,
         -3.2291e-02,  1.9063e-02, -9.6947e-03, -2.3142e-02,  1.2474e-02,
         -1.6121e-02,  2.7389e-02,  5.3831e-03,  1.0579e-02, -2.9774e-02,
          5.0675e-03,  8.3857e-03, -1.0057e-02, -1.2753e-02,  2.0177e-03,
          4.1141e-03,  6.2496e-03,  1.5456e-02, -1.4733e-02, -1.8696e-02,
         -9.0569e-03, -1.1911e-03,  5.5717e-02,  2.9336e-02,  2.8145e-02,
         -1.364

### Prediction


Here we are defining the `get_action` function, where is the core inference function for the Decision Transformer during evaluation. It performs autoregression action prediction using a sliding window approach.

This function takes the current trading history and predicts the next action the Decision Transformer should take, conditioned on the desired return-to-go target.

The autoregressive generation uses the previous 20 timesteps to predict the next action. Each prediction is based on recent history and target return.

Actions are conditioned on the target performance and the model learns to achieve the specified return target.

#### Input Reshaping (Line 5-8)

* Reshape all inputs to batch format: `(batch_size=1, sequences_length, feature_dim)`
* Prepares data for the transformer model

#### Sliding Window (Lines 12-16)

* Takes only the last `max_length` timesteps (typically 20)
* Creates a sliding window of recent history
* Calcualtes how much padding is needed

#### Padding (Lines 19-24)

* Pads sequences to a fixed length
* Adds zeros at the beginning for padding
* Creates attention mask: 0 for padding, 1 for real data

#### Model Prediction (Lines 26-32)

* Feeds the padded sequence to the Decision Transformer
* Model processes the sequence through its attention layers
* Returns predictions for states, actions, return-to-go

#### Extract Action (Lines 34)
* Returns only the last predicted action `([0,1])`
* This is the action for the current timestep
* Ignores predictions for previous timesteps

In [None]:
# Function that gets an action from the model using autoregressive prediction with a window of the previous 20 timesteps.
def get_action(model, states, actions, rewards, returns_to_go, timesteps):
    # This implementation does not condition on past rewards

    # Reshape inputs to batch format
    states = states.reshape(1, -1, model.config.state_dim)
    actions = actions.reshape(1, -1, model.config.act_dim)
    returns_to_go = returns_to_go.reshape(1, -1, 1)
    timesteps = timesteps.reshape(1, -1)

    #Sliding Window 
    states = states[:, -model.config.max_length :]
    actions = actions[:, -model.config.max_length :]
    returns_to_go = returns_to_go[:, -model.config.max_length :]
    timesteps = timesteps[:, -model.config.max_length :]
    padding = model.config.max_length - states.shape[1]
    
    # pad all tokens to sequence length
    attention_mask = torch.cat([torch.zeros(padding), torch.ones(states.shape[1])]).to(device=device)
    attention_mask = attention_mask.to(dtype=torch.long).reshape(1, -1)
    states = torch.cat([torch.zeros((1, padding, model.config.state_dim)), states], dim=1).float().to(device=device)
    actions = torch.cat([torch.zeros((1, padding, model.config.act_dim)), actions], dim=1).float().to(device=device)
    returns_to_go = torch.cat([torch.zeros((1, padding, 1)), returns_to_go], dim=1).float().to(device=device)
    timesteps = torch.cat([torch.zeros((1, padding), dtype=torch.long), timesteps], dim=1).to(device=device)

    state_preds, action_preds, return_preds = model(
        states=states,
        actions=actions,
        rewards=rewards,
        returns_to_go=returns_to_go,
        timesteps=timesteps,
        attention_mask=attention_mask,
        return_dict=False,
    )

    return action_preds[0, -1]

In [20]:
max_steps = e_trade_gym.df.tic.count() - 1
max_steps

np.int64(13310)

### Evaluation

This is the main evluation loop for the Decision Transformer, where it interacts with the trading environment to test the model's performance.

#### Initialization (Lines 4-17)

* Resets environment and initializes tracking variables
* Sets targets return (the performance goal the DT should achieve)
* Initializes empty tensors for states, actions, rewards, and timesteps
* Prepares data structures for the autoregressive generation

#### Action Prediction (Lines 20-32)

* Adds placeholder for current action and reward
* Calls the Decision Transformer to predict next trading action
* uses normalized states (subtracts mean, divides by std)

#### Environment Interaction (Lines 35-40)
* Executes the predicted action in the trading environment
* Receives new state and reward from the environment
* Updates the state and reward history

#### Target Return Updates (Lines 42-44)
* Updates the target return by subtracting the achieved reward
* Maintains the remaining target for future timesteps
* Updates timesteps counter

In [21]:
# Interact with the environment
#max_ep_len = 1000

episode_return, episode_length = 0, 0
state = env.reset()
target_return = torch.tensor(TARGET_RETURN, device=device, dtype=torch.float32).reshape(1, 1)
states = torch.from_numpy(state).reshape(1, state_dim).to(device=device, dtype=torch.float32)
actions = torch.zeros((0, act_dim), device=device, dtype=torch.float32)
rewards = torch.zeros(0, device=device, dtype=torch.float32)

timesteps = torch.tensor(0, device=device, dtype=torch.long).reshape(1, 1)

#max_steps = e_trade_gym.df.tic.count() - 1
max_steps = len(e_trade_gym.df.index.unique()) - 1

account_memory = None  # This help avoid unnecessary list creation
actions_memory = None  # optimize memory consumption

for t in range(max_steps+1):
    actions = torch.cat([actions, torch.zeros((1, act_dim), device=device)], dim=0)
    rewards = torch.cat([rewards, torch.zeros(1, device=device)])

    action = get_action(
        model,
        (states - state_mean) / state_std,
        actions,
        rewards,
        target_return,
        timesteps,
    )

    actions[-1] = action
    actions_numpy = actions.detach().cpu().numpy()

    #state, reward, done, truncated, info = env.step(actions_numpy)
    state, reward, done, _ = env.step(actions_numpy)

    cur_state = torch.from_numpy(state).to(device=device).reshape(1, state_dim)
    states = torch.cat([states, cur_state], dim=0)
    rewards[-1] = torch.from_numpy(reward).to(device=device)

    pred_return = target_return[0, -1] - (reward / scale)
    target_return = torch.cat([target_return, pred_return.reshape(1, 1)], dim=1)
    timesteps = torch.cat([timesteps, torch.ones((1, 1), device=device, dtype=torch.long) * (t + 1)], dim=1)

    episode_return += reward
    episode_length += 1

    if (t == max_steps - 1):  # more descriptive condition for early termination to clarify the logic
        # Call instance methods of vectorized environments
        # https://stable-baselines3.readthedocs.io/en/master/guide/vec_envs.html
        account_memory = env.env_method(method_name="save_asset_memory")
        actions_memory = env.env_method(method_name="save_action_memory")

    if done[0]:# or truncated:
        break

  pred_return = target_return[0, -1] - (reward / scale)
  pred_return = target_return[0, -1] - (reward / scale)


In [22]:
df_account_value_dt, df_actions_dt = account_memory[0], actions_memory[0]

In [23]:
df_account_value_dt.shape, df_actions_dt.shape

((459, 2), (458, 29))

In [24]:
df_account_value_dt.head()

Unnamed: 0,date,account_value
0,2022-07-01,1000000.0
1,2022-07-05,999934.7
2,2022-07-06,999681.8
3,2022-07-07,1000475.0
4,2022-07-08,1000142.0


In [25]:
df_account_value_dt.tail()

Unnamed: 0,date,account_value
454,2024-04-23,1256648.0
455,2024-04-24,1255384.0
456,2024-04-25,1245193.0
457,2024-04-26,1252811.0
458,2024-04-29,1266686.0


In [26]:
df_actions_dt.head()

Unnamed: 0_level_0,AAPL,AMGN,AXP,BA,CAT,CRM,CSCO,CVX,DIS,GS,...,MRK,MSFT,NKE,PG,TRV,UNH,V,VZ,WBA,WMT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-07-01,11,28,0,0,0,19,0,3,0,22,...,0,0,1,4,4,0,1,2,13,0
2022-07-05,11,28,0,0,0,19,0,3,0,22,...,0,0,1,4,4,0,1,2,13,0
2022-07-06,11,28,0,0,0,19,0,3,0,22,...,0,0,1,4,4,0,1,2,13,0
2022-07-07,11,28,0,0,0,19,0,3,0,22,...,0,0,1,4,4,0,1,2,13,0
2022-07-08,11,28,0,0,0,19,0,3,0,22,...,0,0,1,4,4,0,1,2,13,0


In [27]:
df_actions_dt.tail()

Unnamed: 0_level_0,AAPL,AMGN,AXP,BA,CAT,CRM,CSCO,CVX,DIS,GS,...,MRK,MSFT,NKE,PG,TRV,UNH,V,VZ,WBA,WMT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-04-22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024-04-23,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024-04-24,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024-04-25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024-04-26,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
df_account_value_dt.to_pickle('data/df_account_value_dt.pkl')
df_actions_dt.to_pickle('data/df_actions_dt.pkl')

In [29]:
trained_PPO = PPO.load('trained_models/agent_ppo.zip')



In [30]:
df_account_value_ppo, df_actions_ppo = DRLAgent.DRL_prediction(
    model=trained_PPO,
    environment=e_trade_gym
)

hit end!


In [31]:
df_account_value_ppo.to_pickle('data/df_account_value_ppo.pkl')
df_actions_ppo.to_pickle('data/df_actions_ppo.pkl')