In [10]:
import os
import pandas as pd
import gymnasium as gym

from finrl.main import check_and_make_directories
from finrl.main import INDICATORS, TRAINED_MODEL_DIR, RESULTS_DIR

from stable_baselines3 import PPO
from finrl.agents.stablebaselines3.models import DRLAgent
from stable_baselines3.common.logger import configure

check_and_make_directories([TRAINED_MODEL_DIR])

### Why The Offline Approach

1. **Offline Learning**: Decision Transformers are designed for offline RL - theylearn from existing trajectories rather than interacting with the enviroment during training.
2. **Expert Demonstrations**: The PPO model serves as an "expert" that provides high-quality trading trajectories. The DT learns to mimic this expert behavior.
3. **Conditional Generation**: Unlike PPO which learns a policy directly, the DT learns to generate actions conditioned on:

* Current states
* Desired returns-to-go (future performance targets)
* Timesteps

4. **Flexibility**: Once trained, the DT can generate actions for different return targets without retraining, while PPO is fixed to its learned policy.

In [2]:
train = pd.read_csv('data/train.csv')

train = train.set_index(train.columns[0])
train.index.names = ['']

In [2]:
import pickle

with open('data/train.pickle', 'rb') as f:
    train = pickle.load(f)

with open('data/trade.pickle', 'rb') as f:
    trade = pickle.load(f)

In [4]:
train.head()

Unnamed: 0,date,close,high,low,open,volume,tic,day,macd,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma,vix,turbulence,cov_list,return_list
0,2010-01-04,19.891676,20.101397,19.783637,19.948872,3815561,A,0,0.385154,19.83291,18.381351,66.065031,219.628311,48.08085,18.934376,18.009171,20.040001,0.0,"[[0.0006651334289736016, 0.0007629606469922543...",tic A AAL AAP ...
0,2010-01-04,4.496878,4.657144,4.393176,4.56287,9837300,AAL,0,0.264201,4.990097,3.97066,56.734717,56.08739,11.644271,4.087727,3.736084,20.040001,0.0,"[[0.0006651334289736016, 0.0007629606469922543...",tic A AAL AAP ...
0,2010-01-04,34.948505,35.519729,34.931195,35.225462,1701700,AAP,0,0.260571,36.124009,34.439074,51.067383,19.990991,0.376685,34.985769,34.059243,20.040001,0.0,"[[0.0006651334289736016, 0.0007629606469922543...",tic A AAL AAP ...
0,2010-01-04,6.424605,6.439315,6.375673,6.407194,493729600,AAPL,0,0.11787,6.487939,5.5241,62.133201,168.826345,33.760767,6.010477,5.957822,20.040001,0.0,"[[0.0006651334289736016, 0.0007629606469922543...",tic A AAL AAP ...
0,2010-01-04,18.414785,18.4486,18.232193,18.32349,10829095,ABT,0,0.112348,18.481256,17.995301,59.54057,72.99546,13.002825,18.246506,17.839046,20.040001,0.0,"[[0.0006651334289736016, 0.0007629606469922543...",tic A AAL AAP ...


In [3]:
train.tic.unique(), INDICATORS

(array(['A', 'AAL', 'AAP', 'AAPL', 'ABT', 'ACN', 'ADBE', 'ADI', 'ADM',
        'ADP', 'ADSK', 'AEE', 'AEP', 'AES', 'AFL', 'AIG', 'AIV', 'AIZ',
        'AJG', 'AKAM', 'ALB', 'ALGN', 'ALK', 'ALL', 'AMAT', 'AMD', 'AME',
        'AMG', 'AMGN', 'AMP', 'AMT', 'AMZN', 'AON', 'AOS', 'APA', 'APD',
        'APH', 'ARE', 'ATO', 'AVB', 'AVY', 'AWK', 'AXP', 'AZO', 'BA',
        'BAC', 'BAX', 'BBT', 'BBY', 'BDX', 'BEN', 'BIIB', 'BK', 'BKNG',
        'BLK', 'BMY', 'BR', 'BSX', 'BWA', 'BXP', 'C', 'CAG', 'CAH', 'CAT',
        'CB', 'CBRE', 'CCI', 'CCL', 'CDNS', 'CE', 'CF', 'CHD', 'CHRW',
        'CI', 'CINF', 'CL', 'CLX', 'CMA', 'CMCSA', 'CME', 'CMG', 'CMI',
        'CMS', 'CNC', 'CNP', 'COF', 'COO', 'COP', 'COST', 'CPB', 'CPRT',
        'CRM', 'CSCO', 'CSX', 'CTAS', 'CTSH', 'CVS', 'CVX', 'D', 'DAL',
        'DD', 'DE', 'DGX', 'DHI', 'DHR', 'DIS', 'DLR', 'DLTR', 'DOV',
        'DRI', 'DTE', 'DUK', 'DVA', 'DVN', 'DXC', 'EA', 'EBAY', 'ECL',
        'ED', 'EFX', 'EIX', 'EL', 'EMN', 'EMR', 'EOG', 'EQIX', '

In [4]:
stock_dimension = len(train.tic.unique())
state_space = 1 + 2*stock_dimension + len(INDICATORS)*stock_dimension
print(f'Stock Dimension: {stock_dimension}', f'State Space: {state_space}')

Stock Dimension: 394 State Space: 3941


### Stock Universe

The model trades 29 stocks from Dow Jones Industrial Average

* **Stocks:** AAPL, AMGN, AXP, BA, CAT, CSCO, CVX, DIS, GS, HD, HON, IBM, INTC, JNJ, JPM, KO, MCD, MMM, MRK, MSFT, NKE, PG, TRV, UNH, V, VZ, WBA, WMT

### Technical Indicators

The environment uses 8 technical indicators for each stock:

1. **MACD** - Moving Average Convergence Divergence
2. **Bollinger Upper Band**
3. **Bollinger Lower Band**
4. **RSI (30-period)** - Relative Strength Index
5. **CCI (30-period)** - Commodity Channel Index
6. **DX (30-period)** - Directional Movement Index
7. **Close 30-day SMA** - Simple Moving Average
8. **Close 60-day SMA** - Simple Moving Average

### State Space Composition

The state space has 291 dimensions calculated as follows:

$$\text{State Space}=\text{Cash Balance}+2\cdot\text{Stock Dimensions}+\text{Indicators}\cdot\text{Stock Dimensions}$$

### Action Space

* 29-dimension continouus action space
* Each action represents the number of shares buy/sell for each stock
* Actions are bounded by `hmax` (100 shares maximum per trade)

### Trading Constraints

* **Transaction Costs:** 0.5\% for both buying and selling (training)
* **Position Limits:** Maximum 100 shares per stock per trade
* **Initial Capital:** \$1,000,000
* **Reward Scaling:** $e^{-4}$

### Data Structure
* **Training Period**: Historical data with 3,396 trading days
* **Total Data Points**: 98,513 observations ($29\times 3,396$ days)
* **Features**: OHLCV data + technical indicators + VIX + turbulence index

This enviroment simulates realistic stock trading with transaction cost, poistion limits, and uses comprehensive technical analysis indicators to inform trading decisions. The model learns to optimize portfolio allocation across the 29 stocks to maximum returns while managing risk.

In [5]:
from finrl.meta.env_stock_trading.env_stocktrading import StockTradingEnv

buy_cost_list = sell_cost_list = [0.005] * stock_dimension
num_stock_shares = [0] * stock_dimension

env_kwargs = {
    'hmax':100,
    'initial_amount': 1000000,
    'num_stock_shares': num_stock_shares,
    'buy_cost_pct': buy_cost_list,
    'sell_cost_pct': sell_cost_list,
    'state_space': state_space,
    'stock_dim': stock_dimension,
    'tech_indicator_list': INDICATORS,
    'action_space': stock_dimension,
    'reward_scaling': 1e-4
}

e_train_gym = StockTradingEnv(df=train, **env_kwargs)
env_train, _ = e_train_gym.get_sb_env()

In [6]:
len(e_train_gym.df.index.unique()) - 1

3144

In [7]:
e_train_gym.df.tic.count()

np.int64(1239130)

In [None]:

agent = DRLAgent(env = env_train)
model_ppo = agent.get_model('ppo')

tmp_path = RESULTS_DIR + '/ppo'
new_logger_ppo = configure(tmp_path, ['stdout', 'csv', 'tensorboard'])

model_ppo.set_logger(new_logger_ppo)

trained_ppo = agent.train_model(model=model_ppo,
                                tb_log_name='ppo',
                                total_timesteps=50000)

{'n_steps': 2048, 'ent_coef': 0.01, 'learning_rate': 0.00025, 'batch_size': 64}
Using cuda device
Logging to results/ppo




------------------------------------
| time/              |             |
|    fps             | 14          |
|    iterations      | 1           |
|    time_elapsed    | 140         |
|    total_timesteps | 2048        |
| train/             |             |
|    reward          | -0.01169762 |
------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 14          |
|    iterations           | 2           |
|    time_elapsed         | 281         |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.26951742  |
|    clip_fraction        | 0.702       |
|    clip_range           | 0.2         |
|    entropy_loss         | -561        |
|    explained_variance   | -0.0559     |
|    learning_rate        | 0.00025     |
|    loss                 | -4.93       |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.025   

In [13]:
trained_ppo.save(TRAINED_MODEL_DIR + '/agent_ppo')

In [11]:
model = PPO.load('trained_models/agent_ppo.zip')



Here we are training data for the Decision Transformer (DT) model by running pre-trained PPO (Proximal Policy Optimization) reinformcement learning model through a stock trading enviroment.

### Main Loop (Lines 16-37)

The loop runs through the trading environment step-by-step. This creates offline trajectories with

* **States**: Market observations (291-dimensional state spaces)
* **Actions**: PPO's trading decisions (29-dimensional action space)
* **Rewards**: Trading performance
* **Dones**: Episode terminiation flags



In [30]:
import numpy as np

"""make a prediction and get results"""
env_train, obs = e_train_gym.get_sb_env()

ds = []
states = []
feature = {}

s, a, r, d = [], [], [], []

env_train.reset()
# max_steps = len(e_train_gym.df.index.unique()) - 1
# max_steps = e_train_gym.df.tic.count() - 1
max_steps = 500000

for i in range(1, max_steps, 1):
    if i % 10000 == 0:
        print(i)

    action, _states = model.predict(obs, deterministic=True)
    s.extend(obs)
    a.extend(action)

    obs, rewards, dones, info = env_train.step(action)
    r.extend(rewards)
    d.append(dones[0])

    states.extend(obs)

    if (i % 100 == 0):
        
        feature['observations'] = s
        feature['actions'] = a
        feature['rewards'] = r
        feature['dones'] = d
        
        ds.append(feature)
        feature = {}
        s, a, r, d = [], [], [], []

10000
20000
day: 3144, episode: 50
begin_total_asset: 1000000.00
end_total_asset: 5691979.77
total_reward: 4691979.77
total_cost: 12681.97
total_trades: 600226
Sharpe: 0.837
30000
40000
50000
day: 3144, episode: 60
begin_total_asset: 1000000.00
end_total_asset: 5691979.77
total_reward: 4691979.77
total_cost: 12681.97
total_trades: 600226
Sharpe: 0.837
60000
70000
80000
day: 3144, episode: 70
begin_total_asset: 1000000.00
end_total_asset: 5691979.77
total_reward: 4691979.77
total_cost: 12681.97
total_trades: 600226
Sharpe: 0.837
90000
100000
110000
day: 3144, episode: 80
begin_total_asset: 1000000.00
end_total_asset: 5691979.77
total_reward: 4691979.77
total_cost: 12681.97
total_trades: 600226
Sharpe: 0.837
120000
130000
140000
day: 3144, episode: 90
begin_total_asset: 1000000.00
end_total_asset: 5691979.77
total_reward: 4691979.77
total_cost: 12681.97
total_trades: 600226
Sharpe: 0.837
150000
160000
170000
day: 3144, episode: 100
begin_total_asset: 1000000.00
end_total_asset: 5691979.7

### State Staistics

This calculates the mean and standard deviation of all collected states, whcih will be used for normalization in the Decision Transformer training.

The purpose is to prepare data for limitation learning. It's collecting expert demonstrations from a trained RL agent (PPO) to train a Deceision Transformer mdoel. The DT will learn to replicate the PPO agent's behavior by observing the state-action-reward sequences.

The data structure `ds` contains batches of experiened tuples (observations, actions, rewards, dones) that will be used to train the Decision Transformer to make similiar trading decisions as the PPO model.


In [31]:
states = np.vstack(states)
state_mean, state_std = np.mean(states, axis=0), np.std(states, axis=0) + 1e-6

In [32]:
state_mean[:5], state_std[:5], state_mean.shape

(array([4067.2368  ,   56.01334 ,   25.70047 ,  112.767555,   44.781887],
       dtype=float32),
 array([5.2179863e+04, 3.6167255e+01, 1.4407150e+01, 4.4108833e+01,
        4.3545811e+01], dtype=float32),
 (3941,))

In [33]:
state_space

3941

In [37]:
len_ds

4999

In [36]:
len_ds = len(ds)

state_mean = np.pad(state_mean, (0, (len_ds-state_space)))
state_std = np.pad(state_std, (0, (len_ds-state_space)))

In [38]:
state_mean, len(state_mean)

(array([4067.2368 ,   56.01334,   25.70047, ...,    0.     ,    0.     ,
           0.     ], shape=(4999,), dtype=float32),
 4999)

In [39]:
len(ds), len(ds[0])

(4999, 4)

In [40]:
feature = ds[0]
len(feature['rewards'])

100

In [41]:
input_data = {}
input_data['train'] = ds
input_data['state_mean'] = state_mean
input_data['state_std'] = state_std

In [None]:
input_data.keys()


dict_keys(['train', 'state_mean', 'state_std'])

In [49]:
import pickle

with open("input_data.pkl", "wb") as f:
    pickle.dump(input_data, f)


In [1]:
import pickle

with open("input_data.pkl", "rb") as f:
    input_data = pickle.load(f)


In [2]:
from datasets import Dataset

dataset = Dataset.from_dict(input_data)

In [3]:
dataset.save_to_disk("data/dataset/")

Saving the dataset (0/18 shards):   0%|          | 0/4999 [00:00<?, ? examples/s]

In [4]:
from datasets import load_from_disk

dataset = load_from_disk("data/dataset/")

Loading dataset from disk:   0%|          | 0/18 [00:00<?, ?it/s]