In [7]:
import os
import pandas as pd
import gymnasium as gym
import pickle
import numpy as np

from finrl.main import check_and_make_directories
from finrl.main import INDICATORS, TRAINED_MODEL_DIR, RESULTS_DIR

import pathlib as Path
from stable_baselines3 import PPO
from finrl.agents.stablebaselines3.models import DRLAgent
from stable_baselines3.common.logger import configure
from datasets import Dataset, DatasetDict, Features, Sequence, Value, load_from_disk

# Prepare the trained-model directory named by finrl's TRAINED_MODEL_DIR.
# NOTE(review): presumably creates the directory when it is missing -- confirm
# against finrl.main.check_and_make_directories.
check_and_make_directories([TRAINED_MODEL_DIR])

In [5]:
def _load_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it."""
    # SECURITY: pickle.load can execute arbitrary code; only use on trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Training and trading splits, stored as separate pickle files.
train = _load_pickle('data/train_sp500.pickle')
trade = _load_pickle('data/trade_sp500.pickle')

# Preview the first rows of the training split.
train.head()

Unnamed: 0,date,close,high,low,open,volume,tic,day,gdp_log,macd,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma,turbulence,daily_return
0,2009-01-02,10.320794,10.358926,9.856867,9.914063,4236220,A,4,9.577127,0.0,10.989866,10.001251,100.0,66.666667,100.0,10.320794,10.320794,0.0,-0.233625
0,2009-01-02,7.909602,7.994448,7.230828,7.287392,5167000,AAL,4,9.577127,0.0,10.989866,10.001251,100.0,66.666667,100.0,7.909602,7.909602,0.0,-0.233625
0,2009-01-02,29.373985,29.511649,28.45336,29.133075,795900,AAP,4,9.577127,0.0,10.989866,10.001251,100.0,66.666667,100.0,29.373985,29.373985,0.0,2.713712
0,2009-01-02,2.724325,2.733032,2.556513,2.578127,746015200,AAPL,4,9.577127,0.0,10.989866,10.001251,100.0,66.666667,100.0,2.724325,2.724325,0.0,-0.907254
0,2009-01-02,17.518652,17.613507,17.129422,17.50884,13163193,ABT,4,9.577127,0.0,10.989866,10.001251,100.0,66.666667,100.0,17.518652,17.518652,0.0,5.430456


In [6]:
# Number of distinct tickers in the training frame.
stock_dimension = len(train['tic'].unique())

# One scalar slot plus (2 + number of indicators) slots for every ticker.
# NOTE(review): presumably cash balance + price + holdings + indicators, as in
# FinRL's StockTradingEnv state layout -- confirm against the environment.
state_space = 1 + stock_dimension * (2 + len(INDICATORS))

print(f'Stock Dimension: {stock_dimension}, state space: {state_space}')

Stock Dimension: 393, state space: 3931


In [None]:
from finrl.meta.env_stock_trading.env_stocktrading import StockTradingEnv

# Per-ticker values passed to the environment as `buy_cost_pct` / `sell_cost_pct`.
# Built as two independent lists: the previous chained assignment made both
# names refer to the *same* list object, so any in-place change to one would
# silently change the other as well.
buy_cost_list = [0.005] * stock_dimension
sell_cost_list = [0.005] * stock_dimension

# Initial holdings: zero for every ticker.
num_stock_shares = [0] * stock_dimension

# Keyword arguments forwarded unchanged to StockTradingEnv.
env_kwargs = {
    'hmax': 100,
    'initial_amount': 1000000,
    'num_stock_shares': num_stock_shares,
    'buy_cost_pct': buy_cost_list,
    'sell_cost_pct': sell_cost_list,
    'state_space': state_space,
    'stock_dim': stock_dimension,
    'tech_indicator_list': INDICATORS,
    'action_space': stock_dimension,
    'reward_scaling': 1e-4
}

e_train_gym = StockTradingEnv(df=train, **env_kwargs)

# get_sb_env() returns a pair; only the first element (the environment used by
# the rollout cells) is kept, the second is discarded.
env_train, _ = e_train_gym.get_sb_env()


In [None]:
# Location of the saved PPO agent that is restored for rollout generation.
ppo_checkpoint_path = 'trained_models/agent_ppo.zip'
model = PPO.load(ppo_checkpoint_path)

In [None]:
# Total number of environment steps to roll out and record.
max_steps = 500_000


class RolloutGenerator:
    """Roll a policy through an environment and emit one record per step.

    An instance is a zero-argument callable returning a generator of dicts with
    the keys ``'observations'``, ``'actions'``, ``'rewards'`` and ``'dones'``.
    While records are produced, a running mean and variance (Welford's
    algorithm) of the *emitted* observations is maintained; read the result
    with :meth:`finalize_stats` once the generator has been consumed.

    Args:
        env: Object exposing ``reset()`` and ``step(action)``; ``step`` must
            return ``(next_obs, rewards, dones, info)``.
        policy_model: Object exposing ``predict(obs, deterministic=True)``
            returning ``(action, state)``.
        state_dim: Length of one observation vector; sizes the statistics
            accumulators.
        action_dim: Length of one action vector (stored, not otherwise used).
        total_steps: Number of environment steps, and therefore records, to
            produce.
        progress_interval: Print the step counter every this many steps.
            ``0`` or ``None`` disables progress printing.
    """

    def __init__(self, env, policy_model, state_dim, action_dim, total_steps, progress_interval=100000):
        self.env = env
        self.policy_model = policy_model
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.total_steps = total_steps
        self.progress_interval = progress_interval
        self._has_run = False
        # Welford accumulators, kept in float64 for numerical stability.
        self._count = 0
        self._mean = np.zeros(self.state_dim, dtype=np.float64)
        self._m2 = np.zeros(self.state_dim, dtype=np.float64)

    def __call__(self):
        """Return the record generator.

        Raises:
            RuntimeError: If called more than once. The environment and the
                statistics accumulators are stateful, so a second pass would
                mix two rollouts.
        """
        if self._has_run:
            raise RuntimeError("RolloutGenerator can only be iterated once")
        self._has_run = True
        return self._generator()

    @staticmethod
    def _first_env(batch):
        """Return ``batch[0]`` when ``batch`` has more than one dimension, else ``batch``."""
        return batch[0] if getattr(batch, "ndim", 0) > 1 else batch

    def _generator(self):
        """Step the environment ``total_steps`` times, yielding one record per step."""
        obs = self.env.reset()
        for step in range(1, self.total_steps + 1):
            # A falsy interval (0 / None) disables progress output instead of
            # raising ZeroDivisionError on the modulo.
            if self.progress_interval and step % self.progress_interval == 0:
                print(step)

            # Copy the observation the policy acts on *before* stepping, so the
            # record cannot be affected if the env reuses its output buffer.
            current_obs = np.array(self._first_env(obs), dtype=np.float32)
            action, _states = self.policy_model.predict(obs, deterministic=True)
            next_obs, rewards, dones, _info = self.env.step(action)

            action_values = self._first_env(action)
            reward_value = float(rewards[0]) if np.ndim(rewards) else float(rewards)
            done_flag = bool(dones[0]) if np.ndim(dones) else bool(dones)

            # Accumulate statistics over exactly the observations that are
            # emitted. Feeding `next_obs` instead would skip the first recorded
            # observation and count a final one that is never written out.
            self._update_stats(current_obs)

            yield {
                'observations': current_obs.tolist(),
                'actions': np.asarray(action_values, dtype=np.float32).flatten().tolist(),
                'rewards': reward_value,
                'dones': done_flag
            }

            obs = next_obs

    def _update_stats(self, new_observation):
        """Fold one observation vector into the running mean / M2 accumulators."""
        self._count += 1
        delta = new_observation - self._mean
        self._mean += delta / self._count
        # Second delta uses the *updated* mean, as Welford's update requires.
        delta2 = new_observation - self._mean
        self._m2 += delta * delta2

    def finalize_stats(self):
        """Return ``(state_mean, state_std)`` as float32 arrays.

        The variance is the sample variance (divisor ``count - 1``); with fewer
        than two observations it is taken as zero. ``1e-6`` is added under the
        square root so the returned std is never exactly zero.
        """
        if self._count < 2:
            variance = np.zeros_like(self._mean)
        else:
            variance = self._m2 / (self._count - 1)

        state_mean = self._mean.astype(np.float32)
        state_std = np.sqrt(variance + 1e-6).astype(np.float32)
        return state_mean, state_std



In [None]:
# The import cell binds the name `Path` to the pathlib *module*
# (`import pathlib as Path`), so calling `Path("...")` raises
# "TypeError: 'module' object is not callable". Import the class itself here.
from pathlib import Path

rollout_generator = RolloutGenerator(
    env=env_train,
    policy_model=model,
    state_dim=state_space,
    action_dim=stock_dimension,
    total_steps=max_steps
)

# Fixed-length schema for one recorded step.
features = Features(
    {
        'observations': Sequence(Value('float32'), length=state_space),
        'actions': Sequence(Value('float32'), length=stock_dimension),
        'rewards': Value('float32'),
        'dones': Value('bool')
    }
)

# The RolloutGenerator instance is itself the zero-argument callable that
# from_generator invokes to obtain the record stream.
# NOTE(review): if `Dataset.from_generator` serves a previously cached result,
# the generator is never run and the statistics below stay at their initial
# values -- confirm the caching behaviour for this generator object.
dataset = Dataset.from_generator(
    rollout_generator,
    features=features,
    writer_batch_size=2048
)
assert len(dataset) == max_steps, "rollout produced an unexpected number of records"

# Must come after the dataset is built: the statistics are accumulated while
# the generator is being consumed.
state_mean, state_std = rollout_generator.finalize_stats()

dataset_dict = DatasetDict({'train': dataset})

dataset_dir = Path("data/dataset")
dataset_dir.mkdir(parents=True, exist_ok=True)

dataset_dict.save_to_disk(str(dataset_dir))

# Observation statistics are stored alongside the dataset files.
np.savez(dataset_dir / "state_stats.npz", mean=state_mean, std=state_std)

# Reload from disk; note `dataset` now refers to the saved DatasetDict rather
# than the single split built above.
dataset = load_from_disk(str(dataset_dir))