# Train an Agent using Generative Adversarial Imitation Learning

The idea of generative adversarial imitation learning is to train a discriminator network to distinguish between expert trajectories and learner trajectories.
The learner is trained using a traditional reinforcement learning algorithm such as PPO and is rewarded for trajectories that make the discriminator think that it was an expert trajectory.

### Load data

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import datetime
import math
import json

import onnx
from onnx import load
import onnxruntime
from onnxruntime.training import artifacts

from imitation.algorithms.adversarial.gail import GAIL
from imitation.rewards.reward_nets import BasicRewardNet
from imitation.util.networks import RunningNorm
from imitation.data.types import TrajectoryWithRew

from stable_baselines3.common.logger import configure
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.policies import BasePolicy
from typing import Tuple

import torch as th
import torch.nn as nn
import torch.nn.functional as F

%matplotlib inline
# from finrl.config_tickers import DOW_30_TICKER
from finrl.meta.preprocessor.yahoodownloader import YahooDownloader
from finrl.meta.preprocessor.preprocessors import data_split
from finrl.agents.stablebaselines3.models import DRLAgent
from finrl.plot import backtest_stats, backtest_plot, get_daily_return, get_baseline

from pprint import pprint

import itertools

2024-04-17 15:12:58,785 matplotlib.pyplot [DEBUG] - Loaded backend module://matplotlib_inline.backend_inline version unknown.
2024-04-17 15:12:58,789 matplotlib.pyplot [DEBUG] - Loaded backend module://matplotlib_inline.backend_inline version unknown.
2024-04-17 15:12:58,797 matplotlib.pyplot [DEBUG] - Loaded backend agg version v2.2.


In [3]:
import os
from finrl.main import check_and_make_directories
from finrl.config import (
    DATA_SAVE_DIR,
    TRAINED_MODEL_DIR,
    TENSORBOARD_LOG_DIR,
    RESULTS_DIR,
    INDICATORS,
    TRAIN_START_DATE,
    TRAIN_END_DATE,
    TEST_START_DATE,
    TEST_END_DATE,
    TRADE_START_DATE,
    TRADE_END_DATE,
)

check_and_make_directories([DATA_SAVE_DIR, TRAINED_MODEL_DIR, TENSORBOARD_LOG_DIR, RESULTS_DIR])

In [4]:
from datetime import datetime
TEST_END_DATE = datetime.today().strftime('%Y-%m-%d')

TRAIN_START_DATE = '2000-01-01'
TRAIN_END_DATE = '2021-01-01'
TEST_START_DATE = '2021-01-01'
TEST_END_DATE = datetime.today().strftime('%Y-%m-%d')

# Load price data from csv file
# tic_dir = './' + DATA_SAVE_DIR + '/sp500_price_daily.csv'
# df = pd.read_csv(tic_dir,index_col=0)

In [5]:
processed_full = pd.read_csv('./' + DATA_SAVE_DIR + '/dow30_ready_with_filter_data_daily.csv',index_col=0)
processed_full['date'] = pd.to_datetime(processed_full.date,format='mixed')
processed_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99956 entries, 0 to 99955
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   date                   99956 non-null  datetime64[ns]
 1   tic                    99956 non-null  object        
 2   close                  99956 non-null  float64       
 3   gross_profit_margin    99956 non-null  float64       
 4   sga_ratio              99956 non-null  float64       
 5   dep_ratio              99956 non-null  float64       
 6   ebit_on_int            99956 non-null  float64       
 7   profit_margin          99956 non-null  float64       
 8   count_positive_profit  99956 non-null  float64       
 9   csti_on_liabilities    99956 non-null  float64       
 10  inventory_on_ebit      99956 non-null  float64       
 11  receivable_on_rev      99956 non-null  float64       
 12  roa                    99956 non-null  float64       
 13  roe   

In [6]:
train_data = data_split(processed_full, TRAIN_START_DATE, TRAIN_END_DATE)
test_data = data_split(processed_full, TEST_START_DATE, TEST_END_DATE)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)
# Check the length of the two datasets
print(len(train_data))
print(len(test_data))

75776
24180


### Set up environment

In [7]:
import gymnasium as gym
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

from gymnasium import spaces
from gymnasium.utils import seeding
from stable_baselines3.common.vec_env import DummyVecEnv
from portfolio import portfolio

matplotlib.use("Agg")

# from stable_baselines3.common import logger


class StockTradingEnv(gym.Env):
    """A stock trading environment for OpenAI gym"""

    metadata = {"render.modes": ["human"]}

    def __init__(
        self,
        df,
        hmax,
        initial_amount,
        buy_cost_pct,
        sell_cost_pct,
        reward_scaling,
        state_space,
        action_space,
        tech_indicator_list,
        stop_loss,
        hold_period,
        make_plots=False,
        print_verbosity=10,
        row=0,
        initial=True,
        previous_state=[],
        model_name="",
        mode="",
        iteration="",
    ):
        # self.row = row
        self.df = df
        # self.stock_dim = stock_dim
        self.hmax = hmax
        self.reward_scaling = reward_scaling
        self.state_space = state_space
        self.action_space = action_space
        self.tech_indicator_list = tech_indicator_list        
        self.initial_amount = initial_amount
        self.hold_period = hold_period
        self.buy_cost_pct = buy_cost_pct
        self.sell_cost_pct = sell_cost_pct
        self.stop_loss = stop_loss # the game stops when the asset loses more than stop_loss percent
        self.action_space = spaces.Box(low=-1, high=1, shape=(self.action_space,))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.state_space,))
        self.terminal = False
        self.make_plots = make_plots
        self.print_verbosity = print_verbosity
        # self.turbulence_threshold = turbulence_threshold
        # self.risk_indicator_col = risk_indicator_col
        self.initial = initial
        self.previous_state = previous_state
        self.model_name = model_name
        self.mode = mode
        self.iteration = iteration
        self.tic_list = self.df.tic.unique()
        self.original_df = self.df.copy()
        self.row = 0
        
        # initalize state
        self.state = self._initiate_state()

        # initialize reward
        self.reward = 0
        self.turbulence = 0
        self.cost = 0
        self.trades = 0
        self.episode = 0
        
        # memorize all the total balance change
        self.asset_memory = [self.initial_amount]
        self.rewards_memory = []
        self.actions_memory = []
        self.date_memory = [self._get_date()]
        # self.reset()
        self._seed()

    def _buy_stock(self, action):
        def _do_buy():
            if self.data.close > 0: # Buy only if the price is > 0 (no missing data in this particular date)
                buy_num_shares, buy_fee = self.portfolio.add_buy_stock(self.data.tic,self.data.close,action)
                # print(f'Buy amount: {buy_num_shares}')
                self.cost += buy_fee
                if buy_num_shares == 0:
                    self.reward = -5 * self.initial_amount * self.punishment_rate * self.reward_scaling
            else:
                buy_num_shares = 0

            return buy_num_shares

        buy_num_shares = _do_buy()
        return buy_num_shares
    
    def _sell_stock(self, action):
        def _do_sell_normal():
            if self.data.close > 0: # Sell only if the price is > 0 (no missing data in this particular date)
                sell_amount,surplus,sell_fee = self.portfolio.minus_sell_stock(self.data.tic,self.data.close,action)
                self.cost += sell_fee
                # print(f'Sell amount: {sell_num_shares}')
                if sell_amount == 0:
                    self.reward = -5 * self.initial_amount * self.punishment_rate * self.reward_scaling
                else:
                    self.reward += (surplus - sell_amount*self.data.close*self.sell_cost_pct) * self.reward_scaling
                    self.win_trade += 1 if surplus > 0 else 0
                    self.trades += 1
            else:
                sell_amount = 0

            return sell_amount

        sell_amount = _do_sell_normal()
        return sell_amount

    def step(self, actions):

        self.terminal = (self.row >= len(self.df.index.unique()) - 1) | (self.portfolio.get_asset_value() < self.initial_amount*(1-self.stop_loss))
        # print(f'Step {self.row}, action: {actions}, current asset: {current_total_asset}, stop loss: {self.initial_amount*(1-self.stop_loss)}, Trade: {self.trades}')

        # Reset reward to zero
        self.reward = 0
        
        # --> IN CASE THE STEP IS A TERMINATED STEP
        if self.terminal: 
            
            # Summary the training performance after an episode
            end_total_asset = self.portfolio.get_asset_value()
            tot_reward = end_total_asset - self.initial_amount * (self.df.iloc[-1].close / self.df.iloc[0].close) # compare with buy-and-hold strategy
            # tot_reward = end_total_asset - self.initial_amount # compare with initial capital

            # Show at each episode
            print(f"Episode: {self.episode}, com: {self.df.iloc[0].tic}, win trade: {self.win_trade}/{self.trades}, Total reward: {self.accumulated_reward}")

            # Print out training results after a certain amount of episodes
            if self.episode % self.print_verbosity == 0:
                print(f"Current company: {self.df.iloc[0].tic}")
                print(f"begin_total_asset: {self.asset_memory[0]:0.2f}")
                print(f"end_total_asset: {end_total_asset:0.2f}")
                print(f"surplus from buy-and-hold: {tot_reward:0.2f}")
                print(f"total_cost: {self.cost:0.2f}")
                print(f"total_trades: {self.trades}")
                # if df_total_value["daily_return"].std() != 0:
                #     print(f"Sharpe: {sharpe:0.3f}")
                print("=================================")
            
            truncated = False  # we do not limit the number of steps here
            # Optionally we can pass additional info, we are not using that for now
            info = {}


            return (
                np.array(self.state).astype(np.float32),
                self.reward,
                self.terminal,
                truncated,
                info,
            )

        # --> IN A NORMAL STEP
        else: 

            # Act according to actions
            action = actions[0]
                
            if action > 0:
                self._buy_stock(action)
            elif action < 0:
                self._sell_stock(action)

            self.current_actions = actions
            self.actions_memory.append(actions)

            # Set a punishment at each step to push the agent decide an action
            self.reward += -1 * self.initial_amount * self.punishment_rate * self.reward_scaling
            self.accumulated_reward += self.reward

            # Update selected row in the dataset based on state: s -> s+1
            self.row += 1
            self.data = self.df.loc[self.row]
            self.state = self._update_state()

            end_total_asset = self.portfolio.get_asset_value()

            # Update asset memory
            self.current_asset = end_total_asset
            self.asset_memory.append(end_total_asset)
            self.date_memory.append(self._get_date())
            
            self.rewards_memory.append(self.reward)

        truncated = False  # we do not limit the number of steps here
        # Optionally we can pass additional info, we are not using that for now
        info = {}
        
        # return self.state, self.reward, self.terminal, {}
    
        return (
            np.array(self.state).astype(np.float32),
            self.reward,
            self.terminal,
            truncated,
            info,
        )

    def reset(self, seed=None, options=None):
        # initiate state
        self.state = self._initiate_state()

        # Reset asset_memory
        self.asset_memory = [self.initial_amount]

        # Reset support variables
        self.cost = 0
        self.trades = 0
        self.win_trade = 0
        self.terminal = False
        self.accumulated_reward = 0
        self.block_remain = 0
        self.rewards_memory = []
        self.actions_memory = []
        self.date_memory = [self._get_date()]
        self.episode += 1

        return np.array(self.state).astype(np.float32), {}

    def render(self, mode="human", close=False):
        return self.state

    def _initiate_state(self):
        
        # Reset portfolio & previous_portfolio
        self.portfolio = portfolio(initial_amount=self.initial_amount,hold_period=self.hold_period,
                                   buy_cost_pct=self.buy_cost_pct,sell_cost_pct=self.sell_cost_pct)

        # Select a random ticker from df
        self.df = self.original_df[self.original_df.tic == random.choice(self.tic_list)].reset_index(drop=True)
        self.punishment_rate = 1/(len(self.df)*10)
        
        # Reset data
        self.row = 0
        self.data = self.df.loc[self.row]
        
         # Reset state
        state = ([self.portfolio.get_remain_capital()] + [self.data.close] 
                    + [self.portfolio.get_stock_weight(self.data.tic)] 
                    +[self.portfolio.get_stock_profit(self.data.tic)]
                    + sum([[self.data[tech]] for tech in self.tech_indicator_list], []))
        
        return state

    def _update_state(self):

        self.portfolio.update_new_state(self.data.tic,self.data.close)
        state = ([self.portfolio.get_remain_capital()] + [self.data.close] 
                    + [self.portfolio.get_stock_amount(self.data.tic)]
                    +[self.portfolio.get_stock_profit(self.data.tic)]
                    + sum([[self.data[tech]] for tech in self.tech_indicator_list], []))

        return state

    def _get_date(self):
        return self.row

    def save_asset_memory(self):
        date_list = self.date_memory
        asset_list = self.asset_memory
        df_account_value = pd.DataFrame({"date": date_list, "account_value": asset_list})
        return df_account_value

    def save_action_memory(self):
        
        date_list = self.date_memory[:-1]
        action_list = self.actions_memory
        df_actions = pd.DataFrame({"date": date_list, "actions": action_list})
        return df_actions

    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def get_sb_env(self):
        e = DummyVecEnv([lambda: self])
        obs = e.reset()
        return e, obs

Full feature stone

In [17]:
# features = [ 'gross_profit_margin', 'sga_ratio', 'ebit_on_int', 'profit_margin', 'count_positive_profit',
#        'csti_on_liabilities', 'roa', 'roe', 'liabilities_on_equity', 'debt_on_min_ebit',
#        'capital_cost_on_ebit', 'eps_on_mp', 'dividend_on_mp', 'mp_on_bv',
#        'trend_gross_margin', 'trend_profit_margin', 'fluc_sga', 'fluc_dep_ratio', 'fluc_inv_on_ebit', 'fluc_rec_on_rev']
# # ratio_list = train_data.columns.drop(['date','tic','close'])

# action_dimension = 1 # k float in range (-1,1) to decide sell (k<0) or buy (k>0) decisions
# state_space = 4 + len(features)
# print(f"Action Dimension: {action_dimension}, State Space: {state_space}")

Action Dimension: 1, State Space: 24


Basic stone with 4 features

In [8]:
features = [ 'gross_profit_margin', 'sga_ratio', 'ebit_on_int', 'profit_margin'] # just select random 4 features to set input's sharp
action_dimension = 1 # k float in range (-1,1) to decide sell (k<0) or buy (k>0) decisions
state_space = 4 + len(features) # 4 components from the portfolio state and 4 model features
print(f"Action Dimension: {action_dimension}, State Space: {state_space}")

Action Dimension: 1, State Space: 8


In [9]:
# Parameters for the environment
env_kwargs = {
    "hmax": 100, 
    "initial_amount": 1000000, 
    "buy_cost_pct": 0.001,
    "sell_cost_pct": 0.001,
    "tech_indicator_list": features, 
    "state_space": state_space, 
    "action_space": action_dimension, 
    "reward_scaling": 1e-4,
    "stop_loss": 0.8,
    "print_verbosity":4,
    "hold_period": 5
}

#Establish the training environment using StockTradingEnv() class
e_train_gym = StockTradingEnv(df = test_data, **env_kwargs)
env_train, _ = e_train_gym.get_sb_env()

### Build blank models

In [10]:
policy_kwargs = dict(net_arch=dict(pi=[64, 32, 16], vf=[64, 32, 16]))
SEED = 42

learner = PPO(
    env=env_train,
    policy=MlpPolicy,
    batch_size=2048,
    ent_coef=0.01,
    learning_rate=0.00025,
    gamma=0.95,
    n_epochs=5,
    clip_range=0.1,
    policy_kwargs=policy_kwargs,
    # seed=SEED,
)

#### Save the model as onnx format and configurate it

In [11]:
class OnnxableSB3Policy(th.nn.Module):
    def __init__(self, policy: BasePolicy):
        super().__init__()
        self.policy = policy

    def forward(self, observation: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
        # NOTE: Preprocessing is included, but postprocessing
        # (clipping/inscaling actions) is not,
        # If needed, you also need to transpose the images so that they are channel first
        # use deterministic=False if you want to export the stochastic policy
        # policy() returns `actions, values, log_prob` for PPO
        return self.policy(observation, deterministic=True)

In [12]:
onnx_policy = OnnxableSB3Policy(learner.policy)
observation_size = learner.observation_space.shape
dummy_input = th.randn(1, *observation_size)

th.onnx.export(
    onnx_policy,
    dummy_input,
    f"./{TRAINED_MODEL_DIR}/basic_stone_{observation_size[0]-4}_features.onnx",
    opset_version=17,
    input_names=["input"],
)

In [13]:
model_onnx = onnx.load(f"./{TRAINED_MODEL_DIR}/basic_stone_{observation_size[0]-4}_features.onnx")

In [14]:
m1 = model_onnx.metadata_props.add()
m1.key = 'feature_amount'
m1.value = json.dumps(observation_size[0]-4)

In [15]:
onnx.save(model_onnx, f"./{TRAINED_MODEL_DIR}/basic_stone_{observation_size[0]-4}_features.onnx")

## Test customizing trajectory

In [16]:
def generate_random_trajectory(number_of_step):

    def generate_random_obs():
        random_row = processed_full.iloc[np.random.randint(len(processed_full))]
        random_port = np.random.uniform(0,env_kwargs["initial_amount"])
        random_amount = np.random.randint(1000)
        random_profit = np.random.rand(1)
        state = ([random_port] + [random_row.close] + [random_amount] + [random_profit[0]]
                    + sum([[random_row[tech]] for tech in features], []))
    
        return state

    # Generate random observations
    random_obs = np.array([generate_random_obs() for i in range(0,number_of_step + 1)])
    
    # Generate random actions
    random_acts = np.array([np.random.rand(1)*2-1 for i in range(0,number_of_step)])
    
    # Generate random rewards
    random_rews = np.random.rand(number_of_step)*10
    
    # And put all these components into the same trajectory
    random_trajectory = TrajectoryWithRew(acts=random_acts, obs=random_obs,rews=random_rews,terminal=True,infos=None)
    return random_trajectory

In [17]:
random_rollouts = []
number_of_step = 10
number_of_episode = 1000
for i in range(0,number_of_episode):
    random_rollouts.append(generate_random_trajectory(number_of_step))

Now we are ready to set up our GAIL trainer.
Note, that the `reward_net` is actually the network of the discriminator.
We evaluate the learner before and after training so we can see if it made any progress.

First we construct a GAIL trainer ...

#### Trying to load PPO model from Onnx model to continue training from the training_model

In [41]:
with open("TestOnnx", "rb") as f:
    content = f.read()
trained_model = onnx.load_model_from_string(content)

In [3]:
# This is the model file you prepared
path_to_forward_only_onnx_model = 'test_model.onnx'

# Load the forward-only ONNX model
model = onnx.load(path_to_forward_only_onnx_model)

# Extract model's parameters
b = [param.name for param in model.graph.initializer]

# We choose to make all layers trainable
trainable_layers = ['fc', 'onnx']
requires_grad = [param for param in all_params if any(layer in param for layer in trainable_layers)]
frozen_params = [param for param in all_params if param not in requires_grad]
print(requires_grad, frozen_params)

# Check the output name of the model
print(model.graph.output)

[] ['policy.mlp_extractor.policy_net.0.weight', 'policy.mlp_extractor.policy_net.0.bias', 'policy.mlp_extractor.policy_net.2.weight', 'policy.mlp_extractor.policy_net.2.bias', 'policy.mlp_extractor.value_net.0.weight', 'policy.mlp_extractor.value_net.0.bias', 'policy.mlp_extractor.value_net.2.weight', 'policy.mlp_extractor.value_net.2.bias', 'policy.action_net.weight', 'policy.action_net.bias', 'policy.value_net.weight', 'policy.value_net.bias']
[name: "56"
type {
  tensor_type {
    elem_type: 7
    shape {
      dim {
        dim_value: 1
      }
    }
  }
}
, name: "24"
type {
  tensor_type {
    elem_type: 1
    shape {
      dim {
        dim_value: 1
      }
      dim {
        dim_value: 1
      }
    }
  }
}
, name: "54"
type {
  tensor_type {
    elem_type: 1
    shape {
      dim {
        dim_value: 1
      }
    }
  }
}
]


In [6]:
# create this directory if it does not exist 
path_to_output_artifact_directory = 'training_artifacts'
artifacts.generate_artifacts(model,
                             requires_grad=requires_grad,
                             artifact_directory=path_to_output_artifact_directory)

2024-04-09 10:31:36,079 root [INFO] - No loss function enum provided. Loss node will not be added to the graph.
2024-04-09 10:31:36,082 root [DEBUG] - Building training block _TrainingBlock
2024-04-09 10:31:36,083 root [DEBUG] - Building block: PassThrough
2024-04-09 10:31:36,088 root [DEBUG] - Building gradient graph for training block _TrainingBlock
2024-04-09 10:31:36,095 root [DEBUG] - The loss output is 56. The gradient graph will be built starting from 56_grad.
2024-04-09 10:31:36.092525 [I:onnxruntime:Default, constant_sharing.cc:256 ApplyImpl] Total shared scalar initializer count: 4
2024-04-09 10:31:36.094131 [I:onnxruntime:Default, graph.cc:3556 CleanUnusedInitializersAndNodeArgs] Removing initializer '/policy/ConstantOfShape_output_0'. It is no longer used by any node.
2024-04-09 10:31:36.094153 [I:onnxruntime:Default, graph.cc:3556 CleanUnusedInitializersAndNodeArgs] Removing initializer '/policy/Shape_output_0'. It is no longer used by any node.
2024-04-09 10:31:36.094576 

In [7]:
from onnxruntime.training.api import CheckpointState, Module, Optimizer

# Assuming the paths are correctly defined
path_to_the_checkpoint_artifact = 'training_artifacts/checkpoint'
path_to_the_training_model = 'training_artifacts/training_model.onnx'
path_to_the_eval_model = 'training_artifacts/eval_model.onnx'
# path_to_the_optimizer_model = 'training_artifacts/optimizer_model.onnx'

# Load the checkpoint state
state = CheckpointState.load_checkpoint(path_to_the_checkpoint_artifact)

# Create the module
module = Module(path_to_the_training_model,
                state,
                path_to_the_eval_model,
                device="cpu")

# optimizer = Optimizer(path_to_the_optimizer_model, module)

In [8]:
module.train(2048)

<onnxruntime.training.api.module.Module at 0x7ff2731b45b0>

In [11]:
# trained_model.graph.initializer
# learner.get_parameters()

#### Start set up GAIL trainier

In [25]:
policy_kwargs = dict(net_arch=dict(pi=[128,64, 32, 16], vf=[128,64, 32, 16]))
SEED = 42

learner = PPO(
    env=env_train,
    policy=MlpPolicy,
    batch_size=2048,
    ent_coef=0.01,
    learning_rate=0.00025,
    gamma=0.95,
    n_epochs=5,
    clip_range=0.1,
    policy_kwargs=policy_kwargs,
    # seed=SEED,
)
# trained_model.env = env_train
# learner = trained_model

reward_net = BasicRewardNet(
    observation_space=env_train.observation_space,
    action_space=env_train.action_space,
    normalize_input_layer=RunningNorm,
)

gail_trainer = GAIL(
    demonstrations=random_rollouts,
    demo_batch_size=64,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=8,
    venv=env_train,
    gen_algo=learner,
    reward_net=reward_net,
)

... then we evaluate it before training ...

In [56]:
env_train.seed(SEED)
learner_rewards_before_training, _ = evaluate_policy(learner, env_train, 1, return_episode_rewards=True)

Step 0, action: [0.00397809], Trade: 0
Step 1, action: [0.00397809], Trade: 0
Step 2, action: [0.00397809], Trade: 0
Step 3, action: [0.00397809], Trade: 0
Step 4, action: [0.00397809], Trade: 0
Step 5, action: [0.00397809], Trade: 0
Step 6, action: [0.00397809], Trade: 0
Step 7, action: [0.00397809], Trade: 0
Step 8, action: [0.00397809], Trade: 0
Step 9, action: [0.00397809], Trade: 0
Step 10, action: [0.00397809], Trade: 0
Step 11, action: [0.00397809], Trade: 0
Step 12, action: [0.00397809], Trade: 0
Step 13, action: [0.00397809], Trade: 0
Step 14, action: [0.00397809], Trade: 0
Step 15, action: [0.00397809], Trade: 0
Step 16, action: [0.00397809], Trade: 0
Step 17, action: [0.00397809], Trade: 0
Step 18, action: [0.00397809], Trade: 0
Step 19, action: [0.00397809], Trade: 0
Step 20, action: [0.00397809], Trade: 0
Step 21, action: [0.00397809], Trade: 0
Step 22, action: [0.00397809], Trade: 0
Step 23, action: [0.00397809], Trade: 0
Step 24, action: [0.00397809], Trade: 0
Step 25, a

... and train it ...

In [59]:
gail_trainer.train(2048)

round:   0%|                                              | 0/1 [00:00<?, ?it/s]

Step 0, action: [-0.00561859], Trade: 0
Step 1, action: [0.12219889], Trade: 0
Step 2, action: [0.3014195], Trade: 0
Step 3, action: [0.5027696], Trade: 0
Step 4, action: [-0.88870114], Trade: 0
Step 5, action: [0.56528723], Trade: 0
Step 6, action: [-1.], Trade: 0
Step 7, action: [-0.08654246], Trade: 1
Step 8, action: [-0.79638976], Trade: 2
Step 9, action: [0.02548163], Trade: 3
Step 10, action: [1.], Trade: 3
Step 11, action: [0.8717339], Trade: 3
Step 12, action: [-1.], Trade: 3
Step 13, action: [0.53921825], Trade: 4
Step 14, action: [-0.7768738], Trade: 4
Step 15, action: [0.3594327], Trade: 5
Step 16, action: [-0.21822572], Trade: 5
Step 17, action: [-1.], Trade: 6
Step 18, action: [-0.00016832], Trade: 7
Step 19, action: [-0.5629056], Trade: 8
Step 20, action: [1.], Trade: 9
Step 21, action: [1.], Trade: 9
Step 22, action: [-0.23682944], Trade: 9
Step 23, action: [-1.], Trade: 10
Step 24, action: [-0.74319124], Trade: 11
Step 25, action: [0.40653297], Trade: 11
Step 26, action

round: 100%|██████████████████████████████████████| 1/1 [00:16<00:00, 16.08s/it]


... and finally evaluate it again.

In [61]:
env_train.seed(SEED)
learner_rewards_after_training, _ = evaluate_policy(
    learner, env_train, 1, return_episode_rewards=True
)

Step 0, action: [-0.00141055], Trade: 0
Step 1, action: [-0.00141055], Trade: 0
Step 2, action: [-0.00141055], Trade: 0
Step 3, action: [-0.00141055], Trade: 0
Step 4, action: [-0.00141055], Trade: 0
Step 5, action: [-0.00141055], Trade: 0
Step 6, action: [-0.00141055], Trade: 0
Step 7, action: [-0.00141055], Trade: 0
Step 8, action: [-0.00141055], Trade: 0
Step 9, action: [-0.00141055], Trade: 0
Step 10, action: [-0.00141055], Trade: 0
Step 11, action: [-0.00141055], Trade: 0
Step 12, action: [-0.00141055], Trade: 0
Step 13, action: [-0.00141055], Trade: 0
Step 14, action: [-0.00141055], Trade: 0
Step 15, action: [-0.00141055], Trade: 0
Step 16, action: [-0.00141055], Trade: 0
Step 17, action: [-0.00141055], Trade: 0
Step 18, action: [-0.00141055], Trade: 0
Step 19, action: [-0.00141055], Trade: 0
Step 20, action: [-0.00141055], Trade: 0
Step 21, action: [-0.00141055], Trade: 0
Step 22, action: [-0.00141055], Trade: 0
Step 23, action: [-0.00141055], Trade: 0
Step 24, action: [-0.00141

We can see that an untrained policy performs poorly, while GAIL matches expert returns (500):

In [62]:
print(
    "Rewards before training:",
    np.mean(learner_rewards_before_training),
    "+/-",
    np.std(learner_rewards_before_training),
)
print(
    "Rewards after training:",
    np.mean(learner_rewards_after_training),
    "+/-",
    np.std(learner_rewards_after_training),
)

Rewards before training: -9.98759315814823 +/- 0.0
Rewards after training: -59.92555744946003 +/- 0.0


In [66]:
def DRL_prediction(model, environment, deterministic=False):
        """make a prediction and get results"""
        # test_env, test_obs = environment.get_sb_env()
        # account_memory = None  # This help avoid unnecessary list creation
        # actions_memory = None  # optimize memory consumption

        test_obs = environment.reset()[0]
        # max_steps = len(environment.df.index.unique()) - 1

        for i in range(0,len(environment.df)):
            action = model.predict(np.asarray(test_obs), deterministic=deterministic)
            test_obs,reward,terminal,truncated,info = environment.step(action[0])

            if terminal:
                print("hit end!")
                break
        return pd.DataFrame(environment.asset_memory, columns=['account_value']), pd.DataFrame(environment.actions_memory)

In [67]:
df_account_value_ppo, df_actions_ppo = DRL_prediction(model=learner, environment = e_train_gym)

Step 0, action: [1.], Trade: 0
Step 1, action: [1.], Trade: 0
Step 2, action: [1.], Trade: 0
Step 3, action: [0.98270357], Trade: 0
Step 4, action: [0.6782824], Trade: 0
Step 5, action: [-1.], Trade: 0
Step 6, action: [-0.02467314], Trade: 1
Step 7, action: [-1.], Trade: 1
Step 8, action: [1.], Trade: 1
Step 9, action: [-0.247977], Trade: 1
Step 10, action: [-0.43289724], Trade: 1
Step 11, action: [0.46846297], Trade: 1
Step 12, action: [0.83208895], Trade: 1
Step 13, action: [-0.8413053], Trade: 1
Step 14, action: [-0.49260035], Trade: 2
Step 15, action: [-0.83476055], Trade: 3
Step 16, action: [1.], Trade: 4
Step 17, action: [-0.91973305], Trade: 4
Step 18, action: [0.47894198], Trade: 5
Step 19, action: [0.6353611], Trade: 5
Step 20, action: [0.29584122], Trade: 5
Step 21, action: [-0.43691295], Trade: 5
Step 22, action: [1.], Trade: 6
Step 23, action: [-0.4579729], Trade: 6
Step 24, action: [1.], Trade: 7
Step 25, action: [-0.8712025], Trade: 7
Step 26, action: [-0.19992101], Trade