In [1]:
from gym.spaces import Discrete
import ta
from tensortrade.env.default.actions import TensorTradeActionScheme

from tensortrade.env.generic import ActionScheme, TradingEnv
from tensortrade.core import Clock
from tensortrade.oms.instruments import ExchangePair
from tensortrade.oms.wallets import Portfolio
from tensortrade.oms.orders import (
    Order,
    proportion_order,
    TradeSide,
    TradeType
)


class BSH(TensorTradeActionScheme):
    """Two-action scheme that moves the whole balance between two wallets.

    The scheme remembers the last action it received in ``self.action``.
    While that value is 0 the next trade is sourced from ``cash`` and
    targets ``asset``; otherwise the direction is reversed. An order is
    only created when the incoming action differs from the remembered one.
    """

    registered_name = "bsh"

    def __init__(self, cash: 'Wallet', asset: 'Wallet'):
        """Store the two wallets and start in state 0 with no listeners."""
        super().__init__()
        self.cash = cash
        self.asset = asset

        self.listeners = []
        self.action = 0

    @property
    def action_space(self):
        """Discrete space with two actions (0 and 1)."""
        return Discrete(2)

    def attach(self, listener):
        """Register ``listener`` (must expose ``on_action``) and return self for chaining."""
        self.listeners.append(listener)
        return self

    def get_orders(self, action: int, portfolio: 'Portfolio'):
        """Translate ``action`` into a one-element list of orders.

        Returns ``[None]`` when the action equals the remembered one, and a
        list holding a single proportion order (proportion 1.0) when the
        action changes. Every attached listener is notified of the action
        in both cases.
        """
        orders = [None]

        if action != self.action:
            # Direction depends on the state we are leaving, not the one we enter.
            if self.action == 0:
                source, target = self.cash, self.asset
            else:
                source, target = self.asset, self.cash
            orders = [proportion_order(portfolio, source, target, 1.0)]
            self.action = action

        for listener in self.listeners:
            listener.on_action(action)

        return orders

    def reset(self):
        """Reset the parent scheme and return to state 0."""
        super().reset()
        self.action = 0

In [2]:
#Custom classes for transformer predictor:
import tensorflow as tf
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
#Timevector layer
class Time2Vector(Layer):
  '''Time embedding layer producing one linear and one periodic feature per step.

  Parameters
  ----------
  seq_len : int
      Length of the sequence axis; every weight vector has this length.
  **kwargs
      Standard Keras layer arguments (name, dtype, trainable, ...). They are
      forwarded to the base class so they survive get_config()/from_config().
  '''
  def __init__(self, seq_len, **kwargs):
    # Forward kwargs: dropping them loses the layer name/dtype on reload.
    super(Time2Vector, self).__init__(**kwargs)
    self.seq_len = seq_len

  def build(self, input_shape):
    '''Create the four trainable vectors, each of shape (seq_len,)'''
    self.weights_linear = self.add_weight(name='weight_linear',
                                shape=(int(self.seq_len),),
                                initializer='uniform',
                                trainable=True)
    
    self.bias_linear = self.add_weight(name='bias_linear',
                                shape=(int(self.seq_len),),
                                initializer='uniform',
                                trainable=True)
    
    self.weights_periodic = self.add_weight(name='weight_periodic',
                                shape=(int(self.seq_len),),
                                initializer='uniform',
                                trainable=True)

    self.bias_periodic = self.add_weight(name='bias_periodic',
                                shape=(int(self.seq_len),),
                                initializer='uniform',
                                trainable=True)

  def call(self, x):
    '''Calculate linear and periodic time features.

    x is indexed as a rank-3 tensor; only the first four entries of the last
    axis are used (averaged into one value per step).
    '''
    x = tf.math.reduce_mean(x[:,:,:4], axis=-1) 
    time_linear = self.weights_linear * x + self.bias_linear # Linear time feature
    time_linear = tf.expand_dims(time_linear, axis=-1) # Add dimension (batch, seq_len, 1)
    
    time_periodic = tf.math.sin(tf.multiply(x, self.weights_periodic) + self.bias_periodic)
    time_periodic = tf.expand_dims(time_periodic, axis=-1) # Add dimension (batch, seq_len, 1)
    return tf.concat([time_linear, time_periodic], axis=-1) # shape = (batch, seq_len, 2)
   
  def get_config(self): # Needed for saving and loading model with custom layer
    config = super().get_config().copy()
    config.update({'seq_len': self.seq_len})
    return config

#Transformer Layers
class SingleAttention(Layer):
  '''One scaled dot-product attention head.

  Parameters
  ----------
  d_k : int
      Output width of the query and key projections.
  d_v : int
      Output width of the value projection.
  **kwargs
      Standard Keras layer arguments, forwarded to the base class.
  '''
  def __init__(self, d_k, d_v, **kwargs):
    super(SingleAttention, self).__init__(**kwargs)
    self.d_k = d_k
    self.d_v = d_v

  def build(self, input_shape):
    '''Create the query, key and value projection layers.'''
    self.query = Dense(self.d_k, 
                       input_shape=input_shape, 
                       kernel_initializer='glorot_uniform', 
                       bias_initializer='glorot_uniform')
    
    self.key = Dense(self.d_k, 
                     input_shape=input_shape, 
                     kernel_initializer='glorot_uniform', 
                     bias_initializer='glorot_uniform')
    
    self.value = Dense(self.d_v, 
                       input_shape=input_shape, 
                       kernel_initializer='glorot_uniform', 
                       bias_initializer='glorot_uniform')

  def call(self, inputs): # inputs = (in_seq, in_seq, in_seq)
    '''Return softmax(q k^T / sqrt(d_k)) v for inputs = (query, key, value).'''
    q = self.query(inputs[0])
    k = self.key(inputs[1])

    attn_weights = tf.matmul(q, k, transpose_b=True)
    # Scale with a single tensor division: no per-batch map_fn, and no
    # dependency on numpy being imported by some other cell.
    scale = tf.math.sqrt(tf.cast(self.d_k, attn_weights.dtype))
    attn_weights = attn_weights / scale
    attn_weights = tf.nn.softmax(attn_weights, axis=-1)
    
    v = self.value(inputs[2])
    attn_out = tf.matmul(attn_weights, v)
    return attn_out    

  def get_config(self): # Needed for saving and loading model with custom layer
    config = super().get_config().copy()
    config.update({'d_k': self.d_k,
                   'd_v': self.d_v})
    return config

#############################################################################

class MultiAttention(Layer):
  '''Runs n_heads SingleAttention heads and projects their concatenation.

  Parameters
  ----------
  d_k, d_v : int
      Projection widths handed to every SingleAttention head.
  n_heads : int
      Number of heads to create.
  **kwargs
      Standard Keras layer arguments, forwarded to the base class.
  '''
  def __init__(self, d_k, d_v, n_heads, **kwargs):
    super(MultiAttention, self).__init__(**kwargs)
    self.d_k = d_k
    self.d_v = d_v
    self.n_heads = n_heads
    self.attn_heads = list()

  def build(self, input_shape):
    '''Create the heads and the output projection.'''
    # Rebuild the list instead of appending so that a repeated build()
    # cannot accumulate more than n_heads heads.
    self.attn_heads = [SingleAttention(self.d_k, self.d_v)
                       for _ in range(self.n_heads)]
    
    # The projection width is the last dimension of the first input,
    # i.e. input_shape[0][-1].
    self.linear = Dense(input_shape[0][-1], 
                        input_shape=input_shape, 
                        kernel_initializer='glorot_uniform', 
                        bias_initializer='glorot_uniform')

  def call(self, inputs):
    '''Apply every head to inputs, concatenate on the last axis, then project.'''
    attn = [self.attn_heads[i](inputs) for i in range(self.n_heads)]
    concat_attn = tf.concat(attn, axis=-1)
    multi_linear = self.linear(concat_attn)
    return multi_linear   

  def get_config(self): # Needed for saving and loading model with custom layer
    config = super().get_config().copy()
    config.update({'d_k': self.d_k,
                   'd_v': self.d_v,
                   'n_heads': self.n_heads})
    return config

#############################################################################

class TransformerEncoder(Layer):
  '''Encoder block: multi-head attention followed by a two-layer Conv1D feed-forward part.

  Parameters
  ----------
  d_k, d_v : int
      Projection widths of each attention head.
  n_heads : int
      Number of attention heads.
  ff_dim : int
      Number of filters of the first feed-forward Conv1D.
  dropout : float, default 0.1
      Rate used by both Dropout layers.
  **kwargs
      Standard Keras layer arguments, forwarded to the base class. The keys
      'dropout_rate' and 'attn_heads', which older versions of get_config()
      emitted, are still accepted so previously saved models keep loading.
  '''
  def __init__(self, d_k, d_v, n_heads, ff_dim, dropout=0.1, **kwargs):
    # Older configs stored the rate under 'dropout_rate', which the
    # constructor silently ignored; honour it now.
    legacy_rate = kwargs.pop('dropout_rate', None)
    if legacy_rate is not None:
      dropout = legacy_rate
    # 'attn_heads' was serialized by older configs but is not a constructor input.
    kwargs.pop('attn_heads', None)

    super(TransformerEncoder, self).__init__(**kwargs)
    self.d_k = d_k
    self.d_v = d_v
    self.n_heads = n_heads
    self.ff_dim = ff_dim
    self.attn_heads = list()
    self.dropout_rate = dropout

  def build(self, input_shape):
    '''Create the attention, dropout, normalization and feed-forward sub-layers.'''
    self.attn_multi = MultiAttention(self.d_k, self.d_v, self.n_heads)
    self.attn_dropout = Dropout(self.dropout_rate)
    self.attn_normalize = LayerNormalization(input_shape=input_shape, epsilon=1e-6)

    self.ff_conv1D_1 = Conv1D(filters=self.ff_dim, kernel_size=1, activation='relu')
    # Project back to the width of the first input (input_shape[0][-1]) so
    # the residual additions in call() are shape-compatible.
    self.ff_conv1D_2 = Conv1D(filters=input_shape[0][-1], kernel_size=1) 
    self.ff_dropout = Dropout(self.dropout_rate)
    self.ff_normalize = LayerNormalization(input_shape=input_shape, epsilon=1e-6)    
  
  def call(self, inputs): # inputs = (in_seq, in_seq, in_seq)
    '''Apply attention and feed-forward sub-blocks, each with a residual and layer norm.'''
    attn_layer = self.attn_multi(inputs)
    attn_layer = self.attn_dropout(attn_layer)
    attn_layer = self.attn_normalize(inputs[0] + attn_layer)

    ff_layer = self.ff_conv1D_1(attn_layer)
    ff_layer = self.ff_conv1D_2(ff_layer)
    ff_layer = self.ff_dropout(ff_layer)
    # NOTE(review): this residual adds the block input (inputs[0]) rather than
    # attn_layer. Left unchanged to stay compatible with trained weights;
    # confirm this is intended.
    ff_layer = self.ff_normalize(inputs[0] + ff_layer)
    return ff_layer 

  def get_config(self): # Needed for saving and loading model with custom layer
    config = super().get_config().copy()
    # Keys must match constructor argument names so from_config() restores them.
    config.update({'d_k': self.d_k,
                   'd_v': self.d_v,
                   'n_heads': self.n_heads,
                   'ff_dim': self.ff_dim,
                   'dropout': self.dropout_rate})
    return config


In [3]:
from tensortrade.env.default.rewards import TensorTradeRewardScheme
from tensortrade.feed.core import Stream, DataFeed
from tensortrade.oms.instruments import Instrument
import numpy as np
import pandas as pd
# Instruments for the two wallets built in create_env: USD is the cash side
# and TTC the traded asset. The second argument is presumably the decimal
# precision of the instrument (TODO confirm against tensortrade's Instrument).
USD = Instrument("USD", 2, "U.S. Dollar")
TTC = Instrument("TTC", 8, "TensorTrade Coin")

class MXRewardScheme(TensorTradeRewardScheme):
    """A reward scheme that rewards the agent for increasing its net worth,
        while penalizing more volatile strategies.
        Parameters
        ----------
        :param: return_algorithm : {'sharpe', 'diff_sharpe', 'sortino'}, Default 'sharpe'.
            The risk-adjusted return metric to use.
        :param: risk_free_rate : float, Default 0.
            The risk free rate of returns to use for calculating metrics.
        :param: target_returns : float, Default 0
            The target returns per period for use in calculating the sortino ratio.
        :param: window_size : int
            The size of the look back window for computing the reward.
        """

    def __init__(self,
                 return_algorithm: str = 'sharpe',
                 risk_free_rate: float = 0.,
                 target_returns: float = 0.,
                 window_size: int = 1) -> None:
        algorithm = self.default('return_algorithm', return_algorithm)

        assert algorithm in ['sharpe', 'diff_sharpe', 'sortino']

        if algorithm == 'sharpe':
            return_algorithm = self._sharpe_ratio
        elif algorithm == 'sortino':
            return_algorithm = self._sortino_ratio
        elif algorithm == 'diff_sharpe':
            return_algorithm = self._diff_sharpe_ratio

        # From here on `return_algorithm` is the bound method, not the name.
        self._return_algorithm = return_algorithm
        self._risk_free_rate = self.default('risk_free_rate', risk_free_rate)
        self._target_returns = self.default('target_returns', target_returns)
        self._window_size = self.default('window_size', window_size)

    def _sharpe_ratio(self, returns: 'pd.Series') -> float:
        """Computes the sharpe ratio for a given series of a returns.
        Parameters
        ----------
        returns : `pd.Series`
            The returns for the `portfolio`.
        Returns
        -------
        float
            The sharpe ratio for the given series of a `returns`.
        References
        ----------
        .. [1] https://en.wikipedia.org/wiki/Sharpe_ratio
        """
        # The 1e-9 terms keep the ratio finite when the deviation is zero.
        return (np.mean(returns) - self._risk_free_rate + 1e-9) / (np.std(returns) + 1e-9)

    def _diff_sharpe_ratio(self, returns: 'pd.Series') -> float:
        """Computes the differential sharpe ratio over a given series of returns
        Parameters
        ----------
        returns : `pd.Series`
            The returns for the 'portfolio'
        Returns
        -------
        float
            The differential sharpe ratio for the given series of a `returns`
        Raises
        ------
        FloatingPointError
            If any floating-point error (division by zero, overflow, invalid
            value) occurs during the computation.
        References
        ----------
        .. [1] https://proceedings.neurips.cc/paper/1998/file/4e6cd95227cb0c280e99a195be5f6615-Paper.pdf
        .. [2] https://github.com/AchillesJJ/DSR
        """
        eta = 0.004

        # Strict floating-point handling is scoped to this computation.
        # np.seterr('raise') would change NumPy's error mode for the whole
        # process and never restore it.
        with np.errstate(all='raise'):
            # A and B are the mean and mean square of the most recent return only.
            # NOTE(review): delta_B uses the squared mean of `returns` rather
            # than the mean of squares; confirm against the references.
            A = np.mean(returns[-1:])
            B = np.mean(returns[-1:]**2)
            delta_A = np.mean(returns) - A
            delta_B = np.mean(returns)**2 - B
            upper = ((B * delta_A - 0.5*A*delta_B) + 1e-9)
            lower = (B-A**2 + 1e-9)**(3/2)

            # Dump the intermediates when the denominator is degenerate.
            # The previous `lower == (0 or np.isnan(lower))` never matched NaN.
            if lower == 0 or np.isnan(lower):
                print(f"A:{A}\n"
                      f"B:{B}\n"
                      f"delta_A:{delta_A}\n"
                      f"delta_B:{delta_B}\n"
                      f"upper:{upper}\n"
                      f"lower:{lower}\n")
            dt = upper / lower

        return dt * eta

    def _sortino_ratio(self, returns: 'pd.Series') -> float:
        """Computes the sortino ratio for a given series of a returns.
        Parameters
        ----------
        returns : `pd.Series`
            The returns for the `portfolio`.
        Returns
        -------
        float
            The sortino ratio for the given series of a `returns`.
        References
        ----------
        .. [1] https://en.wikipedia.org/wiki/Sortino_ratio
        """
        # Returns below the target are replaced by their square; the rest
        # are kept as they are.
        # NOTE(review): the denominator is sqrt(std(...)) of this mixed series,
        # which does not look like the downside deviation described in the
        # reference; left unchanged, confirm it is intended.
        downside_returns = returns.copy()
        downside_returns[returns < self._target_returns] = returns ** 2

        expected_return = np.mean(returns)
        downside_std = np.sqrt(np.std(downside_returns))

        result = (expected_return - self._risk_free_rate + 1e-9) / (downside_std + 1e-9)

        return result

    def get_reward(self, portfolio: 'Portfolio') -> float:
        """Computes the reward corresponding to the selected risk-adjusted return metric.
        Parameters
        ----------
        portfolio : `Portfolio`
            The current portfolio being used by the environment.
        Returns
        -------
        float
            The reward corresponding to the selected risk-adjusted return metric.
        """
        # window_size + 1 net-worth samples yield window_size percentage changes.
        net_worths = [nw['net_worth'] for nw in portfolio.performance.values()][-(self._window_size + 1):]
        returns = pd.Series(net_worths).pct_change().dropna()
        risk_adjusted_return = self._return_algorithm(returns)
        return risk_adjusted_return

class PBR(TensorTradeRewardScheme):
    """Position-based reward: price change multiplied by the current position.

    The position is -1 after action 0 and +1 after any other action. The
    reward is read from an internal data feed on every step.
    """

    registered_name = "pbr"

    def __init__(self, price: 'Stream'):
        """Build the reward feed from ``price``; the position starts at -1."""
        super().__init__()
        self.position = -1

        # Step-to-step difference of the observed price value.
        price_change = Stream.sensor(price, lambda stream: stream.value, dtype="float").diff()
        # Reads this scheme's position each time the feed advances.
        held_position = Stream.sensor(self, lambda scheme: scheme.position, dtype="float")

        reward_stream = (price_change * held_position).fillna(0).rename("reward")

        self.feed = DataFeed([reward_stream])
        self.feed.compile()

    def on_action(self, action: int):
        """Listener hook: map action 0 to position -1, anything else to +1."""
        if action == 0:
            self.position = -1
        else:
            self.position = 1

    def get_reward(self, portfolio: 'Portfolio'):
        """Advance the feed one step and return its "reward" entry (``portfolio`` is unused)."""
        step = self.feed.next()
        return step["reward"]

    def reset(self):
        """Restore the initial position and reset the feed."""
        self.position = -1
        self.feed.reset()

In [4]:
import matplotlib.pyplot as plt
import pandas as pd
from tensortrade.env.generic import Renderer


class PositionChangeChart(Renderer):
    """Renderer that plots the price with position changes next to portfolio performance.

    Parameters
    ----------
    color : str, default "orange"
        Colour of the price line.
    """

    def __init__(self, color: str = "orange"):
        # Honour the argument; it used to be ignored in favour of "orange".
        self.color = color

    def render(self, env, **kwargs):
        """Draw a two-panel figure from the environment's renderer history.

        The left panel shows the "price" column with a green marker where the
        action goes from 0 to 1 and a red marker for any other change. The
        right panel plots the portfolio performance records.
        """
        history = pd.DataFrame(env.observer.renderer_history)

        actions = list(history.action)
        p = list(history.price)

        buy = {}
        sell = {}

        # Compare each action with its successor; a change at step i is
        # plotted at the price of step i.
        for i, (a1, a2) in enumerate(zip(actions, actions[1:])):
            if a1 == a2:
                continue
            if a1 == 0 and a2 == 1:
                buy[i] = p[i]
            else:
                sell[i] = p[i]

        buy = pd.Series(buy)
        sell = pd.Series(sell)

        fig, axs = plt.subplots(1, 2, figsize=(15, 5))

        fig.suptitle("Performance")

        axs[0].plot(np.arange(len(p)), p, label="price", color=self.color)
        axs[0].scatter(buy.index, buy.values, marker="^", color="green")
        axs[0].scatter(sell.index, sell.values, marker="^", color="red")
        axs[0].set_title("Trading Chart")
        
        performance = pd.DataFrame.from_dict(env.action_scheme.portfolio.performance, orient='index')
        performance.plot(ax=axs[1])
        axs[1].set_title("Net Worth")

        plt.show()

In [5]:

# Load the price data and discard the 'Unnamed: 0' column (presumably an
# index column written by an earlier to_csv -- verify against the file).
dataframe = (
    pd.read_csv('BTC-USD_Cleaned_Megafile_01_01_2020-12_31_2020.csv')
    .drop(columns=['Unnamed: 0'])
)

In [6]:
import ray
import pandas as pd
import numpy as np
from ray import tune
from ray.tune.registry import register_env

import tensortrade.env.default as default

from tensortrade.feed.core import DataFeed, Stream
from tensortrade.oms.exchanges import Exchange
from tensortrade.oms.services.execution.simulated import execute_order
from tensortrade.oms.wallets import Wallet, Portfolio



def create_env(config):
    """Build the trading environment described by ``config``.

    ``config`` must provide:
      * "data": table with the columns close, open, high, low, volume,
        rel_low, rel_high, rel_open and rel_close,
      * "USD" / "BTC": starting balances of the cash and asset wallets,
      * "window_size": observation window passed to the environment.

    Returns the environment created by ``default.create``.
    """
    data = config["data"]

    def column_stream(column, name):
        # One float stream per data column, published under `name`.
        return Stream.source(list(data[column]), dtype="float").rename(name)

    def log_return(stream, name):
        # Difference of the logged stream; the undefined first value becomes 0.
        return stream.log().diff().fillna(0).rename(name)

    # The close stream doubles as the exchange's price stream.
    c = column_stream("close", "USD-TTC")
    o = column_stream("open", "open")
    h = column_stream("high", "high")
    l = column_stream("low", "low")
    v = column_stream("volume", "volume")

    bitfinex = Exchange("bitfinex", service=execute_order)(c)

    cash = Wallet(bitfinex, config["USD"] * USD)
    asset = Wallet(bitfinex, config["BTC"] * TTC)
    portfolio = Portfolio(USD, [cash, asset])

    # Stream order fixes the column order of the observations.
    feed = DataFeed([
        o, log_return(o, "o_lr"),
        h, log_return(h, "h_lr"),
        l, log_return(l, "l_lr"),
        c, log_return(c, "c_lr"),
        v,
        column_stream("rel_low", "rel_low"),
        column_stream("rel_high", "rel_high"),
        column_stream("rel_open", "rel_open"),
        column_stream("rel_close", "rel_close"),
    ])

    # Alternative: reward_scheme = MXRewardScheme(return_algorithm='diff_sharpe')
    # NOTE(review): the reward follows the open price while the exchange
    # trades on the close price -- confirm this is intended.
    reward_scheme = PBR(price=o)

    # Remove the .attach(...) call for non-PBR reward schemes.
    action_scheme = BSH(cash=cash, asset=asset).attach(reward_scheme)

    renderer_feed = DataFeed([
        column_stream("close", "price"),
        Stream.sensor(action_scheme, lambda s: s.action, dtype="float").rename("action"),
    ])

    return default.create(
        feed=feed,
        portfolio=portfolio,
        action_scheme=action_scheme,
        reward_scheme=reward_scheme,
        renderer_feed=renderer_feed,
        renderer=PositionChangeChart(),
        window_size=config["window_size"],
        max_allowed_loss=0.6,
    )

# Register the factory under the name "TradingEnv"; the training and restore
# configs refer to the environment by this name through their "env" entry.
register_env("TradingEnv", create_env)

In [7]:
ray.init(log_to_driver=False, ignore_reinit_error=True)

# Settings handed to create_env on every worker.
env_config = {
    "window_size": 25,
    "data": dataframe,
    "USD": 100,
    "BTC": 0,
}

# [timestep, learning rate] pairs.
# NOTE(review): "lr" and "lr_schedule" are both set; the logged cur_lr values
# (8e-07 first, then 0.0007, 0.0003, ...) suggest the schedule takes over
# after the first iteration -- confirm which one is intended.
lr_schedule = [
    [0, 1e-1],
    [100, 1e-2],
    [1_000, 1e-3],
    [10_000, 1e-4],
    [100_000, 1e-5],
    [1_000_000, 1e-6],
    [10_000_000, 1e-7],
]

ppo_config = {
    "env": "TradingEnv",
    "env_config": env_config,
    "log_level": "DEBUG",
    "framework": "torch",
    "ignore_worker_failures": True,
    "num_workers": 1,
    "num_gpus": 0,
    "clip_rewards": True,
    "lr": 8e-7,
    "lr_schedule": lr_schedule,
    "gamma": 0,
    "observation_filter": "MeanStdFilter",
    "lambda": 0.72,
    "vf_loss_coeff": 0.5,
    "entropy_coeff": 0.01,
}

# Train PPO until the mean episode reward reaches 500; keep a final checkpoint.
analysis = tune.run(
    "PPO",
    stop={"episode_reward_mean": 500},
    config=ppo_config,
    checkpoint_at_end=True,
)

2021-06-07 17:56:23,289	INFO services.py:1267 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2021-06-07 17:56:39,109	ERROR syncer.py:72 -- Log sync requires rsync to be installed.


Trial name,status,loc
PPO_TradingEnv_3c444_00000,RUNNING,


Result for PPO_TradingEnv_3c444_00000:
  agent_timesteps_total: 4000
  custom_metrics: {}
  date: 2021-06-07_17-57-03
  done: false
  episode_len_mean: 344.09090909090907
  episode_media: {}
  episode_reward_max: 623.5999999999931
  episode_reward_mean: -13.915454545457413
  episode_reward_min: -348.4099999999971
  episodes_this_iter: 11
  episodes_total: 11
  experiment_id: 6da27ddd05464703be53231eb69849db
  hostname: MSI
  info:
    learner:
      default_policy:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.2
          cur_lr: 8.0e-07
          entropy: 0.6931188702583313
          entropy_coeff: 0.01
          kl: 2.2926852750515536e-05
          policy_loss: -0.008015550498384982
          total_loss: 0.4642219031229615
          vf_explained_var: 0.0009087827056646347
          vf_loss: 0.9583281148225069
    num_agent_steps_sampled: 4000
    num_steps_sampled: 4000
    num_steps_trained: 4000
  iterations_since_restore: 1
  node_ip: 192.168.25

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TradingEnv_3c444_00000,RUNNING,192.168.254.29:9744,1,15.3182,4000,-13.9155,623.6,-348.41,344.091


Result for PPO_TradingEnv_3c444_00000:
  agent_timesteps_total: 8000
  custom_metrics: {}
  date: 2021-06-07_17-57-18
  done: false
  episode_len_mean: 338.2608695652174
  episode_media: {}
  episode_reward_max: 623.5999999999931
  episode_reward_mean: -5.763478260871362
  episode_reward_min: -469.7300000000023
  episodes_this_iter: 12
  episodes_total: 23
  experiment_id: 6da27ddd05464703be53231eb69849db
  hostname: MSI
  info:
    learner:
      default_policy:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.1
          cur_lr: 0.0007000000000000001
          entropy: 0.6361785437911749
          entropy_coeff: 0.01
          kl: 0.05854135821573436
          policy_loss: -0.2962749097496271
          total_loss: 0.16035953437676653
          vf_explained_var: 0.04728589206933975
          vf_loss: 0.9142841715365648
    num_agent_steps_sampled: 8000
    num_steps_sampled: 8000
    num_steps_trained: 8000
  iterations_since_restore: 2
  node_ip: 192.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TradingEnv_3c444_00000,RUNNING,192.168.254.29:9744,2,30.6443,8000,-5.76348,623.6,-469.73,338.261


Result for PPO_TradingEnv_3c444_00000:
  agent_timesteps_total: 12000
  custom_metrics: {}
  date: 2021-06-07_17-57-34
  done: false
  episode_len_mean: 336.57142857142856
  episode_media: {}
  episode_reward_max: 1222.8499999999985
  episode_reward_mean: 164.9354285714272
  episode_reward_min: -469.7300000000023
  episodes_this_iter: 12
  episodes_total: 35
  experiment_id: 6da27ddd05464703be53231eb69849db
  hostname: MSI
  info:
    learner:
      default_policy:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.15000000000000002
          cur_lr: 0.00030000000000000003
          entropy: 0.40287499595433474
          entropy_coeff: 0.01
          kl: 0.1262293483596295
          policy_loss: -0.26622440543724224
          total_loss: 0.15465577799477614
          vf_explained_var: 0.06405028700828552
          vf_loss: 0.8119490593671799
    num_agent_steps_sampled: 12000
    num_steps_sampled: 12000
    num_steps_trained: 12000
  iterations_since_res

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TradingEnv_3c444_00000,RUNNING,192.168.254.29:9744,3,45.6281,12000,164.935,1222.85,-469.73,336.571


Result for PPO_TradingEnv_3c444_00000:
  agent_timesteps_total: 16000
  custom_metrics: {}
  date: 2021-06-07_17-57-49
  done: false
  episode_len_mean: 337.1063829787234
  episode_media: {}
  episode_reward_max: 2044.639999999994
  episode_reward_mean: 469.6014893617002
  episode_reward_min: -469.7300000000023
  episodes_this_iter: 12
  episodes_total: 47
  experiment_id: 6da27ddd05464703be53231eb69849db
  hostname: MSI
  info:
    learner:
      default_policy:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.22500000000000003
          cur_lr: 9.800000000000001e-05
          entropy: 0.23104535974562168
          entropy_coeff: 0.01
          kl: 0.04587947437539697
          policy_loss: -0.11940986511763185
          total_loss: 0.10209224373102188
          vf_explained_var: 0.08195527642965317
          vf_loss: 0.42697936575859785
    num_agent_steps_sampled: 16000
    num_steps_sampled: 16000
    num_steps_trained: 16000
  iterations_since_rest

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TradingEnv_3c444_00000,RUNNING,192.168.254.29:9744,4,60.9648,16000,469.601,2044.64,-469.73,337.106


Result for PPO_TradingEnv_3c444_00000:
  agent_timesteps_total: 20000
  custom_metrics: {}
  date: 2021-06-07_17-58-05
  done: true
  episode_len_mean: 333.91525423728814
  episode_media: {}
  episode_reward_max: 2047.1699999999983
  episode_reward_mean: 681.4845762711839
  episode_reward_min: -469.7300000000023
  episodes_this_iter: 12
  episodes_total: 59
  experiment_id: 6da27ddd05464703be53231eb69849db
  hostname: MSI
  info:
    learner:
      default_policy:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.3375
          cur_lr: 9.400000000000001e-05
          entropy: 0.1639405540190637
          entropy_coeff: 0.01
          kl: 0.00860477308742702
          policy_loss: -0.07632988359546289
          total_loss: 0.025280765490606427
          vf_explained_var: 0.13750013709068298
          vf_loss: 0.2006918927654624
    num_agent_steps_sampled: 20000
    num_steps_sampled: 20000
    num_steps_trained: 20000
  iterations_since_restore: 5
  node

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TradingEnv_3c444_00000,RUNNING,192.168.254.29:9744,5,76.0818,20000,681.485,2047.17,-469.73,333.915


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TradingEnv_3c444_00000,TERMINATED,,5,76.0818,20000,681.485,2047.17,-469.73,333.915


2021-06-07 17:58:07,258	INFO tune.py:549 -- Total run time: 90.88 seconds (87.05 seconds for the tuning loop).


In [129]:
# Shut down the Ray runtime started by ray.init().
# NOTE(review): the execution count of this cell (In [129]) shows it was run
# out of order; in a top-to-bottom run it executes before the agent is
# restored and evaluated below -- confirm that is intended.
ray.shutdown()

In [8]:
import ray.rllib.agents.ppo as ppo

# Locate the checkpoint of the trial with the highest mean episode reward.
best_trial = analysis.get_best_trial("episode_reward_mean", mode="max")
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=best_trial,
    metric="episode_reward_mean",
)
# First entry of the returned list, first field of that entry (the path).
checkpoint_path = checkpoints[0][0]

# Rebuild a trainer with the same settings used for training, then load the weights.
restore_config = {
    "env_config": {
        "window_size": 25,
        "data": dataframe,
        "USD": 100,
        "BTC": 0,
    },
    "framework": "torch",
    "log_level": "DEBUG",
    "ignore_worker_failures": True,
    "num_workers": 1,
    "num_gpus": 0,
    "clip_rewards": True,
    "lr": 8e-7,
    "lr_schedule": [
        [0, 1e-1],
        [100, 1e-2],
        [1_000, 1e-3],
        [10_000, 1e-4],
        [100_000, 1e-5],
        [1_000_000, 1e-6],
        [10_000_000, 1e-7],
    ],
    "gamma": 0,
    "observation_filter": "MeanStdFilter",
    "lambda": 0.72,
    "vf_loss_coeff": 0.5,
    "entropy_coeff": 0.01,
}
agent = ppo.PPOTrainer(env="TradingEnv", config=restore_config)
agent.restore(checkpoint_path)

2021-06-07 18:00:07,172	DEBUG rollout_worker.py:1122 -- Creating policy for default_policy
2021-06-07 18:00:07,183	DEBUG catalog.py:631 -- Created preprocessor <ray.rllib.models.preprocessors.NoPreprocessor object at 0x000001D38DE73C10>: Box(-inf, inf, (25, 13), float32) -> (25, 13)
2021-06-07 18:00:07,184	INFO torch_policy.py:112 -- TorchPolicy running on CPU.
2021-06-07 18:00:07,216	INFO rollout_worker.py:1161 -- Built policy map: {'default_policy': <ray.rllib.policy.policy_template.PPOTorchPolicy object at 0x000001D38DE73190>}
2021-06-07 18:00:07,217	INFO rollout_worker.py:1162 -- Built preprocessor map: {'default_policy': <ray.rllib.models.preprocessors.NoPreprocessor object at 0x000001D38DE73C10>}
2021-06-07 18:00:07,217	DEBUG rollout_worker.py:531 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)
2021-06-07 18:00:07,218	INFO rollout_worker.py:563 -- Built filter map: {'default_policy': MeanStdFilter((25, 13), True, True, None, (n=0, mean_mean=0.0,

In [11]:
# Build a fresh environment with the same settings the agent was trained on.
env = create_env(dict(window_size=25, data=dataframe, USD=100, BTC=0))

# Play one episode with the restored agent, accumulating the reward.
episode_reward = 0
obs = env.reset()

while True:
    action = agent.compute_action(obs)
    obs, reward, done, info = env.step(action)
    episode_reward += reward
    if done:
        break

# Draw the trading chart and the performance plot for the finished episode.
env.render()