### Optimized sine-wave reject test.

This notebook presents some basic ideas on state representation, reward shaping, model architecture and hyperparameter choice. With these tweaks the sine-wave sanity test converges faster and with greater stability.


Key points:
- using 1d convolutions along time-embedding dimension in policy model;
- shaping observation state as bank of open-price simple moving averages with increasing time-periods;
- ensuring signal stationarity by taking gradients along features axis;
- fighting signal outliers with log-scaling;
- making reward a function of current unrealised profit/loss with the addition of a one-time amplified realized trade result;
- allowing variable trade position size.
- using env.skip_frame=10. Introducing a prior distribution over actions seems critical to agent performance: I was not able to achieve good results with skip_frame < 8. Obviously this is not the optimal way to shape priors, but it works for now;
- lowering the entropy regularization term.

In [None]:
import backtrader as bt
import backtrader.indicators as btind
import numpy as np

from gym import spaces

from btgym import BTgymEnv, BTgymStrategy, BTgymDataset

from btgym.a3c import Launcher, BaseLSTMPolicy

import tensorflow as tf
import tensorflow.contrib.rnn as rnn
from tensorflow.python.util.nest import flatten as flatten_nested

In [None]:
class LSTMPolicy1D(BaseLSTMPolicy):
    """
    A3C LSTM policy model with 1D convolution layers.

    The observation placeholder is passed through four ELU-activated 1D
    convolution layers; the resulting tensor is handed to `BaseLSTMPolicy`,
    which builds the LSTM and everything else.
    """
    def __init__(self, ob_space, ac_space, num_filters=32, filter_size=5, stride=2,
                 lstm_class=rnn.BasicLSTMCell, lstm_layers=(256,)):
        """
        Args:
            ob_space:     iterable of observation dimensions; a leading `None`
                          dimension is prepended to form the input placeholder shape.
            ac_space:     forwarded unchanged to `BaseLSTMPolicy`.
            num_filters:  number of output channels of every convolution layer.
            filter_size:  convolution filter width.
            stride:       convolution stride.
            lstm_class:   forwarded unchanged to `BaseLSTMPolicy`.
            lstm_layers:  forwarded unchanged to `BaseLSTMPolicy`.
        """
        self.x = x = tf.placeholder(tf.float32, [None] + list(ob_space), name='x_in_pl')

        # Conv layers, created in variable scopes "l1".."l4":
        for i in range(4):
            x = tf.nn.elu(self.conv1d(x, num_filters, "l{}".format(i + 1), filter_size, stride))

        # Run LSTM along rollout time dimension and everything else:
        super(LSTMPolicy1D, self).__init__(x, ob_space, ac_space, lstm_class, lstm_layers)

    def conv1d(self, x, num_filters, name, filter_size=3, stride=2, pad="SAME", dtype=tf.float32,
               collections=None):
        """
        Builds a single 1D convolution layer with bias (no activation).

        Args:
            x:            input tensor; its last dimension is taken as the number of input channels.
            num_filters:  number of output channels.
            name:         variable scope holding the layer variables "W" and "b".
            filter_size:  filter width.
            stride:       convolution stride.
            pad:          padding mode passed to `tf.nn.conv1d`.
            dtype:        data type of both the filter and the bias variables.
            collections:  passed to `tf.get_variable` for both variables.

        Returns:
            tensor `conv1d(x, W) + b`.
        """
        with tf.variable_scope(name):
            in_channels = int(x.get_shape()[-1])
            filter_shape = [filter_size, in_channels, num_filters]

            # Each output unit receives "filter width * num input channels" inputs:
            fan_in = np.prod(filter_shape[:2])

            # Each input unit feeds "filter width * num output channels" outputs:
            fan_out = np.prod(filter_shape[:1]) * num_filters

            # Uniform init bound sqrt(6 / (fan_in + fan_out)) (Glorot/Xavier-style):
            w_bound = np.sqrt(6. / (fan_in + fan_out))

            w = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound),
                                collections=collections)
            # Bias uses the same dtype as the filter, so `conv + b` stays type-consistent
            # when a non-default dtype is requested:
            b = tf.get_variable("b", [1, 1, num_filters], dtype=dtype,
                                initializer=tf.constant_initializer(0.0),
                                collections=collections)
            return tf.nn.conv1d(x, w, stride, pad) + b
   

In [None]:

class MyStrategy(BTgymStrategy):
    """
    BT server inner computation strategy tuned to pass sine-wave sanity test.

    Observation state is a bank of open-price simple moving averages with
    increasing periods, differentiated along the features axis and log-scaled.
    Reward combines averaged unrealised profit/loss, normalized averaged
    broker value and a one-time amplified realized trade result.
    """

    def __init__(self, **kwargs):
        super(MyStrategy, self).__init__(**kwargs)

        # Time-embedding length, read from the first dimension of 'raw_state' space:
        self.dim_time = self.p.state_shape['raw_state'].shape[0]

        self.trade_just_closed = False
        self.trade_result = None

        self.realised_broker_value = self.env.broker.startingcash
        self.episode_result = 0
        self.reward = 0

        self.avg_period = 10  # should be somehow consistent with skip_frame value

        # Signal features: open-price SMAs, stored on the data feed as
        # `sma_4`, `sma_8`, ..., `sma_256`:
        self.sma_periods = (4, 8, 16, 32, 64, 128, 256)
        for period in self.sma_periods:
            setattr(
                self.data,
                'sma_{}'.format(period),
                btind.SimpleMovingAverage(self.datas[0], period=period)
            )

        # Service sma to get correct first features values
        # (its period spans the longest feature period plus the embedding length):
        self.data.dim_sma = btind.SimpleMovingAverage(
            self.datas[0],
            period=(max(self.sma_periods) + self.dim_time)
        )
        self.data.dim_sma.plotinfo.plot = False

    def notify_trade(self, trade):
        """
        Trade notification callback: remembers result of a just-closed trade.
        """
        if trade.isclosed:
            # Set trade flag and store trade result:
            self.trade_just_closed = True
            self.trade_result = trade.pnlcomm

            # Store realized portfolio value:
            self.realised_broker_value = self.broker.get_value()

    def get_state(self):
        """
        Computes obs. state as [time_dim, 8] matrix of log-scaled features gradients.

        Returns:
            `self.state` dictionary with 'raw_state' and 'model_input' entries updated.
        """
        # Gradient amplifier: 1e4 is used for EURUSD (original note suggests 1 for BTCUSD).
        T = 1e4

        # Feature columns, in order: open price, then SMAs of increasing period.
        feature_lines = [self.data.open]
        feature_lines += [getattr(self.data, 'sma_{}'.format(period)) for period in self.sma_periods]

        x = np.stack(
            [np.frombuffer(line.get(size=self.dim_time)) for line in feature_lines],
            axis=-1
        )
        # Amplified gradient along features axis:
        x = np.gradient(x, axis=1) * T

        # Log-scale:
        x = self.log_transform(x)

        self.state['raw_state'] = self.raw_state

        self.state['model_input'] = x

        return self.state

    def log_transform(self, x):
        """Sign-preserving log-scaling: sign(x) * log(|x| + 1)."""
        return np.sign(x) * np.log(np.fabs(x) + 1)

    def norm_log_value(self, current_value, start_value, drawdown_call, target_call, epsilon=1e-4):
        """
        Current value log-normalized in [-1,1] wrt upper and lower bounds.

        Percent change of `current_value` wrt `start_value` is mapped so that
        `-drawdown_call` gives 0 and `target_call` gives 1, clipped to
        [epsilon, 1 - epsilon] and then log-scaled.
        """
        x = np.asarray(current_value)
        x = (x / start_value - 1) * 100
        x = (x - target_call) / (drawdown_call + target_call) + 1
        x = np.clip(x, epsilon, 1 - epsilon)
        x = 1 - 2 * np.log(x) / np.log(epsilon)
        return x

    def norm_value(self, current_value, start_value, drawdown_call, target_call, epsilon=1e-8):
        """
        Current value normalized in [-1,1] wrt upper and lower bounds.

        Percent change of `current_value` wrt `start_value` is mapped so that
        `-drawdown_call` gives 0 and `target_call` gives 1, clipped to
        [epsilon, 1 - epsilon] and linearly rescaled.
        """
        x = np.asarray(current_value)
        x = (x / start_value - 1) * 100
        x = (x - target_call) / (drawdown_call + target_call) + 1
        x = 2 * np.clip(x, epsilon, 1 - epsilon) - 1
        return x

    def decayed_result(self, trade_result, current_value, start_value, drawdown_call, target_call, gamma=0.8):
        """
        Trade result scaled by the value range, linearly decayed wrt current value.

        Decay factor equals `gamma` when `current_value` is at the target bound and
        1 when it is at the drawdown bound. The result is not clipped.
        """
        target_value = start_value * (1 + target_call / 100)
        value_range = start_value * (drawdown_call + target_call) / 100
        decay = (gamma - 1) * (current_value - target_value) / value_range + gamma
        x = trade_result * decay / value_range
        return x

    def get_reward(self):
        """
        Defines reward as composition of portfolio performance statistics.

        Returns:
            scalar reward, also stored in `self.reward`.
        """
        # Broker value averaged over last `avg_period` steps; shared by all reward terms:
        avg_broker_value = np.average(self.stats.broker.value.get(size=self.avg_period))
        starting_cash = self.env.broker.startingcash

        # Reward term 1: averaged profit/loss for current opened trade (unrealized p/l):
        unrealised_pnl = avg_broker_value - self.realised_broker_value

        # Normalize:
        unrealised_pnl /= starting_cash * (self.p.drawdown_call + self.p.target_call) / 100

        # Reward term 2: averaged broker value, normalized wrt to max drawdown and target bounds.
        avg_norm_broker_value = self.norm_value(
            avg_broker_value,
            starting_cash,
            self.p.drawdown_call,
            self.p.target_call,
        )

        # Reward term 3: normalized single trade realized profit/loss:
        if self.trade_just_closed:
            realized_pnl = self.decayed_result(
                self.trade_result,
                avg_broker_value,
                starting_cash,
                self.p.drawdown_call,
                self.p.target_call,
                gamma=1.0
            )
            # Realized result is rewarded only once per closed trade:
            self.trade_just_closed = False

        else:
            realized_pnl = 0

        # Coefficients are tunable:
        self.reward = unrealised_pnl + 1e-2 * avg_norm_broker_value + 10 * realized_pnl

        return self.reward

class RewardObserver(bt.observer.Observer):
    """
    Observer exposing the owner's current reward as a `reward` line,
    so that it gets visualised on the episode plot.
    """
    lines = ('reward',)

    # Draw the reward on a subplot of its own:
    plotinfo = {'plot': True, 'subplot': True}

    plotlines = {
        'reward': {'markersize': 4.0, 'color': 'green', 'fillstyle': 'full'},
    }

    def next(self):
        # Copy the latest reward value held by the owner into the current slot of the line.
        current_reward = self._owner.reward
        self.lines.reward[0] = current_reward

########################################################       


# Backtesting engine parameters.

# Length of the time-embedding (first dimension of both observation spaces):
time_embed_dim = 16

state_shape = {
    'raw_state': spaces.Box(low=-100, high=100, shape=(time_embed_dim, 4)),
    'model_input': spaces.Box(low=-100, high=100, shape=(time_embed_dim, 8)),
}

MyCerebro = bt.Cerebro()

MyCerebro.addstrategy(
    MyStrategy,
    state_shape=state_shape,
    portfolio_actions=('hold', 'buy', 'sell', 'close'),
    drawdown_call=5,  # max % to lose, in percent of initial cash
    target_call=8,  # max % to win, same
    skip_frame=10,
)

# Leveraged account setup:
MyCerebro.broker.setcash(2000)
MyCerebro.broker.setcommission(commission=0.0001, leverage=10.0)  # commission to imitate spread
MyCerebro.addsizer(bt.sizers.SizerFix, stake=5000)

MyCerebro.addanalyzer(bt.analyzers.DrawDown)

# Plot reward along with the episode:
MyCerebro.addobserver(RewardObserver)

# Dataset (synthetic sine-wave data file):
MyDataset = BTgymDataset(
    filename='../data/test_sine_1min_period256_delta0002.csv',
    start_weekdays=[0, 1, 2, 3, 4],
    episode_len_days=1,
    episode_len_hours=2,
    episode_len_minutes=0,
    start_00=False,
    time_gap_hours=6,
)

# Environment parameters:
env_config = {
    'dataset': MyDataset,
    'engine': MyCerebro,
    'render_modes': ['episode', 'human', 'model_input'],
    'render_state_as_image': True,
    'render_ylabel': 'SMA_log_gradients',
    'render_size_episode': (12, 8),
    'render_size_human': (10, 5),
    'render_size_state': (10, 5),
    'render_dpi': 75,
    'port': 5000,
    'data_port': 4999,
    'connect_timeout': 60,
    'verbose': 0,
}

# Tensorflow distributed cluster and a3c configuration:
cluster_config = {
    'host': '127.0.0.1',
    'port': 12222,
    'num_workers': 8,  # ~ num of CPU cores
    'num_ps': 1,
    'log_dir': './tmp/a3c_test_4',
}

launcher = Launcher(
    cluster_config=cluster_config,
    env_class=BTgymEnv,
    env_config=env_config,
    policy_class=LSTMPolicy1D,
    policy_config={'lstm_layers': (256,)},
    rollout_length=20,
    model_beta=0.02,  # entropy regularization, should be in ~[0.1, 0.01]
    opt_learn_rate=1e-4,  # adam learn rate
    test_mode=False,
    train_steps=1000000000,
    model_summary_freq=20,
    episode_summary_freq=1,
    env_render_freq=10,
    verbose=1
)

In [None]:
# Train it: start the run configured by `launcher` above.
launcher.run()

# To track performance, run in a shell:  tensorboard --logdir './tmp/a3c_test_4'
# (the path is the `log_dir` value set in `cluster_config`).


In [None]:
# Just in case: a stand-alone environment instance for manual testing.
# `env_config` is modified in place to use ports 5090/5089 instead of 5000/4999.

env_config.update(port=5090, data_port=5089)
env = BTgymEnv(**env_config)