###  test 4.2:
-----

##### data:
real data, small set

##### topics:
- observation state shaping
- reward shaping
- policy architecture

##### abstract:
Using convolution layers to shape state features; added a reward term to force the position to be closed by the end of the episode.

##### key points:
- using broker/portfolio statistics to augment state observation ('inner' state);
- using 1d convolutions along time-embedding dimension for 'outer' state signal;
- outer observation state is matrix of shape [dim_time_embedding, 3] of:
        - one-step difference of Open price;
        - difference in current Open and High price;
        - difference in current Open and Low price;
- feeding 'inner' part of state directly to LSTM layer;
- using a kind of "skip connection" to feed `outer` state both to convolution layers and to LSTM layer, which seems to give faster convergence;
- shaping reward as weighted sum of averaged broker statistics:
        - unrealised profit/loss;
        - realized trade result;
        - current portfolio value;
        - [added] prohibitive penalty for opened position at the end of episode;
- variable trade position size;
- using env.skip_frame=10.


In [None]:
import backtrader as bt
import backtrader.indicators as btind
import numpy as np

from gym import spaces

from btgym import BTgymEnv, BTgymStrategy, BTgymDataset

from btgym.a3c import Launcher, BaseLSTMPolicy

import tensorflow as tf
import tensorflow.contrib.rnn as rnn
from tensorflow.contrib.layers import flatten as batch_flatten
from tensorflow.python.util.nest import flatten as flatten_nested

In [None]:
class LSTMPolicy1D(BaseLSTMPolicy):
    """
    A3C LSTM policy with 1D convolutions.

    The first three channels of the observation (`outer` state) are passed through
    a stack of four strided 1D convolutions with ELU activations. The flattened
    convolution output is concatenated with the flattened full observation
    (a kind of skip connection) and handed over to the base LSTM policy.
    """
    def __init__(self, ob_space, ac_space, num_filters=32, filter_size=3, stride=2,
                 lstm_class=rnn.BasicLSTMCell, lstm_layers=(256,)):
        """
        Args:
            ob_space:       observation shape without batch dimension; last axis holds feature channels.
            ac_space:       action space, passed through to the base policy.
            num_filters:    number of output channels of every convolution layer.
            filter_size:    convolution kernel width.
            stride:         convolution stride.
            lstm_class:     LSTM cell class, passed through to the base policy.
            lstm_layers:    LSTM layer sizes, passed through to the base policy.
        """
        self.x = x = tf.placeholder(tf.float32, [None] + list(ob_space), name='x_in_pl')

        # Outer state is the first three channels; the remaining (`inner` state) channels
        # are not sliced out separately since the whole observation is concatenated below:
        x1 = x[..., :3]

        # Pass outer state through conv layers:
        for i in range(4):
            x1 = tf.nn.elu(self.conv1d(x1, num_filters, "l{}".format(i + 1), filter_size, stride))

        # Skip connection: convolution features plus the complete raw observation:
        x = tf.concat(
            [batch_flatten(x1), batch_flatten(x)],
            axis=-1
        )

        # Run LSTM along rollout time dimension and everything else:
        super(LSTMPolicy1D, self).__init__(x, ob_space, ac_space, lstm_class, lstm_layers)

    def conv1d(self, x, num_filters, name, filter_size=3, stride=2, pad="SAME", dtype=tf.float32,
               collections=None):
        """
        Builds single 1D convolution layer with bias, without activation.

        Args:
            x:              input tensor; size of its last axis is used as number of input channels.
            num_filters:    number of output channels.
            name:           variable scope name of the layer.
            filter_size:    kernel width.
            stride:         convolution stride.
            pad:            padding mode passed to `tf.nn.conv1d`.
            dtype:          data type of both kernel and bias variables.
            collections:    graph collections the variables are added to.

        Returns:
            tensor holding the convolution output with bias added.
        """
        with tf.variable_scope(name):
            filter_shape = [filter_size, int(x.get_shape()[-1]), num_filters]

            # Every output unit receives "filter width * num input channels" inputs:
            fan_in = np.prod(filter_shape[:2])

            # Every input unit feeds "filter width * num output channels" output units:
            fan_out = np.prod(filter_shape[:1]) * num_filters

            # Uniform initialisation bound computed from fan-in and fan-out:
            w_bound = np.sqrt(6. / (fan_in + fan_out))

            w = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound),
                                collections=collections)
            # Bias is created with the same dtype as the kernel, otherwise the addition below
            # fails for any dtype other than the default one:
            b = tf.get_variable("b", [1, 1, num_filters], dtype=dtype,
                                initializer=tf.constant_initializer(0.0),
                                collections=collections)
            return tf.nn.conv1d(x, w, stride, pad) + b
   

In [None]:

class MyStrategy(BTgymStrategy):
    """
    BT server inner computation strategy tuned to pass small real data test.

    Builds observation state out of price-difference signal channels (`outer` state)
    and broker/portfolio statistics (`inner` state) and defines shaped reward.

    NOTE(review): `self.state`, `self.raw_state`, `self.inner_embedding` and `self.iteration`
    are not defined in this class and are presumably provided by `BTgymStrategy`;
    `self.stats.position` and `self.stats.reward` presumably refer to the `Position`
    and `Reward` observers added to the engine - confirm.
    """

    def __init__(self, **kwargs):
        super(MyStrategy, self).__init__(**kwargs)

        # Time-embedding length is taken from declared `raw_state` shape:
        self.dim_time = self.p.state_shape['raw_state'].shape[0]

        self.trade_just_closed = False
        self.trade_result = None

        # Statistics below are recomputed by get_state() and some are read by get_reward();
        # all of them are declared here so that the attributes always exist:
        self.unrealized_pnl = None
        self.realized_pnl = None
        self.norm_broker_value = None
        self.norm_broker_cash = None
        self.norm_position_size = None
        self.norm_position_duration = None
        self.norm_episode_duration = None
        self.avg_norm_broker_cash = 0  # not used

        self.realized_broker_value = self.env.broker.startingcash
        self.episode_result = 0  # not used
        self.reward = 0

        # Signal (`outer` state) presentation datalines:

        # `Open` channel is one step difference of Open price;
        # `High` and `Low` channels are differences
        # between current Open price and current High or Low prices respectively:
        self.channel_O = bt.Sum(self.data.open, - self.data.open(-1))
        self.channel_H = bt.Sum(self.data.high, - self.data.open)
        self.channel_L = bt.Sum(self.data.low,  - self.data.open)

        # Service sma to get correct first features values:
        self.data.dim_sma = btind.SimpleMovingAverage(
            self.datas[0],
            period=(self.dim_time)
        )
        self.data.dim_sma.plotinfo.plot = False

    def notify_trade(self, trade):
        """
        Stores result of a just closed trade and the broker value at that moment.
        """
        if trade.isclosed:
            # Set trade flag [True if trade has been closed within last frame-skip period]
            # and store trade result:
            self.trade_just_closed = True
            self.trade_result = trade.pnlcomm

            # Store realized portfolio value:
            self.realized_broker_value = self.broker.get_value()

    def get_state(self):
        """
        Computes observation state as matrix of amplified signal features
        and portfolio statistics.

        Side effects: refreshes all statistics attributes of the strategy
        and resets `trade_just_closed` flag [via get_realized_pnl()].

        Returns:
            `self.state` dictionary with `raw_state` and `model_input` entries updated.
        """
        # Signal amplification factor; 1e4 is the value chosen for EURUSD
        # [1e2 was used for Z-normalized EURUSD and 1 for BTCUSD]:
        T = 1e4

        # NOTE(review): np.frombuffer() assumes .get() returns a buffer of doubles - confirm.
        x = np.stack(
            [
                np.frombuffer(self.channel_O.get(size=self.dim_time)),
                np.frombuffer(self.channel_H.get(size=self.dim_time)),
                np.frombuffer(self.channel_L.get(size=self.dim_time)),
            ],
            axis=-1
        )

        # Amplified:
        x *= T

        # Log-scale: NOT used. Seems to hurt performance.
        # x = self.log_transform(x)

        # Update inner state statistics; results are stored as attributes.
        # Broker value is not part of the state but is needed by get_reward():
        self.get_unrealized_pnl(size=self.dim_time)
        self.get_realized_pnl(size=self.dim_time, gamma=1.0)
        self.get_norm_broker_value(size=self.dim_time)
        self.get_norm_broker_cash(size=self.dim_time)
        self.get_norm_position_size(size=self.dim_time)

        self.get_norm_position_duration(size=self.dim_time)
        self.get_norm_episode_duration(size=self.dim_time)

        # Append statistics as additional channels;
        # normalized broker value is left out as it seems to be indifferent:
        x = np.concatenate(
            [
                x,
                self.unrealized_pnl[..., None],
                self.realized_pnl[..., None],
                self.norm_broker_cash[..., None],
                self.norm_position_size[..., None],
                self.exp_scale(self.norm_episode_duration, gamma=6)[..., None],
                self.exp_scale(self.norm_position_duration, gamma=2)[..., None],
            ],
            axis=-1
        )

        self.state['raw_state'] = self.raw_state

        self.state['model_input'] = x

        return self.state

    def log_transform(self, x):
        """Returns sign-preserving log-scaled value: sign(x) * log(|x| + 1)."""
        return np.sign(x) * np.log(np.fabs(x) + 1)

    def norm_log_value(self, current_value, start_value, drawdown_call, target_call, epsilon=1e-4):
        """Current value log-normalized in [-1,1] wrt p/l limits."""
        x = np.asarray(current_value)
        # Change in percent of start value:
        x = (x / start_value - 1) * 100
        # Rescale so that -drawdown_call maps to 0 and target_call maps to 1:
        x = (x - target_call)/(drawdown_call+target_call) + 1
        x = np.clip(x, epsilon, 1 - epsilon)
        # Lower clipping bound maps exactly to -1:
        x = 1 - 2 * np.log(x) / np.log(epsilon)
        return x

    def norm_value(self, current_value, start_value, drawdown_call, target_call, epsilon=1e-8):
        """Current value normalized in [-1,1] wrt upper and lower bounds"""
        x = np.asarray(current_value)
        # Change in percent of start value:
        x = (x / start_value - 1) * 100
        # Rescale so that -drawdown_call maps to 0 and target_call maps to 1:
        x = (x - target_call)/(drawdown_call+target_call) + 1
        x = 2 * np.clip(x, epsilon, 1 - epsilon) - 1
        return x

    def decayed_result(self, trade_result, current_value, start_value, drawdown_call, target_call, gamma=1.0):
        """
        Trade result scaled by the allowed value range [width of the drawdown-to-target band]
        and linearly decayed wrt current value; with gamma=1.0 the decay factor equals one.
        """
        target_value = start_value * (1 + target_call/100)
        value_range = start_value * (drawdown_call + target_call)/100
        decay = (gamma - 1) * (current_value - target_value) / value_range + gamma
        x = trade_result * decay / value_range
        return x

    def exp_scale(self, x, gamma=4):
        """
        Returns exp. scaled value exp((x + 1)**gamma - 2**gamma);
        equals 1 for x=1 and exp(1 - 2**gamma) for x=0; gamma controls steepness.
        """
        x = np.asarray(x) + 1
        return np.exp(x**gamma - 2**gamma)

    def get_unrealized_pnl(self, size=1):
        """
        Normalized profit/loss for current opened trade (unrealized p/l):
        broker value minus broker value stored at last trade closure,
        scaled by starting cash and by width of drawdown-to-target range.
        Returns vector of traceback values of length `size`.
        """
        x = np.asarray(self.stats.broker.value.get(size=size)) - self.realized_broker_value
        self.unrealized_pnl = x / self.env.broker.startingcash / (self.p.drawdown_call + self.p.target_call) * 100
        return self.unrealized_pnl

    def get_realized_pnl(self, size=1, gamma=1.0):
        """
        Returns single trade realized profit/loss, normalized and gamma-adjusted to current broker value.
        Returns vector of length `size`: p/l values if a trade has been closed
        since previous call, zeros otherwise. Resets `trade_just_closed` flag.
        """
        if self.trade_just_closed:
            self.realized_pnl = self.decayed_result(
                self.trade_result,
                np.asarray(self.stats.broker.value.get(size=size)),
                self.env.broker.startingcash,
                self.p.drawdown_call,
                self.p.target_call,
                gamma=gamma
            )
            # Trade result is reported only once:
            self.trade_just_closed = False

        else:
            self.realized_pnl = np.zeros(size)

        return self.realized_pnl

    def get_norm_broker_value(self, size=1):
        """
        Broker value, normalized in [-1,1] wrt profit/loss limits.
        Returns vector of traceback values of length `size`.
        """
        self.norm_broker_value = self.norm_value(
            self.stats.broker.value.get(size=size),
            self.env.broker.startingcash,
            self.p.drawdown_call,
            self.p.target_call,
        )
        return self.norm_broker_value

    def get_norm_broker_cash(self, size=1):
        """
        Normalized broker cash; fixed value of 99.0 is used as lower [drawdown] bound.
        Returns vector of traceback values of length `size`.
        """
        self.norm_broker_cash = self.norm_value(
            self.stats.broker.cash.get(size=size),
            self.env.broker.startingcash,
            99.0,
            self.p.target_call,
        )
        return self.norm_broker_cash

    def get_norm_position_size(self, size=1):
        """
        Position size [exposure] divided by starting cash times leverage.
        Returns vector of traceback values of length `size`.
        """
        # Small constant is added to leverage to avoid division by zero:
        self.norm_position_size = np.asarray(
            self.stats.position.exposure.get(size=size)
        ) / (self.env.broker.startingcash * (np.asarray(self.stats.position.leverage.get(size=size)) + 1e-2))
        return self.norm_position_size

    def get_norm_position_duration(self, size=1):
        """
        Current position duration, normalized wrt max. episode steps.
        Returns vector of traceback values of length `size`.
        """
        self.norm_position_duration = np.frombuffer(self.stats.position.duration.get(size=size)) /\
            (self.data.numrecords - self.inner_embedding)
        return self.norm_position_duration

    def get_norm_episode_duration(self, size=1):
        """
        Current episode duration, normalized wrt max. episode steps.
        Returns vector of traceback values of length `size`.
        """
        self.norm_episode_duration = np.frombuffer(self.stats.reward.step.get(size=size)) /\
            (self.data.numrecords - self.inner_embedding)
        return self.norm_episode_duration

    def get_reward(self):
        """
        Defines reward as weighted sum of portfolio performance statistics
        averaged over time-embedding period.

        Reads statistics attributes set by get_state() and does not recompute them.
        """
        # All reward terms for this step are already computed by get_state(),
        # which is expected to have been called just before.

        # Reward term 1: averaged profit/loss for current opened trade (unrealized p/l):
        avg_unrealised_pnl = np.average(self.unrealized_pnl)

        # Reward term 2: averaged broker value, normalized wrt to max drawdown and target bounds.
        avg_norm_broker_value = np.average(self.norm_broker_value)

        # Reward term 3: normalized single trade realized profit/loss:
        avg_realized_pnl = np.average(self.realized_pnl)

        # Reward term 4: penalty for holding a position close to the end of the episode,
        # proportional to the largest absolute position size within the period:
        avg_norm_episode_duration = np.average(self.norm_episode_duration)
        abs_max_norm_position_size = abs(self.norm_position_size).max()

        # Weights are subject to tune,
        # absolute reward value should be consistent with a3c rollout length
        # to keep gradients sane (~/2) while not slowing down training.
        # Penalty on averaged position duration [exp_scale with gamma=3, weight 1.0] is disabled.
        self.reward = (
            + 1.0 * avg_unrealised_pnl
            + 0.01 * avg_norm_broker_value
            + 10.0 * avg_realized_pnl
            - 5.0 * self.exp_scale(avg_norm_episode_duration, gamma=10) * abs_max_norm_position_size
        ) / 2

        return self.reward

class Position(bt.observer.Observer):
    """
    Records position size, broker leverage and number of consecutive steps
    the position has been held.
    """
    lines = ('exposure', 'leverage', 'duration')
    plotinfo = {'plot': True, 'subplot': True}
    plotlines = {
        'exposure': {'marker': '.', 'markersize': 1.0, 'color': 'blue', 'fillstyle': 'full'},
        'leverage': {'_plotskip': 'True'},
        'duration': {'_plotskip': 'True'},
    }
    # Steps passed since the position became non-zero:
    current_duration = 0

    def next(self):
        strategy = self._owner
        position_size = strategy.position.size

        self.lines.exposure[0] = position_size
        self.lines.leverage[0] = strategy.env.broker.get_leverage()

        # Counter restarts whenever the position is flat:
        self.current_duration = 0 if position_size == 0 else self.current_duration + 1
        self.lines.duration[0] = self.current_duration
    
    
class Reward(bt.observer.Observer):
    """
    Records reward value and iteration number of the owning strategy.
    """
    lines = ('value', 'step')
    plotinfo = {'plot': True, 'subplot': True}

    plotlines = {
        'value': {'markersize': 4.0, 'color': 'green', 'fillstyle': 'full'},
        'step': {'_plotskip': 'True'},
    }

    def next(self):
        strategy = self._owner
        self.lines.value[0] = strategy.reward
        self.lines.step[0] = strategy.iteration

########################################################       


# Backtesting engine parameters.

# Length of time-embedding window:
time_embed_dim = 30

# `model_input` holds 9 channels: 3 signal channels plus 6 broker statistics
# appended by MyStrategy.get_state():
state_shape = dict(
    raw_state=spaces.Box(low=-100, high=100, shape=(time_embed_dim, 4)),
    model_input=spaces.Box(low=-100, high=100, shape=(time_embed_dim, 9)),
)

MyCerebro = bt.Cerebro()

MyCerebro.addstrategy(
    MyStrategy,
    state_shape=state_shape,
    portfolio_actions=('hold', 'buy', 'sell', 'close'),
    drawdown_call=5,  # max % to lose, in percent of initial cash
    target_call=15,  # max % to win, same
    skip_frame=10,
)

# Leveraged account; commission is set to imitate spread:
MyCerebro.broker.setcash(2000)
MyCerebro.broker.setcommission(commission=0.0001, leverage=10.0)
MyCerebro.addsizer(bt.sizers.SizerFix, stake=5000)

MyCerebro.addanalyzer(bt.analyzers.DrawDown)
MyCerebro.addobserver(Reward)
MyCerebro.addobserver(Position)

# Alternative data files used with this setup:
#   '../data/DAT_ASCII_EURUSD_M1_2016.csv'
#   '../data/test_sine_1min_period256_delta0002.csv'
MyDataset = BTgymDataset(
    filename='../data/DAT_ASCII_EURUSD_M1_201703.csv',
    start_weekdays=[0, 1, 2, 3, 4],
    episode_len_days=0,
    episode_len_hours=23,
    episode_len_minutes=55,
    start_00=False,
    time_gap_hours=6,
)

# Environment configuration:
env_config = {
    'dataset': MyDataset,
    'engine': MyCerebro,
    'render_modes': ['episode', 'human', 'model_input'],
    'render_state_as_image': True,
    'render_ylabel': 'OHL_diff channels and broker stat.',
    'render_size_episode': (12, 8),
    'render_size_human': (10, 5),
    'render_size_state': (10, 5),
    'render_dpi': 75,
    'port': 5000,
    'data_port': 4999,
    'connect_timeout': 60,
    'verbose': 0,
}

# Tensorflow distributed cluster configuration:
cluster_config = {
    'host': '127.0.0.1',
    'port': 12222,
    'num_workers': 8,  # ~ num of CPU cores
    'num_ps': 1,
    'log_dir': './tmp/a3c_test_4_2',
}

# A3C configuration:
launcher = Launcher(
    cluster_config=cluster_config,
    env_class=BTgymEnv,
    env_config=env_config,
    policy_class=LSTMPolicy1D,
    policy_config=dict(lstm_layers=(256,)),  # TODO: multi-layer, phased/grid LSTM
    rollout_length=20,
    model_beta=0.01,  # entropy regularization, TODO: make it log-uniform over ~[0.1, 0.01]
    opt_learn_rate=1e-4,  # adam learn rate, TODO: RMSProp with shared stats, log-uniform learn rate + annealing
    test_mode=False,
    train_steps=1000000000,
    model_summary_freq=20,
    episode_summary_freq=1,
    env_render_freq=10,
    verbose=1,
)

In [None]:
# Run the launcher built from the cluster, environment and policy configuration:
launcher.run()

# To track performance, run in a shell:  tensorboard --logdir './tmp/a3c_test_4_2'