In [None]:
import warnings
warnings.filterwarnings("ignore") # suppress h5py deprecation warning

import numpy as np
import datetime

import os
import backtrader as bt
import backtrader.indicators as btind
from backtrader import Indicator

from gym import spaces
from btgym.spaces import DictSpace

from btgym import BTgymEnv, BTgymDataset
from btgym.strategy.base import BTgymBaseStrategy
from btgym.strategy.utils import tanh

#from btgym.algorithms import A3C, AacStackedRL2Policy

from btgym.strategy.observers import Reward, Position, NormPnL

from btgym.research.gps.aac import GuidedAAC
from btgym.research.gps.policy import GuidedPolicy_0_0
from btgym.research.gps.strategy import GuidedStrategy_0_0, ExpertObserver

from btgym.research.casual_conv.networks import conv_1d_casual_encoder, conv_1d_casual_attention_encoder

from btgym.algorithms import Launcher

from btgym.datafeed.generator import BaseDataGenerator

import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-white')

from statsmodels.tsa.stattools import adfuller

# Scale constant for the data level. The fixed order size handed to the strategy
# further down is computed from its inverse (`order_size=200 * DATA_BIAS**-1`).
# NOTE(review): presumably the constant bias of the price series (cf. the
# `..._biased_1e-5.csv` file name mentioned in the data set-up cell) -- confirm.
DATA_BIAS = 1e-5


In [None]:
def ou_process(num_points, mu=0, l=1, sigma=1, s0=0, dt=1):
    """
    Generates an Ornstein-Uhlenbeck process realisation trajectory using the
    exact discretisation of  dx = l * (mu - x) * dt + sigma * dW:

        x[i] = x[i-1] * exp(-l*dt) + mu * (1 - exp(-l*dt))
               + sigma * sqrt((1 - exp(-2*l*dt)) / (2*l)) * N(0, 1)

    Args:
        num_points: number of points to generate;
        mu:         mean;
        l:          lambda, mean reversion rate;
        sigma:      volatility;
        s0:         starting point;
        dt:         time increment;

    Returns:
        1d float array of size `num_points`; the first element equals `s0`
        (an empty array if `num_points` is zero).

    Note:
        Random increments are drawn from the global `np.random` state, one
        draw per generated step.
    """
    n = num_points
    x = np.zeros(n)
    if n == 0:
        # Nothing to generate; the original code failed here on `x[0] = s0`.
        return x

    x[0] = s0

    # Loop-invariant coefficients of the update rule:
    decay = np.exp(-l * dt)
    drift = mu * (1 - decay)
    if l == 0:
        # Limit of the expression below for l -> 0 (no mean reversion):
        noise_scale = sigma * dt ** 0.5
    else:
        # Mind the parentheses: the variance term is divided by (2 * l);
        # `/ 2 * l` would multiply by `l` instead.
        noise_scale = sigma * ((1 - np.exp(-2 * l * dt)) / (2 * l)) ** 0.5

    for i in range(1, n):
        x[i] = x[i - 1] * decay + drift + noise_scale * np.random.normal(0, 1)
    return x

In [None]:


class CryptoSpreadStrat_0(GuidedStrategy_0_0):
    """
    Guided strategy operating on a single-valued data series: `high`, `low` and
    `close` lines are aliased to `open`.

    Summary of what the methods below do:
        - external state: simple moving averages of `open` (periods given by
          `features_parameters`), differenced along the features axis and then
          along the time axis, passed through `tanh`;
        - internal state: five broker statistics, differenced and passed through `tanh`;
        - expert state: action vector derived from Bollinger bands computed over
          the first feature;
        - reward: weighted sum of an unrealized-PnL shaping term and realized PnL,
          scaled and clipped by `reward_scale`.

    Methods read `self.p.skip_frame`, `self.p.gamma` and `self.p.reward_scale`;
    `gamma` and `reward_scale` are not declared in `params` below.
    NOTE(review): presumably they are inherited from a base strategy class -- confirm.
    """
    # Hyperparameters for estimating signal features:
    features_parameters = [1, 4, 16, 64, 256, 512]
    num_features = len(features_parameters)

    # Time embedding period:
    time_dim = 128

    # Number of environment steps to skip before returning next response,
    # e.g. if set to 10 -- agent will interact with environment every 10th step;
    # every other step agent action is assumed to be 'hold':
    skip_frame = 1

    # Number of timesteps reward estimation statistics are averaged over, should be:
    # skip_frame_period <= avg_period <= time_embedding_period:
    avg_period = 30

    # Possible agent actions:
    portfolio_actions = ('hold', 'buy', 'sell', 'close')

    params = dict(
        # Note: fake `Width` dimension to use 2d conv etc.:
        state_shape=
            {
                # Last dimension is doubled: get_external_state() concatenates two
                # gradient arrays along the features axis.
                'external': spaces.Box(low=-10, high=10, shape=(time_dim, 1, num_features*2), dtype=np.float32),
                # Five channels: get_internal_state() concatenates five broker statistics.
                'internal': spaces.Box(low=-2, high=2, shape=(avg_period, 1, 5), dtype=np.float32),
                # One entry per portfolio action:
                'expert': spaces.Box(low=0, high=10, shape=(len(portfolio_actions),), dtype=np.float32),
                # NOTE(review): several bounds below use high=10**10 with dtype=np.uint32;
                # 10**10 is larger than the uint32 maximum (4294967295) -- confirm how the
                # installed gym version treats such a bound.
                'metadata': DictSpace(
                    {
                        'type': spaces.Box(
                            shape=(),
                            low=0,
                            high=1,
                            dtype=np.uint32
                        ),
                        'trial_num': spaces.Box(
                            shape=(),
                            low=0,
                            high=10**10,
                            dtype=np.uint32
                        ),
                        'trial_type': spaces.Box(
                            shape=(),
                            low=0,
                            high=1,
                            dtype=np.uint32
                        ),
                        'sample_num': spaces.Box(
                            shape=(),
                            low=0,
                            high=10**10,
                            dtype=np.uint32
                        ),
                        'first_row': spaces.Box(
                            shape=(),
                            low=0,
                            high=10**10,
                            dtype=np.uint32
                        ),
                        'timestamp': spaces.Box(
                            shape=(),
                            low=0,
                            high=np.finfo(np.float64).max,
                            dtype=np.float64
                        ),
                    }
                )
            },
        cash_name='default_cash',
        asset_names=['default_asset'],
        start_cash=None,
        commission=0.0025,
        leverage=1.0,
        drawdown_call=5,
        target_call=19,
        portfolio_actions=portfolio_actions,
        initial_action=None,
        initial_portfolio_action=None,
        skip_frame=skip_frame,
        state_ext_scale=2e3,
        state_int_scale=1.0,
        metadata={}
    )

    def __init__(self, **kwargs):
        """
        Initializes the strategy.

        `super(GuidedStrategy_0_0, self)` starts the method lookup *after*
        `GuidedStrategy_0_0` in the MRO, so `GuidedStrategy_0_0.__init__` itself
        is not executed.
        NOTE(review): presumably intentional (to bypass the parent's own expert
        set-up) -- confirm.
        """
        super(GuidedStrategy_0_0, self).__init__(**kwargs)
        # Single-valued data: make all price lines point to `open`.
        self.data.high = self.data.low = self.data.close = self.data.open
        # Default expert action vector, ordered as `portfolio_actions`
        # ('hold', 'buy', 'sell', 'close'):
        self.current_expert_action = np.asarray([0.51, 0, 0, 0.49])
        self.state['metadata'] = self.metadata
        # Per-step reward components collected by get_reward():
        self.r_debug = {
            'f1': [],
            'f2': [],
            'f_real_pnl': [],
        }

    def nextstart(self):
        """
        Records and logs the data buffer length as `inner_embedding` and
        re-aliases the price lines to `open`.

        NOTE(review): the original docstring said "Not used."; in backtrader
        `nextstart` is a strategy lifecycle hook -- confirm whether it is
        actually invoked in this set-up.
        """
        # This value shows how much episode records we need to spend
        # to estimate first environment observation:
        self.inner_embedding = self.data.close.buflen()
        self.log.info('Inner time embedding: {}'.format(self.inner_embedding))
        self.data.high = self.data.low = self.data.close = self.data.open


    def set_datalines(self):
        """
        Attaches indicators to the data feed: moving-average features,
        Bollinger bands used by the expert, and a long-period moving average.
        """
        self.data.high = self.data.low = self.data.close = self.data.open

        # One simple moving average of `open` per entry of `features_parameters`:
        self.data.features = [
            btind.SimpleMovingAverage(self.data.open, period=period) for period in self.features_parameters
        ]

        # Bollinger bands for expert:
        #self.data.bb = btind.BollingerBands(self.data.open, period=256, devfactor=3)
        # Computed over the first feature (the moving average with period 1):
        self.data.bb = btind.BollingerBands(self.data.features[0], period=128, devfactor=2)

        # Moving average with period = largest feature period + time embedding; hidden from plots.
        # NOTE(review): presumably serves to enforce a long enough warm-up period -- confirm.
        self.data.dim_sma = btind.SimpleMovingAverage(
            self.datas[0],
            period=(np.asarray(self.features_parameters).max() + self.time_dim)
        )
        self.data.dim_sma.plotinfo.plot = False


    def get_external_state(self):
        """
        Builds the external (market) observation from the moving-average features.

        Returns:
            array with an inserted singleton middle axis; its shape is
            (time_dim, 1, 2 * num_features) provided every feature line
            yields `time_dim` values.
        """

        # Stack the last `time_dim` values of every feature along a new last axis:
        x_sma = np.stack(
            [
                feature.get(size=self.time_dim) for feature in self.data.features
            ],
            axis=-1
        )
        # Gradient along features axis:
        dx = np.gradient(x_sma, axis=-1) * self.p.state_ext_scale
        # Add up: gradient  along time axis:
        dx2 = np.gradient(dx, axis=0) * 10

        # In [-1,1]:
        #x = tanh(dx)
        x = tanh(np.concatenate([dx, dx2], axis=-1))
        # Insert the fake `Width` dimension:
        return x[:, None, :]

    def get_internal_state(self):
        """
        Builds the internal (broker) observation from five broker statistics:
        value, unrealized PnL, realized PnL, cash and exposure.

        Returns:
            array with an inserted singleton middle axis and the five
            statistics as the last axis.
        """

        x_broker = np.concatenate(
            [
                np.asarray(self.broker_stat['value'])[..., None],
                np.asarray(self.broker_stat['unrealized_pnl'])[..., None],
                np.asarray(self.broker_stat['realized_pnl'])[..., None],
                np.asarray(self.broker_stat['cash'])[..., None],
                np.asarray(self.broker_stat['exposure'])[..., None],
            ],
            axis=-1
        )
        # Note: the gradient is taken along the last axis, i.e. across the five
        # concatenated statistics, not along time.
        x_broker = tanh(np.gradient(x_broker, axis=-1) * self.p.state_int_scale)
#         x_broker = np.gradient(x_broker, axis=-1) * self.p.state_int_scale

        return x_broker[:, None, :]

    def get_reward(self):
        """
        Computes the step reward.

        reward = clip((10 * f1 + 10 * realized_pnl) * reward_scale, -reward_scale, reward_scale)

        where `f1` is a shaping term built from averaged unrealized PnL and
        `realized_pnl` is the sum of realized PnL over the last `skip_frame` records.
        A second shaping term `f2` (broker value based) is computed and logged
        only; it does not enter the reward.

        Side effects: appends `f1`, `f2` and `10 * realized_pnl` to `self.r_debug`
        and stores the result in `self.reward`.

        Returns:
            the clipped reward value.
        """

        # All sliding statistics for this step are already updated by get_state().

        # Potential-based shaping function 1:
        # based on potential of averaged profit/loss for current opened trade (unrealized p/l):
        unrealised_pnl = np.asarray(self.broker_stat['unrealized_pnl'])
        current_pos_duration = self.broker_stat['pos_duration'][-1]

        if current_pos_duration == 0:
            # Zero position duration: no shaping term.
            f1 = 0

        else:
            if current_pos_duration < self.p.skip_frame:
                # Position is younger than one frame: previous potential is zero,
                # current potential is averaged over the position lifetime.
                fi_1 = 0
                fi_1_prime = np.average(unrealised_pnl[-current_pos_duration:])

            elif current_pos_duration < 2 * self.p.skip_frame:
                # Position is younger than two frames: previous potential is averaged
                # over the `current_pos_duration` records preceding the last frame.
                fi_1 = np.average(
                    unrealised_pnl[-(self.p.skip_frame + current_pos_duration):-self.p.skip_frame]
                )
                fi_1_prime = np.average(unrealised_pnl[-self.p.skip_frame:])

            else:
                # Otherwise compare averages over the two most recent frames.
                fi_1 = np.average(
                    unrealised_pnl[-2 * self.p.skip_frame:-self.p.skip_frame]
                )
                fi_1_prime = np.average(unrealised_pnl[-self.p.skip_frame:])

            # Discounted current potential minus previous potential:
            f1 = self.p.gamma * fi_1_prime - fi_1
#             f1 = np.clip(f1 + 1, a_min=1e-10, a_max=None)
#             f1 = np.log(f1)

        #print('pos_duration: {}, f1: {}'.format(current_pos_duration, f1))

        self.r_debug['f1'].append(f1)
        # NOT USED! Potential-based shaping function 2:
        # based on potential of averaged broker value, normalized wrt to max drawdown and target bounds.
        norm_broker_value = np.asarray(self.broker_stat['value'])

        f2 = self.p.gamma * np.average(norm_broker_value[-self.p.skip_frame:]) \
            - np.average(norm_broker_value[- 2 * self.p.skip_frame:-self.p.skip_frame])

        self.r_debug['f2'].append(f2)

        # Main reward function: normalized realized profit/loss:
        realized_pnl = np.asarray(self.broker_stat['realized_pnl'])[-self.p.skip_frame:].sum()

        self.r_debug['f_real_pnl'].append(10 * realized_pnl)

        # Weights are subject to tune:
        self.reward = (
             10.0 * f1 + 10.0 * realized_pnl
        ) * self.p.reward_scale

        # Keep the reward within [-reward_scale, reward_scale]:
        self.reward = np.clip(self.reward, -self.p.reward_scale, self.p.reward_scale)

        return self.reward

#     def stop(self):
#         """
#         Service output.
#         """
#         for k, v in self.r_debug.items():
#             v = np.asarray(v)

#             print('{}: sum: {}, mean: {}, max: {}, min: {}'.format(k, v.sum(), v.mean(), v.max(), v.min()))

    def get_expert_state(self):
        """
        Simple Bollinger-bands expert logic.

        Compares the last `skip_frame` values of the first feature with the
        Bollinger bands over the same window:
            - feature maximum above the maximum of the top band    -> 'sell';
            - feature minimum below the minimum of the bottom band -> 'buy';
            - otherwise the default vector [0.51, 0, 0, 0.49] is kept.

        Returns:
            action vector ordered as `portfolio_actions`
            ('hold', 'buy', 'sell', 'close'); also stored in
            `self.current_expert_action`.
        """
        self.current_expert_action = np.asarray([0.51, 0, 0, 0.49])

        asset = np.frombuffer(self.data.features[0].get(size=self.p.skip_frame))
        bb_top = np.frombuffer(self.data.bb.top.get(size=self.p.skip_frame))
        bb_bot = np.frombuffer(self.data.bb.bot.get(size=self.p.skip_frame))


        if asset.max() > bb_top.max():
                # Issue sell:
                self.current_expert_action = np.asarray([0, 0, 1, 0])

        elif asset.min() < bb_bot.min():
                # Issue Buy:
                self.current_expert_action = np.asarray([0, 1, 0, 0])

        return self.current_expert_action


In [None]:
# Data setup.
# CSV source used previously: filename='./data/CRYPTO_M1_201809_biased_1e-5.csv'

# CSV parsing settings for a single-column ('open') source.
# Note: no other statement visible in this notebook refers to `parsing_params`.
parsing_params = {
    # Reader options:
    'sep': ',',
    'header': 0,
    'index_col': 0,
    'parse_dates': True,
    'names': ['open'],
    'timeframe': 1,  # 1 minute.
    # Column mapping; only a single value is used per timestep, the rest is set to -1:
    'datetime': 0,
    'open': 1,
    'high': -1,
    'low': -1,
    'close': -1,
    'volume': -1,
    'openinterest': -1,
}

# Synthetic data domain: series are produced by `ou_process`,
# called with the keyword arguments listed in `generator_params`.
domain = BaseDataGenerator(
    generator_fn=ou_process,
    generator_params={'l': 0.05, 'sigma': 2e-7, 'mu': 1e-5, 's0': 1e-5},
    episode_duration={'days': 4, 'hours': 23, 'minutes': 59},
)

# Setting up environment core simulation engine (backtrader Cerebro):
engine = bt.Cerebro()

# Register the strategy; keyword arguments override the defaults
# declared in `CryptoSpreadStrat_0.params`:
engine.addstrategy(
    CryptoSpreadStrat_0,
    start_cash=1000,  # initial broker cash
    commission=0.0015, # 0.15% broker commission fee
    leverage=1,
    order_size=200 * DATA_BIAS**-1,  # fixed stake, mind leverage
    drawdown_call=10, # max 10% to lose, in percent of initial cash
    target_call=10,  # max 10% to win, same
    skip_frame=2,
    gamma=0.99,  # same as A3C gamma discount factor, used for reward shaping
    reward_scale=25, # alt. value: 7; scales (and clips) the reward -- touch with care!
    #state_ext_scale=np.linspace(2e6, 2e7, num=8)
    state_ext_scale=2e7
)

# Expert actions observer:
engine.addobserver(ExpertObserver)

# Environment configuration: environment class and its constructor keyword arguments.
env_config = {
    'class_ref': BTgymEnv,
    'kwargs': {
        # Data source and simulation engine defined above:
        'dataset': domain,
        'engine': engine,
        # Rendering options:
        'render_modes': ['episode', 'human', 'internal','external'],
        'render_state_as_image': True,
        'render_ylabel': 'OHL_diff. / Internals',
        'render_size_episode': (12,8),
        'render_size_human': (9, 4),
        'render_size_state': (11, 3),
        'render_dpi': 75,
        # Connection options:
        'port': 5000,
        'data_port': 4999,
        'connect_timeout': 240,
        'verbose': 0,
    },
}

# Distributed TF training cluster config:
cluster_config = {
    'host': '127.0.0.1',
    'port': 12230,
    'num_workers': 4,  # set according to the number of CPUs available or so
    'num_ps': 1,
    'num_envs': 1,
    # Logs go under the current user's home directory:
    'log_dir': os.path.expanduser('~/tmp/crypto_spread_test_0'),
}

# Algorithm policy estimator.
#
# Earlier variant, kept for reference (default encoder, longer init period):
# policy_config = dict(
#     class_ref=GuidedPolicy_0_0,
#     kwargs={
#         'lstm_layers': (256, 256),
#         'lstm_2_init_period': 60,
#     }
# )

# Current variant: policy class and its constructor keyword arguments.
policy_config = {
    'class_ref': GuidedPolicy_0_0,
    'kwargs': {
        'lstm_layers': (256, 256),
        'lstm_2_init_period': 50,
        # Four convolution layers; NOTE(review): entries presumably read as
        # (num_filters, kernel_shape, stride) -- confirm against the policy class.
        'conv_2d_layer_config': (
            (32, (3, 1), (2, 1)),
            (32, (3, 1), (2, 1)),
            (64, (3, 1), (2, 1)),
            (64, (3, 1), (2, 1)),
        ),
        'encode_internal_state': False,
        'state_encoder_class_ref': conv_1d_casual_encoder,
    },
}

# Algorithm (trainer) config: trainer class and its constructor keyword arguments.
trainer_config = {
    'class_ref': GuidedAAC,
    'kwargs': {
        # Optimizer settings:
        'opt_learn_rate': [1e-4, 1e-4],  # range for random log-uniform sampling; both bounds are equal here
        'opt_end_learn_rate': 1e-5,
        'opt_decay_steps': 100*10**6,
        # Model / loss settings:
        'model_gamma': 0.99,
        'model_gae_lambda': 1.0,
        'model_beta': 0.05,  # entropy regularisation, ~0.05
        'aac_lambda': 1.0,  # main a3c loss weight
        'guided_lambda': 10.0,  # imitation loss weight
        'guided_decay_steps': 3*10**6,  # annealing guided_lambda to zero in 3M steps
        # Rollout and episode cycle settings:
        'rollout_length': 20,
        'time_flat': True,
        'episode_train_test_cycle': (1, 0),
        # Summaries and rendering frequencies:
        'model_summary_freq': 10,
        'episode_summary_freq': 1,
        'env_render_freq': 5,
        # 'aux_render_modes': ['action_prob', 'value_fn'],
    },
}


In [None]:
# Putting it all together: the launcher receives the cluster, environment,
# trainer and policy configurations defined in the previous cell.
launcher = Launcher(
    cluster_config=cluster_config,
    env_config=env_config,
    trainer_config=trainer_config,
    policy_config=policy_config,
    test_mode=False,
    max_env_steps=50*10**6,
    root_random_seed=0,
    purge_previous=1,  # ask to override previously saved model and logs
    verbose=0
)

# Train it:
launcher.run()

### ==========