# Import modules

In [None]:
import sys
sys.path.append("..")

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
%matplotlib inline
# Newer Matplotlib releases ship the seaborn styles under "seaborn-v0_8-*" names;
# use the new name when this installation offers it and fall back to the legacy
# name otherwise, so the cell works on both old and new versions.
plt.style.use('seaborn-v0_8-darkgrid'
              if 'seaborn-v0_8-darkgrid' in plt.style.available
              else 'seaborn-darkgrid')

import pandas as pd
import numpy as np
from datetime import timedelta
import tensorflow as tf # # python3 -m pip install tensorflow
from tensorflow.keras import Sequential 
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import Adam
import datetime
import yfinance as yf # Must be v0.1.83: https://github.com/ranaroussi/yfinance/issues/1484
import talib as ta
import pickle
import pyfolio as pf

# Build price data

In [None]:
# Date range of the daily history to download.
# (An earlier experiment used start="1982-04-09".)
start, end = "1993-01-29", "2023-04-09"

# Both tickers are fetched with identical settings: daily bars, auto-adjusted prices.
download_kwargs = dict(interval='1d', start=start, end=end, auto_adjust=True)

aapl_data = yf.download(tickers="AAPL", **download_kwargs)
spy_data = yf.download(tickers="SPY", **download_kwargs)

In [None]:
# Peek at the most recent AAPL rows to confirm the download reached the end date
aapl_data.tail()

In [None]:
# (rows, columns) of the AAPL frame; compare with SPY below
aapl_data.shape

In [None]:
# (rows, columns) of the SPY frame; the split cell below slices both frames with
# the same positional index, so the row counts should match
spy_data.shape

In [None]:
# Peek at the first SPY rows to confirm the series starts at the requested date
spy_data.head()

In [None]:
# tmp = pd.DataFrame(index=aapl_data.index)

# spy_data = tmp.merge(spy_data, left_index=True, right_index=True, how='left')

# spy_data = spy_data.fillna(method='ffill')
# spy_data = spy_data.fillna(method='bfill')

In [None]:
# Sanity check: number of missing values per column in the AAPL data
aapl_data.isnull().sum()

In [None]:
# Sanity check: number of missing values per column in the SPY data
spy_data.isnull().sum()

### Split data into training and test set

In [None]:
# Fraction of the AAPL history used for training; the remainder is the test set.
t = .9
split = int(t*len(aapl_data))

train_aapl = aapl_data[:split]
test_aapl = aapl_data[split:]

# Split SPY at the same calendar date as AAPL rather than at the same row
# number: if the two downloads ever differ in length, a positional split would
# put the train/test boundary on different days for the two instruments.
split_date = aapl_data.index[split]
train_spy = spy_data[spy_data.index < split_date]
test_spy = spy_data[spy_data.index >= split_date]

# Design reward function

In [None]:
# Define profit and loss net of transaction costs and commissions
def get_pnl(entry, curr_price, position, tc=0.001):
    """Return the fractional P&L of a round-trip trade net of costs.

    Costs are charged proportionally on both legs and always reduce the
    result, for longs and shorts alike.  For a long position this equals
    ``(curr_price*(1-tc) - entry*(1+tc)) / (entry*(1+tc))``.

    Args:
        entry: Price at which the position was opened.
        curr_price: Price at which the position is (or would be) closed.
        position: +1 for long, -1 for short, 0 for flat.
        tc: Proportional transaction cost per leg (default 0.1%).

    Returns:
        Net return of the trade relative to the cost-inclusive entry price;
        0 when ``position`` is 0.
    """
    # Directional price move, before costs
    gross = (curr_price - entry) * position
    # One cost on the entry leg and one on the exit leg, regardless of direction
    costs = tc * (entry + curr_price) * abs(position)
    # Parenthesised denominator: the whole cost-inclusive entry price
    return (gross - costs) / (entry * (1 + tc))


In [None]:
def reward_categorical_pnl(entry, curr_price, position):
    """Reward equal to the sign of the trade's net P&L.

    Returns:
        ``np.sign`` of ``get_pnl(entry, curr_price, position)``:
        1 for a gain, -1 for a loss, 0 for exactly break-even.
    """
    return np.sign(get_pnl(entry, curr_price, position))

def reward_exponential_pnl(entry, curr_price, position):
    """Reward equal to the exponential of the trade's net fractional P&L.

    Returns:
        ``np.exp`` of ``get_pnl(entry, curr_price, position)``; always
        positive, above 1 for a gain and below 1 for a loss.
    """
    return np.exp(get_pnl(entry, curr_price, position))

def reward_positive_categorical_pnl(entry, curr_price, position):
    """Binary reward: 1 when the net P&L is non-negative, otherwise 0."""
    net_pnl = get_pnl(entry, curr_price, position)
    return 1 if net_pnl >= 0 else 0

In [None]:

# def get_pnl(entry, curr_price, position, tc=0.001):
#     return (curr_price*(1-tc) - entry*(1+tc))/entry*(1+tc)*position

# def reward_sharpe_ratio(entry, curr_price, position, returns_window):
#     pnl = get_pnl(entry, curr_price, position)
#     returns_window.append(pnl)
    
#     if len(returns_window) > 1:
#         excess_returns = np.array(returns_window) - 0.0  # Assuming a risk-free rate of 0.0
#         sharpe_ratio = np.mean(excess_returns) / np.std(excess_returns)
#     else:
#         sharpe_ratio = 0.0
        
#     return sharpe_ratio

# # Example usage
# returns_window = []
# entry = 1000
# curr_price = 102
# position = -1
# reward = reward_sharpe_ratio(entry, curr_price, position, returns_window)
# print(reward)

# Build the RL environment

In [None]:
class Game(object):
    """Single-trade market environment that the RL agent interacts with.

    One ``Game`` instance represents one trade: starting flat at ``init_idx``,
    the agent may hold (action 0), sell (action 1) or buy (action 2) on the
    bars of ``bars1``.  The game ends when an open position receives the
    opposite action, at which point ``reward_function`` is evaluated.
    ``bars2`` is never traded; it only contributes state features.

    The state vector is assembled from, in this order:
    normalised recent bars of ``bars1`` and ``bars2``, technical indicators of
    ``bars1`` and ``bars2``, four calendar features, and the current position.

    Args:
        bars1: DataFrame of the traded instrument, indexed by timestamp, with
            ``Open``, ``High``, ``Low``, ``Close`` and ``Volume`` columns.
        bars2: DataFrame of the auxiliary instrument with the same columns.
        reward_function: Callable ``(entry, curr_price, position) -> reward``.
        lkbk: Lookback, in bars, used for the indicator periods and the
            trailing bar window.
        init_idx: Integer position in ``bars1`` at which the game starts.

    Raises:
        ValueError: If ``init_idx`` is not supplied.
    """

    # Number of trailing bars whose normalised values go into the state vector.
    N_STATE_BARS = 10

    def __init__(self, bars1, bars2, reward_function, lkbk=14, init_idx=None):
        if init_idx is None:
            raise ValueError(
                "init_idx is required: the bar position in bars1 where the game starts")

        # Lookback period for the technical indicators and the trailing bar window
        self.lkbk = lkbk
        # Length of the trade in bars; filled in when the game ends
        self.trade_len = 0
        # Daily price data: bars1 is traded, bars2 only feeds the state
        self.bars1 = bars1
        self.bars2 = bars2
        # Becomes True once an open position has been closed
        self.is_over = False
        # Last computed reward (only set when the game ends)
        self.reward = 0
        # Cumulative P&L over all finished games; maintained by the caller
        self.pnl_sum = 0
        # Position in the price DataFrame at which this game starts
        self.init_idx = init_idx
        # Reward function evaluated at the end of the game
        self.reward_function = reward_function
        # Put the environment into its initial state
        self.reset()

    # -----------------------------------------------------------------------------------------

    def _update_position(self, action):
        """Apply ``action`` (0 hold, 1 sell, 2 buy) to the current position.

        A buy/sell while flat opens a long/short at the current price; the
        action opposite to an open position ends the game; everything else
        (hold, or repeating the direction already held) changes nothing.
        """
        if action == 2:
            direction = 1
        elif action == 1:
            direction = -1
        else:
            # Hold, or an unrecognised action: nothing to do
            return

        if self.position == 0:
            # No open position: enter in the requested direction
            self.position = direction
            self.entry = self.curr_price
            self.start_idx = self.curr_idx
        elif self.position == -direction:
            # Opposite action on an open position closes the trade
            self.is_over = True

    # -----------------------------------------------------------------------------------------

    def _get_reward(self):
        """Evaluate the reward function once the game has finished."""
        if self.is_over:
            self.reward = self.reward_function(
                self.entry, self.curr_price, self.position)

    # -----------------------------------------------------------------------------------------

    def _get_last_N_timebars(self):
        """Collect the bars preceding (and including) ``self.curr_time``.

        ``hist1``/``hist2`` hold every bar within ``lkbk * 14`` calendar days
        before ``curr_time``; they give the indicators with periods longer
        than ``lkbk`` enough history.  ``last1``/``last2`` are the final
        ``lkbk`` bars of those windows.
        """
        # Width of the history window in calendar days
        self.wdw1d = np.ceil(self.lkbk*14)
        window_start = self.curr_time - timedelta(days=float(self.wdw1d))

        self.hist1 = self.bars1.loc[window_start:self.curr_time]
        self.hist2 = self.bars2.loc[window_start:self.curr_time]
        self.last1 = self.hist1.iloc[-self.lkbk:]
        self.last2 = self.hist2.iloc[-self.lkbk:]

    # -----------------------------------------------------------------------------------------

    @staticmethod
    def _zscore(values):
        """Z-score ``values`` jointly; all zeros when the spread is zero or undefined."""
        spread = np.std(values)
        if not np.isfinite(spread) or spread == 0:
            return np.zeros_like(values, dtype=float)
        return (values - np.mean(values)) / spread

    @staticmethod
    def _normalise_bars(bars):
        """Return the last ``N_STATE_BARS`` bars as a flat, normalised vector.

        Price columns are z-scored together (which keeps the relative shape
        of each candlestick) and ``Volume`` is z-scored on its own; mixing
        both in one z-score would let the much larger volume numbers flatten
        the prices to a near-constant.  The result is flattened row by row.
        """
        window = bars.iloc[-Game.N_STATE_BARS:]
        # np.array copies, so the caller's DataFrame is never modified
        values = np.array(window.values, dtype=float)
        is_volume = np.array([col == 'Volume' for col in window.columns])
        for mask in (~is_volume, is_volume):
            if mask.any():
                values[:, mask] = Game._zscore(values[:, mask])
        return values.flatten()

    @staticmethod
    def _get_psma(prices, window):
        """Price relative to its simple moving average, minus one."""
        rm = prices.rolling(window).mean()
        return prices.divide(rm, axis=0) - 1

    @staticmethod
    def _get_bb(prices, window):
        """Distance from the rolling mean in units of two rolling standard deviations."""
        rm = prices.rolling(window).mean()
        rstd = prices.rolling(window).std()
        return (prices - rm) / (2 * rstd)

    @staticmethod
    def _get_pema(prices, window):
        """Price relative to its exponentially weighted mean, minus one."""
        # ``window`` is passed positionally, i.e. as the ``com`` argument of ewm
        ema = prices.ewm(window).mean()
        return prices.divide(ema, axis=0) - 1

    def _technical_indicators(self, bars):
        """Return the latest value of each technical indicator for ``bars``.

        The list always has the same length and order.  An indicator that
        raises is reported and recorded as missing, and every missing or
        non-finite value is replaced by 0 so the state stays usable as
        network input.
        """
        def column(name):
            # TA-Lib only accepts float64 input (Volume is typically integer)
            return bars[name].astype(float)

        def last(values):
            # Works for both pandas Series and plain arrays
            return np.asarray(values, dtype=float)[-1]

        close = column('Close')

        # Relative difference of two moving averages
        sma1 = last(ta.SMA(close, self.lkbk-1))
        sma2 = last(ta.SMA(close, self.lkbk-8))
        features = [(sma1-sma2)/sma2]

        # Momentum over 10, 20 and 30 bars
        features += [last(ta.MOM(close, period)) for period in (10, 20, 30)]

        # Summed one-bar returns over 5, 10, 20 and 40 bars
        ret1 = close.pct_change()
        features += [last(ret1.rolling(span).sum()) for span in (5, 10, 20, 40)]

        # Hybrid indicators
        features += [
            last(self._get_bb(close, self.lkbk)),
            last(self._get_psma(close, self.lkbk)),
            last(self._get_pema(close, self.lkbk)),
        ]

        # TA-Lib indicators, evaluated lazily so that one failure does not
        # abort the rest.
        # NOTE(review): SAR and ADX are fed the previous bar's high/low (and
        # ADX the open in place of the close), as in the original code;
        # confirm that this is intended.
        indicator_calls = [
            ('RSI', lambda: ta.RSI(close, timeperiod=self.lkbk)),
            ('SAR', lambda: ta.SAR(column('High').shift(1),
                                   column('Low').shift(1), 0.2, 0.2)),
            ('ADX', lambda: ta.ADX(column('High').shift(1), column('Low').shift(1),
                                   column('Open'), timeperiod=self.lkbk)),
            ('NATR', lambda: ta.NATR(column('High'), column('Low'), close,
                                     timeperiod=self.lkbk)),
            ('AROONOSC', lambda: ta.AROONOSC(column('High'), column('Low'),
                                             timeperiod=self.lkbk-3)),
            ('BOP', lambda: ta.BOP(column('Open'), column('High'),
                                   column('Low'), close)),
            ('BETA', lambda: ta.BETA(column('High'), column('Low'))),
            ('STDDEV', lambda: ta.STDDEV(close)),
            ('OBV', lambda: ta.OBV(close, column('Volume'))),
        ]
        for name, compute in indicator_calls:
            try:
                features.append(last(compute()))
            except Exception as e:
                print(f"Error applying {name}: {e}")
                # Keep the slot so the state length never changes
                features.append(np.nan)

        return np.nan_to_num(np.array(features, dtype=float),
                             nan=0.0, posinf=0.0, neginf=0.0).tolist()

    def _date_features(self):
        """Update ``curr_time`` to the bar at ``curr_idx`` and return calendar features.

        Returns:
            ``[day_of_week, week, month_of_year, year]`` after scaling.
        """
        self.curr_time = self.bars1.index[self.curr_idx]
        self._day_of_week = self.curr_time.weekday()/6
        # %U week numbers start from 0
        self._week = int(self.curr_time.strftime("%U"))/51
        # NOTE(review): month is 1..12 and year is the full calendar year, so
        # these two scalings do not map onto [0, 1]; kept as in the original.
        self._month_of_year = self.curr_time.month/11
        self._year = self.curr_time.year/30
        return [self._day_of_week, self._week, self._month_of_year, self._year]

    def _assemble_state(self):
        """Build ``self.state`` from bars, indicators, calendar features and position.

        The bar windows end at ``self.curr_time`` as last set by ``act()`` or
        ``reset()``, whereas the calendar features use the bar at
        ``self.curr_idx``.
        NOTE(review): when a caller advances ``curr_idx`` before asking for
        the state, the price features therefore trail the calendar features
        by one bar; confirm that this lag is intended.
        """
        self._get_last_N_timebars()

        state = []
        # Normalised candlesticks
        state.extend(self._normalise_bars(self.last1))
        state.extend(self._normalise_bars(self.last2))
        # Technical indicators, computed on the longer history window
        state.extend(self._technical_indicators(self.hist1))
        state.extend(self._technical_indicators(self.hist2))
        # Features extracted from the date
        state.extend(self._date_features())
        # The agent must know its own position
        state.append(self.position)

        self.state = np.array(state, dtype=float)

    # -----------------------------------------------------------------------------------------

    def get_state(self):
        """Re-assemble and return the state of the system.

        Returns:
            A 2-D array with a single row holding the state vector
            (candlestick bars, indicators, calendar features and position).
        """
        self._assemble_state()
        return np.array([self.state])

    # -----------------------------------------------------------------------------------------

    def get_position(self, action):
        """Return the position that ``action`` leads to, without opening a trade.

        Note that, like ``_update_position``, this marks the game as over
        when ``action`` is the opposite of an open position.

        Args:
            action (int): 0 means hold, 1 means sell, and 2 means buy.

        Returns:
            int: 0 means no position, 1 means long, and -1 means short.
        """
        if action == 2:
            direction = 1
        elif action == 1:
            direction = -1
        else:
            # Hold (or unrecognised action) leaves the position untouched
            return self.position

        if self.position == 0:
            return direction
        if self.position == -direction:
            self.is_over = True
        return self.position

    # -----------------------------------------------------------------------------------------

    def act(self, action):
        """Advance the game by applying ``action`` at the bar ``curr_idx``.

        Only the ``bars1`` close is traded; ``bars2`` is used for state
        features alone.

        Returns:
            tuple: ``(is_over, reward)``; ``reward`` is only updated once the
            game is over.
        """
        self.curr_time = self.bars1.index[self.curr_idx]
        # Positional lookup: integer keys on a timestamp-indexed Series are ambiguous
        self.curr_price = self.bars1['Close'].iloc[self.curr_idx]

        self._update_position(action)

        # Unrealised or realised P&L, gross of costs (the reward function
        # computes its own figure).  With no entry price yet there is no
        # trade, so the P&L is 0 rather than a 0/0 division.
        if self.entry:
            self.pnl = (self.curr_price - self.entry)*self.position/self.entry
        else:
            self.pnl = 0.0

        self._get_reward()
        if self.is_over:
            self.trade_len = self.curr_idx - self.start_idx

        return self.is_over, self.reward

    # -----------------------------------------------------------------------------------------

    def reset(self):
        """Return the environment to a flat position at ``init_idx``."""
        self.is_over = False
        self.reward = 0
        self.trade_len = 0
        self.pnl = 0
        self.entry = 0
        self._day_of_week = 0
        self._week = 0
        self._month_of_year = 0
        self._year = 0
        self.curr_idx = self.init_idx
        self.start_idx = self.curr_idx
        self.curr_time = self.bars1.index[self.curr_idx]
        self._get_last_N_timebars()
        self.position = 0
        # A hold initialises curr_price without opening a trade
        self.act(0)
        self.state = []
        self._assemble_state()

## Defining the method to create the neural networks

In [None]:
# def init_net(env, rl_config):
#     """
#     This initialises the RL run by
#     creating two new predictive neural network
#     Args:
#         env:
#     Returns:
#         modelQ: the neural network
#         modelR: the neural network

#     """
#     hidden_size = len(env.state)*rl_config['HIDDEN_MULT']
#     modelQ = Sequential()
#     modelQ.add(LSTM(len(env.state), input_shape=(
#         len(env.state),), activation=rl_config['ACTIVATION_FUN']))
#     modelQ.add(LSTM(hidden_size, activation=rl_config['ACTIVATION_FUN']))
#     modelQ.add(Dense(rl_config['NUM_ACTIONS'], activation='softmax'))
#     modelQ.compile(Adam(lr=rl_config['LEARNING_RATE']),
#                    loss=rl_config['LOSS_FUNCTION'])

#     modelR = Sequential()
#     modelR.add(LSTM(len(env.state), input_shape=(
#         len(env.state),), activation=rl_config['ACTIVATION_FUN']))
#     modelR.add(LSTM(hidden_size, activation=rl_config['ACTIVATION_FUN']))
#     modelR.add(Dense(rl_config['NUM_ACTIONS'], activation='softmax'))
#     modelR.compile(Adam(learning_rate=rl_config['LEARNING_RATE']),
#                    loss=rl_config['LOSS_FUNCTION'])

#     return modelQ, modelR

In [None]:
# from keras.layers import LSTM, Dense
# from keras.models import Sequential
# from keras.optimizers import SGD

# def init_net(env, rl_config):
#     """
#     This initialises the RL run by
#     creating two new predictive neural network
#     Args:
#         env:
#     Returns:
#         modelQ: the neural network
#         modelR: the neural network

#     """
#     hidden_size = len(env.state)*rl_config['HIDDEN_MULT']
#     timesteps = 1  # You may need to adjust this depending on the input shape of your environment's state

#     modelQ = Sequential()
#     modelQ.add(LSTM(hidden_size, input_shape=(timesteps, len(env.state)), activation=rl_config['ACTIVATION_FUN']))
#     modelQ.add(Dense(rl_config['NUM_ACTIONS'], activation='softmax'))
#     modelQ.compile(SGD(lr=rl_config['LEARNING_RATE']), loss=rl_config['LOSS_FUNCTION'])

#     modelR = Sequential()
#     modelR.add(LSTM(hidden_size, input_shape=(timesteps, len(env.state)), activation=rl_config['ACTIVATION_FUN']))
#     modelR.add(Dense(rl_config['NUM_ACTIONS'], activation='softmax'))
#     modelR.compile(SGD(lr=rl_config['LEARNING_RATE']), loss=rl_config['LOSS_FUNCTION'])

#     return modelQ, modelR


In [None]:
from keras.optimizers import Adam
from keras.layers import Dropout

def init_net(env, rl_config):
    """Create the two identically structured networks used by the RL run.

    Each network is a stack of Dense layers with dropout after every
    non-output layer: an input-sized layer, four hidden layers of width
    ``len(env.state) * HIDDEN_MULT`` and a softmax output with one unit per
    action.

    Args:
        env: Environment whose ``state`` length fixes the input size.
        rl_config: Dict providing ``HIDDEN_MULT``, ``ACTIVATION_FUN``,
            ``NUM_ACTIONS``, ``LEARNING_RATE`` and ``LOSS_FUNCTION``.

    Returns:
        modelQ: the first neural network
        modelR: the second neural network (separately initialised)
    """
    dropout_rate = 0.5  # adjust the dropout rate as needed
    n_hidden_layers = 4
    n_inputs = len(env.state)
    hidden_size = n_inputs * rl_config['HIDDEN_MULT']
    activation = rl_config['ACTIVATION_FUN']

    def build_model():
        """Build and compile one network; called once per model."""
        model = Sequential()
        model.add(Dense(n_inputs, input_shape=(n_inputs,), activation=activation))
        model.add(Dropout(dropout_rate))
        for _ in range(n_hidden_layers):
            model.add(Dense(hidden_size, activation=activation))
            model.add(Dropout(dropout_rate))
        # NOTE(review): a softmax output bounds every predicted value to
        # [0, 1]; kept as in the original, confirm this suits the targets.
        model.add(Dense(rl_config['NUM_ACTIONS'], activation='softmax'))
        # ``learning_rate`` is the supported keyword; ``lr`` is a deprecated
        # alias that current Keras optimizers no longer accept.
        model.compile(Adam(learning_rate=rl_config['LEARNING_RATE']),
                      loss=rl_config['LOSS_FUNCTION'])
        return model

    modelQ = build_model()
    modelR = build_model()

    return modelQ, modelR


In [None]:
# def init_net(env, rl_config):
#     """
#     This initialises the RL run by
#     creating two new predictive neural network
#     Args:
#         env:
#     Returns:
#         modelQ: the neural network
#         modelR: the neural network

#     """
#     hidden_size = len(env.state)*rl_config['HIDDEN_MULT']
    
#     modelQ = Sequential()
#     modelQ.add(Dense(len(env.state), input_shape=(len(env.state),), activation=rl_config['ACTIVATION_FUN']))
#     modelQ.add(Dense(hidden_size, activation=rl_config['ACTIVATION_FUN']))
#     modelQ.add(Dense(hidden_size, activation=rl_config['ACTIVATION_FUN']))
#     modelQ.add(Dense(hidden_size, activation=rl_config['ACTIVATION_FUN']))
#     modelQ.add(Dense(hidden_size, activation=rl_config['ACTIVATION_FUN']))
#     modelQ.add(Dense(rl_config['NUM_ACTIONS'], activation='softmax'))
#     modelQ.compile(Adam(lr=rl_config['LEARNING_RATE']), loss=rl_config['LOSS_FUNCTION'])

#     modelR = Sequential()
#     modelR.add(Dense(len(env.state), input_shape=(len(env.state),), activation=rl_config['ACTIVATION_FUN']))
#     modelR.add(Dense(hidden_size, activation=rl_config['ACTIVATION_FUN']))
#     modelR.add(Dense(hidden_size, activation=rl_config['ACTIVATION_FUN']))
#     modelR.add(Dense(hidden_size, activation=rl_config['ACTIVATION_FUN']))
#     modelR.add(Dense(hidden_size, activation=rl_config['ACTIVATION_FUN']))
#     modelR.add(Dense(rl_config['NUM_ACTIONS'], activation='softmax'))
#     modelR.compile(Adam(lr=rl_config['LEARNING_RATE']), loss=rl_config['LOSS_FUNCTION'])

#     return modelQ, modelR


In [None]:
# Train on the in-sample bars: AAPL is traded, SPY only contributes features.
bars1, bars2 = train_aapl, train_spy

# Create Game class environment, starting at bar 300 of the training data
env = Game(bars1, bars2, reward_function=reward_exponential_pnl, lkbk=14, init_idx=300)

In [None]:
# Smoke test: submit a sell (action 1); act() returns the pair (is_over, reward)
env.act(1)

In [None]:
# Position reported for a hold (action 0): the environment's current position
env.get_position(0)

In [None]:
# Length of the state vector; init_net sizes the network input layer from it
len(env.state)

# Experience Replay

In [None]:
class ExperienceReplay(object):
    """Replay buffer that turns remembered transitions into training batches.

    Every transition is stored as ``[[state_t, action_t, reward_t, state_tp1],
    game_over]``.  ``process`` takes the most recent transitions and builds,
    for each of them, the network input (``state_t``) and a target row of
    action values in which the entry of the action actually taken is replaced
    by the Bellman estimate.

    Attributes:
        max_memory: Maximum number of transitions kept; the oldest is dropped
            first.
        memory: List of stored transitions, oldest first.
        discount: Discount factor applied to the best value of the next state.
    """

    def __init__(self, max_memory, discount):
        self.max_memory = max_memory
        self.memory = list()
        self.discount = discount

    def remember(self, states, game_over):
        """Store one transition and evict the oldest if the buffer overflows.

        Args:
            states: ``[state_t, action_t, reward_t, state_tp1]``.
            game_over: Whether the game ended with this transition.
        """
        self.memory.append([states, game_over])
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def process(self, modelQ, modelR, batch_size=10):
        """Build a training batch from the most recent transitions.

        Sampling is by recency: the last ``min(len(memory), batch_size)``
        transitions are used, in chronological order.

        Args:
            modelQ: Network whose prediction for ``state_tp1`` supplies the
                best next-state value.
            modelR: Network whose prediction for ``state_t`` supplies the
                baseline target row.
            batch_size: Maximum number of transitions in the batch.

        Returns:
            inputs: Array (n, state_dim) of the states at time t.
            targets: Array (n, num_actions) of target values per action.
        """
        n_actions = modelQ.output_shape[-1]
        state_dim = self.memory[0][0][0].shape[1]
        n_samples = min(len(self.memory), batch_size)

        # Initialise input and target arrays
        inputs = np.zeros((n_samples, state_dim))
        targets = np.zeros((inputs.shape[0], n_actions))

        # Most recent transitions, oldest of them first
        recent = self.memory[len(self.memory) - n_samples:]

        for row, (transition, game_over) in enumerate(recent):
            # S.A.R.S: state, action, reward, new state
            state_t, action_t, reward_t, state_tp1 = transition
            inputs[row] = state_t

            # Baseline targets for the state at time t
            targets[row] = modelR.predict(state_t)[0]

            # Best predicted value reachable from the state at time t+1
            best_next_value = np.max(modelQ.predict(state_tp1)[0])

            # Finished game: the reward is definite.  Otherwise:
            # Q_new(s,a) = reward_t + gamma * max_a' Q(s', a')
            if game_over:
                targets[row, action_t] = reward_t
            else:
                targets[row, action_t] = reward_t + self.discount * best_next_value

        return inputs, targets

# Backtest function

In [None]:
def run(bars1, bars2, rl_config):
    """Run the RL model on the passed price data, one trade (game) at a time.

    The agent steps through ``bars1`` from ``START_IDX`` to the last bar,
    choosing actions epsilon-greedily, storing every transition in the replay
    buffer and training the Q-network after each step.  Weights, trade logs
    and the replay buffer are saved every 10 trades and at the end.

    Args:
        bars1: Price DataFrame of the traded instrument.
        bars2: Price DataFrame of the auxiliary instrument (features only).
        rl_config: Dict of hyperparameters and file names.  ``EPSILON`` is
            the initial exploration rate; it is multiplied by the optional
            ``EPSILON_DECAY`` (default 0.995) at the start of every trade and
            never falls below ``EPS_MIN``.
    """
    pnls = []
    # One [timestamp, position, episode] row per time step; turned into a
    # DataFrame only when saving (DataFrame.append no longer exists in
    # pandas 2 and re-copies the whole frame on every call).
    trade_rows = []
    episode = 0
    last_idx = len(bars1) - 1

    # Initialise a NN and set up initial game parameters and experience replay
    env = Game(bars1, bars2, rl_config['RF'],
               lkbk=rl_config['LKBK'], init_idx=rl_config['START_IDX'])
    q_network, r_network = init_net(env, rl_config)
    exp_replay = ExperienceReplay(
        max_memory=rl_config['MAX_MEM'], discount=rl_config['DISCOUNT_RATE'])

    # Preloading the model weights
    if rl_config['PRELOAD']:
        q_network.load_weights(rl_config['WEIGHTS_FILE'])
        r_network.load_weights(rl_config['WEIGHTS_FILE'])
        # SECURITY: unpickling executes arbitrary code; only load replay
        # files that this notebook wrote itself.
        with open(rl_config['REPLAY_FILE'], 'rb') as replay_file:
            exp_replay.memory = pickle.load(replay_file)

    r_network.set_weights(q_network.get_weights())

    def save_checkpoint():
        """Persist the weights, the trade log and the replay buffer."""
        print('---saving weights, trade logs and replay buffer---')
        r_network.save_weights(rl_config['WEIGHTS_FILE'], overwrite=True)
        pd.DataFrame(trade_rows).to_pickle(rl_config['TRADE_FILE'])
        with open(rl_config['REPLAY_FILE'], 'wb') as replay_file:
            pickle.dump(exp_replay.memory, replay_file)

    # Exploration rate: initialised once so that the decay accumulates
    # across trades instead of being reset from the config every time.
    epsilon = rl_config['EPSILON']
    epsilon_decay = rl_config.get('EPSILON_DECAY', 0.995)

    # Loop that steps through one trade (game) at a time; stop when the end
    # of the data is near to avoid indexing past the last bar
    while env.curr_idx < last_idx:
        episode += 1

        # Initialise a new game where the previous one stopped
        env = Game(bars1, bars2, rl_config['RF'],
                   lkbk=rl_config['LKBK'], init_idx=env.curr_idx)
        state_tp1 = env.get_state()

        # Decay epsilon for the exploration vs exploitation trade-off
        epsilon = max(rl_config['EPS_MIN'], epsilon * epsilon_decay)

        game_over = False

        # Walk through time steps starting from the end of the last game
        while not game_over and env.curr_idx < last_idx:
            state_t = state_tp1

            # Generate a random action or take the q_network's best action
            if np.random.rand() <= epsilon:
                action = np.random.randint(0, 3, size=1)[0]
            else:
                q = q_network.predict(state_t)
                action = np.argmax(q[0])

            # Updating the Game; act() returns (is_over, reward) in that order
            game_over, reward = env.act(action)

            # Updating trade/position logs (a finished trade is logged as flat)
            logged_position = 0 if game_over else env.position
            trade_rows.append([env.curr_time, logged_position, episode])

            # Move to next time step
            env.curr_idx += 1
            state_tp1 = env.get_state()

            # Adding state to memory
            exp_replay.remember([state_t, action, reward, state_tp1], game_over)

            # Creating a new Q-Table
            inputs, targets = exp_replay.process(
                q_network, r_network, batch_size=rl_config['BATCH_SIZE'])
            env.pnl_sum = sum(pnls)

            # Update the NN model with a new Q-Table
            q_network.train_on_batch(inputs, targets)

            if game_over and rl_config['UPDATE_QR']:
                r_network.set_weights(q_network.get_weights())

        pnls.append(env.pnl)

        print("Trade {:03d} | pos {} | len {} | approx cum ret {:,.2f}% | trade ret {:,.2f}% | eps {:,.4f} | {} | {}".format(
            episode, env.position, env.trade_len, sum(pnls)*100, env.pnl*100, epsilon, env.curr_time, env.curr_idx))

        if not episode % 10:
            save_checkpoint()

        if not episode % 15 and rl_config['TEST_MODE']:
            print('\n**********************************************\nTest mode is on due to resource constraints and therefore stopped after 15 trades. \nYou can trade on full dataset on your local computer and set TEST_MODE flag to False in rl_config dictionary. \n**********************************************\n')
            break

    save_checkpoint()

    print('***FINISHED***')

# RL configuration and running the backtest

In [None]:
"""
For running the RL model on the price data, you need to 
set the configuration parameters.
These configuration parameters are hyperparameters for the 
RL model and the ANN used in it.
"""
rl_config = {
    # --- Neural network ---
    'LEARNING_RATE': 1.0e-6,       # step size of the Adam optimizer
    'LOSS_FUNCTION': 'mse',
    'ACTIVATION_FUN': 'relu',      # activation of every non-output layer
    'HIDDEN_MULT': 2,              # hidden width = HIDDEN_MULT * len(state)
    'NUM_ACTIONS': 3,              # hold, sell, buy

    # --- Reinforcement learning ---
    'RF': reward_categorical_pnl,  # You can change the reward function here
    'DISCOUNT_RATE': 0.9,          # discount applied to the next state's best value
    'LKBK': 14,                    # lookback handed to the Game environment
    'BATCH_SIZE': 10,              # most recent transitions replayed per step
    'MAX_MEM': 1000,               # capacity of the replay buffer
    'EPSILON': 1.0,                # exploration rate (chance of a random action)
    'EPS_MIN': 0.01,               # lower bound of the exploration rate
    'START_IDX': 300,              # bar at which the first game starts

    # --- Output files ---
    'WEIGHTS_FILE': 'indicator_model_aapl_1.h5',
    'TRADE_FILE': 'trade_logs_aapl_1.bz2',
    'REPLAY_FILE': 'memory_aapl_1.bz2',

    # --- Run flags ---
    'TEST_MODE': False,            # True stops the run after 15 trades
    'PRELOAD': False,              # True loads saved weights and replay buffer first
    'UPDATE_QR': True,             # copy Q-network weights to the R-network after each game
}

# Run the RL model on the price data.
# TEST_MODE is False here, so the whole dataset is processed; set it to True
# for a short run when resources are limited.
run(bars1, bars2, rl_config)

In [None]:
# pd.read_pickle('trade_logs_aapl_0.bz2').to_csv('trade_log.csv')

In [None]:
# Run 2: higher learning rate (1e-5) and a smaller replay batch (5) than run 1.
rl_config = {
    # --- Neural network ---
    'LEARNING_RATE': 1.0e-5,       # step size of the Adam optimizer
    'LOSS_FUNCTION': 'mse',
    'ACTIVATION_FUN': 'relu',      # activation of every non-output layer
    'HIDDEN_MULT': 2,              # hidden width = HIDDEN_MULT * len(state)
    'NUM_ACTIONS': 3,              # hold, sell, buy

    # --- Reinforcement learning ---
    'RF': reward_categorical_pnl,  # You can change the reward function here
    'DISCOUNT_RATE': 0.9,          # discount applied to the next state's best value
    'LKBK': 14,                    # lookback handed to the Game environment
    'BATCH_SIZE': 5,               # most recent transitions replayed per step
    'MAX_MEM': 1000,               # capacity of the replay buffer
    'EPSILON': 1.,                 # exploration rate (chance of a random action)
    'EPS_MIN': 0.01,               # lower bound of the exploration rate
    'START_IDX': 300,              # bar at which the first game starts

    # --- Output files ---
    'WEIGHTS_FILE': 'indicator_model_aapl_2.h5',
    'TRADE_FILE': 'trade_logs_aapl_2.bz2',
    'REPLAY_FILE': 'memory_aapl_2.bz2',

    # --- Run flags ---
    'TEST_MODE': False,            # True stops the run after 15 trades
    'PRELOAD': False,              # True loads saved weights and replay buffer first
    'UPDATE_QR': True,             # copy Q-network weights to the R-network after each game
}

run(bars1, bars2, rl_config)

In [None]:
# Run 3: learning rate 1e-5, replay batch of 1 and a lower exploration floor (0.001).
rl_config = {
    # --- Neural network ---
    'LEARNING_RATE': 1.0e-5,       # step size of the Adam optimizer
    'LOSS_FUNCTION': 'mse',
    'ACTIVATION_FUN': 'relu',      # activation of every non-output layer
    'HIDDEN_MULT': 2,              # hidden width = HIDDEN_MULT * len(state)
    'NUM_ACTIONS': 3,              # hold, sell, buy

    # --- Reinforcement learning ---
    'RF': reward_categorical_pnl,  # You can change the reward function here
    'DISCOUNT_RATE': 0.9,          # discount applied to the next state's best value
    'LKBK': 14,                    # lookback handed to the Game environment
    'BATCH_SIZE': 1,               # most recent transitions replayed per step
    'MAX_MEM': 1000,               # capacity of the replay buffer
    'EPSILON': 1.,                 # exploration rate (chance of a random action)
    'EPS_MIN': 0.001,              # lower bound of the exploration rate
    'START_IDX': 300,              # bar at which the first game starts

    # --- Output files ---
    'WEIGHTS_FILE': 'indicator_model_aapl_3.h5',
    'TRADE_FILE': 'trade_logs_aapl_3.bz2',
    'REPLAY_FILE': 'memory_aapl_3.bz2',

    # --- Run flags ---
    'TEST_MODE': False,            # True stops the run after 15 trades
    'PRELOAD': False,              # True loads saved weights and replay buffer first
    'UPDATE_QR': True,             # copy Q-network weights to the R-network after each game
}

run(bars1, bars2, rl_config)

In [None]:
# Run 4: wider hidden layers (HIDDEN_MULT 3) and a discount rate of 0.99.
rl_config = {
    # --- Neural network ---
    'LEARNING_RATE': 1.0e-5,       # step size of the Adam optimizer
    'LOSS_FUNCTION': 'mse',
    'ACTIVATION_FUN': 'relu',      # activation of every non-output layer
    'HIDDEN_MULT': 3,              # hidden width = HIDDEN_MULT * len(state)
    'NUM_ACTIONS': 3,              # hold, sell, buy

    # --- Reinforcement learning ---
    'RF': reward_categorical_pnl,  # You can change the reward function here
    'DISCOUNT_RATE': 0.99,         # discount applied to the next state's best value
    'LKBK': 14,                    # lookback handed to the Game environment
    'BATCH_SIZE': 1,               # most recent transitions replayed per step
    'MAX_MEM': 1000,               # capacity of the replay buffer
    'EPSILON': 1.,                 # exploration rate (chance of a random action)
    'EPS_MIN': 0.01,               # lower bound of the exploration rate
    'START_IDX': 300,              # bar at which the first game starts

    # --- Output files ---
    'WEIGHTS_FILE': 'indicator_model_aapl_4.h5',
    'TRADE_FILE': 'trade_logs_aapl_4.bz2',
    'REPLAY_FILE': 'memory_aapl_4.bz2',

    # --- Run flags ---
    'TEST_MODE': False,            # True stops the run after 15 trades
    'PRELOAD': False,              # True loads saved weights and replay buffer first
    'UPDATE_QR': True,             # copy Q-network weights to the R-network after each game
}

run(bars1, bars2, rl_config)

In [None]:
# Run 5: learning rate back to 1e-6, HIDDEN_MULT 3 and the lowest exploration floor (0.0001).
rl_config = {
    # --- Neural network ---
    'LEARNING_RATE': 1.0e-6,       # step size of the Adam optimizer
    'LOSS_FUNCTION': 'mse',
    'ACTIVATION_FUN': 'relu',      # activation of every non-output layer
    'HIDDEN_MULT': 3,              # hidden width = HIDDEN_MULT * len(state)
    'NUM_ACTIONS': 3,              # hold, sell, buy

    # --- Reinforcement learning ---
    'RF': reward_categorical_pnl,  # You can change the reward function here
    'DISCOUNT_RATE': 0.9,          # discount applied to the next state's best value
    'LKBK': 14,                    # lookback handed to the Game environment
    'BATCH_SIZE': 1,               # most recent transitions replayed per step
    'MAX_MEM': 1000,               # capacity of the replay buffer
    'EPSILON': 1.,                 # exploration rate (chance of a random action)
    'EPS_MIN': 0.0001,             # lower bound of the exploration rate
    'START_IDX': 300,              # bar at which the first game starts

    # --- Output files ---
    'WEIGHTS_FILE': 'indicator_model_aapl_5.h5',
    'TRADE_FILE': 'trade_logs_aapl_5.bz2',
    'REPLAY_FILE': 'memory_aapl_5.bz2',

    # --- Run flags ---
    'TEST_MODE': False,            # True stops the run after 15 trades
    'PRELOAD': False,              # True loads saved weights and replay buffer first
    'UPDATE_QR': True,             # copy Q-network weights to the R-network after each game
}

run(bars1, bars2, rl_config)

# Performance on the test set

Here we load the weights of the best-performing agent from training and use that agent to make decisions (take actions) on the test data.

In [None]:
# rl_config = {

#     'LEARNING_RATE': 0.000001, 
#     'LOSS_FUNCTION': 'mse',
#     'ACTIVATION_FUN': 'relu',
#     'NUM_ACTIONS': 3,
#     'HIDDEN_MULT': 3, # 
#     'DISCOUNT_RATE': 0.99, 
#     'LKBK': 14,
#     'BATCH_SIZE': 10, # 
#     'MAX_MEM': 600,  #  
#     'EPSILON': 1.0, # 0.0001
#     'EPS_MIN': 0.1,   # 0.001
#     'START_IDX': 300,
#     'WEIGHTS_FILE': 'indicator_model_aapl_4.h5', # note that the weights obtained while training will be overwritten.
#     'TRADE_FILE': 'trade_logs_aapl_4.bz2',
#     'REPLAY_FILE': 'memory_aapl_4.bz2',
#     'RF': reward_categorical_pnl,  # You can change the reward function here
#     'TEST_MODE': False,
#     'PRELOAD': True, # here we're loading weights from the training phase
#     'UPDATE_QR': True
# }

In [None]:
# Evaluate on the held-out data starting from the trained agent: PRELOAD makes
# run() load the weights and replay buffer saved under this config's file
# names instead of starting from a freshly initialised network.
# To test a different agent, point WEIGHTS_FILE / REPLAY_FILE / TRADE_FILE at
# that run's files before executing this cell.
# Note that run() keeps exploring and training while it steps through the data
# and saves back to the same files, so the training-phase files are overwritten.
test_config = {**rl_config, 'PRELOAD': True}
run(test_aapl, test_spy, test_config)