In [None]:
# Tensorflow
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Add, Subtract, Lambda
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.losses import Huber

# Change tensorflow default behavior (where it uses all of the memory at the outset)
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

# Raise warning as error to capture floating-point errors during computation:
# https://stackoverflow.com/questions/34955158/what-might-be-the-cause-of-invalid-value-encountered-in-less-equal-in-numpy/34955622
# https://www.soa.org/news-and-publications/newsletters/compact/2014/may/com-2014-iss51/losing-my-precision-tips-for-handling-tricky-floating-point-arithmetic/
import numpy as np
# np.seterr(all='raise')

# Plotly
from plotly.offline import iplot, plot, init_notebook_mode
import plotly.graph_objects as go
init_notebook_mode(connected=True)

# Time measurement
import time
from datetime import timedelta

import pandas as pd
import os
import random
from IPython.display import clear_output

# Sound notification
import winsound

# Profiling
%load_ext line_profiler

# Useful Functions

In [None]:
# Directory prefixes used by the helpers in this notebook.
# Each one ends with '/' because the helpers build file paths by plain string
# concatenation (f'{out_path}{out_filename}'), not with os.path.join.
SOURCE_PATH_DATA = 'resources/data/'
OUT_PATH_IMAGE   = 'resources/output/image/'
OUT_PATH_GRAPH = 'resources/output/graph/'
OUT_PATH_FILE = 'resources/output/file/'

def create_directory(directory):
    """Create ``directory`` (including missing parents) when it does not exist yet.

    Args:
        directory: Path of the directory to create.

    The existence check is kept so that an existing path is still left
    untouched, but ``exist_ok=True`` closes the race where the directory is
    created by someone else between the check and ``os.makedirs`` (which
    previously raised ``FileExistsError``).
    """
    if not os.path.exists(directory):
        os.makedirs(directory, exist_ok=True)

def plot_graph(data, title, xlabel=None, ylabel=None, generate_file=True):
    """Build a Plotly figure from ``data`` and hand it to ``generate_plot``.

    Args:
        data: Trace(s) passed straight to ``go.Figure(data=...)``.
        title: Figure title; also used as the HTML file name when a file is generated.
        xlabel: Optional x-axis title.
        ylabel: Optional y-axis title.
        generate_file: When True the figure is written to ``OUT_PATH_GRAPH`` as
            ``<title>.html``; otherwise it is passed to ``generate_plot`` without a path.
    """
    grid_colour = 'rgb(159, 197, 232)'

    figure = go.Figure(
        data=data,
        layout=go.Layout(
            title=title,
            xaxis=dict(title=xlabel, gridcolor=grid_colour),
            yaxis=dict(title=ylabel, gridcolor=grid_colour),
            hovermode='x',
            showlegend=True,
            legend_orientation='h',
            plot_bgcolor='rgba(0, 0, 0, 0)',
        ),
    )
    # Show hovered y-values with five decimals
    figure.update_yaxes(hoverformat=".5f")

    if not generate_file:
        generate_plot(figure)
        return

    generate_plot(figure, f'{OUT_PATH_GRAPH}', f'{title}.html')

def generate_plot(fig, out_path=None, out_filename=None):
    """Display ``fig`` via ``iplot`` or write it to a file via ``plot``.

    Args:
        fig: Plotly figure to render.
        out_path: Directory prefix for the output file; when None the figure
            is passed to ``iplot`` instead of being written.
        out_filename: File name appended directly to ``out_path``.
    """
    # No destination given: hand the figure to iplot and stop
    if out_path is None:
        iplot(fig)
        return

    create_directory(out_path)
    destination = f'{out_path}{out_filename}'
    plot(fig, filename=destination, auto_open=False)

    print(f'Generated: {destination}')

def generate_csv(df, out_path, out_filename, export_index=None):
    """Write ``df`` to ``out_path + out_filename`` as a ';'-separated CSV with a header row.

    Args:
        df: DataFrame to export.
        out_path: Directory prefix; created if missing.
        out_filename: File name appended directly to ``out_path``.
        export_index: Forwarded unchanged as the ``index`` argument of ``DataFrame.to_csv``.
    """
    create_directory(out_path)

    destination = f'{out_path}{out_filename}'
    df.to_csv(
        destination,
        sep=';',
        index=export_index,
        header=True,
    )

    print(f'Generated: {destination}')

def time_taken(seconds):
    """Print ``seconds`` formatted as a ``timedelta`` and play two short beeps.

    Args:
        seconds: Elapsed time in seconds.
    """
    elapsed = timedelta(seconds=seconds)
    print(f'\nTime Taken: {str(elapsed)}')

    # Two-tone notification: (frequency in Hz, duration in ms)
    for tone_frequency, tone_duration in ((1000, 100), (1500, 50)):
        winsound.Beep(frequency=tone_frequency, duration=tone_duration)

# Scaler

In [None]:
def min_max_scale(x, min_value, max_value, precision=5):
    """Linearly map ``x`` from ``[min_value, max_value]`` onto ``[0, 1]``.

    Reference: https://www.codecademy.com/articles/normalization

    Args:
        x: Scalar or array to scale.
        min_value: Value that maps to 0.
        max_value: Value that maps to 1.
        precision: Number of decimals the result is rounded to.

    Returns:
        The scaled value(s), rounded with ``np.round``.
    """
    offset = x - min_value
    value_range = max_value - min_value
    return np.round(offset / value_range, precision)

def zero_one_scale(x, precision=5):
    """Squash any real ``x`` into the open interval (0, 1), with 0 mapping to 0.5.

    Reference: https://stackoverflow.com/questions/42140347/normalize-any-value-in-range-inf-inf-to-0-1-is-it-possible

    Args:
        x: Scalar or array to scale.
        precision: Number of decimals the result is rounded to.

    Returns:
        ``(1 + x / (1 + |x|)) / 2`` rounded with ``np.round``.
    """
    # x / (1 + |x|) lies in (-1, 1); shift and halve to land in (0, 1)
    squashed = x / (1 + abs(x))
    return np.round((1 + squashed) * .5, precision)

# Environment

In [None]:
class ForexEnv:
    """Tick-data forex trading environment with a ``reset``/``step`` interface.

    The state is ``[entry_action, bid_fluctuation_pct, ask_fluctuation_pct, equity_pct]``
    (see ``state_space``) and the actions are BUY (0), SELL (1) and HOLD (2)
    (see ``constant_values``).
    """

    def __init__(self, source_path, filename, nrows=None, train_size=.7, train=True, random_size=.8):
        """Load the tick file and keep either its train or its test portion.

        Args:
            source_path: Directory prefix, concatenated directly with ``filename``.
                Falls back to ``SOURCE_PATH_DATA`` when empty/None.
            filename: Name of the CSV file. Falls back to
                ``DAT_ASCII_{currency_pair}_T_201901.csv`` (built from the
                notebook-level ``currency_pair``) when empty/None.
            nrows: Optional maximum number of rows to read.
            train_size: Fraction of the rows that belongs to the train split.
            train: Keep the train split when True, the remaining rows otherwise.
            random_size: Fraction of the kept rows from which ``reset`` may
                draw a random starting index.
        """
        # The arguments used to be overwritten unconditionally; honour them and
        # only fall back to the notebook-level defaults when nothing was passed.
        if not source_path:
            source_path = SOURCE_PATH_DATA
        if not filename:
            filename = f'DAT_ASCII_{currency_pair}_T_201901.csv'
        self.__train_test_split(source_path, filename, nrows=nrows, train_size=train_size, train=train)

        self.random_range = int(len(self.indexes) * random_size)

    def __train_test_split(self, source_path, filename, chunk_size=50_000, nrows=None, train_size=.7, train=True):
        """Read the CSV in chunks and store the selected split as NumPy arrays.

        Sets ``self.indexes``, ``self.datetimes``, ``self.bids`` and ``self.asks``.
        """
        source_file = f'{source_path}{filename}'
        df_chunks = pd.read_csv(source_file, sep=',',
                                header=None, names=['datetime', 'bid', 'ask', 'vol'],
                                usecols=['datetime', 'bid', 'ask'],
                                dtype={'datetime': str},
                                chunksize=chunk_size, nrows=nrows)
        timeseries_df = pd.concat(df_chunks)

        # Parse the timestamps after reading rather than through read_csv's
        # `date_parser` argument, which is deprecated in recent pandas releases.
        timeseries_df['datetime'] = pd.to_datetime(timeseries_df['datetime'], format="%Y%m%d %H%M%S%f")

        # Use the number of rows actually read: `nrows` may exceed the file length
        row_count  = len(timeseries_df)
        split_size = round(row_count * train_size)

        if train:
            timeseries_df = timeseries_df[:split_size].reset_index(drop=True)
        else:
            timeseries_df = timeseries_df[split_size:].reset_index(drop=True)

        self.indexes   = timeseries_df.index.values
        self.datetimes = timeseries_df['datetime'].values
        self.bids      = timeseries_df['bid'].values
        self.asks      = timeseries_df['ask'].values

    def constant_values(self):
        """Return the trade status labels and the action codes used by the environment."""
        return {
            'TRADE_STATUS': {
                'OPEN': 'OPEN',
                'CLOSE': 'CLOSED',
                'CLOSE_TRADE': 'CLOSE_TRADE'
            },
            'TRADE_ACTION': {
                'DEFAULT': -1,
                'BUY': 0,
                'SELL': 1,
                'HOLD': 2
            }
        }

    def state_space(self):
        """Return the names of the state features, in state-vector order."""
        return np.array(['entry_action', 'bid_fluctuation_pct', 'ask_fluctuation_pct', 'equity_pct'])

    def state_size(self):
        """Return the number of state features."""
        return len(self.state_space())

    def action_space(self):
        """Return a fresh list ``[BUY, SELL, HOLD]`` of action codes."""
        const_action_dict = self.constant_values()['TRADE_ACTION']
        return [const_action_dict['BUY'], const_action_dict['SELL'], const_action_dict['HOLD']]

    def action_size(self):
        """Return the number of actions."""
        return len(self.action_space())

    def available_actions(self):
        """Return the actions allowed now: all actions minus the action of the first open trade."""
        const_status_dict = self.constant_values()['TRADE_STATUS']
        actions = self.action_space()

        # Have open trades
        trade_dict = self.trading_params_dict['trade_dict']
        if const_status_dict['OPEN'] in trade_dict['status']:
            open_index  = trade_dict['status'].index(const_status_dict['OPEN'])
            open_action = trade_dict['action'][open_index]

            # Ensure agent is able to have only 1 open trade while trading
            actions.remove(open_action)
        return actions

    def __price_by_action(self, action, bid, ask, closed_trade):
        """Return the price recorded for ``action``, depending on whether it closes or opens a trade."""
        const_action_dict = self.constant_values()['TRADE_ACTION']

        # Close trade by Selling at Ask price, and Buying at Bid price
        if closed_trade:
            return bid if action == const_action_dict['BUY'] else ask

        # Open trade by Buying at Ask price, and Selling at Bid price
        else:
            return ask if action == const_action_dict['BUY'] else bid

    def __profit_by_action(self, entry_action, entry_price, curr_bid, curr_ask):
        """Return the per-unit price difference of one trade.

        BUY entries are valued as ``curr_ask - entry_price``, SELL entries as
        ``entry_price - curr_bid``; any other entry action yields 0.
        """
        const_action_dict = self.constant_values()['TRADE_ACTION']
        if entry_action == const_action_dict['BUY']:
            return curr_ask - entry_price

        elif entry_action == const_action_dict['SELL']:
            return entry_price - curr_bid
        return 0

    def update_timestep(self, index):
        """Move to row ``index``.

        Returns:
            False when the row exists; True (and an empty ``self.timestep``)
            when ``index`` is past the end of the data.
        """
        try:
            self.timestep = {
                'index': self.indexes[index],
                'datetime': self.datetimes[index],
                'bid': self.bids[index],
                'ask': self.asks[index]
            }
            return False

        # Only "ran out of data" means done; other errors should surface
        except IndexError:
            self.timestep = {}
            return True

    def scale_state(self, state):
        """Return ``state`` with every feature passed through ``min_max_scale``."""
        entry_action, bid_fluctuation_pct, ask_fluctuation_pct, equity_pct = state

        scaled_state      = min_max_scale(entry_action, -1, 2)
        scaled_bid_pct    = min_max_scale(bid_fluctuation_pct, -100, 100)
        scaled_ask_pct    = min_max_scale(ask_fluctuation_pct, -100, 100)
        scaled_equity_pct = min_max_scale(equity_pct, -100, 100)

        return np.array([scaled_state, scaled_bid_pct, scaled_ask_pct, scaled_equity_pct])

    def reset(self, random=True):
        """Reset state, timestep and trading account.

        Args:
            random: Start at a random row within the first ``random_range``
                rows when True, at row 0 otherwise.

        Returns:
            The initial (unscaled) state vector.
        """
        # State
        self.default_entry_action        = self.constant_values()['TRADE_ACTION']['DEFAULT']
        self.default_bid_fluctuation_pct = 0.
        self.default_ask_fluctuation_pct = 0.
        self.default_equity_pct          = 100.

        entry_action        = self.default_entry_action
        bid_fluctuation_pct = self.default_bid_fluctuation_pct
        ask_fluctuation_pct = self.default_ask_fluctuation_pct
        equity_pct          = self.default_equity_pct
        self.observe_bid    = None
        self.observe_ask    = None
        self.state = np.array([entry_action, bid_fluctuation_pct, ask_fluctuation_pct, equity_pct])

        # Timestep
        index = np.random.choice(self.indexes[:self.random_range]) if random else 0
        self.update_timestep(index)

        # Trading
        self.trading_params_dict = {
            'orig_bal': 1_000_000.,
            'acct_bal': 1_000_000.,
            'unit':     100_000.,

            'trade_dict': {
                'action':   [],
                'datetime': [],
                'price':    [],
                'status':   [],
                'profits':  [],
                'acct_bal': []
            }
        }
        return self.state

    def step(self, action):
        """Apply ``action`` at the current timestep and advance to the next row.

        Args:
            action: One of the codes returned by ``action_space``.

        Returns:
            Tuple ``(state, reward, done, info_dict)``. ``reward`` is the
            realised profit of the trades closed by this action (0 otherwise);
            ``info_dict`` carries ``closed_trade`` and ``sufficient_margin``.
        """
        const_action_dict = self.constant_values()['TRADE_ACTION']
        const_status_dict = self.constant_values()['TRADE_STATUS']
        unit = self.trading_params_dict['unit']

        self.observe_bid = self.timestep['bid'] if self.observe_bid is None else self.observe_bid
        self.observe_ask = self.timestep['ask'] if self.observe_ask is None else self.observe_ask

        trade_dict = self.trading_params_dict['trade_dict']

        # Get entry action & price
        # - if there's no entry action, treat current action as action to open a trade
        # - if there's entry action, treat current action as action to close a trade
        # NOTE: not to use pd.DataFrame() to convert trade_dict to dataframe, as it is slower
        try:
            open_index = trade_dict['status'].index(const_status_dict['OPEN'])
        except ValueError:
            # No open trade
            open_index   = None
            trade_prices = []
            entry_action = self.default_entry_action
        else:
            trade_prices = trade_dict['price'][open_index:]
            entry_action = trade_dict['action'][open_index]

            # Not allowed to close open trades with same entry action
            if entry_action == action:
                trade_prices = []

        profit = 0
        closed_trade = False
        sufficient_margin = True
        if action in [const_action_dict['BUY'], const_action_dict['SELL']]:
            # Close open trades
            for trade_index, trade_price in enumerate(trade_prices):
                # Scale each trade on its own so earlier trades are not multiplied by `unit` again
                profit += self.__profit_by_action(entry_action, trade_price, self.timestep['bid'], self.timestep['ask']) * unit

                # Rows from open_index onwards are the open trades; address them by
                # position instead of by datetime, which breaks on repeated timestamps
                trade_dict['status'][open_index + trade_index] = const_status_dict['CLOSE']
                closed_trade = True
            profit = round(profit, 5)

            # Add trade transaction
            self.trading_params_dict['acct_bal'] += profit
            price = self.__price_by_action(action, self.timestep['bid'], self.timestep['ask'], closed_trade)

            # Add back free margin upon close trade
            if closed_trade:
                self.trading_params_dict['acct_bal'] += (len(trade_prices) * unit)

            # Deduct required margin upon opening trade
            else:
                required_margin = unit
                if self.trading_params_dict['acct_bal'] < required_margin:
                    sufficient_margin = False
                self.trading_params_dict['acct_bal'] -= required_margin

            trade_dict['action'].append(action)
            trade_dict['datetime'].append(self.timestep['datetime'])
            trade_dict['price'].append(price)
            trade_dict['status'].append(const_status_dict['CLOSE_TRADE'] if closed_trade else const_status_dict['OPEN'])
            trade_dict['profits'].append(profit)
            trade_dict['acct_bal'].append(round(self.trading_params_dict['acct_bal'], 5))

        # Calculate floating P/L (summed over every open trade)
        float_profit = 0
        if entry_action != self.default_entry_action and not closed_trade:
            for trade_price in trade_prices:
                float_profit += self.__profit_by_action(entry_action, trade_price, self.timestep['bid'], self.timestep['ask']) * unit
            float_profit = round(float_profit, 5)

        # Calculate equity %
        equity_pct = (self.trading_params_dict['acct_bal'] + float_profit) / self.trading_params_dict['orig_bal'] * 100
        equity_pct = round(equity_pct, 5)

        # Observe the price at current timestep if open or closed trades, else observe the entry price
        if closed_trade or action != const_action_dict['HOLD']:
            self.observe_bid = self.timestep['bid']
            self.observe_ask = self.timestep['ask']

        # Calculate fluctuation %
        bid_fluctuation_pct = round((self.timestep['bid'] - self.observe_bid) / self.observe_bid * 100, 5)
        ask_fluctuation_pct = round((self.timestep['ask'] - self.observe_ask) / self.observe_ask * 100, 5)

        # State
        if closed_trade:
            state_entry_action = self.default_entry_action
        elif action == const_action_dict['HOLD']:
            state_entry_action = self.state[0]
        else:
            state_entry_action = action
        next_state = np.array([state_entry_action, bid_fluctuation_pct, ask_fluctuation_pct, equity_pct])
        self.state = next_state

        # Reward
        reward = profit

        done = self.update_timestep(self.timestep['index'] + 1)
        if not done:
            # Stop trading if do not have balance, and there's no open trade
            if self.trading_params_dict['acct_bal'] <= unit and const_status_dict['OPEN'] not in trade_dict['status']:
                done = True

            # Stop trading if do not have enough balance to pay for required margin
            elif not sufficient_margin:
                done = True

        # Additional information
        info_dict = {
            'closed_trade': closed_trade,
            'sufficient_margin': sufficient_margin
        }
        return (self.state, reward, done, info_dict)

# Replay Buffer

### Normal Replay

In [None]:
class ReplayBuffer:
    """Experience store that grows up to ``memory_size`` rows and then overwrites the oldest slot.

    Transitions are kept in parallel NumPy arrays (states, one-hot actions,
    rewards, next states, and ``1 - done`` flags).
    """

    def __init__(self, memory_size, state_size, action_size):
        """Create empty arrays sized for ``state_size`` features and ``action_size`` actions."""
        self.memory_size = memory_size
        self.counter     = 0

        empty_states     = (0, state_size)
        self.states      = np.empty(empty_states)
        self.actions     = np.empty((0, action_size), dtype=np.int8)
        self.rewards     = np.empty(0)
        self.next_states = np.empty(empty_states)
        self.terminals   = np.empty(0, dtype=np.float32)

    def preprocess(self, experience):
        """Turn ``(state, action, reward, next_state, done)`` into its stored form.

        The action index becomes a one-hot int8 vector and ``done`` becomes ``1 - done``.
        """
        state, action, reward, next_state, done = experience

        # One-Hot encoding
        encoded_action = np.zeros(self.actions.shape[1], dtype=np.int8)
        encoded_action[action] = 1

        not_done = 1 - done
        return (state, encoded_action, reward, next_state, not_done)

    # NOTE: not to use collections.deque, as it is much slower while perform sampling
    def __deque(self, store_index, element, elements):
        """Write ``element`` at ``store_index``; append it when that slot does not exist yet."""
        try:
            elements[store_index] = element
        except IndexError:
            return np.append(elements, [element], axis=0)

        return elements

    def store_transition(self, state, action, reward, next_state, done):
        """Preprocess one transition and store it in the slot ``counter % memory_size``."""
        state, one_hot, reward, next_state, terminal = self.preprocess((state, action, reward, next_state, done))

        slot = self.counter % self.memory_size
        columns = (
            ('states', state),
            ('actions', one_hot),
            ('rewards', reward),
            ('next_states', next_state),
            ('terminals', terminal),
        )
        for attribute, item in columns:
            setattr(self, attribute, self.__deque(slot, item, getattr(self, attribute)))

        self.counter += 1

    def currexp_to_sample(self, experience, experiences):
        """Append the preprocessed ``experience`` (if any) as the last row of the sampled batch."""
        states, actions, rewards, next_states, terminals = experiences

        if experience is None:
            return (states, actions, rewards, next_states, terminals)

        # Include current experience
        state, action, reward, next_state, terminal = self.preprocess(experience)
        return (
            np.vstack([states, state]),
            np.vstack([actions, action]),
            np.append(rewards, reward),
            np.vstack([next_states, next_state]),
            np.append(terminals, terminal),
        )

    def sample(self, batch_size, experience=None):
        """Draw a uniform random minibatch (with replacement) of ``batch_size`` rows.

        When ``experience`` is given, one row fewer is drawn and the current
        experience fills the last row.
        """
        draw_count = batch_size if experience is None else batch_size -1
        picks = np.random.choice(len(self.states), draw_count)

        stored = (self.states, self.actions, self.rewards, self.next_states, self.terminals)
        drawn = tuple(column[picks] for column in stored)

        return self.currexp_to_sample(experience, drawn)

### Weighted Sample Replay

In [None]:
class WSRBuffer(ReplayBuffer):
    """Replay buffer whose sampling favours transitions with large absolute rewards."""

    def __init__(self, memory_size, state_size, action_size, unusual_sample_factor=.99):
        """Create the buffer.

        Args:
            memory_size: Maximum number of stored transitions.
            state_size: Number of state features.
            action_size: Number of actions.
            unusual_sample_factor: Controls how strongly high-|reward|
                experiences are preferred over low-|reward| ones. The lower
                the value, the bigger the difference; 1 means no difference.
                Reference: https://medium.com/ml-everything/reinforcement-learning-with-sparse-rewards-8f15b71d18bf
        """
        super().__init__(memory_size, state_size, action_size)
        self.unusual_sample_factor = unusual_sample_factor

    def sample(self, batch_size, experience=None):
        """Draw a minibatch where rank ``k`` (by descending |reward|) has weight ``factor ** k``.

        When ``experience`` is given, one row fewer is drawn and the current
        experience fills the last row.
        """
        # Buffer positions ordered from largest to smallest absolute reward
        ranked = np.argsort(-np.abs(self.rewards))

        # Geometric weights per rank, normalised so they sum to 1
        weights = np.power(self.unusual_sample_factor, np.arange(len(ranked)))
        weights = weights / np.sum(weights)

        # Sample ranks, then translate them back to buffer positions
        draw_count = batch_size if experience is None else batch_size -1
        drawn_ranks = np.random.choice(np.arange(len(weights)), size=draw_count, p=weights)
        picks = ranked[drawn_ranks]

        stored = (self.states, self.actions, self.rewards, self.next_states, self.terminals)
        drawn = tuple(column[picks] for column in stored)

        return self.currexp_to_sample(experience, drawn)

# Model

In [None]:
class MatrixModel:
    """Tabular action-value store: one row of action values per distinct state seen so far."""

    def __init__(self, state_size, action_size, init_value=.0):
        """Create an empty table.

        Args:
            state_size: Number of features in a state vector.
            action_size: Number of actions (columns of the value table).
            init_value: Value given to every action of a newly seen state.
        """
        self.state_size  = state_size
        self.action_size = action_size
        self.init_value  = init_value

        self.states = np.empty((0, self.state_size))
        self.values = np.empty((0, self.action_size))

    def state_values(self, state):
        """Return the row of action values for ``state``, registering the state if unseen.

        The returned row is a view into ``self.values``, so writing to it
        updates the table (``set_state_action`` relies on this).

        Args:
            state: NumPy array with ``state_size`` entries.
        """
        try:
            index = self.states.tolist().index(state.tolist())
        except ValueError:
            # Unseen state: append it with initial values and return that new row directly.
            # (The previous recursive retry never terminated for states that do not
            # compare equal to themselves, e.g. ones containing NaN.)
            self.states = np.append(self.states, [state], axis=0)
            self.values = np.append(self.values, [[self.init_value] * self.action_size], axis=0)
            index = len(self.values) - 1

        return self.values[index]

    def state_action_value(self, state, action):
        """Return the stored value of ``action`` in ``state``."""
        values = self.state_values(state)
        return values[action]

    def set_state_action(self, state, action, value):
        """Store ``value`` for ``action`` in ``state``."""
        values = self.state_values(state)

        # `values` is a view of the table row, so this assignment updates self.values in place
        values[action] = value

In [None]:
class SequentialNetworkModel:
    """Fully connected Keras network mapping a state vector to one linear output per action."""

    def __init__(self, state_size, action_size, alpha, neurons=[]):
        """Build and compile the network.

        Args:
            state_size: Width of the input layer.
            action_size: Width of the linear output layer.
            alpha: Learning rate handed to the RMSprop optimizer.
            neurons: Sizes of the hidden ReLU layers, in order; empty means
                the input feeds the output layer directly.
        """
        # Input layer
        inputs = Input(shape=(state_size,))

        # Hidden layers: each one consumes the output of the previous layer
        previous = inputs
        for units in neurons:
            previous = Dense(units, activation='relu', kernel_initializer='he_uniform')(previous)

        # Output layer
        outputs = Dense(action_size, activation='linear')(previous)

        self.model = Model(inputs=inputs, outputs=outputs)
        self.model.compile(optimizer=RMSprop(lr=alpha), loss='mse')
        # Reference: https://medium.com/@jonathan_hui/rl-dqn-deep-q-network-e207751f7ae4
        # self.model.compile(optimizer=RMSprop(lr=alpha), loss=Huber(delta=1.0))

    def model_diagram(self, filename):
        """Write a diagram of the model (with shapes) to ``OUT_PATH_IMAGE`` as ``<filename>.png``."""
        image_dir = OUT_PATH_IMAGE
        create_directory(image_dir)

        target = f'{image_dir}{filename}.png'
        plot_model(self.model, to_file=target, show_shapes=True)

# Agent

In [None]:
class Agent:
    """Baseline agent: picks a random available action and does not learn."""

    def __init__(self, env):
        """Keep a reference to ``env`` and create NaN-filled hyperparameter placeholders."""

        def placeholder(rate_key):
            # Every hyperparameter has min/max/<rate>/value entries, all unset (NaN) here
            return {'min': np.nan, 'max': np.nan, rate_key: np.nan, 'value': np.nan}

        self.hyperparams_dict = {
            'epsilon': placeholder('decay'),
            'alpha': placeholder('decay'),
            'gamma': placeholder('increase'),
        }
        self.env = env

    def choose_action(self, state):
        """Return a uniformly random action among ``env.available_actions()``; ``state`` is ignored."""
        candidates = self.env.available_actions()
        return random.choice(candidates)

    def learn(self, experience, next_action, episode):
        """No-op; subclasses implement the actual update."""
        pass

## Off-Policy Agent

### Q-Learning

In [None]:
class QLearningAgent(Agent):
    """Tabular Q-learning agent with epsilon-greedy action selection."""

    def __init__(self, env, build_model=True):
        """Set the hyperparameters and, optionally, build the value table.

        Args:
            env: Environment providing ``state_size``, ``action_size``,
                ``action_space`` and ``available_actions``.
            build_model: Build ``self.main_model`` here when True. Subclasses
                whose ``build_model`` takes different arguments pass False.
        """
        super().__init__(env)

        self.hyperparams_dict = {
            'epsilon': {
                'min': .01, 'max': 1., 'decay': .001, 'value': 1.
            },
            'alpha': {
                'min': .001, 'max': .9, 'decay': .001, 'value': .9
            },
            'gamma': {
                'min': .9, 'max': .9, 'increase': .0, 'value': .9
            }
        }
        self.env = env
        if build_model:
            self.main_model = self.build_model(env.state_size(), env.action_size())

    def build_model(self, state_size, action_size, init_value=.0):
        """Return a ``MatrixModel`` value table."""
        return MatrixModel(state_size, action_size, init_value)

#     # TODO
#     def save_model_checkpoint(self):
#         out_path = OUT_PATH_FILE
#         create_directory(out_path)
#         self.main_model.save(f'{out_path}{self.model_file}')

#     def load_model_checkpoint(self):
#         self.main_model = load_model(f'{OUT_PATH_FILE}{self.model_file}')

    def __random_argmax(self, q_values, random_if_empty=False):
        """Return the index of the maximum of ``q_values``, breaking ties at random.

        Args:
            q_values: 1-D array of action values.
            random_if_empty: When no index equals the maximum (e.g. the values
                contain NaN), return a random index instead of raising.

        Raises:
            ValueError: No index equals the maximum and ``random_if_empty`` is False.
        """
        # Reference:
        # https://gist.github.com/stober/1943451
        indexes = np.nonzero(q_values == np.amax(q_values))[0]

        try:
            return np.random.choice(indexes)
        # np.random.choice raises ValueError for an empty `indexes`; the clause used to
        # name an undefined exception class, which turned any failure into a NameError
        except ValueError:
            if not random_if_empty:
                raise

            return np.random.choice(indexes if indexes.size > 0 else q_values.size)

    def exploitation(self, q_values):
        """Return the available action with the highest Q-value.

        If the best action is not currently available it is dropped and the
        best of the remaining actions is returned.
        """
        # Validation whether action with highest Q-value is valid
        actions      = self.env.action_space()
        action_index = self.__random_argmax(q_values)
        action       = actions[action_index]

        if action not in self.env.available_actions():
            # Remove action if it's not a valid action on current state
            q_values = np.delete(q_values, action_index)
            del actions[action_index]

            # Select action with 2nd highest Q-value
            action_index = self.__random_argmax(q_values)
            action       = actions[action_index]

        return action

    def choose_action(self, state):
        """Pick a random available action with probability epsilon, else the greedy one."""
        # Exploration
        if np.random.uniform(0, 1) <= self.hyperparams_dict['epsilon']['value']:
            return random.choice(self.env.available_actions())

        # Exploitation
        else:
            # Copy so that exploitation cannot alter the stored values
            q_values = self.main_model.state_values(state).copy()
            return self.exploitation(q_values)

    def adjust_hyperparams(self, param_name, episode):
        """Move a hyperparameter exponentially from its start towards its end value.

        'epsilon' and 'alpha' decay from 'max' towards 'min'; 'gamma' grows
        from 'min' towards 'max'.

        Raises:
            ValueError: ``param_name`` is not 'epsilon', 'alpha' or 'gamma'.
        """
        if param_name in ['epsilon', 'alpha']:
            min_value = self.hyperparams_dict[param_name]['min']
            max_value = self.hyperparams_dict[param_name]['max']
            rate      = self.hyperparams_dict[param_name]['decay']

        elif param_name == 'gamma':
            # Swap min. max. for incrementing
            min_value = self.hyperparams_dict[param_name]['max']
            max_value = self.hyperparams_dict[param_name]['min']
            rate      = self.hyperparams_dict[param_name]['increase']

        else:
            # Previously fell through to an UnboundLocalError below
            raise ValueError(f'Unsupported hyperparameter: {param_name}')

        self.hyperparams_dict[param_name]['value'] = min_value + (max_value - min_value) * np.exp(-rate * episode)

    def learn(self, experience, next_action, episode):
        """Apply one Q-learning update and, at episode end, adjust the hyperparameters.

        Args:
            experience: Tuple ``(state, action, reward, next_state, done)``.
            next_action: Unused here.
            episode: Episode number used for the hyperparameter schedules.
        """
        state, action, reward, next_state, done = experience

        # Q[s, a] = Q[s, a] + alpha * (reward + gamma * Max[Q(s’, A)] - Q[s, a])
        max_q_value = np.max(self.main_model.state_values(next_state))
        q_value = self.main_model.state_action_value(state, action)
        q_value = q_value + self.hyperparams_dict['alpha']['value'] * (reward + self.hyperparams_dict['gamma']['value'] * max_q_value - q_value)
        self.main_model.set_state_action(state, action, round(q_value, 10))

        # Adjust hyperparameters
        if done:
            self.adjust_hyperparams('epsilon', episode)
            self.adjust_hyperparams('alpha', episode)
            self.adjust_hyperparams('gamma', episode)

### Q-Network

In [None]:
class QNetworkAgent(QLearningAgent):
    """Q-learning agent whose action values come from a Keras network trained on replayed experience."""

    def __init__(self, env, build_model=True,
                 sample_size=1, memory_size=1, model_file='QNetwork_MODEL.H5',
                 neurons=[]):
        """Set hyperparameters, replay memory and (optionally) the network.

        Args:
            env: Environment the agent acts in.
            build_model: Build the network here when True.
            sample_size: Number of transitions per training batch (including the current one).
            memory_size: Capacity of the replay memory.
            model_file: File name used by the checkpoint methods.
            neurons: Hidden layer sizes passed to ``SequentialNetworkModel``.
        """
        super().__init__(env, build_model=False)

        self.hyperparams_dict = {
            'epsilon': {
                'min': .01, 'max': 1., 'decay': .0005, 'value': 1.
            },
            'alpha': {
                'min': .00025, 'max': .01, 'decay': .001, 'value': .01
            },
            'gamma': {
                'min': .9, 'max': .9, 'increase': .0, 'value': .9
            }
        }
        # self.memory       = ReplayBuffer(memory_size, self.env.state_size(), self.env.action_size())
        self.memory       = WSRBuffer(memory_size, self.env.state_size(), self.env.action_size(),
                                      unusual_sample_factor=.99)
        self.sample_size  = sample_size
        self.model_file   = model_file

        if build_model:
            self.main_network = self.build_model(self.env.state_size(), self.env.action_size(),
                                                 self.hyperparams_dict['alpha']['value'], neurons)
            self.main_model   = self.main_network.model

    def build_model(self, state_size, action_size, alpha, neurons):
        """Return a ``SequentialNetworkModel`` wrapper around the compiled network."""
        return SequentialNetworkModel(state_size, action_size, alpha, neurons=neurons)

    def save_model_checkpoint(self):
        """Save ``self.main_model`` to ``OUT_PATH_FILE + self.model_file``."""
        out_path = OUT_PATH_FILE
        create_directory(out_path)
        self.main_model.save(f'{out_path}{self.model_file}')

    def load_model_checkpoint(self):
        """Replace ``self.main_model`` with the model stored at ``OUT_PATH_FILE + self.model_file``."""
        self.main_model = load_model(f'{OUT_PATH_FILE}{self.model_file}')

    def choose_action(self, state):
        """Pick a random available action with probability epsilon, else the greedy one."""
        # Exploration
        if np.random.uniform(0, 1) <= self.hyperparams_dict['epsilon']['value']:
            return random.choice(self.env.available_actions())

        # Exploitation
        else:
            # Change [obs1,obs2,obs3] to [[obs1,obs2,obs3]] format
            state = state[np.newaxis, :]

            # Get Q-values for current state in [[q1,q2,q3]] format
            q_values = self.main_model.predict(state, batch_size=len(state))

            # Change [[q1,q2,q3]] to [q1,q2,q3] format
            q_values = q_values.reshape(-1)

            return self.exploitation(q_values)

    def learn(self, experience, next_action, episode):
        """Store the transition and fit the network on one replayed batch.

        Args:
            experience: Tuple ``(state, action, reward, next_state, done)``.
            next_action: Unused here.
            episode: Episode number used for the hyperparameter schedules.

        Returns:
            False while fewer than ``sample_size`` transitions have been
            stored (no training happens), True after a training step.
        """
        state, action, reward, next_state, done = experience
        self.memory.store_transition(state, action, reward, next_state, done)

        if self.memory.counter < self.sample_size:
            return False

        # states:      [[obs1,obs2,obs3],[obs1,obs2,obs3]...[obs1,obs2,obs3]]
        # actions:     [[0,0,1],[0,1,0]...[1,0,0]]
        # rewards:     [r1,r2...rN]
        # next_states: [[obs1,obs2,obs3],[obs1,obs2,obs3]...[obs1,obs2,obs3]]
        # terminals:   [t1,t2...tN]  (1 - done, i.e. 0 for terminal transitions)
        states, actions, rewards, next_states, terminals = self.memory.sample(self.sample_size, experience=experience)

        # Change actions from [[0,0,1],[0,1,0]...[1,0,0]] to [2,1...0] format
        action_values  = np.array(self.env.action_space(), dtype=np.int8)
        action_indexes = np.dot(actions, action_values)

        # Reference: https://www.youtube.com/watch?v=5fHngyN8Qhw
        # Get Q-values for states in [[q1,q2,q3],[q1,q2,q3]...[q1,q2,q3]] format
        q_values      = self.main_model.predict(states, batch_size=len(states))
        next_q_values = self.main_model.predict(next_states, batch_size=len(next_states))
        q_targets     = q_values.copy()

        # Calculate Q-targets.
        # The terminal mask zeroes the bootstrap term for transitions that ended an
        # episode, so their target is the reward alone; the mask was sampled before
        # but never applied.
        sample_indexes = np.arange(self.sample_size, dtype=np.int32)
        q_targets[sample_indexes, action_indexes] = rewards + self.hyperparams_dict['gamma']['value'] * np.max(next_q_values, axis=1) * terminals

        # Update Q-targets
        self.main_model.fit(states, q_targets, epochs=1, verbose=0, batch_size=len(states))

        # Adjust hyperparameters
        if done:
            self.adjust_hyperparams('epsilon', episode)
            self.adjust_hyperparams('alpha', episode)
            self.adjust_hyperparams('gamma', episode)

            # Update optimizer learning rate
            K.eval(self.main_model.optimizer.lr.assign(self.hyperparams_dict['alpha']['value']))

        return True

### Deep Q-Network (DQN)

In [None]:
class DQNAgent(QNetworkAgent):
    """Deep Q-Network agent.

    Adds no behaviour of its own: it forwards every argument to `QNetworkAgent`
    and only supplies its own default `model_file` and hidden-layer sizes.
    """

    def __init__(self, env,
                 sample_size=1, memory_size=1, model_file='DQN_MODEL.H5',
                 neurons=None):
        """Create the agent.

        Args:
            env: environment the agent interacts with (forwarded to the parent).
            sample_size: forwarded to the parent class.
            memory_size: forwarded to the parent class.
            model_file: forwarded to the parent class.
            neurons: hidden-layer sizes; `None` selects `[1_024, 512, 256]`.
        """
        # A list used directly as the default would be created once and shared by
        # every instance, so build a fresh one per call instead.
        if neurons is None:
            neurons = [1_024, 512, 256]

        super().__init__(env, sample_size=sample_size, memory_size=memory_size, model_file=model_file,
                         neurons=neurons)

### Double DQN

In [None]:
class DoubleDQNAgent(DQNAgent):
    """Double DQN agent.

    Keeps a second ("target") network next to the main network. During training
    the main network picks the best next action, the target network supplies the
    value of that action, and after every training step the target weights are
    blended towards the main weights by the factor `tau`.
    """

    def __init__(self, env,
                 sample_size=1, memory_size=1, model_file='DoubleDQN_MODEL.H5',
                 neurons=None):
        """Build the main network (through the parent class) and the target network.

        Args:
            env: environment; `state_size()`, `action_size()` and `action_space()`
                are called on it by this class.
            sample_size: forwarded to the parent class.
            memory_size: forwarded to the parent class.
            model_file: forwarded to the parent class.
            neurons: hidden-layer sizes used for both networks; `None` selects
                `[1_024, 512, 256]`.
        """
        # A list used directly as the default would be created once and shared by
        # every instance, so build a fresh one per call instead.
        if neurons is None:
            neurons = [1_024, 512, 256]

        super().__init__(env, sample_size=sample_size, memory_size=memory_size, model_file=model_file,
                         neurons=neurons)

        # Target Network: same architecture as the main one, starting from the same weights
        self.hyperparams_dict['tau'] = .125
        self.target_network = self.build_model(self.env.state_size(), self.env.action_size(),
                                               self.hyperparams_dict['alpha']['value'], neurons)
        self.target_model   = self.target_network.model
        self.target_model.set_weights(self.main_model.get_weights())

    def learn(self, experience, next_action, episode):
        """Store one transition and run one Double DQN training step.

        Args:
            experience: tuple `(state, action, reward, next_state, done)`.
            next_action: not used by this agent; accepted so that every agent can
                be called with the same `learn` arguments.
            episode: passed to `adjust_hyperparams` when the episode has ended.

        Returns:
            `False` when the replay memory counter is still below `sample_size`
            (the transition is stored but nothing is trained), otherwise `True`.
        """
        state, action, reward, next_state, done = experience
        self.memory.store_transition(state, action, reward, next_state, done)

        if self.memory.counter < self.sample_size:
            return False

        # states:      [[obs1,obs2,obs3],[obs1,obs2,obs3]...[obs1,obs2,obs3]]
        # actions:     [[0,0,1],[0,1,0]...[1,0,0]]
        # rewards:     [r1,r2...rN]
        # next_states: [[obs1,obs2,obs3],[obs1,obs2,obs3]...[obs1,obs2,obs3]]
        # terminals:   [t1,t2...tN]
        states, actions, rewards, next_states, terminals = self.memory.sample(self.sample_size, experience=experience)

        # Change actions from [[0,0,1],[0,1,0]...[1,0,0]] to [2,1...0] format
        action_values  = np.array(self.env.action_space(), dtype=np.int8)
        action_indexes = np.dot(actions, action_values)

        # Fixed Q-Target
        # Reference: https://www.youtube.com/watch?v=UCgsv6tMReY
        # Get Q-values in [[q1,q2,q3],[q1,q2,q3]...[q1,q2,q3]] format:
        #   target_next_q: next states scored by the target network (supplies the value)
        #   main_next_q:   next states scored by the main network (selects the action)
        #   q_targets:     current states scored by the main network (training labels)
        target_next_q = self.target_model.predict(next_states, batch_size=len(next_states))
        main_next_q   = self.main_model.predict(next_states, batch_size=len(next_states))
        q_targets     = self.main_model.predict(states, batch_size=len(states))

        # Calculate Q-targets: only the entry of the action actually taken is overwritten,
        # so every other output contributes zero error during `fit`
        gamma             = self.hyperparams_dict['gamma']['value']
        best_next_actions = np.argmax(main_next_q, axis=1)
        sample_indexes    = np.arange(self.sample_size, dtype=np.int32)
        # NOTE(review): `terminals` is sampled but never applied, so transitions that end an
        # episode still bootstrap from the next state. Masking needs the encoding used by
        # the replay memory (done flag vs. 1 - done), which is not visible here - confirm.
        q_targets[sample_indexes, action_indexes] = rewards + gamma * target_next_q[sample_indexes, best_next_actions]

        # Update Q-targets
        self.main_model.fit(states, q_targets, epochs=1, verbose=0, batch_size=len(states))

        # Update target network weights (soft update towards the main network)
        # Reference: https://towardsdatascience.com/reinforcement-learning-w-keras-openai-dqns-1eed3a5338c
        tau            = self.hyperparams_dict['tau']
        main_weights   = self.main_model.get_weights()
        target_weights = self.target_model.get_weights()
        blended_weights = [
            main_layer * tau + target_layer * (1 - tau)
            for main_layer, target_layer in zip(main_weights, target_weights)
        ]
        self.target_model.set_weights(blended_weights)

        # Adjust hyperparameters
        if done:
            self.adjust_hyperparams('epsilon', episode)
            self.adjust_hyperparams('alpha', episode)
            self.adjust_hyperparams('gamma', episode)

            # Update optimizer learning rate
            K.eval(self.main_model.optimizer.lr.assign(self.hyperparams_dict['alpha']['value']))

        return True

### Dueling Double DQN

## On-Policy Agent

### SARSA

In [None]:
class SarsaAgent(QLearningAgent):
    """On-policy SARSA agent that reuses the `QLearningAgent` set-up."""

    def __init__(self, env):
        """Initialise exactly like the parent class."""
        super().__init__(env)

    def learn(self, experience, next_action, episode):
        """Apply one SARSA update for a single transition.

        Args:
            experience: tuple `(state, action, reward, next_state, done)`.
            next_action: action whose value at `next_state` is used as the
                bootstrap term Q(s', a').
            episode: passed to `adjust_hyperparams` when the episode has ended.
        """
        state, action, reward, next_state, done = experience

        # Look up Q(s', a') first and Q(s, a) second
        bootstrap_q = self.main_model.state_action_value(next_state, next_action)
        current_q   = self.main_model.state_action_value(state, action)

        # Q[s, a] = Q[s, a] + alpha * (reward + gamma * Q(s', a') - Q[s, a])
        alpha = self.hyperparams_dict['alpha']['value']
        gamma = self.hyperparams_dict['gamma']['value']
        td_error  = reward + gamma * bootstrap_q - current_q
        updated_q = current_q + alpha * td_error
        self.main_model.set_state_action(state, action, round(updated_q, 10))

        # Hyperparameters are only adjusted once the episode has ended
        if not done:
            return

        for hyperparam in ('epsilon', 'alpha', 'gamma'):
            self.adjust_hyperparams(hyperparam, episode)

### SARSA (λ)

In [None]:
class SarsaLambdaAgent(SarsaAgent):
    """SARSA(lambda) agent: SARSA with an eligibility trace.

    Next to the Q-value model (`main_model`) the agent keeps a second model of the
    same kind (`e_model`) holding one eligibility value per state/action entry.
    Every update moves all Q-values in proportion to their eligibility.
    """

    def __init__(self, env, episodic_trace=False):
        """Create the agent and its eligibility-trace model.

        Args:
            env: environment the agent interacts with.
            episodic_trace: when True, the eligibility trace is rebuilt at the end
                of every episode; when False it carries over between episodes.
        """
        super().__init__(env)

        # Eligibility Trace
        self.hyperparams_dict['elig_trace'] = {
            'init': 1 / env.action_size(),
            'lambda': .9
        }
        self.e_model        = self.__build_e_model()
        self.episodic_trace = episodic_trace

    def __build_e_model(self):
        """Return a new model whose entries start at the trace's initial value."""
        # Use the agent's own environment rather than a notebook-level global, so the
        # trace always matches the environment this agent was created with
        return self.build_model(self.env.state_size(), self.env.action_size(),
                                init_value=self.hyperparams_dict['elig_trace']['init'])

    def choose_action(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: state to act in.

        Returns:
            A random entry of `env.available_actions()` with probability epsilon,
            otherwise the result of `exploitation` on the Q-values of `state`.
        """
        # Exploration
        if np.random.uniform(0, 1) <= self.hyperparams_dict['epsilon']['value']:
            return random.choice(self.env.available_actions())

        # Exploitation
        else:
            q_values = self.main_model.state_values(state).copy()
            # Ensure state added to main model is added to eligibility trace as well
            self.e_model.state_values(state)
            return self.exploitation(q_values)

    def learn(self, experience, next_action, episode):
        """Apply one SARSA(lambda) update for a single transition.

        Args:
            experience: tuple `(state, action, reward, next_state, done)`.
            next_action: action whose value at `next_state` is used as the
                bootstrap term Q(s', a').
            episode: passed to `adjust_hyperparams` when the episode has ended.
        """
        state, action, reward, next_state, done = experience

        # Reference:
        # https://naifmehanna.com/2018-10-18-implementing-sarsa-in-python/
        next_q_value = self.main_model.state_action_value(next_state, next_action)
        q_value = self.main_model.state_action_value(state, action)

        # Ensure state added to main model is added to eligibility trace as well
        self.e_model.state_action_value(next_state, next_action)
        # Bump the eligibility of the entry that was just visited
        e_value = self.e_model.state_action_value(state, action)
        self.e_model.set_state_action(state, action, e_value + 1)

        alpha        = self.hyperparams_dict['alpha']['value']
        gamma        = self.hyperparams_dict['gamma']['value']
        trace_lambda = self.hyperparams_dict['elig_trace']['lambda']
        td_error     = reward + gamma * next_q_value - q_value

        states_q_values = self.main_model.values
        states_e_values = self.e_model.values

        # Calculate & Update Q-matrix: every entry moves in proportion to its eligibility
        states_q_values = states_q_values + alpha * td_error * states_e_values
        self.main_model.values = np.round(states_q_values, 10)

        # Decay & Update E-matrix
        states_e_values = states_e_values * gamma * trace_lambda
        self.e_model.values = np.round(states_e_values, 10)

        # Adjust hyperparameters
        if done:
            self.adjust_hyperparams('epsilon', episode)
            self.adjust_hyperparams('alpha', episode)
            self.adjust_hyperparams('gamma', episode)

            # Re-initialize Eligibility Trace on each episode:
            # https://stackoverflow.com/questions/29904270/eligibility-trace-reinitialization-between-episodes-in-sarsa-lambda-implementati
            if self.episodic_trace:
                self.e_model = self.__build_e_model()
                self.e_model.states = self.main_model.states.copy()
                # The trace is multiplied element-wise with the Q-values in the update
                # above, so it is sized from the Q-value table rather than from the
                # stored states (whose shape need not match the number of actions)
                self.e_model.values = np.full(self.main_model.values.shape, self.hyperparams_dict['elig_trace']['init'])

# Initialize Environment

In [None]:
# Instrument to trade and the data file to read from SOURCE_PATH_DATA
currency_pair = 'EURUSD'
filename      = f'DAT_ASCII_{currency_pair}_T_201901.csv'

# Environment object used by the charts and by `train` further below
env = ForexEnv(
    SOURCE_PATH_DATA,
    filename,
    nrows=200,
    train_size=.5,
)

### Chart: Environment

In [None]:
EXEC_START = time.time()


# One line per price series; both lines share the same date-time axis
price_types   = ['bid', 'ask']
datetime_axis = pd.DataFrame(env.datetimes)[0]

data = [
    go.Scattergl(
        x = datetime_axis,
        y = price_series,
        mode = 'lines',
        name = price_type.title()
    )
    for price_type, price_series in zip(price_types, [env.bids, env.asks])
]

title = f'{currency_pair} - Forex Environment'
plot_graph(data, title, 'Date Time', 'Price')


EXEC_END = time.time()
time_taken(EXEC_END - EXEC_START)

# Training

In [None]:
def train(episodes, agent_type):
    """Train one agent on the notebook-level `env` and collect per-episode results.

    Args:
        episodes: number of training episodes to run.
        agent_type: one of 'Normal', 'Q-Learning', 'Q-Network', 'DQN', 'Double DQN',
            'SARSA' or 'SARSA Lambda'.

    Returns:
        `(result_dict, agent)`. `result_dict` maps 'total_profit', 'used_margin',
        'acct_bal' and 'trades' to lists holding one entry per completed episode.
        When training is interrupted with Ctrl+C, the entries gathered so far are
        returned instead of raising.

    Raises:
        ValueError: if `agent_type` is not one of the supported names.
    """
    sample_size = 1_000
    memory_size = 1_000_000
    neurons = [128]

    # Off-Policy agent
    if agent_type == 'Normal':
        agent = Agent(env)

    elif agent_type == 'Q-Learning':
        agent = QLearningAgent(env)

    elif agent_type == 'Q-Network':
        agent = QNetworkAgent(env, sample_size=sample_size, memory_size=memory_size)
        agent.main_network.model_diagram(agent_type)

    elif agent_type == 'DQN':
        agent = DQNAgent(env, sample_size=sample_size, memory_size=memory_size, neurons=neurons)
        agent.main_network.model_diagram(agent_type)

    elif agent_type == 'Double DQN':
        agent = DoubleDQNAgent(env, sample_size=sample_size, memory_size=memory_size, neurons=neurons)
        agent.main_network.model_diagram(agent_type)

    # On-Policy agent
    elif agent_type == 'SARSA':
        agent = SarsaAgent(env)

    elif agent_type == 'SARSA Lambda':
        agent = SarsaLambdaAgent(env, episodic_trace=False)

    else:
        # Fail with a clear message instead of an unbound `agent` further down
        raise ValueError(f'Unknown agent_type: {agent_type!r}')

    # On-policy agents learn from the next action a', so that same action has to be
    # the one executed on the following step
    on_policy = agent_type in ('SARSA', 'SARSA Lambda')

    # Performance tracking
    result_dict = {
        'total_profit': [],
        'used_margin': [],
        'acct_bal': [],
        'trades': []
    }

    # Only the episode loop is guarded, so the interrupt handler can always rely on
    # `result_dict` and `agent` being defined
    try:
        # Training iteration
        for episode in range(episodes):
            # Walkthrough environment
            done   = False
            state  = env.reset()
            action = None

            while not done:
                # Choose action (unless it was carried over from the previous step)
                scaled_state = env.scale_state(state)
                if action is None:
                    action = agent.choose_action(scaled_state)

                # Take action
                next_state, reward, done, info_dict = env.step(action)

                # Choose next action
                scaled_next_state = env.scale_state(next_state)
                next_action = agent.choose_action(scaled_next_state)

                # Learning
                scaled_reward = reward
                agent.learn((scaled_state, action, scaled_reward, scaled_next_state, done), next_action, episode)

                state  = next_state
                # Off-policy agents pick a fresh action after the update, as before
                action = next_action if on_policy else None

            # Result Summary
            trade_df      = pd.DataFrame(env.trading_params_dict['trade_dict'])
            total_profits = sum(trade_df['profits'])

            open_trade_df = trade_df[trade_df['status'].isin([env.constant_values()['TRADE_STATUS']['OPEN']])]
            used_margin   = sum(open_trade_df['price'] * env.trading_params_dict['unit'])

            result_dict['total_profit'].append(round(total_profits, 5))
            result_dict['used_margin'].append(round(used_margin, 5))
            result_dict['acct_bal'].append(round(env.trading_params_dict['acct_bal'], 5))
            result_dict['trades'].append(env.trading_params_dict['trade_dict'])

            # Progress
            ε = agent.hyperparams_dict['epsilon']['value']
            α = agent.hyperparams_dict['alpha']['value']
            γ = agent.hyperparams_dict['gamma']['value']

            progress = f'EP: {episode+1 :,} / {episodes :,} | ε: {ε :.5f} | α: {α :.5f} | γ: {γ :.5f} | R: {total_profits :,.5f}'

            # Not every agent has a replay memory; show its counter only when present,
            # without a catch-all `except` that would also hide Ctrl+C
            memory_counter = getattr(getattr(agent, 'memory', None), 'counter', None)
            if memory_counter is not None:
                progress += f' | M: {memory_counter :,}'

            print(progress)

    except KeyboardInterrupt:
        print('\n!!! KeyboardInterrupt Exception !!!')

    return result_dict, agent

In [None]:
# Wall-clock start; paired with EXEC_END below to time this cell
EXEC_START = time.time()

# Agent to train: leave exactly one `agent_type` assignment uncommented.
# The names must match the ones `train` compares against.
# agent_type = 'Normal'
agent_type = 'Q-Learning'
# agent_type = 'SARSA'
# agent_type = 'SARSA Lambda'
# agent_type = 'Q-Network'
# agent_type = 'DQN'
# agent_type = 'Double DQN'

# Number of training episodes
episodes = 5_000
# episodes = 3_000

# FOR PROFILING PURPOSE
# (uncomment the %lprun line; its trailing backslash presumably joins it with the
#  call on the next line - verify before relying on it)
# %lprun -f train \
result_dict, agent = train(episodes, agent_type)

EXEC_END = time.time()
# NOTE(review): `time_taken` is defined elsewhere in the notebook; presumably it reports the elapsed seconds
time_taken(EXEC_END - EXEC_START)

In [None]:
# Per-episode summary: one row per episode recorded by `train`
result_df = pd.DataFrame({
    column: result_dict[column]
    for column in ('total_profit', 'acct_bal', 'used_margin')
})
result_df['equity'] = result_df['acct_bal'] + result_df['used_margin']

# Trades plotted by the trade charts further below. Defined here so those cells also
# work on a fresh kernel; this takes the last recorded training episode.
# Use index 0 instead of -1 to inspect the first episode.
trade_df = pd.DataFrame(result_dict['trades'][-1])

### Chart: Rolling Profits

In [None]:
EXEC_START = time.time()


# 100-episode moving average of the profit, computed on the one column that is plotted
rolling_profits = result_df['total_profit'].rolling(100).mean()

data = []
data.append(go.Scattergl(
    # Number the episodes that were actually recorded: `train` returns partial results
    # when interrupted, so `episodes` can be larger than the number of rows
    x = list(range(1, len(result_df) + 1)),
    y = rolling_profits,
    mode = 'lines',
    name = 'Rolling Profits'
))

title = f'{currency_pair} - Rolling Profits - {agent_type}'
plot_graph(data, title, 'Episode', 'Amount')


EXEC_END = time.time()
time_taken(EXEC_END - EXEC_START)

### Chart: Rolling Equity

In [None]:
EXEC_START = time.time()


# 100-episode moving average of the equity, computed on the one column that is plotted
rolling_equity = result_df['equity'].rolling(100).mean()

data = []
data.append(go.Scattergl(
    # Number the episodes that were actually recorded: `train` returns partial results
    # when interrupted, so `episodes` can be larger than the number of rows
    x = list(range(1, len(result_df) + 1)),
    y = rolling_equity,
    mode = 'lines',
    name = 'Equity'
))

title = f'{currency_pair} - Rolling Equity - {agent_type}'
plot_graph(data, title, 'Episode', 'Equity')


EXEC_END = time.time()
time_taken(EXEC_END - EXEC_START)

# Testing

In [None]:
# TODO

### Chart: Trade

In [None]:
EXEC_START = time.time()


# Price lines (bid and ask) over a shared date-time axis; hovering is disabled for
# them so that only the trade markers show a tooltip
price_types   = ['bid', 'ask']
datetime_axis = pd.DataFrame(env.datetimes)[0]

data = [
    go.Scattergl(
        x = datetime_axis,
        y = price_series,
        mode = 'lines',
        name = price_type.title(),

        # Additional settings
        hoverinfo='skip'
    )
    for price_type, price_series in zip(price_types, [env.bids, env.asks])
]

# One marker series per trade action; a row belongs to an action when its `action`
# column equals the position of that action in `trade_actions`
markers = ['triangle-up', 'triangle-down']
trade_actions = ['buy', 'sell']
for trade_index, (trade_action, marker_symbol) in enumerate(zip(trade_actions, markers)):
    action_df = trade_df[trade_df['action'] == trade_index]

    hover_texts = [
        f'Date Time: {row.datetime}<br />Action Index: {row.Index}<br />{"Open" if row.Index % 2 == 0 else "Closed"} at {row.price}<br />Profit: {row.profits}'
        for row in action_df.itertuples()
    ]

    data.append(go.Scattergl(
        x = action_df['datetime'],
        y = action_df['price'],
        mode = 'markers',
        name = trade_action.title(),

        # Additional settings
        marker = dict(size=15, symbol=marker_symbol),
        hovertext=hover_texts,
        hoverinfo='text'
    ))

title = f'{currency_pair} - Trade - {agent_type}'
plot_graph(data, title, 'Date Time', 'Price')


EXEC_END = time.time()
time_taken(EXEC_END - EXEC_START)

### Chart: Trade Profits

In [None]:
EXEC_START = time.time()


# Profit of every row in `trade_df`, plotted in row order; the legend shows the total
y = trade_df['profits']

profit_trace = go.Scattergl(
    x = list(range(len(trade_df))),
    y = y,
    mode = 'lines',
    name = f'Profits ({sum(y) :,.2f})'
)
data = [profit_trace]

title = f'{currency_pair} - Trade Profits - {agent_type}'
plot_graph(data, title, '', 'Amount')


EXEC_END = time.time()
time_taken(EXEC_END - EXEC_START)