# Alonso, Ariel - MFE Thesis - Reinforcement Learning Module

In [None]:
import pandas as pd
import numpy as np

# Dataset

## Data configuration definitions

In [None]:
# Folder holding the per-day order book CSV files
DATA_PATH = 'C:/Users/Ariel/Documents/Maestria/TESIS/Mi Tesis/'

# Trading days (DD-MM-YY) whose data files are loaded
DATA_DATES = ['18-12-19', '19-12-19', '20-12-19', '06-01-20', '07-01-20']

# Order book columns seen by each agent.
# Column order: every bid level first, then every ask level, price before size.
# The top-of-book agent only uses level 1, the full-book agent levels 1 to 5.
TOP_OF_BOOK = ['{}1_{}'.format(side, field)
               for side in ('Bid', 'Ask')
               for field in ('Price', 'Size')]
FULL_BOOK = ['{}{}_{}'.format(side, level, field)
             for side in ('Bid', 'Ask')
             for level in range(1, 6)
             for field in ('Price', 'Size')]

In [None]:
# DATASET CHARACTERISTICS

MARKET_IMBALANCE = True # If True, the book sizes are summarized into a single Size_Imbalance column
SIMPLIFY_INDICATORS = True # If True, the MACD is reduced to a signal-cross sign and the RSI to -1 / 0 / 1
BID_ASK_SPREAD = True # If True, the Bid_Ask_spread column is added to the dataframes
QUANTIZE = False # If True, Size_Imbalance and Bid_Ask_spread are replaced by their quantile bin
REWARD_HORIZON = 100 # Ticks between opening a position and closing it (also the look-ahead of the immediate reward)

## Data loading

In [None]:
# Read one CSV file per trading day; the list follows the order of DATA_DATES
df_data_list = [
    pd.read_csv('{}AY24D_{}.csv'.format(DATA_PATH, date))
    for date in DATA_DATES
]

## Dataset enrichment

In [None]:
# Trim every day's dataframe: the first and last ENTRIES_REMOVED rows are
# discarded as inaccurate, then a window of ENTRIES_RETAINED rows starting at
# START (counted after the first cut) is kept.
ENTRIES_REMOVED = 1000
ENTRIES_RETAINED = 1000
START = 3000

for i in range(len(df_data_list)):
    df = df_data_list[i][ENTRIES_REMOVED:-ENTRIES_REMOVED]
    # The window is only applied to the first five dataframes
    if i < 5:
        df = df[START:START + ENTRIES_RETAINED]
    df_data_list[i] = df.reset_index(drop=True)
    # We drop unwanted columns
    df_data_list[i].drop(columns=['Datetime'], inplace=True)

In [None]:
# Technical indicators for enriching the dataset

import talib
INDICATORS_EMA_SPAN_MULTIPLIER = 10  # Stretches the classic indicator periods to filter short term noise better
RSI_THRESHOLD = 40  # Defines oversold and overbought percentages, ranges from 0 to 50


def MACD(df):
    '''Compute the MACD line and its signal line from df.Mid_Price.

    The fast and slow EMAs use the classic 12 and 26 spans multiplied by
    INDICATORS_EMA_SPAN_MULTIPLIER. The signal line is a span-9 EMA of the
    MACD line (this span is not multiplied).

    Returns a (MACD, MACD_SIGNAL) tuple of Series.
    '''
    mid_price = df.Mid_Price
    fast_span = 12 * INDICATORS_EMA_SPAN_MULTIPLIER
    slow_span = 26 * INDICATORS_EMA_SPAN_MULTIPLIER
    fast_ema = mid_price.ewm(span=fast_span, adjust=False).mean()
    slow_ema = mid_price.ewm(span=slow_span, adjust=False).mean()
    macd_line = fast_ema - slow_ema
    signal_line = macd_line.ewm(span=9, adjust=False).mean()
    return macd_line, signal_line

def RSI(df):
    '''Return the RSI of df['Mid_Price'] as computed by talib.RSI.

    The classic 14-period window is multiplied by INDICATORS_EMA_SPAN_MULTIPLIER.
    '''
    period = 14 * INDICATORS_EMA_SPAN_MULTIPLIER
    return talib.RSI(df['Mid_Price'], timeperiod=period)

def indicators_enrichment(df):
    '''Incorporates the technical indicators to the dataframe.

    The dataframe is modified in place and also returned. A temporary
    'Mid_Price' column (average of 'Bid1_Price' and 'Ask1_Price') feeds the
    indicators and is removed before returning.

    With SIMPLIFY_INDICATORS enabled:
      * 'MACD_SIGNAL_CROSS' replaces 'MACD' and 'MACD_SIGNAL'. It is the sign
        of the change of sign(MACD - MACD_SIGNAL), so it is non-zero only on
        ticks where that sign changes; the first row is filled with 0.
      * 'RSI' becomes -1 (above 100 - RSI_THRESHOLD), 1 (below RSI_THRESHOLD)
        or 0 (anything else, including values exactly on a threshold).
    Otherwise the raw 'MACD', 'MACD_SIGNAL' and 'RSI' columns are kept.
    '''
    df['Mid_Price'] = (df['Bid1_Price'] + df['Ask1_Price']) / 2

    # MACD & SIGNAL
    df['MACD'], df['MACD_SIGNAL'] = MACD(df)
    if SIMPLIFY_INDICATORS:
        # Simplifying the indicator
        df['MACD_SIGNAL_CROSS'] = np.sign(np.sign(df['MACD'] - df['MACD_SIGNAL']).diff())
        df.fillna({'MACD_SIGNAL_CROSS': 0}, inplace=True)
        df.drop(columns=['MACD', 'MACD_SIGNAL'], inplace=True)

    # RSI
    df['RSI'] = RSI(df)
    df.fillna({'RSI': 50}, inplace=True)  # We fill the NaN with an indicator neutral value
    if SIMPLIFY_INDICATORS:
        # Simplifying the indicator. The column is rebuilt in a single
        # assignment: chained assignment (df['RSI'][mask] = value) triggers
        # SettingWithCopy warnings and has no effect under pandas copy-on-write.
        # Using 0 as the default also codes values lying exactly on a
        # threshold, which previously kept their raw RSI value.
        rsi = df['RSI']
        overbought = rsi > (100 - RSI_THRESHOLD)
        oversold = rsi < RSI_THRESHOLD
        df['RSI'] = np.select([overbought, oversold], [-1.0, 1.0], default=0.0)

    df.drop(columns=['Mid_Price'], inplace=True)
    return df

# Every day's dataframe is enriched with the technical indicators
for i in range(len(df_data_list)):
    df_data_list[i] = indicators_enrichment(df_data_list[i])

In [None]:
# Bid-Ask spread feature
def bid_ask_spread(df):
    '''Add the 'Bid_Ask_spread' column (best ask price minus best bid price).

    The dataframe is modified in place and also returned.
    '''
    best_ask = df['Ask1_Price']
    best_bid = df['Bid1_Price']
    df['Bid_Ask_spread'] = best_ask - best_bid
    return df

# The bid-ask spread feature is added to every day's dataframe when enabled
if BID_ASK_SPREAD:
    for i in range(len(df_data_list)):
        df_data_list[i] = bid_ask_spread(df_data_list[i])

In [None]:
# Defining agents: the FB agent uses the loaded dataframes themselves (same
# list object), the TOB agent gets new dataframes without the book columns
# that are not part of the top of book
df_FB = df_data_list
df_TOB = [df.drop(columns=[col for col in FULL_BOOK if col not in TOP_OF_BOOK])
          for df in df_data_list]

In [None]:
# Market Imbalance calculations
tau = 1/20 # Penalizes bids and offers at distant prices

def book_imbalance(df, FULL_BOOK_AGENT):
    '''Summarize the book sizes into a single 'Size_Imbalance' column.

    The dataframe is modified in place and also returned.

    Top-of-book agent (FULL_BOOK_AGENT false): the imbalance is
    Bid1_Size - Ask1_Size and both size columns are dropped.

    Full-book agent (FULL_BOOK_AGENT true): levels 2 to 5 are added, each size
    weighted by exp(-distance / tau), where distance is the price gap to the
    best bid (bids) or best ask (asks). Every FULL_BOOK column except
    'Bid1_Price' and 'Ask1_Price' is dropped afterwards.
    '''
    # Level 1 enters with full weight for both agents
    imbalance = df['Bid1_Size'] - df['Ask1_Size']

    if not FULL_BOOK_AGENT:
        df['Size_Imbalance'] = imbalance
        df.drop(columns=['Bid1_Size', 'Ask1_Size'], inplace=True)
        return df

    for level in range(2, 6):
        bid_gap = df['Bid1_Price'] - df['Bid{}_Price'.format(level)]
        ask_gap = df['Ask{}_Price'.format(level)] - df['Ask1_Price']
        bid_equiv = np.exp(-bid_gap / tau) * df['Bid{}_Size'.format(level)]
        ask_equiv = np.exp(-ask_gap / tau) * df['Ask{}_Size'.format(level)]
        imbalance = imbalance + (bid_equiv - ask_equiv)

    df['Size_Imbalance'] = imbalance
    kept_columns = ('Bid1_Price', 'Ask1_Price')
    df.drop(columns=[e for e in FULL_BOOK if e not in kept_columns], inplace=True)
    return df

# Both lists are updated in place so any other reference to them stays valid
if MARKET_IMBALANCE:
    df_TOB[:] = [book_imbalance(df, False) for df in df_TOB]
    df_FB[:] = [book_imbalance(df, True) for df in df_FB]

In [None]:
# Series quantizing
def quantize(df):
    '''Replace 'Size_Imbalance' and 'Bid_Ask_spread' by their quantile bin number.

    Each column is cut with pd.qcut into QUANTILES_QTY quantile bins (duplicate
    bin edges are dropped, so fewer bins may result). The dataframe is
    modified in place and also returned.
    '''
    QUANTILES_QTY = 4
    for column in ('Size_Imbalance', 'Bid_Ask_spread'):
        df[column] = pd.qcut(df[column], QUANTILES_QTY, labels=False, duplicates='drop')
    return df

# We apply the quantizing method to the dataframes of both agents when enabled
# (the lists are updated in place)
if QUANTIZE:
    df_TOB[:] = [quantize(df) for df in df_TOB]
    df_FB[:] = [quantize(df) for df in df_FB]

In [None]:
# Date selection of the training, validation and testing datasets.
# VALIDATION_DATA and TEST_DATA are looked up in DATA_DATES, so they must be
# written exactly as they appear there.
TRAIN_DATA = ['18-12-19', '19-12-19', '20-12-19']
VALIDATION_DATA  = '06-01-20'
TEST_DATA = '07-01-20'

In [None]:
# We obtain the final dataframes of training, validation and testing
validation_idx = DATA_DATES.index(VALIDATION_DATA)
test_idx = DATA_DATES.index(TEST_DATA)

df_validation_TOB_agent  = df_TOB[validation_idx]
df_validation_FB_agent   = df_FB[validation_idx]

df_test_TOB_agent = df_TOB[test_idx]
df_test_FB_agent  = df_FB[test_idx]

# The training days are all the days that are neither the validation nor the
# test day. They are selected by position in one pass: deleting the two
# entries one after the other only works when the test day comes after the
# validation day, because the first deletion shifts the later indexes.
held_out = (validation_idx, test_idx)
df_TOB_aux = [day_df for idx, day_df in enumerate(df_TOB) if idx not in held_out]
df_FB_aux  = [day_df for idx, day_df in enumerate(df_FB) if idx not in held_out]

# Now we keep the daily cut indexes between days to prevent day hopping in training
daily_cuts = np.array([len(item) for item in df_TOB_aux])
daily_cuts = np.insert(daily_cuts, 0, 0) # We insert a 0 in the first position as this indicates the first day start

# We now concatenate all the training dataframes into one, resetting the indexes, we already have the daily cuts
df_train_TOB_agent = pd.concat(df_TOB_aux, ignore_index = True)
df_train_FB_agent  = pd.concat(df_FB_aux, ignore_index = True)

# RL Agents Training Environment

In [None]:
# TRAINING ENVIRONMENT

import gym
from gym import spaces
import tensorflow
import numpy as np
import random
from sklearn import preprocessing

# Number of training days and number of ticks of the concatenated training set
# NOTE(review): TRAINING_DAY_COUNT is not referenced in the visible code
TRAINING_DAY_COUNT = len(TRAIN_DATA)
EPISODE_LENGTH = len(df_train_TOB_agent)

# If True, buy_bond / sell_bond check the position against MAX_BOND_QTY and the cash against MAX_CASH
POSITION_LIMITS = False
MAX_CASH = 10000
MAX_BOND_QTY = 5

# If True, the step reward is set to -POSITION_PENALTY_AMOUNT while abs(position) exceeds MAX_BOND_QTY
POSITION_PENALTY = False
POSITION_PENALTY_AMOUNT = 10

# If True, NON_TRADING_PENALTY_AMOUNT is subtracted from the final reward of an episode without any trade
TRADING_PENALTY = False
NON_TRADING_PENALTY_AMOUNT = 1

# If True, positive rewards are multiplied by REWARD_BOOST_FACTOR
POSITIVE_REWARD_BOOSTER = False
REWARD_BOOST_FACTOR = 10

# REWARD_IMMEDIATE: each trade is rewarded with its round-trip result, priced REWARD_HORIZON ticks ahead
# REWARD_END_OF_EPISODE: the portfolio value is added to the reward of the last step of the episode
REWARD_IMMEDIATE = True
REWARD_END_OF_EPISODE = False

# If True, a trade made after fewer than HIGH_FREQ_TICKS consecutive holds gets a reward of -HIGH_FREQ_PENALTY_AMOUNT
HIGH_FREQ_PENALTY = False
HIGH_FREQ_TICKS = REWARD_HORIZON + 1
HIGH_FREQ_PENALTY_AMOUNT = 10


class AY24BooksTrainEnv(gym.Env):
    '''Training environment that replays the order book dataframe tick by tick.

    At every tick the agent chooses BUY, SELL or HOLD. Every trade is unwound
    automatically REWARD_HORIZON ticks later; when the end of a trading day
    approaches all the open trades are unwound at their scheduled ticks and
    the environment jumps to the first tick of the next day, so no position
    is opened across days.

    The observation is the min-max normalized row of `df` without the
    'Bid1_Price' and 'Ask1_Price' columns.

    Parameters:
        df: dataframe with 'Bid1_Price', 'Ask1_Price' and the feature columns,
            indexed 0..len(df)-1 (the training days concatenated).
        daily_cuts: array made of a leading 0 followed by the number of ticks
            of each day contained in df.
        agent_bond_qty: initial bond position.
        agent_cash: initial cash.
        tick: initial tick (reset() always restarts from tick 0).
    '''
    BUY  = 0
    SELL = 1
    HOLD = 2

    def __init__(self, df, daily_cuts = np.array([]), agent_bond_qty = 0, agent_cash = 0, tick = 0):
        self.df = df
        self.daily_cuts = daily_cuts
        # Cumulative day boundaries: element d is the first tick of day d
        self.daily_cuts_cumsum = daily_cuts.cumsum().tolist()
        self.traning_day = 0  # Misspelled legacy attribute, kept in case it is read elsewhere
        # Attributes read by step(); set here so step() also works before reset()
        self.day = 0
        self.tick = tick
        self.initial_tick = tick
        self.passive = True
        self.terminal = False
        self.reward = 0
        self.cum_reward = 0
        self.hf_ticks = 0
        n_actions = 3
        self.action_space = spaces.Discrete(n_actions)
        self.observation_space = spaces.Box(low=-1, high=1, shape=[self.df.shape[1]  - 2], dtype=np.double)
        # The observation space includes bid-ask spread, MACD signal, RSI signal and market imbalance features
        # The - 2 results from subtracting the bid and ask prices from the feature set
        self.agent_bond_qty = agent_bond_qty
        self.agent_cash = agent_cash
        self.data = self.df.loc[self.tick,:]
        self.future_prices = self.df.loc[self.tick + REWARD_HORIZON,:]
        self.df_normalized = self.normalize_input_data()
        self.df_normalized.drop(columns=['Bid1_Price','Ask1_Price'], inplace=True)
        self.data_normalized = self.df_normalized.loc[self.tick,:]
        self.state = self.data_normalized.tolist()
        # Ticks at which a buy (resp. sell) has to be executed to close an open trade
        self.pending_buy  = []
        self.pending_sell = []

    def normalize_input_data(self):
        '''Return a copy of self.df with every column min-max scaled over the whole dataframe.'''
        x = self.df.values  # returns a numpy array
        min_max_scaler = preprocessing.MinMaxScaler()
        x_scaled = min_max_scaler.fit_transform(x)
        df_normalized = pd.DataFrame(x_scaled, columns=self.df.columns)
        return df_normalized

    def reset(self):
        '''Restart the episode at tick 0 with no cash and no position; return the first observation.'''
        self.day = 0
        self.tick = 0
        self.initial_tick = self.tick
        self.agent_cash = 0
        self.agent_bond_qty = 0
        self.passive = True
        self.terminal = False
        self.reward = 0
        self.cum_reward = 0
        self.hf_ticks = 0
        self.pending_buy  = []
        self.pending_sell = []
        return self._load_tick()

    def buy_bond(self):
        '''Buy one bond at the current best ask.

        With POSITION_LIMITS enabled the purchase is skipped when the maximum
        bond quantity or the maximum negative cash has been reached.
        '''
        # The limits are checked BEFORE trading and the trade is executed only
        # once (it used to be executed unconditionally and then a second time
        # inside the limits check).
        # NOTE(review): step() still schedules the closing trade and computes
        # the reward when the purchase is refused.
        if POSITION_LIMITS and not (self.agent_bond_qty < MAX_BOND_QTY and self.agent_cash > -MAX_CASH):
            print("Can't buy, maximum bond quantity reached or maximum negative cash limit reached")
            return
        self.agent_cash -= self.data.Ask1_Price
        self.agent_bond_qty += 1

    def sell_bond(self):
        '''Sell one bond at the current best bid.

        With POSITION_LIMITS enabled the sale is skipped when the maximum
        negative bond quantity or the maximum cash has been reached.
        '''
        # Mirror of buy_bond: strict bound so the position never goes below
        # -MAX_BOND_QTY (the former >= allowed one extra short bond).
        if POSITION_LIMITS and not (self.agent_bond_qty > -MAX_BOND_QTY and self.agent_cash < MAX_CASH):
            print("Can't sell, maximum negative bond quantity reached or maximum cash limit reached")
            return
        self.agent_cash += self.data.Bid1_Price
        self.agent_bond_qty -= 1

    def _trade_reward(self, round_trip_result):
        '''Return the reward of a trade whose round trip yields round_trip_result.'''
        if HIGH_FREQ_PENALTY and self.hf_ticks < HIGH_FREQ_TICKS:
            reward = -HIGH_FREQ_PENALTY_AMOUNT
        elif REWARD_IMMEDIATE:
            reward = round_trip_result
        else:
            reward = 0
        if POSITIVE_REWARD_BOOSTER and reward > 0:
            reward *= REWARD_BOOST_FACTOR
        return reward

    def _close_pending_positions(self):
        '''Execute every scheduled closing trade at the prices of its scheduled tick.'''
        for i in self.pending_buy:
            self.tick = i
            self.data = self.df.loc[self.tick,:]
            self.buy_bond()
        self.pending_buy = []
        for i in self.pending_sell:
            self.tick = i
            self.data = self.df.loc[self.tick,:]
            self.sell_bond()
        self.pending_sell = []

    def _portfolio_value(self):
        '''Return cash plus the position valued at the bid (long) or at the ask (short or flat).'''
        price = self.data.Bid1_Price if self.agent_bond_qty > 0 else self.data.Ask1_Price
        return self.agent_bond_qty * price + self.agent_cash

    def _load_tick(self):
        '''Load the prices and the normalized state of self.tick; return the observation.'''
        self.data = self.df.loc[self.tick,:]
        self.future_prices = self.df.loc[self.tick + REWARD_HORIZON,:]
        self.data_normalized = self.df_normalized.loc[self.tick,:]
        self.state = self.data_normalized.tolist()
        return np.array(self.state)

    def step(self, action):
        '''Apply the action at the current tick and move to the next one.

        Returns the usual gym tuple (observation, reward, done, info); info is
        always an empty dict.
        '''
        if action == self.BUY:
            print('Action: Buy')
            self.buy_bond()
            self.pending_sell.append(self.tick + REWARD_HORIZON)
            print('I am buying at tick', self.tick, ' and will sell at tick', self.tick + REWARD_HORIZON)
            self.passive = False
            # Round trip: buy at the current ask, sell at the bid REWARD_HORIZON ticks ahead
            self.reward = self._trade_reward(self.future_prices.Bid1_Price - self.data.Ask1_Price)
            self.hf_ticks = 0

        elif action == self.SELL:
            print('Action: Sell')
            self.sell_bond()
            self.pending_buy.append(self.tick + REWARD_HORIZON)
            print('I am selling at tick', self.tick, ' and will buy at tick', self.tick + REWARD_HORIZON)
            self.passive = False
            # Round trip: sell at the current bid, buy back at the ask REWARD_HORIZON ticks ahead
            self.reward = self._trade_reward(self.data.Bid1_Price - self.future_prices.Ask1_Price)
            self.hf_ticks = 0
            if POSITIVE_REWARD_BOOSTER and self.reward > 0:
                print('Reward boosteada=', self.reward)
        elif action == self.HOLD:
            print('Action: Hold')
            self.reward = 0
            self.hf_ticks += 1
        else:
            print('Error: Invalid action')

        # We close the position opened REWARD_HORIZON ticks ago, if any
        if self.tick in self.pending_buy:
            print('Buying to close position...')
            self.buy_bond()
            self.pending_buy.remove(self.tick)
        elif self.tick in self.pending_sell:
            print('Selling to close position...')
            self.sell_bond()
            self.pending_sell.remove(self.tick)

        # Move on to the next tick. Without this the environment stayed on the
        # same tick forever and the episode never reached its end.
        self.tick += 1

        # When the end of the day approaches, we stop and close the opened positions at their corresponding ticks.
        # The end of the current day is the CUMULATIVE boundary of the next
        # day; the per-day lengths in daily_cuts are only valid for day 0.
        next_day = self.day + 1
        if next_day < len(self.daily_cuts_cumsum) and \
                self.tick >= self.daily_cuts_cumsum[next_day] - REWARD_HORIZON:
            self._close_pending_positions()
            self.day = next_day
            self.tick = self.daily_cuts_cumsum[self.day]

        # When the end of the whole df is reached, the terminal state needs to be checked:
        self.terminal = self.tick >= self.initial_tick + EPISODE_LENGTH
        if self.terminal:
            ptf_value = self._portfolio_value()
            print('---------END OF EPISODE----------')
            print('self.agent_cash=', round(self.agent_cash,2))
            print('self.agent_bond_qty=', self.agent_bond_qty)
            print('ptf_value=', ptf_value)
            if REWARD_END_OF_EPISODE:
                self.reward += ptf_value  # Case of reward at the end of the episode
            if self.passive and TRADING_PENALTY:
                self.reward -= NON_TRADING_PENALTY_AMOUNT
                print('Test reward after NTP =', self.reward)
            if POSITIVE_REWARD_BOOSTER:
                # NOTE(review): the original comment here read "NO BOOST AT THE
                # END" although the reward is boosted (possibly a second time)
                # - confirm which behavior is intended
                if self.reward > 0:
                    self.reward *= REWARD_BOOST_FACTOR
                    print('reward =',self.reward)
            print('---------------------------------')
            # The last observation is returned as an array, like every other one
            return np.array(self.state), self.reward, self.terminal, {}

        # If no terminal state is reached, the training continues

        if POSITION_PENALTY:
            if abs(self.agent_bond_qty) > MAX_BOND_QTY:
                print("Bond Quantity penalty reached")
                # The reward is replaced by the penalty (not decreased by it)
                self.reward = -POSITION_PENALTY_AMOUNT

        # We return the info of the next tick in the observation
        observation = self._load_tick()

        return observation, self.reward, self.terminal, {}

In [None]:
# Environment checks

from stable_baselines.common.env_checker import check_env

# Training environment built on the full-book training data, used only for the check below
env = AY24BooksTrainEnv(df_train_FB_agent, daily_cuts)

# This will check the custom environment and output additional warnings if needed
check_env(env)

# RL Model definitions and selection

In [None]:
from stable_baselines import DQN
from stable_baselines import PPO2
from stable_baselines import ACER
from stable_baselines.common import make_vec_env
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv, VecNormalize

# Model selection

MODEL = 'DQN' # Can be DQN, PPO2 or ACER

# Logging path definition: one timestamped folder per run and per agent
import datetime
run_timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs\\" + MODEL + "_" + "EP" + str(EPISODE_LENGTH) + "_" + run_timestamp
tb_log_TOB = ".\\{}_TOB".format(log_dir)
tb_log_FB  = ".\\{}_FB".format(log_dir)

# Model creation: one environment and one model per agent (TOB and FB)

# Case 1: DQN model, trained directly on the plain environments
if MODEL == 'DQN':
    from stable_baselines.deepq.policies import MlpPolicy, LnMlpPolicy, CnnPolicy
    env_TOB = AY24BooksTrainEnv(df_train_TOB_agent, daily_cuts)
    env_FB  = AY24BooksTrainEnv(df_train_FB_agent, daily_cuts)
    model_TOB = DQN(LnMlpPolicy, env_TOB, verbose=1, tensorboard_log=tb_log_TOB, seed=1, n_cpu_tf_sess=1)
    model_FB  = DQN(LnMlpPolicy, env_FB, verbose=1, tensorboard_log=tb_log_FB, seed=1, n_cpu_tf_sess=1)

# Case 2: PPO2 model, trained on vectorized and normalized environments
elif MODEL == 'PPO2':
    from stable_baselines.common.policies import MlpPolicy, CnnLstmPolicy, CnnPolicy, MlpLnLstmPolicy
    env_TOB = VecNormalize(DummyVecEnv([lambda: AY24BooksTrainEnv(df_train_TOB_agent, daily_cuts)]))
    env_FB  = VecNormalize(DummyVecEnv([lambda: AY24BooksTrainEnv(df_train_FB_agent, daily_cuts)]))
    model_TOB = PPO2(MlpPolicy, env_TOB, verbose=1, tensorboard_log=tb_log_TOB, seed=1, n_cpu_tf_sess=1)
    model_FB  = PPO2(MlpPolicy, env_FB, verbose=1, tensorboard_log=tb_log_FB, seed=1, n_cpu_tf_sess=1)

# Case 3: ACER model, trained on vectorized and normalized environments
elif MODEL == 'ACER':
    from stable_baselines.common.policies import MlpPolicy, CnnLstmPolicy, CnnPolicy, MlpLnLstmPolicy
    env_TOB = VecNormalize(DummyVecEnv([lambda: AY24BooksTrainEnv(df_train_TOB_agent, daily_cuts)]))
    env_FB  = VecNormalize(DummyVecEnv([lambda: AY24BooksTrainEnv(df_train_FB_agent, daily_cuts)]))
    model_TOB = ACER(MlpLnLstmPolicy, env_TOB, verbose=1, tensorboard_log=tb_log_TOB, seed=1, n_cpu_tf_sess=1)
    model_FB  = ACER(MlpLnLstmPolicy, env_FB, verbose=1, tensorboard_log=tb_log_FB, seed=1, n_cpu_tf_sess=1)

else:
    print('Invalid model')

In [None]:
# NOW OPEN TENSORBOARD
# In Anaconda shell, change to env: "conda activate tf-1.15".
# Type "tensorboard --logdir logs"
# Open Tensorboard on browser at: http://localhost:6006/

# RL Agents Testing Environment

In [None]:
# TESTING ENVIRONMENT

import matplotlib.pyplot as plt

class AY24BooksTestEnv(AY24BooksTrainEnv):
    '''Evaluation environment: replays a single day dataframe from its first tick.

    The trading rules are those of the training environment (every trade is
    closed REWARD_HORIZON ticks later) but the reward is always 0: what matters
    here is the `info` dict returned by step(), with the keys 'bid1_price',
    'agent_bond_qty', 'agent_cash' and 'agent_ptf_value'.

    The episode ends when fewer than REWARD_HORIZON ticks remain; the trades
    still open are then closed at their scheduled ticks.
    '''
    def reset(self):
        '''Restart at tick 0 with no cash and no position; return the first observation.'''
        self.tick = 0 # We set the testing data always at the beginning of each DF
        self.terminal = False
        self.reward = 0
        self.cum_reward = 0
        self.agent_cash = 0
        self.agent_bond_qty = 0
        self.agent_bond_qty_graph = []
        self.agent_cash_graph = []
        self.agent_ptf_value = []
        self.data = self.df.loc[self.tick,:]
        self.data_normalized = self.df_normalized.loc[self.tick,:]
        self.state = self.data_normalized.tolist()
        observation = np.array(self.state)
        self.pending_buy  = []
        self.pending_sell = []
        return observation

    def _step_info(self):
        '''Return the info dict describing the agent at the prices held in self.data.'''
        # The position is valued at the bid when long, at the ask otherwise
        price = self.data.Bid1_Price if self.agent_bond_qty > 0 else self.data.Ask1_Price
        ptf_value = self.agent_bond_qty * price + self.agent_cash
        return {'bid1_price': self.data.Bid1_Price,
                'agent_bond_qty': self.agent_bond_qty,
                'agent_cash': self.agent_cash,
                'agent_ptf_value': ptf_value}

    def step(self, action):
        '''Apply the action at the current tick and move to the next one.

        Returns (observation, reward, done, info); the reward is always 0.
        '''
        # We check if the end of the dataset has been reached
        self.terminal = self.tick > len(self.df.index.unique()) - REWARD_HORIZON - 1
        if self.terminal:
            print('Closing position in tick ', self.tick)
            for i in self.pending_buy:
                self.tick = i
                self.data = self.df.loc[self.tick,:]
                self.buy_bond()
            self.pending_buy = []
            for i in self.pending_sell:
                self.tick = i
                self.data = self.df.loc[self.tick,:]
                self.sell_bond()
            self.pending_sell = []

            print('-----END OF DATAFRAME-----')
            print('Tick number is:', self.tick)
            print('self.agent_cash=', round(self.agent_cash,2))
            print('self.agent_bond_qty=', self.agent_bond_qty)
            print('reward=',self.reward)
            # The info is built AFTER closing the positions so the reported
            # portfolio value matches the final cash and position (it used to
            # be computed before the closing trades)
            return np.array(self.state), self.reward, self.terminal, self._step_info()

        if action == self.BUY:
            print('Action: Buy')
            self.buy_bond()
            self.pending_sell.append(self.tick + REWARD_HORIZON)
            print('I am buying at tick', self.tick, ' and will sell at tick', self.tick + REWARD_HORIZON)

        elif action == self.SELL:
            print('Action: Sell')
            self.sell_bond()
            self.pending_buy.append(self.tick + REWARD_HORIZON)
            print('I am selling at tick', self.tick, ' and will buy at tick', self.tick + REWARD_HORIZON)

        elif action == self.HOLD:
            print('Action: Hold')
        else:
            print('Error: Invalid action')

        # We close the position opened REWARD_HORIZON ticks ago, if any
        if self.tick in self.pending_buy:
            print('Buying to close position...')
            self.buy_bond()
            self.pending_buy.remove(self.tick)
        elif self.tick in self.pending_sell:
            print('Selling to close position...')
            self.sell_bond()
            self.pending_sell.remove(self.tick)

        self.tick += 1

        # We return the info of the next tick in the observation
        self.data = self.df.loc[self.tick,:]
        self.data_normalized = self.df_normalized.loc[self.tick,:]
        self.cum_reward += self.reward
        self.state = self.data_normalized.tolist()
        observation = np.array(self.state)
        self.reward = 0 # At testing time only the info dict is of interest, not the reward
        return observation, self.reward, self.terminal, self._step_info()

# Model Training and predictions

## Training configuration

In [None]:
TRAINING_ITERATIONS = 20  # Number of models that will predict on the test data, each more trained than the previous one
INCREMENTAL_TIMESTEPS = 20000  # Ticks that each model will train (cumulative)
# Cumulative training budgets: 0, INCREMENTAL_TIMESTEPS, ..., TRAINING_ITERATIONS * INCREMENTAL_TIMESTEPS
TRAINING_TIMESTEPS = [count * INCREMENTAL_TIMESTEPS for count in range(TRAINING_ITERATIONS + 1)]
RESET_MODEL = True
SEED = 1  # Seed setting

## Training and predictions on the training and validation set 

In [None]:
# Training and prediction loop on the training data, and information storage for future graphs.
# At every iteration the current models first predict on the training days and
# on the validation day (their final cash is stored), then they are trained
# for the next amount of timesteps.

agent_cash_TOB_iterations = []
agent_cash_FB_iterations  = []

for iteration in range(len(TRAINING_TIMESTEPS)):
    print ('Training iteration number ', iteration)
    agent_cash_TOB = []
    agent_cash_FB  = []

    # Prediction loop
    for i, date in enumerate(DATA_DATES[0:4]):
        print('Day to predict is ', date)
        # The test environments are always vectorized, whatever the model: the
        # loops below read the vectorized outputs (info[0]).
        # The former test `MODEL == 'PPO2' or 'ACER'` was always true (a
        # non-empty string is truthy), so the non-vectorized branch never ran;
        # it is removed instead of being made reachable, as it would break
        # the info[0] reads.
        env_validation_TOB = DummyVecEnv([lambda: AY24BooksTestEnv(df_TOB[i])])
        env_validation_FB  = DummyVecEnv([lambda: AY24BooksTestEnv(df_FB[i])])

        obs_validation_TOB = env_validation_TOB.reset()
        obs_validation_FB  = env_validation_FB.reset()

        # Prediction of the TOB agent
        while(True):
            action, _states = model_TOB.predict(obs_validation_TOB)
            obs_validation_TOB, rewards, dones, info = env_validation_TOB.step(action)
            if dones:
                print('agent_cash_TOB_day is', info[0]['agent_cash'])
                agent_cash_TOB_day = info[0]['agent_cash']
                break

        # Prediction of the FB agent
        while(True):
            action, _states = model_FB.predict(obs_validation_FB)
            obs_validation_FB, rewards, dones, info = env_validation_FB.step(action)
            if dones:
                print('agent_cash_FB_day is', info[0]['agent_cash'])
                agent_cash_FB_day = info[0]['agent_cash']
                break

        agent_cash_TOB.append(agent_cash_TOB_day)
        agent_cash_FB.append(agent_cash_FB_day)

    agent_cash_TOB_iterations.append(agent_cash_TOB)
    agent_cash_FB_iterations.append(agent_cash_FB)

    if iteration < len(TRAINING_TIMESTEPS)-1:
        # We train the model now and run the prediction in the next iteration
        if RESET_MODEL is True:
            # A fresh model is trained from scratch for the whole cumulative amount of timesteps
            del model_TOB
            del model_FB
            if MODEL == 'DQN':
                model_TOB = DQN(LnMlpPolicy, env_TOB, verbose=1, tensorboard_log=".\\{}_TOB".format(log_dir), seed = SEED, n_cpu_tf_sess = 1)
                model_FB  = DQN(LnMlpPolicy, env_FB, verbose=1, tensorboard_log=".\\{}_FB".format(log_dir), seed = SEED, n_cpu_tf_sess = 1)
            elif MODEL == 'PPO2':
                model_TOB = PPO2(MlpPolicy, env_TOB, verbose=1, tensorboard_log=".\\{}_TOB".format(log_dir), seed=SEED, n_cpu_tf_sess=1)
                model_FB  = PPO2(MlpPolicy, env_FB, verbose=1, tensorboard_log=".\\{}_FB".format(log_dir), seed=SEED, n_cpu_tf_sess=1)
            elif MODEL == 'ACER':
                model_TOB = ACER(MlpLnLstmPolicy, env_TOB, verbose=1, tensorboard_log=".\\{}_TOB".format(log_dir), seed=SEED, n_cpu_tf_sess=1)
                model_FB  = ACER(MlpLnLstmPolicy, env_FB, verbose=1, tensorboard_log=".\\{}_FB".format(log_dir), seed=SEED, n_cpu_tf_sess=1)
            print('I will train for this amount of timesteps:', TRAINING_TIMESTEPS[iteration+1])
            model_TOB.learn(total_timesteps=TRAINING_TIMESTEPS[iteration+1])
            model_FB.learn(total_timesteps=TRAINING_TIMESTEPS[iteration+1])
        else:
            # The same model keeps training for the increment only
            print('I will train for this amount of timesteps:', TRAINING_TIMESTEPS[iteration+1]-TRAINING_TIMESTEPS[iteration])
            model_TOB.learn(total_timesteps=TRAINING_TIMESTEPS[iteration+1]-TRAINING_TIMESTEPS[iteration])
            model_FB.learn(total_timesteps=TRAINING_TIMESTEPS[iteration+1]-TRAINING_TIMESTEPS[iteration])

In [None]:
# print_profit() function for evaluating the performance of the TOB and FB agents
# predictions on the training and validation sets dates

def print_profit(TOB_series, FB_series):
    '''Plot the final cash of both agents against the training timesteps.

    TOB_series and FB_series hold one row per training iteration and one
    value per predicted day (the TRAIN_DATA days followed by the validation
    day). Two figures are produced: every day for both agents, and the
    validation day alone.
    '''
    validation_TOB_col = 'VALIDATION DAY %s TOB' % VALIDATION_DATA
    validation_FB_col = 'VALIDATION DAY %s FB' % VALIDATION_DATA

    profit_TOB = pd.DataFrame(TOB_series)
    profit_TOB.columns = ['TRAIN DAY %d %s TOB'% (i, item) for i,item in enumerate(TRAIN_DATA)] + [validation_TOB_col]
    profit_FB = pd.DataFrame(FB_series)
    profit_FB.columns  = ['TRAIN DAY %d %s FB'% (i, item) for i,item in enumerate(TRAIN_DATA)] + [validation_FB_col]
    df = pd.concat([profit_TOB, profit_FB], axis = 1)

    # Settings shared by both figures
    shared_settings = dict(
        figsize = (16,8),
        title = ('Agent Profit' + ' (' + MODEL + ', ENT=' + str(ENTRIES_RETAINED) + ', RH=' + str(REWARD_HORIZON) + ')'),
        xlabel = ('Training Timesteps'),
        rot = 45,
        ylabel = ('Agent Profit ($)'),
        sort_columns = True,
        grid = True,
        linewidth = 1,
    )

    # Figure 1: solid lines for the TOB agent, dashed lines for the FB agent, one color per day
    ax = df.plot(style = ['-','-','-','-','--','--','--','--'],
                 color = ['b', 'g', 'r', 'c', 'b', 'g', 'r', 'c'],
                 **shared_settings)
    ax.set_xticks(df.index)
    ax.set_xticklabels(TRAINING_TIMESTEPS, rotation=45)

    # Figure 2: validation day only
    df_validation = df[[validation_TOB_col, validation_FB_col]]
    ax_validation = df_validation.plot(color = ['r', 'g'], **shared_settings)
    ax_validation.set_xticks(df_validation.index)
    ax_validation.set_xticklabels(TRAINING_TIMESTEPS, rotation=45)

In [None]:
# Produce graphs of training vs validation profit and isolated validation profit
print_profit(agent_cash_TOB_iterations, agent_cash_FB_iterations)

## Predictions on the testing dataset 

In [None]:
#We test now on the test day

# The test environments are always vectorized, whatever the model: the loops
# below read the vectorized outputs (info[0]).
# The former test `MODEL == 'PPO2' or 'ACER'` was always true (a non-empty
# string is truthy), so the non-vectorized branch never ran; it is removed
# instead of being made reachable, as it would break the info[0] reads.
env_test_TOB = DummyVecEnv([lambda: AY24BooksTestEnv(df_test_TOB_agent)])
env_test_FB  = DummyVecEnv([lambda: AY24BooksTestEnv(df_test_FB_agent)])

obs_test_TOB = env_test_TOB.reset()
obs_test_FB  = env_test_FB.reset()

# Per-tick history of the test day, used by the graphs
bid1_price_day = []
agent_bond_qty_TOB_day = []
agent_bond_qty_FB_day  = []
agent_cash_TOB_day = []
agent_cash_FB_day  = []
agent_ptf_value_TOB_day = []
agent_ptf_value_FB_day  = []

# Prediction of the TOB agent (the bid price history is recorded in this loop only)
while(True):
    action, _states = model_TOB.predict(obs_test_TOB)
    obs_test_TOB, rewards, dones, info = env_test_TOB.step(action)
    bid1_price_day.append(info[0]['bid1_price'])
    agent_bond_qty_TOB_day.append(info[0]['agent_bond_qty'])
    agent_cash_TOB_day.append(info[0]['agent_cash'])
    agent_ptf_value_TOB_day.append(info[0]['agent_ptf_value'])
    if dones:
        break

# Prediction of the FB agent
while(True):
    action, _states = model_FB.predict(obs_test_FB)
    obs_test_FB, rewards, dones, info = env_test_FB.step(action)
    agent_bond_qty_FB_day.append(info[0]['agent_bond_qty'])
    agent_cash_FB_day.append(info[0]['agent_cash'])
    agent_ptf_value_FB_day.append(info[0]['agent_ptf_value'])
    if dones:
        break

In [None]:
# Definition of another plotting function to evaluate the bond positions and the portfolio value of the test day predictions
def print_graphs(benchmark_series, TOB_series, FB_series, title, ylabel, scaled = False):
    '''Plot the series of both agents for one day, together with a benchmark.

    Parameters:
        benchmark_series: sequence of prices used as benchmark.
        TOB_series, FB_series: sequences of the TOB and FB agents to compare.
        title: beginning of the figure title (model and settings are appended).
        ylabel: label of the agents' axis.
        scaled: if True, the benchmark is drawn on its own subplot above the
            agents' series; if False, everything shares one plot and the
            benchmark is shown as its change since the first value
            (labelled 'Long 1 Bond').
    '''
    full_title = title + ' (' + MODEL + ', EP=' + str(EPISODE_LENGTH) + ', RH=' + str(REWARD_HORIZON) + ')'
    if scaled:
        print('Bond QTY final TOB: ', TOB_series[-1])
        print('Bond QTY final FB: ', FB_series[-1])
        plt.figure(figsize=(12,8))
        plt.subplot(2, 1, 1)
        plt.title(full_title)
        plt.ylabel('Bid1_Price($)')
        plt.plot(benchmark_series, label='Bid1_Price', alpha=1, color = 'blue')
        plt.subplot(2, 1, 2)
        plt.xlabel('Timesteps')
        plt.ylabel(ylabel)
        plt.plot(TOB_series, label="TOB", color = 'red')
        plt.plot(FB_series, label="FB", color = 'green')
        plt.legend(loc="upper left")
    else:
        print('Cash final TOB: $', TOB_series[-1])
        print('Cash final FB: $', FB_series[-1])
        # Explicit conversion to an array: subtracting a scalar from a list
        # only works when its items are numpy scalars (through the reflected
        # operator) and fails with a list of plain Python floats
        benchmark = np.asarray(benchmark_series, dtype=float)
        benchmark = benchmark - benchmark[0]
        plt.figure(figsize=(12,8))
        plt.title(full_title)
        plt.xlabel('Timesteps')
        plt.ylabel(ylabel)
        plt.plot(benchmark, label='Long 1 Bond', alpha=0.5, color = 'blue')
        plt.plot(TOB_series, label="TOB", color = 'red')
        plt.plot(FB_series, label="FB", color = 'green')
        plt.legend(loc="upper left")
    plt.show()

In [None]:
# Produce test day graphs
date = TEST_DATA
print_graphs(bid1_price_day, agent_bond_qty_TOB_day, agent_bond_qty_FB_day, 'TEST ' + date + ' Agent Bond Quantity', 'Bond Quantity', scaled=True)
print_graphs(bid1_price_day, agent_ptf_value_TOB_day, agent_ptf_value_FB_day, 'TEST ' + date + ' Agent Portfolio Value', 'PTF Value ($)')

In [None]:
# Final graph to plot once all the seeds configuration values have been run.
# Values in d are the results of different previous experiments.
# Test day profit of each agent, one value per seed (results of previous runs)
d = {
    'TEST DAY 07-01-20 TOB': [-4.13, -1.67, -3.21, -5.18, -3.79, -5.14, -2.71, -2.28],
    'TEST DAY 07-01-20 FB': [-3.36, -6.58, -3.21, -3.56, -2.63, -2.91, -3.77, -3.18],
}
# The index is the seed number, from 1 to 8
df = pd.DataFrame(data=d, index=[1, 2, 3, 4, 5, 6, 7, 8])
plot_title = 'Agent Profit on Test Data' + ' (' + MODEL + ', ENT=' + str(ENTRIES_RETAINED) + ', RH=' + str(REWARD_HORIZON) + ')'
ax = df.plot(
    figsize=(16, 8),
    title=plot_title,
    xlabel='DQN Model #Seed',
    rot=0,
    ylabel='Agent Profit ($)',
    sort_columns=True,
    grid=True,
    style=['-', '-'],
    color=['r', 'g'],
    linewidth=1,
)
ax.set_xticks(df.index)