## Import Module

In [None]:
import tensorflow as tf
import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Environment Class

In [None]:
class Environment:
    """Pair-trading environment driven by two companies' price histories.

    The constructor reads ``<company>.csv`` for both companies, keeps only the
    ``price_col`` column as numpy arrays and splits each series into a leading
    training part and a trailing testing part.

    A position in each stock is tracked with three attributes:

    * ``system_holding_stockN_unit``      - number of units held (never negative)
    * ``system_holding_stockN_avg_price`` - average entry price of those units
    * ``system_holding_stockN_wait2sell`` - True when the units were bought
      (long, waiting to be sold); False when they were sold (short) or when
      nothing is held

    State returned by ``reset`` / ``step`` (14 values):
        [current stock1 price,
         number of units of stock1 held by the system,
         current stock2 price,
         number of units of stock2 held by the system,
         current spread,
         spread return,
         spread mean over the past 15 days, current spread / that mean,
         spread mean over the past 10 days, current spread / that mean,
         spread mean over the past 7 days,  current spread / that mean,
         spread mean over the past 5 days,  current spread / that mean]
    """

    # look-back windows (in days) of the trailing spread means, in state order
    SPREAD_WINDOWS = (15, 10, 7, 5)

    # constructor
    def __init__(self, company1, company2, price_col, training_dataset_ratio=0.8, nrm=1, one_episode_num_step=30):
        """Load both price series and split them into train / test parts.

        Args:
            company1: name of the first company; ``company1 + ".csv"`` is read.
            company2: name of the second company; ``company2 + ".csv"`` is read.
            price_col: name of the CSV column holding the price to trade on.
            training_dataset_ratio: fraction of the rows used for training.
            nrm: negative return multiplier applied to negative rewards while
                the purpose is "train".
            one_episode_num_step: number of days in one episode.

        Raises:
            Exception: if a CSV cannot be read or ``price_col`` is missing.
        """

        self.company1 = company1
        self.stock_price1 = None
        self.stock_price1_train = None
        self.stock_price1_test = None

        self.company2 = company2
        self.stock_price2 = None
        self.stock_price2_train = None
        self.stock_price2_test = None

        self.price_col = price_col
        self.stock_price_length = None

        self.training_dataset_ratio = training_dataset_ratio
        self.nrm = nrm

        # one 10-value spread feature row per day of the active data set
        self.stock_price_final = []
        # offset inside the current episode
        self.local_current_step = None
        # index of the day on which the current episode started
        self.global_current_step = None
        self.purpose = None
        # purpose of the previous reset (attribute name kept, typo included,
        # so code that already reads it keeps working)
        self.old_prupoese = None
        self.one_episode_num_step = one_episode_num_step

        # read two company's stock price
        self.load_data()

        # extract close price from two stock price and convert to numpy array
        self.extract_close_price()

        # split two stock close price into training set and testing set
        self.split_dataset()

        # record system's info
        self.system_holding_stock1_unit = None
        self.system_holding_stock1_avg_price = None
        self.system_holding_stock1_wait2sell = False
        self.system_holding_stock2_unit = None
        self.system_holding_stock2_avg_price = None
        self.system_holding_stock2_wait2sell = False

    # read two company's stock price
    def load_data(self):
        """Read ``<company>.csv`` of both companies into data frames.

        Raises:
            Exception: "Cannot load <file>" when a file cannot be read; the
                underlying error is chained as the cause.
        """

        self.stock_price1 = self._read_price_file(self.company1)
        self.stock_price2 = self._read_price_file(self.company2)

    # read one company's csv file
    @staticmethod
    def _read_price_file(company):
        """Return the data frame read from ``company + ".csv"``."""

        file_name = company + ".csv"
        try:
            return pd.read_csv(file_name)
        # `except Exception` (not a bare except) so Ctrl-C is not swallowed
        except Exception as err:
            raise Exception("Cannot load {}".format(file_name)) from err

    # extract close price from two stock price and convert to numpy array
    def extract_close_price(self):
        """Replace both data frames by the numpy values of ``price_col``.

        Raises:
            Exception: when ``price_col`` cannot be selected from a data frame.
        """

        try:
            price1 = self.stock_price1[self.price_col]
            price2 = self.stock_price2[self.price_col]
        except Exception as err:
            raise Exception("Cannot extract stock price column: {}.".format(self.price_col)) from err

        self.stock_price1 = price1.values
        self.stock_price2 = price2.values
        # the split index is derived from the length of the first series only
        self.stock_price_length = len(self.stock_price1)

    # split two stock close price into training set and testing set
    def split_dataset(self):
        """Split both series at the same index and print a summary."""

        index = round(self.stock_price_length * self.training_dataset_ratio)

        self.stock_price1_train, self.stock_price1_test = self.stock_price1[:index], self.stock_price1[index:]
        self.stock_price2_train, self.stock_price2_test = self.stock_price2[:index], self.stock_price2[index:]

        print("===============Environment Info===============")
        print("Stock1: {}".format(self.company1))
        print("Stock2: {}".format(self.company2))
        print("Price Column: {}".format(self.price_col))
        print("Ngative Return Multiplier: {}".format(self.nrm))
        print("Number of Days in One Episode: {}".format(self.one_episode_num_step))
        print("Total number of day for training: {}".format(str(len(self.stock_price1_train))))
        print("Total number of day for testing: {}".format(str(len(self.stock_price1_test))))
        print("==============================================")

    # reset environment: must specify purpose for training or testing
    def reset(self, purpose):
        """Start a new episode and return its first state.

        Args:
            purpose: "train" selects the training data; any other value
                selects the testing data.

        Returns:
            numpy array with the 14 state values of the episode's first day.
        """

        self.purpose = purpose

        # the feature table is rebuilt only when the data set changes
        if self.purpose != self.old_prupoese:
            self.prepare_final_data()
            self.old_prupoese = self.purpose
            self.global_current_step = -1

        # every reset moves the episode start one day forward
        self.global_current_step += 1
        self.local_current_step = 0

        # every episode starts without any position
        self.system_holding_stock1_unit = 0
        self.system_holding_stock1_avg_price = 0
        self.system_holding_stock1_wait2sell = False
        self.system_holding_stock2_unit = 0
        self.system_holding_stock2_avg_price = 0
        self.system_holding_stock2_wait2sell = False

        # wrap around once a full episode no longer fits behind the start day;
        # `>=` (instead of `==`) also covers data shorter than one episode
        if self.global_current_step >= len(self.stock_price_final) - self.one_episode_num_step + 1:
            self.global_current_step = 0

        return self.get_new_state(self.global_current_step)

    # prepare train data
    def prepare_final_data(self):
        """Rebuild ``stock_price_final`` for the data set of the current purpose.

        For every day one row of 10 values is stored:
        [spread, spread - previous spread,
         then for each window in SPREAD_WINDOWS:
         mean of the spreads of the preceding days, spread / that mean].

        The window means exclude the current day and use fewer days near the
        start of the series; on the first day the mean is the spread itself.
        The ratio is a plain division, so a zero mean is not guarded against.
        """

        # spread of two stock
        if self.purpose == "train":
            spread = self.stock_price1_train - self.stock_price2_train
        else:
            spread = self.stock_price1_test - self.stock_price2_test

        # start from an empty table: appending to the previous purpose's rows
        # would pair test prices with train features after a train -> test switch
        rows = []

        for idx, current_spread in enumerate(spread):

            # daily return of spread (0 on the first day)
            yesterday_spread = spread[idx - 1] if idx >= 1 else current_spread
            one_step = [current_spread, current_spread - yesterday_spread]

            for window in self.SPREAD_WINDOWS:
                if idx != 0:
                    spread_mean = np.mean(spread[max(idx - window, 0):idx])
                else:
                    spread_mean = current_spread
                one_step.append(spread_mean)
                one_step.append(current_spread / spread_mean)

            rows.append(np.array(one_step, dtype=float))

        self.stock_price_final = rows

    # get next new state
    def get_new_state(self, step_idx):
        """Return the 14-value state of day ``step_idx``.

        The two prices and the two current holdings are placed in front of
        the 10 spread features of that day (see the class docstring).
        """

        if self.purpose == "train":
            stock_price1 = self.stock_price1_train[step_idx]
            stock_price2 = self.stock_price2_train[step_idx]
        else:
            stock_price1 = self.stock_price1_test[step_idx]
            stock_price2 = self.stock_price2_test[step_idx]

        holding_part = np.array([stock_price1,
                                 self.system_holding_stock1_unit,
                                 stock_price2,
                                 self.system_holding_stock2_unit])
        return np.insert(self.stock_price_final[step_idx], 0, holding_part)

    # read the position of one stock
    def _get_position(self, stock_no):
        """Return (unit, avg_price, wait2sell) of stock ``stock_no`` (1 or 2)."""

        prefix = "system_holding_stock{}_".format(stock_no)
        return (getattr(self, prefix + "unit"),
                getattr(self, prefix + "avg_price"),
                getattr(self, prefix + "wait2sell"))

    # write the position of one stock
    def _set_position(self, stock_no, unit, avg_price, wait2sell):
        """Store (unit, avg_price, wait2sell) of stock ``stock_no`` (1 or 2)."""

        prefix = "system_holding_stock{}_".format(stock_no)
        setattr(self, prefix + "unit", unit)
        setattr(self, prefix + "avg_price", avg_price)
        setattr(self, prefix + "wait2sell", wait2sell)

    # average entry price after adding units to a position
    @staticmethod
    def _merged_avg_price(unit, avg_price, quantity, price):
        """Unit-weighted average price of the old and the added units (0 if none)."""

        total = unit + quantity
        if total == 0:
            return 0
        return ((unit * avg_price) + (quantity * price)) / total

    # buy `quantity` units of one stock
    def _buy_stock(self, stock_no, quantity, price):
        """Buy ``quantity`` units of stock ``stock_no`` at ``price``.

        A long position is enlarged (no reward). Otherwise the purchase first
        covers the short position, realizing ``(avg_price - price)`` per
        covered unit, and any remaining units open a new long position.

        Returns:
            the reward realized by covering short units (0 if none).
        """

        unit, avg_price, wait2sell = self._get_position(stock_no)
        reward = 0

        # already buy some units of the stock
        if wait2sell is True:
            avg_price = self._merged_avg_price(unit, avg_price, quantity, price)
            unit += quantity

        # already sell some units of the stock (or holding nothing)
        else:
            covered = min(unit, quantity)
            reward = (avg_price - price) * covered
            unit -= covered
            quantity -= covered

            if unit > 0:
                # still short: flag and average price stay as they are
                pass

            elif unit == 0:
                avg_price = 0

                # check if system want to buy more
                if quantity > 0:
                    wait2sell = True
                    avg_price = self._merged_avg_price(unit, avg_price, quantity, price)
                    unit += quantity

            else:
                raise Exception("The number of units of stock{} which system holds should not be negative.".format(stock_no))

        self._set_position(stock_no, unit, avg_price, wait2sell)
        return reward

    # sell `quantity` units of one stock
    def _sell_stock(self, stock_no, quantity, price):
        """Sell ``quantity`` units of stock ``stock_no`` at ``price``.

        A long position is closed first, realizing ``(price - avg_price)``
        per closed unit, and any remaining units open a new short position.
        Otherwise the short position is enlarged (no reward).

        Returns:
            the reward realized by closing long units (0 if none).
        """

        unit, avg_price, wait2sell = self._get_position(stock_no)
        reward = 0

        # already buy some units of the stock
        if wait2sell is True:
            closed = min(unit, quantity)
            reward = (price - avg_price) * closed
            unit -= closed
            quantity -= closed

            if unit > 0:
                # still long: flag and average price stay as they are
                pass

            elif unit == 0:
                wait2sell = False
                avg_price = 0

                # check if system want to sell more
                if quantity > 0:
                    avg_price = self._merged_avg_price(unit, avg_price, quantity, price)
                    unit += quantity

            else:
                raise Exception("The number of units of stock{} which system holds should not be negative.".format(stock_no))

        # already sell some units of the stock (or holding nothing)
        else:
            wait2sell = False
            avg_price = self._merged_avg_price(unit, avg_price, quantity, price)
            unit += quantity

        self._set_position(stock_no, unit, avg_price, wait2sell)
        return reward

    # calculate reward given system's action
    def calculate_reward(self, step_idx, action):
        """Apply ``action`` at the prices of day ``step_idx`` and return the reward.

        Args:
            step_idx: index of the day whose prices are traded on.
            action: ``[pattern, [quantity1, quantity2]]`` where pattern is
                0 => buy stock1 and sell stock2
                1 => sell stock1 and buy stock2
                2 => no operation
                and the quantities are the units traded of stock1 / stock2
                (0~10 each, 11*11 combinations).

        Returns:
            the profit realized on both stocks by this action; while the
            purpose is "train" a negative value is multiplied by ``nrm``.

        Raises:
            Exception: "No pattern match." for any other pattern value.
        """

        if self.purpose == "train":
            stock_price1 = self.stock_price1_train[step_idx]
            stock_price2 = self.stock_price2_train[step_idx]
        else:
            stock_price1 = self.stock_price1_test[step_idx]
            stock_price2 = self.stock_price2_test[step_idx]

        pattern = action[0]
        quantity = action[1]
        stock1_quantity = quantity[0]
        stock2_quantity = quantity[1]

        # the rewards of both legs are added; assigning them one after the
        # other would silently drop the profit or loss realized on stock1

        # buy stock1 and sell stock2
        if pattern == 0:
            reward = self._buy_stock(1, stock1_quantity, stock_price1)
            reward = reward + self._sell_stock(2, stock2_quantity, stock_price2)

        # sell stock1 and buy stock2
        elif pattern == 1:
            reward = self._sell_stock(1, stock1_quantity, stock_price1)
            reward = reward + self._buy_stock(2, stock2_quantity, stock_price2)

        # no operation
        elif pattern == 2:
            reward = 0

        else:
            raise Exception("No pattern match.")

        if self.purpose == "train":
            reward = reward * self.nrm if reward < 0 else reward

        return reward

    # go next step: must provide action
    def step(self, action):
        """Advance one day, apply ``action`` and return (new_state, reward, done).

        ``done`` is True on the last step of the episode or on the last day
        of the data set.
        """

        self.local_current_step += 1
        step_idx = self.global_current_step + self.local_current_step

        # calculate reward given action; this updates the holdings, so it has
        # to run before the new state (which contains the holdings) is built
        reward = self.calculate_reward(step_idx, action)

        # get new state
        new_state = self.get_new_state(step_idx)

        # is done
        episode_finished = self.local_current_step == self.one_episode_num_step - 1
        data_finished = step_idx == len(self.stock_price_final) - 1
        done = episode_finished or data_finished

        return new_state, reward, done

## Pattern Agent

- **Pattern Agent's State (14) :**
 - current stock1 price
 - number of units of stock1 which system holding
 - current stock2 price
 - number of units of stock2 which system holding
 - current spread
 - spread return
 - spread mean during past 15 days
 - current spread / spread mean during past 15 days
 - spread mean during past 10 days
 - current spread / spread mean during past 10 days
 - spread mean during past 7 days
 - current spread / spread mean during past 7 days
 - spread mean during past 5 days
 - current spread / spread mean during past 5 days
 
    
- **Pattern Agent's Action (3) :**
 - buy stock1 and sell stock2
 - sell stock1 and buy stock2
 - no operation

In [None]:
class PatternAgent:
    """Double deep Q-network agent that picks the trading pattern.

    Two identically shaped networks are kept: ``update_critic`` is trained
    and used to choose actions, ``target_critic`` supplies the Q-value of the
    chosen next action and is refreshed with ``set_target_critic_weight``.

    Experiences are stored as rows of a ring buffer laid out as
    ``[state1 (state_dim), action1 (1), reward1 (1), state2 (state_dim), done (1)]``.
    """

    # constructor
    def __init__(self,
                 state_dim,
                 action_dim,
                 learning_rate,
                 gamma,
                 exploration_rate,
                 exploration_decay,
                 exploration_min,
                 replay_buffer_size,
                 batch_size):
        """Store the hyper-parameters and build both critics.

        Args:
            state_dim: number of values in a state (network input size).
            action_dim: number of actions (network output size).
            learning_rate: learning rate of the Adam optimizer.
            gamma: discount factor applied to the next state's Q-value.
            exploration_rate: initial probability of a random action.
            exploration_decay: factor applied to the rate after each training.
            exploration_min: lower bound of the exploration rate.
            replay_buffer_size: number of experiences the buffer can hold.
            batch_size: number of experiences sampled per training call.
        """

        # input and output dimension
        self.state_dim = state_dim
        self.action_dim = action_dim

        # learning rate
        self.learning_rate = learning_rate

        # discount q value
        self.gamma = gamma

        # exploration
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.exploration_min = exploration_min

        # replay buffer
        # an experience: [state1(state_dim), action1(1), reward1(1), state2(state_dim), done(1)]
        self.replay_buffer_size = replay_buffer_size
        self.batch_size = batch_size
        self.replay_buffer = np.empty((self.replay_buffer_size, 2 * self.state_dim + 3))
        self.replay_buffer_counter = 0

        # double deep Q network
        self.update_critic = self.build_nn(model_name="PatternAgent-UpdateCritic")
        self.update_critic.summary()
        self.target_critic = self.build_nn(model_name="PatternAgent-TargetCritic")
        self.target_critic.summary()

    # build neural network as model
    def build_nn(self, model_name):
        """Return a compiled network: state_dim -> 24 -> 12 -> action_dim."""

        inputs = keras.Input(shape=(self.state_dim, ), name="InputLayer")
        hidden1 = keras.layers.Dense(units=24, activation="relu", name="HiddenLayer1")(inputs)
        hidden2 = keras.layers.Dense(units=12, activation="relu", name="HiddenLayer2")(hidden1)
        outputs = keras.layers.Dense(units=self.action_dim, activation="linear", name="OutputLayer")(hidden2)

        model = keras.models.Model(inputs=inputs, outputs=outputs, name=model_name)
        model.compile(loss="mse", optimizer=keras.optimizers.Adam(learning_rate=self.learning_rate))

        return model

    # sample action
    def sample_action(self, state):
        """Return an action index in ``range(action_dim)`` (epsilon-greedy).

        With probability ``exploration_rate`` a uniformly random action is
        returned, otherwise the action the update critic values most. Both
        branches return a plain int.
        """

        # explore
        if np.random.uniform(0, 1) < self.exploration_rate:
            return int(np.random.randint(self.action_dim))

        # exploit
        state = np.reshape(state, (1, self.state_dim))
        action_value = self.update_critic.predict(state)[0]
        return int(np.argmax(action_value))

    # store experience
    def store_experience(self, state1, action1, reward1, state2, done):
        """Write one experience into the ring buffer (oldest entry is replaced)."""

        dim = self.state_dim
        experience = np.empty(2 * dim + 3)
        experience[0:dim] = state1[:]
        # .item() accepts a scalar as well as a one-element array
        experience[dim] = np.asarray(action1).item()
        experience[dim + 1] = reward1
        experience[dim + 2:2 * dim + 2] = state2[:]
        experience[2 * dim + 2] = int(done)

        self.replay_buffer[self.replay_buffer_counter % self.replay_buffer_size] = experience
        self.replay_buffer_counter += 1

    # copy update critic's weight to target critic
    def set_target_critic_weight(self):
        """Copy the update critic's weights into the target critic."""

        self.target_critic.set_weights(self.update_critic.get_weights())

    # train update critic
    def train(self):
        """Fit the update critic on a random batch of stored experiences.

        Nothing happens until at least ``batch_size`` experiences were stored.
        Experiences are sampled with replacement and fitted one at a time;
        afterwards the exploration rate is decayed (not below
        ``exploration_min``).
        """

        if self.replay_buffer_counter < self.batch_size:
            return

        dim = self.state_dim
        stored = min(self.replay_buffer_counter, self.replay_buffer_size)
        mask = np.random.choice(a=stored, size=self.batch_size)

        for experience in self.replay_buffer[mask]:

            state1 = experience[0:dim]
            action1 = experience[dim]
            reward1 = experience[dim + 1]
            state2 = experience[dim + 2:2 * dim + 2]
            # the flag comes back from the buffer as a float (0.0 / 1.0), so
            # it is tested by value; `done is True` could never be true
            done = experience[2 * dim + 2] != 0

            if done:
                # terminal step: there is no next state to bootstrap from
                target_reward = reward1

            else:
                # select action by update critic
                state2 = np.reshape(state2, (1, dim))
                action_value = self.update_critic.predict(state2)[0]
                action_idx = np.argmax(action_value)

                # estimate q value by target critic
                action_value = self.target_critic.predict(state2)[0]
                q_value = action_value[action_idx]

                # calculate target reward
                target_reward = reward1 + self.gamma * q_value

            # fit update critic with revised action value: only the entry of
            # the taken action is moved, the others keep the current estimate
            state1 = np.reshape(state1, (1, dim))
            action_value = self.update_critic.predict(state1)[0]
            action_value[int(action1)] = target_reward
            action_value = np.reshape(action_value, (1, self.action_dim))
            self.update_critic.fit(x=state1, y=action_value, epochs=1, verbose=0)

        # exploration rate decay
        self.exploration_rate = max(self.exploration_rate * self.exploration_decay, self.exploration_min)

## Quantity Agent

- **Quantity Agent's State (14+1) :**
 - current stock1 price
 - number of units of stock1 which system holding
 - current stock2 price
 - number of units of stock2 which system holding
 - current spread
 - spread return
 - spread mean during past 15 days
 - current spread / spread mean during past 15 days
 - spread mean during past 10 days
 - current spread / spread mean during past 10 days
 - spread mean during past 7 days
 - current spread / spread mean during past 7 days
 - spread mean during past 5 days
 - current spread / spread mean during past 5 days
 - current pattern



- **Quantity Agent's Action (11*11) :**
 - [0, 0]
 - [0, 1]
 - [0, 2]
 - [0, 3]
 - ...
 - [1, 0]
 - [1, 1]
 - [1, 2]
 - [1, 3]
 - ...
 - [9, 0]
 - [9, 1]
 - [9, 2]
 - [9, 3]
 - ...
 - [10, 7]
 - [10, 8]
 - [10, 9]
 - [10, 10]

In [None]:
class QuantityAgent:
    """Double deep Q-network agent that picks the traded quantities.

    An action is an index into ``action_list``, the list of all
    ``[quantity1, quantity2]`` pairs with both quantities in 0..10.

    Two identically shaped networks are kept: ``update_critic`` is trained
    and used to choose actions, ``target_critic`` supplies the Q-value of the
    chosen next action and is refreshed with ``set_target_critic_weight``.

    Experiences are stored as rows of a ring buffer laid out as
    ``[state1 (state_dim), action1 (1), reward1 (1), state2 (state_dim), done (1)]``.
    """

    # constructor
    def __init__(self,
                 state_dim,
                 action_dim,
                 learning_rate,
                 gamma,
                 exploration_rate,
                 exploration_decay,
                 exploration_min,
                 replay_buffer_size,
                 batch_size):
        """Store the hyper-parameters, build both critics and the action list.

        Args:
            state_dim: number of values in a state (network input size).
            action_dim: number of actions (network output size).
            learning_rate: learning rate of the Adam optimizer.
            gamma: discount factor applied to the next state's Q-value.
            exploration_rate: initial probability of a random action.
            exploration_decay: factor applied to the rate after each training.
            exploration_min: lower bound of the exploration rate.
            replay_buffer_size: number of experiences the buffer can hold.
            batch_size: number of experiences sampled per training call.
        """

        # input and output dimension
        self.state_dim = state_dim
        self.action_dim = action_dim

        # learning rate
        self.learning_rate = learning_rate

        # discount q value
        self.gamma = gamma

        # exploration
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.exploration_min = exploration_min

        # replay buffer
        # an experience: [state1(state_dim), action1(1), reward1(1), state2(state_dim), done(1)]
        self.replay_buffer_size = replay_buffer_size
        self.batch_size = batch_size
        self.replay_buffer = np.empty((self.replay_buffer_size, 2 * self.state_dim + 3))
        self.replay_buffer_counter = 0

        # double deep Q network
        self.update_critic = self.build_nn(model_name="QuantityAgent-UpdateCritic")
        self.update_critic.summary()
        self.target_critic = self.build_nn(model_name="QuantityAgent-TargetCritic")
        self.target_critic.summary()

        # action list: [[0,0], [0,1], ..., [10,10]]
        self.action_list = self.get_action_list()

    # build neural network as model
    def build_nn(self, model_name):
        """Return a compiled network: state_dim -> 28 -> 14 -> action_dim."""

        inputs = keras.Input(shape=(self.state_dim, ), name="InputLayer")
        hidden1 = keras.layers.Dense(units=28, activation="relu", name="HiddenLayer1")(inputs)
        hidden2 = keras.layers.Dense(units=14, activation="relu", name="HiddenLayer2")(hidden1)
        outputs = keras.layers.Dense(units=self.action_dim, activation="linear", name="OutputLayer")(hidden2)

        model = keras.models.Model(inputs=inputs, outputs=outputs, name=model_name)
        model.compile(loss="mse", optimizer=keras.optimizers.Adam(learning_rate=self.learning_rate))

        return model

    # generate action list
    def get_action_list(self):
        """Return ``[[0, 0], [0, 1], ..., [10, 10]]`` (121 pairs, first quantity slowest)."""

        return [[i, j] for i in range(0, 11) for j in range(0, 11)]

    # sample action
    def sample_action(self, state):
        """Return a ``[quantity1, quantity2]`` pair (epsilon-greedy).

        With probability ``exploration_rate`` a uniformly random pair of the
        action list is returned, otherwise the pair the update critic values
        most.
        """

        # explore
        if np.random.uniform(0, 1) <= self.exploration_rate:
            action_idx = np.random.randint(len(self.action_list))

        # exploit
        else:
            state = np.reshape(state, (1, self.state_dim))
            action_value = self.update_critic.predict(state)[0]
            action_idx = np.argmax(action_value)

        return self.get_quantity_list(action_idx)

    # translate action index to quantity list
    def get_quantity_list(self, action_idx):
        """Return a copy of the quantity pair stored at ``action_idx``."""

        # a copy, so a caller changing the pair cannot corrupt the action list
        return list(self.action_list[int(action_idx)])

    # translate quantity list to action index
    def get_action_idx(self, quantity_list):
        """Return the index of ``quantity_list`` in the action list.

        Lists, tuples and 1-D numpy arrays are accepted. ``None`` is returned
        when the pair is not part of the action list.
        """

        try:
            return self.action_list.index(list(quantity_list))
        except (TypeError, ValueError):
            return None

    # store experience
    def store_experience(self, state1, quantity_list, reward1, state2, done):
        """Write one experience into the ring buffer (oldest entry is replaced).

        Raises:
            ValueError: when ``quantity_list`` is not part of the action list.
        """

        action_idx = self.get_action_idx(quantity_list)
        if action_idx is None:
            raise ValueError("Unknown quantity list: {}".format(quantity_list))

        dim = self.state_dim
        experience = np.empty(2 * dim + 3)
        experience[0:dim] = state1[:]
        experience[dim] = action_idx
        experience[dim + 1] = reward1
        experience[dim + 2:2 * dim + 2] = state2[:]
        experience[2 * dim + 2] = int(done)

        self.replay_buffer[self.replay_buffer_counter % self.replay_buffer_size] = experience
        self.replay_buffer_counter += 1

    # copy update critic's weight to target critic
    def set_target_critic_weight(self):
        """Copy the update critic's weights into the target critic."""

        self.target_critic.set_weights(self.update_critic.get_weights())

    # train update critic
    def train(self):
        """Fit the update critic on a random batch of stored experiences.

        Nothing happens until at least ``batch_size`` experiences were stored.
        Experiences are sampled with replacement and fitted one at a time;
        afterwards the exploration rate is decayed (not below
        ``exploration_min``).
        """

        if self.replay_buffer_counter < self.batch_size:
            return

        dim = self.state_dim
        stored = min(self.replay_buffer_counter, self.replay_buffer_size)
        mask = np.random.choice(a=stored, size=self.batch_size)

        for experience in self.replay_buffer[mask]:

            state1 = experience[0:dim]
            action1 = experience[dim]
            reward1 = experience[dim + 1]
            state2 = experience[dim + 2:2 * dim + 2]
            # the flag comes back from the buffer as a float (0.0 / 1.0), so
            # it is tested by value; `done is True` could never be true
            done = experience[2 * dim + 2] != 0

            if done:
                # terminal step: there is no next state to bootstrap from
                target_reward = reward1

            else:
                # select action by update critic
                state2 = np.reshape(state2, (1, dim))
                action_value = self.update_critic.predict(state2)[0]
                action_idx = np.argmax(action_value)

                # estimate q value by target critic
                action_value = self.target_critic.predict(state2)[0]
                q_value = action_value[action_idx]

                # calculate target reward
                target_reward = reward1 + self.gamma * q_value

            # fit update critic with revised action value: only the entry of
            # the taken action is moved, the others keep the current estimate
            state1 = np.reshape(state1, (1, dim))
            action_value = self.update_critic.predict(state1)[0]
            action_value[int(action1)] = target_reward
            action_value = np.reshape(action_value, (1, self.action_dim))
            self.update_critic.fit(x=state1, y=action_value, epochs=1, verbose=0)

        # exploration rate decay
        self.exploration_rate = max(self.exploration_rate * self.exploration_decay, self.exploration_min)

## System Class

- **System's Mission :** System should implement pair-trading on two stocks, 'AAPL' and 'GOOG', and gain profits from it.


- **System Composition :** The system consists of two agents: a pattern agent, which decides the trading pattern, and a quantity agent, which decides how many units of each of the two stocks to buy and sell.

- **System's State (14) :**
 - current stock1 price
 - number of units of stock1 which system holding
 - current stock2 price
 - number of units of stock2 which system holding
 - current spread
 - spread return
 - spread mean during past 15 days
 - current spread / spread mean during past 15 days
 - spread mean during past 10 days
 - current spread / spread mean during past 10 days
 - spread mean during past 7 days
 - current spread / spread mean during past 7 days
 - spread mean during past 5 days
 - current spread / spread mean during past 5 days
 
- **System's Action (3 x (11 x 11)) :**
 - [Current Pattern, [Quantity1, Quantity2]]

In [None]:
class System:
    """Two-agent trading system: a pattern agent plus a quantity agent.

    The pattern agent acts on the raw state. The quantity agent acts on the
    raw state with the chosen pattern appended as one extra element.
    """

    # constructor
    def __init__(self, 
                 pattern_agent_state_dim,
                 pattern_agent_action_dim,
                 pattern_agent_learning_rate,
                 pattern_agent_gamma,
                 pattern_agent_exploration_rate,
                 pattern_agent_exploration_decay,
                 pattern_agent_exploration_min,
                 pattern_agent_replay_buffer_size, 
                 pattern_agent_batch_size,
                 quantity_agent_state_dim,
                 quantity_agent_action_dim,
                 quantity_agent_learning_rate,
                 quantity_agent_gamma,
                 quantity_agent_exploration_rate, 
                 quantity_agent_exploration_decay, 
                 quantity_agent_exploration_min, 
                 quantity_agent_replay_buffer_size, 
                 quantity_agent_batch_size):
        """Build both agents from their prefixed hyperparameters.

        Every ``pattern_agent_*`` argument is forwarded to ``PatternAgent``
        and every ``quantity_agent_*`` argument to ``QuantityAgent`` under
        the same name with the prefix removed.
        """

        # settings for the agent that chooses the pattern
        pattern_settings = dict(
            state_dim=pattern_agent_state_dim,
            action_dim=pattern_agent_action_dim,
            learning_rate=pattern_agent_learning_rate,
            gamma=pattern_agent_gamma,
            exploration_rate=pattern_agent_exploration_rate,
            exploration_decay=pattern_agent_exploration_decay,
            exploration_min=pattern_agent_exploration_min,
            replay_buffer_size=pattern_agent_replay_buffer_size,
            batch_size=pattern_agent_batch_size,
        )
        self.pattern_agent = PatternAgent(**pattern_settings)

        # settings for the agent that chooses the quantities
        quantity_settings = dict(
            state_dim=quantity_agent_state_dim,
            action_dim=quantity_agent_action_dim,
            learning_rate=quantity_agent_learning_rate,
            gamma=quantity_agent_gamma,
            exploration_rate=quantity_agent_exploration_rate,
            exploration_decay=quantity_agent_exploration_decay,
            exploration_min=quantity_agent_exploration_min,
            replay_buffer_size=quantity_agent_replay_buffer_size,
            batch_size=quantity_agent_batch_size,
        )
        self.quantity_agent = QuantityAgent(**quantity_settings)

    # sample an action
    def sample_action(self, state):
        """Ask both agents for their part of the action.

        Layout of the state the system receives:
        state = [
            current stock1 price
            number of units of stock1 the system is holding
            current stock2 price
            number of units of stock2 the system is holding
            current spread
            spread return
            spread mean during past 15 days
            current spread / spread mean during past 15 days
            spread mean during past 10 days
            current spread / spread mean during past 10 days
            spread mean during past 7 days
            current spread / spread mean during past 7 days
            spread mean during past 5 days
            current spread / spread mean during past 5 days
        ]

        Layout of the action the system returns:
        action = [
            pattern,
            [quantity1, quantity2]
        ]
        """

        # the pattern is chosen first, from the raw state
        chosen_pattern = self.pattern_agent.sample_action(state)

        # the quantity agent sees the raw state plus the chosen pattern
        augmented_state = np.append(state, chosen_pattern)
        chosen_quantity = self.quantity_agent.sample_action(augmented_state)

        return [chosen_pattern, chosen_quantity]

    # store experience
    def store_experience(self, state1, action1, reward1, state2, done):
        """Record one transition in each agent's replay buffer.

        ``action1`` is the ``[pattern, quantity]`` pair produced by
        ``sample_action``. Both agents are given the same ``reward1`` and
        ``done``.
        """

        pattern, quantity = action1[0], action1[1]

        # pattern agent: raw states and the pattern it chose
        self.pattern_agent.store_experience(state1, pattern, reward1, state2, done)

        # quantity agent: states extended with a pattern.
        # NOTE(review): the pattern appended to state2 is a fresh sample from
        # the pattern agent, which may differ from the pattern actually used
        # on the next step - confirm this is intended.
        current_input = np.append(state1, pattern)
        next_pattern = self.pattern_agent.sample_action(state2)
        next_input = np.append(state2, next_pattern)
        self.quantity_agent.store_experience(current_input, quantity, reward1, next_input, done)

    # train system
    def train(self):
        """Refresh each agent's target critic, then train it (pattern first)."""

        for agent in (self.pattern_agent, self.quantity_agent):
            agent.set_target_critic_weight()
            agent.train()

## Pair Trading Game Class

In [None]:
class PairTradingGame:
    """Connect a ``System`` to an ``Environment`` and run the training loop.

    All hyperparameters are fixed in the constructor. The environment is
    built for the companies "AAPL" and "GOOG" using the "Close" column.
    """

    # constructor
    def __init__(self):
        """Build the system and the environment, and set up reward tracking."""

        # build system: the pattern agent sees the 14-element state and picks
        # one of 3 patterns; the quantity agent sees that state plus the
        # pattern (15 elements) and picks one of 11*11 quantity pairs
        self.system = System(pattern_agent_state_dim=14,
                             pattern_agent_action_dim=3,
                             pattern_agent_learning_rate=0.001,
                             pattern_agent_gamma=0.95,
                             pattern_agent_exploration_rate=0.98,
                             pattern_agent_exploration_decay=0.995,
                             pattern_agent_exploration_min=0.01,
                             pattern_agent_replay_buffer_size=600,
                             pattern_agent_batch_size=55,
                             quantity_agent_state_dim=15,
                             quantity_agent_action_dim=11*11,
                             quantity_agent_learning_rate=0.001,
                             quantity_agent_gamma=0.95,
                             quantity_agent_exploration_rate=0.98,
                             quantity_agent_exploration_decay=0.995,
                             quantity_agent_exploration_min=0.01,
                             quantity_agent_replay_buffer_size=700,
                             quantity_agent_batch_size=80
                             )

        # nrm is kept on the instance because the plot legend shows it
        self.nrm = 1

        # build environment
        self.env = Environment(company1="AAPL",
                               company2="GOOG",
                               price_col="Close",
                               training_dataset_ratio=0.75,
                               nrm=self.nrm,
                               one_episode_num_step=40)

        # total training episode for system, and store total reward in each episode
        self.total_training_episode = 1000
        self.training_episode_reward = []

    # start training system
    def start_training(self):
        """Run ``total_training_episode`` episodes against the environment.

        Each step samples an action, applies it to the environment and stores
        the transition. At the end of each episode the total reward is
        printed, appended to ``training_episode_reward``, and the system is
        trained once.
        """

        for episode in range(self.total_training_episode):

            # reset environment
            state1 = self.env.reset(purpose="train")

            # total reward in this episode
            total_reward = 0

            # a flag to indicate the end of episode
            done = False

            # Truthiness test rather than `done is False`: an identity check
            # fails for a NumPy boolean (np.False_ is not the False
            # singleton), which would end every episode after one step.
            while not done:

                # system will generate an action given current state
                action1 = self.system.sample_action(state1)

                # environment will generate info given current action
                state2, reward1, done = self.env.step(action1)

                # store this step (experience) into replay buffer
                self.system.store_experience(state1, action1, reward1, state2, done)

                # update variable
                state1 = state2
                total_reward += reward1

            # when an episode ends ...
            print("#%.4d Episode's Total Reward: %.4d" %(episode, total_reward))
            self.training_episode_reward.append(total_reward)
            self.system.train()

    # show training result
    def show_training_result(self):
        """Plot the recorded total reward of every finished episode."""

        y = self.training_episode_reward

        # The x axis follows the number of rewards actually recorded, so the
        # plot still works when training was interrupted, has not run yet, or
        # ran more than once (x and y must have equal length for plt.plot).
        x = list(range(len(y)))
        plt.plot(x, y, label="nrm = {}".format(self.nrm))

        plt.title("Double DQN's Performance on Pair Trading")
        plt.xlabel("Episode")
        plt.ylabel("Total Reward")
        plt.legend()

        plt.show()

## Main Driver

In [None]:
# Build the game: this constructs the two-agent System and the Environment
# (the Environment reads "AAPL.csv" and "GOOG.csv" when it is created).
pair_trading_game = PairTradingGame()

# Run all training episodes, then plot the total reward of each episode.
pair_trading_game.start_training()
pair_trading_game.show_training_result()

# Testing is disabled: PairTradingGame as defined above has no
# start_testing / show_testing_result methods.
# pair_trading_game.start_testing()
# pair_trading_game.show_testing_result()