## Import Packages

In [None]:
import tensorflow as tf
import numpy as np
import keras
import random

## Agent

In [None]:
class Agent():
    """Q-learning agent with an online ("update") network and a target network.

    The agent keeps a fixed-size circular replay buffer of
    (state, action, new_state, reward, done) transitions, picks actions
    epsilon-greedily, and fits the update network on random mini-batches
    whose targets are bootstrapped from the target network.

    Actions are encoded as -1, 0 and 1; the network output at index ``i``
    corresponds to action ``i - 1``.
    """

    def __init__(self):
        """Set hyper-parameters, allocate the replay buffers and build both networks."""
        self.state_dim = 20
        # NOTE: the -1/0/1 action encoding used by sample_action and
        # train_update_nn assumes exactly three actions.
        self.action_dim = 3

        # Circular replay memory: exp_idx counts every transition ever stored,
        # the write position is exp_idx modulo the buffer size.
        self.replay_buffer_size = 600
        self.exp_idx = 0
        self.state_buffer = np.zeros((self.replay_buffer_size, self.state_dim))
        self.action_buffer = np.zeros((self.replay_buffer_size))
        self.new_state_buffer = np.zeros((self.replay_buffer_size, self.state_dim))
        self.reward_buffer = np.zeros((self.replay_buffer_size))
        self.done_buffer = np.zeros((self.replay_buffer_size))
        self.batch_exp_size = 32

        self.reward_discount = 0.95
        self.learning_rate = 0.0001
        # Epsilon-greedy schedule: multiplied by exploration_decay after every
        # training step, never dropping below exploration_min.
        self.exploration_rate = 1.0
        self.exploration_min = 0.01
        self.exploration_decay = 0.995

        self.update_nn = self.build_nn()
        self.target_nn = self.build_nn()

    def build_nn(self):
        """Build and compile one Q-network.

        The model takes inputs of shape ``(state_dim, 1)`` per sample, runs
        them through two stacked LSTM layers and three ReLU dense layers, and
        outputs one linear value per action. It is compiled with MSE loss and
        Adam at ``self.learning_rate``. The model summary is printed.

        Returns:
            The compiled ``keras`` Sequential model.
        """
        model = keras.models.Sequential()
        model.add(keras.layers.LSTM(units=256, input_shape=(self.state_dim, 1), return_sequences=True))
        model.add(keras.layers.LSTM(units=128, return_sequences=False))
        model.add(keras.layers.Dense(units=128, activation="relu"))
        model.add(keras.layers.Dense(units=64, activation="relu"))
        model.add(keras.layers.Dense(units=32, activation="relu"))
        # One output per action instead of a hard-coded 3.
        model.add(keras.layers.Dense(units=self.action_dim, activation="linear"))

        model.compile(loss="mse", optimizer=keras.optimizers.Adam(learning_rate=self.learning_rate))
        model.summary()

        return model

    def sample_action(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: A single state holding ``state_dim`` values. Any layout
                with that many elements is accepted (flat, or already shaped
                ``(1, state_dim, 1)``); it is reshaped before prediction.

        Returns:
            -1, 0 or 1. With probability ``exploration_rate`` the action is
            drawn uniformly at random, otherwise it is the arg-max of the
            update network's output shifted by -1.
        """
        if np.random.random() <= self.exploration_rate:
            return np.random.choice([1, 0, -1])

        # The network expects (batch, state_dim, 1); normalising the shape
        # here lets callers pass the same flat state they store in the buffer.
        net_input = np.reshape(state, (1, self.state_dim, 1))
        q_values = self.update_nn.predict(net_input)[0]
        return np.argmax(q_values) - 1

    def store_experience(self, state, action, new_state, daily_reward, done):
        """Write one transition into the circular replay buffer.

        Once the buffer is full the oldest slot is overwritten.

        Args:
            state: State before the action, ``state_dim`` values.
            action: Action taken (-1, 0 or 1).
            new_state: State after the action, ``state_dim`` values.
            daily_reward: Reward received for the transition.
            done: Truthy/1 if the transition ended the episode.
        """
        idx = self.exp_idx % self.replay_buffer_size

        self.state_buffer[idx] = state
        self.action_buffer[idx] = action
        self.new_state_buffer[idx] = new_state
        self.reward_buffer[idx] = daily_reward
        self.done_buffer[idx] = done

        self.exp_idx += 1

    def train_update_nn(self):
        """Fit the update network on one random mini-batch from the buffer.

        Does nothing until more than ``batch_exp_size`` transitions have been
        stored. Otherwise samples ``batch_exp_size`` distinct transitions,
        builds targets ``reward`` (terminal) or
        ``reward + reward_discount * max(target_nn(new_state))`` (non-terminal)
        for the taken action only, runs one epoch of ``fit`` and decays the
        exploration rate.
        """
        if self.exp_idx <= self.batch_exp_size:
            return

        stored = min(self.replay_buffer_size, self.exp_idx)
        batch_idx = random.sample(range(0, stored), self.batch_exp_size)
        batch_len = len(batch_idx)

        # Gather the whole batch at once so each network is queried a single
        # time instead of once per sample.
        states = np.reshape(self.state_buffer[batch_idx], (batch_len, self.state_dim, 1))
        new_states = np.reshape(self.new_state_buffer[batch_idx], (batch_len, self.state_dim, 1))
        rewards = self.reward_buffer[batch_idx]
        dones = self.done_buffer[batch_idx]
        # Actions -1/0/1 map to output columns 0/1/2.
        action_cols = (self.action_buffer[batch_idx] + 1).astype(int)

        next_q = np.asarray(self.target_nn.predict(new_states))
        bootstrapped = rewards + self.reward_discount * np.amax(next_q, axis=1)
        # Terminal transitions must not bootstrap from the next state.
        targets = np.where(dones == 1, rewards, bootstrapped)

        # Start from the current predictions so only the taken action's
        # output contributes to the loss.
        q_values = np.array(self.update_nn.predict(states))
        q_values[np.arange(batch_len), action_cols] = targets

        self.update_nn.fit(x=states, y=q_values, epochs=1, batch_size=self.batch_exp_size, verbose=1)
        self.exploration_rate = max(self.exploration_min, self.exploration_rate*self.exploration_decay)

    def train_target_nn(self):
        """Copy the update network's weights into the target network."""
        self.target_nn.set_weights(self.update_nn.get_weights())