# Install Packages

In [None]:
!pip3 install box2d-py
!pip install tensorflow==2.3.0
!pip install gym
!pip install keras

# Import Libraries

In [1]:
import tensorflow as tf 
import numpy as np 
import gym
import tensorflow.keras as keras
from tensorflow.keras.models import load_model


# Setup Environment

In [2]:
# Create the LunarLander-v2 environment. `env` is a notebook-level global: the network,
# replay-buffer and agent cells read its observation shape and action count.
env= gym.make("LunarLander-v2")
# Show the shape of a single observation vector.
print(env.observation_space.shape)

(8,)


# Define DDDQN NN Model

In [12]:
class DDDQN(keras.Model):
    """Dueling Q-network: two shared hidden layers feeding a value head and an advantage head.

    Args:
        n_actions: Width of the advantage head (one output per action). When ``None``
            (the default) it is read from the notebook-level ``env.action_space.n``.
        fc1_dims: Number of units in the first hidden layer.
        fc2_dims: Number of units in the second hidden layer.
    """

    def __init__(self, n_actions=None, fc1_dims=128, fc2_dims=128):
        super(DDDQN, self).__init__()
        if n_actions is None:
            # Backward-compatible default: size the head from the global environment.
            n_actions = env.action_space.n
        self.dense1 = keras.layers.Dense(fc1_dims, activation='relu')
        self.dense2 = keras.layers.Dense(fc2_dims, activation='relu')
        # Value head: a single scalar V(s) per state.
        self.V = keras.layers.Dense(1, activation=None)
        # Advantage head: one A(s, a) per action.
        self.A = keras.layers.Dense(n_actions, activation=None)

    def _shared_features(self, inputs):
        """Run the two hidden layers shared by the value and advantage heads."""
        return self.dense2(self.dense1(inputs))

    def call(self, input_data):
        """Return Q-values for a batch of states.

        Combines the heads as Q = V + (A - mean(A)), where the mean is taken over
        the action axis (axis 1) and kept as a column so it broadcasts against A.
        """
        x = self._shared_features(input_data)
        V = self.V(x)
        A = self.A(x)
        Q = V + (A - tf.math.reduce_mean(A, axis=1, keepdims=True))
        return Q

    def advantage(self, state):
        """Return only the advantage-head output for a batch of states.

        V is the same for every action of a given state, so the argmax of the
        advantages equals the argmax of the Q-values from ``call``.
        """
        return self.A(self._shared_features(state))

# Setup experience buffer

In [13]:
class experiece_replay():
    """Fixed-size circular buffer of (state, action, reward, next_state, not-done) transitions.

    Args:
        memory_buffer: Maximum number of transitions held. Once full, each new
            transition overwrites the oldest slot.
        state_shape: Shape of a single observation. When ``None`` (the default) it
            is read from the notebook-level ``env.observation_space.shape``.
    """

    def __init__(self, memory_buffer = 1000000, state_shape = None):
        if state_shape is None:
            # Backward-compatible default: size the buffers from the global environment.
            state_shape = env.observation_space.shape
        state_shape = tuple(state_shape)
        self.memory_buffer_size = memory_buffer
        self.state_memory = np.zeros((self.memory_buffer_size, *state_shape), dtype = np.float32)
        self.action_memory = np.zeros((self.memory_buffer_size), dtype = np.int32)
        self.reward_memory = np.zeros((self.memory_buffer_size), dtype = np.float32)
        self.next_state_memory = np.zeros((self.memory_buffer_size, *state_shape), dtype = np.float32)
        # np.bool was only a deprecated alias of the builtin bool and has been removed
        # from recent NumPy releases; np.bool_ is the actual NumPy boolean dtype.
        self.terminal_memory = np.zeros((self.memory_buffer_size), dtype = np.bool_)
        # Total number of transitions ever stored (not capped at the buffer size).
        self.memory_cntr = 0

    def store_experience(self, state, action, reward, next_state, terminal):
        """Write one transition into the next slot, overwriting the oldest once full.

        input (state, action, reward, next_state, terminal)
        return None
        """
        # Wrap around so the write position cycles through the buffer.
        idx = self.memory_cntr % self.memory_buffer_size
        self.state_memory[idx] = state
        self.action_memory[idx] = action
        self.reward_memory[idx] = reward
        self.next_state_memory[idx] = next_state
        # Stored inverted ("not done"): True for non-terminal steps, so the sampled
        # flag can be multiplied directly into the bootstrapped next-state value.
        self.terminal_memory[idx] = not terminal
        self.memory_cntr += 1

    def sample_experience(self, batch_size):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        input batch size
        return batched states, actions, rewards, next_states, terminals
        (``terminals`` holds the inverted "not done" flags written by store_experience).
        Sampling is without replacement, so ``batch_size`` must not exceed the
        number of transitions currently stored.
        """
        # Only slots that have actually been written are eligible.
        max_memory = min(self.memory_cntr , self.memory_buffer_size)
        batch = np.random.choice(max_memory, batch_size, replace = False)
        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        next_states = self.next_state_memory[batch]
        terminals = self.terminal_memory[batch]
        return states, actions, rewards, next_states, terminals
        

# Build Learning Agent

In [20]:
class agent():
    """Double dueling DQN agent with epsilon-greedy exploration and a target network.

    The online network picks the best next action and the target network evaluates
    it; the target network is re-synced from the online one every ``replace``
    training steps.

    Args:
        gamma: Discount factor applied to the bootstrapped next-state value.
        replace: Number of training steps between target-network syncs.
        lr: Learning rate of the Adam optimizer used for the online network.
    """

    # Weight files written by save_model() and read back by load_model().
    Q_WEIGHTS_FILE = "model.h5"
    TARGET_WEIGHTS_FILE = "target_net.h5"

    def __init__ (self, gamma = 0.99, replace = 100, lr = 0.001):
        self.gamma = gamma
        self.epsilon = 1
        self.epsilon_min = 0.01
        # Linear decay: epsilon drops by this amount after every training step.
        self.epsilon_difference = 1e-3
        self.replace = replace
        self.train_step = 0
        self.memory = experiece_replay()
        self.batch_size = 64
        self.q_network = DDDQN()
        self.target_network = DDDQN()
        # Only the online network is trained; the target network is inference-only.
        optimizer_ = tf.keras.optimizers.Adam(learning_rate=lr)
        self.q_network.compile(loss='mse', optimizer = optimizer_)
        # Subclassed Keras models create their weights on first call. Run one dummy
        # observation through both so the initial sync below (and any later
        # load_model()) operates on real weights instead of empty lists.
        dummy_state = np.zeros((1, *env.observation_space.shape), dtype = np.float32)
        self.q_network(dummy_state)
        self.target_network(dummy_state)
        self.update_target_network()

    def choose_action(self, state):
        """Function which returns an action based on epsilon greedy policy
        returns action
        """
        if np.random.rand() <= self.epsilon:
            # Explore: uniform random action index.
            return np.random.choice(env.action_space.n)
        # Exploit: the advantage head alone is enough to rank the actions.
        advantages = self.q_network.advantage(np.array([state], dtype = np.float32))
        return np.argmax(advantages)

    def update_memory(self, state, action, reward, next_state, terminal):
        """Function which updates memory based on input s,a,r,n_s,t
        """
        self.memory.store_experience(state, action, reward, next_state, terminal)

    def update_target_network(self):
        """Function which sets the target network's weights from the q network's weights
        """
        self.target_network.set_weights(self.q_network.get_weights())

    def update_epsilon(self):
        """Function for epsilon annealing: one linear decay step, never below epsilon_min
        """
        self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_difference)

    def save_model(self):
        """Function for saving q network and target network weights

        Weights-only files are used because whole-model HDF5 saving is not
        available for subclassed Keras models.
        """
        self.q_network.save_weights(self.Q_WEIGHTS_FILE)
        self.target_network.save_weights(self.TARGET_WEIGHTS_FILE)

    def load_model(self):
        """Function for loading the weights written by save_model
        """
        self.q_network.load_weights(self.Q_WEIGHTS_FILE)
        self.target_network.load_weights(self.TARGET_WEIGHTS_FILE)

    def learn(self):
        """Function that runs one double-DQN training step on a sampled batch

        Does nothing until the replay memory holds at least one full batch.
        """
        if self.memory.memory_cntr < self.batch_size:
            return

        if self.train_step % self.replace == 0:
            self.update_target_network()

        # `not_done` is False for terminal transitions, which zeroes the bootstrap term.
        states, actions, rewards, next_states, not_done = self.memory.sample_experience(self.batch_size)

        # Call the models directly: for a single small batch this avoids the
        # per-call overhead of Model.predict().
        q_target = np.copy(self.q_network(states).numpy())
        next_q_from_target = self.target_network(next_states).numpy()
        # Double DQN: the online network selects the next action ...
        best_next_actions = np.argmax(self.q_network(next_states).numpy(), axis = 1)
        batch_idx = np.arange(self.batch_size, dtype = np.int32)
        # ... and the target network supplies its value. Only the entry of the action
        # actually taken is changed, so the MSE loss is zero for every other action.
        q_target[batch_idx, actions] = rewards + self.gamma * next_q_from_target[batch_idx, best_next_actions] * not_done
        self.q_network.train_on_batch(states, q_target)
        self.update_epsilon()
        self.train_step += 1
        
        
            
        

# Create the Agent and Run Training

In [23]:
# Train a fresh agent for a fixed number of episodes, then persist its networks.
lunar_agent = agent()
episodes = 500
for episode in range(episodes):
    state = env.reset()
    total_reward = 0
    terminal = False

    # One episode: act, store the transition, learn from a sampled batch, advance.
    while not terminal:
        action = lunar_agent.choose_action(state)
        next_state, reward, terminal, _ = env.step(action)
        lunar_agent.update_memory(state, action, reward, next_state, terminal)
        lunar_agent.learn()
        total_reward += reward
        state = next_state

    # The inner loop only exits after a terminal step, so report the episode here.
    print("total reward {} at episode {} with epsilon {}".format(total_reward, episode, lunar_agent.epsilon))
lunar_agent.save_model()

total reward -83.65534828794836 at episode 0 with epsilon 1
total reward -244.3719863695172 at episode 1 with epsilon 0.8689999999999999


KeyboardInterrupt: 