In [1]:
# For reference, see the following GitHub implementation:
# https://github.com/shivaverma/OpenAIGym/blob/master/lunar-lander/discrete/lunar_lander.py
import os
import random
import sys
from collections import deque

import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense

In [2]:
class DQN(tf.keras.Model):
    """Small fully-connected Q-network: two 64-unit ReLU layers and a linear head.

    Args:
        state_size: Accepted for interface symmetry with the agent; it is not
            used here (the Dense layers are not given an input shape).
        action_size: Number of output units, i.e. one Q-value per action.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        # The output kernel is drawn from a narrow uniform range (+/- 1e-3).
        head_init = tf.keras.initializers.RandomUniform(-1e-3, 1e-3)
        self.fc1 = Dense(64, activation='relu')
        self.fc2 = Dense(64, activation='relu')
        self.out = Dense(action_size, kernel_initializer=head_init)

    def call(self, x):
        """Return the Q-values produced by feeding `x` through fc1 -> fc2 -> out."""
        hidden = self.fc2(self.fc1(x))
        return self.out(hidden)

In [3]:
class DQNAgent:
    """Epsilon-greedy DQN agent with experience replay and a target network.

    Args:
        state_size: Length of one observation vector.
        action_size: Number of discrete actions.

    Attributes are plain hyper-parameters (see `__init__`), a bounded replay
    buffer (`memory`), the online network (`model`), the target network
    (`target_model`) and the Adam optimizer used to train the online network.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size

        # Hyper params for learning
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01

        # Experience Replay
        self.batch_size = 64
        self.train_start = 1000
        self.memory = deque(maxlen=5000)

        self.model = DQN(self.state_size, self.action_size)
        self.target_model = DQN(self.state_size, self.action_size)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)

        # Subclassed Keras models only create their variables on the first
        # call. Run one dummy forward pass through each network so the weight
        # copy below actually transfers weights instead of copying an empty list.
        dummy_state = tf.zeros((1, self.state_size), dtype=tf.float32)
        self.model(dummy_state)
        self.target_model(dummy_state)

        self.update_target_model()

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def choose_action(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: Observation batch of one row, as passed to the network.

        Returns:
            int: a random action with probability `epsilon`, otherwise the
            argmax of the online network's Q-values for `state`.
        """
        # Exploration
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # Exploitation: call the model directly; `Model.predict` carries large
        # per-call overhead that is wasted on a single-row batch.
        q_values = self.model(np.asarray(state, dtype=np.float32))
        return int(np.argmax(q_values.numpy()))

    def train_model(self, state, action, reward, next_state, done):
        """Run one gradient step on a mini-batch sampled from the replay buffer.

        The positional arguments are kept for interface compatibility; the
        update uses only transitions sampled from `self.memory`.

        Returns:
            float: the mean squared TD error of the batch, or 0.0 when the
            buffer holds fewer than `train_start` transitions (no update).
        """
        if len(self.memory) < self.train_start:
            return 0.0

        # Decay exploration, clamped so epsilon never drops below its floor.
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        mini_batch = random.sample(self.memory, self.batch_size)

        # Stored states are indexed with [0] to drop their leading batch axis.
        # Everything is cast explicitly so the arithmetic below stays float32.
        states = np.array([sample[0][0] for sample in mini_batch], dtype=np.float32)
        actions = np.array([sample[1] for sample in mini_batch], dtype=np.int32)
        rewards = tf.constant([sample[2] for sample in mini_batch], dtype=tf.float32)
        next_states = np.array([sample[3][0] for sample in mini_batch], dtype=np.float32)
        not_dones = tf.constant(
            [0.0 if sample[4] else 1.0 for sample in mini_batch], dtype=tf.float32
        )

        # The TD targets do not depend on the online weights, so they are
        # computed outside the tape and explicitly detached.
        max_next_q = tf.reduce_max(self.target_model(next_states), axis=-1)
        targets = tf.stop_gradient(
            rewards + not_dones * self.discount_factor * max_next_q
        )

        model_params = self.model.trainable_variables
        with tf.GradientTape() as tape:
            q_values = self.model(states)
            # Select the Q-value of the action that was actually taken.
            action_mask = tf.one_hot(actions, self.action_size)
            predicts = tf.reduce_sum(action_mask * q_values, axis=1)
            loss = tf.reduce_mean(tf.square(targets - predicts))

        grads = tape.gradient(loss, model_params)
        self.optimizer.apply_gradients(zip(grads, model_params))
        # Return a plain float on every path (the early exit already does).
        return float(loss)

In [4]:
%matplotlib tk

ENV_NAME = 'LunarLander-v2'
EPISODES = 1000

if __name__ == "__main__":
    env = gym.make(ENV_NAME)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DQNAgent(state_size, action_size)
    print('Env Name : ',ENV_NAME)
    print('States {}, Actions {}'
            .format(state_size, action_size))

    scores, episodes, epsilons, losses = [], [], [], []
    score_avg = 0
    
    end = False
    
    fig = plt.figure(1)
    fig.clf()
    
    for e in range(EPISODES):
        done = False
        score = 0
        loss_list = []

        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            #env.render()

            action = agent.choose_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            score += reward
            reward = 0.1 if not done or score == 500 else -1

            agent.remember(state, action, reward, next_state, done)

            loss = agent.train_model(state, action, reward, next_state, done)
            loss_list.append(loss)

            state = next_state
            if done:
                agent.update_target_model()

                score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
                print('epi: {:3d} | score avg {:3.2f} | mem length: {:4d} | epsilon: {:.4f}'
                      .format(e, score_avg, len(agent.memory), agent.epsilon))

                episodes.append(e)
                scores.append(score_avg)
                epsilons.append(agent.epsilon)
                losses.append(np.mean(loss_list))
                plt.subplot(311)
                plt.plot(episodes, scores, 'b')
                plt.xlabel('episode')
                plt.ylabel('average score')
                plt.title('LunarLanderv2 DQN')
                plt.grid()
                
                plt.subplot(312)
                plt.plot(episodes, epsilons, 'b')
                plt.xlabel('episode')
                plt.ylabel('epsilon')
                plt.grid()
                
                plt.subplot(313)
                plt.plot(episodes, losses, 'b')
                plt.xlabel('episode')
                plt.ylabel('losses')
                plt.grid()
                
                plt.savefig('./save_model/LunarLanderv2_dqn.png')

                if score_avg > 200:
                    agent.model.save_weights('./save_model/LunarLanderv2_dqn', save_format='tf')
                    end = True
                    break
        if end == True:
            env.close()
            np.save('./save_model/LunarLanderv2_dqn_epi',  episodes)
            np.save('./save_model/LunarLanderv2_dqn_score',scores)
            np.save('./save_model/LunarLanderv2_dqn_loss', losses)
            print("End")
            break

Env Name :  LunarLander-v2
States 8, Actions 4
epi:   0 | score avg -163.26 | mem length:   68 | epsilon: 1.0000
epi:   1 | score avg -157.97 | mem length:  142 | epsilon: 1.0000
epi:   2 | score avg -154.44 | mem length:  277 | epsilon: 1.0000
epi:   3 | score avg -161.97 | mem length:  360 | epsilon: 1.0000
epi:   4 | score avg -157.27 | mem length:  473 | epsilon: 1.0000
epi:   5 | score avg -192.35 | mem length:  551 | epsilon: 1.0000
epi:   6 | score avg -211.57 | mem length:  673 | epsilon: 1.0000
epi:   7 | score avg -200.16 | mem length:  752 | epsilon: 1.0000
epi:   8 | score avg -231.88 | mem length:  872 | epsilon: 1.0000
epi:   9 | score avg -220.49 | mem length:  944 | epsilon: 1.0000
epi:  10 | score avg -206.30 | mem length: 1023 | epsilon: 0.9763
epi:  11 | score avg -192.27 | mem length: 1089 | epsilon: 0.9139
epi:  12 | score avg -194.42 | mem length: 1183 | epsilon: 0.8319
epi:  13 | score avg -194.85 | mem length: 1269 | epsilon: 0.7633
epi:  14 | score avg -186.25 

In [5]:
# Close the Gym environment created in the training cell (`env = gym.make(ENV_NAME)`).
env.close()