In [None]:
"""
    algorithm: Nature DQN (two Q-networks)
        Nature DQN uses two networks, which avoids the circular dependency between the target Q and the
evaluation Q. With only one network, the TD error deviates more. Note that Nature DQN is different from DDQN.
For more detail, see: https://www.cnblogs.com/pinard/p/9756075.html.
    three key points:
        1. function approximation
        2. target Q network
        3. experience replay

    environment: MountainCar-v0
        State: [position, velocity], position in [-1.2, 0.6], velocity in [-0.07, 0.07]
        Action: 0(left)  or  1(stop)  or  2(right)
        Reward: -1 for every step
        Done: the car reaches the hilltop (position >= 0.5), or the episode hits the 200-step time limit
        
    prerequisites:  tensorflow 2.2(tensorflow >= 2.0)
    note: don't use gym.make('MountainCar-v0').unwrapped; it removes the 200-step episode limit, so episodes can take a huge amount of time.
    
    author: Xinchen Han

    date: 2020/7/27

"""

In [None]:
from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np
from collections import deque
import gym
import random
import tensorflow as tf

In [None]:
# ---- Run configuration ----
seed = 3
max_episodes = 500   # upper bound on the number of training episodes
render = True        # when True, the training loop draws the environment every step
state_dim = 2        # observation size: [position, velocity]
action_dim = 3       # number of discrete actions: left, stop, right

# ---- Environment ----
# The wrapped environment is used on purpose (no `.unwrapped`).
env_id = 'MountainCar-v0'
env = gym.make(env_id)

# ---- Seeding: environment, NumPy, Python's `random`, and TensorFlow ----
env.seed(seed)
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

In [None]:
class Nature_DQN(object):
    """Nature-DQN agent: an online Q-network trained against a periodically synced target network.

    Network sizes come from the module-level ``state_dim`` and ``action_dim``.
    Transitions are stored in a fixed-size replay buffer and sampled uniformly
    for training.
    """

    def __init__(self):
        self.step = 0             # number of gradient updates performed so far
        self.update_freq = 100    # sync the target network every `update_freq` updates
        self.replay_size = 2000   # capacity of the replay buffer
        self.replay_queue = deque(maxlen=self.replay_size)
        self.model = self.create_model()
        self.target_model = self.create_model()
        # Start the target network from the online network's weights; otherwise the
        # first `update_freq` updates would bootstrap from an unrelated random network.
        self.target_model.set_weights(self.model.get_weights())
        self.optimizer = tf.keras.optimizers.Adam(1e-2)

    def create_model(self):
        """Build a Q-network: state -> Dense(64, tanh) -> Dense(16, relu) -> one Q-value per action."""
        # Pass the shape as a tuple: not every Keras release accepts a bare int.
        inputs = keras.layers.Input(shape=(state_dim,))
        hidden1 = keras.layers.Dense(64, activation='tanh')(inputs)
        hidden2 = keras.layers.Dense(16, activation='relu')(hidden1)
        q_values = keras.layers.Dense(action_dim)(hidden2)
        # Single tensors (not one-element lists) so the model call returns a plain tensor.
        return keras.models.Model(inputs=inputs, outputs=q_values)

    def choose_action(self, state, epsilon=0.1):
        """Select an action epsilon-greedily for a single state.

        The exploration probability is ``epsilon - self.step * 0.0001``, so it
        shrinks with every training step and exploration stops once it is <= 0.

        Args:
            state: a single observation (1-D array of length ``state_dim``).
            epsilon: initial exploration probability.

        Returns:
            The index of the chosen action.
        """
        if np.random.uniform() < epsilon - self.step * 0.0001:
            return np.random.choice(action_dim)
        # Call the model directly instead of `predict`: `predict` carries a large
        # per-call overhead, which dominates when it runs on every environment step.
        state_batch = np.asarray([state], dtype=np.float32)
        q_values = self.model(tf.convert_to_tensor(state_batch))
        return np.argmax(q_values.numpy()[0])

    def fill_queue(self, state, action, state_, reward, done):
        """Store one transition in the replay buffer, with a shaped reward.

        A bonus of +1 is added to the reward when the next position
        (``state_[0]``) is at least 0.4.
        """
        if state_[0] >= 0.4:
            reward += 1
        self.replay_queue.append((state, action, state_, reward, done))

    def train(self, batch_size=64, gamma=0.95):
        """Run one gradient step on a uniformly sampled minibatch.

        Does nothing until the replay buffer is full. Every ``update_freq``
        training steps the target network is overwritten with the online
        network's weights.

        Args:
            batch_size: number of transitions sampled from the replay buffer.
            gamma: discount factor applied to the bootstrapped next-state value.
        """
        if len(self.replay_queue) < self.replay_size:
            return
        self.step += 1
        if self.step % self.update_freq == 0:
            self.target_model.set_weights(self.model.get_weights())

        replay_batch = random.sample(self.replay_queue, batch_size)
        states, actions, next_states, rewards, dones = zip(*replay_batch)

        # Explicit float32 everywhere so the target arithmetic does not depend on
        # implicit bool / float64 conversions when NumPy arrays meet tensors.
        state_batch = tf.convert_to_tensor(np.asarray(states, dtype=np.float32))
        next_state_batch = tf.convert_to_tensor(np.asarray(next_states, dtype=np.float32))
        reward_batch = tf.convert_to_tensor(np.asarray(rewards, dtype=np.float32))
        done_batch = tf.convert_to_tensor(np.asarray(dones, dtype=np.float32))
        action_batch = np.asarray(actions)

        # TD target from the target network. It is computed outside the tape
        # because no gradient is taken with respect to the target network.
        q_next = tf.reduce_max(self.target_model(next_state_batch), axis=1)
        target_value = reward_batch + (1.0 - done_batch) * gamma * q_next

        with tf.GradientTape() as tape:
            q_all = self.model(state_batch)
            # Keep only the Q-value of the action that was actually taken.
            q_taken = tf.reduce_sum(tf.one_hot(action_batch, action_dim) * q_all, axis=1)
            loss = tf.reduce_mean(tf.square(q_taken - target_value) * 0.5)

        # Trainable variables are watched by the tape automatically.
        variables = self.model.trainable_variables
        grads = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(grads, variables))
        

In [None]:
if __name__ == '__main__':
    # Train the agent for at most `max_episodes` episodes, then plot the per-episode scores.
    agent = Nature_DQN()
    score_buffer = []  # undiscounted (unshaped) score of every finished episode
    try:
        for episode in range(max_episodes):
            state = env.reset()
            score = 0
            done = False
            while not done:
                action = agent.choose_action(state)
                if render:
                    env.render()
                state_, reward, done, _ = env.step(action)
                agent.fill_queue(state, action, state_, reward, done)
                agent.train()
                score += reward
                state = state_

            score_buffer.append(score)
            print('episode:', episode, 'score:', score, 'max:', np.max(score_buffer))

            # Stop early once the mean score over the last (up to) 20 episodes exceeds -160.
            if np.mean(score_buffer[-20:]) > -160:
                break
    finally:
        # Close the environment even when training is interrupted or raises.
        env.close()

    plt.plot(score_buffer)
    plt.xlabel('episode')
    plt.ylabel('score')
    plt.show()