In [None]:
"""
    algorithm: Distributional DQN(C51)
        In this algorithm, the Q network accepts a state and, instead of a single Q(s,a) per action,
outputs a probability distribution over the possible returns of every action in that state.
The Q-value of an action is recovered as the expectation of its return distribution.
More details can be found in the paper: https://arxiv.org/pdf/1707.06887.pdf
        key points:
            1. Q-value --> Q-distribution

    environment: CartPole-v0
    state:
        1.Cart Position:[-4.8,4.8],  2.Cart Velocity[-Inf,Inf],  3.Pole Angle[-24 deg, 24 deg]
        4.Pole Velocity [-Inf,Inf]
    action:
        0: left    
        1: right
    reward: 1 for every step    
        
    prerequisites: tensorflow 2.2(tensorflow >= 2.0)
    notice：
    
    author: Xinchen Han
    date: 2020/7/30

"""


In [None]:
from tensorflow import keras
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from collections import deque
import gym
import random

In [None]:
"""Environment"""
env = gym.make('CartPole-v0')
# Sizes read from the environment; the agent's network is built from these.
state_dim, action_dim = env.observation_space.shape[0], env.action_space.n

# Random seed: the same value is applied to the environment, NumPy,
# the stdlib `random` module and TensorFlow.
env.seed(6)
np.random.seed(6)
random.seed(6)
tf.random.set_seed(6)

# Hyperparameters
# NOTE(review): `alpha` is not referenced anywhere in the visible code.
alpha, gamma = 0.9, 0.9
max_episodes = 500

# When True, the training loop calls env.render() on every step.
render = True


In [None]:
class Distributional_DQN(object):
    """Categorical distributional DQN (C51) agent.

    For every action the network outputs log-probabilities over ``atom_num``
    fixed return values ("atoms") evenly spaced in ``[min_value, max_value]``.
    The greedy action is the one whose distribution has the largest expectation.

    Relies on the module-level names ``state_dim``, ``action_dim`` and ``gamma``.
    """

    def __init__(self):
        self.step = 0              # number of training updates performed so far
        self.batch_size = 64       # transitions sampled per update
        self.update_freq = 100     # target network is synced every `update_freq` updates
        self.replay_size = 1000    # capacity of the replay buffer
        self.atom_num = 51         # number of atoms in the value support
        self.replay_queue = deque(maxlen=self.replay_size)
        self.model = self.create_model()
        self.target_model = self.create_model()
        # Start the target network as an exact copy of the online network
        # instead of an unrelated random initialisation.
        self.target_model.set_weights(self.model.get_weights())
        self.optimizer = keras.optimizers.Adam(1e-3)
        self.min_value = -10
        self.max_value = 10
        # Atom values z_0 .. z_{N-1} and the spacing between neighbouring atoms.
        self.vrange = np.linspace(self.min_value, self.max_value, self.atom_num)
        self.deltaz = float(self.max_value - self.min_value) / (self.atom_num - 1)

    def create_model(self):
        """Build the network mapping a state to per-action log-probabilities.

        Returns:
            A Keras model whose output has shape (batch, action_dim, atom_num)
            and holds the log-softmax over the atom axis.
        """
        inputs = keras.layers.Input(shape=(state_dim,))
        hidden1 = keras.layers.Dense(64, activation='relu')(inputs)
        hidden2 = keras.layers.Dense(16, activation='tanh')(hidden1)
        logits = keras.layers.Dense(action_dim * self.atom_num)(hidden2)
        logits = keras.layers.Reshape((action_dim, self.atom_num))(logits)
        # Normalise over the atom axis; log-probabilities are used directly by the loss.
        output = tf.nn.log_softmax(logits, 2)
        model = keras.models.Model(inputs=[inputs], outputs=[output])
        return model

    def choose_action(self, state, epsilon=0.1):
        """Pick an action with an epsilon-greedy policy.

        The exploration probability is ``epsilon - self.step * 0.0001``, i.e. it
        shrinks linearly with the number of training updates and the policy
        becomes fully greedy once that expression is no longer positive.

        Args:
            state: a single observation (any array that flattens to one state).
            epsilon: initial exploration probability.

        Returns:
            The index of the selected action.
        """
        if np.random.uniform() > epsilon - self.step * 0.0001:
            # reshape(1, -1) instead of a hard-coded (1, 4) so the method
            # follows whatever observation size the environment has.
            batch = np.asarray(state, dtype=np.float32).reshape(1, -1)
            qdist = np.exp(self.model(batch).numpy())
            qvalues = (qdist * self.vrange).sum(-1)
            return qvalues.argmax(1)[0]
        return random.randrange(action_dim)

    def fill_replay(self, state, action, state_, reward, done):
        """Append one transition (s, a, s', r, done) to the replay buffer."""
        self.replay_queue.append((state, action, state_, reward, done))

    def _project_target(self, next_dist, reward_batch, done_batch):
        """Project the Bellman-updated next-state distribution onto the fixed atoms.

        Args:
            next_dist: array (batch, atom_num), probabilities of the chosen
                next action for every sample.
            reward_batch: array (batch,), rewards.
            done_batch: array (batch,), 1.0 for terminal transitions else 0.0.

        Returns:
            Array (batch, atom_num) holding the projected target distribution.
        """
        batch = next_dist.shape[0]
        # Tz_j = r + gamma * z_j (no bootstrap for terminal transitions), clipped to the support.
        tz = np.clip(gamma * (1.0 - done_batch[:, None]) * self.vrange[None, :] + reward_batch[:, None],
                     self.min_value, self.max_value)
        # Fractional atom index of every Tz_j; clipped to guard against float rounding.
        pos = np.clip((tz - self.min_value) / self.deltaz, 0, self.atom_num - 1)
        lower = np.floor(pos).astype('int64')
        upper = np.ceil(pos).astype('int64')

        # When pos is an exact integer, lower == upper and both interpolation
        # weights (upper - pos) and (pos - lower) are 0, which would silently
        # drop the probability mass. Give the whole mass to that atom instead.
        exact = lower == upper
        weight_lower = next_dist * np.where(exact, 1.0, upper - pos)
        weight_upper = next_dist * (pos - lower)

        projected = np.zeros((batch, self.atom_num))
        rows = np.arange(batch)[:, None]
        # np.add.at accumulates correctly when several atoms land on the same index.
        np.add.at(projected, (rows, lower), weight_lower)
        np.add.at(projected, (rows, upper), weight_upper)
        return projected

    def model_train(self):
        """Run one training update on a random mini-batch from the replay buffer.

        Does nothing if the buffer holds fewer than ``batch_size`` transitions.
        Every ``update_freq`` updates the target network is synced to the
        online network before the update.
        """
        if len(self.replay_queue) < self.batch_size:
            return

        self.step += 1
        if self.step % self.update_freq == 0:
            self.target_model.set_weights(self.model.get_weights())

        replay_batch = random.sample(self.replay_queue, self.batch_size)
        state_batch = np.array([replay[0] for replay in replay_batch], dtype=np.float32)
        action_batch = np.array([replay[1] for replay in replay_batch])
        next_state_batch = np.array([replay[2] for replay in replay_batch], dtype=np.float32)
        reward_batch = np.array([replay[3] for replay in replay_batch], dtype=np.float64)
        done_batch = np.array([replay[4] for replay in replay_batch], dtype=np.float64)

        # Next-state distributions from the target network and its greedy actions.
        b_dist_ = np.exp(self.target_model(next_state_batch).numpy())
        b_a_ = (b_dist_ * self.vrange).sum(-1).argmax(1)
        rows = np.arange(self.batch_size)
        b_m = self._project_target(b_dist_[rows, b_a_, :], reward_batch, done_batch)

        b_m = tf.convert_to_tensor(b_m, dtype='float32')
        # (row, action) pairs used to pick the distribution of the action actually taken.
        b_index = np.stack([rows, action_batch], 1)
        b_index = tf.convert_to_tensor(b_index, 'int64')

        self._train_func(state_batch, b_index, b_m)

    @tf.function
    def _train_func(self, b_o, b_index, b_m):
        """Apply one gradient step minimising the cross-entropy to the target.

        Args:
            b_o: batch of states.
            b_index: int64 tensor (batch, 2) of (row, action) indices.
            b_m: float32 tensor (batch, atom_num), projected target distribution.
        """
        with tf.GradientTape() as tape:
            # Log-probabilities of the taken actions, shape (batch, atom_num).
            b_dist_a = tf.gather_nd(self.model(b_o), b_index)
            # Cross-entropy: -sum_i m_i * log p_i, averaged over the batch.
            loss = tf.reduce_mean(tf.negative(tf.reduce_sum(b_dist_a * b_m, 1)))

        grad = tape.gradient(loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights))


In [None]:
if __name__ == "__main__":
    # Train the agent on the environment, logging the score of every episode,
    # then plot the learning curve.
    agent = Distributional_DQN()
    score_list = []
    try:
        for episode in range(max_episodes):
            state = env.reset()
            score = 0
            while True:
                action = agent.choose_action(state)
                if render:
                    env.render()
                state_, reward, done, _ = env.step(action)
                agent.fill_replay(state, action, state_, reward, done)
                # Training only starts once the replay buffer is completely filled.
                if len(agent.replay_queue) >= agent.replay_size:
                    agent.model_train()
                state = state_
                score += reward
                if done:
                    score_list.append(score)
                    print('episode:', episode, 'score:', score, 'max_score:', np.max(score_list))
                    if len(agent.replay_queue) >= agent.replay_size:
                        print("   Training   ....")
                    break
            # Stop early once the average of the last 10 episodes exceeds 180;
            # require 10 finished episodes so a single lucky run cannot end training.
            if len(score_list) >= 10 and np.mean(score_list[-10:]) > 180:
                break
    finally:
        # Release the environment (and its render window) even if training
        # raises or is interrupted.
        env.close()

    fig, ax = plt.subplots()
    ax.plot(score_list, color='orange')
    ax.set(title='Distributional DQN (C51) on CartPole-v0', xlabel='episode', ylabel='score')
    plt.show()
