In [1]:
# Find RL_Note path and append sys path
import os, sys
cwd = os.getcwd()
pos = cwd.find('RL_Note')
root_path = cwd[0:pos] + 'RL_Note'
sys.path.append(root_path)
print(root_path)

e:\MyNote\RL_Note


In [2]:
import gym
import random
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Dense
from pys.utils.prioritized_memory import ProportionalPrioritizedMemory

In [3]:
class DQN(tf.keras.Model):
    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()
        self.fc1 = Dense(255,activation='relu')
        self.fc2 = Dense(255,activation='relu')
        self.out = Dense(action_size,kernel_initializer=tf.keras.initializers.RandomUniform(-1e-3,1e-3))

    def call(self,x):
        x = self.fc1(x)
        x = self.fc2(x)
        q = self.out(x)
        return q

In [4]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size= action_size
        
        # Hyper-parameters for learning
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        
        # Experience Replay
        self.batch_size = 8
        self.train_start = 500
        self.buffer_size = 10000
        self.memory = ProportionalPrioritizedMemory(capacity=self.buffer_size)
        
        # Neural Network Architecture
        self.model        = DQN(self.state_size, self.action_size)
        self.target_model = DQN(self.state_size, self.action_size)
        self.optimizer = tf.keras.optimizers.Adam(lr=self.learning_rate)
        
        self.update_target_model()
        self.show_media_info = False

    def remember(self, state, action, reward, next_state, done):
        state       = np.array(state,dtype=np.float32)
        action      = np.array(action)
        reward      = np.array([reward],dtype=np.float32)
        done        = np.array([done],dtype=np.float32)
        next_state  = np.array(next_state,dtype=np.float32)
        transition  = (state, action, reward, next_state, done)
        self.memory.append(transition)
        
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())
        
    def choose_action(self,state):
        # Exploration and Exploitation
        if (np.random.rand() <= self.epsilon):
            return random.randrange(self.action_size)
        else:
            state = tf.convert_to_tensor([state], dtype=tf.float32)
            return np.argmax(self.model.predict(state))
    
    # def get_td_error(self):

    def train_model(self):
        # Train from Experience Replay
        # Training Condition - Memory Size
        if len(self.memory) < self.train_start:
            return 0.0
        # Decaying Exploration Ratio
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        # Sampling from the memory
        # ER
        # mini_batch = random.sample(self.memory, self.batch_size)
        # PER
        mini_batch, idxs, is_weights = self.memory.sample(self.batch_size)

        states      = tf.convert_to_tensor(np.array([sample[0] for sample in mini_batch]))
        actions     = tf.convert_to_tensor(np.array([sample[1] for sample in mini_batch]))
        rewards     = tf.convert_to_tensor(np.array([sample[2] for sample in mini_batch]))
        next_states = tf.convert_to_tensor(np.array([sample[3] for sample in mini_batch]))
        dones       = tf.convert_to_tensor(np.array([sample[4] for sample in mini_batch]))
        
        if self.show_media_info == False:
            self.show_media_info = True
            print('Start to train, check batch shapes')
            print('shape of states', np.shape(states),type(states))
            print('shape of actions', np.shape(actions),type(actions))
            print('shape of rewards', np.shape(rewards),type(rewards))
            print('shape of next_states', np.shape(next_states),type(next_states))
            print('shape of dones', np.shape(dones),type(dones))
            
        model_params = self.model.trainable_variables
        with tf.GradientTape() as tape:
            q = self.model(states)
            one_hot_action = tf.one_hot(actions, self.action_size)
            q = tf.reduce_sum(one_hot_action * q, axis=1)
            q = tf.expand_dims(q,axis=1)
            
            target_q = tf.stop_gradient(self.target_model(next_states))
            max_q = tf.reduce_max(target_q,axis=1)
            max_q = tf.expand_dims(max_q,axis=1)
            targets = rewards + (1 - dones) * self.discount_factor * max_q
            td_error = targets - q
            loss = tf.reduce_mean(is_weights * tf.square(targets - q))
            loss_out = tf.reduce_mean(tf.square(targets - q))
        grads = tape.gradient(loss, model_params)
        self.optimizer.apply_gradients(zip(grads, model_params))
        self.update_target_model()

        sample_importance = td_error.numpy()
        for i in range(self.batch_size):
            self.memory.update(idxs[i],sample_importance[i])
        return loss_out

    def save_model(self):
        self.model.save_weights('./save_model/cartpole_dqn_per_TF', save_format='tf')

In [5]:
%matplotlib tk

ENV_NAME = 'CartPole-v1'
EPISODES = 1000
END_SCORE = 400

if __name__ == "__main__":
    env = gym.make(ENV_NAME)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DQNAgent(state_size, action_size)
    print('Env Name : ',ENV_NAME)
    print('States {}, Actions {}'
            .format(state_size, action_size))

    scores_avg, scores_raw, episodes, losses = [], [], [], []
    epsilons = []
    score_avg = 0
    
    end = False
    
    for e in range(EPISODES):
        # Episode initialization
        done = False
        score = 0
        state = env.reset()
        critic_losses = []
        actor_losses = []
        while not done:
            #env.render()

            # Interact with env.
            action = agent.choose_action(state)
            next_state, reward, done, info = env.step(action)
            agent.remember(state, action, reward, next_state, done)
            loss = agent.train_model()
            state = next_state

            # 
            score += reward
            if done:
                score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
                print('epi: {:3d} | score avg {:3.2f} | mem length: {:4d} | epsilon: {:.4f}'
                      .format(e, score_avg, len(agent.memory), agent.epsilon))

                # Save data for plot
                episodes.append(e)
                scores_avg.append(score_avg)
                scores_raw.append(score)
                epsilons.append(agent.epsilon)
                losses.append(loss)

                # View data
                plt.clf()
                plt.subplot(311)
                plt.plot(episodes, scores_avg, 'b')
                plt.plot(episodes, scores_raw, 'b', alpha=0.8, linewidth=0.5)
                plt.xlabel('episode'); plt.ylabel('average score'); plt.grid()
                plt.title('LunarLanderv2 DQN PER')
                plt.subplot(312)
                plt.plot(episodes, epsilons, 'b')
                plt.xlabel('episode'); plt.ylabel('epsilon'); plt.grid()
                plt.subplot(313)
                plt.plot(episodes, losses, 'b')
                plt.xlabel('episode'); plt.ylabel('losses') ;plt.grid()
                plt.savefig('./result/N255_B8/cartpole_dqn_per_TF.png')

                if score_avg > END_SCORE:
                    agent.save_model()
                    end = True
                    break
        if end == True:
            env.close()
            np.save('./result/N255_B8/cartpole_dqn_per_TF_epi',  episodes)
            np.save('./result/N255_B8/cartpole_dqn_per_TF_scores_avg',scores_avg)
            np.save('./result/N255_B8/cartpole_dqn_per_TF_scores_raw',scores_raw)
            np.save('./result/N255_B8/cartpole_dqn_per_TF_loss', losses)
            print("End")
            break

Env Name :  CartPole-v1
States 4, Actions 2
epi:   0 | score avg 14.00 | mem length:   14 | epsilon: 1.0000
epi:   1 | score avg 15.80 | mem length:   46 | epsilon: 1.0000
epi:   2 | score avg 16.02 | mem length:   64 | epsilon: 1.0000
epi:   3 | score avg 15.92 | mem length:   79 | epsilon: 1.0000
epi:   4 | score avg 17.23 | mem length:  108 | epsilon: 1.0000
epi:   5 | score avg 16.90 | mem length:  122 | epsilon: 1.0000
epi:   6 | score avg 18.11 | mem length:  151 | epsilon: 1.0000
epi:   7 | score avg 17.90 | mem length:  167 | epsilon: 1.0000
epi:   8 | score avg 17.31 | mem length:  179 | epsilon: 1.0000
epi:   9 | score avg 18.58 | mem length:  209 | epsilon: 1.0000
epi:  10 | score avg 18.82 | mem length:  230 | epsilon: 1.0000
epi:  11 | score avg 17.84 | mem length:  239 | epsilon: 1.0000
epi:  12 | score avg 19.16 | mem length:  270 | epsilon: 1.0000
epi:  13 | score avg 23.94 | mem length:  337 | epsilon: 1.0000
epi:  14 | score avg 23.65 | mem length:  358 | epsilon: 1.0

InvalidArgumentError: Value for attr 'TI' of float is not in the list of allowed values: uint8, int32, int64
	; NodeDef: {{node OneHot}}; Op<name=OneHot; signature=indices:TI, depth:int32, on_value:T, off_value:T -> output:T; attr=axis:int,default=-1; attr=T:type; attr=TI:type,default=DT_INT64,allowed=[DT_UINT8, DT_INT32, DT_INT64]> [Op:OneHot]