In [1]:
# Find the RL_Note project root from the current working directory and make
# it importable (the notebook imports `pys.utils...` relative to that root).
import os, sys
import warnings


def find_project_root(path, marker='RL_Note'):
    """Return `path` truncated right after the first occurrence of `marker`.

    Args:
        path: Directory path to search (normally the current working directory).
        marker: Substring identifying the project root directory name.

    Returns:
        The prefix of `path` up to and including `marker`, or None when
        `marker` does not occur in `path`.
    """
    pos = path.find(marker)
    if pos < 0:
        # str.find returns -1 on a miss; slicing with it would silently drop
        # the last character and produce a meaningless path.
        return None
    return path[:pos] + marker


cwd = os.getcwd()
root_path = find_project_root(cwd)
if root_path is None:
    warnings.warn(
        "'RL_Note' was not found in the working directory {!r}; "
        "sys.path was left unchanged, project imports may fail.".format(cwd)
    )
elif root_path not in sys.path:
    # Keep the cell idempotent: re-running it must not stack duplicate entries.
    sys.path.append(root_path)

In [2]:
import gym
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate, concatenate, Lambda
import matplotlib.pyplot as plt
from pys.utils.ou_noise import OUActionNoise
from pys.utils.prioritized_memory import ProportionalPrioritizedMemory

In [3]:
class Actor(tf.keras.Model):
    """Deterministic policy network mapping a state batch to an action batch.

    Three ReLU hidden layers (64, 64, 16 units) feed a tanh output layer of
    `action_size` units; the tanh output is multiplied by `action_max`.

    Args:
        state_size: Accepted for interface symmetry; not used by this class.
        action_size: Number of units in the output layer.
        action_min: Stored on the instance; not used in the forward pass.
        action_max: Factor applied to the tanh output.
    """
    def __init__(self, state_size, action_size, action_min, action_max):
        super(Actor, self).__init__()
        self.action_min = action_min
        self.action_max = action_max

        self.fc1 = Dense(64, activation='relu')
        self.fc2 = Dense(64, activation='relu')
        self.fc3 = Dense(16, activation='relu')
        # Small uniform init keeps the initial tanh pre-activations near zero.
        self.out = Dense(
            action_size,
            activation='tanh',  # -1 ~ +1
            kernel_initializer=tf.random_uniform_initializer(minval=-0.003, maxval=0.003),
        )

    def call(self, x):
        """Run the forward pass and return the tanh output scaled by `action_max`."""
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        action = self.out(x)
        # Plain tensor arithmetic: the original built a new Keras Lambda layer
        # on every forward pass just to perform this multiplication.
        return action * self.action_max

class Critic(tf.keras.Model):
    """Q-value network taking a state batch and an action batch.

    The state goes through two 32-unit ReLU layers and the action through a
    16-unit then a 32-unit ReLU layer; both feature sets are concatenated on
    the last axis and passed through two 64-unit ReLU layers to a single
    linear output unit.

    Args:
        state_size: Accepted for interface symmetry; not used by this class.
        action_size: Accepted for interface symmetry; not used by this class.
    """
    def __init__(self, state_size, action_size):
        super(Critic, self).__init__()
        # State branch
        self.s1 = Dense(32, activation='relu')
        self.s2 = Dense(32, activation='relu')
        # Action branch
        self.a1 = Dense(16, activation='relu')
        self.a2 = Dense(32, activation='relu')
        # Joint head
        self.fc1= Dense(64, activation='relu')
        self.fc2= Dense(64, activation='relu')
        self.out= Dense(1,  activation='linear')

    def call(self,state,action):
        """Return the critic output for the given `state` and `action` batches."""
        state_features = self.s2(self.s1(state))
        action_features = self.a2(self.a1(action))
        merged = concatenate([state_features, action_features],axis=-1)
        hidden = self.fc2(self.fc1(merged))
        return self.out(hidden)

In [4]:
class DDPGAgent:
    """DDPG agent: actor/critic networks with soft-updated target copies,
    OU exploration noise and a prioritized replay memory.

    Args:
        state_size: Dimension of the observation vector.
        action_size: Dimension of the action vector.
        action_min: Lower clipping bound applied to noisy actions.
        action_max: Upper clipping bound applied to noisy actions (also the
            scale factor handed to the actor networks).
    """
    def __init__(self, state_size, action_size, action_min, action_max):
        self.state_size = state_size
        self.action_size= action_size
        self.action_min = action_min
        self.action_max = action_max

        # Hyper params for learning
        self.discount_factor = 0.99
        self.actor_learning_rate  = 0.001
        self.critic_learning_rate = 0.002
        self.tau = 0.005

        # Experience Replay
        self.batch_size = 64
        self.train_start = 1000
        self.buffer_size = 50000
        self.memory = ProportionalPrioritizedMemory(capacity=self.buffer_size)
        # HER (not implemented)

        self.critic         = Critic(self.state_size, self.action_size)
        self.target_critic  = Critic(self.state_size, self.action_size)
        self.actor          = Actor(self.state_size, self.action_size, self.action_min, self.action_max)
        self.target_actor   = Actor(self.state_size, self.action_size, self.action_min, self.action_max)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        self.critic_optimizer   = tf.keras.optimizers.Adam(learning_rate=self.critic_learning_rate)
        self.actor_optimizer    = tf.keras.optimizers.Adam(learning_rate=self.actor_learning_rate)
        self.actor.build(input_shape=(None, self.state_size))
        self.target_actor.build(input_shape=(None, self.state_size))
        # Call every network once on symbolic inputs so that its weights exist
        # before summary() and set_weights() below.
        state_in = Input((self.state_size,))
        action_in = Input((self.action_size,))
        self.actor(state_in)
        self.target_actor(state_in)
        self.critic(state_in, action_in)
        self.target_critic(state_in, action_in)
        self.actor.summary()
        self.critic.summary()

        # Targets start as exact copies of the online networks.
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

        std_dev = 0.1
        # One noise component per action dimension. The original used size 1,
        # which broadcast the same noise value onto every action component.
        # NOTE(review): presumably OUActionNoise returns an array shaped like
        # `mean` - confirm against pys.utils.ou_noise.
        self.ou_noise = OUActionNoise(mean=np.zeros(self.action_size),
                                      std_deviation=float(std_dev) * np.ones(self.action_size))

        # False until train_model has printed the batch shapes once.
        self.show_media_info = False

    @staticmethod
    def _is_transition(sample):
        """Return True when `sample` is a sized container with exactly 5 fields."""
        try:
            return len(sample) == 5
        except TypeError:
            return False

    def remember(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) tuple to the memory."""
        # ER / PER
        self.memory.append((state, action, reward, next_state, done))

    def update_target_model(self):
        """Soft-update both target networks: target <- tau*online + (1-tau)*target."""
        tau = self.tau
        network_pairs = (
            (self.actor, self.target_actor),
            (self.critic, self.target_critic),
        )
        for online, target in network_pairs:
            for net, target_net in zip(online.trainable_variables,
                                       target.trainable_variables):
                target_net.assign(tau * net + (1.0 - tau) * target_net)

    def get_action(self,state):
        """Return the actor's action for `state` plus OU noise, clipped to the action bounds."""
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        action = self.actor(state)
        # Exploration and Exploitation
        action_from_net = action.numpy()[0]
        action_from_noise = self.ou_noise()
        return np.clip(action_from_net+action_from_noise,self.action_min,self.action_max)

    def train_model(self):
        """Run one critic update, one actor update and a soft target update.

        Does nothing until the memory holds at least `train_start` entries.
        After the updates, the TD error of each sampled transition is written
        back to the memory through `memory.update`.
        """
        # Training Condition - Memory Size
        if len(self.memory) < self.train_start:
            return
        # Sampling from the memory (PER)
        mini_batch, idxs, is_weights = self.memory.sample(self.batch_size)

        # NOTE(review): the recorded run of this notebook ended with
        # "TypeError: 'int' object is not subscriptable". One plausible source,
        # not confirmable from this file, is the memory returning a non-tuple
        # placeholder for a sampled slot. Such entries are skipped here so the
        # `sample[k]` indexing below cannot fail on them.
        keep = [k for k, sample in enumerate(mini_batch) if self._is_transition(sample)]
        if not keep:
            return
        if len(keep) != len(mini_batch):
            mini_batch = [mini_batch[k] for k in keep]
            idxs = [idxs[k] for k in keep]
            is_weights = np.asarray(is_weights)[keep]
        n_samples = len(mini_batch)

        # Explicit float32 everywhere: actions are float64 after the noise is
        # added, and mixed dtypes would otherwise rely on implicit Keras casts.
        states      = tf.convert_to_tensor(np.asarray([sample[0] for sample in mini_batch], dtype=np.float32))
        actions     = tf.convert_to_tensor(np.asarray([sample[1] for sample in mini_batch], dtype=np.float32))
        rewards     = tf.convert_to_tensor(np.asarray([sample[2] for sample in mini_batch], dtype=np.float32))
        rewards     = tf.expand_dims(rewards, axis = 1)
        next_states = tf.convert_to_tensor(np.asarray([sample[3] for sample in mini_batch], dtype=np.float32))
        dones       = tf.convert_to_tensor(np.asarray([sample[4] for sample in mini_batch], dtype=np.float32))
        dones       = tf.expand_dims(dones, axis = 1)
        # Column vector so it multiplies the (batch, 1) squared TD error
        # element-wise instead of broadcasting to (batch, batch).
        weights     = tf.reshape(tf.convert_to_tensor(np.asarray(is_weights, dtype=np.float32)), (-1, 1))

        # Print the batch shapes once, on the first training step only.
        if not self.show_media_info:
            self.show_media_info = True
            print('shape of states', np.shape(states),type(states))
            print('shape of actions', np.shape(actions),type(actions))
            print('shape of next_states', np.shape(next_states),type(next_states))
            print('shape of dones', np.shape(dones),type(dones))

        # The bootstrap target only involves the target networks, whose
        # variables are not differentiated, so it is computed outside the tape.
        target_actions = self.target_actor(next_states,training=True)
        target_q = self.target_critic(next_states,target_actions,training=True)
        target_value = rewards + (1 - dones) * self.discount_factor * target_q

        with tf.GradientTape() as tape:
            q = self.critic(states, actions,training=True)
            td_error = target_value - q
            critic_loss = tf.math.reduce_mean(weights*tf.math.square(td_error))
        critic_params = self.critic.trainable_variables
        critic_grads = tape.gradient(critic_loss, critic_params)
        self.critic_optimizer.apply_gradients(zip(critic_grads, critic_params))

        with tf.GradientTape() as tape:
            new_actions = self.actor(states,training=True)
            new_q = self.critic(states, new_actions,training=True)
            # Maximizing Q is expressed as minimizing its negative mean.
            actor_loss = -tf.reduce_mean(new_q)
        actor_params = self.actor.trainable_variables
        actor_grads = tape.gradient(actor_loss, actor_params)
        self.actor_optimizer.apply_gradients(zip(actor_grads, actor_params))

        self.update_target_model()
        # Feed the fresh TD errors back to the memory for each sampled index.
        for i in range(n_samples):
            self.memory.update(idxs[i],td_error[i])

    def save_model(self):
        """Save the actor and critic weights under ./save_model in TF checkpoint format."""
        self.actor.save_weights("./save_model/LunarLanderContinuous_ddpg_per_TF_actor", save_format="tf")
        self.critic.save_weights("./save_model/LunarLanderContinuous_ddpg_per_TF_critic", save_format="tf")


In [5]:
%matplotlib tk

ENV_NAME = 'LunarLanderContinuous-v2'
EPISODES = 5000
END_SCORE = 250

if __name__ == "__main__":
    import os

    # The plot, the model weights and the score arrays are all written below
    # ./save_model; create the directories up front so the first savefig /
    # np.save call cannot fail on a missing folder.
    os.makedirs('./save_model/data', exist_ok=True)

    env = gym.make(ENV_NAME)
    state_size  = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    # The agent takes scalar bounds: the first action component's limits are used.
    action_min  = env.action_space.low[0]
    action_max  = env.action_space.high[0]

    agent = DDPGAgent(state_size, action_size, action_min, action_max)
    print('Env Name : ',ENV_NAME)
    print('States {0}, Actions {1}'.format(state_size, action_size))
    for i in range(0,action_size):
        print('Action{0:d} space {1:.2f} ~ {2:.2f}'.format(i, env.action_space.low[i], env.action_space.high[i]))
    scores_avg, scores_raw, episodes, losses = [], [], [], []
    score_avg = 0

    end = False
    show_media_info = True  # print the transition shapes once, on the first step

    for e in range(EPISODES):
        done = False
        score = 0
        state = env.reset()
        while not done:
            # Interact with env.
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            agent.remember(state, action, reward, next_state, done)
            agent.train_model()
            state = next_state

            score += reward
            if show_media_info:
                print("State Shape : ", np.shape(state))
                print("Action Shape : ", np.shape(action))
                print("Reward Shape : ", np.shape(reward))
                print("done Shape : ", np.shape(done))
                show_media_info = False
            if done:
                # Exponential moving average of the episode score; the first
                # episode (score_avg still 0) seeds the average directly.
                tau = 0.95
                score_avg = tau * score_avg + (1.0 - tau) * score if score_avg != 0 else score
                print("episode: {0:3d} | score avg: {1:3.2f} | mem size {2:6d} |"
                    .format(e, score_avg, len(agent.memory)))

                # Save data for plot
                episodes.append(e)
                scores_avg.append(score_avg)
                scores_raw.append(score)

                # Redraw from scratch each episode: without clearing, every
                # call stacks another line on the same axes and a bare
                # plt.grid() toggles the grid on and off.
                plt.clf()
                plt.plot(episodes, scores_avg, 'b')
                plt.xlabel('episode')
                plt.ylabel('average score')
                plt.title('LunarLanderContinuous DDPG PER TF')
                plt.grid(True)
                plt.savefig("./save_model/LunarLanderContinuous_ddpg_per_TF.png")

                # Stop once the moving average exceeds END_SCORE
                if score_avg > END_SCORE:
                    agent.save_model()
                    end = True
                    break
        if end == True:
            env.close()
            # File names follow this notebook's other artifacts; the previous
            # 'cartpole_dqn_per_TF_*' names were left over from another
            # experiment and would overwrite that experiment's data.
            np.save('./save_model/data/LunarLanderContinuous_ddpg_per_TF_epi',  episodes)
            np.save('./save_model/data/LunarLanderContinuous_ddpg_per_TF_scores_avg',scores_avg)
            np.save('./save_model/data/LunarLanderContinuous_ddpg_per_TF_scores_raw',scores_raw)
            print("End")
            break

Model: "actor"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_14 (Dense)             multiple                  576       
_________________________________________________________________
dense_15 (Dense)             multiple                  4160      
_________________________________________________________________
dense_16 (Dense)             multiple                  1040      
_________________________________________________________________
dense_17 (Dense)             multiple                  34        
Total params: 5,810
Trainable params: 5,810
Non-trainable params: 0
_________________________________________________________________
Model: "critic"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                multiple                  288       
___________________________________________

TypeError: 'int' object is not subscriptable

In [None]:
# Close the environment created in the training cell. The training loop only
# calls env.close() itself when the END_SCORE target is reached, so this cell
# covers runs that stop for any other reason.
env.close()