In [1]:
# Locate the RL_Note project root from the current working directory and make
# it importable, so that the `pys.*` packages used below can be resolved.
import os, sys
import warnings

cwd = os.getcwd()
pos = cwd.find('RL_Note')
if pos == -1:
    # str.find returns -1 when the marker is absent; slicing with it would
    # silently build a bogus path (cwd minus its last character + 'RL_Note').
    root_path = None
    warnings.warn("'RL_Note' was not found in the working directory ({0}); "
                  "sys.path is left unchanged.".format(cwd))
else:
    root_path = cwd[0:pos] + 'RL_Note'
    # Avoid stacking duplicate entries when the cell is re-run.
    if root_path not in sys.path:
        sys.path.append(root_path)

In [2]:
# References:
#  https://pasus.tistory.com/138
#  https://horomary.hatenablog.com/entry/2020/06/26/003806
#  https://keras.io/examples/rl/ddpg_pendulum/
#
import gym
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate, concatenate, Lambda
import matplotlib.pyplot as plt
from pys.utils.ou_noise import OUActionNoise
from pys.utils.ER import ReplayMemory
from pys.utils.PER import ProportionalPrioritizedMemory
from pys.utils.HER import HindsightMemory
from pys.config.env_config import env_configs

In [3]:
class Actor(tf.keras.Model):
    """Deterministic policy network: maps a state batch to a scaled action batch.

    Two 64-unit ReLU layers feed a tanh output layer with ``action_size`` units;
    the tanh output (range -1 ~ +1) is multiplied by ``action_max``.

    Args:
        state_size: Size of the state vector. Accepted for interface symmetry;
            not used here because the Dense layers infer their input shape.
        action_size: Number of units of the output layer.
        action_min: Lower action bound. Stored on the instance, but not used
            when scaling the output.
        action_max: Upper action bound, used as the scale of the tanh output.
    """
    def __init__(self, state_size, action_size, action_min, action_max):
        super(Actor, self).__init__()
        self.action_min = action_min
        self.action_max = action_max

        self.fc1 = Dense(64, activation='relu')
        self.fc2 = Dense(64, activation='relu')
        # Small uniform initial weights keep the initial tanh output close to 0.
        self.out= Dense(action_size, activation='tanh',kernel_initializer = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)) # -1 ~ +1

    def call(self, x):
        """Return the action for state batch ``x``, scaled by ``action_max``."""
        x       = self.fc1(x)
        x       = self.fc2(x)
        action  = self.out(x)
        # Plain tensor arithmetic instead of instantiating a new Lambda layer
        # on every forward pass; the computed value is the same.
        return action * self.action_max

class Critic(tf.keras.Model):
    """Q-network: scores a (state, action) pair with a single linear output.

    The state goes through a 16 -> 32 unit ReLU branch and the action through a
    32 -> 32 unit ReLU branch; both are concatenated on the last axis and passed
    through two 64-unit ReLU layers before the 1-unit linear head.

    Args:
        state_size: Accepted for interface symmetry; not used by the layers.
        action_size: Accepted for interface symmetry; not used by the layers.
    """
    def __init__(self, state_size, action_size):
        super().__init__()
        # State branch
        self.s1 = Dense(16, activation='relu')
        self.s2 = Dense(32, activation='relu')
        # Action branch
        self.a1 = Dense(32, activation='relu')
        self.a2 = Dense(32, activation='relu')
        # Joint trunk and Q-value head
        self.fc1= Dense(64, activation='relu')
        self.fc2= Dense(64, activation='relu')
        self.out= Dense(1,  activation='linear')

    def call(self,state_action):
        """Return Q for ``state_action``, a sequence holding (state, action)."""
        state_features  = self.s2(self.s1(state_action[0]))
        action_features = self.a2(self.a1(state_action[1]))
        merged = concatenate([state_features, action_features], axis=-1)
        hidden = self.fc2(self.fc1(merged))
        return self.out(hidden)




In [4]:
class DDPGAgent:
    """DDPG agent: actor/critic networks, their target copies and a replay memory.

    Args:
        env: Environment exposing ``observation_space.shape`` and an
            ``action_space`` with ``shape``, ``low`` and ``high``.
        cfg: Configuration dict. Reads ``cfg["ENV"]``, ``cfg["RL"]`` and
            ``cfg["ER"]`` ("ER", "PER" or "HER", case-insensitive). When the
            replay type is "HER", ``cfg["HER"]`` must provide ``REPLAY_N``,
            ``STRATEGY``, ``REWARD_FUNC`` and ``DONE_FUNC``.

    Raises:
        ValueError: If ``cfg["ER"]`` is not one of the supported replay types.
    """
    def __init__(self, env:object, cfg:dict):
        self.state_size = env.observation_space.shape[0]
        self.action_size= env.action_space.shape[0]
        # Only the bounds of the first action dimension are used for clipping/scaling.
        self.action_min = env.action_space.low[0]
        self.action_max = env.action_space.high[0]
        self.env_name   = cfg["ENV"]
        self.rl_type    = "DDPG"
        self.er_type    = cfg["ER"].upper()
        print('Env Name : ',cfg["ENV"])
        print('States {0}, Actions {1}'.format(self.state_size, self.action_size))
        for i in range(self.action_size):
            print(i+1,'th Action space {0:.2f} ~ {1:.2f}'.format(env.action_space.low[i], env.action_space.high[i]))
        self.filename = cfg["ENV"] + '_' + cfg["RL"] + '_' + cfg["ER"]

        # Experience Replay
        self.batch_size = 64
        self.train_start = 2000
        self.buffer_size = 50000
        if self.er_type == "ER":
            self.memory = ReplayMemory(capacity=self.buffer_size)
        elif self.er_type == "PER":
            self.memory = ProportionalPrioritizedMemory(capacity=self.buffer_size)
        elif self.er_type == "HER":
            self.memory = HindsightMemory(
                capacity            = self.buffer_size,
                replay_n            = cfg["HER"]["REPLAY_N"],
                replay_strategy     = cfg["HER"]["STRATEGY"],
                reward_func         = cfg["HER"]["REWARD_FUNC"],
                done_func           = cfg["HER"]["DONE_FUNC"])
            self.filename = cfg["ENV"] + '_' + cfg["RL"] + '_' + cfg["ER"] + '_' + cfg["HER"]["STRATEGY"]
        else:
            # Fail early instead of leaving self.memory undefined.
            raise ValueError(
                'Unsupported replay type {0!r}; expected "ER", "PER" or "HER"'.format(cfg["ER"]))

        # Hyper params for learning
        self.discount_factor = 0.99
        self.actor_learning_rate  = 0.001
        self.critic_learning_rate = 0.002
        self.tau = 0.005

        # Networks
        self.critic         = Critic(self.state_size, self.action_size)
        self.target_critic  = Critic(self.state_size, self.action_size)
        self.actor          = Actor(self.state_size, self.action_size, self.action_min, self.action_max)
        self.target_actor   = Actor(self.state_size, self.action_size, self.action_min, self.action_max)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        self.critic_optimizer   = tf.keras.optimizers.Adam(learning_rate=self.critic_learning_rate)
        self.actor_optimizer    = tf.keras.optimizers.Adam(learning_rate=self.actor_learning_rate)

        # Build all four networks so their weights exist before the hard copy below.
        self.actor.build(input_shape=(None, self.state_size))
        self.target_actor.build(input_shape=(None, self.state_size))
        state_in = Input((self.state_size,))
        action_in = Input((self.action_size,))
        self.actor(state_in)
        self.target_actor(state_in)
        self.critic([state_in, action_in])
        self.target_critic([state_in, action_in])
        # self.actor.summary()
        # self.critic.summary()

        self.hard_update_target_model()

        # Noise: one noise component per action dimension
        # (identical to the previous fixed size of 1 when action_size == 1).
        self.noise_std_dev = 0.2
        self.ou_noise = OUActionNoise(
            mean=np.zeros(self.action_size),
            std_deviation=float(self.noise_std_dev) * np.ones(self.action_size))

        # Miscellaneous
        self.show_media_info = False

    def remember(self, state, action, reward, next_state, done, goal=None):
        """Convert one transition to float32 arrays and append it to the memory.

        ``goal`` is stored as a sixth element only when the replay type is "HER";
        otherwise it is ignored.
        """
        state       = np.array(state,       dtype=np.float32)
        action      = np.array(action,      dtype=np.float32)
        reward      = np.array([reward],    dtype=np.float32)
        done        = np.array([done],      dtype=np.float32)
        next_state  = np.array(next_state,  dtype=np.float32)
        if self.er_type == "HER":
            goal        = np.array(goal,        dtype=np.float32)
            transition  = (state, action, reward, next_state, done, goal)
        else:
            transition  = (state, action, reward, next_state, done)
        self.memory.append(transition)
        return

    def hard_update_target_model(self):
        """Copy the online actor/critic weights into the target networks."""
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

    def soft_update_target_model(self):
        """Move each target variable toward its online counterpart by ``tau``."""
        tau = self.tau
        for (net, target_net) in zip(   self.actor.trainable_variables,
                                        self.target_actor.trainable_variables):
            target_net.assign(tau * net + (1.0 - tau) * target_net)
        for (net, target_net) in zip(   self.critic.trainable_variables,
                                        self.target_critic.trainable_variables):
            target_net.assign(tau * net + (1.0 - tau) * target_net)

    def get_action(self,state):
        """Return the actor's action for ``state`` plus OU noise, clipped to the action bounds."""
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        action = self.actor(state)
        # Exploration and Exploitation
        action_from_net = action.numpy()[0]
        action_from_noise = self.ou_noise()
        return np.clip(action_from_net+action_from_noise,self.action_min,self.action_max)

    def train_model(self):
        """Run one critic update, one actor update and a soft target update.

        Returns:
            ``(critic_loss, actor_loss)`` for the sampled batch, or
            ``(0.0, 0.0)`` while the memory holds fewer than ``train_start``
            transitions.
        """
        # Train from Experience Replay
        # Training Condition - Memory Size
        if len(self.memory) < self.train_start:
            return 0.0,0.0
        # Sampling from the memory
        idxs, is_weights = None, None
        if self.er_type == "PER":
            mini_batch, idxs, is_weights = self.memory.sample(self.batch_size)
        else:
            mini_batch = self.memory.sample(self.batch_size)

        states      = tf.convert_to_tensor(np.array([sample[0] for sample in mini_batch]))
        actions     = tf.convert_to_tensor(np.array([sample[1] for sample in mini_batch]))
        rewards     = tf.convert_to_tensor(np.array([sample[2] for sample in mini_batch]))
        next_states = tf.convert_to_tensor(np.array([sample[3] for sample in mini_batch]))
        dones       = tf.convert_to_tensor(np.array([sample[4] for sample in mini_batch]))

        # One-time dump of the batch shapes on the first training step.
        if not self.show_media_info:
            self.show_media_info = True
            print('Start to train, check batch shapes')
            print('shape of states', np.shape(states),type(states))
            print('shape of actions', np.shape(actions),type(actions))
            print('shape of rewards', np.shape(rewards),type(rewards))
            print('shape of next_states', np.shape(next_states),type(next_states))
            print('shape of dones', np.shape(dones),type(dones))
            if self.er_type == "HER":
                goals = tf.convert_to_tensor(np.array([sample[5] for sample in mini_batch]))
                print('shape of goals', np.shape(goals),type(goals))

        # Bootstrap target: depends only on the target networks, so it is
        # computed outside the tape (no gradient is needed through it).
        target_actions = self.target_actor(next_states,training=True)
        target_q = self.target_critic([next_states,target_actions],training=True)
        target_value = rewards + (1 - dones) * self.discount_factor * target_q

        # Update critic
        with tf.GradientTape() as tape:
            q = self.critic([states, actions],training=True)
            td_error = target_value - q
            if self.er_type == "PER":
                # Force the importance weights into a column so they pair
                # element-wise with the (batch, 1) errors instead of
                # broadcasting to (batch, batch).
                # NOTE(review): the shape returned by the PER memory is not
                # visible here; the reshape is a no-op if it is already a column.
                weights = tf.reshape(tf.cast(is_weights, td_error.dtype), (-1, 1))
                critic_loss = tf.math.reduce_mean(weights * tf.math.square(td_error))
            else:
                critic_loss = tf.math.reduce_mean(tf.math.square(td_error))
        critic_loss_out = critic_loss.numpy()
        critic_params = self.critic.trainable_variables
        critic_grads = tape.gradient(critic_loss, critic_params)
        self.critic_optimizer.apply_gradients(zip(critic_grads, critic_params))

        # Update actor: maximise the critic's value of the actor's own actions.
        with tf.GradientTape() as tape:
            new_actions = self.actor(states,training=True)
            new_q = self.critic([states, new_actions],training=True)
            actor_loss = -tf.reduce_mean(new_q)
        actor_loss_out = actor_loss.numpy()
        actor_params = self.actor.trainable_variables
        actor_grads = tape.gradient(actor_loss, actor_params)
        self.actor_optimizer.apply_gradients(zip(actor_grads, actor_params))

        if self.er_type == "PER":
            # Feed the TD errors back to the memory as the new sample priorities.
            sample_importance = td_error.numpy()
            for idx, importance in zip(idxs, sample_importance):
                self.memory.update(idx, importance)

        self.soft_update_target_model()
        return critic_loss_out, actor_loss_out

    def load_model(self):
        """Placeholder: weight loading is currently disabled (see commented lines)."""
        # self.actor.load_weights( "./save_model/" + self.filename + "_TF_actor")
        # self.critic.load_weights("./save_model/" + self.filename + "_TF_critic")
        return

    def save_model(self):
        """Placeholder: weight saving is currently disabled (see commented lines)."""
        # self.actor.save_weights( "./save_model/" + self.filename + "_TF_actor", save_format="tf")
        # self.critic.save_weights("./save_model/" + self.filename + "_TF_critic", save_format="tf")
        return

In [5]:
def done_function(state):
    """Termination predicate: always reports "not done", whatever ``state`` is."""
    return False

def reward_function(state, action, next_state, done):
    """Return the negative quadratic cost for a pendulum transition.

    Args:
        state: Sequence ``(cos(theta), sin(theta), theta_dot)``.
        action: Sequence whose first element is the applied torque ``u``.
        next_state: Unused; kept for the reward-function signature.
        done: Unused; kept for the reward-function signature.

    Returns:
        ``-(theta**2 + 0.1 * theta_dot**2 + 0.001 * u**2)``.
    """
    costh   = state[0]
    sinth   = state[1]
    # arctan2 recovers the angle already wrapped to [-pi, pi].
    th      = np.arctan2(sinth,costh)
    thdot   = state[2]
    u       = action[0]
    # The velocity term must be squared: a linear term would reward
    # negative angular velocity instead of penalising its magnitude.
    costs   = th ** 2 + 0.1 * (thdot ** 2) + 0.001 * (u ** 2)
    return -costs

In [11]:
# Experiment configuration: environment id, algorithm name and replay-memory type.
cfg = {
    "ENV": "Pendulum-v0",
    "RL": "DDPG",
    "ER": "ER",
    # To use hindsight replay, set "ER" to "HER" and uncomment this section:
    # "HER": {
    #     "REPLAY_N": 8,
    #     "STRATEGY": "FINAL",
    #     "REWARD_FUNC": reward_function,
    #     "DONE_FUNC": done_function,
    # },
}   # this closing brace must stay outside the commented-out HER section
env_config = env_configs[cfg["ENV"]]
FILENAME = cfg["ENV"] + '_' + cfg["RL"] + '_' + cfg["ER"]
EPISODES = env_config["EPISODES"]
END_SCORE = env_config["END_SCORE"]

In [12]:
%matplotlib tk

# Training cell: runs up to EPISODES episodes, trains the agent after every
# environment step and redraws the score / loss curves at the end of each episode.
figure = plt.gcf()
figure.set_size_inches(8,6)

if __name__ == "__main__":
    env = gym.make(cfg["ENV"])
    agent = DDPGAgent(env, cfg)
    scores_avg, scores_raw, episodes, losses = [], [], [], []
    critic_mean, actor_mean = [], []
    score_avg = 0
    end = False
    show_media_info = True
    goal = np.array([1.0,0.0,0.0])

    # try/finally guarantees the environment is closed however the loop ends:
    # END_SCORE reached, EPISODES exhausted, or a manual interrupt.
    try:
        for e in range(EPISODES):
            done = False
            score = 0
            state = env.reset()
            critic_losses = []
            actor_losses = []
            while not done:
                # env.render()

                # Interact with env.
                action = agent.get_action(state)
                next_state, reward, done, info = env.step(action)
                # NOTE(review): `done` is stored as-is; if the environment only
                # ends episodes through a step limit, this also cuts the
                # bootstrap target at that step — confirm this is intended.
                agent.remember(state, action, reward, next_state, done, goal)
                critic_loss, actor_loss = agent.train_model()
                state = next_state

                score += reward
                critic_losses.append(critic_loss)
                actor_losses.append(actor_loss)
                # One-time dump of the transition shapes on the very first step.
                if show_media_info:
                    print("State Shape : ", np.shape(state),    type(state),    state)
                    print("Action Shape : ",np.shape(action),   type(action),   action)
                    print("Reward Shape : ",np.shape(reward),   type(reward),   reward)
                    print("Done Shape : ",  np.shape(done),     type(done),     done)
                    print("Goal Shape : ",  np.shape(goal),     type(goal),     goal)
                    show_media_info = False
                if done:
                    # Exponential moving average of the score, seeded with the first score.
                    score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
                    print("episode: {0:3d} | score avg: {1:3.2f} | mem size {2:6d} |"
                        .format(e, score_avg, len(agent.memory)))

                    episodes.append(e)
                    scores_avg.append(score_avg)
                    scores_raw.append(score)
                    critic_mean.append(np.mean(critic_losses))
                    actor_mean.append(np.mean(actor_losses))
                    # View data
                    plt.clf()
                    plt.subplot(311)
                    plt.plot(episodes, scores_avg, 'b')
                    plt.plot(episodes, scores_raw, 'b', alpha=0.8, linewidth=0.5)
                    plt.xlabel('episode'); plt.ylabel('average score'); plt.grid()
                    plt.title(cfg["ENV"] +'_' + cfg["RL"] +'_' + cfg["ER"])
                    plt.subplot(312)
                    plt.plot(episodes, critic_mean, 'b.',markersize=3)
                    plt.xlabel('episode'); plt.ylabel('critic loss'); plt.grid()
                    plt.subplot(313)
                    plt.plot(episodes, actor_mean, 'b.',markersize=3)
                    plt.xlabel('episode'); plt.ylabel('actor loss'); plt.grid()
                    # plt.savefig("./result/" + FILENAME + "_TF.jpg", dpi=100)

                    # Stop training once the moving-average score exceeds END_SCORE
                    if score_avg > END_SCORE:
                        agent.save_model()
                        end = True
                        break
            if end:
                # np.save("./save_model/data/" + FILENAME + "_TF_epi",  episodes)
                # np.save("./save_model/data/" + FILENAME + "_TF_scores_avg",scores_avg)
                # np.save("./save_model/data/" + FILENAME + "_TF_scores_raw",scores_raw)
                # np.save("./save_model/data/" + FILENAME + "_TF_critic_mean",critic_mean)
                # np.save("./save_model/data/" + FILENAME + "_TF_actor_mean",actor_mean)
                print("End")
                break
    finally:
        env.close()

Env Name :  Pendulum-v0
States 3, Actions 1
1 th Action space -2.00 ~ 2.00
State Shape :  (3,) <class 'numpy.ndarray'> [ 0.9938495  -0.11073921  0.76229894]
Action Shape :  (1,) <class 'numpy.ndarray'> [-0.0018271]
Reward Shape :  () <class 'numpy.float64'> -0.09860783483182112
Done Shape :  () <class 'bool'> False
Goal Shape :  (3,) <class 'numpy.ndarray'> [1. 0. 0.]
episode:   0 | score avg: -762.92 | mem size    400 |
episode:   1 | score avg: -823.49 | mem size    800 |
episode:   2 | score avg: -815.68 | mem size   1200 |
episode:   3 | score avg: -919.77 | mem size   1600 |
Start to train, check batch shapes
shape of mini_batch (64, 6) <class 'list'>
shape of states (64, 3) <class 'tensorflow.python.framework.ops.EagerTensor'>
shape of actions (64, 1) <class 'tensorflow.python.framework.ops.EagerTensor'>
shape of rewards (64, 1) <class 'tensorflow.python.framework.ops.EagerTensor'>
shape of next_states (64, 3) <class 'tensorflow.python.framework.ops.EagerTensor'>
shape of dones (

KeyboardInterrupt: 

In [None]:
# Close the environment created by the training cell (e.g. after interrupting a run).
env.close()

  self.func()
