In [1]:
#A2C

In [2]:
import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Lambda
from tensorflow.keras.optimizers import Adam

import numpy as np
import matplotlib.pyplot as plt


## A2C Actor NN

In [3]:
class Actor(Model):
    def __init__(self, action_dim, action_bound):
        super(Actor, self).__init__()
        self.action_bound = action_bound

        self.h1 = Dense(64, activation='relu')
        self.h2 = Dense(64, activation='relu')
        self.h3 = Dense(16, activation='relu')
        self.mu = Dense(action_dim, activation='tanh')
        self.std = Dense(action_dim, activation='softplus')

    def call(self, states):
        x = self.h1(states)
        x = self.h2(x)
        x = self.h3(x)
        mu = self.mu(x)
        std = self.std(x)

        mu = Lambda(lambda x: x * self.action_bound)(mu)

        return [mu, std]

# Critic NN

In [4]:
class Critic(Model):
    def __init__(self):
        super(Critic, self).__init__()
        self.h1 = Dense(64, activation='relu')
        self.h2 = Dense(32, activation='relu')
        self.h3 = Dense(16, activation='relu')
        self.v = Dense(1, activation='linear')

    def call(self, states):
        x = self.h1(states)
        x = self.h2(x)
        x = self.h3(x)
        v = self.v(x)

        return v

## agent

In [5]:
class A2Cagent(object):
    def __init__(self, env):
        self.GAMMA = 0.99
        self.BATCH_SIZE = 32
        self.ACTOR_LR = 0.0001
        self.CRITIC_LR = 0.001

        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_bound = env.action_space.high[0]
        self.std_bound = [1e-2, 1.0]

        self.actor = Actor(self.action_dim, self.action_bound)
        self.critic = Critic()
        self.actor.build(input_shape=(None, self.state_dim))
        self.critic.build(input_shape=(None, self.state_dim))

        self.actor_opt = Adam(self.ACTOR_LR)
        self.critic_opt = Adam(self.CRITIC_LR)

        self.save_episode_reward = []

    def log_pdf(self, mu, std, actions):
        std = tf.clip_by_value(std, self.std_bound[0], self.std_bound[1])
        var = std ** 2
        log_policy_pdf = -0.5 * (actions - mu) ** 2 / var - 0.5 * tf.math.log(2 * np.pi * var)
        return tf.reduce_sum(log_policy_pdf, 1, keepdims=True)
    

    def get_action(self, state):
        mu, std = self.actor(state)
        mu = mu.numpy()[0]
        std = std.numpy()[0]
        std = np.clip(std, self.std_bound[0], self.std_bound[1])
        action = np.random.normal(mu, std, size=self.action_dim)

        return action
    
    def actor_learn(self, states, actions, advantages):
        with tf.GradientTape() as tape:
            mu, std = self.actor(states, training=True)
            log_policy_pdf = self.log_pdf(mu, std, actions)

            loss_policy = log_policy_pdf * advantages
            loss = -tf.reduce_mean(-loss_policy)

        grads = tape.gradient(loss, self.actor.trainable_variables)
        self.actor_opt.apply_gradients(zip(grads, self.actor.trainable_variables))


    def critic_learn(self, states, td_targets):
        with tf.GradientTape() as tape:
            td_hat = self.critic(states, training=True)
            loss = tf.reduce_mean(tf.square(td_targets - td_hat))

        grads = tape.gradient(loss, self.critic.trainable_variables)
        self.critic_opt.apply_gradients(zip(grads, self.critic.trainable_variables))


    def td_target(self, rewards, next_v, dones):
        y_i = np.zeros(next_v.shape)
        for i in range(next_v.shape[0]):
            if dones[i]:
                y_i[i] = rewards[i]
            else:
                y_i[i] = rewards[i] + self.GAMMA * next_v[i]
        return y_i
    
    def load_weights(self, path):
        actor_weights_path = path + 'pendulum_actor.h5'
        critic_weights_path = path + 'pendulum_critic.h5'
        self.actor.load_weights(actor_weights_path)
        self.critic.load_weights(critic_weights_path)

    def unpack_batch(self, batch):
        unpack = batch[0]
        for idx in range(1, len(batch)):
            unpack = np.append(unpack, batch[idx], axis=0)
        return unpack
    
    def train(self, max_episodes=1000):
        for ep in range(int(max_episodes)):
            batch_state, batch_action, batch_reward, batch_next_state, batch_done = [], [], [], [], []
            time, episode_reward, done = 0, 0, False
            state = self.env.reset()
            state = state[0]
            i = 0
            while not done:
                i= i+1
                print (i)
                action = self.get_action(tf.convert_to_tensor([state] , dtype=tf.float32))
                action = np.clip(action, -self.action_bound, self.action_bound)
                next_state, reward, done, _, _ = self.env.step(action)

                state = np.reshape(state, [1, self.state_dim])
                action = np.reshape(action, [1, self.action_dim])
                reward = np.reshape(reward, [1, 1])
                next_state = np.reshape(next_state, [1, self.state_dim])
                done = np.reshape(done, [1, 1])

                train_reward = (reward + 8) / 8

                batch_state.append(state)
                batch_action.append(action)
                batch_reward.append(train_reward)
                batch_next_state.append(next_state)
                batch_done.append(done)

                if len(batch_state) < self.BATCH_SIZE:
                    state = next_state[0]
                    episode_reward += reward[0]
                    time += 1
                    continue

                states = self.unpack_batch(batch_state)
                actions = self.unpack_batch(batch_action)
                train_reward = self.unpack_batch(batch_reward)
                next_states = self.unpack_batch(batch_next_state)
                dones = self.unpack_batch(batch_done)

                batch_state, batch_action, batch_reward, batch_next_state, batch_done = [], [], [], [], []

                # calculate td_target
                next_v = self.critic(tf.convert_to_tensor(next_states, dtype=tf.float32))
                td_targets = self.td_target(train_reward, next_v, dones)

                # update critic
                self.critic_learn(tf.convert_to_tensor(states, dtype=tf.float32), tf.convert_to_tensor(td_targets, dtype=tf.float32))

                # calculate advantages
                v = self.critic(tf.convert_to_tensor(states, dtype=tf.float32))
                next_v = self.critic(tf.convert_to_tensor(next_states, dtype=tf.float32))
                advantages = train_reward + self.GAMMA * next_v * (1 - dones) - v

                # update actor
                self.actor_learn(tf.convert_to_tensor(states, dtype=tf.float32), tf.convert_to_tensor(
                    actions, dtype=tf.float32), tf.convert_to_tensor(advantages, dtype=tf.float32))
                
                state = next_state[0]
                episode_reward += reward[0]
                time += 1

            print('Episode: ', ep, ' Reward: ', episode_reward)
            self.save_episode_reward.append(episode_reward)

            if ep % 10 == 0:
                self.actor.save_weights('./weights/pendulum_actor.h5')
                self.critic.save_weights('./weights/pendulum_critic.h5')

        np.savetxt('./weights/pendulum_actor.txt', self.save_episode_reward, fmt='%f')
        print(self.save_episode_reward)


        def plot_result(self):
            plt.plot(np.arange(len(self.save_episode_reward)), self.save_episode_reward)
            plt.ylabel('Total Reward')
            plt.xlabel('Episode')
            plt.show()


## main 

In [6]:
import gymnasium as gym

def main():
    max_episodes = 1000
    env = gym.make('Pendulum-v1')
    agent = A2Cagent(env)

    agent.train(max_episodes)

    agent.plot_result()

if __name__ == '__main__':
    main()    

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32


2023-03-11 18:23:44.409561: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at xla_ops.cc:418 : NOT_FOUND: could not find registered platform with id: 0x1773a55f0


NotFoundError: could not find registered platform with id: 0x1773a55f0 [Op:__inference__update_step_xla_1611]