In [1]:
import gym
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# problem = "Pendulum-v0"
problem = "LunarLanderContinuous-v2"
env = gym.make(problem)

num_states = env.observation_space.shape[0]
print("Size of State Space ->  {}".format(num_states))
num_actions = env.action_space.shape[0]
print("Size of Action Space ->  {}".format(num_actions))

upper_bound = env.action_space.high[0]
lower_bound = env.action_space.low[0]

print("Max Value of Action ->  {}".format(upper_bound))
print("Min Value of Action ->  {}".format(lower_bound))

Size of State Space ->  8
Size of Action Space ->  2
Max Value of Action ->  1.0
Min Value of Action ->  -1.0


In [3]:
class OUActionNoise:
    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.theta = theta
        self.mean = mean
        self.std_dev = std_deviation
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        # Formula taken from https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process.
        x = (
            self.x_prev
            + self.theta * (self.mean - self.x_prev) * self.dt
            + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        )
        # Store x into x_prev
        # Makes next noise dependent on current one
        self.x_prev = x
        return x

    def reset(self):
        if self.x_initial is not None:
            self.x_prev = self.x_initial
        else:
            self.x_prev = np.zeros_like(self.mean)

In [4]:
class Buffer:
    def __init__(self, buffer_capacity=100000, batch_size=64):
        # Number of "experiences" to store at max
        self.buffer_capacity = buffer_capacity
        # Num of tuples to train on.
        self.batch_size = batch_size

        # Its tells us num of times record() was called.
        self.buffer_counter = 0

        # Instead of list of tuples as the exp.replay concept go
        # We use different np.arrays for each tuple element
        self.state_buffer = np.zeros((self.buffer_capacity, num_states))
        self.action_buffer = np.zeros((self.buffer_capacity, num_actions))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, num_states))

    # Takes (s,a,r,s') obervation tuple as input
    def record(self, obs_tuple):
        # Set index to zero if buffer_capacity is exceeded,
        # replacing old records
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]

        self.buffer_counter += 1

    # Eager execution is turned on by default in TensorFlow 2. Decorating with tf.function allows
    # TensorFlow to build a static graph out of the logic and computations in our function.
    # This provides a large speed up for blocks of code that contain many small TensorFlow operations such as this one.
    @tf.function
    def update(
        self, state_batch, action_batch, reward_batch, next_state_batch,
    ):
        # Training and updating Actor & Critic networks.
        # See Pseudo Code.
        with tf.GradientTape() as tape:
            target_actions = target_actor(next_state_batch, training=True)
            y = reward_batch + gamma * target_critic(
                [next_state_batch, target_actions], training=True
            )
            critic_value = critic_model([state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables)
        critic_optimizer.apply_gradients(
            zip(critic_grad, critic_model.trainable_variables)
        )

        with tf.GradientTape() as tape:
            actions = actor_model(state_batch, training=True)
            critic_value = critic_model([state_batch, actions], training=True)
            # Used `-value` as we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss, actor_model.trainable_variables)
        actor_optimizer.apply_gradients(
            zip(actor_grad, actor_model.trainable_variables)
        )

    # We compute the loss and update parameters
    def learn(self):
        # Get sampling range
        record_range = min(self.buffer_counter, self.buffer_capacity)
        # Randomly sample indices
        batch_indices = np.random.choice(record_range, self.batch_size)

        # Convert to tensors
        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])

        self.update(state_batch, action_batch, reward_batch, next_state_batch)


# This update target parameters slowly
# Based on rate `tau`, which is much less than one.
@tf.function
def update_target(target_weights, weights, tau):
    for (a, b) in zip(target_weights, weights):
        a.assign(b * tau + a * (1 - tau))

# Create actor and critic

In [5]:
def get_actor():
    # Initialize weights between -3e-3 and 3-e3
    last_init = tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)

    inputs = layers.Input(shape=(num_states,))
    out = layers.Dense(128, activation="relu")(inputs)
    out = layers.Dense(128, activation="relu")(out)
    outputs = layers.Dense(num_actions, activation="linear", kernel_initializer=last_init)(out)

    # Our upper bound is 2.0 for Pendulum.
    outputs = outputs * upper_bound
    model = tf.keras.Model(inputs, outputs)
    return model

def get_critic():
    # State as input
    state_input = layers.Input(shape=(num_states))
    state_out = layers.Dense(16, activation="relu")(state_input)
    state_out = layers.Dense(32, activation="relu")(state_out)

    # Action as input
    action_input = layers.Input(shape=(num_actions))
    action_out = layers.Dense(32, activation="relu")(action_input)

    # Both are passed through seperate layer before concatenating
    concat = layers.Concatenate()([state_out, action_out])

    out = layers.Dense(128, activation="relu")(concat)
    out = layers.Dense(128, activation="relu")(out)
    outputs = layers.Dense(1)(out)

    # Outputs single value for give state-action
    model = tf.keras.Model([state_input, action_input], outputs)

    return model

In [6]:
def policy(state, noise_object):
    sampled_actions = tf.squeeze(actor_model(state))
    noise = noise_object()
    # Adding noise to action
    sampled_actions = sampled_actions.numpy() + noise

    # We make sure action is within bounds
    legal_action = np.clip(sampled_actions, lower_bound, upper_bound)

    return np.squeeze(legal_action)

In [7]:
std_dev = 0.3
ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))

actor_model = get_actor()
critic_model = get_critic()

target_actor = get_actor()
target_critic = get_critic()

# Making the weights equal initially
target_actor.set_weights(actor_model.get_weights())
target_critic.set_weights(critic_model.get_weights())

# Learning rate for actor-critic models
critic_lr = 0.002
actor_lr = 0.0015

critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

total_episodes = 600
# Discount factor for future rewards
gamma = 0.98
# Used to update target networks
tau = 0.005

buffer = Buffer(50000, 64)

2021-11-29 09:22:50.457081: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# To store reward history of each episode
ep_reward_list = []
# To store average reward history of last few episodes
avg_reward_list = []

# Takes about 4 min to train
for ep in range(total_episodes):

    prev_state = env.reset()
    episodic_reward = 0

    while True:
        # Uncomment this to see the Actor in action
        # But not in a python notebook.
        env.render()

        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)

        action = policy(tf_prev_state, ou_noise)

        # Recieve state and reward from environment.
        state, reward, done, info = env.step(action)

        buffer.record((prev_state, action, reward, state))
        episodic_reward += reward

        buffer.learn()
        update_target(target_actor.variables, actor_model.variables, tau)
        update_target(target_critic.variables, critic_model.variables, tau)

        # End this episode when `done` is True
        if done:
            break

        prev_state = state

    ep_reward_list.append(episodic_reward)

    # Mean of last 40 episodes
    avg_reward = np.mean(ep_reward_list[-40:])
    print("Episode * {} * Avg Reward is ==> {}".format(ep, avg_reward))
    avg_reward_list.append(avg_reward)

# Plotting graph
# Episodes versus Avg. Rewards
plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Epsiodic Reward")
plt.show()



Episode * 0 * Avg Reward is ==> -143.57117872719678
Episode * 1 * Avg Reward is ==> -143.62239395344278
Episode * 2 * Avg Reward is ==> -324.2397237582259
Episode * 3 * Avg Reward is ==> -368.34861438862845
Episode * 4 * Avg Reward is ==> -427.79785894830763
Episode * 5 * Avg Reward is ==> -452.1324575365574
Episode * 6 * Avg Reward is ==> -449.2811621768324
Episode * 7 * Avg Reward is ==> -536.7568141628435
Episode * 8 * Avg Reward is ==> -490.87714576432677
Episode * 9 * Avg Reward is ==> -468.5832149955074
Episode * 10 * Avg Reward is ==> -471.4442268830551
Episode * 11 * Avg Reward is ==> -470.4613022063022
Episode * 12 * Avg Reward is ==> -451.1878132869792
Episode * 13 * Avg Reward is ==> -429.01388484489144
Episode * 14 * Avg Reward is ==> -421.2657765233734
Episode * 15 * Avg Reward is ==> -416.39409448748586
Episode * 16 * Avg Reward is ==> -412.17929920494726
Episode * 17 * Avg Reward is ==> -405.37880069826133
Episode * 18 * Avg Reward is ==> -404.12445819545536
Episode * 19

Episode * 156 * Avg Reward is ==> -123.40412586324378
Episode * 157 * Avg Reward is ==> -123.59938740771676
Episode * 158 * Avg Reward is ==> -123.05392671778364
Episode * 159 * Avg Reward is ==> -123.34676779310253
Episode * 160 * Avg Reward is ==> -119.84474761932411
Episode * 161 * Avg Reward is ==> -119.3197355246991
Episode * 162 * Avg Reward is ==> -116.45529352014846
Episode * 163 * Avg Reward is ==> -115.55926178085531
Episode * 164 * Avg Reward is ==> -116.8747218790912
Episode * 165 * Avg Reward is ==> -117.5327256059949
Episode * 166 * Avg Reward is ==> -116.1592958597087
Episode * 167 * Avg Reward is ==> -115.97017544759107
Episode * 168 * Avg Reward is ==> -117.46100117693418
Episode * 169 * Avg Reward is ==> -117.72008158829567
Episode * 170 * Avg Reward is ==> -117.89751505374446
Episode * 171 * Avg Reward is ==> -117.70316570367686
Episode * 172 * Avg Reward is ==> -114.74049204989247
Episode * 173 * Avg Reward is ==> -115.28666084798223
Episode * 174 * Avg Reward is ==

Episode * 309 * Avg Reward is ==> -127.49886813890416
Episode * 310 * Avg Reward is ==> -126.70614578080809
Episode * 311 * Avg Reward is ==> -126.91531087873281
Episode * 312 * Avg Reward is ==> -128.21266071996683
Episode * 313 * Avg Reward is ==> -129.3598476532614
Episode * 314 * Avg Reward is ==> -131.47176526888833
Episode * 315 * Avg Reward is ==> -130.78213612595218
Episode * 316 * Avg Reward is ==> -130.75660992754774
Episode * 317 * Avg Reward is ==> -131.35829276070746
Episode * 318 * Avg Reward is ==> -131.70633286733155
Episode * 319 * Avg Reward is ==> -131.79721798623703
Episode * 320 * Avg Reward is ==> -130.28806733580282
Episode * 321 * Avg Reward is ==> -131.0136930579942
Episode * 322 * Avg Reward is ==> -130.9890754965368
Episode * 323 * Avg Reward is ==> -131.75227053292082
Episode * 324 * Avg Reward is ==> -131.40589147687552
Episode * 325 * Avg Reward is ==> -133.94625184620554
Episode * 326 * Avg Reward is ==> -130.947706909284
Episode * 327 * Avg Reward is ==>

Episode * 462 * Avg Reward is ==> -127.60892651858413
Episode * 463 * Avg Reward is ==> -126.91667408932904
Episode * 464 * Avg Reward is ==> -126.82269217295614
Episode * 465 * Avg Reward is ==> -128.05758990797523
Episode * 466 * Avg Reward is ==> -128.03864089823537
Episode * 467 * Avg Reward is ==> -128.02023870664496
Episode * 468 * Avg Reward is ==> -128.33873570036502
Episode * 469 * Avg Reward is ==> -129.02866823204295
Episode * 470 * Avg Reward is ==> -128.26722646838422
Episode * 471 * Avg Reward is ==> -129.69195218722479
Episode * 472 * Avg Reward is ==> -129.57405366310041
Episode * 473 * Avg Reward is ==> -128.94043704242208
Episode * 474 * Avg Reward is ==> -128.90755501978123
Episode * 475 * Avg Reward is ==> -134.2756520691105
Episode * 476 * Avg Reward is ==> -136.68273890919394
Episode * 477 * Avg Reward is ==> -136.72953856968041
Episode * 478 * Avg Reward is ==> -137.6591402843854
Episode * 479 * Avg Reward is ==> -133.1082213518343
Episode * 480 * Avg Reward is =