In [1]:
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow.keras import layers

# Enable on-demand GPU memory allocation on every visible GPU. Iterating over
# the device list (instead of indexing element 0) keeps this cell runnable on
# CPU-only machines, where the list is empty, and applies the same setting to
# all GPUs on multi-GPU hosts.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

# Execute @tf.function-decorated functions eagerly. The update functions in
# this notebook read batches from Buffer objects (NumPy-based sampling), which
# has to run on every call rather than being captured once during tracing.
tf.config.run_functions_eagerly(True)

In [2]:
# Create the environment used throughout the notebook.
problem = "Pendulum-v0"
env = gym.make(problem)

# Read the sizes of the observation/action vectors and the action limits
# (first component of each bound) from the environment's spaces.
num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]
upper_bound = env.action_space.high[0]
lower_bound = env.action_space.low[0]

# Report what was found.
print("Size of State Space ->  {}".format(num_states))
print("Size of Action Space ->  {}".format(num_actions))
print("Max Value of Action ->  {}".format(upper_bound))
print("Min Value of Action ->  {}".format(lower_bound))

Size of State Space ->  3
Size of Action Space ->  1
Max Value of Action ->  2.0
Min Value of Action ->  -2.0


In [3]:
class Buffer:
    """Fixed-capacity ring buffer of (state, action, reward, next_state) transitions.

    Transitions are stored in pre-allocated NumPy arrays. Once `capacity`
    transitions have been inserted, new ones overwrite the oldest slots.

    Args:
        capacity: maximum number of transitions kept.
        batch_size: number of transitions drawn by `sample`.
        state_dim: length of a state vector; defaults to the notebook-level
            `num_states` when None.
        action_dim: length of an action vector; defaults to the notebook-level
            `num_actions` when None.
    """

    def __init__(self, capacity=100000, batch_size=64, state_dim=None, action_dim=None):
        # Fall back to the environment dimensions defined at notebook level.
        if state_dim is None:
            state_dim = num_states
        if action_dim is None:
            action_dim = num_actions

        # Number of "experiences" to store at max
        self.capacity = capacity
        # Total number of insertions so far (keeps growing past `capacity`)
        self.counter = 0
        # Honour the caller's batch size (previously always overwritten with 64)
        self.batch_size = batch_size
        self.state_buffer = np.zeros((self.capacity, state_dim))
        self.action_buffer = np.zeros((self.capacity, action_dim))
        self.reward_buffer = np.zeros((self.capacity, 1))
        self.next_state_buffer = np.zeros((self.capacity, state_dim))

    def insert(self, obs_tuple):
        """Store one (state, action, reward, next_state) tuple at the write position."""
        index = self.index
        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]

        self.counter += 1

    def append(self, d):
        """Insert every transition tuple from the iterable `d`, in order."""
        for transition in d:
            self.insert(transition)

    @property
    def raw(self):
        """List of the stored transitions as tuples, in storage-slot order."""
        return [
            (
                self.state_buffer[i],
                self.action_buffer[i],
                self.reward_buffer[i],
                self.next_state_buffer[i],
            )
            for i in range(self.size)
        ]

    @property
    def size(self):
        """Number of valid transitions currently held (never exceeds `capacity`)."""
        return min(self.counter, self.capacity)

    @property
    def index(self):
        """Slot that the next `insert` will write to."""
        return self.counter % self.capacity

    def _to_tensors(self, rows):
        """Convert the selected rows (slice or index array) of each array to tensors.

        The reward tensor is cast to float32; the other tensors keep the dtype
        produced by `tf.convert_to_tensor`.
        """
        state_batch = tf.convert_to_tensor(self.state_buffer[rows])
        action_batch = tf.convert_to_tensor(self.action_buffer[rows])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[rows])
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[rows])
        return state_batch, action_batch, reward_batch, next_state_batch

    @property
    def batches(self):
        """All valid transitions as (state, action, reward, next_state) tensors.

        Only the first `size` rows are returned, so slots that were never
        written (still all zeros) are not handed to the training code.
        """
        return self._to_tensors(slice(0, self.size))

    @property
    def sample(self):
        """A random batch of `batch_size` transitions as tensors.

        Indices are drawn uniformly with replacement from the valid rows. While
        fewer than `batch_size` transitions have been inserted, every valid
        transition is returned instead.
        """
        if self.batch_size > self.counter:
            return self.batches

        idx = np.random.choice(self.size, self.batch_size)
        return self._to_tensors(idx)
        
        

In [4]:
def get_actor():
    """Build the policy network mapping a state to a bounded action.

    Architecture: two 256-unit ReLU layers followed by a tanh output layer with
    one unit per action dimension (`num_actions`), scaled by `upper_bound`.

    Returns:
        An uncompiled `tf.keras.Model` taking a `(batch, num_states)` input.
    """
    # Initialize the output layer's weights between -3e-3 and 3e-3 so the
    # initial pre-activations of the tanh layer are small.
    last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)
    inputs = layers.Input(shape=(num_states,))
    out = layers.Dense(256, activation="relu")(inputs)
    out = layers.Dense(256, activation="relu")(out)
    # One output unit per action dimension (was hard-coded to 1).
    outputs = layers.Dense(num_actions, activation="tanh", kernel_initializer=last_init)(out)

    # tanh is in [-1, 1]; scale to the action range. Our upper bound is 2.0 for
    # Pendulum. NOTE(review): assumes the action range is symmetric around zero.
    outputs = outputs * upper_bound
    model = tf.keras.Model(inputs, outputs)
    return model


def get_critic():
    """Build the critic network scoring a (state, action) pair with one value.

    The state and the action each pass through their own dense layers, are
    concatenated, and then go through two 256-unit ReLU layers and a linear
    single-unit output.

    Returns:
        An uncompiled `tf.keras.Model` taking `[state, action]` inputs.
    """
    # State as input. The shape is given as a 1-tuple, matching get_actor;
    # `(num_states)` without the trailing comma is just an int.
    state_input = layers.Input(shape=(num_states,))
    state_out = layers.Dense(16, activation="relu")(state_input)
    state_out = layers.Dense(32, activation="relu")(state_out)

    # Action as input
    action_input = layers.Input(shape=(num_actions,))
    action_out = layers.Dense(32, activation="relu")(action_input)

    # Both are passed through separate layers before concatenating
    concat = layers.Concatenate()([state_out, action_out])

    out = layers.Dense(256, activation="relu")(concat)
    out = layers.Dense(256, activation="relu")(out)
    outputs = layers.Dense(1)(out)

    # Outputs a single value for the given state-action pair
    model = tf.keras.Model([state_input, action_input], outputs)

    return model

In [5]:
@tf.function
def ddpg_update(d, actor, actor_optimizer, critic, critic_optimizer, update_critic=True, random=False):
    """Run one actor (and optionally critic) gradient step on data from `d`.

    Args:
        d: a Buffer; `d.sample` is used when `random` is true, otherwise
            `d.batches` (all of its transitions).
        actor: policy model to update.
        actor_optimizer: optimizer applied to `actor.trainable_variables`.
        critic: value model; updated only when `update_critic` is true.
        critic_optimizer: optimizer applied to `critic.trainable_variables`.
        update_critic: whether to take a critic step before the actor step.
        random: whether to draw a random mini-batch instead of the full buffer.

    The discount factor is read from the notebook-level `gamma`. No separate
    target networks are used: the bootstrapped target is computed with the same
    `actor` and `critic` that are being trained.
    """
    if random:
        state_batch, action_batch, reward_batch, next_state_batch = d.sample
    else:
        state_batch, action_batch, reward_batch, next_state_batch = d.batches

    if update_critic:
        # The TD target is a fixed regression label for this step. It is built
        # outside the tape and wrapped in stop_gradient so the critic's
        # gradient comes only from its prediction for (state, action), not
        # from the bootstrapped value of the next state.
        target_actions = actor(next_state_batch, training=True)
        y = tf.stop_gradient(
            reward_batch + gamma * critic([next_state_batch, target_actions], training=True)
        )

        with tf.GradientTape() as tape:
            critic_value = critic([state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, critic.trainable_variables)
        critic_optimizer.apply_gradients(zip(critic_grad, critic.trainable_variables))

    with tf.GradientTape() as tape:
        actions = actor(state_batch, training=True)
        critic_value = critic([state_batch, actions], training=True)
        # Used `-value` as we want to maximize the value given
        # by the critic for our actions
        actor_loss = -tf.math.reduce_mean(critic_value)

    # Only the actor's variables are differentiated here; the critic is left
    # untouched by this step.
    actor_grad = tape.gradient(actor_loss, actor.trainable_variables)
    actor_optimizer.apply_gradients(zip(actor_grad, actor.trainable_variables))

# Moves the variables of `target` toward those of `model`.
# `tau` is the interpolation rate: 1.0 (the default) copies the values outright,
# a value much less than one updates the target slowly.
@tf.function
def target_update(target, model, tau=1.0):
    """Blend `model`'s variable values into `target`, in place.

    Each target variable becomes `tau * source + (1 - tau) * target`.
    Assigning to `target.variables` as a whole does not copy any weight
    values, so every variable is updated individually with `assign`.

    Args:
        target: model whose variables are overwritten.
        model: model providing the new values; must have the same variables,
            in the same order, as `target`.
        tau: interpolation rate in [0, 1].
    """
    for target_var, source_var in zip(target.variables, model.variables):
        target_var.assign(tau * source_var + (1.0 - tau) * target_var)

In [6]:
@tf.function
def teacher_update(actor, actor_optim, critic, critic_optim, d, meta):
    """Take one gradient step on the teacher actor, then one on the teacher critic.

    Args:
        actor: teacher policy model.
        actor_optim: optimizer applied to `actor.trainable_variables`.
        critic: teacher critic model.
        critic_optim: optimizer applied to `critic.trainable_variables`.
        d: a Buffer; all of its transitions (`d.batches`) are used.
        meta: scalar weight multiplied into the actor loss.
    """
    states, taken_actions, observed_rewards, _ = d.batches
    meta_weight = tf.cast(meta, dtype="float32")

    # Actor step: loss is `meta` times the summed log of the critic's output
    # for the actor's own actions.
    # NOTE(review): tf.math.log is NaN for non-positive inputs and nothing here
    # constrains the critic output to be positive, so the loss value itself may
    # be NaN -- confirm that this is the intended objective.
    with tf.GradientTape() as actor_tape:
        proposed_actions = actor(states, training=True)
        predicted = critic([states, proposed_actions], training=True)
        actor_loss = meta_weight * tf.reduce_sum(tf.math.log(predicted))

    actor_variables = actor.trainable_variables
    actor_grad = actor_tape.gradient(actor_loss, actor_variables)
    actor_optim.apply_gradients(zip(actor_grad, actor_variables))

    # Critic step: regress the critic's output for the recorded actions onto
    # the recorded rewards (mean squared error).
    with tf.GradientTape() as critic_tape:
        fitted = critic([states, taken_actions], training=True)
        critic_loss = tf.reduce_mean(tf.square(fitted - observed_rewards))

    critic_variables = critic.trainable_variables
    critic_grad = critic_tape.gradient(critic_loss, critic_variables)
    critic_optim.apply_gradients(zip(critic_grad, critic_variables))
    

In [7]:
# Query a policy for the action to take in one state.
# give: policy(model) and current state
# return: next action, as a one-element list
def next_action(policy, state):
    """Evaluate `policy` on a single state and clip the result to the action bounds."""
    # The model expects a leading batch axis.
    batched_state = np.expand_dims(state, axis=0)

    raw_action = tf.squeeze(policy(batched_state)).numpy()

    # Keep the action inside [lower_bound, upper_bound].
    clipped_action = np.clip(raw_action, lower_bound, upper_bound)

    return [np.squeeze(clipped_action)]

# Roll out a policy in the environment for at most n steps.
# give: policy, n
# return: [sars], accumulated reward
def generate(policy, n):
    """Collect (state, action, reward, next_state) tuples by running `policy` in `env`.

    The environment is reset first. The rollout stops after `n` steps, or
    earlier if the environment reports `done`.
    """
    observation = env.reset()
    transitions = []
    episode_return = 0
    for _step in range(n):
        chosen_action = next_action(policy, observation)
        next_observation, reward, done, _ = env.step(chosen_action)
        episode_return += reward
        transitions.append((observation, chosen_action, reward, next_observation))
        observation = next_observation
        if done:
            break

    return transitions, episode_return

def copy(model):
    """Return a new model with the same architecture and weight values as `model`."""
    source_weights = model.get_weights()
    duplicate = tf.keras.models.clone_model(model)
    duplicate.set_weights(source_weights)
    return duplicate


In [8]:
# --- Optimisation --------------------------------------------------------
# Learning rates for the actor/critic models.
actor_lr = 1e-4
critic_lr = 1e-3
# NOTE(review): teacher_lr and std_dev are not referenced in the cells visible
# here -- confirm whether they are still needed.
teacher_lr = 1e-4
std_dev = 0.2

# Discount factor for future rewards
gamma = 0.99

# --- Training schedule ---------------------------------------------------
total_iterations = 10_000
log_interval = 10

# --- Data collection -----------------------------------------------------
# Maximum number of environment steps per rollout, and replay-buffer capacity.
d0_rollout = 100
d1_rollout = 200
pi_rollout = 50
buffer_capacity = 50_000

# Accumulated reward recorded once per training iteration.
history_rewards = []

In [9]:
# Step 1: Initialize the teacher policy and the teacher critic
teacher_actor = get_actor()
teacher_critic = get_critic()

# Step 1 (continued): Initialize the actor/critic pair trained by ddpg_update,
# each with its own Adam optimizer
actor = get_actor()
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)
critic = get_critic()
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)

# Make the teacher's weights equal to the actor's/critic's initially.
# NOTE(review): the teacher optimizers are built with actor_lr / critic_lr;
# `teacher_lr` from the hyperparameter cell is not used here -- confirm which
# learning rate is intended.
teacher_actor.set_weights(actor.get_weights())
teacher_actor_optim = tf.keras.optimizers.Adam(actor_lr)
teacher_critic.set_weights(critic.get_weights())
teacher_critic_optim = tf.keras.optimizers.Adam(critic_lr)

# Optimizer passed to ddpg_update for the temporary actor copy in the training loop
temp_optimizer = tf.keras.optimizers.Adam(actor_lr)

# Step 2: Create the two rollout buffers (capacity = rollout length) and fill
# d1 with a rollout of the initial actor; meta_pi is that rollout's
# accumulated reward.
d0, d1 = Buffer(d0_rollout), Buffer(d1_rollout)
data, meta_pi = generate(actor, d1_rollout)
d1.append(data)

# Step 3: Create the main replay buffer and seed it with d1's transitions
buffer = Buffer(buffer_capacity)
buffer.append(d1.raw)

In [10]:
# Step 4: main training loop
for it in range(total_iterations):
    # Step 5: Generate d0 by rolling out the teacher policy
    data, _ = generate(teacher_actor, d0_rollout)
    d0.append(data)

    # Step 6: Update a temporary copy of the actor on d0 (critic left untouched).
    # `temp` is a brand-new model every iteration, so it gets a brand-new
    # optimizer as well: reusing one Adam instance across clones would hand the
    # optimizer a fresh set of variables on every call, mixing its step counter
    # and per-variable state across unrelated models.
    temp = copy(actor)
    temp_optimizer = tf.keras.optimizers.Adam(actor_lr)
    ddpg_update(d0, temp, temp_optimizer, critic, critic_optimizer, update_critic=False)

    # Step 7: Roll out the temporary actor; the meta reward is the change in
    # accumulated reward relative to the current actor's last rollout.
    data, meta_pip = generate(temp, d1_rollout)
    d1.append(data)
    meta = meta_pip - meta_pi

    # Step 8: Update the teacher actor/critic on d0, weighted by the meta reward
    teacher_update(teacher_actor, teacher_actor_optim, teacher_critic, teacher_critic_optim, d0, meta)

    # Step 9: Add both rollouts to the main replay buffer
    buffer.append(d0.raw)
    buffer.append(d1.raw)

    # Step 10: Update the actor and critic on a random mini-batch, then
    # re-evaluate the actor with a fresh rollout (instead of reusing meta_pip).
    ddpg_update(buffer, actor, actor_optimizer, critic, critic_optimizer, random=True)
    data_temp, meta_pi = generate(actor, d1_rollout)
    history_rewards.append(meta_pi)

    if it % log_interval == 0:
        print("Episode {} reward: {}".format(it, meta_pi))

# Accumulated reward of the actor's evaluation rollout, one point per iteration
plt.plot(history_rewards)
plt.xlabel("Episode")
plt.ylabel("Avg. Epsiodic Reward")
plt.show()

Episode 0 reward: -870.717488606997
Episode 10 reward: -1013.8461791143451
Episode 20 reward: -1068.9809432472348
Episode 30 reward: -1501.687395972337
Episode 40 reward: -1407.4490556160506
Episode 50 reward: -1173.94552009445
Episode 60 reward: -1496.678985178475
Episode 70 reward: -1498.0454613479258
Episode 80 reward: -1627.790199192135
Episode 90 reward: -1673.228671873225
Episode 100 reward: -1737.574665962455
Episode 110 reward: -1504.3181772394787
Episode 120 reward: -1569.193186226533


KeyboardInterrupt: 