In [1]:
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow.keras import layers

physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)
tf.config.run_functions_eagerly(True)

In [2]:
problem = "Pendulum-v0"
env = gym.make(problem)

num_states = env.observation_space.shape[0]
print("Size of State Space ->  {}".format(num_states))
num_actions = env.action_space.shape[0]
print("Size of Action Space ->  {}".format(num_actions))

upper_bound = env.action_space.high[0]
lower_bound = env.action_space.low[0]

print("Max Value of Action ->  {}".format(upper_bound))
print("Min Value of Action ->  {}".format(lower_bound))

Size of State Space ->  3
Size of Action Space ->  1
Max Value of Action ->  2.0
Min Value of Action ->  -2.0


In [3]:
class Buffer:
    def __init__(self, capacity=100000):
        # Number of "experiences" to store at max
        self.capacity = capacity
        self.counter = 0
        self.batch_size = 64
        self.state_buffer = np.zeros((self.capacity, num_states))
        self.action_buffer = np.zeros((self.capacity, num_actions))
        self.reward_buffer = np.zeros((self.capacity, 1))
        self.next_state_buffer = np.zeros((self.capacity, num_states))

    def insert(self, obs_tuple):
        index = self.index
        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]

        self.counter += 1
    
    def append(self, d):
        for i in d:
            self.insert(i)
    
    def sample(self):
        idx = np.random.choice(self.size, self.batch_size)
        
        state_batch = tf.convert_to_tensor(self.state_buffer[idx])
        action_batch = tf.convert_to_tensor(self.action_buffer[idx])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[idx])
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[idx])
        
        return state_batch, action_batch, reward_batch, next_state_batch
    
    @property
    def raw(self):
        d = [(self.state_buffer[i], self.action_buffer[i], self.reward_buffer[i], self.next_state_buffer[i]) for i in range(self.size)]
        return d
            
    @property
    def size(self):
        if self.counter >= self.capacity:
            return self.capacity
        else:
            return self.counter
    
    @property
    def index(self):
        return self.counter % self.capacity
    
    @property
    def batches(self):
        state_batch = tf.convert_to_tensor(self.state_buffer)
        action_batch = tf.convert_to_tensor(self.action_buffer)
        reward_batch = tf.convert_to_tensor(self.reward_buffer)
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer)
        return state_batch, action_batch, reward_batch, next_state_batch

In [4]:
def get_actor():
    # Initialize weights between -3e-3 and 3-e3
    last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)
    inputs = layers.Input(shape=(num_states,))
    out = layers.Dense(256, activation="relu")(inputs)
    out = layers.Dense(256, activation="relu")(out)
    outputs = layers.Dense(1, activation="tanh", kernel_initializer=last_init)(out)

    # Our upper bound is 2.0 for Pendulum.
    outputs = outputs * upper_bound
    model = tf.keras.Model(inputs, outputs)
    return model

def get_critic():
    # State as input
    state_input = layers.Input(shape=(num_states))
    state_out = layers.Dense(16, activation="relu")(state_input)
    state_out = layers.Dense(32, activation="relu")(state_out)

    # Action as input
    action_input = layers.Input(shape=(num_actions))
    action_out = layers.Dense(32, activation="relu")(action_input)

    # Both are passed through seperate layer before concatenating
    concat = layers.Concatenate()([state_out, action_out])

    out = layers.Dense(256, activation="relu")(concat)
    out = layers.Dense(256, activation="relu")(out)
    outputs = layers.Dense(1)(out)

    # Outputs single value for give state-action
    model = tf.keras.Model([state_input, action_input], outputs)

    return model

In [5]:
@tf.function
def ddpg_update(d, actor, actor_optimizer, critic, critic_optimizer, update_critic=True):
    state_batch, action_batch, reward_batch, next_state_batch = d.sample()
    
    if update_critic:
        with tf.GradientTape() as tape:
            target_actions = actor(next_state_batch, training=True)
            y = reward_batch + gamma * critic([next_state_batch, target_actions], training=True)
            critic_value = critic([state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, critic.trainable_variables)
        critic_optimizer.apply_gradients(zip(critic_grad, critic.trainable_variables))

    with tf.GradientTape() as tape:
        actions = actor(state_batch, training=True)
        critic_value = critic([state_batch, actions], training=True)
        # Used `-value` as we want to maximize the value given
        # by the critic for our actions
        actor_loss = -tf.math.reduce_mean(critic_value)

    actor_grad = tape.gradient(actor_loss, actor.trainable_variables)
    actor_optimizer.apply_gradients(zip(actor_grad, actor.trainable_variables))

# This update target parameters slowly
# Based on rate `tau`, which is much less than one.
@tf.function
def target_update(target, model):
    target.variables = model.variables

In [6]:
@tf.function
def teacher_update(actor, actor_optim, critic, critic_optim, d, meta):
    state_batch, action_batch, reward_batch, _ = d.sample()
    
    with tf.GradientTape() as tape:
        reward = critic([state_batch, action_batch], training=True)
        critic_loss = tf.math.reduce_mean(tf.math.square(reward - reward_batch))
    
    critic_grad = tape.gradient(critic_loss, critic.trainable_variables)
    critic_optim.apply_gradients(zip(critic_grad, critic.trainable_variables))
    
    with tf.GradientTape() as tape:
        target_actions = actor(state_batch, training=True)
        reward = critic([state_batch, target_actions], training=False)
        print('reward', reward)
        log_reward = tf.math.log(reward + 0.00001)
        print('log_reward', log_reward)
        actor_loss = tf.math.multiply(tf.cast(meta, dtype="float32"), tf.reduce_sum(log_reward))
        
    actor_grad = tape.gradient(actor_loss, actor.trainable_variables)
    actor_optim.apply_gradients(zip(actor_grad, actor.trainable_variables))
    
    return actor_loss.numpy()

In [7]:
# give: policy(model) and current state
# return: next action
def next_action(policy, state):
    state = np.expand_dims(state, axis=0)
    
    sampled_actions = tf.squeeze(policy(state))
    sampled_actions = sampled_actions.numpy()

    # We make sure action is within bounds
    legal_action = np.clip(sampled_actions, lower_bound, upper_bound)

    return [np.squeeze(legal_action)]

# sample n steps using policy
# give: policy, n
# return: [sars], accumulated reward
def generate(policy, n):
    prev_state = env.reset()
    d = []
    total_reward = 0
    for i in range(n):
        action = next_action(policy, prev_state)
        state, reward, done, _ = env.step(action)
        total_reward += reward
        d.append((prev_state, action, reward, state))
        prev_state = state
        if done:
            break
            
    return d, total_reward

def copy(model):
    new_model = tf.keras.models.clone_model(model)
    new_model.set_weights(model.get_weights())
    
    return new_model

In [8]:
std_dev = 0.2

# Learning rate for actor-critic models
teacher_lr = 0.0001
actor_lr = 0.0001
critic_lr = 0.001

total_iterations = 100
# Discount factor for future rewards
gamma = 0.99
# Used to update target networks
d0_rollout = 100
d1_rollout = 200
pi_rollout = 50
buffer_capacity = 50000
log_interval = 50

# To store reward history of each episode
it_reward_list = []
# To store average reward history of last few episodes
avg_reward_list = []
meta_reward_list = []
teacher_loss_list = []

In [9]:
# Step 1: Initialize pi
teacher_actor = get_actor()
teacher_critic = get_critic()

# Step 1: Initialize pie
actor = get_actor()
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)
critic = get_critic()
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)

# Making the weights equal initially
teacher_actor.set_weights(actor.get_weights())
teacher_actor_optim = tf.keras.optimizers.Adam(actor_lr)
teacher_critic.set_weights(critic.get_weights())
teacher_critic_optim = tf.keras.optimizers.Adam(critic_lr)

# Step 2:
d0, d1 = Buffer(d0_rollout), Buffer(d1_rollout)
data, meta_pi = generate(actor, d1_rollout)
d1.append(data)

# Step 3:
buffer = Buffer(buffer_capacity)
buffer.append(d1.raw)

In [None]:
# Step 4:
for it in range(total_iterations):
    print('Iteration', it)
    # Step 5: Generate d0
    data, _ = generate(teacher_actor, d0_rollout)
    d0.append(data)
    
    # Step 6: Update pi to temp
    # ********* ddpg_update的optimizer用 actor_optimizer (不知道會不會出事)
    temp = copy(actor)
    ddpg_update(d0, temp, actor_optimizer, critic, critic_optimizer, update_critic=False)
    
    # Step 7: 
    data, meta_pip = generate(temp, d1_rollout)
    d1.append(data)
    meta = meta_pip - meta_pi
    print('Meta Reward:', meta)
    
    # Step 8:
    loss = teacher_update(teacher_actor, teacher_actor_optim, teacher_critic, teacher_critic_optim, d0, meta)
    print('Teacher Loss:', loss)
    teacher_loss_list.append(loss)

    # Step 9:
    buffer.append(d0.raw)
    buffer.append(d1.raw)
    
    # Step 10:
    ddpg_update(buffer, actor, actor_optimizer, critic, critic_optimizer)
    data_temp, meta_pi = generate(actor, d1_rollout)
    
    meta_reward_list.append(meta)
    it_reward_list.append(meta_pi)
    avg_reward = np.mean(it_reward_list[-40:])
    print("Avg Reward:", avg_reward)
    avg_reward_list.append(avg_reward)
    
plt.plot(avg_reward_list)
plt.xlabel("Iteration")
plt.ylabel("Avg. Iteration Reward")
plt.show()

plt.plot(meta_reward_list)
plt.xlabel("Iteration")
plt.ylabel("Meta Iteration Reward")
plt.show()

plt.plot(teacher_loss_list)
plt.xlabel("Loss")
plt.ylabel("Teacher Loss")
plt.show()

Iteration 0
Meta Reward: 76.00502663142356
reward tf.Tensor(
[[-0.2075641 ]
 [-0.11310516]
 [-0.20884348]
 [-0.20286639]
 [-0.18154885]
 [-0.12940751]
 [-0.24385558]
 [-0.21137585]
 [-0.16603556]
 [-0.31080964]
 [-0.31080964]
 [-0.18062712]
 [-0.09631675]
 [-0.30195868]
 [-0.11808723]
 [-0.2896113 ]
 [-0.22084314]
 [-0.11045219]
 [-0.21036066]
 [-0.31420198]
 [-0.12481657]
 [-0.17174566]
 [-0.19001698]
 [-0.1918077 ]
 [-0.3042411 ]
 [-0.28510246]
 [-0.20884348]
 [-0.2090912 ]
 [-0.22084314]
 [-0.11955473]
 [-0.30195868]
 [-0.15749347]
 [-0.29673788]
 [-0.14488253]
 [-0.1474989 ]
 [-0.20884348]
 [-0.24385558]
 [-0.30597997]
 [-0.1918077 ]
 [-0.3049949 ]
 [-0.3049949 ]
 [-0.23121808]
 [-0.14444424]
 [-0.15884933]
 [-0.18004446]
 [-0.12940751]
 [-0.13937041]
 [-0.23121808]
 [-0.11955473]
 [-0.29842427]
 [-0.22084314]
 [-0.18499522]
 [-0.11808723]
 [-0.20295788]
 [-0.27096602]
 [-0.1393637 ]
 [-0.11360949]
 [-0.29842427]
 [-0.31420198]
 [-0.17174566]
 [-0.20484845]
 [-0.13937041]
 [-0.2805

Avg Reward: -1030.543143635523
Iteration 6
Meta Reward: -764.0149343035616
reward tf.Tensor(
[[-0.9015813 ]
 [-2.9872649 ]
 [-1.1763985 ]
 [-2.0181777 ]
 [-3.37118   ]
 [-3.655759  ]
 [-2.1199176 ]
 [-2.07351   ]
 [-4.266158  ]
 [-2.0181777 ]
 [-4.344458  ]
 [-1.4485459 ]
 [-3.37118   ]
 [-2.9602106 ]
 [-3.9919453 ]
 [-3.6492403 ]
 [-2.3864415 ]
 [-3.37118   ]
 [-0.8273794 ]
 [-1.2437906 ]
 [-3.37118   ]
 [-1.5641947 ]
 [-0.5569063 ]
 [-2.9872649 ]
 [-0.8273794 ]
 [-2.041559  ]
 [-3.0486414 ]
 [-4.0620613 ]
 [-2.497628  ]
 [-2.874389  ]
 [-0.5801077 ]
 [-3.6492403 ]
 [-1.1221223 ]
 [-3.018285  ]
 [-2.927667  ]
 [-2.8608367 ]
 [-2.9556344 ]
 [-1.1630114 ]
 [-3.018285  ]
 [-2.774782  ]
 [-2.07351   ]
 [-2.1969802 ]
 [-3.8551905 ]
 [-2.5247447 ]
 [-0.8273794 ]
 [-0.88723576]
 [-0.98547447]
 [-2.0666342 ]
 [-3.6492403 ]
 [-1.4723651 ]
 [-3.2386544 ]
 [-3.3804185 ]
 [-1.1763985 ]
 [-3.762818  ]
 [-3.4656985 ]
 [-3.9986243 ]
 [-1.5641947 ]
 [-0.6986378 ]
 [-1.032074  ]
 [-2.774782  ]
 [-0.58

Avg Reward: -1189.390129018585
Iteration 12
Meta Reward: -120.72431032787767
reward tf.Tensor(
[[ -2.1500673]
 [ -2.5570986]
 [ -1.6480665]
 [ -7.0740957]
 [ -2.2947137]
 [ -2.912331 ]
 [ -5.681974 ]
 [ -7.095216 ]
 [ -5.681974 ]
 [ -4.5651493]
 [ -1.6988647]
 [ -1.5472713]
 [ -2.7816575]
 [ -1.6016569]
 [ -3.2959216]
 [ -1.8602016]
 [ -5.0504365]
 [ -9.257287 ]
 [ -2.5933692]
 [ -1.6016569]
 [-11.6662855]
 [ -7.637065 ]
 [ -1.979844 ]
 [ -1.704043 ]
 [ -5.9325457]
 [ -1.396209 ]
 [-12.015427 ]
 [ -4.9632444]
 [ -1.405391 ]
 [ -2.912331 ]
 [ -4.9632444]
 [-12.015427 ]
 [ -2.7919424]
 [ -5.681974 ]
 [ -3.0289524]
 [ -8.183685 ]
 [ -1.6988647]
 [ -1.1869029]
 [-11.130953 ]
 [ -7.8756332]
 [ -1.4604696]
 [ -1.1869029]
 [ -3.5750206]
 [ -2.4741585]
 [ -2.2947137]
 [ -1.396209 ]
 [-11.833657 ]
 [ -4.112149 ]
 [ -7.637065 ]
 [ -2.4741585]
 [-11.278515 ]
 [ -2.1840851]
 [ -1.396209 ]
 [ -2.2947137]
 [ -2.4741585]
 [ -4.9632444]
 [-10.262592 ]
 [ -4.9632444]
 [ -9.312933 ]
 [ -9.257287 ]
 [ -7

Avg Reward: -1175.8359304015366
Iteration 18
Meta Reward: 398.7058413611345
reward tf.Tensor(
[[-3.2572336]
 [-3.6925225]
 [-3.5439901]
 [-3.6970963]
 [-3.8435516]
 [-3.8598123]
 [-4.343869 ]
 [-4.361848 ]
 [-4.4466715]
 [-3.5555882]
 [-3.5471783]
 [-4.7840104]
 [-3.1022875]
 [-3.643489 ]
 [-4.6004677]
 [-4.3177934]
 [-3.9221447]
 [-4.3341985]
 [-4.4149055]
 [-4.0879693]
 [-3.4126263]
 [-3.0051093]
 [-4.5093284]
 [-3.930304 ]
 [-3.669024 ]
 [-3.1166265]
 [-4.3177934]
 [-4.2296815]
 [-3.3505702]
 [-4.0840974]
 [-3.9221447]
 [-4.1882105]
 [-3.1022875]
 [-3.4533005]
 [-3.1683831]
 [-4.1975427]
 [-4.378136 ]
 [-4.831724 ]
 [-4.489741 ]
 [-4.5170693]
 [-4.4988904]
 [-4.2702284]
 [-3.983463 ]
 [-3.3035583]
 [-4.3341985]
 [-3.4809556]
 [-3.8650374]
 [-4.8486505]
 [-4.666879 ]
 [-3.0026433]
 [-3.4126263]
 [-4.4466715]
 [-4.5170693]
 [-4.0482244]
 [-4.443017 ]
 [-4.3554044]
 [-4.4810863]
 [-3.5799556]
 [-3.8870635]
 [-3.8598123]
 [-4.5170693]
 [-4.4607496]
 [-4.4429955]
 [-4.4149055]], shape=(6

Avg Reward: -1220.7584851955817
Iteration 24
Meta Reward: 189.9486666983853
reward tf.Tensor(
[[ -3.435881 ]
 [ -2.5782096]
 [-15.112237 ]
 [ -4.682444 ]
 [ -2.5782096]
 [ -8.048191 ]
 [ -4.1649776]
 [ -2.100511 ]
 [-13.527828 ]
 [ -5.371636 ]
 [ -9.389172 ]
 [ -3.5977335]
 [-10.101246 ]
 [ -4.183123 ]
 [-15.3289175]
 [-11.644677 ]
 [ -8.338265 ]
 [ -5.3053365]
 [ -8.54025  ]
 [ -2.5319016]
 [ -2.4429047]
 [ -9.614558 ]
 [-12.734513 ]
 [-13.474194 ]
 [-11.644677 ]
 [ -2.609691 ]
 [ -4.1161437]
 [ -5.670913 ]
 [ -6.909388 ]
 [ -8.221914 ]
 [ -3.7334325]
 [-17.12379  ]
 [ -4.682444 ]
 [-15.463755 ]
 [-12.487363 ]
 [ -2.100511 ]
 [ -3.435881 ]
 [-10.990942 ]
 [ -7.3089423]
 [ -5.371636 ]
 [ -2.2298157]
 [ -3.094442 ]
 [ -2.5538127]
 [-16.9489   ]
 [ -8.338265 ]
 [ -2.2298157]
 [ -3.792505 ]
 [-16.928675 ]
 [-16.9489   ]
 [-13.16729  ]
 [-14.892171 ]
 [-12.487363 ]
 [ -4.8881903]
 [-19.649183 ]
 [-17.790874 ]
 [ -4.682444 ]
 [ -9.614558 ]
 [-16.928675 ]
 [ -2.100511 ]
 [-18.691109 ]
 [-11.

Avg Reward: -1299.4726956225952
Iteration 30
Meta Reward: 320.7599500783831
reward tf.Tensor(
[[-10.66224  ]
 [-12.972089 ]
 [ -5.166164 ]
 [ -2.1306105]
 [ -1.979524 ]
 [-14.599183 ]
 [ -4.974274 ]
 [ -4.4662223]
 [ -9.355294 ]
 [ -6.558875 ]
 [ -1.6719166]
 [ -4.0044694]
 [ -1.5173577]
 [ -1.979524 ]
 [-14.599183 ]
 [-14.96679  ]
 [ -3.1623938]
 [ -5.894989 ]
 [ -5.7623525]
 [-14.98239  ]
 [ -2.9269826]
 [ -8.106616 ]
 [ -4.042977 ]
 [ -3.2860765]
 [ -3.2860765]
 [ -2.7791967]
 [ -7.131516 ]
 [ -1.6600894]
 [ -8.106616 ]
 [ -9.875187 ]
 [ -2.0936852]
 [ -1.9929808]
 [ -3.500944 ]
 [ -8.408305 ]
 [ -2.8887765]
 [-13.666183 ]
 [ -3.541859 ]
 [-12.950157 ]
 [-13.909109 ]
 [ -9.875187 ]
 [-14.98239  ]
 [-13.666183 ]
 [-10.66224  ]
 [ -2.826461 ]
 [-11.717641 ]
 [ -2.2084355]
 [ -3.4471586]
 [ -3.541859 ]
 [-13.617732 ]
 [ -6.9449725]
 [ -2.8539488]
 [ -2.826461 ]
 [ -3.8589432]
 [ -1.6600894]
 [ -2.9269826]
 [ -9.196153 ]
 [ -1.6600894]
 [-14.199085 ]
 [ -7.131516 ]
 [ -3.4248521]
 [ -1.