In [1]:
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow.keras import layers

physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)
tf.config.run_functions_eagerly(True)

In [2]:
problem = "Pendulum-v0"
env = gym.make(problem)

num_states = env.observation_space.shape[0]
print("Size of State Space ->  {}".format(num_states))
num_actions = env.action_space.shape[0]
print("Size of Action Space ->  {}".format(num_actions))

upper_bound = env.action_space.high[0]
lower_bound = env.action_space.low[0]

print("Max Value of Action ->  {}".format(upper_bound))
print("Min Value of Action ->  {}".format(lower_bound))

Size of State Space ->  3
Size of Action Space ->  1
Max Value of Action ->  2.0
Min Value of Action ->  -2.0


In [3]:
class Buffer:
    def __init__(self, capacity=100000):
        # Number of "experiences" to store at max
        self.capacity = capacity
        self.counter = 0
        self.batch_size = 64
        self.state_buffer = np.zeros((self.capacity, num_states))
        self.action_buffer = np.zeros((self.capacity, num_actions))
        self.reward_buffer = np.zeros((self.capacity, 1))
        self.next_state_buffer = np.zeros((self.capacity, num_states))

    def insert(self, obs_tuple):
        index = self.index
        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]

        self.counter += 1
    
    def append(self, d):
        for i in d:
            self.insert(i)
    
    def sample(self):
        idx = np.random.choice(self.size, self.batch_size)
        
        state_batch = tf.convert_to_tensor(self.state_buffer[idx])
        action_batch = tf.convert_to_tensor(self.action_buffer[idx])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[idx])
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[idx])
        
        return state_batch, action_batch, reward_batch, next_state_batch
    
    @property
    def raw(self):
        d = [(self.state_buffer[i], self.action_buffer[i], self.reward_buffer[i], self.next_state_buffer[i]) for i in range(self.size)]
        return d
            
    @property
    def size(self):
        if self.counter >= self.capacity:
            return self.capacity
        else:
            return self.counter
    
    @property
    def index(self):
        return self.counter % self.capacity
    
    @property
    def batches(self):
        state_batch = tf.convert_to_tensor(self.state_buffer)
        action_batch = tf.convert_to_tensor(self.action_buffer)
        reward_batch = tf.convert_to_tensor(self.reward_buffer)
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer)
        return state_batch, action_batch, reward_batch, next_state_batch

In [4]:
def get_actor():
    # Initialize weights between -3e-3 and 3-e3
    last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)
    inputs = layers.Input(shape=(num_states,))
    out = layers.Dense(256, activation="relu")(inputs)
    out = layers.Dense(256, activation="relu")(out)
    outputs = layers.Dense(1, activation="tanh", kernel_initializer=last_init)(out)

    # Our upper bound is 2.0 for Pendulum.
    outputs = outputs * upper_bound
    model = tf.keras.Model(inputs, outputs)
    return model

def get_critic():
    # State as input
    state_input = layers.Input(shape=(num_states))
    state_out = layers.Dense(16, activation="relu")(state_input)
    state_out = layers.Dense(32, activation="relu")(state_out)

    # Action as input
    action_input = layers.Input(shape=(num_actions))
    action_out = layers.Dense(32, activation="relu")(action_input)

    # Both are passed through seperate layer before concatenating
    concat = layers.Concatenate()([state_out, action_out])

    out = layers.Dense(256, activation="relu")(concat)
    out = layers.Dense(256, activation="relu")(out)
    outputs = layers.Dense(1)(out)

    # Outputs single value for give state-action
    model = tf.keras.Model([state_input, action_input], outputs)

    return model

In [5]:
@tf.function
def ddpg_update(d, actor, actor_optimizer, critic, critic_optimizer, update_critic=True):
    state_batch, action_batch, reward_batch, next_state_batch = d.sample()
    
    if update_critic:
        with tf.GradientTape() as tape:
            target_actions = actor(next_state_batch, training=True)
            y = reward_batch + gamma * critic([next_state_batch, target_actions], training=True)
            critic_value = critic([state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, critic.trainable_variables)
        critic_optimizer.apply_gradients(zip(critic_grad, critic.trainable_variables))

    with tf.GradientTape() as tape:
        actions = actor(state_batch, training=True)
        critic_value = critic([state_batch, actions], training=True)
        # Used `-value` as we want to maximize the value given
        # by the critic for our actions
        actor_loss = -tf.math.reduce_mean(critic_value)

    actor_grad = tape.gradient(actor_loss, actor.trainable_variables)
    actor_optimizer.apply_gradients(zip(actor_grad, actor.trainable_variables))

# This update target parameters slowly
# Based on rate `tau`, which is much less than one.
@tf.function
def target_update(target, model):
    target.variables = model.variables

In [6]:
@tf.function
def teacher_update(actor, actor_optim, critic, critic_optim, d, meta):
    state_batch, action_batch, reward_batch, _ = d.sample()
    
    with tf.GradientTape() as tape:
        reward = critic([state_batch, action_batch], training=True)
        critic_loss = tf.math.reduce_mean(tf.math.square(reward - reward_batch))
    
    critic_grad = tape.gradient(critic_loss, critic.trainable_variables)
    critic_optim.apply_gradients(zip(critic_grad, critic.trainable_variables))
    
    with tf.GradientTape() as tape:
        target_actions = actor(state_batch, training=True)
        reward = critic([state_batch, target_actions], training=False)
        print('reward', reward)
        log_reward = tf.math.log(reward + 0.00001)
        print('log_reward', log_reward)
        actor_loss = tf.math.multiply(tf.cast(meta, dtype="float32"), tf.reduce_sum(log_reward))
        
    actor_grad = tape.gradient(actor_loss, actor.trainable_variables)
    actor_optim.apply_gradients(zip(actor_grad, actor.trainable_variables))
    
    return actor_loss.numpy()

In [7]:
# give: policy(model) and current state
# return: next action
def next_action(policy, state):
    state = np.expand_dims(state, axis=0)
    
    sampled_actions = tf.squeeze(policy(state))
    sampled_actions = sampled_actions.numpy()

    # We make sure action is within bounds
    legal_action = np.clip(sampled_actions, lower_bound, upper_bound)

    return [np.squeeze(legal_action)]

# sample n steps using policy
# give: policy, n
# return: [sars], accumulated reward
def generate(policy, n):
    prev_state = env.reset()
    d = []
    total_reward = 0
    for i in range(n):
        action = next_action(policy, prev_state)
        state, reward, done, _ = env.step(action)
        total_reward += reward
        d.append((prev_state, action, reward, state))
        prev_state = state
        if done:
            break
            
    return d, total_reward

def copy(model):
    new_model = tf.keras.models.clone_model(model)
    new_model.set_weights(model.get_weights())
    
    return new_model

In [8]:
std_dev = 0.2

# Learning rate for actor-critic models
teacher_lr = 0.0001
actor_lr = 0.0001
critic_lr = 0.001

total_iterations = 100
# Discount factor for future rewards
gamma = 0.99
# Used to update target networks
d0_rollout = 100
d1_rollout = 200
pi_rollout = 50
buffer_capacity = 50000
log_interval = 50

# To store reward history of each episode
it_reward_list = []
# To store average reward history of last few episodes
avg_reward_list = []
meta_reward_list = []
teacher_loss_list = []

In [9]:
# Step 1: Initialize pi
teacher_actor = get_actor()
teacher_critic = get_critic()

# Step 1: Initialize pie
actor = get_actor()
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)
critic = get_critic()
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)

# Making the weights equal initially
teacher_actor.set_weights(actor.get_weights())
teacher_actor_optim = tf.keras.optimizers.Adam(actor_lr)
teacher_critic.set_weights(critic.get_weights())
teacher_critic_optim = tf.keras.optimizers.Adam(critic_lr)

# Step 2:
d0, d1 = Buffer(d0_rollout), Buffer(d1_rollout)
data, meta_pi = generate(actor, d1_rollout)
d1.append(data)

# Step 3:
buffer = Buffer(buffer_capacity)
buffer.append(d1.raw)

In [10]:
# Step 4:
for it in range(total_iterations):
    print('Iteration', it)
    # Step 5: Generate d0
    data, _ = generate(teacher_actor, d0_rollout)
    d0.append(data)
    
    # Step 6: Update pi to temp
    # ********* ddpg_update的optimizer用 actor_optimizer (不知道會不會出事)
    temp = copy(actor)
    ddpg_update(d0, temp, actor_optimizer, critic, critic_optimizer, update_critic=False)
    
    # Step 7: 
    data, meta_pip = generate(temp, d1_rollout)
    d1.append(data)
    meta = meta_pip - meta_pi
    print('Meta Reward:', meta)
    
    # Step 8:
    loss = teacher_update(teacher_actor, teacher_actor_optim, teacher_critic, teacher_critic_optim, d0, meta)
    print('Teacher Loss:', loss)
    teacher_loss_list.append(loss)

    # Step 9:
    buffer.append(d0.raw)
    buffer.append(d1.raw)
    
    # Step 10:
    ddpg_update(buffer, actor, actor_optimizer, critic, critic_optimizer)
    data_temp, meta_pi = generate(actor, d1_rollout)
    
    meta_reward_list.append(meta)
    it_reward_list.append(meta_pi)
    avg_reward = np.mean(it_reward_list[-40:])
    print("Avg Reward:", avg_reward)
    avg_reward_list.append(avg_reward)
    
plt.plot(avg_reward_list)
plt.xlabel("Iteration")
plt.ylabel("Avg. Iteration Reward")
plt.show()

plt.plot(meta_reward_list)
plt.xlabel("Iteration")
plt.ylabel("Meta Iteration Reward")
plt.show()

plt.plot(teacher_loss_list)
plt.xlabel("Loss")
plt.ylabel("Teacher Loss")
plt.show()

Iteration 0
Meta Reward: 460.6305112851918
reward tf.Tensor(
[[-0.07835543]
 [-0.061279  ]
 [-0.8340434 ]
 [-0.1008544 ]
 [-0.04810326]
 [-0.07242012]
 [-0.97741425]
 [-0.93841726]
 [-0.24525468]
 [-0.9975993 ]
 [-0.26687747]
 [-0.15630959]
 [-0.21214534]
 [-0.07173602]
 [-0.15630959]
 [-0.20921685]
 [-0.5729431 ]
 [-0.92038316]
 [-1.0195044 ]
 [-0.16722278]
 [-0.97741425]
 [-0.12742653]
 [-0.72332776]
 [-0.769971  ]
 [-0.9975993 ]
 [-0.92038316]
 [-0.72332776]
 [-0.19556242]
 [-0.111775  ]
 [-0.30444282]
 [-0.21214534]
 [-0.8340434 ]
 [-0.29177332]
 [-0.25399753]
 [-0.09998129]
 [-0.17692442]
 [-0.2677326 ]
 [-0.3045719 ]
 [-0.3045719 ]
 [-0.9975993 ]
 [-0.477905  ]
 [-0.12742653]
 [-0.8554919 ]
 [-0.92038316]
 [-0.93841726]
 [-0.81954646]
 [-0.81954646]
 [-0.21019422]
 [-1.0195044 ]
 [-0.13402422]
 [-0.81954646]
 [-0.3045719 ]
 [-0.76137936]
 [-0.14417212]
 [-0.30484262]
 [-0.8340434 ]
 [-0.30484262]
 [-0.21019422]
 [-0.9900609 ]
 [-0.8554919 ]
 [-0.15368778]
 [-0.8554919 ]
 [-0.2816

Avg Reward: -983.3647782437844
Iteration 6
Meta Reward: -163.65130392489732
reward tf.Tensor(
[[-2.7356384 ]
 [-3.3081372 ]
 [-2.238969  ]
 [-3.3617895 ]
 [-2.4929106 ]
 [-0.95581794]
 [-2.7356384 ]
 [-2.2652562 ]
 [-2.1153302 ]
 [-2.061025  ]
 [-2.185384  ]
 [-2.4253597 ]
 [-0.41974363]
 [-2.332492  ]
 [-3.342376  ]
 [-1.8895415 ]
 [-0.6484948 ]
 [-2.2143557 ]
 [-1.4794048 ]
 [-2.2103732 ]
 [-3.4706943 ]
 [-3.0697043 ]
 [-1.8733926 ]
 [-2.7356384 ]
 [-0.5956389 ]
 [-0.9062383 ]
 [-2.2652562 ]
 [-3.0697043 ]
 [-3.4422162 ]
 [-0.6305076 ]
 [-1.6679866 ]
 [-1.8895415 ]
 [-2.8175993 ]
 [-2.4929106 ]
 [-1.3950967 ]
 [-0.60321623]
 [-0.5899619 ]
 [-1.4900407 ]
 [-2.8817797 ]
 [-1.9692436 ]
 [-1.9798492 ]
 [-1.4218897 ]
 [-1.0401504 ]
 [-1.1968865 ]
 [-2.061025  ]
 [-2.2143557 ]
 [-0.6484948 ]
 [-1.5012944 ]
 [-0.7063565 ]
 [-2.2652562 ]
 [-2.4249856 ]
 [-0.5899619 ]
 [-2.1378946 ]
 [-2.4249856 ]
 [-3.1342366 ]
 [-2.4249856 ]
 [-1.6394047 ]
 [-2.1043737 ]
 [-1.1444567 ]
 [-1.9848548 ]
 [-0.6

Avg Reward: -1161.5047834857285
Iteration 12
Meta Reward: 211.34793431246385
reward tf.Tensor(
[[-9.77233  ]
 [-1.513365 ]
 [-5.083912 ]
 [-9.20497  ]
 [-5.846279 ]
 [-3.8491254]
 [-1.2743717]
 [-1.4431977]
 [-3.773774 ]
 [-1.5481465]
 [-7.8233614]
 [-9.77233  ]
 [-2.1798074]
 [-9.823645 ]
 [-9.521228 ]
 [-6.07767  ]
 [-6.7480597]
 [-5.5281734]
 [-5.981125 ]
 [-6.08379  ]
 [-2.9470441]
 [-2.8121605]
 [-4.4294715]
 [-8.906838 ]
 [-4.9170446]
 [-5.083912 ]
 [-6.5841265]
 [-2.0493639]
 [-3.773774 ]
 [-4.4294715]
 [-7.215905 ]
 [-3.3491886]
 [-3.4265864]
 [-9.018871 ]
 [-8.906838 ]
 [-1.7234373]
 [-5.24605  ]
 [-2.63984  ]
 [-9.018871 ]
 [-6.7414827]
 [-7.000994 ]
 [-5.846279 ]
 [-9.764033 ]
 [-8.04685  ]
 [-4.6088223]
 [-7.4147296]
 [-9.823645 ]
 [-3.2782824]
 [-9.784657 ]
 [-1.0951868]
 [-9.20497  ]
 [-6.662958 ]
 [-9.917292 ]
 [-9.917292 ]
 [-1.471551 ]
 [-5.3120265]
 [-2.537707 ]
 [-8.219901 ]
 [-8.906838 ]
 [-6.4214883]
 [-5.3742666]
 [-8.04685  ]
 [-5.846279 ]
 [-9.203794 ]], shape=(

KeyboardInterrupt: 