In [1]:
import gym
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Create the environment once; every later cell reads `env` and the
# sizes / limits derived from it below.
problem = "Pendulum-v0"
env = gym.make(problem)

# Dimensionality of observations and actions (first axis of each space).
num_states, num_actions = env.observation_space.shape[0], env.action_space.shape[0]

# Only the first component of the action limits is read.
upper_bound, lower_bound = env.action_space.high[0], env.action_space.low[0]

# Report what was found.
print("Size of State Space ->  {}".format(num_states))
print("Size of Action Space ->  {}".format(num_actions))
print("Max Value of Action ->  {}".format(upper_bound))
print("Min Value of Action ->  {}".format(lower_bound))

Size of State Space ->  3
Size of Action Space ->  1
Max Value of Action ->  2.0
Min Value of Action ->  -2.0


In [3]:
class Buffer:
    """Bounded first-in-first-out store of experience tuples.

    Entries live in the plain list ``self.buffer`` (oldest first). Each entry
    is indexed positionally by ``batches``: 0 = state, 1 = action,
    2 = reward, 3 = next state.
    """

    def __init__(self, capacity=100000):
        self.buffer = []
        # Initialised here; no method of this class modifies it.
        self.counter = 0
        # Upper limit on how many "experiences" survive an insert().
        self.capacity = capacity

    @property
    def size(self):
        """Number of entries currently held."""
        return len(self.buffer)

    def insert(self, d):
        """Append one (s, a, r, s') tuple and drop the oldest entries beyond capacity."""
        self.buffer.append(d)
        excess = self.size - self.capacity
        if excess <= 0:
            return
        # Rebind to a trimmed copy rather than deleting in place.
        self.buffer = self.buffer[excess:]

    def _column(self, index):
        """Stack field `index` of every stored entry into one tensor."""
        return tf.convert_to_tensor([entry[index] for entry in self.buffer])

    @property
    def batches(self):
        """Tuple of tensors (states, actions, rewards, next_states) over all entries."""
        return tuple(self._column(field) for field in range(4))

    def clear(self):
        """Forget every stored entry."""
        self.buffer = []

# NOTE: deliberately NOT wrapped in @tf.function. The function reads its data
# from a Python object (`d`), which a traced graph would freeze at trace time,
# and it is called with different models, which makes the optimizers create
# new slot variables on later traces (not allowed inside a tf.function).
def ddpg_update(d, actor, actor_optimizer, critic, critic_optimizer, update_critic=True,
                target_actor=None, target_critic=None):
    """Run one DDPG gradient step using every transition stored in `d`.

    Args:
        d: object exposing a `batches` property that yields
            (states, actions, rewards, next_states) tensors (see `Buffer`).
        actor: policy network, updated to maximise the critic's value.
        actor_optimizer: optimizer applied to `actor.trainable_variables`.
        critic: Q-network taking `[states, actions]`.
        critic_optimizer: optimizer applied to `critic.trainable_variables`.
        update_critic: when False, only the actor step is performed.
        target_actor: network used to pick next actions for the TD target.
            When omitted, a notebook-level `target_actor` is used if one is
            defined, otherwise `actor` itself.
        target_critic: network used to evaluate the TD target. Same fallback
            rule as `target_actor`, ending at `critic`.

    The discount factor is read from the notebook-level `gamma`.
    """
    # `batches` is a property on Buffer, so it is read, not called.
    state_batch, action_batch, reward_batch, next_state_batch = d.batches

    # The critic outputs shape (N, 1) in float32. Rewards arrive as a flat
    # vector (possibly float64); without this they would broadcast to (N, N).
    reward_batch = tf.reshape(tf.cast(reward_batch, tf.float32), (-1, 1))

    if target_actor is None:
        target_actor = globals().get("target_actor", actor)
    if target_critic is None:
        target_critic = globals().get("target_critic", critic)

    if update_critic:
        # The TD target is computed outside the tape so it is treated as a
        # constant even when the target networks fall back to the online ones.
        target_actions = target_actor(next_state_batch, training=True)
        y = reward_batch + gamma * target_critic([next_state_batch, target_actions], training=True)

        with tf.GradientTape() as tape:
            critic_value = critic([state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, critic.trainable_variables)
        critic_optimizer.apply_gradients(zip(critic_grad, critic.trainable_variables))

    with tf.GradientTape() as tape:
        actions = actor(state_batch, training=True)
        critic_value = critic([state_batch, actions], training=True)
        # Used `-value` as we want to maximize the value given
        # by the critic for our actions
        actor_loss = -tf.math.reduce_mean(critic_value)

    actor_grad = tape.gradient(actor_loss, actor.trainable_variables)
    actor_optimizer.apply_gradients(zip(actor_grad, actor.trainable_variables))

# Moves the target network's variables towards the model's variables.
# `tau` is the blend rate: 1.0 copies the model outright, a value much less
# than one makes the target follow slowly.
@tf.function
def target_update(target, model, tau=1.0):
    """Blend `model`'s variables into `target`, variable by variable.

    Each target variable becomes ``tau * model_var + (1 - tau) * target_var``.
    `Model.variables` is a read-only property, so the values are written with
    `assign` on the individual variables instead of rebinding the attribute.

    Args:
        target: network whose variables are overwritten.
        model: network whose variables are read; must have the same variable
            layout as `target`.
        tau: blend rate. The default 1.0 is a full copy.
    """
    for target_var, source_var in zip(target.variables, model.variables):
        target_var.assign(tau * source_var + (1.0 - tau) * target_var)

In [4]:
def teacher_update(teacher, teacher_optimizer, d, meta):
    """Take one policy-gradient step on `teacher`, weighted by `meta`.

    The loss is a score-function surrogate: ``-meta * sum(log p(a | s))``,
    where the log-likelihood is evaluated through the teacher network so that
    gradients reach its variables. (A loss built only from the stored rewards
    has no dependence on the teacher, and the log of non-positive rewards is
    NaN.)

    NOTE(review): assumes the teacher's behaviour policy is Gaussian around
    the network output with scale `std_dev` (notebook-level constant) —
    confirm. If the stored actions are exactly the teacher's own outputs the
    gradient is zero.
    NOTE(review): the sign assumes a positive `meta` means the actions in `d`
    should become more likely — confirm.

    Args:
        teacher: network mapping states to mean actions.
        teacher_optimizer: optimizer applied to `teacher.trainable_variables`.
        d: object exposing a `batches` property (see `Buffer`).
        meta: scalar weight for the update.
    """
    # `batches` is a property on Buffer, so it is read, not called.
    state_batch, action_batch, _, _ = d.batches

    action_batch = tf.cast(action_batch, tf.float32)
    meta = tf.cast(meta, tf.float32)

    with tf.GradientTape() as tape:
        mean_action = teacher(state_batch, training=True)
        # Gaussian log-density up to terms that do not depend on the teacher.
        log_prob = -0.5 * tf.math.square((action_batch - mean_action) / std_dev)
        # Minimising -meta * log p is gradient ascent on meta * log p.
        loss = -meta * tf.math.reduce_sum(log_prob)

    teacher_grad = tape.gradient(loss, teacher.trainable_variables)
    teacher_optimizer.apply_gradients(zip(teacher_grad, teacher.trainable_variables))
        

In [5]:
def get_actor():
    """Build the actor network and print its summary.

    Architecture: `num_states` inputs -> Dense(256, relu) -> Dense(256, relu)
    -> Dense(`num_actions`, tanh), with the output multiplied by
    `upper_bound`. `num_states`, `num_actions` and `upper_bound` are read from
    the notebook-level environment setup.

    Returns:
        An uncompiled `tf.keras.Model`.
    """
    # Initialize the last layer's weights between -3e-3 and 3e-3
    last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)
    inputs = layers.Input(shape=(num_states,))
    out = layers.Dense(256, activation="relu")(inputs)
    out = layers.Dense(256, activation="relu")(out)
    # One output per action dimension, matching the critic's action input.
    outputs = layers.Dense(num_actions, activation="tanh", kernel_initializer=last_init)(out)

    # tanh lies in [-1, 1]; scaling by the upper bound (2.0 for Pendulum)
    # maps it onto the action range, assuming the range is symmetric.
    outputs = outputs * upper_bound
    model = tf.keras.Model(inputs, outputs)
    model.summary()
    return model


def get_critic():
    """Build the critic (state-action value) network and print its summary.

    The state and the action each pass through their own dense layers, are
    concatenated, and are then reduced to a single unbounded value.
    `num_states` and `num_actions` are read from the notebook-level
    environment setup.

    Returns:
        An uncompiled `tf.keras.Model` taking `[state, action]`.
    """
    # State as input. The shape is a 1-tuple; `(num_states)` without the
    # trailing comma is just an int.
    state_input = layers.Input(shape=(num_states,))
    state_out = layers.Dense(16, activation="relu")(state_input)
    state_out = layers.Dense(32, activation="relu")(state_out)

    # Action as input
    action_input = layers.Input(shape=(num_actions,))
    action_out = layers.Dense(32, activation="relu")(action_input)

    # Both are passed through separate layers before concatenating
    concat = layers.Concatenate()([state_out, action_out])

    out = layers.Dense(256, activation="relu")(concat)
    out = layers.Dense(256, activation="relu")(out)
    outputs = layers.Dense(1)(out)

    # Outputs a single value for a given state-action pair
    model = tf.keras.Model([state_input, action_input], outputs)
    model.summary()

    return model

In [6]:
# in:  policy (model) and the current state
# out: the action to take next, wrapped in a one-element list
def next_action(policy, state):
    """Query `policy` for a single state and clip the result to the action limits."""
    # The model expects a leading batch axis.
    batched_state = tf.convert_to_tensor(np.expand_dims(state, axis=0))

    raw_action = tf.squeeze(policy(batched_state)).numpy()

    # Keep the action inside [lower_bound, upper_bound]
    bounded_action = np.clip(raw_action, lower_bound, upper_bound)
    return [np.squeeze(bounded_action)]

# sample up to n steps using policy
# give: policy, n
# return: [sars], accumulated reward
def generate(policy, n):
    """Roll `policy` out in the notebook-level `env` for at most `n` steps.

    The environment is reset first. The rollout stops early if the
    environment reports `done`.

    Returns:
        (transitions, total_reward), where `transitions` is a list of
        (state, action, reward, next_state) tuples in step order and
        `total_reward` is the sum of the step rewards.
    """
    # Zong-Rui moved env.reset() here
    prev_state = env.reset()
    d = []
    reward = 0
    for _ in range(n):
        action = next_action(policy, prev_state)
        s, r, done, _ = env.step(action)
        reward += r
        d.append((prev_state, action, r, s))
        prev_state = s
        if done:
            break
    # `.format`, not `,format`: the comma printed the literal "{}" and the
    # number as two separate arguments.
    print("Total reward for this episode: {}".format(reward))

    return d, reward

def copy(model, optimizer):
    """Return a new model with `model`'s architecture and weight values.

    `optimizer` is accepted but not used; only the cloned model is returned.
    """
    weights = model.get_weights()
    replica = tf.keras.models.clone_model(model)
    replica.set_weights(weights)
    return replica

# Disabled draft that also rebuilt the optimizer and returned both objects:
#     new_optimizer = tf.keras.optimizers.Adam(actor_lr)
#     grad_vars = model.trainable_weights
#     zero_grads = [tf.zeros_like(w) for w in grad_vars]
#     new_optimizer.apply_gradients(zip(zero_grads, grad_vars))
#     new_optimizer.set_weights(optimizer.get_weights())
#
#     return new_model, new_optimizer

In [7]:
# Discount factor for future rewards
gamma = 0.99

# Adam learning rates: teacher and actor share one value, the critic is 10x larger
actor_lr = teacher_lr = 0.0001
critic_lr = 0.001

# Number of passes of the outer training loop
total_iterations = 10000

# Step counts for the d0 / d1 / pi rollouts (d0 and d1 also size their buffers)
d0_rollout, d1_rollout, pi_rollout = 100, 200, 50

# Capacity of the main replay buffer
buffer_capacity = 50000

# Standard-deviation setting (presumably exploration noise — TODO confirm)
std_dev = 0.2

In [8]:
# Step 0: Define variables
# (Zong-Rui commented the next line out; restore it if it is needed again.)
# prev_state = env.reset()

# Step 1: Initialize the teacher policy
teacher = get_actor()

# Step 1: Initialize the actor and critic with their optimizers
actor = get_actor()
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)
critic = get_critic()
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)

# Making the weights equal initially
teacher.set_weights(actor.get_weights())
# The training loop refers to this optimizer as `teacher_optimizer`;
# `teacher_optim` is kept as an alias of the same object.
teacher_optimizer = tf.keras.optimizers.Adam(teacher_lr)
teacher_optim = teacher_optimizer

# Step 2:
print("Draw D1 from pi to estimate Reward")
print("*"*25)
d0, d1 = Buffer(d0_rollout), Buffer(d1_rollout)
d1.buffer, meta_pi = generate(actor, d1_rollout)
buffer = Buffer(buffer_capacity)
# insert() stores exactly one transition per call; passing the whole list
# would store the list itself as a single entry.
for transition in d1.buffer:
    buffer.insert(transition)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 3)]               0         
_________________________________________________________________
dense (Dense)                (None, 256)               1024      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
_________________________________________________________________
tf.math.multiply (TFOpLambda (None, 1)                 0         
Total params: 67,073
Trainable params: 67,073
Non-trainable params: 0
_________________________________________________________________
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shap

ValueError: Input 0 of layer dense_3 is incompatible with the layer: expected axis -1 of input shape to have value 3 but received input with shape (3, 1)

In [None]:
# Step 4:
# The plot below needs this list; it holds a moving average of the return of
# each iteration's d1 rollout.
avg_window = 40
episode_rewards = []
avg_reward_list = []

for it in range(total_iterations):
    # Step 5: Generate d0
    d0.buffer, _ = generate(teacher, d0_rollout)

    # Step 6: Update pi to temp
    # The clone gets its own fresh optimizer instead of sharing
    # actor_optimizer: an optimizer keeps per-variable state, and `temp` has
    # brand-new variables on every iteration.
    temp = copy(actor, actor_optimizer)
    temp_optimizer = tf.keras.optimizers.Adam(actor_lr)
    ddpg_update(d0, temp, temp_optimizer, critic, critic_optimizer)

    # Step 7:
    # NOTE(review): meta_pi is only computed once before the loop and never
    # refreshed — confirm that a fixed baseline is intended.
    d1.buffer, meta_pip = generate(temp, d1_rollout)
    meta = meta_pip - meta_pi

    # Step 8:
    # `teacher_optim` is the name under which the setup cell creates the
    # teacher's optimizer.
    teacher_update(teacher, teacher_optim, d0, meta)

    # Step 9:
    # insert() stores one transition per call, so add them one by one.
    for transition in d0.buffer + d1.buffer:
        buffer.insert(transition)

    # Step 10:
    ddpg_update(buffer, actor, actor_optimizer, critic, critic_optimizer)

    episode_rewards.append(meta_pip)
    avg_reward_list.append(np.mean(episode_rewards[-avg_window:]))

plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Epsiodic Reward")
plt.show()

In [None]:
# Scratch check: reset the environment, keep the initial observation, and
# take a single step with a small fixed action (the step result is the
# cell's displayed value).
test_state = env.reset()
env.step(np.array([0.01]))

In [None]:
# Scratch check: insert a new axis at position 1 of the saved observation and
# display the resulting shape.
# NOTE(review): a (3, 1) input is what the recorded ValueError in the setup
# cell complains about — presumably this cell was used to debug it.
tt = np.expand_dims(test_state, axis=1)
tt.shape