In [1]:
import numpy as np
np.random.seed(0)
import gym
env = gym.make('Acrobot-v1')
env.seed(0)

print('观测空间 = {}'.format(env.observation_space))
print('动作空间 = {}'.format(env.action_space))

观测空间 = Box(-28.274333953857422, 28.274333953857422, (6,), float32)
动作空间 = Discrete(3)


In [2]:
import tensorflow as tf
tf.random.set_seed(0)
from tensorflow import keras

class AdvantageActorCriticAgent():
    def __init__(self, env, actor_kwargs, critic_kwargs, gamma=0.99):
        self.action_n = env.action_space.n
        self.gamma = gamma
        self.discount = 1.

        self.actor_net = self.build_network(output_size=self.action_n,output_activation=tf.nn.softmax,loss=tf.losses.categorical_crossentropy,**actor_kwargs)
        self.critic_net = self.build_network(output_size=1,**critic_kwargs)

    def build_network(self, hidden_sizes, output_size, input_size=None,activation=tf.nn.relu, output_activation=None,loss=tf.losses.mse, learning_rate=0.01):
        model = keras.Sequential()
        for idx, hidden_size in enumerate(hidden_sizes):
            kwargs = {}
            if idx == 0 and input_size is not None:
                kwargs['input_shape'] = (input_size,)
            model.add(keras.layers.Dense(units=hidden_size,activation=activation, **kwargs))
        model.add(keras.layers.Dense(units=output_size,activation=output_activation))
        optimizer = tf.optimizers.Adam(learning_rate)
        model.compile(optimizer=optimizer, loss=loss)
        return model
        
    def decide(self, observation):
        probs = self.actor_net.predict(observation[np.newaxis])[0]
        action = np.random.choice(self.action_n, p=probs)
        return action

    def learn(self, observation, action, reward, next_observation, done):
        x = observation[np.newaxis]
        u = reward + (1. - done) * self.gamma * self.critic_net.predict(next_observation[np.newaxis])
        td_error = u - self.critic_net.predict(x)

        # 训练执行者网络
        x_tensor = tf.convert_to_tensor(observation[np.newaxis],dtype=tf.float32)
        with tf.GradientTape() as tape:
            pi_tensor = self.actor_net(x_tensor)[0, action]
            logpi_tensor = tf.math.log(tf.clip_by_value(pi_tensor,1e-6, 1.))
            loss_tensor = -self.discount * td_error * logpi_tensor
        grad_tensors = tape.gradient(loss_tensor, self.actor_net.variables)
        self.actor_net.optimizer.apply_gradients(zip(grad_tensors, self.actor_net.variables)) # 更新执行者网络

        # 训练评论者网络
        self.critic_net.fit(x, u, verbose=0) # 更新评论者网络

        if done:
            self.discount = 1. # 为下一回合初始化累积折扣
        else:
            self.discount *= self.gamma # 进一步累积折扣

In [3]:
def play_qlearning(env, agent, train=False, render=False):
    print('play_qlearning')
    episode_reward = 0
    observation = env.reset()
    step = 0
    while True:
        if render:
            env.render()
        action = agent.decide(observation)
        next_observation, reward, done, _ = env.step(action)
        episode_reward += reward
        if train:
            agent.learn(observation, action, reward, next_observation,done)
        if done:
            break
        step += 1
        observation = next_observation
    return episode_reward

In [4]:
actor_kwargs = {'hidden_sizes' : [100,], 'learning_rate' : 0.0001}
critic_kwargs = {'hidden_sizes' : [100,], 'learning_rate' : 0.0002}
agent = AdvantageActorCriticAgent(env, actor_kwargs=actor_kwargs,critic_kwargs=critic_kwargs)

# 训练
episodes = 100
episode_rewards = []
for episode in range(episodes):
    episode_reward = play_qlearning(env, agent, train=True)
    episode_rewards.append(episode_reward)
    print('{}  {}'.format(episode,episode_reward))

play_qlearning
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on L

In [5]:
# 测试
episode_rewards = [play_qlearning(env, agent) for _ in range(100)]
print('平均回合奖励 = {} / {} = {}'.format(sum(episode_rewards),len(episode_rewards), np.mean(episode_rewards)))


play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlearning
play_qlear