# Proximal Policy Optimization
# https://arxiv.org/abs/1707.06347
# https://www.52coding.com.cn/2018/11/25/RL%20-%20PPO/
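#
# Implements PPO with the clipped surrogate objective (PPO-Clip):
#   * a categorical policy head (MLP, or CNN + MLP when learning from raw pixels)
#   * a separate state-value critic trained by regression on rewards-to-go
#   * GAE-lambda advantage estimation with per-batch normalization
#   * several gradient steps per batch, stopped early once the approximate KL
#     divergence from the old policy exceeds the target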
import gym
import numpy as np
import tensorflow as tf
from logging import getLogger
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from src.base import BaseController
from src.util import discount_cumsum, mlp, cnn
from src.config import Config, ControllerType
logger = getLogger(__name__)
class PPOControl(BaseController):
def __init__(self, env, config: Config):
self.env = env
self.epsilon = config.controller.epsilon # clip ratio
self.gamma = config.controller.gamma
self.lam = config.controller.lambda_
self.pi_lr = config.trainer.lr # 1e-4
self.v_lr = 1e-3
self.max_workers = config.controller.max_workers
tfconfig = tf.ConfigProto(
gpu_options=tf.GPUOptions(
allow_growth=True,
visible_device_list='0'
)
)
self.sess = tf.Session(config=tfconfig)
self.raw_pixels = config.controller.raw_pixels
if self.raw_pixels:
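            # assumed preprocessing: two stacked 84x84 grayscale frames; the
            # environment wrapper is expected to produce observations of this shape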
state_space = [84, 84, 2]
else:
state_space = self.env.observation_space.shape
self.actor = PPOActor(self.sess, state_space, self.env.action_space.n,
self.pi_lr, self.epsilon, self.raw_pixels)
self.critic = PPOCritic(self.sess, state_space,
self.v_lr, self.raw_pixels)
self.build_model()
def build_model(self):
self.actor.build_model()
self.critic.build_model()
self.sess.run(tf.global_variables_initializer())
    def action(self, observation, predict=False, return_q=False, epsilon=None, return_logp=True):
        '''Sample an action from the current policy; if return_q is True, also
        return the critic's value estimate. The predict, epsilon and
        return_logp arguments are unused here.'''
        if return_q:
            v = self.critic.value_of(observation)
            return self.actor.action(observation), [v]
        return self.actor.action(observation)[0]
    def train(self, batch_buffers, i):
        '''Update the actor and critic from a batch of rollout buffers.

        Args:
            batch_buffers: a list of rollout buffers, [buf1, buf2, ...]
            i: episode index, used for logging
        '''
batch_states = []
batch_actions = []
batch_rets = []
batch_advs = []
batch_logp_old = []
total_rewards = []
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
futures = [executor.submit(self.build_training_set, buf)
for buf in batch_buffers]
for future in futures:
states, actions, returns, advantages, logps, r = future.result()
batch_states.extend(states)
batch_actions.extend(actions)
batch_rets.extend(returns)
batch_advs.extend(advantages)
batch_logp_old.extend(logps)
total_rewards.append(r)
self.actor.train(batch_states, batch_actions,
batch_advs, batch_logp_old, np.mean(total_rewards), i)
self.critic.train(batch_states, batch_rets, i)
def build_training_set(self, buf):
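        # Rewards-to-go: R_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...
        # These serve as regression targets for the critic.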
rewards_to_go = discount_cumsum(buf.rewards, self.gamma)
buf.values = np.array(buf.values)
# compute GAE
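        # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        # A_t     = sum_l (gamma * lambda)^l * delta_{t+l}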
deltas = buf.rewards[:-1] + self.gamma * \
buf.values[1:] - buf.values[:-1]
advs = discount_cumsum(deltas, self.gamma * self.lam)
        # advantage normalization trick (zero mean, unit variance)
        adv_mean, adv_std = np.mean(advs), np.std(advs)
        advs = (advs - adv_mean) / (adv_std + 1e-8)
# squeeze
actions = np.squeeze(buf.actions)
logps = np.squeeze(buf.logps)
if self.raw_pixels:
states = np.array(buf.states[:-1])
states = np.reshape(states, (-1, 84, 84, 2))
else:
states = buf.states[:-1]
return states, actions[:-1], rewards_to_go[:-1], advs, logps[:-1], sum(buf.rewards)
def save(self, path):
saver = tf.train.Saver()
save_path = saver.save(self.sess, path)
logger.info(f"Save weight to {save_path}")
def load(self, path):
try:
saver = tf.train.Saver()
saver.restore(self.sess, path)
logger.info(f"Load weight from {path}")
except Exception as e:
logger.error(e)
class PPOActor:
def __init__(self, sess, n_features, n_actions, lr, epsilon, raw_pixels):
self.sess = sess
if raw_pixels:
self.n_features = n_features
else:
self.n_features = n_features[0]
self.n_actions = n_actions
self.lr = lr
self.epsilon = epsilon
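        # Run up to 80 policy gradient steps per batch, but stop early once the
        # approximate KL divergence from the old policy exceeds 1.5 * target_kl
        # (see train() below).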
self.train_policy_iter = 80
self.target_kl = 0.01
self.raw_pixels = raw_pixels
def build_model(self):
clip_ratio = self.epsilon
# Input placeholder
if self.raw_pixels:
self.s_ph = tf.placeholder(tf.float32, [None] + self.n_features)
else:
self.s_ph = tf.placeholder(tf.float32, [None, self.n_features])
self.a_ph = tf.placeholder(tf.int32, [None])
self.logp_old_ph = tf.placeholder(tf.float32, [None])
self.adv_ph = tf.placeholder(tf.float32, [None])
# Construct model
with tf.variable_scope('pi'):
if self.raw_pixels:
logits = mlp(cnn(self.s_ph), [256, self.n_actions], tf.tanh)
else:
logits = mlp(self.s_ph, [128, 64, self.n_actions], tf.tanh)
self.logp_all = tf.nn.log_softmax(logits)
self.pi = tf.squeeze(tf.multinomial(logits, 1), axis=1)
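        # Select log pi(a|s) from the log-softmax with a one-hot mask:
        # logp_pi is for the freshly sampled action (stored in the buffer and
        # later fed back as logp_old), logp is for the actions in the batch.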
self.logp_pi = tf.reduce_sum(tf.one_hot(
self.pi, depth=self.n_actions) * self.logp_all, axis=1)
logp = tf.reduce_sum(tf.one_hot(
self.a_ph, depth=self.n_actions) * self.logp_all, axis=1)
# PPO objectives
# pi(a|s) / pi_old(a|s)
ratio = tf.exp(logp - self.logp_old_ph)
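        # PPO-Clip surrogate: maximize E[min(ratio * A, clip(ratio, 1-eps, 1+eps) * A)].
        # The tf.where form below is an equivalent way of writing the clipped
        # term for positive and negative advantages.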
        min_adv = tf.where(self.adv_ph > 0,
                           (1 + clip_ratio) * self.adv_ph,
                           (1 - clip_ratio) * self.adv_ph)
        self.pi_loss = -tf.reduce_mean(tf.minimum(ratio * self.adv_ph, min_adv))
self.approx_kl = tf.reduce_mean(self.logp_old_ph - logp)
self.approx_ent = tf.reduce_mean(-logp)
self.pi_loss -= 0.01 * self.approx_ent
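        # approx_kl is a sample estimate of the KL divergence from the old
        # policy, used for early stopping; subtracting 0.01 * entropy from the
        # loss adds an entropy bonus that encourages exploration.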
self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.pi_loss)
def action(self, observation):
        '''Sample an action from the approximated softmax policy.

        Args:
            observation: observation(s) from the environment, with a leading batch dimension
        Returns:
            The sampled actions and their log-probabilities under the policy
        '''
my_action, logp = self.sess.run(
[self.pi, self.logp_pi], feed_dict={self.s_ph: observation})
return my_action, logp
def train(self, states, actions, advs, logp_old, avg_reward, i):
        '''Run several clipped-policy-gradient steps on one batch of data.

        Args:
            states = [s1, s2, ..., sn]
            actions = [a1, a2, ..., an]
            advs = [adv1, adv2, ..., advn]
            logp_old = [logp1, logp2, ..., logpn]
            avg_reward: mean total episode reward, used for logging
            i: episode number
        '''
inputs = {
self.s_ph: states,
self.a_ph: actions,
self.adv_ph: advs,
self.logp_old_ph: logp_old
}
pi_loss_old, ent = self.sess.run(
[self.pi_loss, self.approx_ent], feed_dict=inputs)
for j in range(self.train_policy_iter):
            # approx_kl is already averaged over the batch, so it comes back as a scalar
            _, kl = self.sess.run(
                [self.optimizer, self.approx_kl], feed_dict=inputs)
if kl > 1.5 * self.target_kl:
logger.info(
'Early stopping at step %d due to reaching max kl.' % j)
break
pi_loss_new, kl = self.sess.run(
[self.pi_loss, self.approx_kl], feed_dict=inputs)
        logger.info(
            f"\n\tEpisode: {i}\n\tAvg Reward: {avg_reward:.2f}\n\t"
            f"Loss_pi: {pi_loss_old:.3e}\n\tEntropy: {ent:.2f}\n\t"
            f"KL: {kl:.4f}\n\tDelta_Pi_Loss: {(pi_loss_new - pi_loss_old):.2e}")
class PPOCritic:
def __init__(self, sess, n_features, lr, raw_pixels):
self.sess = sess
if raw_pixels:
self.n_features = n_features
else:
self.n_features = n_features[0]
self.lr = lr
self.model = None
self.train_value_iter = 80
self.raw_pixels = raw_pixels
def build_model(self):
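        # The critic is an independent network under scope 'v'; it shares no
        # parameters with the actor's 'pi' scope.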
with tf.variable_scope('v'):
if self.raw_pixels:
self.s_ph = tf.placeholder(
tf.float32, [None] + self.n_features)
x = cnn(self.s_ph)
hidden_sizes = [256, 1]
else:
self.s_ph = tf.placeholder(tf.float32, [None, self.n_features])
x = self.s_ph
hidden_sizes = [64, 64, 1]
self.ret_ph = tf.placeholder(tf.float32, [None])
self.value = tf.squeeze(mlp(x, hidden_sizes, tf.tanh), axis=1)
        # tf.losses.mean_squared_error already returns the batch mean
        self.v_loss = tf.losses.mean_squared_error(self.ret_ph, self.value)
self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.v_loss)
def value_of(self, state):
v = self.sess.run(self.value, feed_dict={self.s_ph: state})
return v
def train(self, states, rets, i):
inputs = {
self.s_ph: states,
self.ret_ph: rets
}
        for _ in range(self.train_value_iter):
            _, loss = self.sess.run(
                [self.optimizer, self.v_loss], feed_dict=inputs)
        # log the final value loss once instead of printing on every iteration
        logger.info(f"\tLoss_v = {loss:.2e}")
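# ----------------------------------------------------------------------------
# Usage sketch (illustrative only, not executed). It assumes a Config object
# from src.config and a hypothetical RolloutBuffer with the fields that
# build_training_set reads (states, actions, rewards, values, logps); the
# names below are placeholders, not part of this module.
#
#   env = gym.make('CartPole-v0')
#   config = Config(...)                      # controller/trainer settings
#   ppo = PPOControl(env, config)
#   for episode in range(num_episodes):
#       buffers = collect_rollouts(env, ppo)  # fill RolloutBuffer objects
#       ppo.train(buffers, episode)
#   ppo.save('./checkpoints/ppo.ckpt')
# ----------------------------------------------------------------------------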