Permalink
3eb68ba Oct 29, 2017
3 contributors

Users who have contributed to this file

@JianGoForIt @mfernezir @jmhessel
472 lines (409 sloc) 18.6 KB
"""
YellowFin optimizer.
YellowFin and the Art of Momentum Tuning
https://arxiv.org/abs/1706.03471
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import ops
# EPS for numerical stability
EPS = 1e-6
LARGE_FLOAT_VAL = 1e15
class YFOptimizer(object):
"""
Optimizer that implements the YellowFin algorithm.
Implemented as a wrapper around tf.train.MomentumOptimizer
"""
# Available gate_gradients values
GATE_NONE = tf.train.Optimizer.GATE_NONE
GATE_OP = tf.train.Optimizer.GATE_OP
GATE_GRAPH = tf.train.Optimizer.GATE_GRAPH
def __init__(self, learning_rate=0.0001, momentum=0.0, clip_thresh=None,
beta=0.999, curv_win_width=20, zero_debias=True, delta_mu=0.0,
sparsity_debias=False, use_locking=False, name="YellowFin",
use_nesterov=False, use_unsmoothed_lr_mu=True,
h_max_log_smooth=True, h_min_log_smooth=True,
use_adapt_grad_clip=True, stat_protect_fac=100.0):
"""
Construct a new YellowFin optimizer.
Args:
learning rate: Python scalar. The initial value of learning rate,
we use 1.0 in our paper.
momentum: Python scalar. The initial value of momentum, we use
0.0 in our paper.
clip_thresh: Python scalar. The cliping threshold for
`tf.clip_by_global_norm`. If None, no clipping will be used.
beta: Python scalar. The smoothing parameter for estimations.
curv_win_width: TODO
zero_debias: TODO
delta_mu: for extensions. Not necessary in the basic use.
sparsity_debias: Python boolean. Gradient norm and curvature are
biased to larger values when calculated with sparse gradient.
This is useful when the model is very sparse, e.g. LSTM with
word embedding. For non-sparse CNN, turning it off could
slightly accelerate the speed.
use_locking: If True, use locks for update operations.
name: Optional name prefix for the operations created when
applying gradients. Defaults to "YellowFin".
use_nesterov: If True, the underlying MomentumOptimizer uses Nesterov
Momentum. Set to False in the default YellowFin algorithm.
Notes:
`clip_thresh` is the threshold value on ||lr * gradient||
`delta_mu` can be a placeholder/variable/python scalar. Used for
additional momentum in situations such as asynchronous-parallel
training. The default is 0.0 for basic usage of the optimizer.
Other features:
If you want to manually control the learning rates,
`self.lr_factor` is an interface to the outside. It is a
multiplier for the internal learning rate in YellowFin. It is
helpful when you want to do additional hand tuning or some
decaying scheme for the internal learning rate. Example on using
`lr_factor` can be found here:
https://github.com/JianGoForIt/YellowFin/blob/master/char-rnn-tensorflow/train_YF.py#L140
"""
self._lr = learning_rate
self._mu = momentum
self._lr_var = tf.Variable(
learning_rate, dtype=tf.float32, name="YF_lr", trainable=False)
self._mu_var = tf.Variable(
momentum, dtype=tf.float32, name="YF_mu", trainable=False)
# for step scheme or decaying scheme for the learning rates
self.lr_factor = tf.Variable(
1.0, dtype=tf.float32, name="YF_lr_factor", trainable=False)
if clip_thresh is not None:
self._clip_thresh_var = tf.Variable(
clip_thresh, dtype=tf.float32, name="YF_clip_thresh",
trainable=False)
else:
self._clip_thresh_var = None
# the underlying momentum optimizer
self._optimizer = tf.train.MomentumOptimizer(
self._lr_var * self.lr_factor, self._mu_var + delta_mu,
use_locking, name, use_nesterov)
# moving average for statistics
self._beta = beta
self._moving_averager = None
# for global step counting
self._global_step = tf.Variable(0, trainable=False)
self._do_tune = tf.greater(self._global_step, tf.constant(0) )
self._zero_debias = zero_debias
self._sparsity_debias = sparsity_debias
self._tvars = None
# for curvature range
self._curv_win_width = curv_win_width
self._curv_win = None
# option for using smoothed or unsmoothed lr and mu
self._use_unsmoothed_lr_mu = use_unsmoothed_lr_mu
# options for curvature envelop smoothing
self._h_max_log_smooth = h_max_log_smooth
self._h_min_log_smooth = h_min_log_smooth
# for adaptive gradient clipping
self._use_adapt_grad_clip = use_adapt_grad_clip
self._adapt_grad_clip_thresh = \
tf.Variable(LARGE_FLOAT_VAL, dtype=tf.float32, trainable=False)
self._adapt_grad_clip_target_val = \
tf.Variable(LARGE_FLOAT_VAL, dtype=tf.float32, trainable=False)
# prevent exploding gradient from ruining the statistics
self._stat_protect_fac = stat_protect_fac
def curvature_range(self):
# set up the curvature window
self._curv_win = tf.Variable(
np.zeros([self._curv_win_width, ]), dtype=tf.float32,
name="curv_win", trainable=False)
# we can use log smoothing for curvature range to follow trend faster
# self._curv_win = tf.scatter_update(
# self._curv_win, self._global_step % self._curv_win_width,
# tf.log(self._grad_norm_squared + EPS))
self._curv_win = tf.scatter_update(
self._curv_win, self._global_step % self._curv_win_width,
self._grad_norm_squared + EPS)
# note here the iterations start from iteration 0
valid_window = tf.slice(
self._curv_win, tf.constant([0, ]), tf.expand_dims(
tf.minimum(tf.constant(self._curv_win_width),
self._global_step + 1), dim=0))
if self._h_min_log_smooth:
self._h_min_t = tf.log(tf.reduce_min(valid_window) + EPS)
else:
self._h_min_t = tf.reduce_min(valid_window)
if self._h_max_log_smooth:
self._h_max_t = tf.log(tf.reduce_max(valid_window) + EPS)
else:
self._h_max_t = tf.reduce_max(valid_window)
curv_range_ops = []
with tf.control_dependencies([self._h_min_t, self._h_max_t] ):
avg_op = self._moving_averager.apply(
[self._h_min_t, self._h_max_t])
with tf.control_dependencies([avg_op]):
if self._h_min_log_smooth:
self._h_min = tf.exp(
tf.identity(self._moving_averager.average(self._h_min_t)))
else:
self._h_min = \
tf.identity(self._moving_averager.average(self._h_min_t))
if self._h_max_log_smooth:
self._h_max = tf.exp(
tf.identity(self._moving_averager.average(self._h_max_t)))
else:
self._h_max = \
tf.identity(self._moving_averager.average(self._h_max_t))
if self._sparsity_debias:
self._h_min = self._h_min * self._sparsity_avg
self._h_max = self._h_max * self._sparsity_avg
curv_range_ops.append(avg_op)
return curv_range_ops
def grad_variance(self):
grad_var_ops = []
tensor_to_avg = []
for t, g in zip(self._tvars, self._grads):
if isinstance(g, ops.IndexedSlices):
tensor_to_avg.append(
tf.reshape(tf.unsorted_segment_sum(
g.values, g.indices, g.dense_shape[0]),
shape=t.get_shape()))
else:
tensor_to_avg.append(g)
avg_op = self._moving_averager.apply(tensor_to_avg)
grad_var_ops.append(avg_op)
with tf.control_dependencies([avg_op]):
self._grad_avg = [
self._moving_averager.average(val) for val in tensor_to_avg]
self._grad_avg_squared = [tf.square(val) for val in self._grad_avg]
self._grad_var = tf.maximum(
tf.constant(EPS, dtype=self._grad_norm_squared_avg.dtype),
self._grad_norm_squared_avg
- tf.add_n([tf.reduce_sum(val) for val in self._grad_avg_squared] ) )
if self._sparsity_debias:
self._grad_var *= self._sparsity_avg
return grad_var_ops
def dist_to_opt(self):
dist_to_opt_ops = []
# running average of the norm of gradeint
self._grad_norm = tf.sqrt(self._grad_norm_squared)
avg_op = self._moving_averager.apply([self._grad_norm, ])
dist_to_opt_ops.append(avg_op)
with tf.control_dependencies([avg_op]):
self._grad_norm_avg = self._moving_averager.average(
self._grad_norm)
# single iteration distance estimation
# note that self._grad_norm_avg is per variable
self._dist_to_opt = (self._grad_norm_avg
/ (self._grad_norm_squared_avg + EPS) )
# running average of distance
avg_op = self._moving_averager.apply([self._dist_to_opt])
dist_to_opt_ops.append(avg_op)
with tf.control_dependencies([avg_op]):
self._dist_to_opt_avg = tf.identity(
self._moving_averager.average(self._dist_to_opt))
if self._sparsity_debias:
self._dist_to_opt_avg /= (tf.sqrt(self._sparsity_avg) + EPS)
return dist_to_opt_ops
def grad_sparsity(self):
# If the sparse minibatch gradient has 10 percent of its entries
# non-zero, its sparsity is 0.1.
# The norm of dense gradient averaged from full dataset
# are roughly estimated norm of minibatch
# sparse gradient norm * sqrt(sparsity)
# An extension maybe only correct the sparse blob.
non_zero_cnt = tf.add_n([tf.count_nonzero(g) for g in self._grads])
all_entry_cnt = tf.add_n([tf.size(g) for g in self._grads])
self._sparsity = tf.cast(non_zero_cnt, self._grads[0].dtype) \
/ tf.cast(all_entry_cnt, self._grads[0].dtype)
avg_op = self._moving_averager.apply([self._sparsity, ])
with tf.control_dependencies([avg_op]):
self._sparsity_avg = self._moving_averager.average(self._sparsity)
return avg_op
def before_apply(self):
self._moving_averager = tf.train.ExponentialMovingAverage(
decay=self._beta, zero_debias=self._zero_debias)
assert self._grads is not None and len(self._grads) > 0
before_apply_ops = []
# get per var g**2 and norm**2
self._grad_squared = []
self._grad_norm_squared = []
for v, g in zip(self._tvars, self._grads):
if g is None:
continue
with ops.colocate_with(v):
self._grad_squared.append(tf.square(g))
self._grad_norm_squared = [
tf.reduce_sum(grad_squared) for grad_squared in self._grad_squared]
if self._sparsity_debias:
avg_op_sparsity = self.grad_sparsity()
before_apply_ops.append(avg_op_sparsity)
# the following running average on squared norm of gradient is shared
# by `grad_variance` and `dist_to_opt`
avg_op = self._moving_averager.apply(self._grad_norm_squared)
with tf.control_dependencies([avg_op]):
self._grad_norm_squared_avg = [self._moving_averager.average(val)
for val in self._grad_norm_squared]
self._grad_norm_squared = tf.add_n(self._grad_norm_squared)
self._grad_norm_squared_avg = tf.add_n(self._grad_norm_squared_avg)
before_apply_ops.append(avg_op)
with tf.control_dependencies([avg_op]):
curv_range_ops = self.curvature_range()
before_apply_ops += curv_range_ops
grad_var_ops = self.grad_variance()
before_apply_ops += grad_var_ops
dist_to_opt_ops = self.dist_to_opt()
before_apply_ops += dist_to_opt_ops
return tf.group(*before_apply_ops)
def get_lr_tensor(self):
lr = (1.0 - tf.sqrt(self._mu))**2 / (self._h_min + EPS)
lr = tf.minimum(lr, lr * (tf.to_float(self._global_step) + 1.0) / 10.0 / tf.to_float(tf.constant(self._curv_win_width) ) )
return lr
def get_cubic_root(self):
# We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2
# where x = sqrt(mu).
# We substitute x, which is sqrt(mu), with x = y + 1.
# It gives y^3 + py = q
# where p = (D^2 h_min^2)/(2*C) and q = -p.
# We use the Vieta's substution to compute the root.
# There is only one real solution y (which is in [0, 1] ).
# http://mathworld.wolfram.com/VietasSubstitution.html
# assert_array = \
# [tf.Assert(tf.logical_not(tf.is_nan(self._dist_to_opt_avg) ), [self._dist_to_opt_avg,]),
# tf.Assert(tf.logical_not(tf.is_nan(self._h_min) ), [self._h_min,]),
# tf.Assert(tf.logical_not(tf.is_nan(self._grad_var) ), [self._grad_var,]),
# tf.Assert(tf.logical_not(tf.is_inf(self._dist_to_opt_avg) ), [self._dist_to_opt_avg,]),
# tf.Assert(tf.logical_not(tf.is_inf(self._h_min) ), [self._h_min,]),
# tf.Assert(tf.logical_not(tf.is_inf(self._grad_var) ), [self._grad_var,])]
# with tf.control_dependencies(assert_array):
# EPS in the numerator to prevent momentum being exactly one in case of 0 gradient
p = (self._dist_to_opt_avg + EPS)**2 * (self._h_min + EPS)**2 / 2 / (self._grad_var + EPS)
w3 = (-tf.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
w = tf.sign(w3) * tf.pow(tf.abs(w3), 1.0/3.0)
y = w - p / 3.0 / (w + EPS)
x = y + 1
return x
def get_mu_tensor(self):
root = self.get_cubic_root()
dr = tf.maximum( (self._h_max + EPS) / (self._h_min + EPS), 1.0 + EPS)
mu = tf.maximum(
root**2, ((tf.sqrt(dr) - 1) / (tf.sqrt(dr) + 1))**2)
return mu
def update_hyper_param(self):
assign_hyper_ops = []
self._mu = tf.identity(tf.cond(
self._do_tune, lambda: self.get_mu_tensor(),
lambda: self._mu_var))
with tf.control_dependencies([self._mu]):
self._lr = tf.identity(tf.cond(
self._do_tune, lambda: self.get_lr_tensor(),
lambda: self._lr_var))
with tf.control_dependencies([self._mu, self._lr]):
if self._use_unsmoothed_lr_mu:
assign_hyper_ops.append(tf.assign(self._mu_var, self._mu) )
assign_hyper_ops.append(tf.assign(self._lr_var, self._lr) )
else:
self._mu = self._beta * self._mu_var + (1 - self._beta) * self._mu
self._lr = self._beta * self._lr_var + (1 - self._beta) * self._lr
with tf.control_dependencies([self._mu, self._lr] ):
assign_hyper_ops.append(tf.assign(self._mu_var, self._mu) )
assign_hyper_ops.append(tf.assign(self._lr_var, self._lr) )
assign_hyper_op = tf.group(*assign_hyper_ops)
return assign_hyper_op
def get_name(self):
return self._optimizer.get_name()
def apply_gradients(self, grads_tvars, global_step=None, name=None):
self._grads, self._tvars = zip(
*[(g, t) for g, t in grads_tvars if g is not None])
# for manual gradient clipping
if self._clip_thresh_var is not None:
self._grads, self._grads_norm = tf.clip_by_global_norm(
self._grads, self._clip_thresh_var)
# loosely adaptive clipping of gradient in case exploding gradient ruins statistics
if self._use_adapt_grad_clip:
thresh = tf.cond(self._do_tune,
lambda: tf.sqrt(self._stat_protect_fac * self._adapt_grad_clip_thresh**2),
lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
self._grads, self._grads_norm = tf.clip_by_global_norm(self._grads, thresh)
with tf.variable_scope("before_apply"):
before_apply_op = self.before_apply()
with tf.variable_scope("update_hyper"):
with tf.control_dependencies([before_apply_op]):
update_hyper_op = self.update_hyper_param()
with tf.variable_scope("apply_updates"):
with tf.control_dependencies([update_hyper_op]):
# clip exploding gradient according to h_max
if self._use_adapt_grad_clip:
thresh = tf.cond(tf.greater(tf.global_norm(self._grads),
self._adapt_grad_clip_thresh),
lambda: self._adapt_grad_clip_target_val,
lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
self._grads, self._grads_norm = tf.clip_by_global_norm(
self._grads, thresh)
apply_grad_op = self._optimizer.apply_gradients(
zip(self._grads, self._tvars), global_step, name)
with tf.control_dependencies([apply_grad_op]):
self._increment_global_step_op = tf.assign(
self._global_step, self._global_step + 1)
self._adapt_grad_clip_thresh_op = \
tf.assign(self._adapt_grad_clip_thresh, tf.sqrt(self._h_max) )
self._adapt_grad_clip_target_val_op = \
tf.assign(self._adapt_grad_clip_target_val, tf.sqrt(self._h_max) )
# self._adapt_grad_clip_target_val_op = \
# tf.assign(self._adapt_grad_clip_target_val, tf.sqrt(tf.sqrt(self._h_max * self._h_min)))
return tf.group(before_apply_op, update_hyper_op, apply_grad_op,
self._adapt_grad_clip_thresh_op, self._adapt_grad_clip_target_val_op,
self._increment_global_step_op)
def compute_gradients(self, loss, var_list=None,
gate_gradients=GATE_OP,
aggregation_method=None,
colocate_gradients_with_ops=False,
grad_loss=None):
return self._optimizer.compute_gradients(
loss, var_list=var_list,
gate_gradients=gate_gradients,
aggregation_method=aggregation_method,
colocate_gradients_with_ops=colocate_gradients_with_ops,
grad_loss=grad_loss)
def minimize(self, loss, global_step=None, var_list=None,
gate_gradients=GATE_OP,
aggregation_method=None,
colocate_gradients_with_ops=False,
name=None,
grad_loss=None):
"""Add operations to minimize `loss` by updating `var_list`.
This method simply combines calls `compute_gradients()` and
`apply_gradients()`. If you want to process the gradient before
applying them, call `tf.gradients()` and `self.apply_gradients()`
explicitly instead of using this function.
Adapted from Tensorflow Optimizer base class member function.
"""
grads_and_vars = self._optimizer.compute_gradients(
loss, var_list=var_list,
gate_gradients=gate_gradients,
aggregation_method=aggregation_method,
colocate_gradients_with_ops=colocate_gradients_with_ops,
grad_loss=grad_loss)
vars_with_grad = [v for g, v in grads_and_vars if g is not None]
if not vars_with_grad:
raise ValueError(
"No gradients provided for any variable, check your graph for "
"ops that do not support gradients, between variables "
"%s and loss %s." %
([str(v) for _, v in grads_and_vars], loss))
return self.apply_gradients(grads_and_vars, global_step, name)
def get_slot(self, var, name):
"""
Return a slot named `name` created for `var` by
the underlying MomentumOptimizer.
Args:
var: A variable passed to `minimize()` or `apply_gradients()`.
name: A string.
Returns:
The `Variable` for the slot if it was created, `None` otherwise.
"""
return self._optimizer.get_slot(var, name)
def get_slot_names(self):
"""
Return a list of the names of the slots created by the
underlying MomentumOptimizer.
Returns:
A list of strings.
"""
return self._optimizer.get_slot_names()