In [2]:
from tensorflow.keras.optimizers import Optimizer
import tensorflow as tf

In [11]:
print(tf.__version__)

2.1.0


In [126]:
class AccumOptimizer(Optimizer):
    """
    tf api 实现原作者梯度累计
    继承Optimizer类，包装原有优化器，实现梯度累积。
    # 参数
        optimizer：优化器实例，支持目前所有的keras优化器；
        steps_per_update：累积的步数。
    # 返回
        一个新的keras优化器
    """
    #需要name属性
    def __init__(self,optimizer,steps_per_update=1,**kwargs):
        self.name=kwargs['name']
        super(AccumOptimizer,self).__init__(**kwargs)
        self.optimizer=optimizer
        with tf.name_scope(self.__class__.__name__):
            self.steps_per_update=steps_per_update
            self.iterations=tf.Variable(0,dtype='int64',name='iterations')
            self.cond=tf.equal(self.iterations%self.steps_per_update,0)
            self.lr=self.optimizer.lr
            self.optimizer.lr=tf.cond(self.cond,lambda:self.optimizer.lr.value(), lambda:0.)
            for attr in ['momentum', 'rho', 'beta_1', 'beta_2']:
                if hasattr(self.optimizer,attr):
                    #保存操作
                    value=getattr(self.optimizer,attr)
                    setattr(self, attr, value)
                    setattr(self.optimizer, attr, tf.cond(self.cond, lambda:value.value(), lambda:1 - 1e-7))
            for attr in self.optimizer.get_config():
                print(attr)
                if not hasattr(self, attr):
                    value = getattr(self.optimizer, attr)
                    setattr(self, attr, value)
            #必加属性 debug起来好难鸭
            self._create_slots=self.optimizer._create_slots
            self._resource_apply_dense=self.optimizer._resource_apply_dense
            # 覆盖原有的获取梯度方法，指向累积梯度
            
            def get_gradients(loss,params):
                return [ag / self.steps_per_update for ag in self.accum_grads]
            self.optimizer.get_gradients = get_gradients
    def get_updates(self,loss,params):
        self.iterations=tf.add(self.iterations, 1)
        self.optimizer.iterations=tf.add(self.optimizer.iterations, tf.cast(self.cond, 'int64'))
        self.updates=[
            self.iterations,
            self.optimizer.iterations
        ]
        # 累积梯度 (gradient accumulation)
        self.accum_grads = [tf.zeros(p.shape,dtype=p.dtype) for p in params]
        grads = self.get_gradients(loss, params)
                                     
        for g, ag in zip(grads, self.accum_grads):
            self.updates.append(ag=tf.cond(self.cond,lambda:g,lambda:ag+g))
        
        # 继承optimizer的更新 (inheriting updates of original optimizer)
        self.updates.extend(self.optimizer.get_updates(loss, params)[1:])
        self.weights.extend(self.optimizer.weights)
        return self.updates     
    
    def get_config(self):
        iterations = self.iterations.numpy()
        self.iterations=0
        config = self.optimizer.get_config()
        self.iterations=iterations
        return config

In [None]:
c.optimizer._resource_apply_dense

In [65]:

from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [130]:
batch_size = 128
num_classes = 10
epochs = 1

# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

x_train = x_train.reshape(60000, 784)
x_test = x_test.reshape(10000, 784)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(784,)))
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))

model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=AccumOptimizer(Adam(), steps_per_update=100,name="ag_adam"), # equals batch_size=100
              #optimizer=Adam(),
              #optimizer=b,
              metrics=['accuracy'])

history = model.fit(x_train, y_train,
                    batch_size=1,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_test, y_test))

score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])


60000 train samples
10000 test samples
Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_36 (Dense)             (None, 512)               401920    
_________________________________________________________________
dropout_24 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_37 (Dense)             (None, 512)               262656    
_________________________________________________________________
dropout_25 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_38 (Dense)             (None, 10)                5130      
Total params: 669,706
Trainable params: 669,706
Non-trainable params: 0
_________________________________________________________________
name
learning_rate
decay
beta_1
beta_2
epsilon
amsgrad
Train on 60000 samp

In [129]:
batch_size = 128
num_classes = 10
epochs = 1

# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

x_train = x_train.reshape(60000, 784)
x_test = x_test.reshape(10000, 784)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(784,)))
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))

model.summary()

model.compile(loss='categorical_crossentropy',
              #optimizer=AccumOptimizer(Adam(), steps_per_update=100,name="ag_adam"), # equals batch_size=100
              optimizer=Adam(),
              #optimizer=b,
              metrics=['accuracy'])

history = model.fit(x_train, y_train,
                    batch_size=1,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_test, y_test))

score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])


60000 train samples
10000 test samples
Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_33 (Dense)             (None, 512)               401920    
_________________________________________________________________
dropout_22 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_34 (Dense)             (None, 512)               262656    
_________________________________________________________________
dropout_23 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_35 (Dense)             (None, 10)                5130      
Total params: 669,706
Trainable params: 669,706
Non-trainable params: 0
_________________________________________________________________
Train on 60000 samples, validate on 10000 samples
Test loss: 0.17850156284