In [1]:
!rm -rf ./wide_mixnet_poly

In [2]:
%config IPCompleter.greedy=True
import numpy as np
import tensorflow as tf
import time

In [3]:
%run ./datagen.py
datagen, (x_train, y_train), (x_test, y_test) = data_preparation()

In [4]:
save_dir = './wide_mixnet_poly/'
batch_size = 100
iterations = x_train.shape[0] // batch_size
epochs = 500
old_acc = 0
start_lr = 1e-3
end_lr = 1e-4
decay_rate = (end_lr / start_lr) ** (1 / epochs)
k = 4
# regularizer = tf.contrib.layers.l2_regularizer(scale=1e-5)
initializer=tf.initializers.he_normal()

In [5]:
# resnet layer
def res_layer(inputs, filter_num, filter_size, stride, is_train,
              batch_norm=True, activation=True):

    x = inputs

    if batch_norm:
        x = tf.layers.batch_normalization(x, training=is_train)
    if activation:
        x = tf.nn.relu(x)
    x = tf.layers.conv2d(inputs=x, filters=filter_num, 
                         kernel_initializer=initializer, 
                         kernel_size=filter_size, strides=stride, padding='same')
        
    return x

In [6]:
def wide_resnet(inputs, k, is_train):

    with tf.variable_scope("1st_Conv"):
        x = tf.layers.conv2d(inputs=inputs, filters=16, 
                             kernel_initializer=initializer, 
                             kernel_size=3, strides=1, padding='same')
        x = tf.layers.batch_normalization(x, training=is_train)
        x = tf.nn.relu(x)

    # Res Blocks
    a = [1, 1, 1]
    block_num = 0

    for stack in range(len(a)):
        for block in range(a[stack]):

            with tf.variable_scope('ResBlock_%d_%d' % (stack+1, block+1)):

                batch_norm = True
                activation = True
                stride = 1
                filter_num = 16*k*(2**stack)
                if stack == 0:
                    if block == 0:
                        batch_norm = False
                        activation = False
                else:  
                    if block == 0:
                        stride = 2

                shortcut = x
                with tf.variable_scope('conv1'):
                    x = res_layer(x, filter_num, 3, stride, is_train, 
                                  batch_norm=batch_norm, activation=activation)
                    
                x = tf.layers.dropout(x, 0.1)
                
                with tf.variable_scope('conv2'):
                    x = res_layer(x, filter_num, 3, 1, is_train)
                
                with tf.variable_scope('x_plus_shortcut'):
                    if block == 0:
                        shortcut = tf.layers.conv2d(inputs=shortcut, filters=filter_num, 
                                                    kernel_size=1, strides=stride, padding='same')
                    x = x + shortcut
    
    with tf.variable_scope("AfterResBlock"):
        x = tf.layers.batch_normalization(x, training=is_train)
        x = tf.nn.relu(x)
        x = tf.layers.average_pooling2d(x, pool_size=8, strides=8, padding='SAME', name='ave_pool')
    
    with tf.variable_scope("Flatten"):
        x = tf.transpose(x, perm=[0, 3, 1, 2])
        x = tf.layers.flatten(x)
    
    # crrent x.shape = (?, 256)
    with tf.variable_scope("Prediction"):
        pred = tf.layers.dense(x, units=10, kernel_initializer=initializer)
        
    return pred

In [7]:
def wide_mixnet(inputs, k, is_train):

    with tf.variable_scope("1st_Conv"):
        x = tf.layers.conv2d(inputs=inputs, filters=16, 
                             kernel_initializer=initializer, 
                             kernel_size=3, strides=1, padding='same')
        x = tf.layers.batch_normalization(x, training=is_train)
        x = tf.nn.relu(x)
    
    x_temp_0 = x
    
    with tf.variable_scope('ResBlock_%d_%d' % (1, 1)):
        
        with tf.variable_scope('conv1'):
            x = res_layer(x, 16*k, 3, 1, is_train, 
                          batch_norm=False, activation=False)
                    
        x = tf.layers.dropout(x, 0.1)
                
        with tf.variable_scope('conv2'):
            x = res_layer(x, 16*k, 3, 1, is_train)
    
        with tf.variable_scope('x_plus_shortcut'):
            shortcut = tf.layers.conv2d(inputs=x_temp_0, filters=16*k, 
                                        kernel_size=1, strides=1, padding='same')
            x = x + shortcut             
        
    x_temp_1 = x
    
    with tf.variable_scope('ResBlock_%d_%d' % (2, 1)):
        
        with tf.variable_scope('conv1'):
            x = res_layer(x, 32*k, 3, 2, is_train)
                    
        x = tf.layers.dropout(x, 0.1)
                
        with tf.variable_scope('conv2'):
            x = res_layer(x, 32*k, 3, 1, is_train)
            
        with tf.variable_scope('x_plus_shortcut'):
            shortcut = tf.layers.conv2d(inputs=x_temp_0, filters=32*k, 
                                        kernel_size=1, strides=2, padding='same')
            x = x + shortcut
            
            shortcut = tf.layers.conv2d(inputs=x_temp_1, filters=32*k, 
                                        kernel_size=1, strides=2, padding='same')
            x = x + shortcut
        
    x_temp_2 = x    
    
    with tf.variable_scope('ResBlock_%d_%d' % (3, 1)):
        
        with tf.variable_scope('conv1'):
            x = res_layer(x, 64*k, 3, 2, is_train)
                    
        x = tf.layers.dropout(x, 0.1)
                
        with tf.variable_scope('conv2'):
            x = res_layer(x, 64*k, 3, 1, is_train)
            
        with tf.variable_scope('x_plus_shortcut'):
            
            shortcut = tf.layers.conv2d(inputs=x_temp_0, filters=64*k, 
                                        kernel_size=1, strides=4, padding='same')
            x = x + shortcut

            shortcut = tf.layers.conv2d(inputs=x_temp_1, filters=64*k, 
                                        kernel_size=1, strides=4, padding='same')
            x = x + shortcut

            shortcut = tf.layers.conv2d(inputs=x_temp_2, filters=64*k, 
                                        kernel_size=1, strides=2, padding='same')
            x = x + shortcut

    with tf.variable_scope("AfterResBlock"):
        x = tf.layers.batch_normalization(x, training=is_train)
        x = tf.square(x)
        x = tf.layers.average_pooling2d(x, pool_size=8, strides=8, padding='SAME', name='ave_pool')

    with tf.variable_scope("Flatten"):
        x = tf.transpose(x, perm=[0, 3, 1, 2])
        x = tf.layers.flatten(x)

    with tf.variable_scope("Prediction"):
        pred = tf.layers.dense(x, units=10, kernel_initializer=initializer)
        
    return pred

In [8]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="5"

tf.reset_default_graph()

with tf.device('/GPU:5'):

    inputs = tf.placeholder(tf.float32, [None, 32, 32, 3], name='input')
    outputs = tf.placeholder(tf.float32, [None, 10], name='output')
    is_train = tf.placeholder(tf.bool, name='is_train')

    global_step = tf.Variable(0, trainable=False)

    l_r = tf.train.exponential_decay(
        start_lr, global_step, iterations, decay_rate, staircase=True)
    tf.summary.scalar('learning_rate', l_r)

    opt = tf.train.AdamOptimizer(learning_rate=l_r)

    pred = wide_mixnet(inputs, k, is_train)
    
#     l2_loss = tf.losses.get_regularization_loss()
    loss = tf.losses.softmax_cross_entropy(outputs, pred)
    
    grads = opt.compute_gradients(loss)

    correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(outputs, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

#     for grad, var in grads:
#         if grad is not None:
#             tf.summary.histogram(var.name.split(":")[0] + '/gradients', grad)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    with tf.control_dependencies(update_ops):
        train_op = opt.apply_gradients(grads, global_step=global_step)

#     kernel_vars = []

#     for i in tf.trainable_variables():
#         if 'bin/kernel' in i.name:
#             kernel_vars.append(i)

#     with tf.control_dependencies(update_ops):
#         with tf.control_dependencies([train_op]):
#             kernel_clip_op = [tf.clip_by_value(var, -1, 1) for var in kernel_vars]

    saver = tf.train.Saver(tf.global_variables(), max_to_keep=4)

#     def add_hist(train_vars):
#         for i in train_vars:
#             name = i.name.split(":")[0] + '/value'
#             value = i.value()
#             tf.summary.histogram(name, value)

#     add_hist(tf.trainable_variables())

    tf.summary.scalar('loss', loss)
    tf.summary.scalar('accuracy', accuracy)
    merged = tf.summary.merge_all()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.conv2d instead.
Instructions for updating:
Use keras.layers.batch_normalization instead.
Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Use keras.layers.average_pooling2d instead.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use tf.cast instead.


In [9]:
config = tf.ConfigProto(allow_soft_placement=True,
                        log_device_placement=True)
config.gpu_options.allow_growth = True

with tf.Session(config=config) as sess:

    print('*****************Training Start!*****************')
    train_writer = tf.summary.FileWriter(save_dir+'train', sess.graph)
    sess.run(tf.global_variables_initializer())

    for m in range(epochs):
        start = time.time()
        batch_gen = datagen.flow(
            x_train, y_train, batch_size=batch_size)

        for i in range(iterations):
            x_batch, y_batch = next(batch_gen)
            _, loss_train = sess.run([train_op, loss], 
                                     {inputs: x_batch, outputs: y_batch, is_train: True})
        
        summary = sess.run(merged, {inputs: x_batch, outputs: y_batch, is_train: False})
        train_writer.add_summary(summary, m*iterations + i + 1)

        val_accs = []
        for i in range(5000//(batch_size*5)):
            val_acc = sess.run(accuracy, {inputs: x_test[i*batch_size*5: (i+1)*batch_size*5],
                                          outputs: y_test[i*batch_size*5: (i+1)*batch_size*5],
                                          is_train: False})
            val_accs.append(val_acc)

        if np.mean(val_accs) > old_acc:
            old_acc = np.mean(val_accs)
            saver.save(sess, save_dir+'cifar10.ckpt', global_step=global_step)

        end = time.time()
        print('Epoch: {}'.format(m + 1),
              'Train_loss: {:.3f}'.format(loss_train),
              'Val_acc: {:.3f}'.format(np.mean(val_accs)),
              'Time consumed: {:.4f} s'.format(end - start))

    print('*****************Training End!*****************')

*****************Training Start!*****************
Epoch: 1 Train_loss: 1.115 Val_acc: 0.545 Time consumed: 45.8281 s
Epoch: 2 Train_loss: 0.806 Val_acc: 0.729 Time consumed: 42.1131 s
Epoch: 3 Train_loss: 0.678 Val_acc: 0.750 Time consumed: 42.3344 s
Epoch: 4 Train_loss: 0.647 Val_acc: 0.765 Time consumed: 42.1676 s
Instructions for updating:
Use standard file APIs to delete files with this prefix.
Epoch: 5 Train_loss: 0.523 Val_acc: 0.779 Time consumed: 42.3106 s
Epoch: 6 Train_loss: 0.426 Val_acc: 0.813 Time consumed: 42.2679 s
Epoch: 7 Train_loss: 0.606 Val_acc: 0.827 Time consumed: 42.0302 s
Epoch: 8 Train_loss: 0.600 Val_acc: 0.806 Time consumed: 41.9538 s
Epoch: 9 Train_loss: 0.324 Val_acc: 0.813 Time consumed: 42.2209 s
Epoch: 10 Train_loss: 0.245 Val_acc: 0.844 Time consumed: 42.3847 s
Epoch: 11 Train_loss: 0.338 Val_acc: 0.826 Time consumed: 42.1867 s
Epoch: 12 Train_loss: 0.367 Val_acc: 0.837 Time consumed: 41.9341 s
Epoch: 13 Train_loss: 0.275 Val_acc: 0.852 Time consumed: 4

Epoch: 116 Train_loss: 0.021 Val_acc: 0.896 Time consumed: 42.7422 s
Epoch: 117 Train_loss: 0.007 Val_acc: 0.895 Time consumed: 42.6890 s
Epoch: 118 Train_loss: 0.021 Val_acc: 0.896 Time consumed: 42.7358 s
Epoch: 119 Train_loss: 0.064 Val_acc: 0.898 Time consumed: 42.6665 s
Epoch: 120 Train_loss: 0.025 Val_acc: 0.898 Time consumed: 42.7063 s
Epoch: 121 Train_loss: 0.067 Val_acc: 0.900 Time consumed: 42.7795 s
Epoch: 122 Train_loss: 0.018 Val_acc: 0.904 Time consumed: 42.8459 s
Epoch: 123 Train_loss: 0.028 Val_acc: 0.900 Time consumed: 42.9265 s
Epoch: 124 Train_loss: 0.043 Val_acc: 0.881 Time consumed: 42.7497 s
Epoch: 125 Train_loss: 0.026 Val_acc: 0.894 Time consumed: 42.8251 s
Epoch: 126 Train_loss: 0.077 Val_acc: 0.895 Time consumed: 42.9737 s
Epoch: 127 Train_loss: 0.027 Val_acc: 0.894 Time consumed: 42.8552 s
Epoch: 128 Train_loss: 0.040 Val_acc: 0.902 Time consumed: 43.1856 s
Epoch: 129 Train_loss: 0.011 Val_acc: 0.897 Time consumed: 43.2941 s
Epoch: 130 Train_loss: 0.029 Val_a

Epoch: 235 Train_loss: 0.000 Val_acc: 0.907 Time consumed: 42.6483 s
Epoch: 236 Train_loss: 0.031 Val_acc: 0.904 Time consumed: 42.2038 s
Epoch: 237 Train_loss: 0.003 Val_acc: 0.907 Time consumed: 42.1143 s
Epoch: 238 Train_loss: 0.026 Val_acc: 0.906 Time consumed: 42.1934 s
Epoch: 239 Train_loss: 0.047 Val_acc: 0.908 Time consumed: 42.2791 s
Epoch: 240 Train_loss: 0.003 Val_acc: 0.912 Time consumed: 42.7892 s
Epoch: 241 Train_loss: 0.001 Val_acc: 0.903 Time consumed: 42.1039 s
Epoch: 242 Train_loss: 0.004 Val_acc: 0.900 Time consumed: 42.2578 s
Epoch: 243 Train_loss: 0.003 Val_acc: 0.906 Time consumed: 42.3908 s
Epoch: 244 Train_loss: 0.002 Val_acc: 0.904 Time consumed: 42.4003 s
Epoch: 245 Train_loss: 0.014 Val_acc: 0.893 Time consumed: 42.3407 s
Epoch: 246 Train_loss: 0.017 Val_acc: 0.904 Time consumed: 42.5908 s
Epoch: 247 Train_loss: 0.001 Val_acc: 0.902 Time consumed: 42.6707 s
Epoch: 248 Train_loss: 0.010 Val_acc: 0.899 Time consumed: 42.3805 s
Epoch: 249 Train_loss: 0.002 Val_a

Epoch: 354 Train_loss: 0.000 Val_acc: 0.906 Time consumed: 42.9874 s
Epoch: 355 Train_loss: 0.007 Val_acc: 0.906 Time consumed: 42.8278 s
Epoch: 356 Train_loss: 0.001 Val_acc: 0.907 Time consumed: 43.1934 s
Epoch: 357 Train_loss: 0.002 Val_acc: 0.912 Time consumed: 43.0802 s
Epoch: 358 Train_loss: 0.000 Val_acc: 0.908 Time consumed: 43.0193 s
Epoch: 359 Train_loss: 0.002 Val_acc: 0.912 Time consumed: 42.9729 s
Epoch: 360 Train_loss: 0.003 Val_acc: 0.910 Time consumed: 42.5491 s
Epoch: 361 Train_loss: 0.000 Val_acc: 0.908 Time consumed: 42.1867 s
Epoch: 362 Train_loss: 0.006 Val_acc: 0.906 Time consumed: 42.4442 s
Epoch: 363 Train_loss: 0.001 Val_acc: 0.907 Time consumed: 42.5579 s
Epoch: 364 Train_loss: 0.001 Val_acc: 0.913 Time consumed: 42.5421 s
Epoch: 365 Train_loss: 0.000 Val_acc: 0.908 Time consumed: 42.7057 s
Epoch: 366 Train_loss: 0.001 Val_acc: 0.912 Time consumed: 42.5941 s
Epoch: 367 Train_loss: 0.004 Val_acc: 0.909 Time consumed: 42.4517 s
Epoch: 368 Train_loss: 0.005 Val_a

Epoch: 473 Train_loss: 0.001 Val_acc: 0.908 Time consumed: 42.7114 s
Epoch: 474 Train_loss: 0.000 Val_acc: 0.908 Time consumed: 42.5490 s
Epoch: 475 Train_loss: 0.000 Val_acc: 0.912 Time consumed: 42.7678 s
Epoch: 476 Train_loss: 0.000 Val_acc: 0.912 Time consumed: 42.7412 s
Epoch: 477 Train_loss: 0.010 Val_acc: 0.911 Time consumed: 42.9329 s
Epoch: 478 Train_loss: 0.001 Val_acc: 0.911 Time consumed: 42.8113 s
Epoch: 479 Train_loss: 0.002 Val_acc: 0.915 Time consumed: 42.9170 s
Epoch: 480 Train_loss: 0.000 Val_acc: 0.909 Time consumed: 42.7351 s
Epoch: 481 Train_loss: 0.000 Val_acc: 0.910 Time consumed: 42.9240 s
Epoch: 482 Train_loss: 0.000 Val_acc: 0.912 Time consumed: 42.8970 s
Epoch: 483 Train_loss: 0.005 Val_acc: 0.910 Time consumed: 42.9912 s
Epoch: 484 Train_loss: 0.000 Val_acc: 0.908 Time consumed: 42.6963 s
Epoch: 485 Train_loss: 0.000 Val_acc: 0.910 Time consumed: 43.0008 s
Epoch: 486 Train_loss: 0.004 Val_acc: 0.913 Time consumed: 42.8651 s
Epoch: 487 Train_loss: 0.007 Val_a