In [1]:
# Running %env without any arguments
# lists all environment variables

# The line below sets the environment variable CUDA_VISIBLE_DEVICES to an
# empty value, which hides every GPU from TensorFlow (CPU-only run)
%env CUDA_VISIBLE_DEVICES = 

import numpy as np
import pandas as pd
import io
import time
from datetime import datetime
import bson                       # this is installed with the pymongo package
import matplotlib.pyplot as plt
from scipy.misc import imread, imsave, imshow
import tensorflow as tf
from tensorflow.python.platform import tf_logging
from tensorflow.contrib import layers
from tensorflow.contrib.training import add_gradients_summaries
from tensorflow.python.ops import math_ops
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.contrib.layers.python.layers import layers as layers_lib
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.training import optimizer as tf_optimizer
from tensorflow.python.ops import variables as tf_variables
import os.path
import tensorflow.contrib.slim as slim
import inception_preprocessing
import vgg_preprocessing
import logging
import resnet2

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

env: CUDA_VISIBLE_DEVICES=


In [2]:
# ---- Filesystem layout ------------------------------------------------------
# Every other path below is derived from this root directory.
DATASET_PATH = '/media/rs/0E06CD1706CD0127/Kapok/kaggle/'
# Checkpoint that the backbone weights are restored from before training.
PRETRAINED_MODEL_PATH = DATASET_PATH + 'Resnet/logs_v2_101/model/resnet101_v2_model.ckpt-367651'
# Directory handed to the Supervisor for checkpoints and summaries.
LOG_PATH = DATASET_PATH + 'Resnet/logs_v2_101/'
TRAIN_PATH = DATASET_PATH + 'Split1/Train/'
#TRAIN_PATH = '/media/rs/FC6CDC6F6CDC25E4/resample_dataset2/'
#TRAIN_PATH = '/media/rs/FC6CDC6F6CDC25E4/ResnetHardTrain/'
# Text file describing the learning-rate schedule; parsed by read_learning_rate().
LR_FILE_PATH = DATASET_PATH + 'Resnet/logs_v2_101/lr_setting/resnetv2_vgg_lr_setting'
VAL_PATH = DATASET_PATH + 'Split1/Validation/'
TEST_PATH = DATASET_PATH + 'Test/'
CATEGORY_NAME_PATH = DATASET_PATH + 'category_names.csv'
CATEGORY_WEIGHT_PATH = DATASET_PATH + 'catogory_with_weight.csv'

# ---- Input pipeline ---------------------------------------------------------
BATCH_SIZE = 128#256

IMAGE_WIDTH = 180
IMAGE_HEIGHT = 180
NUM_CLASS = 5270
LEVEL0_CLASS = 49
LEVEL1_CLASS = 483
# validation examples num: 2319624
# train examples num: 10051704
# total step: 157057
TOTAL_EXAMPLES = 10051704

# The training loop is sized for NUM_EPOCHES epochs but stops early once the
# local step counter exceeds EPOCHES_OVER epochs.
NUM_EPOCHES = 12
EPOCHES_OVER = 10

# Number of parallel readers / batching threads used by the input pipeline.
INPUT_THREADS = 12

# ---- Optimisation -----------------------------------------------------------
# Defaults used when the learning-rate schedule file does not override them.
initial_learning_rate = 0.0001
stop_learning_rate = 0.000001
moving_average_decay = 0.96# use large to be more stable?
momentum = 0.9
# Number of batches needed to see every training example once. Ceil division
# keeps this an exact integer on both Python 2 and Python 3 (true division
# would yield a float under Python 3) and does not overcount by one when
# TOTAL_EXAMPLES is an exact multiple of BATCH_SIZE.
num_steps_per_epoch = (TOTAL_EXAMPLES + BATCH_SIZE - 1) // BATCH_SIZE

In [3]:
# Mirror everything TensorFlow logs into a file next to the dataset.
log = logging.getLogger('tensorflow')
log.setLevel(logging.DEBUG)

# Timestamped format shared by the file handler below.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

log_file_path = DATASET_PATH + 'tensorflow_resnet_train_vggpreprocess.log'

# The 'tensorflow' logger outlives this cell, so re-running the cell used to
# attach one more FileHandler each time and every message was then written
# several times. Reuse the handler that already targets this file, if any.
fh = next((handler for handler in log.handlers
           if isinstance(handler, logging.FileHandler)
           and getattr(handler, 'baseFilename', None) == os.path.abspath(log_file_path)),
          None)
if fh is None:
    # create file handler which logs even debug messages
    fh = logging.FileHandler(log_file_path)
    log.addHandler(fh)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

In [4]:
def read_learning_rate(cur_step, num_steps_per_epoch):
    """Look up the learning rate for ``cur_step`` in the schedule file ``LR_FILE_PATH``.

    The file is parsed on every call, so it can be edited while training runs.
    Blank lines and lines starting with ``#`` are ignored. The remaining lines
    are interpreted in order:

    1. ``<label>: <float>``  -- default learning rate (overrides ``initial_learning_rate``)
    2. ``<label>: <float>``  -- lower bound for the returned rate (overrides ``stop_learning_rate``)
    3. ``<label>: <mode>``   -- interval bounds are measured in epochs when the
       mode contains ``EPOCHES_PERCENT``, otherwise in steps
    4. any number of schedule lines, each either
       ``start end factor`` or
       ``[start end factor_begin factor_end num_segments]``; the bracketed form
       is expanded into ``num_segments`` equal sub-intervals whose factor moves
       linearly from ``factor_begin`` towards ``factor_end``.

    Every factor is multiplied by the default learning rate. Intervals are
    inclusive on both ends; among all intervals containing ``cur_step`` the
    smallest rate wins, and a rate is only used if it is below the default.

    Args:
        cur_step: step number to look up.
        num_steps_per_epoch: steps per epoch, used to convert epoch-based bounds.

    Returns:
        The selected learning rate, never smaller than the stop learning rate.
        If the file does not exist, the module-level defaults are used.
    """
    def inner_lr_parser(interval_start, interval_end, lr, dict_in, default_lr, use_epoch_percent, num_steps_per_epoch):
        # Convert one (start, end, factor) triple into an absolute-step entry.
        lr = default_lr * lr
        if use_epoch_percent:
            interval_start = num_steps_per_epoch * interval_start
            interval_end = num_steps_per_epoch * interval_end
        interval_start = int(interval_start)
        interval_end = int(interval_end)
        # Empty intervals and non-positive rates are ignored.
        if (interval_start < interval_end) and (lr > 0):
            dict_in[(interval_start, interval_end)] = lr

    lr_map = dict()
    default_lr = initial_learning_rate
    stop_lr = stop_learning_rate
    line_index = -1
    use_epoch_percent = True
    if os.path.exists(LR_FILE_PATH):
        with open(LR_FILE_PATH, 'r') as lr_setting_file:
            for line in lr_setting_file:
                line = line.strip()
                if (line == '') or line.startswith('#'):
                    continue
                line_index += 1
                if line_index == 0:
                    default_lr = float(line.split(':')[-1].strip())
                    continue
                if line_index == 1:
                    stop_lr = float(line.split(':')[-1].strip())
                    continue
                if line_index == 2:
                    use_epoch_percent = ('EPOCHES_PERCENT' in (line.split(':')[-1].strip()))
                    continue
                if line.startswith('['):
                    # Ramp description: [start end factor_begin factor_end num_segments]
                    fields = [float(s.strip()) for s in line[1:-1].strip().split()]
                    num_segments = int(fields[-1])
                    if num_segments <= 0:
                        # Nothing to expand; a count of 0 used to raise
                        # ZeroDivisionError and abort the training loop.
                        continue
                    step_interval = (fields[1] - fields[0]) / fields[-1]
                    lr_interval = (fields[3] - fields[2]) / fields[-1]
                    begin = fields[0]
                    lr_begin = fields[2]
                    for _ in range(num_segments):
                        inner_lr_parser(begin, begin + step_interval, lr_begin, lr_map, default_lr, use_epoch_percent, num_steps_per_epoch)
                        begin += step_interval
                        lr_begin += lr_interval
                else:
                    interval_start, interval_end, lr = [float(s) for s in line.split()]
                    inner_lr_parser(interval_start, interval_end, lr, lr_map, default_lr, use_epoch_percent, num_steps_per_epoch)

    # Pick the smallest rate among the intervals covering cur_step.
    lr_ret = default_lr
    for (start, end), lr in lr_map.items():
        if (cur_step >= start) and (cur_step <= end) and (lr < lr_ret):
            lr_ret = lr
    # Never go below the configured floor.
    if lr_ret < stop_lr:
        lr_ret = stop_lr
    return lr_ret
# _ = read_learning_rate(1, num_steps_per_epoch)
# lr = []
# num_epoches_to_show = 10
# num_point = 100
# for i in [i*num_epoches_to_show*num_steps_per_epoch/num_point for i in range(num_point)]:
#     lr.append(read_learning_rate(i, num_steps_per_epoch))
# plt.plot(lr)
# plt.ylabel('learning rate')
# plt.show()

In [5]:
def preprocess_for_inception(input_image, is_training = True):
    """Preprocess one decoded image for the network input.

    Despite the name, this delegates to ``vgg_preprocessing.preprocess_image``.
    The target size comes from the notebook-level ``IMAGE_HEIGHT`` and
    ``IMAGE_WIDTH`` constants (both 180) instead of repeating the literals, so
    the configuration cell stays the single place to change the input size.

    Args:
        input_image: the image tensor to preprocess.
        is_training: forwarded to ``preprocess_image`` to select the training
            or evaluation variant of the preprocessing.

    Returns:
        Whatever ``vgg_preprocessing.preprocess_image`` returns for the image.
    """
    return vgg_preprocessing.preprocess_image(input_image, IMAGE_HEIGHT, IMAGE_WIDTH, is_training)

In [6]:
class LabelMapping(object):
    """Label lookup structures built from the category CSV files.

    From the category CSV (first column: category id, second and third
    columns: two coarser category levels) this builds

    * ``category_map``: category id -> class index (row order of the CSV),
    * a string lookup table mapping ``str(category id)`` to that class index,
    * hash tables mapping a category id to its level-0 / level-1 index,
    * a hash table mapping a category id to a per-category weight read from
      the weight CSV (first column: category id, third column: weight).

    All tables return 0 for keys they do not contain.

    Args:
        catogory_file_path: path of the category CSV.
        category_weight_path: path of the weight CSV; defaults to the
            notebook-level ``CATEGORY_WEIGHT_PATH``.
    """
    def __init__(self, catogory_file_path, category_weight_path=None):
        super(LabelMapping, self).__init__()
        self._category_level_csv = catogory_file_path
        # Fall back to the notebook constant so existing one-argument callers keep working.
        self._category_weight_csv = CATEGORY_WEIGHT_PATH if category_weight_path is None else category_weight_path
        (self._category_map, self._category_level0_map, self._category_level1_map,
         self._len_level0, self._len_level1) = self.cvt_csv2tfrecord()
        self._catogory_weight_map = self.cvt_catogory_weight()

        # Order the vocabulary by the class index stored in category_map, so the
        # index returned by the lookup table always equals category_map[id] and
        # does not depend on the dict iteration order of the interpreter.
        ordered_category_ids = sorted(self._category_map, key=self._category_map.get)
        self._mapping_strings = tf.constant([str(category_id) for category_id in ordered_category_ids])

        self._mapping_table = tf.contrib.lookup.index_table_from_tensor(mapping=self._mapping_strings, default_value=0)

        self._level0_table = self._build_hash_table(self._category_level0_map, tf.int64)
        self._level1_table = self._build_hash_table(self._category_level1_map, tf.int64)
        self._weight_table = self._build_hash_table(self._catogory_weight_map, tf.float32)

    @staticmethod
    def _build_hash_table(mapping, value_dtype):
        """Create an int64-keyed HashTable from ``mapping`` with default value 0."""
        initializer = tf.contrib.lookup.KeyValueTensorInitializer(
            list(mapping.keys()), list(mapping.values()), tf.int64, value_dtype)
        return tf.contrib.lookup.HashTable(initializer, 0)

    @property
    def category_map(self):
        return self._category_map
    @property
    def level0_table(self):
        return self._level0_table
    @property
    def level1_table(self):
        return self._level1_table
    @property
    def len_level0(self):
        return self._len_level0
    @property
    def len_level1(self):
        return self._len_level1
    @property
    def mapping_table(self):
        return self._mapping_table
    @property
    def weight_table(self):
        return self._weight_table

    def cvt_catogory_weight(self):
        """Read the weight CSV and return ``{int(category id): weight}``.

        The id is taken from the first column and the weight from the third.
        """
        category_weight_map = dict()
        for row in pd.read_csv(self._category_weight_csv).values:
            category_id, weight = row[0], row[2]
            category_weight_map[int(category_id)] = weight

        return category_weight_map

    def cvt_csv2tfrecord(self):
        """Build the per-category maps from the category CSV.

        Returns:
            A tuple ``(category_map, category_level0_map, category_level1_map,
            len_level0, len_level1)`` where ``category_map`` maps the raw
            category id to its row index, the two level maps map
            ``int(category id)`` to the index of its level-0 / level-1 value,
            and the two lengths are the number of distinct values per level.
        """
        # Parse the CSV once and share the rows with create_level_map().
        rows = pd.read_csv(self._category_level_csv).values
        level0_map, level1_map = self.create_level_map(rows)
        category_map = dict()
        category_level0_map = dict()
        category_level1_map = dict()
        for class_index, row in enumerate(rows):
            category_id, level0, level1 = row[0], row[1], row[2]
            category_map[category_id] = class_index
            category_level0_map[int(category_id)] = level0_map[level0]
            category_level1_map[int(category_id)] = level1_map[level1]

        return category_map, category_level0_map, category_level1_map, len(level0_map), len(level1_map)

    def create_level_map(self, rows=None):
        """Index the distinct values of the second and third CSV columns.

        Args:
            rows: already parsed CSV rows; the category CSV is read when omitted.

        Returns:
            Two dicts ``(level0_map, level1_map)`` mapping each distinct value
            to its index in order of first appearance.
        """
        if rows is None:
            rows = pd.read_csv(self._category_level_csv).values
        level_maps = [dict(), dict()]
        for row in rows:
            for level in range(1, 3):
                level_map = level_maps[level - 1]
                # Dict membership replaces the former linear scan of a list;
                # len(level_map) is the next free index.
                if row[level] not in level_map:
                    level_map[row[level]] = len(level_map)
        return level_maps[0], level_maps[1]

In [7]:
class CdiscountDataset(object):
    """Queue-based input pipeline over zlib-compressed TFRecord files.

    The constructor only stores its configuration; ``create_dataset`` builds
    the graph ops and returns the batched tensors.
    """
    def __init__(self, data_path, file_begin_match, label_mapping, num_examples, num_classes, buffer_size, batch_size, num_epochs, is_training):
        super(CdiscountDataset, self).__init__()
        # Glob pattern matching every record file that starts with the given prefix.
        self._data_file_list = data_path + file_begin_match + '*'
        self._num_examples = num_examples
        self._num_classes = num_classes
        self._buffer_size = buffer_size
        self._batch_size = batch_size
        self._num_epochs = num_epochs
        self._is_training = is_training
        # Lookup structures provided by the label mapping object.
        self._category_map = label_mapping.category_map
        self._mapping_table = label_mapping.mapping_table
        self._weight_table = label_mapping.weight_table
        self._level0_table = label_mapping.level0_table
        self._level1_table = label_mapping.level1_table
        self._len_level0 = label_mapping.len_level0
        self._len_level1 = label_mapping.len_level1

    def create_dataset(self):
        """Build the reading, preprocessing and batching ops.

        Returns:
            A tuple ``(images, one_hot_labels, one_hot_level0, one_hot_level1,
            weights)`` of batched tensors.
        """
        record_options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)

        def reader():
            # Factory handed to slim; each call creates one reader for compressed records.
            return tf.TFRecordReader(options=record_options)

        int64_zero = lambda: tf.zeros([], dtype=tf.int64)
        keys_to_features = {
            'img_raw': tf.FixedLenFeature([], tf.string, default_value=''),
            'product_id': tf.FixedLenFeature([], tf.int64, default_value=int64_zero()),
            # 'format' is not stored in the records, so the default is always used
            'format': tf.FixedLenFeature([], tf.string, default_value='jpg'),
            'category_id': tf.FixedLenFeature([], tf.int64, default_value=int64_zero()),
        }
        items_to_handlers = {
            # decodes the raw bytes into an image tensor
            'image': slim.tfexample_decoder.Image(image_key='img_raw', format_key='format'),
            'label': slim.tfexample_decoder.Tensor('category_id'),
        }
        decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers)

        self._dataset = slim.dataset.Dataset(
            data_sources=self._data_file_list,
            decoder=decoder,
            reader=reader,
            num_samples=self._num_examples,
            items_to_descriptions=None)

        queue_capacity = self._buffer_size + 4 * self._batch_size
        if self._is_training:
            files_scope, batch_name = 'train_files', 'train_batch'
        else:
            files_scope, batch_name = 'validation_files', 'validation_batch'

        # Shuffling is requested here, so the batching step below does not shuffle again.
        self._data_provider = slim.dataset_data_provider.DatasetDataProvider(
            self._dataset,
            num_readers=INPUT_THREADS,
            shuffle=True,
            num_epochs=self._num_epochs,
            common_queue_capacity=queue_capacity,
            common_queue_min=self._buffer_size,
            scope=files_scope)

        org_image, org_label = self._data_provider.get(['image', 'label'])

        # final image fed to the network
        image = preprocess_for_inception(org_image, self._is_training)

        # Per-example targets: fine-grained class, two coarser levels and the loss weight.
        one_hot_label = tf.one_hot(self._mapping_table.lookup(tf.as_string(org_label)), self._num_classes, axis=-1)
        one_hot_level0 = tf.one_hot(self._level0_table.lookup(org_label), self._len_level0, axis=-1)
        one_hot_level1 = tf.one_hot(self._level1_table.lookup(org_label), self._len_level1, axis=-1)
        example_weight = self._weight_table.lookup(org_label)

        # NOTE(review): a smaller final batch is allowed only when training -- confirm this is intended.
        batched = tf.train.batch(
            [image, one_hot_label, one_hot_level0, one_hot_level1, example_weight],
            self._batch_size,
            num_threads=INPUT_THREADS,
            capacity=queue_capacity,
            allow_smaller_final_batch=self._is_training,
            name=batch_name)
        batch_images, batch_labels, batch_labels_level0, batch_labels_level1, batch_weight = batched

        return batch_images, batch_labels, batch_labels_level0, batch_labels_level1, batch_weight

In [8]:
def_graph = tf.Graph()
with def_graph.as_default() as graph:
    def train_step(input_examples, one_hot_labels, level0_labels, level1_labels, batch_weight):
        """Build the training graph: backbone features plus a new trainable classification head.

        Gradients are stopped at the backbone output and only variables whose
        name contains 'logits' are handed to the optimizer, so only the new
        head is trained.

        Args:
            input_examples: batch of preprocessed images.
            one_hot_labels: one-hot class labels for the batch.
            level0_labels: unused here; kept for interface compatibility.
            level1_labels: unused here; kept for interface compatibility.
            batch_weight: per-example weights applied to the cross-entropy loss.

        Returns:
            A tuple ``(train_op, global_step, metrics_op, variables_to_restore,
            variables_to_restore_checkpoint, predictions, custom_learning_rate,
            accuracy, real_time_accuracy, total_loss)``.
            ``variables_to_restore`` excludes the 'logits' scope and is
            collected before the global step, optimizer and moving averages
            are created; ``variables_to_restore_checkpoint`` is collected after
            the train op exists. ``custom_learning_rate`` is a scalar
            placeholder that must be fed on every run.
        """
        # input images are preprocessed to the size configured in the preprocessing cell
        with slim.arg_scope(resnet2.resnet_arg_scope()):
            # num_classes=None: take the backbone output without its own classifier
            # NOTE(review): the backbone is built with is_training=True although its
            # gradients are stopped below -- confirm this is intended.
            resnet2_logits, end_points = resnet2.resnet_v2_101(input_examples, None, is_training=True)

            # Block gradients from flowing back into the backbone.
            logits = tf.stop_gradient(resnet2_logits)

            net = layers_lib.dropout(logits, keep_prob=0.5, is_training=True, scope='Dropout')

            # New 1x1 convolution acting as the classification layer.
            net = layers_lib.conv2d(
                                  net,
                                  NUM_CLASS, [1, 1],
                                  activation_fn=None,
                                  normalizer_fn=None,
                                  scope='logits')

            end_points['predictions'] = layers.softmax(net, scope='predictions')

            head_only_vars = [var for var in tf.trainable_variables() if 'logits' in var.name]

            # Collected here, before the global step / optimizer variables exist.
            variables_to_restore = slim.get_variables_to_restore(exclude = ['logits'])

            # Squeeze only the two spatial axes. Squeezing every size-1 axis would
            # also drop the batch axis for a batch of one example and discards the
            # static shape when the batch size is not known at graph-build time.
            end_points['logits_output_squeezed'] = tf.squeeze(net, [1, 2])
            # The returned loss tensor is not needed: the loss is registered in the
            # losses collection and picked up by get_total_loss() below.
            tf.losses.softmax_cross_entropy(onehot_labels = one_hot_labels, logits = end_points['logits_output_squeezed'], weights=batch_weight, label_smoothing = 0.0)
            total_loss = tf.losses.get_total_loss()    # obtain the regularization losses as well

            # Create the global step up front so it can be handed to the moving average below.
            global_step = tf.train.get_or_create_global_step(graph = graph)

            # The learning rate is fed from Python on every step.
            custom_learning_rate = tf.placeholder(tf.float32, shape=[])
            #optimizer = tf.train.AdamOptimizer(learning_rate = lr)
            #optimizer = tf.train.RMSPropOptimizer(learning_rate = lr)
            optimizer = tf.train.MomentumOptimizer(learning_rate = custom_learning_rate, momentum=momentum)

            # Maintain moving averages of the model variables; the update is registered
            # in UPDATE_OPS in addition to whatever is already in that collection.
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(moving_average_decay, global_step)
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variable_averages.apply(moving_average_variables))

            #Create the train_op.
            train_op = slim.learning.create_train_op(total_loss, optimizer, summarize_gradients=False, variables_to_train=head_only_vars)

            # Collected after the train op exists; used for the regular checkpoints.
            variables_to_restore_checkpoint = slim.get_variables_to_restore()

            # Class index with the highest probability (not one-hot encoded).
            predictions = tf.argmax(tf.squeeze(end_points['predictions'], [1, 2]), 1)
            accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(predictions, tf.argmax(one_hot_labels, 1), name='train_accuracy')
            metrics_op = tf.group(accuracy_update)

            # Accuracy of the current batch only.
            real_time_accuracy = tf.reduce_mean(tf.cast(tf.equal(predictions, tf.argmax(one_hot_labels, 1)), tf.float32))

            # Scalars merged later into a single summary op.
            tf.summary.scalar('losses/Total_Loss', total_loss)
            tf.summary.scalar('train/accuracy', accuracy)
            tf.summary.scalar('train/real_time_accuracy', real_time_accuracy)
            tf.summary.scalar('learning_rate', custom_learning_rate)

            return train_op, global_step, metrics_op, variables_to_restore, variables_to_restore_checkpoint, predictions, custom_learning_rate, accuracy, real_time_accuracy, total_loss

In [9]:
# Build the input pipeline and training graph, then run the training loop.
with def_graph.as_default() as graph:
    label_mapping = LabelMapping(CATEGORY_NAME_PATH)
    train_dataset = CdiscountDataset(TRAIN_PATH, 'output_file', label_mapping, TOTAL_EXAMPLES, NUM_CLASS, 12000, BATCH_SIZE, NUM_EPOCHES, True)

    batch_images, batch_labels, batch_level0_labels, batch_level1_labels, batch_weight = train_dataset.create_dataset()

    with tf.device('/gpu:0'):
        train_op, global_step, metrics_op, variables_to_restore, variables_to_restore_checkpoint, pred_op, lr, accuracy, real_time_accuracy, total_loss = train_step(batch_images, batch_labels, batch_level0_labels, batch_level1_labels, batch_weight)

    summary_op = tf.summary.merge_all()

    # Saver for the regular training checkpoints.
    checkpoint_saver = tf.train.Saver(variables_to_restore_checkpoint)

    # Saver that only knows the variables loaded from the pretrained checkpoint.
    pre_train_saver = tf.train.Saver(variables_to_restore)

    # Init function that loads the pretrained checkpoint.
    # sess is the managed session passed by Supervisor
    def load_pretrain(sess):
        pre_train_saver.restore(sess, PRETRAINED_MODEL_PATH)

    # no need for specify local_variables_initializer and tables_initializer, Supervisor will do this via default local_init_op
    init_op = tf.group(tf.global_variables_initializer())

    # Pass the init function to the supervisor.
    # - The init function is called _after_ the variables have been initialized by running the init_op.
    # - checkpoint_saver is used for ordinary save and restore
    # - a checkpoint is saved every 2 hours (save_model_secs=7200)
    # - summaries are written by this loop (summary_op=None) to save memory
    # - no need to specify global_step, supervisor will find it automatically
    # - initialize order: checkpoint -> local_init_op -> init_op -> init_func
    sv = tf.train.Supervisor(logdir=LOG_PATH, init_fn = load_pretrain, init_op = init_op, summary_op = None, saver = checkpoint_saver, save_model_secs=7200, checkpoint_basename='resnet101_v2_model.ckpt')

    # Last loss / accuracy values seen by the logging branch of the loop.
    final_loss = 0.
    final_accuracy = 0.
    training_state = True
    cur_readed_lr = initial_learning_rate
    tf_logging.info(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    config = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)
    #config.gpu_options.allow_growth = True
    with sv.managed_session(config=config) as sess:
        # Here sess was either initialized from the pre-trained-checkpoint or
        # recovered from a checkpoint saved in a previous run of this code.
        for step in range(int(num_steps_per_epoch * NUM_EPOCHES)):
            if sv.should_stop():
                tf_logging.info('Supervisor emit finished!')
                # Log the last recorded values; the loop has no variable named
                # `loss`, and `accuracy` is a tensor rather than a number.
                tf_logging.info('Current Loss: %s', final_loss)
                tf_logging.info('Current Accuracy: %s', final_accuracy)
                tf_logging.info('Saving current model to disk(maybe invalid).')
                training_state = False
                break

            start_time = time.time()

            if step % 1000 == 0:
                # Write summaries and refresh the learning rate from the schedule file.
                summ, cur_global_step = sess.run([summary_op, global_step], feed_dict={lr: cur_readed_lr})
                sv.summary_computed(sess, summ)
                if step > EPOCHES_OVER * num_steps_per_epoch:
                    # Leave the loop normally so the final checkpoint below is
                    # still written; raising here skipped the save and left the
                    # managed session through an exception.
                    tf_logging.info('over epoches reached.')
                    break
                cur_readed_lr = read_learning_rate(cur_global_step, num_steps_per_epoch)

            # Device placement is fixed when the graph is built, so no device
            # scope is needed around sess.run.
            _, _, cur_loss, cur_acc, rt_accuracy, total_step, cur_lr = sess.run([train_op, metrics_op, total_loss, accuracy, real_time_accuracy, global_step, lr], feed_dict={lr: cur_readed_lr})

            time_elapsed = time.time() - start_time

            if step % 10 == 0:
                final_loss = cur_loss
                final_accuracy = cur_acc
                tf_logging.info('Current Speed: {:5.3f}sec/batch'.format(time_elapsed))
                tf_logging.info('Current Streaming Accuracy: {:5.3f}%'.format(cur_acc*100.))
                tf_logging.info('Current Realtime Accuracy: {:5.3f}%'.format(rt_accuracy*100.))
                tf_logging.info('Current Loss: {:5.3f}'.format(cur_loss))
                tf_logging.info('Epoch %s/%s, Global Step: %s', int(total_step / num_steps_per_epoch + 1), NUM_EPOCHES, total_step)
                tf_logging.info('Current Learning Rate: {}'.format(cur_lr))

        if training_state:
            #We log the final training loss and accuracy
            tf_logging.info('Final Loss: %s', final_loss)
            tf_logging.info('Final Accuracy: %s', final_accuracy)
            # Once all the training has been done, save the log files and checkpoint model
            tf_logging.info('Finished training! Model saved.')
        # Always write a last checkpoint, whether the loop finished or was stopped.
        sv.saver.save(sess, sv.save_path, global_step = sv.global_step)
    

Instructions for updating:
Please switch to tf.train.get_or_create_global_step
INFO:tensorflow:2017-12-01 18:47:14
INFO:tensorflow:Restoring parameters from /media/rs/0E06CD1706CD0127/Kapok/kaggle/Resnet/logs_v2_101/model/resnet101_v2_model.ckpt-367651
INFO:tensorflow:Starting standard services.
INFO:tensorflow:Saving checkpoint to path /media/rs/0E06CD1706CD0127/Kapok/kaggle/Resnet/logs_v2_101/resnet101_v2_model.ckpt
INFO:tensorflow:global_step/sec: 0
INFO:tensorflow:Starting queue runners.


KeyboardInterrupt: 