In [1]:
# Running %env without any arguments
# lists all environment variables

# The line below sets the environment
# variable CUDA_VISIBLE_DEVICES
%env CUDA_VISIBLE_DEVICES = 1

import numpy as np
import pandas as pd
import io
import time
import bson                       # this is installed with the pymongo package
import matplotlib.pyplot as plt
from scipy.misc import imread, imsave
import tensorflow as tf
from tensorflow.python.platform import tf_logging
import os.path
import tensorflow.contrib.slim as slim
from tensorflow.contrib.slim.python.slim.nets import inception
import inception_preprocessing
import logging

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

env: CUDA_VISIBLE_DEVICES=1


In [2]:
# ---------------------------------------------------------------------------
# Filesystem layout
# ---------------------------------------------------------------------------
# Every artefact lives below a single dataset root. The root keeps its
# trailing '/' so relative names can be appended directly.
DATASET_PATH = '/media/rs/0E06CD1706CD0127/Kapok/kaggle/'


def _under_dataset(relative_name):
    """Return `relative_name` appended to DATASET_PATH."""
    return DATASET_PATH + relative_name


LOG_PATH = _under_dataset('logs/')
PRETRAINED_MODEL_PATH = _under_dataset('logs/before/inception_v3_model.ckpt-917169')
TRAIN_PATH = _under_dataset('Split1/Train/')
VAL_PATH = _under_dataset('Split1/Validation/')
TEST_PATH = _under_dataset('Test/')
CATEGORY_NAME_PATH = _under_dataset('category_names.csv')

# ---------------------------------------------------------------------------
# Input pipeline
# ---------------------------------------------------------------------------
IMAGE_WIDTH = IMAGE_HEIGHT = 180
NUM_CLASS = 5270
BATCH_SIZE = 64
INPUT_THREADS = 6

# Dataset sizes noted by the author:
#   train examples      : 10051704  (about 157057 steps per epoch at BATCH_SIZE = 64)
#   validation examples : 2319624
TOTAL_EXAMPLES = 10051704
NUM_EPOCHES = 7

# ---------------------------------------------------------------------------
# Optimisation hyper-parameters (tune freely)
# ---------------------------------------------------------------------------
# Values tried earlier: initial_learning_rate 0.000003 / 0.00001,
# learning_rate_decay_factor 0.94.
momentum = 0.4
initial_learning_rate = 0.001
learning_rate_decay_factor = 0.8
num_epochs_before_decay = 1

# Batches per epoch, and the number of steps between two learning-rate decays
# (one sixth of `num_epochs_before_decay` epochs).
num_steps_per_epoch = TOTAL_EXAMPLES / BATCH_SIZE
decay_steps = int(num_epochs_before_decay * num_steps_per_epoch / 6)

In [3]:
# Send everything TensorFlow logs (logger name 'tensorflow') to a file as well.
log = logging.getLogger('tensorflow')
log.setLevel(logging.DEBUG)

# One line per record: timestamp - logger name - level - message.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

train_log_file = DATASET_PATH + 'tensorflow_inception_train.log'

# Loggers are process-wide singletons, so re-running this cell in the same
# kernel would otherwise attach one more FileHandler each time and every
# message would be written to the file once per run. Drop any handler that a
# previous execution attached for the same file before adding a fresh one.
for stale_handler in list(log.handlers):
    if (isinstance(stale_handler, logging.FileHandler)
            and stale_handler.baseFilename == os.path.abspath(train_log_file)):
        log.removeHandler(stale_handler)
        stale_handler.close()

# File handler that records debug messages too.
fh = logging.FileHandler(train_log_file)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
log.addHandler(fh)

In [4]:
class MiniDataSet(object):
    """Exposes a set of ZLIB-compressed TFRecord files through TF-Slim.

    `create_dataset` builds a `slim.dataset.Dataset` plus a
    `DatasetDataProvider` on top of it and returns the provider's
    'image' and 'label' items.
    """

    def __init__(self, file_path_pattern, category_level_csv, num_examples, num_classes, is_training = True, min_after_dequeue=1000, batch_size = BATCH_SIZE, num_epochs = NUM_EPOCHES, num_reader = INPUT_THREADS):
        """Store the configuration; no TensorFlow ops are created here.

        Args:
            file_path_pattern: passed to slim as `data_sources`.
            category_level_csv: path of the CSV read by
                `get_category_description_from_csv`.
            num_examples: passed to slim as `num_samples`.
            num_classes: stored only (not used by the methods of this class).
            is_training: selects the provider scope name
                ('train_files' or 'validation_files').
            min_after_dequeue: passed as `common_queue_min`; also used to
                size the common queue.
            batch_size: used to size the common queue.
            num_epochs: passed to the data provider.
            num_reader: number of parallel readers.
        """
        super(MiniDataSet, self).__init__()
        # Where the data comes from.
        self._file_path_pattern = file_path_pattern
        self._category_level_csv = category_level_csv
        # How much data there is.
        self._num_examples = num_examples
        self._num_classes = num_classes
        # How it is read.
        self._is_training = is_training
        self._num_reader = num_reader
        self._num_epochs = num_epochs
        self._batch_size = batch_size
        self._min_after_dequeue = min_after_dequeue

    def get_category_description_from_csv(self, level = 0):
        """Map the first CSV column to one of the remaining columns.

        Args:
            level: zero-based index into the columns after the first one.

        Returns:
            dict keyed by the first column, valued by the selected column.
        """
        table = pd.read_csv(self._category_level_csv).values
        return {record[0]: record[1:][level] for record in table}

    def create_dataset(self):
        """Build the slim dataset and provider; return [image, label] tensors."""
        zlib_options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)

        def make_reader():
            # slim calls this to obtain a reader for the compressed records.
            return tf.TFRecordReader(options=zlib_options)

        # Scalar int64 zeros used as defaults for the two integer features.
        product_id_default = tf.zeros([], dtype=tf.int64)
        category_id_default = tf.zeros([], dtype=tf.int64)

        feature_spec = {
            'img_raw': tf.FixedLenFeature([], tf.string, default_value=''),
            'product_id': tf.FixedLenFeature([], tf.int64, default_value=product_id_default),
            # The records carry no 'format' feature, so the default always applies.
            'format': tf.FixedLenFeature([], tf.string, default_value='jpg'),
            'category_id': tf.FixedLenFeature([], tf.int64, default_value=category_id_default)
        }

        item_handlers = {
            # The Image handler decodes 'img_raw' using the format given by 'format'.
            'image': slim.tfexample_decoder.Image(image_key='img_raw', format_key='format'),
            'label': slim.tfexample_decoder.Tensor('category_id'),
        }

        example_decoder = slim.tfexample_decoder.TFExampleDecoder(feature_spec, item_handlers)

        # Loaded as in the original code; the result is not currently handed
        # to the Dataset (the `labels_to_name` argument stays disabled).
        labels_to_name_dict = self.get_category_description_from_csv()

        # `num_classes` / `labels_to_name` are intentionally not passed.
        self._dataset = slim.dataset.Dataset(
            data_sources = self._file_path_pattern,
            decoder = example_decoder,
            reader = make_reader,
            num_samples = self._num_examples,
            items_to_descriptions = None)

        scope_name = 'train_files' if self._is_training else 'validation_files'
        queue_capacity = self._min_after_dequeue + 3 * self._batch_size

        # With shuffle=True the provider shuffles examples itself
        # (per the original note: through ParallelReader's RandomShuffleQueue).
        self._data_provider = slim.dataset_data_provider.DatasetDataProvider(
            self._dataset,
            num_readers = self._num_reader,
            shuffle = True,
            num_epochs = self._num_epochs,
            common_queue_capacity = queue_capacity,
            common_queue_min = self._min_after_dequeue,
            scope = scope_name)

        return self._data_provider.get(['image', 'label'])
        

In [5]:
def preprocess_for_inception(input_image, is_training = True):
    """Run `inception_preprocessing.preprocess_image` at 299x299.

    Args:
        input_image: image handed through to the preprocessing module.
        is_training: forwarded unchanged to the preprocessing module.

    Returns:
        Whatever `inception_preprocessing.preprocess_image` returns.
    """
    # 299 is inception_v3.default_image_size.
    target_height = target_width = 299
    return inception_preprocessing.preprocess_image(
        input_image, target_height, target_width, is_training)

In [6]:
def cvt_csv2tfrecord(csv_path = None):
    """Build a mapping from category id to a dense, zero-based index.

    The index of a category is the position of its row in the CSV (header
    excluded). If an id appears more than once, the index of its last
    occurrence is kept.

    Args:
        csv_path: path of the category CSV whose first column holds the
            category id. Defaults to the module-level CATEGORY_NAME_PATH,
            which keeps existing zero-argument calls working.

    Returns:
        dict mapping each first-column value to its row index.
    """
    if csv_path is None:
        csv_path = CATEGORY_NAME_PATH
    rows = pd.read_csv(csv_path).values
    # Only the first column matters; the remaining columns are ignored here.
    return {row[0]: index for index, row in enumerate(rows)}

In [7]:
def one_hot_process(org_label, map_table, num_classes):
    """Turn a raw label into a one-hot vector of length `num_classes`.

    The label is converted to a string, looked up in `map_table` to get its
    class index, and that index is one-hot encoded along the last axis.
    """
    label_as_string = tf.as_string(org_label)
    class_index = map_table.lookup(label_as_string)
    return tf.one_hot(class_index, num_classes, axis=-1)

In [8]:
# Dedicated graph for the whole notebook; later cells re-enter it with
# `def_graph.as_default()` before creating any op.
def_graph = tf.Graph()
with def_graph.as_default() as graph:
    def train_step(input_examples, one_hot_labels):
        """Create the Inception v3 training ops for one batch.

        Args:
            input_examples: batch of images fed to `inception.inception_v3`.
            one_hot_labels: one-hot labels matching `input_examples`.

        Returns:
            Tuple `(train_op, global_step, metrics_op, variables_to_restore,
            predictions, lr, accuracy, total_loss)`.

        Side effects:
            Registers the scalar summaries 'losses/Total_Loss', 'accuracy'
            and 'learning_rate'.
        """
        with slim.arg_scope(inception.inception_v3_arg_scope()):
            # here logits is the pre-softmax activations
            logits, end_points = inception.inception_v3(
                input_examples,
                num_classes = NUM_CLASS,
                is_training = True)
            # We retrain for a different number of classes, so no other
            # Variables must be defined before get_variables_to_restore is called.
            
#             variables_to_exclude = []
#             #variables_to_exclude = ['InceptionV3/Logits', 'InceptionV3/AuxLogits']
#             for var in slim.get_model_variables():
#                 print(var.op.name)
#                 if var.op.name.strip().endswith('*Momentum'):
#                     print(var.op.name)
#                     variables_to_exclude.append(var)

#             variables = tf.contrib.framework.get_model_variables()
#             restore_variables = tf.contrib.framework.filter_variables(
#                 variables, include_patterns=None, exclude_patterns=['Momentum', 'momentum'])

            # Everything except the two classifier heads; returned to the caller.
            variables_to_restore = slim.get_variables_to_restore(exclude = ['InceptionV3/Logits', 'InceptionV3/AuxLogits'])
            #variables_to_restore_from_checkpoint = slim.get_variables_to_restore(exclude = variables_to_exclude)
            # Softmax cross-entropy on one-hot labels. The returned `loss` tensor is not
            # used directly below; the value that is optimised comes from get_total_loss().
            loss = tf.losses.softmax_cross_entropy(onehot_labels = one_hot_labels, logits = logits)
            total_loss = tf.losses.get_total_loss()    # obtain the regularization losses as well

            # Create the global step for monitoring the learning_rate and training.
            # The Supervisor would also create a global_step, so it is created in advance
            # here in order to feed it into exponential_decay.
            global_step = tf.train.get_or_create_global_step(graph = graph)

            # Exponentially decaying learning rate (staircase: drops every `decay_steps` steps).
            lr = tf.train.exponential_decay(
                learning_rate = initial_learning_rate,
                global_step = global_step,
                decay_steps = decay_steps,
                decay_rate = learning_rate_decay_factor,
                staircase = True)

            # Optimizer driven by the decaying learning rate.
            #optimizer = tf.train.AdamOptimizer(learning_rate = lr)
            optimizer = tf.train.MomentumOptimizer(learning_rate = lr, momentum=momentum)
            

            # Create the train_op.
            train_op = slim.learning.create_train_op(total_loss, optimizer, summarize_gradients=False)

            # Metrics. `predictions` holds class indices (argmax), not one-hot vectors.
            predictions = tf.argmax(end_points['Predictions'], 1)
            probabilities = end_points['Predictions']
            accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(predictions, tf.argmax(one_hot_labels, 1))
            metrics_op = tf.group(accuracy_update)


            # Scalar summaries to monitor; the caller merges them into one summary op.
            tf.summary.scalar('losses/Total_Loss', total_loss)
            tf.summary.scalar('accuracy', accuracy)
            tf.summary.scalar('learning_rate', lr)

            return train_op, global_step, metrics_op, variables_to_restore, predictions, lr, accuracy, total_loss

    def validation_step(input_examples, one_hot_labels):
        """Create the Inception v3 evaluation ops for one validation batch.

        The network is built with `is_training=False, reuse=True`, so the
        InceptionV3 variables must already exist (the notebook calls
        `train_step` first).

        Args:
            input_examples: batch of validation images.
            one_hot_labels: one-hot labels matching `input_examples`.

        Returns:
            Tuple `(metrics_op, accuracy, predictions, probabilities)`.

        Side effects:
            Registers the scalar summary 'validation/accuracy'.
        """
        with slim.arg_scope(inception.inception_v3_arg_scope()):
            # here logits is the pre-softmax activations
            logits, end_points = inception.inception_v3(
                input_examples,
                num_classes = NUM_CLASS,
                is_training=False, reuse=True)

            # Metrics. `predictions` holds class indices (argmax), not one-hot vectors.
            predictions = tf.argmax(end_points['Predictions'], 1)
            probabilities = end_points['Predictions']
            accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(predictions, tf.argmax(one_hot_labels, 1))
            metrics_op = tf.group(accuracy_update)

            # Scalar summary to monitor; the caller merges it into one summary op.
            tf.summary.scalar('validation/accuracy', accuracy)

            return metrics_op, accuracy, predictions, probabilities

In [9]:
with def_graph.as_default() as graph:
    def init_dataset(file_path_pattern, mapping_table, is_training = True):
        """Build the batched (images, one-hot labels) input pipeline.

        Args:
            file_path_pattern: pattern of the TFRecord files to read.
            mapping_table: lookup table used to turn raw labels into class indices.
            is_training: forwarded to the dataset and to preprocessing; also
                selects the batch op name and whether a smaller final batch
                is allowed.

        Returns:
            Tuple `(batch_images, batch_labels)`.
        """
        provider = MiniDataSet(file_path_pattern, CATEGORY_NAME_PATH, TOTAL_EXAMPLES, NUM_CLASS, is_training = is_training)
        raw_image, raw_label = provider.create_dataset()

        # Final per-example tensors fed to the network.
        network_image = preprocess_for_inception(raw_image, is_training)
        network_label = one_hot_process(raw_label, mapping_table, NUM_CLASS)

        batch_op_name = 'train_batch' if is_training else 'validation_batch'

        # Plain (non-shuffling) batching: the DatasetDataProvider already shuffles.
        image_batch, label_batch = tf.train.batch(
            [network_image, network_label],
            BATCH_SIZE,
            num_threads = INPUT_THREADS,
            capacity = 1000 + 3 * BATCH_SIZE,
            allow_smaller_final_batch = is_training,
            name = batch_op_name)

        return image_batch, label_batch

In [10]:
# Assemble the full graph (input pipelines, train and validation ops, savers)
# and run the training loop under a tf.train.Supervisor.
with def_graph.as_default() as graph:
    # Lookup table: category id (as a string) -> position in `mapping_strings`.
    # Ids that are not in the table map to index 0 (default_value=0).
    mapping_strings = tf.constant( [ str(key) for key in cvt_csv2tfrecord().keys() ] )
    mapping_table = tf.contrib.lookup.index_table_from_tensor(mapping=mapping_strings, default_value=0)
    batch_images, batch_labels = init_dataset(TRAIN_PATH + "output_file*.tfrecords", mapping_table)
    batch_val_images, batch_val_labels = init_dataset(VAL_PATH + "test_output_file*.tfrecords", mapping_table, False)
    with tf.device('/gpu:0'):
        # train_step must come first: validation_step builds the net with reuse=True.
        train_op, global_step, metrics_op, variables_to_restore, pred_op, lr, accuracy, total_loss = train_step(batch_images, batch_labels)
        val_metrics_op, val_accuracy, val_predictions, val_probabilities = validation_step(batch_val_images, batch_val_labels)
        real_val_label = tf.argmax(batch_val_labels, 1)

     # Summarize all gradients
#     for var in tf.trainable_variables():
#         print(var.name[:-2])
#         if 'InceptionV3/Conv2d_1a_3x3/weights' == var.name[:-2]:
#             tf.summary.tensor_summary(var.name[:-2], var)

    summary_op = tf.summary.merge_all()

    # Saver for the pretrained checkpoint. The optimizer was changed, so every
    # variable is restored except those whose name matches 'Momentum'.
    #pre_train_saver = tf.train.Saver(variables_to_restore)
    variables = slim.get_variables_to_restore()
    restore_from_pretrained = tf.contrib.framework.filter_variables(
        variables,
        include_patterns=None,
        exclude_patterns=['Momentum'])

    pre_train_saver = tf.train.Saver(restore_from_pretrained)

    def load_pretrain(sess):
        """Init function for the Supervisor: load the pretrained checkpoint.

        `sess` is the managed session passed in by the Supervisor.
        """
        pre_train_saver.restore(sess, PRETRAINED_MODEL_PATH)

    # No need to add local_variables_initializer / tables_initializer here:
    # the Supervisor runs them through its default local_init_op.
    # init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer())
    init_op = tf.group(tf.global_variables_initializer())

    # Supervisor configuration:
    # - init_fn is called _after_ the variables have been initialized by init_op.
    # - the default tf.train.Saver is used for ordinary save and restore.
    # - a checkpoint is saved every 24000 seconds (about 6.7 hours).
    # - summary_op=None: summaries are computed and written by this loop
    #   (see sv.summary_computed below) to save memory.
    # - global_step is not passed; the Supervisor finds it on its own.
    # - NOTE(review): when LOG_PATH already holds a checkpoint the Supervisor
    #   restores from it (the captured output shows this); init_op / init_fn are
    #   then presumably skipped - confirm against the Supervisor docs.
    sv = tf.train.Supervisor(logdir=LOG_PATH, init_fn = load_pretrain, init_op = init_op, summary_op = None, save_model_secs=24000, checkpoint_basename='inception_v3_model.ckpt')

    # Last loss / streaming accuracy recorded by the loop (refreshed every 10 steps).
    final_loss = 0.
    final_accuracy = 0.
    # Becomes False when the Supervisor asks the loop to stop early.
    training_state = True

    config = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)
    #config.gpu_options.allow_growth = True
    with sv.managed_session(config=config) as sess:
    #with sv.prepare_or_wait_for_session(config=tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)) as sess:

        # Here sess was either initialized from the pre-trained checkpoint or
        # recovered from a checkpoint saved in a previous run of this code.
        for step in range(int(num_steps_per_epoch * NUM_EPOCHES)):
            if sv.should_stop():
                tf_logging.info('Supervisor emit finished!')
                # Log the last recorded Python values. The previous code used
                # `loss`, which only exists inside train_step (NameError here),
                # and `accuracy`, which is a Tensor rather than a number.
                tf_logging.info('Current Loss: %s', final_loss)
                tf_logging.info('Current Accuracy: %s', final_accuracy)
                tf_logging.info('Saving current model to disk(maybe invalid).')
                training_state = False
                break

            start_time = time.time()
            if step % 1000 == 0:
                # Every 1000th step: train and also write the merged summaries.
                with tf.device('/gpu:0'):
                    _, _, _, summ = sess.run([train_op, global_step, metrics_op, summary_op])
                sv.summary_computed(sess, summ)
            else:
                if step % 50 == 0:
                    # Every 50th step (that is not a summary step): run one
                    # validation batch instead of a training step.
                    with tf.device('/gpu:0'):
                        _, val_acc, val_pred, val_prob, real_label = sess.run([val_metrics_op, val_accuracy, val_predictions, val_probabilities, real_val_label])
                    time_elapsed = time.time() - start_time
                    tf_logging.info('Validation Speed: {:5.3f}sec/batch'.format(time_elapsed))
                    tf_logging.info('Current Streaming ValAccuracy: {:5.3f}%'.format(val_acc*100.))
                    tf_logging.info('Real Label: {}'.format(real_label))
                    tf_logging.info('Pred Label: {}'.format(val_pred))

                else:
                    # Ordinary training step.
                    with tf.device('/gpu:0'):
                        _, total_step, _, cur_loss, cur_acc, cur_lr = sess.run([train_op, global_step, metrics_op, total_loss, accuracy, lr])
                    time_elapsed = time.time() - start_time
                    if step % 10 == 0:
                        final_loss = cur_loss
                        final_accuracy = cur_acc
                        tf_logging.info('Current Speed: {:5.3f}sec/batch'.format(time_elapsed))
                        tf_logging.info('Current Streaming Accuracy: {:5.3f}%'.format(cur_acc*100.))
                        tf_logging.info('Current Loss: {:5.3f}'.format(cur_loss))
                        tf_logging.info('Epoch %s/%s, Global Step: %s', int(total_step / num_steps_per_epoch + 1), NUM_EPOCHES, total_step)
                        tf_logging.info('Current Learning Rate: {}'.format(cur_lr))


        if training_state:
            # Log the final training loss and accuracy.
            tf_logging.info('Final Loss: %s', final_loss)
            tf_logging.info('Final Accuracy: %s', final_accuracy)
            # Once all the training has been done, save the log files and checkpoint model
            tf_logging.info('Finished training! Model saved.')
        # Save a checkpoint whether the loop finished or was stopped early.
        sv.saver.save(sess, sv.save_path, global_step = sv.global_step)
    

Instructions for updating:
Please switch to tf.train.get_or_create_global_step
INFO:tensorflow:Restoring parameters from /media/rs/0E06CD1706CD0127/Kapok/kaggle/logs/inception_v3_model.ckpt-1115408
INFO:tensorflow:Starting standard services.
INFO:tensorflow:Saving checkpoint to path /media/rs/0E06CD1706CD0127/Kapok/kaggle/logs/inception_v3_model.ckpt
INFO:tensorflow:Starting queue runners.
INFO:tensorflow:global_step/sec: 0
INFO:tensorflow:Current Speed: 0.860sec/batch
INFO:tensorflow:Current Streaming Accuracy: 59.531%
INFO:tensorflow:Current Loss: 2.646
INFO:tensorflow:Epoch 8/7, Global Step: 1115423
INFO:tensorflow:Current Learning Rate: 8.507064563900713e-08
INFO:tensorflow:Current Speed: 0.808sec/batch
INFO:tensorflow:Current Streaming Accuracy: 59.375%
INFO:tensorflow:Current Loss: 2.679
INFO:tensorflow:Epoch 8/7, Global Step: 1115433
INFO:tensorflow:Current Learning Rate: 8.507064563900713e-08


KeyboardInterrupt: 