In [1]:
# Running %env without any arguments
# lists all environment variables

# The line below sets the environment
# variable CUDA_VISIBLE_DEVICES
%env CUDA_VISIBLE_DEVICES = 1

import numpy as np
import pandas as pd
import io
import time
import bson                       # this is installed with the pymongo package
import matplotlib.pyplot as plt
from scipy.misc import imread, imsave
import tensorflow as tf
from tensorflow.python.platform import tf_logging
from tensorflow.contrib import layers
from tensorflow.contrib.training import add_gradients_summaries
from tensorflow.python.ops import math_ops
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.training import optimizer as tf_optimizer
from tensorflow.python.ops import variables as tf_variables

from tensorflow.python.ops import init_ops
from tensorflow.contrib import layers
from tensorflow.contrib.framework.python.ops import arg_scope
from tensorflow.contrib.layers.python.layers import layers as layers_lib
from tensorflow.python.ops import variable_scope

import os.path
import tensorflow.contrib.slim as slim
from tensorflow.contrib.slim.python.slim.nets import inception
import inception_preprocessing
import logging

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

env: CUDA_VISIBLE_DEVICES=1


In [2]:
# last count: 9817, roughly examples num: 10052608
# after resample count: 18456575

# '/media/rs/FC6CDC6F6CDC25E4/resample_dataset2/'
# total sampled examples num: 18456575

In [3]:
# ---------------------------------------------------------------------------
# Filesystem layout (absolute, machine-specific locations)
# ---------------------------------------------------------------------------
DATASET_PATH = '/media/rs/0E06CD1706CD0127/Kapok/kaggle/'
# ImageNet checkpoint used to warm-start the network.
# Alternative: DATASET_PATH + 'logs/before/inception_v3_model.ckpt-810491'
PRETRAINED_MODEL_PATH = DATASET_PATH + 'inception-v3/20160828/inception_v3.ckpt'
LOG_PATH = DATASET_PATH + 'logs_aux/'
# Training shards are read from the resampled dataset; the un-resampled
# split would be DATASET_PATH + 'Split1/Train/'.
TRAIN_PATH = '/media/rs/FC6CDC6F6CDC25E4/resample_dataset2/'
VAL_PATH = DATASET_PATH + 'Split1/Validation/'
TEST_PATH = DATASET_PATH + 'Test/'
CATEGORY_NAME_PATH = DATASET_PATH + 'category_names.csv'

# ---------------------------------------------------------------------------
# Batching
# ---------------------------------------------------------------------------
BATCH_SIZE = 128        # previously 256
# Effective batch size is BATCH_SIZE * ACCUMULATE_STEP.
ACCUMULATE_STEP = 4
INPUT_THREADS = 6

# ---------------------------------------------------------------------------
# Data description
# ---------------------------------------------------------------------------
IMAGE_WIDTH = 180
IMAGE_HEIGHT = 180
NUM_CLASS = 5270
LEVEL0_CLASS = 49
LEVEL1_CLASS = 483

# train examples num: 10051704 (18456575 after resampling; the total is
# deliberately left unchanged -- just watch global_step instead)
# total step: 157057
TOTAL_EXAMPLES = 10051704
# validation examples num: 2319624
VAL_EXAMPLES = 2319624

# ---------------------------------------------------------------------------
# Schedule
# ---------------------------------------------------------------------------
VAL_CHECK_FREQ = 50     # one validation batch every VAL_CHECK_FREQ loop steps
NUM_EPOCHES = 7
EPOCHES_OVER = 7
VAL_NUM_EPOCHES = int(NUM_EPOCHES/(VAL_CHECK_FREQ*VAL_EXAMPLES/TOTAL_EXAMPLES)) + 1

# ---------------------------------------------------------------------------
# Optimisation hyper-parameters (up to you to experiment)
# Earlier settings: initial_learning_rate 0.000003 / 0.00001 / 0.001,
# learning_rate_decay_factor 0.9
# ---------------------------------------------------------------------------
initial_learning_rate = 0.0001
stop_learning_rate = 0.000002       # floor applied by my_exponential_decay
learning_rate_decay_factor = 0.94
num_epochs_before_decay = 1
moving_average_decay = 0.9
momentum = 0.8

# Number of optimiser steps per epoch and between learning-rate decays.
num_steps_per_epoch = TOTAL_EXAMPLES / (BATCH_SIZE * ACCUMULATE_STEP) + 1
decay_steps = int(num_epochs_before_decay * num_steps_per_epoch / 6)

In [4]:
# Mirror everything TensorFlow logs (DEBUG and above) into a file next to the
# dataset, in addition to whatever handlers the logger already has.
log = logging.getLogger('tensorflow')
log.setLevel(logging.DEBUG)

# Timestamped, level-tagged record format for the file handler.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# FileHandler stores its target as an absolute path in `baseFilename`, so
# compare against the absolute form of our path.
log_file_path = os.path.abspath(DATASET_PATH + 'tensorflow_inception_160_train_aux.log')

# The 'tensorflow' logger is process-wide and survives cell re-execution.
# Unconditionally adding a handler (as before) made every re-run of this cell
# attach one more FileHandler, so each message was written to the log file
# two, three, ... times. Reuse the handler if it is already attached.
fh = next(
    (handler for handler in log.handlers
     if isinstance(handler, logging.FileHandler)
     and handler.baseFilename == log_file_path),
    None)
if fh is None:
    fh = logging.FileHandler(log_file_path)
    log.addHandler(fh)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

In [5]:
class CdiscountDataset(object):
    """Queue-based slim input pipeline over ZLIB-compressed TFRecord shards.

    The constructor only stores configuration; all TensorFlow ops are created
    by `create_dataset()`, which must therefore be called inside the graph the
    model is built in.

    Args:
        data_path: directory prefix of the shard files (concatenated as-is,
            so it is expected to end with a path separator).
        file_begin_match: filename prefix; `data_path + file_begin_match + '*'`
            is used as the data source pattern.
        label_mapping: object exposing `category_map`, `level0_table`,
            `level1_table`, `len_level0`, `len_level1` and `mapping_table`
            (see `LabelMapping`).
        num_examples: number of samples reported to `slim.dataset.Dataset`.
        num_classes: depth of the one-hot encoding of the main label.
        buffer_size: minimum fill of the shuffle queue; also the base of the
            batching queue capacity.
        batch_size: number of examples per batch.
        num_epochs: number of passes the data provider makes over the data.
        is_training: selects op/scope names and whether a smaller final batch
            is allowed.
    """

    def __init__(self, data_path, file_begin_match, label_mapping, num_examples, num_classes, buffer_size, batch_size, num_epochs, is_training):
        super(CdiscountDataset, self).__init__()
        # A glob pattern is handed to slim instead of an explicit file list.
        self._data_file_list = ''.join([data_path, file_begin_match, '*'])

        # Plain configuration values.
        self._num_examples = num_examples
        self._num_classes = num_classes
        self._batch_size = batch_size
        self._buffer_size = buffer_size
        self._num_epochs = num_epochs
        self._is_training = is_training

        # Label lookup helpers borrowed from the shared LabelMapping.
        self._category_map = label_mapping.category_map
        self._level0_table = label_mapping.level0_table
        self._level1_table = label_mapping.level1_table
        self._len_level0 = label_mapping.len_level0
        self._len_level1 = label_mapping.len_level1
        self._mapping_table = label_mapping.mapping_table

    def create_dataset(self):
        """Build the reading, decoding, preprocessing and batching ops.

        Returns:
            A 4-tuple `(batch_images, batch_labels, batch_labels_level0,
            batch_labels_level1)`; the three label tensors are one-hot encoded
            with depths `num_classes`, `len_level0` and `len_level1`.
        """
        record_options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)

        def make_reader():
            # slim instantiates one reader per parallel reader thread.
            return tf.TFRecordReader(options=record_options)

        keys_to_features = {
            'img_raw': tf.FixedLenFeature([], tf.string, default_value=''),
            'product_id': tf.FixedLenFeature([], tf.int64, default_value=tf.zeros([], dtype=tf.int64)),
            # 'format' is not stored in the TFRecords, so the default is always used.
            'format': tf.FixedLenFeature([], tf.string, default_value='jpg'),
            'category_id': tf.FixedLenFeature([], tf.int64, default_value=tf.zeros([], dtype=tf.int64)),
        }

        items_to_handlers = {
            # The Image handler decodes the encoded bytes automatically.
            'image': slim.tfexample_decoder.Image(image_key='img_raw', format_key='format'),
            'label': slim.tfexample_decoder.Tensor('category_id'),
        }

        decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers)

        self._dataset = slim.dataset.Dataset(
            data_sources=self._data_file_list,
            decoder=decoder,
            reader=make_reader,
            num_samples=self._num_examples,
            items_to_descriptions=None)

        queue_capacity = self._buffer_size + 4 * self._batch_size
        provider_scope = 'train_files' if self._is_training else 'validation_files'

        # DatasetDataProvider shuffles examples itself (shuffle=True), so the
        # batching stage below does not need to shuffle again.
        self._data_provider = slim.dataset_data_provider.DatasetDataProvider(
            self._dataset,
            num_readers=INPUT_THREADS,
            shuffle=True,
            num_epochs=self._num_epochs,
            common_queue_capacity=queue_capacity,
            common_queue_min=self._buffer_size,
            scope=provider_scope)

        org_image, org_label = self._data_provider.get(['image', 'label'])

        # Final image tensor fed to the network.
        image = preprocess_for_inception(org_image, self._is_training)

        # The main label goes through a string-keyed index table, the two
        # coarse levels through int64 hash tables.
        fine_label = tf.one_hot(
            self._mapping_table.lookup(tf.as_string(org_label)), self._num_classes, axis=-1)
        level0_label = tf.one_hot(
            self._level0_table.lookup(org_label), self._len_level0, axis=-1)
        level1_label = tf.one_hot(
            self._level1_table.lookup(org_label), self._len_level1, axis=-1)

        batch_name = 'train_batch' if self._is_training else 'validation_batch'
        batch_images, batch_labels, batch_labels_level0, batch_labels_level1 = tf.train.batch(
            [image, fine_label, level0_label, level1_label],
            self._batch_size,
            num_threads=INPUT_THREADS,
            capacity=queue_capacity,
            allow_smaller_final_batch=self._is_training,
            name=batch_name)

        return batch_images, batch_labels, batch_labels_level0, batch_labels_level1

In [6]:
def preprocess_for_inception(input_image, is_training = True):
    """Run the Inception preprocessing on one image at 160x160.

    Note that `is_training` is accepted but currently ignored: the training
    variant of the preprocessing is always requested (the last argument is
    hard-coded to True), including for validation images. The output size is
    fixed to 160x160 rather than inception_v3.default_image_size (299).
    """
    target_height, target_width = 160, 160
    preprocessed = inception_preprocessing.preprocess_image(
        input_image, target_height, target_width, True)
    return preprocessed

In [7]:
class LabelMapping(object):
    """Label lookup structures derived from the category-names CSV.

    The CSV is read with pandas; column 0 is used as the category id and
    columns 1 and 2 as the two coarser category levels. From it this class
    builds:

    * `category_map`: category id -> dense index (row order of the CSV),
    * `mapping_table`: TF index table from the *string* form of a category id
      to its dense index (unknown ids map to 0),
    * `level0_table` / `level1_table`: TF int64 hash tables from category id to
      the dense index of its level-0 / level-1 group (unknown ids map to 0),
    * `len_level0` / `len_level1`: number of distinct groups per level.

    Constructing an instance creates TensorFlow ops, so it must happen inside
    the graph that will use the tables.
    """

    def __init__(self, catogory_file_path):
        super(LabelMapping, self).__init__()
        self._category_level_csv = catogory_file_path
        (self._category_map,
         self._category_level0_map,
         self._category_level1_map,
         self._len_level0,
         self._len_level1) = self.cvt_csv2tfrecord()

        # index_table_from_tensor assigns indices by position, so order the ids
        # explicitly by their dense index instead of relying on dict iteration
        # order (which is only guaranteed to be insertion order on Python 3.7+).
        ordered_ids = sorted(self._category_map, key=self._category_map.get)
        self._mapping_strings = tf.constant([str(key) for key in ordered_ids])
        self._mapping_table = tf.contrib.lookup.index_table_from_tensor(mapping=self._mapping_strings, default_value=0)

        self._level0_table = self._build_level_table(self._category_level0_map)
        self._level1_table = self._build_level_table(self._category_level1_map)

    @staticmethod
    def _build_level_table(level_map):
        """Create an int64 -> int64 hash table (default 0) from a Python dict."""
        return tf.contrib.lookup.HashTable(
            tf.contrib.lookup.KeyValueTensorInitializer(
                list(level_map.keys()), list(level_map.values()), tf.int64, tf.int64),
            0)

    @property
    def category_map(self):
        return self._category_map
    @property
    def level0_table(self):
        return self._level0_table
    @property
    def level1_table(self):
        return self._level1_table
    @property
    def len_level0(self):
        return self._len_level0
    @property
    def len_level1(self):
        return self._len_level1
    @property
    def mapping_table(self):
        return self._mapping_table

    def _read_category_rows(self):
        """Load the category CSV and return its rows as a 2-D array."""
        return pd.read_csv(self._category_level_csv).values

    def cvt_csv2tfrecord(self):
        """Build the id -> index dictionaries from the category CSV.

        Returns:
            `(category_map, category_level0_map, category_level1_map,
            len_level0, len_level1)` where the three maps are dicts keyed by
            category id and the two lengths are the number of distinct level-0
            and level-1 groups.
        """
        # Parse the CSV once and share the rows with create_level_map
        # (previously the file was read twice).
        rows = self._read_category_rows()
        level0_map, level1_map = self.create_level_map(rows)

        category_map = dict()
        category_level0_map = dict()
        category_level1_map = dict()
        for index, row in enumerate(rows):
            category_id, level0, level1 = row[0], row[1], row[2]
            category_map[category_id] = index
            category_level0_map[int(category_id)] = level0_map[level0]
            category_level1_map[int(category_id)] = level1_map[level1]

        return category_map, category_level0_map, category_level1_map, len(level0_map), len(level1_map)

    def create_level_map(self, rows=None):
        """Assign a dense index to every distinct level-0 and level-1 value.

        Indices follow the order of first appearance in the CSV.

        Args:
            rows: optional pre-loaded CSV rows; when omitted the CSV is read
                from `self._category_level_csv`.

        Returns:
            `(level0_map, level1_map)`, two dicts mapping a level value to its
            dense index.
        """
        if rows is None:
            rows = self._read_category_rows()

        # A dict gives O(1) membership tests; the previous list-based
        # `not in` check was quadratic in the number of distinct values.
        level_maps = (dict(), dict())
        for row in rows:
            for column, level_map in zip((1, 2), level_maps):
                # len() is evaluated before insertion, i.e. the next free index.
                level_map.setdefault(row[column], len(level_map))
        return level_maps[0], level_maps[1]

In [8]:
# def my_create_train_op(total_loss, optimizer, summarize_gradients = False):
#     global_step = tf.train.get_or_create_global_step()

#     update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))

#     # Make sure update_ops are computed before total_loss.
#     if update_ops:
#         with ops.control_dependencies(update_ops):
#             barrier = control_flow_ops.no_op(name='update_barrier')
#     total_loss = control_flow_ops.with_dependencies([barrier], total_loss)

#     variables_to_train = tf_variables.trainable_variables()

#     # Create the gradients. Note that apply_gradients adds the gradient
#     # computation to the current graph.
#     grads = optimizer.compute_gradients(
#       total_loss,
#       variables_to_train,
#       gate_gradients=tf_optimizer.Optimizer.GATE_OP,
#       aggregation_method=None,
#       colocate_gradients_with_ops=False)

#     # Summarize gradients.
#     if summarize_gradients:
#         with ops.name_scope('summarize_grads'):
#             add_gradients_summaries(grads)

#     # Create gradient updates.
#     grad_updates = optimizer.apply_gradients(grads, global_step=global_step)

#     with ops.name_scope('train_op'):
#         # Make sure total_loss is valid.
#         total_loss = array_ops.check_numerics(total_loss, 'LossTensor is inf or nan')

#     # Ensure the train_tensor computes grad_updates.
#     train_op = control_flow_ops.with_dependencies([grad_updates], total_loss)

#     # Add the operation used for training to the 'train_op' collection
#     train_ops = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
#     if train_op not in train_ops:
#         train_ops.append(train_op)

#     return train_op
def my_create_train_op(total_loss, optimizer, summarize_gradients = False, accumulate_factor=None):
    """Create a train op, optionally with gradient accumulation.

    Modelled on slim's `create_train_op`, extended so that gradients from
    several mini-batches can be summed into side variables and applied in one
    optimizer step.

    Args:
        total_loss: scalar loss tensor to minimise.
        optimizer: a `tf.train.Optimizer`.
        summarize_gradients: when True, add summaries for the gradients that
            are applied.
        accumulate_factor: tensor/scalar the accumulated gradient is
            multiplied by before being applied (e.g. 1 / number of accumulated
            batches). When None, no accumulation takes place and the freshly
            computed gradients are applied directly.

    Returns:
        `(train_op, accum_ops, zero_ops)`:
        * `train_op` evaluates to `total_loss` after applying the gradient
          update (and incrementing the global step);
        * `accum_ops` is a list of ops adding the current gradients to the
          accumulators;
        * `zero_ops` is a list of ops resetting the accumulators to zero.
        Without accumulation both lists contain a single no-op so callers can
        still pass them to `sess.run`.
    """
    global_step = tf.train.get_or_create_global_step()

    update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))

    # Make sure update_ops are computed before total_loss.
    # The dependency is attached inside the guard: `barrier` only exists when
    # there are update ops (it used to be referenced unconditionally, which
    # raised NameError for an empty UPDATE_OPS collection).
    if update_ops:
        with ops.control_dependencies(update_ops):
            barrier = control_flow_ops.no_op(name='update_barrier')
        total_loss = control_flow_ops.with_dependencies([barrier], total_loss)

    variables_to_train = tf_variables.trainable_variables()
    accumulate = accumulate_factor is not None

    if accumulate:
        # One non-trainable accumulator per trainable variable, initialised
        # with zeros of the variable's shape, plus the ops that reset them.
        accum_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False) for tv in variables_to_train]
        zero_ops = [tv.assign(tf.zeros_like(tv)) for tv in accum_vars]

    # List of (gradient, variable) pairs, in the order of variables_to_train.
    grads = optimizer.compute_gradients(
      total_loss,
      variables_to_train,
      gate_gradients=tf_optimizer.Optimizer.GATE_OP,
      aggregation_method=None,
      colocate_gradients_with_ops=False)

    if accumulate:
        total_loss = array_ops.check_numerics(total_loss, 'LossTensor is inf or nan')
        # accum_vars and grads share the order of variables_to_train, so they
        # can be paired by index. Accumulation only runs once the loss has
        # passed the numerics check.
        with tf.control_dependencies([total_loss]):
            accum_ops = [accum_vars[i].assign_add(gv[0]) for i, gv in enumerate(grads) if gv[0] is not None]

        # The update applies the scaled accumulator instead of the raw gradient.
        # NOTE(review): nothing orders this read of accum_vars after accum_ops;
        # a caller fetching train_op and accum_ops in the same sess.run may
        # apply the accumulator before or after the final addition -- confirm
        # whether that is acceptable.
        grads_to_apply = [(tf.multiply(accum_vars[i], accumulate_factor), gv[1]) for i, gv in enumerate(grads) if gv[0] is not None]
    else:
        # No accumulation: hand back harmless no-ops. (This branch used to
        # reference the non-existent `tf.ops.no_op` and crashed.)
        accum_ops = [control_flow_ops.no_op(name='no_accumulate')]
        zero_ops = [control_flow_ops.no_op(name='no_accumulate_reset')]
        grads_to_apply = grads

    # Summarize gradients.
    if summarize_gradients:
        with ops.name_scope('summarize_grads'):
            add_gradients_summaries(grads_to_apply)

    # Create gradient updates.
    grad_updates = optimizer.apply_gradients(grads_to_apply, global_step=global_step)

    with ops.name_scope('train_op'):
        if not accumulate:
            # Make sure total_loss is valid (already checked above otherwise).
            total_loss = array_ops.check_numerics(total_loss, 'LossTensor is inf or nan')
        # Ensure the train_tensor computes grad_updates.
        train_op = control_flow_ops.with_dependencies([grad_updates], total_loss)

    # Add the operation used for training to the 'train_op' collection
    train_ops = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
    if train_op not in train_ops:
        train_ops.append(train_op)

    return train_op, accum_ops, zero_ops

In [9]:
def_graph = tf.Graph()
with def_graph.as_default() as graph:
    trunc_normal = lambda stddev: init_ops.truncated_normal_initializer(0.0, stddev)
    def reduced_kernel_size_for_small_input(input_tensor, kernel_size):
        """Clamp a 2-element kernel size to dimensions 1 and 2 of the input.

        If either of those two static dimensions is unknown (None), the
        requested `kernel_size` is returned unchanged; otherwise a new list
        with each entry limited to the corresponding input dimension.
        """
        dims = input_tensor.get_shape().as_list()
        if dims[1] is not None and dims[2] is not None:
            return [min(dims[1], kernel_size[0]), min(dims[2], kernel_size[1])]
        return kernel_size
    def my_exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
                      staircase=False, name=None):
        """Exponential learning-rate decay with a lower bound.

        Computes `learning_rate * decay_rate ** (global_step / decay_steps)`
        (the exponent is floored when `staircase` is True) and never returns
        less than the module-level `stop_learning_rate`.

        Raises:
            ValueError: if `global_step` is None.
        """
        if global_step is None:
            raise ValueError("global_step is required for exponential_decay.")

        scope_inputs = [learning_rate, global_step, decay_steps, decay_rate]
        with ops.name_scope(name, "ExponentialDecay", scope_inputs) as scope_name:
            base_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
            dtype = base_rate.dtype

            # Bring every operand to the dtype of the learning rate.
            step = math_ops.cast(global_step, dtype)
            steps_per_decay = math_ops.cast(decay_steps, dtype)
            rate = math_ops.cast(decay_rate, dtype)

            exponent = step / steps_per_decay
            if staircase:
                exponent = math_ops.floor(exponent)

            decayed = math_ops.multiply(base_rate, math_ops.pow(rate, exponent),
                                        name=scope_name)
            floor_rate = tf.constant(stop_learning_rate, dtype=dtype)
            return tf.maximum(decayed, floor_rate)

    def train_step(input_examples, one_hot_labels, level0_labels, level1_labels):   
        """Build the training graph: network, losses, optimizer and metrics.

        The statement order matters: `variables_to_restore` is collected right
        after the Inception-v3 network is created and before the custom
        auxiliary heads, the optimizer and the moving averages add their own
        variables.

        Args:
            input_examples: batch of input images fed to inception_v3.
            one_hot_labels: one-hot labels for the main NUM_CLASS classifier.
            level0_labels: one-hot labels for the auxiliary LEVEL0_CLASS head.
            level1_labels: one-hot labels for the auxiliary LEVEL1_CLASS head.

        Returns:
            The tuple `(train_op, accum_ops, zero_ops, global_step, metrics_op,
            variables_to_restore, predictions, lr, accuracy, total_loss)`.
        """
        with slim.arg_scope(inception.inception_v3_arg_scope()):
            # here logits is the pre-softmax activations
            logits, end_points = inception.inception_v3(
                input_examples,
                num_classes = NUM_CLASS,
                is_training = True)

            # we retrain for a different number of classes,
            # so don't define any Variables before get_variables_to_restore
            
#             variables_to_exclude = []
#             #variables_to_exclude = ['InceptionV3/Logits', 'InceptionV3/AuxLogits']
#             for var in slim.get_model_variables():
#                 print(var.op.name)
#                 if var.op.name.strip().endswith('*Momentum'):
#                     print(var.op.name)
#                     variables_to_exclude.append(var)

#             variables = tf.contrib.framework.get_model_variables()
#             restore_variables = tf.contrib.framework.filter_variables(
#                 variables, include_patterns=None, exclude_patterns=['Momentum', 'momentum'])

            # Create the global step for monitoring the learning_rate and training.
            # The supervisor would also create a global_step, so we create it in advance in order to feed it into the learning-rate decay.
            global_step = tf.train.get_or_create_global_step(graph = graph)
            
            #variables_to_restore = slim.get_variables_to_restore()
            # Everything created so far except the two classifier heads.
            variables_to_restore = slim.get_variables_to_restore(exclude = ['InceptionV3/Logits', 'InceptionV3/AuxLogits'])
            #variables_to_restore_from_checkpoint = slim.get_variables_to_restore(exclude = variables_to_exclude)
            # Softmax cross-entropy on one-hot labels with label smoothing.
            # `loss` is not referenced by name again; the total is obtained from tf.losses.get_total_loss() below.
            loss = tf.losses.softmax_cross_entropy(onehot_labels = one_hot_labels, logits = logits, label_smoothing = 0.1)
            # add my custom AuxLogits
            with arg_scope(
                [layers_lib.batch_norm, layers_lib.dropout], is_training=True):
                # Auxiliary Head logits
                with arg_scope(
                  [layers.conv2d, layers_lib.max_pool2d, layers_lib.avg_pool2d],
                      stride=1,
                      padding='SAME'):
                    # Level-1 head (LEVEL1_CLASS outputs) on top of the 'Mixed_6e' end point.
                    aux_logits1 = end_points['Mixed_6e']
                    with variable_scope.variable_scope('AuxLogits1'):
                        aux_logits1 = layers_lib.avg_pool2d(
                            aux_logits1, [5, 5],
                            stride=3,
                            padding='VALID',
                            scope='AvgPool_1a_5x5')
                        aux_logits1 = layers.conv2d(
                            aux_logits1, 128, [1, 1], scope='Conv2d_1b_1x1')

                        # Shape of feature map before the final layer.
                        kernel_size = reduced_kernel_size_for_small_input(aux_logits1, [5, 5])
                        aux_logits1 = layers.conv2d(
                            aux_logits1,
                            512,
                            kernel_size,
                            weights_initializer=trunc_normal(0.01),
                            padding='VALID',
                            scope='Conv2d_2a_{}x{}'.format(*kernel_size))
                        # Linear 1x1 projection to the class logits (no activation, no normalizer).
                        aux_logits1 = layers.conv2d(
                            aux_logits1,
                            LEVEL1_CLASS, [1, 1],
                            activation_fn=None,
                            normalizer_fn=None,
                            weights_initializer=trunc_normal(0.001),
                            scope='Conv2d_2b_1x1')
           
                        # Drop axes 1 and 2 so the logits can be fed to the loss.
                        end_points['AuxLogits1'] = array_ops.squeeze(
                                aux_logits1, [1, 2], name='SpatialSqueeze')
                        
                    # Level-0 head (LEVEL0_CLASS outputs) on top of the 'Mixed_5d' end point;
                    # same layout as the level-1 head with narrower layers (32 / 64 channels).
                    aux_logits0 = end_points['Mixed_5d']
                    with variable_scope.variable_scope('AuxLogits0'):
                        aux_logits0 = layers_lib.avg_pool2d(
                            aux_logits0, [5, 5],
                            stride=3,
                            padding='VALID',
                            scope='AvgPool_1a_5x5')
                        aux_logits0 = layers.conv2d(
                            aux_logits0, 32, [1, 1], scope='Conv2d_1b_1x1')
                        
                        # Shape of feature map before the final layer.
                        kernel_size = reduced_kernel_size_for_small_input(aux_logits0, [5, 5])
                        aux_logits0 = layers.conv2d(
                            aux_logits0,
                            64,
                            kernel_size,
                            weights_initializer=trunc_normal(0.01),
                            padding='VALID',
                            scope='Conv2d_2a_{}x{}'.format(*kernel_size))
                        
                        aux_logits0 = layers.conv2d(
                            aux_logits0,
                            LEVEL0_CLASS, [1, 1],
                            activation_fn=None,
                            normalizer_fn=None,
                            weights_initializer=trunc_normal(0.001),
                            scope='Conv2d_2b_1x1')
                        
                        end_points['AuxLogits0'] = array_ops.squeeze(
                                aux_logits0, [1, 2], name='SpatialSqueeze')
        
            # Auxiliary losses: weight 0.01 for level 0 and 0.1 for level 1, label smoothing 0.2.
            # Previously tried weights: 0.02 (level 0) and 0.15 (level 1).
            #loss_level0 = tf.losses.softmax_cross_entropy(onehot_labels = level0_labels, logits = end_points['AuxLogits0'], weights=0.02, label_smoothing = 0.2)
            #loss_level1 = tf.losses.softmax_cross_entropy(onehot_labels = level1_labels, logits = end_points['AuxLogits1'], weights=0.15, label_smoothing = 0.2)
            loss_level0 = tf.losses.softmax_cross_entropy(onehot_labels = level0_labels, logits = end_points['AuxLogits0'], weights=0.01, label_smoothing = 0.2)
            loss_level1 = tf.losses.softmax_cross_entropy(onehot_labels = level1_labels, logits = end_points['AuxLogits1'], weights=0.1, label_smoothing = 0.2)

            #aux_loss = tf.losses.softmax_cross_entropy(onehot_labels = one_hot_labels, logits = end_points['AuxLogits'], weights=0.2)
            total_loss = tf.losses.get_total_loss()    # obtain the regularization losses as well
           
            # Exponentially decaying learning rate (staircase), bounded below inside my_exponential_decay.
            lr = my_exponential_decay(#tf.train.exponential_decay(
                learning_rate = initial_learning_rate,
                global_step = global_step,
                decay_steps = decay_steps,
                decay_rate = learning_rate_decay_factor,
                staircase = True)

            #Now we can define the optimizer that takes on the learning rate
            optimizer = tf.train.AdamOptimizer(learning_rate = lr)
            #optimizer = tf.train.MomentumOptimizer(learning_rate = lr, momentum=momentum)
            
            # Maintain exponential moving averages of all model variables.
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(moving_average_decay, global_step)
            # Register the averaging op as an update op, in addition to the default updates;
            # my_create_train_op reads the UPDATE_OPS collection.
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variable_averages.apply(moving_average_variables))

            # Create the train_op with gradient accumulation; accumulated
            # gradients are scaled by 1/ACCUMULATE_STEP before being applied.
            accumulate_factor = tf.constant([1./ACCUMULATE_STEP], name='accumulate_factor')
            train_op, accum_ops, zero_ops = my_create_train_op(total_loss, optimizer, False, accumulate_factor)
            # Non-accumulating alternative:
            #train_op = slim.learning.create_train_op(total_loss, optimizer, summarize_gradients=False)

            # Metrics: predictions are class indices (not one-hot encoded).
            predictions = tf.argmax(end_points['Predictions'], 1)
            # `probabilities` is not part of the return value of this function.
            probabilities = end_points['Predictions']
            accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(predictions, tf.argmax(one_hot_labels, 1), name='train_accuracy')
            metrics_op = tf.group(accuracy_update)


            # Scalar summaries; the caller merges them with tf.summary.merge_all().
            tf.summary.scalar('losses/Total_Loss', total_loss)
            tf.summary.scalar('accuracy', accuracy)
            tf.summary.scalar('learning_rate', lr)

            return train_op, accum_ops, zero_ops, global_step, metrics_op, variables_to_restore, predictions, lr, accuracy, total_loss

    def validation_step(input_examples, one_hot_labels):
        """Build the evaluation graph, sharing weights with the training network.

        The network is instantiated with `reuse=True` and `is_training=False`,
        so it must be called after the training network has been created.

        Returns:
            `(metrics_op, accuracy, predictions, probabilities)`: the op that
            updates the streaming accuracy, its current value, the predicted
            class indices and the class probabilities.
        """
        with slim.arg_scope(inception.inception_v3_arg_scope()):
            # Only the end points are needed here; the raw logits are unused.
            _, end_points = inception.inception_v3(
                input_examples,
                num_classes=NUM_CLASS,
                is_training=False,
                reuse=True)

            # Predicted class indices (not one-hot encoded).
            predictions = tf.argmax(end_points['Predictions'], 1)
            probabilities = end_points['Predictions']

            true_classes = tf.argmax(one_hot_labels, 1)
            accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
                predictions, true_classes, name='val_accuracy')
            metrics_op = tf.group(accuracy_update)

            # Expose the running validation accuracy as a summary.
            tf.summary.scalar('validation/accuracy', accuracy)

            return metrics_op, accuracy, predictions, probabilities

In [10]:
# Training driver: builds the input pipelines and both model graphs inside
# def_graph, then runs the Supervisor-managed training loop with gradient
# accumulation and periodic single-batch validation.
with def_graph.as_default() as graph:   
    label_mapping = LabelMapping(CATEGORY_NAME_PATH)
    train_dataset = CdiscountDataset(TRAIN_PATH, 'output_file', label_mapping, TOTAL_EXAMPLES, NUM_CLASS, 8000, BATCH_SIZE, NUM_EPOCHES, True)
    val_dataset = CdiscountDataset(VAL_PATH, 'test_output_file', label_mapping, VAL_EXAMPLES, NUM_CLASS, 2000, BATCH_SIZE, VAL_NUM_EPOCHES, False)

    batch_images, batch_labels, batch_level0_labels, batch_level1_labels = train_dataset.create_dataset()
    batch_val_images, batch_val_labels, batch_val_level0_labels, batch_val_level1_labels = val_dataset.create_dataset()
    with tf.device('/gpu:0'):
        train_op, accum_op, zero_op, global_step, metrics_op, variables_to_restore, pred_op, lr, accuracy, total_loss = train_step(batch_images, batch_labels, batch_level0_labels, batch_level1_labels)
        val_metrics_op, val_accuracy, val_predictions, val_probabilities = validation_step(batch_val_images, batch_val_labels)
        real_val_label = tf.argmax(batch_val_labels, 1)
        
    # Op to reset the global step; only used by the commented-out call below.
    global_step_zero = global_step.assign(tf.zeros_like(global_step))
    
    summary_op = tf.summary.merge_all()
    # Create a saver that restores only the pre-trained variables.
    # The optimizer was changed, so all parameters are restored from the pretrained model.
    #pre_train_saver = tf.train.Saver(variables_to_restore)
    
    variables = slim.get_variables_to_restore()
    restore_from_pretrained = tf.contrib.framework.filter_variables(
        variables,
        include_patterns=None,
        exclude_patterns=['ExponentialMovingAverage', 'accumulate_factor', 'Momentum', 'InceptionV3/Logits', 'InceptionV3/AuxLogits', 'InceptionV3/AuxLogits0', 'InceptionV3/AuxLogits1'])
    
    restore_from_checkpoint = tf.contrib.framework.filter_variables(
        variables,
        include_patterns=None,
        exclude_patterns=['train_accuracy', 'val_accuracy'])

    checkpoint_saver = tf.train.Saver(restore_from_checkpoint)
    
    pre_train_saver = tf.train.Saver(variables_to_restore)
    # Define an init function that loads the pretrained checkpoint.
    # sess is the managed session passed by Supervisor
    def load_pretrain(sess):
        pre_train_saver.restore(sess, PRETRAINED_MODEL_PATH)

    # no need to specify local_variables_initializer and tables_initializer, Supervisor will do this via default local_init_op
    # init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer())
    init_op = tf.group(tf.global_variables_initializer())
    # Pass the init function to the supervisor.
    # - The init function is called _after_ the variables have been initialized by running the init_op.
    # - use default tf.Saver() for ordinary save and restore
    # - save a checkpoint every 8000 seconds (about 2.2 hours)
    # - manage summary in current process by ourselves for memory saving
    # - no need to specify global_step, supervisor will find it automatically
    # - initialize order: checkpoint -> local_init_op -> init_op -> init_func
    sv = tf.train.Supervisor(logdir=LOG_PATH, init_fn = load_pretrain, init_op = init_op, summary_op = None, save_model_secs=8000, checkpoint_basename='inception_v3_model.ckpt')
    
    # Last loss/accuracy values fetched from the graph (refreshed every 10 steps).
    final_loss = 0.
    final_accuracy = 0.
    training_state = True

    config = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)
    #config.gpu_options.allow_growth = True
    with sv.managed_session(config=config) as sess:
    #with sv.prepare_or_wait_for_session(config=tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)) as sess:

        #sess.run(global_step_zero)
        # Here sess was either initialized from the pre-trained-checkpoint or
        # recovered from a checkpoint saved in a previous run of this code.
        for step in range(int(num_steps_per_epoch * NUM_EPOCHES)):       
            if sv.should_stop():
                tf_logging.info('Supervisor emit finished!')
                # Log the most recently fetched values. The previous code
                # referenced the undefined name `loss` (NameError) and the
                # `accuracy` tensor object rather than its value.
                tf_logging.info('Current Loss: %s', final_loss)
                tf_logging.info('Current Accuracy: %s', final_accuracy)
                tf_logging.info('Saving current model to disk(maybe invalid).')
                training_state = False
                break

            start_time = time.time()

            if step % 1000 == 0:
                summ = sess.run(summary_op)
                sv.summary_computed(sess, summ)
                if step > EPOCHES_OVER * num_steps_per_epoch:
                    raise StopIteration("over epoches reached.")

            if step % VAL_CHECK_FREQ == 0:
                # Every VAL_CHECK_FREQ-th step evaluates one validation batch
                # instead of training.
                with tf.device('/gpu:0'):
                    _, val_acc, val_pred, val_prob, real_label = sess.run([val_metrics_op, val_accuracy, val_predictions, val_probabilities, real_val_label])
                time_elapsed = time.time() - start_time
                tf_logging.info('Validation Speed: {:5.3f}sec/batch'.format(time_elapsed))
                tf_logging.info('Current Streaming ValAccuracy: {:5.3f}%'.format(val_acc*100.))
                tf_logging.info('Real Label: {}'.format(real_label))
                tf_logging.info('Pred Label: {}'.format(val_pred))

            else:
                with tf.device('/gpu:0'):
                    # accumulate gradient to get bigger batch_size:
                    # reset the accumulators, accumulate ACCUMULATE_STEP - 1
                    # batches, then accumulate the last batch together with
                    # the optimizer update.
                    sess.run(zero_op)
                    for _ in range(1, ACCUMULATE_STEP):
                        sess.run([accum_op, total_loss])
                    _, _, _, cur_loss, cur_acc, total_step, cur_lr = sess.run([train_op, accum_op, metrics_op, total_loss, accuracy, global_step, lr])
#                     sess.run([train_op])
                time_elapsed = time.time() - start_time

                if step % 10 == 0:
                    final_loss = cur_loss
                    final_accuracy = cur_acc
                    tf_logging.info('Current Speed: {:5.3f}sec/batch'.format(time_elapsed))
                    tf_logging.info('Current Streaming Accuracy: {:5.3f}%'.format(cur_acc*100.))
                    tf_logging.info('Current Loss: {:5.3f}'.format(cur_loss))
                    tf_logging.info('Epoch %s/%s, Global Step: %s', int(total_step / num_steps_per_epoch + 1), NUM_EPOCHES, total_step)
                    tf_logging.info('Current Learning Rate: {}'.format(cur_lr))
      
        if training_state:
            #We log the final training loss and accuracy
            tf_logging.info('Final Loss: %s', final_loss)
            tf_logging.info('Final Accuracy: %s', final_accuracy)
            # Once all the training has been done, save the log files and checkpoint model
            tf_logging.info('Finished training! Model saved.')
        # A checkpoint is written in both cases (normal completion and early stop).
        sv.saver.save(sess, sv.save_path, global_step = sv.global_step)
    

INFO:tensorflow:Restoring parameters from /media/rs/0E06CD1706CD0127/Kapok/kaggle/logs_aux/inception_v3_model.ckpt-70322
INFO:tensorflow:Starting standard services.
INFO:tensorflow:Saving checkpoint to path /media/rs/0E06CD1706CD0127/Kapok/kaggle/logs_aux/inception_v3_model.ckpt
INFO:tensorflow:Starting queue runners.
INFO:tensorflow:global_step/sec: 0
INFO:tensorflow:Current Speed: 2.421sec/batch
INFO:tensorflow:Current Streaming Accuracy: 27.431%
INFO:tensorflow:Current Loss: 5.632
INFO:tensorflow:Epoch 2/7, Global Step: 70335
INFO:tensorflow:Current Learning Rate: 5.06298165419139e-05
INFO:tensorflow:Current Speed: 2.361sec/batch
INFO:tensorflow:Current Streaming Accuracy: 28.002%
INFO:tensorflow:Current Loss: 5.850
INFO:tensorflow:Epoch 2/7, Global Step: 70345
INFO:tensorflow:Current Learning Rate: 5.06298165419139e-05


KeyboardInterrupt: 