In [1]:
# Running %env without any arguments
# lists all environment variables

# The line below sets the environment
# variable CUDA_VISIBLE_DEVICES
%env CUDA_VISIBLE_DEVICES = 0

import numpy as np
import pandas as pd
from datetime import datetime
import io
import time
import bson                       # this is installed with the pymongo package
import matplotlib.pyplot as plt
from scipy.misc import imread, imsave, imshow
import tensorflow as tf
from tensorflow.python.platform import tf_logging
from tensorflow.contrib import layers
from tensorflow.contrib.training import add_gradients_summaries
from tensorflow.python.ops import math_ops
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.training import optimizer as tf_optimizer
from tensorflow.python.ops import variables as tf_variables
import os.path
import tensorflow.contrib.slim as slim
import inception_preprocessing
from tensorflow.contrib.slim.python.slim.nets import inception
import logging
import resnet2

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
%matplotlib inline
# Notebook-wide matplotlib defaults: figure size, un-smoothed image rendering
# and a grayscale colormap for images.
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

env: CUDA_VISIBLE_DEVICES=0


In [2]:
# ---- Filesystem locations -------------------------------------------------
# Root directory holding the checkpoints, logs, category CSV and input records.
DATASET_PATH = '/media/rs/0E06CD1706CD0127/Kapok/kaggle/'
# Checkpoints restored before scoring (one per supported model).
RESNET_MODEL_PATH = os.path.join(DATASET_PATH, 'Resnet/logs101-new/resnet101_v2_model.ckpt-216292')
INCEPTION_MODEL_PATH = os.path.join(DATASET_PATH, 'logs_aux/inception_v3_model.ckpt-47255')
# Supervisor log directory and the directory with the input TFRecord files.
LOG_PATH = os.path.join(DATASET_PATH, 'Resnet/temp/')
TRAIN_PATH = os.path.join(DATASET_PATH, 'Split1/Train/')
# Destination directories for the selected hard examples (one per model).
RESNET_OUTPUT_TRAIN_PATH = '/media/rs/FC6CDC6F6CDC25E4/ResnetHardTrain/'
INCEPTION_OUTPUT_TRAIN_PATH = '/media/rs/FC6CDC6F6CDC25E4/InceptionHardTrain/'
# CSV read by LabelMapping (category id plus its level names).
CATEGORY_NAME_PATH = os.path.join(DATASET_PATH, 'category_names.csv')

# ---- Data / model dimensions ----------------------------------------------
BATCH_SIZE = 256
IMAGE_WIDTH = 180
IMAGE_HEIGHT = 180
NUM_CLASS = 5270
LEVEL0_CLASS = 49
LEVEL1_CLASS = 483

# ---- Run length and input parallelism -------------------------------------
TOTAL_EXAMPLES = 10051704
# One step per batch; the "+ 1" accounts for a trailing partial batch.
NUM_STEPS = TOTAL_EXAMPLES // BATCH_SIZE + 1
INPUT_THREADS = 12

# ---- Hard-example mining parameters ----------------------------------------
# Decay handed to tf.train.ExponentialMovingAverage when picking restore names.
moving_average_decay = 0.96
# An example counts as "hard" when max_prob / true_label_prob exceeds this value.
hard_example_thres = 3.
# Number of output TFRecord files the selected examples are spread over.
out_file_num = 600

# Which network scores the examples: 'resnet' or 'inception'.
MODEL_TO_RUN = 'resnet'

# Create both hard-example output directories up front if they are missing.
if not os.path.exists(RESNET_OUTPUT_TRAIN_PATH):
    os.makedirs(RESNET_OUTPUT_TRAIN_PATH)
if not os.path.exists(INCEPTION_OUTPUT_TRAIN_PATH):
    os.makedirs(INCEPTION_OUTPUT_TRAIN_PATH)

In [3]:
# Mirror everything TensorFlow logs (DEBUG and above) into a file under DATASET_PATH.
log = logging.getLogger('tensorflow')
log.setLevel(logging.DEBUG)

# Timestamped record layout used for the log file.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# logging.FileHandler stores the absolute path in `baseFilename`, so compare against that.
log_file_path = os.path.abspath(DATASET_PATH + 'tensorflow_resnet_hard_example.log')

# Re-running this cell must not attach a second handler for the same file:
# that would duplicate every record and leak a file descriptor per run.
fh = None
for existing_handler in log.handlers:
    if isinstance(existing_handler, logging.FileHandler) and existing_handler.baseFilename == log_file_path:
        fh = existing_handler
        break

if fh is None:
    # First run in this kernel: create the file handler, which logs even debug messages.
    fh = logging.FileHandler(log_file_path)
    log.addHandler(fh)

fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

In [4]:
def preprocess_for_inception(input_image, is_training = False):
    """Run `inception_preprocessing.preprocess_image` on one image with a fixed 160x160 size.

    Args:
        input_image: image tensor handed straight to the preprocessing module.
        is_training: forwarded unchanged; presumably toggles train-time
            augmentation inside `inception_preprocessing` (confirm there).

    Returns:
        Whatever `inception_preprocessing.preprocess_image` returns.
    """
    # Both size arguments are 160; they are passed positionally after the image.
    output_side = 160
    return inception_preprocessing.preprocess_image(input_image, output_side, output_side, is_training)

In [5]:
def _int64_feature(value):
    """Wrap a single integer in a one-element `tf.train.Feature` int64 list."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def _bytes_feature(value):
    """Wrap a single bytes value in a one-element `tf.train.Feature` bytes list."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

In [6]:
class LabelMapping(object):
    """Builds label lookups from the category CSV.

    The CSV is read with pandas; column 0 is the category id and columns 1 and 2
    are the names of its two enclosing levels. From it the class derives:

    * ``category_map``: category id -> row index in the CSV,
    * ``mapping_table``: TF lookup from the category id rendered as a string to an index,
    * ``level0_table`` / ``level1_table``: TF hash tables from the integer
      category id to the index of its level-0 / level-1 name,
    * ``len_level0`` / ``len_level1``: number of distinct names per level.

    The constructor creates TF ops, so it must run inside the target graph.
    """

    def __init__(self, catogory_file_path):
        """Parse the CSV at `catogory_file_path` and create the TF lookup tables."""
        super(LabelMapping, self).__init__()
        self._category_level_csv = catogory_file_path
        (self._category_map,
         self._category_level0_map,
         self._category_level1_map,
         self._len_level0,
         self._len_level1) = self.cvt_csv2tfrecord()

        # NOTE(review): the index produced by `mapping_table` is the position of the key
        # in this list, i.e. it follows dict iteration order rather than the stored
        # `category_map` value. The two agree only when dicts preserve insertion order
        # (guaranteed from Python 3.7). Deliberately left as-is: reordering could
        # silently change labels relative to an already-trained checkpoint.
        self._mapping_strings = tf.constant( [ str(key) for key in self._category_map.keys() ] )

        # Unknown category strings map to index 0.
        self._mapping_table = tf.contrib.lookup.index_table_from_tensor(mapping=self._mapping_strings, default_value=0)

        # Integer category id -> level index; ids missing from the table map to 0.
        self._level0_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(list(self._category_level0_map.keys()), list(self._category_level0_map.values()), tf.int64, tf.int64), 0)
        self._level1_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(list(self._category_level1_map.keys()), list(self._category_level1_map.values()), tf.int64, tf.int64), 0)

    @property
    def category_map(self):
        return self._category_map
    @property
    def level0_table(self):
        return self._level0_table
    @property
    def level1_table(self):
        return self._level1_table
    @property
    def len_level0(self):
        return self._len_level0
    @property
    def len_level1(self):
        return self._len_level1
    @property
    def mapping_table(self):
        return self._mapping_table

    def _read_category_rows(self):
        """Load the category CSV and return its rows as a 2-D array."""
        return pd.read_csv(self._category_level_csv).values

    def cvt_csv2tfrecord(self):
        """Build the category-id mappings from the CSV.

        Despite the name, nothing is written; only dictionaries are built.

        Returns:
            A 5-tuple ``(category_map, category_level0_map, category_level1_map,
            len_level0, len_level1)`` where ``category_map`` maps the raw category id
            to its row index, the two level maps map ``int(category_id)`` to the index
            of its level-0 / level-1 name, and the lengths count the distinct names.
        """
        # Read the file once and share the rows with create_level_map().
        rows = self._read_category_rows()
        level0_map, level1_map = self.create_level_map(rows)

        category_map = dict()
        category_level0_map = dict()
        category_level1_map = dict()
        for row_index, row in enumerate(rows):
            category_id, level0, level1 = row[0], row[1], row[2]
            category_map[category_id] = row_index
            category_level0_map[int(category_id)] = level0_map[level0]
            category_level1_map[int(category_id)] = level1_map[level1]

        return category_map, category_level0_map, category_level1_map, len(level0_map), len(level1_map)

    def create_level_map(self, rows=None):
        """Index the distinct level-0 and level-1 names in order of first appearance.

        Args:
            rows: optional pre-loaded CSV rows; when omitted the CSV is read here.

        Returns:
            ``(level0_map, level1_map)``: two dicts mapping a level name to a
            consecutive index starting at 0.
        """
        if rows is None:
            rows = self._read_category_rows()

        level_maps = (dict(), dict())
        for row in rows:
            for level_map, level_name in zip(level_maps, (row[1], row[2])):
                # Dict membership keeps this linear in the number of rows.
                if level_name not in level_map:
                    level_map[level_name] = len(level_map)
        return level_maps[0], level_maps[1]

In [7]:
class CdiscountDataset(object):
    """Queue-based TF-slim input pipeline over ZLIB-compressed TFRecord files.

    `create_dataset()` adds the reader, decoder, preprocessing and batching ops
    to the current default graph and returns the batched tensors.
    """

    def __init__(self, data_path, file_begin_match, label_mapping, num_examples, num_classes, buffer_size, batch_size, num_epochs, is_training):
        """Store the pipeline settings.

        Args:
            data_path: directory prefix of the record files (concatenated as-is).
            file_begin_match: file-name prefix; a trailing '*' is appended to form
                the pattern given to slim as the data source.
            label_mapping: LabelMapping instance supplying the lookup tables.
            num_examples: sample count reported to slim's Dataset.
            num_classes: stored only; not used by `create_dataset`.
            buffer_size: minimum number of examples kept in the shared queue.
            batch_size: examples per emitted batch.
            num_epochs: passes over the data before the reader signals the end.
            is_training: forwarded to preprocessing and also used as
                `allow_smaller_final_batch`.
        """
        super(CdiscountDataset, self).__init__()
        # File pattern, e.g. <data_path><file_begin_match>*
        self._data_file_list = data_path + file_begin_match + '*'

        # Sizes and run settings.
        self._num_examples = num_examples
        self._num_classes = num_classes
        self._batch_size = batch_size
        self._buffer_size = buffer_size
        self._num_epochs = num_epochs
        self._is_training = is_training

        # Everything borrowed from the label mapping.
        self._category_map = label_mapping.category_map
        self._level0_table = label_mapping.level0_table
        self._level1_table = label_mapping.level1_table
        self._len_level0 = label_mapping.len_level0
        self._len_level1 = label_mapping.len_level1
        self._mapping_table = label_mapping.mapping_table

    def create_dataset(self):
        """Build the input pipeline.

        Returns:
            A 5-tuple of batched tensors:
            ``(raw encoded images, preprocessed images, mapped label indices,
            original category ids, product ids)``.
        """
        zlib_options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)

        def make_reader():
            # Factory handed to slim; each call yields a reader for ZLIB records.
            return tf.TFRecordReader(options=zlib_options)

        int64_zero = lambda: tf.zeros([], dtype=tf.int64)
        keys_to_features = {
            'img_raw': tf.FixedLenFeature([], tf.string, default_value=''),
            'product_id': tf.FixedLenFeature([], tf.int64, default_value=int64_zero()),
            # The records carry no 'format' entry, so this default is always used.
            'format': tf.FixedLenFeature([], tf.string, default_value='jpg'),
            'category_id': tf.FixedLenFeature([], tf.int64, default_value=int64_zero())
        }

        tfexample = slim.tfexample_decoder
        items_to_handlers = {
            # Decoded image built from the encoded bytes and the format feature.
            'image': tfexample.Image(image_key='img_raw', format_key='format'),
            # The untouched encoded bytes, kept so they can be re-written later.
            'raw_image': tfexample.Tensor('img_raw'),
            'label': tfexample.Tensor('category_id'),
            'product_id': tfexample.Tensor('product_id')
        }

        example_decoder = tfexample.TFExampleDecoder(keys_to_features, items_to_handlers)

        self._dataset = slim.dataset.Dataset(
            data_sources = self._data_file_list,
            decoder = example_decoder,
            reader = make_reader,
            num_samples = self._num_examples,
            items_to_descriptions = None)

        # Same capacity is used for the provider's shared queue and the batch queue.
        queue_capacity = self._buffer_size + 4 * self._batch_size

        # The provider reads with INPUT_THREADS parallel readers and shuffles examples.
        self._data_provider = slim.dataset_data_provider.DatasetDataProvider(
            self._dataset,
            num_readers = INPUT_THREADS,
            shuffle = True,
            num_epochs = self._num_epochs,
            common_queue_capacity = queue_capacity,
            common_queue_min = self._buffer_size,
            scope = 'test_files')

        encoded_image, decoded_image, category_id, product_id = self._data_provider.get(
            ['raw_image', 'image', 'label', 'product_id'])

        # Image actually fed to the network.
        model_image = preprocess_for_inception(decoded_image, self._is_training)

        # Category id -> index via the string-keyed lookup table.
        label_index = self._mapping_table.lookup(tf.as_string(category_id))

        # NOTE(review): allow_smaller_final_batch follows is_training, so with
        # is_training=False a trailing partial batch is not emitted - confirm intended.
        batched = tf.train.batch(
            [encoded_image, model_image, label_index, category_id, product_id],
            self._batch_size,
            num_threads = INPUT_THREADS,
            capacity = queue_capacity,
            allow_smaller_final_batch = self._is_training,
            name = 'test_batch')
        batch_org_images, batch_images, batch_labels, batch_category_id, batch_product_id = batched

        return batch_org_images, batch_images, batch_labels, batch_category_id, batch_product_id

In [None]:
def_graph = tf.Graph()
with def_graph.as_default() as graph:
    def resnet_v2_101_test_step(input_examples):
        """Add the ResNet-v2-101 inference graph for a batch of images.

        Args:
            input_examples: batch of preprocessed images fed to the network.

        Returns:
            ``(predictions, probabilities, variables_to_restore)``: the arg-max class
            per example, the squeezed ``end_points['predictions']`` tensor, and the
            name -> variable map to pass to a Saver (moving-average names where a
            moving average exists).
        """
        with slim.arg_scope(resnet2.resnet_arg_scope()):
            _, end_points = resnet2.resnet_v2_101(input_examples, NUM_CLASS, is_training=False)

        # Restore under the moving-average names so the averaged weights get loaded.
        ema = tf.train.ExponentialMovingAverage(moving_average_decay)
        variables_to_restore = ema.variables_to_restore()

        # Drop size-1 dimensions from the network output, then take the best class
        # (an index, not a one-hot vector).
        probabilities = tf.squeeze(end_points['predictions'])
        predictions = tf.argmax(probabilities, 1)

        return predictions, probabilities, variables_to_restore
    def inception_aux_test_step(input_examples):
        """Add the Inception-v3 inference graph for a batch of images.

        Args:
            input_examples: batch of preprocessed images fed to the network.

        Returns:
            ``(predictions, probabilities, variables_to_restore)``: the arg-max class
            per example, the ``end_points['Predictions']`` tensor, and the
            name -> variable map to pass to a Saver (moving-average names where a
            moving average exists).
        """
        with slim.arg_scope(inception.inception_v3_arg_scope()):
            # The first return value (pre-softmax logits) is not needed here.
            _, end_points = inception.inception_v3(
                input_examples,
                num_classes = NUM_CLASS,
                is_training=False)

        # Restore under the moving-average names so the averaged weights get loaded.
        ema = tf.train.ExponentialMovingAverage(moving_average_decay)
        variables_to_restore = ema.variables_to_restore()

        # Class probabilities and the best class (an index, not a one-hot vector).
        probabilities = end_points['Predictions']
        predictions = tf.argmax(probabilities, 1)

        return predictions, probabilities, variables_to_restore

In [None]:
# Hard-example mining driver: score every training example with the selected
# pre-trained network and copy the "hard" ones (max probability more than
# hard_example_thres times the probability of the true label) into new
# ZLIB-compressed TFRecord files.
with def_graph.as_default() as graph:
    # Everything that depends on the chosen model, looked up once so an unsupported
    # MODEL_TO_RUN fails immediately with a clear message instead of a NameError later.
    model_settings = {
        'resnet': (resnet_v2_101_test_step, RESNET_MODEL_PATH, RESNET_OUTPUT_TRAIN_PATH),
        'inception': (inception_aux_test_step, INCEPTION_MODEL_PATH, INCEPTION_OUTPUT_TRAIN_PATH),
    }
    if MODEL_TO_RUN not in model_settings:
        raise ValueError('Unsupported MODEL_TO_RUN {!r}; expected one of {}'.format(MODEL_TO_RUN, sorted(model_settings)))
    test_step_func, pretrained_model_path, output_train_path = model_settings[MODEL_TO_RUN]

    label_mapping = LabelMapping(CATEGORY_NAME_PATH)
    # One epoch, is_training=False, shared-queue minimum of 8000 examples.
    # NOTE(review): with is_training=False CdiscountDataset batches with
    # allow_smaller_final_batch=False, so the trailing TOTAL_EXAMPLES % BATCH_SIZE
    # examples are never scored. The static batch dimension used below relies on
    # that setting, so both places have to change together.
    train_dataset = CdiscountDataset(TRAIN_PATH, 'output_file', label_mapping, TOTAL_EXAMPLES, NUM_CLASS, 8000, BATCH_SIZE, 1, False)

    batch_org_images, batch_images, batch_labels, batch_category_ids, batch_product_ids = train_dataset.create_dataset()

    hard_train_examples = dict()
    with tf.device('/gpu:0'):
        test_predictions, test_probabilities, variables_to_restore = test_step_func(batch_images)

        # Pick, for every row, the probability assigned to its true label.
        # after stack
        # [ [0, real0],
        #   [1, real1]
        #   ....
        # ]
        # after tf.gather_nd
        # indices = [[0, 0], [1, 1]]
        # params = [['a', 'b'], ['c', 'd']]
        # output = ['a', 'd']
        # Uses the static batch dimension, so the batch size must be known at graph build time.
        real_label_pos_value = tf.gather_nd( test_probabilities, tf.stack((tf.range(test_probabilities.get_shape()[0],
                                            dtype=batch_labels.dtype), batch_labels), axis=1) )

        batch_max_prob = tf.reduce_max(test_probabilities, axis = 1)

        # Ratio of the top probability to the true-label probability; 1.0 means the
        # true label is the top prediction, larger values mean a more confident miss.
        false_true_ratio = tf.div(batch_max_prob, real_label_pos_value)
        ratio_thres = tf.add(tf.zeros_like(false_true_ratio), tf.constant(hard_example_thres, dtype=tf.float32))
        # 1 for hard examples, 0 otherwise.
        partition_mask = tf.cast(tf.greater(false_true_ratio, ratio_thres), tf.int32)

        # Partition 1 holds the hard examples; partition 0 is discarded.
        _, hard_train_examples['img_raw'] = tf.dynamic_partition(batch_org_images, partition_mask, 2)
        _, hard_train_examples['category_id'] = tf.dynamic_partition(batch_category_ids, partition_mask, 2)
        _, hard_train_examples['product_id'] = tf.dynamic_partition(batch_product_ids, partition_mask, 2)

        cur_hard_count = tf.count_nonzero(partition_mask)

    # Output files output_file1.tfrecords ... output_file<out_file_num>.tfrecords.
    tfrecords_filename = [output_train_path + 'output_file{:d}.tfrecords'.format(index + 1) for index in range(out_file_num)]

    opts = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)

    total_hard_examples = 0
    writer_list = []
    try:
        # Open the writers one at a time so that, on failure, the ones already
        # opened are still closed by the finally clause below.
        try:
            for file_name in tfrecords_filename:
                writer_list.append(tf.python_io.TFRecordWriter(file_name, options = opts))
        except Exception:
            tf_logging.error('writer_list create failed!')
            raise

        pre_train_saver = tf.train.Saver(variables_to_restore)
        # Define an init function that loads the pretrained checkpoint.
        # sess is the managed session passed by Supervisor
        def load_pretrain(sess, path):
            pre_train_saver.restore(sess, path)
        load_pretrain_func = lambda sess : load_pretrain(sess, pretrained_model_path)

        # Only the global initializer is grouped here; local variables and lookup
        # tables are left to the Supervisor's default local_init_op.
        init_op = tf.group(tf.global_variables_initializer())

        # Pass the init function to the supervisor.
        # - init_fn is called after init_op has run, so the restore overrides the fresh values.
        # - save_model_secs=0 disables periodic checkpoint saving.
        # - summary_op=None: no summaries are written by the supervisor.
        # NOTE(review): if LOG_PATH already contains a checkpoint, the Supervisor is
        # expected to restore from it and skip init_op/init_fn - confirm LOG_PATH is clean.
        sv = tf.train.Supervisor(logdir=LOG_PATH, init_fn = load_pretrain_func, init_op = init_op, summary_op = None, save_model_secs=0)

        config = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)
        with sv.managed_session(config=config) as sess:
            for step in range(NUM_STEPS):
                if sv.should_stop():
                    tf_logging.info('Supervisor emit finished!')
                    break

                start_time = time.time()
                # Spread the output round-robin over the output files.
                cur_train_writer = writer_list[step % out_file_num]

                # No tf.device scope here: device scopes only affect ops created inside
                # them, and sess.run creates none.
                hard_count, train_list_img, train_list_catogory_id, train_list_product_id = sess.run([
                    cur_hard_count,
                    hard_train_examples['img_raw'],
                    hard_train_examples['category_id'],
                    hard_train_examples['product_id']])

                # Re-serialise each selected example with the same three features as the input records.
                for index in range(hard_count):
                    example = tf.train.Example(features=tf.train.Features(feature={
                        'img_raw': _bytes_feature(train_list_img[index]),
                        'product_id': _int64_feature(train_list_product_id[index]),
                        'category_id': _int64_feature(train_list_catogory_id[index])
                    }))
                    cur_train_writer.write(example.SerializeToString())

                total_hard_examples += hard_count
                time_elapsed = time.time() - start_time

                if step % 1000 == 0:
                    tf_logging.info('Current Speed: {:5.3f}sec/batch'.format(time_elapsed))
                    tf_logging.info('Step {}/{}'.format(step, NUM_STEPS))
                    tf_logging.info('Roughly select ratio {:6.2f}%.'.format(hard_count*100./BATCH_SIZE))
                    # Remaining steps (always > 0 inside the loop) times the duration of this step.
                    tf_logging.info('Roughly {:6.3f} hours to go.'.format(time_elapsed * ((NUM_STEPS - step) / 3600.)))
    finally:
        # Always flush and close the writers, even when the run is interrupted.
        for f in writer_list:
            f.close()
    tf_logging.info('Total Examples: {}'.format(total_hard_examples))

INFO:tensorflow:Restoring parameters from /media/rs/0E06CD1706CD0127/Kapok/kaggle/Resnet/logs101-new/resnet101_v2_model.ckpt-216292
INFO:tensorflow:Starting standard services.
INFO:tensorflow:Starting queue runners.
INFO:tensorflow:Current Speed: 3.448sec/batch
INFO:tensorflow:Step 0/39265
INFO:tensorflow:Roughly select ratio  31.25%.
INFO:tensorflow:Roughly 37.608 hours to go.
INFO:tensorflow:Current Speed: 0.633sec/batch
INFO:tensorflow:Step 1000/39265
INFO:tensorflow:Roughly select ratio  34.38%.
INFO:tensorflow:Roughly  6.730 hours to go.
