In [1]:
# Running %env without any arguments
# lists all environment variables.

# The line below sets the environment variable CUDA_VISIBLE_DEVICES to an
# empty value, which hides all GPUs from CUDA so TensorFlow runs on the CPU.
%env CUDA_VISIBLE_DEVICES = 

import io
import time
import numpy as np
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
from scipy.misc import imread, imsave
import tensorflow as tf
from tensorflow.python.platform import tf_logging
import os.path
import tensorflow.contrib.slim as slim
from tensorflow.contrib.slim.python.slim.nets import inception
from tensorflow.contrib.framework.python.ops.variables import get_or_create_global_step
import inception_preprocessing
import logging
from scipy.sparse import *
import tables as tb

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

env: CUDA_VISIBLE_DEVICES=


In [2]:
# --- Configuration -----------------------------------------------------------
# Root directory of the competition data (machine-specific absolute path).
DATASET_PATH = '/media/rs/0E06CD1706CD0127/Kapok/kaggle/'
# Directory holding the per-model prediction dumps and the id list.
PROB_FILE_PATH = DATASET_PATH + 'keep/'
# Sub-directories of PROB_FILE_PATH whose ".h5" files are ensembled
# (presumably named after each model's validation score -- TODO confirm).
ENSEMBLE_LIST = ['0.61245', '0.62080', '0.62191', '0.62625', '0.62490']
# CSV whose first column lists the category ids.
CATEGORY_NAME_PATH = DATASET_PATH + 'category_names.csv'
# CSV with one id per example row.
ID_FILE_PATH = PROB_FILE_PATH + 'ids.csv'
# Output template; "{}" is filled with a timestamp when the ensemble runs.
OUTPUT_PATH = PROB_FILE_PATH + 'output_ensemble_{}.csv'
# Width of the dense probability matrix rebuilt for every batch.
NUM_CLASS = 5270
# Number of sparse (prob, row, col) entries consumed per example.
NUM_TOPK = 20
# Small value that can be swapped in for a quick debugging run:
#TOTAL_EXAMPLES = 1524
TOTAL_EXAMPLES = 3095080
BATCH_SIZE = 256
# Ceiling division: the number of batches needed to cover TOTAL_EXAMPLES.
# Unlike int(TOTAL_EXAMPLES / BATCH_SIZE) + 1 this does not schedule an extra,
# empty batch when TOTAL_EXAMPLES is an exact multiple of BATCH_SIZE, and it
# avoids going through floating point.
NUM_STEPS = (TOTAL_EXAMPLES + BATCH_SIZE - 1) // BATCH_SIZE

In [3]:
# Full paths of every ".h5" file found directly inside each directory named in
# ENSEMBLE_LIST. Directories are visited in ENSEMBLE_LIST order and files in
# os.listdir() order. A single flat comprehension replaces the former
# sum(list_of_lists, []) flatten, which copies the accumulated list on every
# addition.
ALL_MODEL_TO_ENSEMBLE = [
    os.path.join(PROB_FILE_PATH, model_dir, filename)
    for model_dir in ENSEMBLE_LIST
    for filename in os.listdir(os.path.join(PROB_FILE_PATH, model_dir))
    if filename.endswith(".h5")
]

In [4]:
# Mirror TensorFlow's log records (DEBUG and above) into a file next to the
# dataset.
log = logging.getLogger('tensorflow')
log.setLevel(logging.DEBUG)

# Formatter shared by the file handler.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

log_file_path = DATASET_PATH + 'product_test_ensemble.log'

# Re-running this cell must not attach a second handler for the same file,
# otherwise every record would be written to the log twice. FileHandler stores
# the absolute path of its target in `baseFilename`, so compare against that.
fh = next(
    (handler for handler in log.handlers
     if isinstance(handler, logging.FileHandler)
     and handler.baseFilename == os.path.abspath(log_file_path)),
    None,
)
if fh is None:
    # Create the file handler, which logs even debug messages.
    fh = logging.FileHandler(log_file_path)
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    log.addHandler(fh)

In [5]:
def cvt_csv2tfrecord(csv_path=None):
    """Map each category id to a consecutive integer index.

    Despite its name, this function writes no TFRecord: it reads a CSV file
    and numbers the values of its first column in row order, starting at 0.

    Args:
        csv_path: Path of the CSV file to read. When omitted, the module-level
            ``CATEGORY_NAME_PATH`` is used, so existing zero-argument calls
            behave as before.

    Returns:
        dict: first-column value of every row -> zero-based row position. If
        a value occurs in several rows, the position of the last one wins.
    """
    if csv_path is None:
        csv_path = CATEGORY_NAME_PATH
    rows = pd.read_csv(csv_path).values
    # Only the first column is needed; the remaining columns are ignored.
    return {row[0]: index for index, row in enumerate(rows)}

In [6]:
# Build the TensorFlow graph that averages the class probabilities of all rows
# sharing the same id and converts the arg-max class index back into a category
# id. Rows carrying the last distinct id of a batch (the "tail") are split off
# and exposed as cur_id_tail / cur_prob_tail; the run loop feeds them back in
# through last_id / last_prob together with the next batch.
# (Presumably one id spans several rows that may straddle a batch boundary --
# TODO confirm against how the .h5 files were produced.)
def_graph = tf.Graph()
with def_graph.as_default() as graph:
    # Category ids as strings, in the key-iteration order of the dict returned
    # by cvt_csv2tfrecord().
    # NOTE(review): the position in this tensor is used as the class index
    # below, so this relies on dict iteration order matching the indices stored
    # as the dict's values (true for insertion-ordered dicts) -- confirm for
    # the Python version in use.
    mapping_strings = tf.constant( [ str(key) for key in cvt_csv2tfrecord().keys() ] )
    # String -> index lookup table; not referenced again in the cells visible here.
    mapping_table = tf.contrib.lookup.index_table_from_tensor(mapping=mapping_strings, default_value=0)
    
    # Index -> string lookup table used to translate an arg-max class index
    # into a category id string ("0000000000" is the table's default value).
    inv_table = tf.contrib.lookup.index_to_string_table_from_tensor(mapping_strings, default_value="0000000000")

    # Rows carried over from the previous batch. No static shape is declared;
    # the run loop feeds empty arrays on its first step.
    last_prob = tf.placeholder(tf.float32)
    last_id = tf.placeholder(tf.int64)
    # Dense per-row class probabilities of the current batch and the id of each row.
    test_probabilities = tf.placeholder(tf.float32, shape=(None, NUM_CLASS))
    batch_id = tf.placeholder(tf.int64, shape=(None, 1))
    with tf.device('/cpu:0'):
        # concat between batches
        batch_id_1d = tf.reshape(batch_id, [-1])
        # idx[i] is the position of batch_id_1d[i] among the distinct ids (in
        # order of first appearance); count has one entry per distinct id.
        _, idx, count = tf.unique_with_counts(batch_id_1d)
        
        # Partition 0 ("tail") receives the rows whose id is the last distinct
        # id of the batch (idx == number_of_distinct_ids - 1); partition 1
        # ("head") receives every other row.
        cur_id_tail, _cur_id_head = tf.dynamic_partition(batch_id_1d, tf.cast(tf.not_equal(idx, tf.shape(count)[0] - 1), tf.int32), 2)
        with tf.control_dependencies([cur_id_tail, _cur_id_head]):
            # Prepend the ids carried over from the previous batch.
            cur_id_head = tf.concat([last_id, _cur_id_head], axis = 0)
      
        # Same split applied to the probability rows.
        cur_prob_tail, _cur_prob_head = tf.dynamic_partition(test_probabilities, tf.cast(tf.not_equal(idx, tf.shape(count)[0] - 1), tf.int32), 2)
        # NOTE(review): this dependency list names last_prob, whereas the id
        # branch above names cur_id_tail -- looks like an inconsistency, confirm
        # whether it is intentional.
        with tf.control_dependencies([last_prob, _cur_prob_head]):
            # Prepend the probability rows carried over from the previous batch.
            cur_prob_head = tf.concat([last_prob, _cur_prob_head], axis = 0)
       
        with tf.control_dependencies([cur_id_head, cur_prob_head]):
            # One output row per distinct id in the head: the mean probability
            # vector of its rows and the category id of the arg-max class.
            # NOTE(review): tf.segment_mean expects sorted segment ids, so rows
            # with the same id are presumably contiguous -- TODO confirm.
            raw_id, idx, _ = tf.unique_with_counts(cur_id_head)
            mean_prob = tf.segment_mean(cur_prob_head, idx)
            mean_label = tf.string_to_number(inv_table.lookup(tf.argmax(mean_prob, 1)), out_type=tf.int64) 
        with tf.control_dependencies([mean_prob, mean_label]):
            # last partition may have nothing to concat
            # The same reduction over the tail rows alone; the run loop writes
            # this result only once, after its final batch.
            raw_id_tail, idx_tail, _ = tf.unique_with_counts(cur_id_tail)
            mean_prob_tail = tf.segment_mean(cur_prob_tail, idx_tail)
            tail_label = tf.string_to_number(inv_table.lookup(tf.argmax(mean_prob_tail, 1)), out_type=tf.int64) 

In [7]:
# Run the ensemble: for every batch, average the sparse top-k predictions of
# all model files into a dense matrix, reduce rows that share an id inside the
# graph, and append the resulting (_id, category_id) pairs to a timestamped CSV.
with def_graph.as_default() as graph:
    init_op = tf.group(tf.global_variables_initializer(), tf.tables_initializer(), tf.local_variables_initializer())

    tf_logging.info(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    save_file_name = OUTPUT_PATH.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    with tf.Session() as sess:
        sess.run(init_op)

        # Offset into the flat (prob, row, col) arrays; advances by NUM_TOPK
        # entries per example.
        cur_step = 0
        # Carry-over rows fed back into the graph; empty before the first batch.
        last_feed_id = np.empty([0])
        last_feed_prob = np.empty([0, NUM_CLASS])
        # Tail results of the most recent batch; stay None if no batch runs.
        _tail_id, _tail_label = None, None
        id_list = pd.read_csv(ID_FILE_PATH)

        h5_file_list = []
        try:
            # Open the files one by one inside the try block so that a failure
            # half-way through (or any later error) still closes the ones
            # already opened.
            for model in ALL_MODEL_TO_ENSEMBLE:
                h5_file_list.append(tb.open_file(model, 'r'))
            if not h5_file_list:
                # Averaging over zero models would silently produce NaN rows.
                raise ValueError('No ".h5" model files found to ensemble.')

            for _step in range(NUM_STEPS):
                start_time = time.time()

                # The final step takes whatever is left; every other step takes
                # a full batch. Written as an explicit conditional because the
                # former `a and b or c` form fell back to BATCH_SIZE when
                # exactly zero examples remained and then read past the data.
                remaining_examples = TOTAL_EXAMPLES - cur_step // NUM_TOPK
                is_last_step = _step + 1 == NUM_STEPS
                cur_batch_size = remaining_examples if is_last_step else BATCH_SIZE
                if cur_batch_size <= 0:
                    break
                next_step = cur_step + cur_batch_size * NUM_TOPK

                # Sum the densified predictions of all models, then average.
                probs = np.zeros((cur_batch_size, NUM_CLASS), dtype=float)
                for h5 in h5_file_list:
                    dense_matrix = csr_matrix((h5.root.prob[cur_step:next_step], (h5.root.row[cur_step:next_step], h5.root.col[cur_step:next_step])), shape=(cur_batch_size, NUM_CLASS)).toarray()
                    probs += dense_matrix
                probs /= len(h5_file_list)
                # Ids of the examples in this batch (one row per example).
                labels = id_list[cur_step // NUM_TOPK:next_step // NUM_TOPK]
                cur_step = next_step

                # The tail of this batch becomes the carry-over for the next one.
                last_feed_id, last_feed_prob, _mean_label, _mean_id, _tail_label, _tail_id = sess.run([cur_id_tail, cur_prob_tail, mean_label, raw_id, tail_label, raw_id_tail], feed_dict = {last_prob: last_feed_prob, last_id: last_feed_id, test_probabilities: probs, batch_id: labels })

                df = pd.DataFrame({'_id' : _mean_id, 'category_id' : _mean_label})
                # Emit the header only when the output file does not exist yet.
                df.to_csv(save_file_name, mode='a', index=False, sep=',', header=not os.path.isfile(save_file_name))

                time_elapsed = time.time() - start_time
                if _step % 500 == 0:
                    tf_logging.info('CurStep:{}/{} Speed: {:5.3f}sec/batch.'.format(cur_step//NUM_TOPK//BATCH_SIZE, NUM_STEPS, time_elapsed))
        finally:
            for h5_file in h5_file_list:
                h5_file.close()

        # The tail of the last batch has no following batch to merge with, so
        # write it out now.
        if _tail_id is not None:
            df = pd.DataFrame({'_id' : _tail_id, 'category_id' : _tail_label})
            df.to_csv(save_file_name, mode='a', index=False, sep=',', header=False)
        tf_logging.info('Ensemble finished! ')
        tf_logging.info(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

INFO:tensorflow:2017-10-28 11:03:32
INFO:tensorflow:CurStep:1/12091 Speed: 0.057sec/batch.
INFO:tensorflow:CurStep:501/12091 Speed: 0.047sec/batch.
INFO:tensorflow:CurStep:1001/12091 Speed: 0.044sec/batch.
INFO:tensorflow:CurStep:1501/12091 Speed: 0.048sec/batch.
INFO:tensorflow:CurStep:2001/12091 Speed: 0.059sec/batch.
INFO:tensorflow:CurStep:2501/12091 Speed: 0.046sec/batch.
INFO:tensorflow:CurStep:3001/12091 Speed: 0.050sec/batch.
INFO:tensorflow:CurStep:3501/12091 Speed: 0.050sec/batch.
INFO:tensorflow:CurStep:4001/12091 Speed: 0.046sec/batch.
INFO:tensorflow:CurStep:4501/12091 Speed: 0.043sec/batch.
INFO:tensorflow:CurStep:5001/12091 Speed: 0.042sec/batch.
INFO:tensorflow:CurStep:5501/12091 Speed: 0.051sec/batch.
INFO:tensorflow:CurStep:6001/12091 Speed: 0.049sec/batch.
INFO:tensorflow:CurStep:6501/12091 Speed: 0.061sec/batch.
INFO:tensorflow:CurStep:7001/12091 Speed: 0.045sec/batch.
INFO:tensorflow:CurStep:7501/12091 Speed: 0.047sec/batch.
INFO:tensorflow:CurStep:8001/12091 Speed

In [8]:
# Scratch cell (left disabled): inspect the prob/row/col arrays of the first model file.
# h5 = tb.open_file(ALL_MODEL_TO_ENSEMBLE[0], 'r')
# print(h5.root.prob.shape)
# print(h5.root.row.shape)
# print(h5.root.col.shape)
# print(h5.root.prob)
# dense_matrix = csr_matrix((h5.root.prob[:100], (h5.root.row[:100], h5.root.col[:100])), shape=(20,NUM_CLASS)).toarray()

# print(sorted(dense_matrix[2], reverse=True)[:5])
# print(sorted(dense_matrix[3], reverse=True)[:5])
# print(sorted(dense_matrix[4], reverse=True)[:5])
# print(sorted(dense_matrix[5], reverse=True)[:5])
#print(np.max(csr_matrix((h5.root.prob[:100], (h5.root.row[:100], h5.root.col[:100])), shape=(20,NUM_CLASS)).toarray()))
#print(csr_matrix((h5.root.prob[:10], (h5.root.row[:10], h5.root.col[:10])), shape=(TOTAL_EXAMPLES,NUM_CLASS)).toarray())