In [1]:
# The line below sets the environment variable CUDA_VISIBLE_DEVICES to an
# empty value, which hides every GPU from TensorFlow so the notebook runs on the CPU.
%env CUDA_VISIBLE_DEVICES = 
import numpy as np
import pandas as pd
import io
import bson                       # this is installed with the pymongo package
import matplotlib.pyplot as plt
from scipy.misc import imread
import multiprocessing as mp      # will come in handy due to the size of the data
import os.path
import random
import tensorflow as tf
from itertools import compress
from datetime import datetime

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
# Show image pixels as-is (no smoothing between neighbouring pixels).
plt.rcParams['image.interpolation'] = 'nearest'
# Default colormap used when an image is displayed without an explicit cmap.
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

env: CUDA_VISIBLE_DEVICES=


In [13]:
# Root of the dataset and the directories derived from it.
DATASET_PATH = '/media/rs/0E06CD1706CD0127/Kapok/kaggle/'
TRAIN_PATH = DATASET_PATH + 'Split/Train/'
VAL_PATH = DATASET_PATH + 'Split/Validation/'
TEST_PATH = DATASET_PATH + 'Test/'

# The two split directories are created up front if they are missing.
if not os.path.exists(TRAIN_PATH):
    os.makedirs(TRAIN_PATH)
if not os.path.exists(VAL_PATH):
    os.makedirs(VAL_PATH)

# Number of examples pulled from the input pipeline per step.
BATCH_SIZE = 4096

In [3]:
def _int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` backed by an ``Int64List``."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def _bytes_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature`` backed by a ``BytesList``."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

In [4]:
def create_examples(files):
    """Build the graph ops that read and parse one example from ZLIB TFRecords.

    Args:
        files: file names (list or string tensor) fed to
            ``tf.train.string_input_producer``; they are shuffled and read
            for exactly one epoch.

    Returns:
        A dict with the scalar tensors ``'img_raw'`` (string),
        ``'product_id'`` (int64) and ``'category_id'`` (int64) of a single
        parsed example.
    """
    # Every record is expected to carry exactly these three scalar features.
    feature_spec = {
        'category_id': tf.FixedLenFeature([], tf.int64),
        'product_id': tf.FixedLenFeature([], tf.int64),
        'img_raw': tf.FixedLenFeature([], tf.string),
    }
    zlib_options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)

    name_queue = tf.train.string_input_producer(files, num_epochs=1, shuffle=True)
    record_reader = tf.TFRecordReader(options = zlib_options)
    _, serialized_example = record_reader.read(name_queue)
    parsed = tf.parse_single_example(serialized_example, features=feature_spec)

    # Hand back only the entries the callers consume, in a fixed key order.
    return {name: parsed[name] for name in ('img_raw', 'product_id', 'category_id')}

In [5]:
def input_pipeline(filenames, batch_size, read_threads = 8):
    """Create a shuffled batch of examples read from ``filenames``.

    Args:
        filenames: file names passed through to ``create_examples``.
        batch_size: number of examples per returned batch.
        read_threads: how many times the example tensors are listed in the
            input of ``tf.train.shuffle_batch_join``.

    Returns:
        The result of ``tf.train.shuffle_batch_join``: a dict with the same
        keys as ``create_examples`` whose tensors have a leading batch
        dimension. The last batch may be smaller than ``batch_size``.
    """
    single_example = create_examples(filenames)
    # The very same tensor dict is repeated on purpose: create_examples is
    # called only once, so every list entry refers to the one shared reader.
    per_thread_examples = [single_example] * read_threads

    min_after_dequeue = 2000
    return tf.train.shuffle_batch_join(
        per_thread_examples,
        batch_size = batch_size,
        capacity = min_after_dequeue + 3 * batch_size,
        min_after_dequeue = min_after_dequeue,
        allow_smaller_final_batch = True)

In [6]:
def count_num_of_examples(file_path, file_prefix):
    """Count the records in every ZLIB TFRecord shard whose name starts with ``file_prefix``.

    Args:
        file_path: directory that is listed for shard files.
        file_prefix: only files whose name starts with this string are counted.

    Returns:
        Total number of records over all matching files.
    """
    shard_paths = [os.path.join(file_path, name)
                   for name in os.listdir(file_path)
                   if name.startswith(file_prefix)]
    zlib_options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)
    # One unit per record yielded by the iterator, summed over all shards.
    return sum(1
               for shard in shard_paths
               for _ in tf.python_io.tf_record_iterator(shard, options = zlib_options))

In [7]:
def count_num_of_examples2(file_path, file_prefix):
    """Count the records in every ZLIB TFRecord shard whose name contains ``file_prefix``.

    Unlike ``startswith`` matching, ``file_prefix`` may occur anywhere in the
    file name (so ``'output_file'`` also matches ``'test_output_file1...'``).

    Args:
        file_path: directory that is listed for shard files.
        file_prefix: substring a file name must contain to be counted.

    Returns:
        Total number of records over all matching files.
    """
    shard_paths = [os.path.join(file_path, name)
                   for name in os.listdir(file_path)
                   if file_prefix in name]
    zlib_options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)
    # One unit per record yielded by the iterator, summed over all shards.
    return sum(1
               for shard in shard_paths
               for _ in tf.python_io.tf_record_iterator(shard, options = zlib_options))

In [8]:
def histogram_of_category(file_path, file_prefix, sess):
    """Count how many records of each ``category_id`` the matching shards contain.

    Args:
        file_path: directory that is listed for shard files.
        file_prefix: only files whose name starts with this string are read.
        sess: ``tf.Session`` used to evaluate the parsing op; the op is added
            to ``sess.graph``.

    Returns:
        dict mapping each ``category_id`` value to its number of occurrences.
    """
    from collections import Counter

    tfrecords_to_count = [os.path.join(file_path, file) for file in os.listdir(file_path) if file.startswith(file_prefix)]
    opts = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)

    # Build the parsing op ONCE and feed each record through a placeholder.
    # Creating a new parse op per record would add nodes to the graph for
    # every record, making the graph (and memory use) grow without bound.
    with sess.graph.as_default():
        serialized_record = tf.placeholder(tf.string, shape=[])
        # Only the category is needed for the histogram.
        category_id = tf.parse_single_example(
            serialized_record,
            features={'category_id': tf.FixedLenFeature([], tf.int64)})['category_id']

    # Counter tallies in a single pass instead of list.count() per element.
    hist = Counter()
    for tfrecord_file in tfrecords_to_count:
        for record in tf.python_io.tf_record_iterator(tfrecord_file, options = opts):
            hist[sess.run(category_id, feed_dict={serialized_record: record})] += 1
    return dict(hist)

In [14]:
def split_into_train_val(filenames, outpath_train, outpath_test, batch_size, val_per, out_file_num = 500, max_batches = 1001):
    """Shuffle ZLIB TFRecord shards and re-write them as train / validation shards.

    Batches are pulled from ``input_pipeline``. Inside every batch a random
    ``val_per`` fraction of the examples is written to a validation shard and
    the remainder to a train shard; shards are selected round-robin, one
    train and one validation shard per batch. A final batch that is smaller
    than ``batch_size`` is split with the same fraction.

    Args:
        filenames: glob pattern handed to ``tf.train.match_filenames_once``.
        outpath_train: string prefix of the train shards; files are named
            ``<outpath_train>output_file<N>.tfrecords``.
        outpath_test: string prefix of the validation shards; files are named
            ``<outpath_test>test_output_file<N>.tfrecords``.
        batch_size: number of examples requested from the pipeline per step.
        val_per: fraction in [0, 1] of every batch written to validation.
        out_file_num: number of train shards. ``int(out_file_num * val_per)``
            validation shards are created, but never fewer than one.
        max_batches: stop after this many batches. The default keeps the
            historical cap of 1001 batches; pass ``None`` to consume all input.

    Returns:
        None. If an output writer cannot be created a message is printed,
        the writers opened so far are closed and the function returns early.

    Raises:
        ValueError: if ``out_file_num`` is smaller than 1 or ``val_per`` is
            outside [0, 1].
    """
    if out_file_num < 1:
        raise ValueError('out_file_num must be at least 1, got {}'.format(out_file_num))
    if not 0 <= val_per <= 1:
        raise ValueError('val_per must be within [0, 1], got {}'.format(val_per))

    tfrecords_filename = [outpath_train + 'output_file{:d}.tfrecords'.format(index + 1) for index in range(out_file_num)]
    # At least one validation shard, otherwise the round-robin modulo below
    # would divide by zero for small out_file_num * val_per.
    test_out_file_num = max(1, int(out_file_num * val_per))
    tfrecords_test_filename = [outpath_test + 'test_output_file{:d}.tfrecords'.format(index + 1) for index in range(test_out_file_num)]

    opts = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)

    # Writers are appended one by one so that, if creation fails half-way,
    # the ones that were already opened can still be closed.
    writer_list = []
    test_writer_list = []
    try:
        for file_name in tfrecords_filename:
            writer_list.append(tf.python_io.TFRecordWriter(file_name, options = opts))
    except Exception as e:
        print('writer_list create failed!')
        print('reason: {}'.format(e))
        for f in writer_list:
            f.close()
        return
    try:
        for file_name in tfrecords_test_filename:
            test_writer_list.append(tf.python_io.TFRecordWriter(file_name, options = opts))
    except Exception as e:
        print('test_writer_list create failed!')
        print('reason: {}'.format(e))
        for f in writer_list + test_writer_list:
            f.close()
        return

    sess = None
    coord = tf.train.Coordinator()
    threads = []
    # batch iteration count, use for select different output file
    count = 0

    try:
        # A private graph keeps the queue runners of this call separate from
        # those left behind by earlier calls in the same kernel.
        graph = tf.Graph()
        with graph.as_default():
            files = tf.train.match_filenames_once(filenames)
            all_examples = input_pipeline(files, batch_size)
            # initialize local variables, like local counter epochs
            init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

        sess = tf.Session(graph = graph)
        sess.run(init_op)
        # Start input enqueue threads.
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        keys = list(all_examples.keys())
        fetches = [all_examples[key] for key in keys]

        while not coord.should_stop():
            if max_batches is not None and count >= max_batches:
                break

            # Fetch the whole batch first and split it in Python afterwards.
            # Splitting inside the graph with a fixed-size mask fails on the
            # short final batch *after* it was dequeued, which loses it.
            batch = dict(zip(keys, sess.run(fetches)))
            batch_len = len(batch['img_raw'])
            val_indices = set(random.sample(range(batch_len), int(batch_len * val_per)))

            cur_train_writer = writer_list[count % out_file_num]
            cur_test_writer = test_writer_list[count % test_out_file_num]

            for index in range(batch_len):
                example = tf.train.Example(features=tf.train.Features(feature={
                    'img_raw': _bytes_feature(batch['img_raw'][index]),
                    'product_id': _int64_feature(batch['product_id'][index]),
                    'category_id': _int64_feature(batch['category_id'][index])
                }))
                cur_writer = cur_test_writer if index in val_indices else cur_train_writer
                cur_writer.write(example.SerializeToString())

            count += 1

    except tf.errors.OutOfRangeError:
        # Raised by the dequeue once the single input epoch is exhausted.
        print('Done splitting -- epoch limit reached')
        print('last count: {}, roughly examples num: {}'.format(count, count * batch_size))
        print('finished time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    finally:
        for f in writer_list + test_writer_list:
            f.close()
        # When done, ask the threads to stop and wait for them; the session
        # is closed even if joining the threads raises.
        coord.request_stop()
        try:
            coord.join(threads)
        finally:
            if sess is not None:
                sess.close()

In [None]:
def split_into_train_val2(filenames, outpath_train, batch_size, out_file_num = 500, max_batches = None):
    """Shuffle ZLIB TFRecord shards and re-write them into ``out_file_num`` new shards.

    All output files are named ``<outpath_train>output_file<N>.tfrecords``.
    Every batch pulled from ``input_pipeline`` is split randomly in half: one
    half goes to a shard from the first ``out_file_num // 2`` files, the
    other half to a shard from the remaining files. Shards inside each group
    are selected round-robin, one per batch.

    Args:
        filenames: glob pattern handed to ``tf.train.match_filenames_once``.
        outpath_train: string prefix of the output shards.
        batch_size: number of examples requested from the pipeline per step.
        out_file_num: total number of output shards (at least 2).
        max_batches: optional cap on the number of batches processed;
            ``None`` (default) consumes the whole input.

    Returns:
        None. If an output writer cannot be created a message is printed,
        the writers opened so far are closed and the function returns early.

    Raises:
        ValueError: if ``out_file_num`` is smaller than 2, because one of the
            two shard groups would be empty.
    """
    if out_file_num < 2:
        raise ValueError('out_file_num must be at least 2, got {}'.format(out_file_num))

    first_group_num = out_file_num // 2
    tfrecords_filename = [outpath_train + 'output_file{:d}.tfrecords'.format(index + 1) for index in range(first_group_num)]
    tfrecords_test_filename = [outpath_train + 'output_file{:d}.tfrecords'.format(index + 1) for index in range(first_group_num, out_file_num)]
    second_group_num = len(tfrecords_test_filename)

    opts = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)

    # Writers are appended one by one so that, if creation fails half-way,
    # the ones that were already opened can still be closed.
    writer_list = []
    test_writer_list = []
    try:
        for file_name in tfrecords_filename:
            writer_list.append(tf.python_io.TFRecordWriter(file_name, options = opts))
    except Exception as e:
        print('writer_list create failed!')
        print('reason: {}'.format(e))
        for f in writer_list:
            f.close()
        return
    try:
        for file_name in tfrecords_test_filename:
            test_writer_list.append(tf.python_io.TFRecordWriter(file_name, options = opts))
    except Exception as e:
        print('test_writer_list create failed!')
        print('reason: {}'.format(e))
        for f in writer_list + test_writer_list:
            f.close()
        return

    sess = None
    coord = tf.train.Coordinator()
    threads = []
    # batch iteration count, use for select different output file
    count = 0

    try:
        # A private graph keeps the queue runners of this call separate from
        # those left behind by earlier calls in the same kernel.
        graph = tf.Graph()
        with graph.as_default():
            files = tf.train.match_filenames_once(filenames)
            all_examples = input_pipeline(files, batch_size)
            # initialize local variables, like local counter epochs
            init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

        sess = tf.Session(graph = graph)
        sess.run(init_op)
        # Start input enqueue threads.
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        keys = list(all_examples.keys())
        fetches = [all_examples[key] for key in keys]

        while not coord.should_stop():
            if max_batches is not None and count >= max_batches:
                break

            # Fetch the whole batch first and split it in Python afterwards.
            # Splitting inside the graph with a fixed-size mask fails on the
            # short final batch *after* it was dequeued, which loses it.
            batch = dict(zip(keys, sess.run(fetches)))
            batch_len = len(batch['img_raw'])
            second_group_indices = set(random.sample(range(batch_len), int(batch_len * 0.5)))

            cur_train_writer = writer_list[count % first_group_num]
            cur_test_writer = test_writer_list[count % second_group_num]

            for index in range(batch_len):
                example = tf.train.Example(features=tf.train.Features(feature={
                    'img_raw': _bytes_feature(batch['img_raw'][index]),
                    'product_id': _int64_feature(batch['product_id'][index]),
                    'category_id': _int64_feature(batch['category_id'][index])
                }))
                cur_writer = cur_test_writer if index in second_group_indices else cur_train_writer
                cur_writer.write(example.SerializeToString())

            count += 1

    except tf.errors.OutOfRangeError:
        # Raised by the dequeue once the single input epoch is exhausted.
        print('Done splitting -- epoch limit reached')
        print('last count: {}, roughly examples num: {}'.format(count, count * batch_size))
        print('finished time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    finally:
        for f in writer_list + test_writer_list:
            f.close()
        # When done, ask the threads to stop and wait for them; the session
        # is closed even if joining the threads raises.
        coord.request_stop()
        try:
            coord.join(threads)
        finally:
            if sess is not None:
                sess.close()

In [None]:
# split dataset into train and validation
# Reads every "Train/output_file*.tfrecords" shard under DATASET_PATH and
# writes 20% of each batch to VAL_PATH and the rest to TRAIN_PATH.
# NOTE: split_into_train_val stops after 1001 batches, so an input larger
# than 1001 * BATCH_SIZE examples is only partially written.
split_into_train_val(DATASET_PATH + "Train/output_file*.tfrecords", TRAIN_PATH, VAL_PATH, BATCH_SIZE, 0.2)
# Disabled alternative: re-shard an already existing split in place of the call above.
#split_into_train_val2(DATASET_PATH + 'Split/org/Validation/' + "test_output_file*.tfrecords", VAL_PATH, BATCH_SIZE, out_file_num=500)
#split_into_train_val2(DATASET_PATH + 'Split/org/Train/' + "output_file*.tfrecords", TRAIN_PATH, BATCH_SIZE, out_file_num=900)
# Sanity check: count the records that ended up in each output directory.
print('validation examples num: {}'.format(count_num_of_examples(VAL_PATH, 'test_output_file')))
print('train examples num: {}'.format(count_num_of_examples(TRAIN_PATH, 'output_file')))

In [14]:
# calculate num of examples
# print('validation examples num: {}'.format(count_num_of_examples(VAL_PATH, 'test_output_file')))
# count_num_of_examples2 matches 'output_file' anywhere in the file name, so
# both 'output_file*' and 'test_output_file*' shards are included.
# NOTE(review): the label says "train" but the directory counted is VAL_PATH -- confirm which one is intended.
print('train examples num: {}'.format(count_num_of_examples2(VAL_PATH, 'output_file')))
#print('test examples num: {}'.format(count_num_of_examples(TEST_PATH, 'output_file',sess)))
#print('total sampled examples num: {}'.format(count_num_of_examples('/media/rs/FC6CDC6F6CDC25E4/resample_dataset2/', 'output_file')))
#print('total sampled examples num: {}'.format(count_num_of_examples('/media/rs/FC6CDC6F6CDC25E4/ResnetHardTrain/', 'output_file')))

train examples num: 69588


In [None]:
# calculate category histogram
init_op = tf.global_variables_initializer()
# The session is opened as a context manager so it is closed even when one of
# the histogram passes raises (e.g. a missing directory or a corrupt shard).
with tf.Session() as sess:
    # Initialize the variables (like the epoch counter).
    sess.run(init_op)
    train_hist = histogram_of_category(TRAIN_PATH, 'output_file', sess)
    val_hist = histogram_of_category(VAL_PATH, 'test_output_file', sess)
print('validation category histogram: {}'.format(val_hist))
print('train category histogram: {}'.format(train_hist))

In [None]:
# init = tf.global_variables_initializer()
# t1 = tf.constant([[2],[3], [4],[4]], dtype='int64')
# t2 = tf.constant('fegegerg')
# # Start training
# with tf.Session() as sess:
#     # Run the initializer
#     sess.run(init)
#     # t1.eval().tostring()
#     t8 = tf.decode_raw(t2, tf.int64)

#     t9 = t8.eval().tostring()
#     print(t9)
