In [1]:
import contextlib
import io
import os.path

import bson 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from scipy.misc import imread   # or, whatever image library you prefer

In [2]:
# Root of the Kaggle input data, and the directories receiving the TFRecords.
DATASET_PATH = '/media/rs/0E06CD1706CD0127/Kapok/kaggle/'
TRAIN_OUTPUT_PATH = '/media/rs/0E06CD1706CD0127/Train_Seq/'
TEST_OUTPUT_PATH = DATASET_PATH + 'Test11111/'

# Create the output directories when they are missing. os.mkdir only creates
# the last path component, so the parent directories must already exist.
if not os.path.exists(TRAIN_OUTPUT_PATH):
    os.mkdir(TRAIN_OUTPUT_PATH)
if not os.path.exists(TEST_OUTPUT_PATH):
    os.mkdir(TEST_OUTPUT_PATH)

# BSON dumps to convert.
train_bson_file = DATASET_PATH + 'train.bson'
# For a quick trial run, point this at 'train_example.bson' instead:
#test_bson_file = DATASET_PATH + 'train_example.bson'
test_bson_file = DATASET_PATH + 'test.bson'

In [3]:
# helper functions
def _bytes_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature`` (one-element BytesList)."""
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)

def _int64_feature(value):
    """Wrap a single integer value in a ``tf.train.Feature`` (one-element Int64List)."""
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)

In [4]:
def cvt_bson_tfrecord(path, bson_file, num_shards=4):
    """Convert a labelled BSON dump into ZLIB-compressed TFRecord shards.

    Every BSON document must provide ``_id``, ``category_id`` and an ``imgs``
    list whose entries carry the encoded image bytes under ``picture``. One
    ``tf.train.Example`` is written per image with the features ``height``,
    ``width``, ``category_id``, ``product_id`` and ``img_raw`` (the bytes
    exactly as stored in the BSON file). Documents are distributed over the
    shards round-robin, so all images of one document land in the same shard.

    Args:
        path: Output directory prefix. It is concatenated with the file name
            as-is, so it must end with a path separator.
        bson_file: Path of the BSON file to read.
        num_shards: Number of output files, named ``output_file1.tfrecords``
            to ``output_file<num_shards>.tfrecords``. Defaults to 4.

    Raises:
        ValueError: If ``num_shards`` is smaller than 1.
    """
    if num_shards < 1:
        raise ValueError('num_shards must be a positive integer')

    shard_names = [path + 'output_file%d.tfrecords' % (shard + 1)
                   for shard in range(num_shards)]
    opts = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)

    n_docs = 0
    # The input is opened first so a missing BSON file fails before any output
    # file is created. The ExitStack closes every writer (and the `with`
    # closes the BSON handle) even when the conversion aborts part-way.
    with open(bson_file, 'rb') as bson_fh, contextlib.ExitStack() as stack:
        writers = [
            stack.enter_context(tf.python_io.TFRecordWriter(name, options=opts))
            for name in shard_names
        ]
        for doc in bson.decode_file_iter(bson_fh):
            cur_writer = writers[n_docs % num_shards]
            # Per-document values: identical for every image of the document.
            product_id = doc['_id']
            category_id = doc['category_id']
            for entry in doc['imgs']:
                img_raw = entry['picture']
                # Decoded only to learn the image dimensions; the record keeps
                # the original encoded bytes.
                img = imread(io.BytesIO(img_raw))
                example = tf.train.Example(features=tf.train.Features(feature={
                    'height': _int64_feature(img.shape[0]),
                    'width': _int64_feature(img.shape[1]),
                    'category_id': _int64_feature(category_id),
                    'product_id': _int64_feature(product_id),
                    'img_raw': _bytes_feature(img_raw)
                }))
                cur_writer.write(example.SerializeToString())
            n_docs += 1
            if n_docs % 10000 == 0:
                print('current record: ', n_docs)
    # Reported only after all writers have been flushed and closed.
    print('finished. ')

In [5]:
def cvt_bson_tfrecord_test_only(path, bson_file, num_shards=4):
    """Convert an unlabelled BSON dump into ZLIB-compressed TFRecord shards.

    Every BSON document must provide ``_id`` and an ``imgs`` list whose
    entries carry the encoded image bytes under ``picture``. One
    ``tf.train.Example`` is written per image with the features ``height``,
    ``width``, ``product_id`` and ``img_raw`` (the bytes exactly as stored in
    the BSON file). Documents are distributed over the shards round-robin, so
    all images of one document land in the same shard.

    Args:
        path: Output directory prefix. It is concatenated with the file name
            as-is, so it must end with a path separator.
        bson_file: Path of the BSON file to read.
        num_shards: Number of output files, named ``output_file1.tfrecords``
            to ``output_file<num_shards>.tfrecords``. Defaults to 4.

    Returns:
        A tuple ``(num_documents, num_examples)``: the number of BSON
        documents processed and the number of examples (images) written.

    Raises:
        ValueError: If ``num_shards`` is smaller than 1.
    """
    if num_shards < 1:
        raise ValueError('num_shards must be a positive integer')

    shard_names = [path + 'output_file%d.tfrecords' % (shard + 1)
                   for shard in range(num_shards)]
    opts = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)

    n_docs = 0
    num_examples = 0
    # The input is opened first so a missing BSON file fails before any output
    # file is created. The ExitStack closes every writer (and the `with`
    # closes the BSON handle) even when the conversion aborts part-way.
    with open(bson_file, 'rb') as bson_fh, contextlib.ExitStack() as stack:
        writers = [
            stack.enter_context(tf.python_io.TFRecordWriter(name, options=opts))
            for name in shard_names
        ]
        for doc in bson.decode_file_iter(bson_fh):
            cur_writer = writers[n_docs % num_shards]
            product_id = doc['_id']
            for entry in doc['imgs']:
                img_raw = entry['picture']
                # Decoded only to learn the image dimensions; the record keeps
                # the original encoded bytes.
                img = imread(io.BytesIO(img_raw))
                example = tf.train.Example(features=tf.train.Features(feature={
                    'height': _int64_feature(img.shape[0]),
                    'width': _int64_feature(img.shape[1]),
                    'product_id': _int64_feature(product_id),
                    'img_raw': _bytes_feature(img_raw)
                }))
                cur_writer.write(example.SerializeToString())
                # Count what was actually written so the returned total is
                # meaningful (it previously stayed at 0).
                num_examples += 1
            n_docs += 1
            if n_docs % 10000 == 0:
                print('current record: ', n_docs)
    # Reported only after all writers have been flushed and closed.
    print('finished. ')
    return n_docs, num_examples

In [6]:
def cvt_bson_tfrecord_seq(path, bson_file, skip_prob=0.4):
    """Write a random subsample of a labelled BSON dump to one TFRecord file.

    Each BSON document is dropped when a draw from
    ``np.random.random_sample()`` is below ``skip_prob``. For every kept
    document one ``tf.train.Example`` per image is appended to
    ``path + 'output_file.tfrecords'`` (ZLIB-compressed) with the features
    ``height``, ``width``, ``category_id``, ``product_id`` and ``img_raw``
    (the bytes exactly as stored in the BSON file).

    The draw uses NumPy's global random state; call ``np.random.seed``
    beforehand when a reproducible subsample is needed.

    Args:
        path: Output directory prefix. It is concatenated with the file name
            as-is, so it must end with a path separator.
        bson_file: Path of the BSON file to read.
        skip_prob: Value compared against each uniform draw from ``[0, 1)``;
            a document is skipped when its draw is smaller. Defaults to 0.4.

    Returns:
        The number of examples (images) written.
    """
    tfrecords_filename = path + 'output_file.tfrecords'
    opts = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)

    n_kept_docs = 0
    num_examples = 0
    # The input is opened first so a missing BSON file fails before the output
    # file is created; both handles are closed even if the conversion aborts.
    with open(bson_file, 'rb') as bson_fh, \
            tf.python_io.TFRecordWriter(tfrecords_filename, options=opts) as cur_writer:
        for doc in bson.decode_file_iter(bson_fh):
            if np.random.random_sample() < skip_prob:
                continue
            product_id = doc['_id']
            category_id = doc['category_id']
            for entry in doc['imgs']:
                img_raw = entry['picture']
                # Decoded only to learn the image dimensions; the record keeps
                # the original encoded bytes.
                img = imread(io.BytesIO(img_raw))
                example = tf.train.Example(features=tf.train.Features(feature={
                    'height': _int64_feature(img.shape[0]),
                    'width': _int64_feature(img.shape[1]),
                    'category_id': _int64_feature(category_id),
                    'product_id': _int64_feature(product_id),
                    'img_raw': _bytes_feature(img_raw)
                }))
                cur_writer.write(example.SerializeToString())
                num_examples += 1
            # Progress counts kept documents only; skipped ones are not included.
            n_kept_docs += 1
            if n_kept_docs % 10000 == 0:
                print('current record: ', n_kept_docs)
    # Reported only after the writer has been flushed and closed.
    print('finished. ')
    return num_examples

In [7]:
# Convert the labelled training dump into sharded TFRecord files.
# The tf.Session / variable-initialisation boilerplate that used to surround
# this call was already disabled and is not used by the conversion functions.
cvt_bson_tfrecord(path=TRAIN_OUTPUT_PATH, bson_file=train_bson_file)

# Alternative conversions, enable as needed:
#print(cvt_bson_tfrecord_seq(TRAIN_OUTPUT_PATH, train_bson_file))
#print(cvt_bson_tfrecord_test_only(TEST_OUTPUT_PATH, test_bson_file))

FileNotFoundError: [Errno 2] No such file or directory: '/media/rs/0E06CD1706CD0127/Kapok/kaggle/train.bson'