In [1]:
import numpy as np
import pandas as pd
import io
import matplotlib.pyplot as plt
import os.path
import random
import tensorflow as tf
from datetime import datetime
from sklearn.preprocessing import LabelEncoder

# IPython magic: render matplotlib figures inline in the notebook output
# rather than in a separate window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # default figure size for every plot
plt.rcParams['image.interpolation'] = 'nearest' # default interpolation used when showing images
plt.rcParams['image.cmap'] = 'gray' # default colormap used when showing images

# IPython magics: load the autoreload extension and enable mode 2 so that
# external Python modules are reloaded by the notebook after they are edited;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
# Directory that holds the dataset files; it keeps its trailing slash because
# the file names below are appended to it directly.
DATASET_PATH = '/media/rs/0E06CD1706CD0127/Kapok/kaggle/'
# CSV file with the category names (input of the CSV -> TFRecord conversion).
CATEGORY_NAME_PATH = '{}category_names.csv'.format(DATASET_PATH)
# TFRecord file produced from the category CSV.
CATEGORY_TF_PATH = '{}category_names.tfrecords'.format(DATASET_PATH)
# NOTE(review): presumably a TFRecord file with remapped/one-hot category
# labels -- it is not used in the visible cells, confirm against later cells.
CATEGORY_REMAP_PATH = '{}category_names_onehot.tfrecords'.format(DATASET_PATH)

In [3]:
def _int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` backed by an ``Int64List``.

    Args:
        value: the integer to store; it becomes the only element of the list.

    Returns:
        A ``tf.train.Feature`` whose ``int64_list`` contains ``value``.
    """
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)

def _bytes_feature(value):
    """Wrap a single bytes object in a ``tf.train.Feature`` backed by a ``BytesList``.

    Args:
        value: the bytes to store; it becomes the only element of the list.

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` contains ``value``.
    """
    byte_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=byte_list)

In [4]:
def cvt_csv2tfrecord(csv_name, tf_name):
    """Convert a category CSV file into a ZLIB-compressed TFRecord file.

    Every CSV row is written as one ``tf.train.Example``. The first column is
    stored as the integer feature ``category_id``; the next three columns are
    encoded to bytes and stored as ``level1``, ``level2`` and ``level3``. The
    zero-based position of the row is stored as ``index``. When all rows have
    been written, the number of records is printed.

    Args:
        csv_name: path of the CSV file, read with ``pd.read_csv``.
        tf_name: path of the TFRecord file to create.
    """
    rows = pd.read_csv(csv_name).values
    zlib_options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)
    with tf.python_io.TFRecordWriter(tf_name, options=zlib_options) as writer:
        for position, row in enumerate(rows):
            # Column 0 is the category id; columns 1..3 are the level names.
            feature_map = {
                'category_id': _int64_feature(row[0]),
                'index': _int64_feature(position),
                'level1': _bytes_feature(row[1].encode()),
                'level2': _bytes_feature(row[2].encode()),
                'level3': _bytes_feature(row[3].encode()),
            }
            record = tf.train.Example(features=tf.train.Features(feature=feature_map))
            writer.write(record.SerializeToString())
    # Every row produced exactly one record, so the row count is the total.
    print('total count: {}'.format(len(rows)))

In [5]:
# Convert the category-name CSV into its TFRecord counterpart; the function
# prints how many records were written.
cvt_csv2tfrecord(csv_name=CATEGORY_NAME_PATH, tf_name=CATEGORY_TF_PATH)

total count: 5270


In [6]:
def create_examples(files):
    """Build the graph ops that read and parse one record from TFRecord files.

    The file names are fed through a shuffled, single-epoch
    ``tf.train.string_input_producer`` queue, read with a ZLIB-aware
    ``tf.TFRecordReader`` and decoded with ``tf.parse_single_example``.

    Args:
        files: the TFRecord file names handed to the input producer.

    Returns:
        The dictionary produced by ``tf.parse_single_example`` with the keys
        ``category_id``, ``index``, ``level1``, ``level2`` and ``level3``.
    """
    # Parsing schema: two scalar int64 features and three scalar string features.
    feature_spec = {
        'category_id': tf.FixedLenFeature([], tf.int64),
        'index': tf.FixedLenFeature([], tf.int64),
        'level1': tf.FixedLenFeature([], tf.string),
        'level2': tf.FixedLenFeature([], tf.string),
        'level3': tf.FixedLenFeature([], tf.string),
    }
    # The queue is created before the reader, keeping the graph ops in the
    # order queue -> reader -> parse.
    name_queue = tf.train.string_input_producer(files, num_epochs=1, shuffle=True)
    zlib_options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)
    record_reader = tf.TFRecordReader(options=zlib_options)
    _, serialized = record_reader.read(name_queue)
    # The whole parsed dictionary is returned; callers that need only some of
    # the features can pick the keys they want from it.
    return tf.parse_single_example(serialized, features=feature_spec)