# Create TFRecords

This notebook creates the TFRecords files from the dataset. It is an important step to speed up training and evaluation.

## 1. Import modules and define parameters

In [None]:
# import necessary modules
import os
import cv2
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from glob import glob

In [None]:
# define the dataset path
# the entries directly inside this folder define the classes, and the tfrecords
# are written to a sibling folder named DATASET_PATH + '_tfrecords'
DATASET_PATH = './UCM'

## 2. Create tfrecords files

In [None]:
# read and resize images in RGB
def imread(path_image, resize=(256,256)):
    """Load an image from disk, convert it to RGB and resize it.

    Args:
        path_image: Path of the image file; converted with ``str`` before
            being handed to OpenCV.
        resize: Target size forwarded to ``cv2.resize`` as ``dsize``,
            i.e. ``(width, height)``.

    Returns:
        The resized image with its channels in RGB order.

    Raises:
        ValueError: If OpenCV cannot read or decode the file.
    """
    image = cv2.imread(str(path_image))
    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise surface as an obscure assertion error inside cvtColor
    if image is None:
        raise ValueError(f'could not read image: {path_image}')

    # OpenCV loads images as BGR
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return cv2.resize(image, resize)

In [None]:
# returns a bytes_list from a string / byte / array
def _bytes_feature(value):
    """Wrap ``value`` into a ``tf.train.Feature`` holding a single bytes entry.

    Args:
        value: Raw ``bytes``, a ``str`` (encoded as UTF-8), an eager tensor,
            or any object exposing ``tobytes()`` such as a NumPy array.

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` contains the raw bytes.
    """
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy() # BytesList won't unpack a string from an EagerTensor.

    if isinstance(value, str):
        value = value.encode('utf-8')
    elif not isinstance(value, bytes):
        # arrays (and NumPy scalars) are stored as their raw memory buffer
        value = value.tobytes()

    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

# serialize input data
def serialize_example(image, label):
    """Pack one (image, label) pair into a serialized ``tf.train.Example``.

    Both values are stored as bytes features under the keys 'image' and
    'label'; the serialized protocol buffer is returned.
    """
    features = tf.train.Features(feature={
        'image': _bytes_feature(image),
        'label': _bytes_feature(label),
    })
    return tf.train.Example(features=features).SerializeToString()

In [None]:
# map the labels names into numeric values
# only directories are classes (stray files next to them are ignored), and they
# are sorted because glob returns entries in arbitrary order: without sorting
# the class indices could differ from one run / machine to another
labels_name = sorted(p for p in glob(os.path.join(DATASET_PATH, '*')) if os.path.isdir(p))
map_ann = {lab: i for i, lab in enumerate(labels_name)}

In [None]:
# get all images in the folders
# the '*.*' pattern also matches directories whose name contains a dot,
# so keep regular files only
path_images = [p for p in glob(os.path.join(DATASET_PATH, '**', '*.*'), recursive=True)
               if os.path.isfile(p)]
# shuffle in place so that the classes are mixed across the tfrecords files
np.random.shuffle(path_images)

In [None]:
# create and save tfrecords
tfrecords_path = DATASET_PATH+'_tfrecords'
size_record = 1  # number of examples stored in each .tfrec file
total_data  = len(path_images)

# the writer does not create missing folders, so make sure the target exists
os.makedirs(tfrecords_path, exist_ok=True)

# ceil division: the last file holds the remainder, and no empty trailing file
# is written when total_data is a multiple of size_record
num_records = (total_data + size_record - 1) // size_record
gzip_options = tf.io.TFRecordOptions(compression_type='GZIP')

with tqdm(total=total_data) as pbar:
    for i in range(num_records):
        start = i * size_record

        with tf.io.TFRecordWriter(os.path.join(tfrecords_path, f'{i}.tfrec'),
                                  options=gzip_options) as writer:
            # slicing stops at the end of the list, so the last file may be shorter
            for path_image in path_images[start:start + size_record]:
                img = imread(path_image)

                # the class of an image is the folder that directly contains it
                label = os.path.dirname(path_image)
                label = tf.keras.utils.to_categorical(map_ann[label], len(map_ann))

                example = serialize_example(img, label)
                writer.write(example)

                pbar.update(1)

## (optional) 3. Evaluate tfrecords data

In [None]:
# define function to read the tfrecords data
def __read_data(example):
    """Decode one serialized example into an (image, label) pair of tensors.

    The image bytes are interpreted as uint8 and reshaped to (256, 256, 3);
    the label bytes are interpreted as float32 and reshaped to
    (len(map_ann),).
    """
    # both features were stored as a single bytes string
    feature_spec = {
        key: tf.io.FixedLenFeature([], tf.string) for key in ('image', 'label')
    }
    parsed = tf.io.parse_single_example(example, feature_spec)

    raw_image = tf.io.decode_raw(parsed['image'], tf.uint8)
    image = tf.reshape(raw_image, (256,256,3))

    raw_label = tf.io.decode_raw(parsed['label'], tf.float32)
    label = tf.reshape(raw_label, (len(map_ann),))

    return image, label

# reading order does not matter here, so allow TensorFlow to trade
# determinism for throughput
ignore_order = tf.data.Options()
ignore_order.experimental_deterministic = False

# read every gzip-compressed tfrecords file written in the previous section
dataset = tf.data.TFRecordDataset(tf.io.gfile.glob(os.path.join(tfrecords_path, '*.tfrec')),
                                  compression_type='GZIP',
                                  num_parallel_reads=tf.data.experimental.AUTOTUNE)
# the options only take effect once they are attached to the dataset
dataset = dataset.with_options(ignore_order)
dataset = dataset.map(__read_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# verify the number of samples for each class (original dataset)
# maps every label path to the number of entries it contains
{class_dir: len(os.listdir(class_dir)) for class_dir in labels_name}

In [None]:
# verify the number of samples for each class (tfrecords dataset)
# class index of an example = position of the maximum in its label vector
ally = [np.argmax(label) for _, label in dataset]

# pair every class index found with the number of examples carrying it
unique, counts = np.unique(ally, return_counts=True)
list(zip(unique, counts))