Create TFRecord files from CSV data and images stored on the local runtime

In [None]:
# Input and output locations (relative paths written with Windows separators).
# Folder that is searched for the per-image '<id>.npz' files.
images_dir = 'test_data\\test_data'
# CSV file (a file, not a directory, despite the name) whose rows list the images to encode.
csv_dir = 'test_data\\test_gt_v2.csv'
# Output file name template; '{}' is replaced with the TFRecord file index.
output_path = 'test_tfrecords{}.record'

In [None]:
#Import all required packages
import os
import pandas as pd
import io
import tensorflow as tf
from PIL import Image
import numpy as np

In [None]:
#Define some utilities

def int64_feature(value):
  """Wrap one value in a ``tf.train.Feature`` backed by a single-element ``Int64List``."""
  wrapped = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=wrapped)


def int64_list_feature(value):
  """Wrap ``value`` in a ``tf.train.Feature`` backed by an ``Int64List``.

  ``value`` is handed to ``Int64List`` unchanged, so it is expected to already
  be a sequence of values.
  """
  wrapped = tf.train.Int64List(value=value)
  return tf.train.Feature(int64_list=wrapped)


def bytes_feature(value):
  """Wrap one value in a ``tf.train.Feature`` backed by a single-element ``BytesList``."""
  wrapped = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=wrapped)


def bytes_list_feature(value):
  """Wrap ``value`` in a ``tf.train.Feature`` backed by a ``BytesList``.

  ``value`` is handed to ``BytesList`` unchanged, so it is expected to already
  be a sequence of values.
  """
  wrapped = tf.train.BytesList(value=value)
  return tf.train.Feature(bytes_list=wrapped)


def float_feature(value):
  """Wrap one value in a ``tf.train.Feature`` backed by a single-element ``FloatList``."""
  wrapped = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=wrapped)


def float_list_feature(value):
  """Wrap ``value`` in a ``tf.train.Feature`` backed by a ``FloatList``.

  ``value`` is handed to ``FloatList`` unchanged, so it is expected to already
  be a sequence of values.
  """
  wrapped = tf.train.FloatList(value=value)
  return tf.train.Feature(float_list=wrapped)


In [None]:
#Encode image and data to TFRecords

def create_tf_example(row, path):
    """Build a ``tf.train.Example`` for the image referenced by one CSV row.

    The first entry of ``row`` is floored and cast to an integer to form the
    file name ``<id>.npz``, which is loaded from ``path``. The archive entries
    are passed as keyword arguments to ``np.ma.MaskedArray``; masked positions
    are set to zero before the array is serialized.

    Args:
        row: Indexable record whose element 0 is the numeric image id.
        path: Directory containing the ``.npz`` files.

    Returns:
        A ``tf.train.Example`` with the features ``image/height``,
        ``image/width``, ``image`` (raw bytes of the transposed array) and
        ``image/filename`` (UTF-8 encoded file name).
    """
    image_id = np.int64(np.floor(row[0]))
    image_filename = str(image_id) + '.npz'

    with np.load(os.path.join(path, image_filename)) as archive:
        masked = np.ma.MaskedArray(**archive)

    # Zero out every masked position directly in the underlying data array.
    pixels = masked.data
    pixels[masked.mask] = 0

    # Move axis 0 to the end (presumably channel-first -> channel-last; the
    # array must be 3-D for this transpose to succeed).
    pixels = pixels.transpose(1, 2, 0)
    height, width = pixels.shape[:2]

    feature_map = {
        'image/height': int64_feature(height),
        'image/width': int64_feature(width),
        'image': bytes_feature(pixels.tobytes()),
        'image/filename': bytes_feature(image_filename.encode('utf8')),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature_map))

Run the following cell to create the TFRecord file(s).

In [None]:
# Define the desired number of TFRecords files, for TPU parallel data loading 100MB is the optimal file size
number_of_tfrecords_files = 1
if number_of_tfrecords_files < 1:
    raise ValueError('number_of_tfrecords_files must be at least 1')

# Read csv file content as an array of rows
csv = pd.read_csv(csv_dir).values

# Sequentially read the dataset and write TFRecord(s).
# Every file receives the same number of rows; the last file additionally
# takes whatever remainder is left over from the integer division.
number_of_images_per_file = len(csv) // number_of_tfrecords_files
images_processed = 0

for i in range(number_of_tfrecords_files):
    images_index_start = i * number_of_images_per_file
    if i == number_of_tfrecords_files - 1:
        images_index_end = len(csv)
    else:
        images_index_end = images_index_start + number_of_images_per_file

    # The context manager closes the writer even if encoding a row raises, so
    # a failure cannot leave an open file handle behind.
    with tf.io.TFRecordWriter(output_path.format(i)) as writer:
        for row in csv[images_index_start:images_index_end]:
            images_processed += 1
            tf_example = create_tf_example(row, images_dir)
            writer.write(tf_example.SerializeToString())

    print('Successfully created the TFRecord file: {}'.format(output_path.format(i)))