# Using TFRecords to train ResNet50 in TensorFlow


This tutorial will explain how to train a "transfer learning" model using TRFrecords.


In [1]:
import tensorflow as tf
from io import BytesIO
from imageio import imread
from IPython import display
import cv2 as cv
import os
import random
import csv
import glob

In [2]:
# View the TFRecord files provided by the "RANZCR CLiP - Catheter and Line Position Challenge"
!ls '../data/test_tfrecords'


01-224.tfrec  04-224.tfrec  07-224.tfrec  10-224.tfrec	13-224.tfrec
02-224.tfrec  05-224.tfrec  08-224.tfrec  11-224.tfrec	14-224.tfrec
03-224.tfrec  06-224.tfrec  09-224.tfrec  12-224.tfrec	15-224.tfrec


# Step 1: Load the TFRecord files into a tf.data.Dataset

https://www.tensorflow.org/guide/data

TFRecord files can be loaded into a tf.data.Dataset.
The dataset's map() method can then be used to preprocess the data.  https://www.tensorflow.org/api_docs/python/tf/data/Dataset#map




Datasets can  can be fed directly into your model's model.fit().  They can also be "optimized" to speed up the training process.

In [28]:
INPUT_DATA_FOLDER = '../data/test_tfrecords/'

filenames = [INPUT_DATA_FOLDER]
raw_dataset = tf.data.TFRecordDataset(filenames)
raw_dataset

<TFRecordDatasetV2 shapes: (), types: tf.string>

In [33]:

# Get a list of the TFRecord files
TFRecordFiles = []
for dirname, _, filenames in os.walk(INPUT_DATA_FOLDER):
    for filename in filenames:
        if filename[-6:]=='.tfrec':
            TFRecordFiles.append(os.path.join(dirname, filename))

            
# Spilt the data into training and validation datasets                 
SplitNumber = int(.8*len(TFRecordFiles))  
train_dataset = tf.data.TFRecordDataset(TFRecordFiles[:SplitNumber])
val_dataset = tf.data.TFRecordDataset(TFRecordFiles[SplitNumber:])
        
    
    
# I looked this up in windows explorer
# It's the number of images in the train folder
DATASET_SIZE = 30083

train_size = int(0.8 * DATASET_SIZE)
val_size = int(0.2 * DATASET_SIZE)

# Create a training and a validation datasets
full_dataset = tf.data.TFRecordDataset(TFRecordFiles)
full_dataset = full_dataset.shuffle(buffer_size=1000)
train_dataset = full_dataset.take(train_size) #.cache()
val_dataset = full_dataset.skip(train_size).take(val_size) #.cache()
        
        

# Step 2: See what data is included in a single example

You need to create a "feature dictionary" that will be used to parse the TFRecord files into a format that can be used by your model.

To do that, you need to know what data is in a TFRecords, and the data's datatype.



In [34]:
for raw_record in train_dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    if True:  # Change this to True if you want to see the data.  I turned it off because it's long and it make this notebook hard to read.
        print(example)

features {
  feature {
    key: "StudyInstanceUID"
    value {
      bytes_list {
        value: "1.2.826.0.1.3680043.8.498.24260911792031702994888925871005278482"
      }
    }
  }
  feature {
    key: "image"
    value {
      bytes_list {
        value: "\377\330\377\340\000\020JFIF\000\001\001\001\001,\001,\000\000\377\333\000C\000\002\001\001\001\001\001\002\001\001\001\002\002\002\002\002\004\003\002\002\002\002\005\004\004\003\004\006\005\006\006\006\005\006\006\006\007\t\010\006\007\t\007\006\006\010\013\010\t\n\n\n\n\n\006\010\013\014\013\n\014\t\n\n\n\377\300\000\013\010\004\000\004\000\001\001\021\000\377\304\000\036\000\000\001\005\001\001\001\001\001\000\000\000\000\000\000\000\000\005\000\002\003\004\006\007\001\010\t\n\377\304\000S\020\000\002\001\003\003\002\004\003\005\006\005\002\004\002\001\025\001\002\003\000\004\021\005\022!\"1\006\0232ABQa\007\024Rq\201#b\221\241\261\360\0253r\301\321\341\361\010$C\202\026S\222\242%4c\262\0275D\203&Es\223\263Udt\302u\243\322\377\3

# Step 3: Create a feature dictionary

The feature dictionary should describe the data stored in the TFRecord
For more details, see:  https://www.tensorflow.org/tutorials/load_data/tfrecord#read_the_tfrecord_file

In [18]:
feature_dictionary = {
    'CVC - Abnormal': tf.io.FixedLenFeature([], tf.int64),
    'CVC - Borderline': tf.io.FixedLenFeature([], tf.int64),
    'CVC - Normal': tf.io.FixedLenFeature([], tf.int64),
    'ETT - Abnormal': tf.io.FixedLenFeature([], tf.int64),
    'ETT - Borderline': tf.io.FixedLenFeature([], tf.int64),
    'ETT - Normal': tf.io.FixedLenFeature([], tf.int64),
    'NGT - Abnormal': tf.io.FixedLenFeature([], tf.int64),
    'NGT - Borderline': tf.io.FixedLenFeature([], tf.int64),
    'NGT - Incompletely Imaged': tf.io.FixedLenFeature([], tf.int64),
    'NGT - Normal': tf.io.FixedLenFeature([], tf.int64),
    'StudyInstanceUID': tf.io.FixedLenFeature([], tf.string),    
    'Swan Ganz Catheter Present': tf.io.FixedLenFeature([], tf.int64),
    'image': tf.io.FixedLenFeature([], tf.string),
}

# Step 4: Parse the TFRecord

In this step, we parse the TFRecord data into a more useable format.

In [19]:
# Define two parsing functions that will turn the TFRecord back into an array and a label        
def _parse_function(example, feature_dictionary=feature_dictionary):
    # Parse the input `tf.train.Example` proto using the feature_dictionary.
    # Create a description of the features.
    parsed_example = tf.io.parse_example(example, feature_dictionary)
    return parsed_example


#print(TF_dataset)
#print('-----------------')
train_dataset = train_dataset.map(_parse_function, num_parallel_calls=tf.data.experimental.AUTOTUNE)
val_dataset = val_dataset.map(_parse_function, num_parallel_calls=tf.data.experimental.AUTOTUNE)
print(train_dataset)

<ParallelMapDataset shapes: {CVC - Abnormal: (), CVC - Borderline: (), CVC - Normal: (), ETT - Abnormal: (), ETT - Borderline: (), ETT - Normal: (), NGT - Abnormal: (), NGT - Borderline: (), NGT - Incompletely Imaged: (), NGT - Normal: (), StudyInstanceUID: (), Swan Ganz Catheter Present: (), image: ()}, types: {CVC - Abnormal: tf.int64, CVC - Borderline: tf.int64, CVC - Normal: tf.int64, ETT - Abnormal: tf.int64, ETT - Borderline: tf.int64, ETT - Normal: tf.int64, NGT - Abnormal: tf.int64, NGT - Borderline: tf.int64, NGT - Incompletely Imaged: tf.int64, NGT - Normal: tf.int64, StudyInstanceUID: tf.string, Swan Ganz Catheter Present: tf.int64, image: tf.string}>


# Step 5:  Print a couple images

In [24]:
def read_tfrecord(example):
    features = {
        "image": tf.io.FixedLenFeature([], tf.string),  # tf.string = bytestring (not text string)
        "class": tf.io.FixedLenFeature([], tf.int64),   # shape [] means scalar
        
        # additional (not very useful) fields to demonstrate TFRecord writing/reading of different types of data
        "label":         tf.io.FixedLenFeature([], tf.string),  # one bytestring
        "size":          tf.io.FixedLenFeature([2], tf.int64),  # two integers
        "one_hot_class": tf.io.VarLenFeature(tf.float32)        # a certain number of floats
    }
    # decode the TFRecord
    example = tf.io.parse_single_example(example, features)
    
    # FixedLenFeature fields are now ready to use: exmple['size']
    # VarLenFeature fields require additional sparse_to_dense decoding
    
    image = tf.image.decode_jpeg(example['image'], channels=3)
    image = tf.reshape(image, [*TARGET_SIZE, 3])
    
    class_num = example['class']
    
    label  = example['label']
    height = example['size'][0]
    width  = example['size'][1]
    one_hot_class = tf.sparse.to_dense(example['one_hot_class'])
    return image, class_num, label, height, width, one_hot_class
    
# read from TFRecords. For optimal performance, read from multiple
# TFRecord files at once and set the option experimental_deterministic = False
# to allow order-altering optimizations.

option_no_order = tf.data.Options()
option_no_order.experimental_deterministic = False


GCS_OUTPUT = '../data/test_tfrecords/'
AUTO = tf.data.experimental.AUTOTUNE
TARGET_SIZE = [192, 192]

filenames = tf.io.gfile.glob(GCS_OUTPUT + "*.tfrec")
dataset4 = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
dataset4 = dataset4.with_options(option_no_order)
dataset4 = dataset4.map(read_tfrecord, num_parallel_calls=AUTO)
dataset4 = dataset4.shuffle(300)


# define datatype as int 64

In [40]:
image_features = train_dataset.take(1)

image_features['image'].numpy()

TypeError: 'TakeDataset' object is not subscriptable

In [25]:
display_dataset = dataset4.map(lambda image, class_num, label, height, width, one_hot_class: (image, label))
display_9_images_from_dataset(display_dataset)

NameError: name 'display_9_images_from_dataset' is not defined

In [27]:
for image, class_num, label, height, width, one_hot_class in dataset4.take(3):
    print("Image shape {}, {}x{} px, class={} ({:>10}, {})".format(image.numpy().shape, width, height, class_num, label.numpy(), one_hot_class))

InvalidArgumentError: Feature: class (data type: int64) is required but could not be found.
	 [[{{node ParseSingleExample/ParseExample/ParseExampleV2}}]]

In [21]:
for image_features in train_dataset.take(1):
    image = image_features['image'].numpy()
    display.display(display.Image(data=image))

InvalidArgumentError: Feature: CVC - Abnormal (data type: int64) is required but could not be found.
	 [[{{node ParseExample/ParseExampleV2}}]]

# Step 6: Look at an image as an numpy ndarray

In [35]:
for image_features in train_dataset.take(1):
    image = image_features['image'].numpy()
    numpyArray = imread(BytesIO(image))
    print(numpyArray)

TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got 'image'

# Step 7: Look at the labels

In [None]:
for image_features in train_dataset.take(1):
    print('StudyInstanceUID: ', image_features['StudyInstanceUID'].numpy())
    print('CVC - Abnormal: ', image_features['CVC - Abnormal'].numpy())
    print('CVC - Borderline: ', image_features['CVC - Borderline'].numpy())
    print('CVC - Normal: ', image_features['CVC - Normal'].numpy())
    print('ETT - Abnormal: ', image_features['ETT - Abnormal'].numpy())
    print('ETT - Borderline: ', image_features['ETT - Borderline'].numpy())
    print('ETT - Normal: ', image_features['ETT - Normal'].numpy())
    print('NGT - Abnormal: ', image_features['NGT - Abnormal'].numpy())
    print('NGT - Borderline: ', image_features['NGT - Borderline'].numpy())
    print('NGT - Incompletely Imaged: ', image_features['NGT - Incompletely Imaged'].numpy())
    print('NGT - Normal: ', image_features['NGT - Normal'].numpy())
    print('Swan Ganz Catheter Present: ', image_features['Swan Ganz Catheter Present'].numpy())
    
    


# Step 8: Generate (feature, label) pairs

The "features" are the data you are using to make your prediction (i.e. the medical image)
The "labels" are what you are trying to predict.

We want to transform the TFRecord feature (a string of bytes) back into an image.  Then we want to conver the image from greyscale to RGB color, and change it's size.
For our labels, we just want to turn them into one long list of zeros and ones.

In [None]:
# Define two parsing functions that will turn the TFRecord back into an array and a label        
def generate_training_example(example):
    new_image_size=[224, 224]


    # Convert the image to an ndarray, resize it and convert it to RGB color
    # These are the settings most commonly required by base models used in transfer learning.

    features =  tf.io.decode_image(example['image'], expand_animations = False)
    features =  tf.image.grayscale_to_rgb(features)
    features =  tf.image.resize(features,size=new_image_size)
    
    
    
    #features = example['image'].numpy()
    #features = imread(BytesIO(features))
    #features = cv.resize(features, new_image_size)
    #features = cv.cvtColor(features, cv.COLOR_GRAY2RGB)
    labels = [ # Edit this to add whatever labels you want your model to predict
                example['CVC - Abnormal'],
                example['CVC - Borderline'],
                example['CVC - Normal'],
                example['ETT - Abnormal'],
                example['ETT - Borderline'],
                example['ETT - Normal'],
                example['NGT - Abnormal'],
                example['NGT - Borderline'],
                example['NGT - Incompletely Imaged'],
                example['NGT - Normal'],
                example['Swan Ganz Catheter Present'],
            ]

    return features, labels


train_dataset = train_dataset.map(generate_training_example, num_parallel_calls=tf.data.experimental.AUTOTUNE) # .cache('/kaggle/temp/train.cache')
val_dataset = val_dataset.map(generate_training_example, num_parallel_calls=tf.data.experimental.AUTOTUNE) # .cache('/kaggle/temp/test.cache')

for X in train_dataset.take(1):
    print(X[0])
    print('-------------------')
    print(X[1])

# Step 3: Train a simple model

Nothing fancy, I just want to show that the above code works.
This is an example of "Transfer Learning".  We are starting with an existing, trained model.  We are adding a new "classification head" and then training the model to predict our labels.


In [None]:
def create_model():
    
    n_labels = 11 # number or output classes
    
    auc = tf.keras.metrics.AUC(multi_label=True) # metric for multi-class multi-label models

    # https://keras.io/api/applications/
    # Options include: ResNet50, MobileNetV2
    base_model = tf.keras.applications.ResNet50(
        input_shape=(224, 224, 3), include_top=False, weights="imagenet"
    )

    base_model.trainable = True

    inputs = tf.keras.Input(shape=(224, 224, 3))
    
    # built-in resnet preprocessor
    #https://www.tensorflow.org/api_docs/python/tf/keras/applications/resnet/preprocess_input
    x = tf.keras.applications.resnet.preprocess_input(inputs)  
    
    x = base_model(x, training=False)
    

    # Convert features of shape `base_model.output_shape[1:]` to vectors
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    # A Dense classifier with a single unit (binary classification)
    outputs = tf.keras.layers.Dense(n_labels, activation='sigmoid')(x)
    model = tf.keras.Model(inputs, outputs)

    model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    #loss=custom_loss_fn,  # Custom Code to apply class weights
    metrics=[auc])


    return model




my_model = create_model()
print('New model created!')
print()
print(my_model.summary())

In [None]:
train_dataset = train_dataset.repeat().batch(32).prefetch(tf.data.experimental.AUTOTUNE)
val_dataset = val_dataset.repeat().batch(32).prefetch(tf.data.experimental.AUTOTUNE)



history = my_model.fit(
    train_dataset,
    epochs=30,
    steps_per_epoch=1000, 
    validation_data=val_dataset,
    validation_steps=50,
)

# Save the trained model
my_model.save('/kaggle/working/my_model/')


# test on the whole evaluiation dataset
#my_model.evaluate(val_dataset, steps=5000)
