In [1]:
import tensorflow_datasets as tfds
import tensorflow as tf
from os import getcwd
import multiprocessing

In [2]:
def create_model():
    """Build and compile a two-class image classifier on a frozen MobileNetV2.

    The network takes 224x224 RGB inputs, runs them through MobileNetV2
    (ImageNet weights, classification head removed, weights frozen), applies
    global average pooling, and ends in a 2-unit softmax layer.

    Returns:
        A `tf.keras` model compiled with the Adam optimizer,
        sparse categorical cross-entropy loss, and the 'acc' metric.
    """
    inputs = tf.keras.layers.Input(shape=(224, 224, 3))

    # Feature extractor: only the new head defined below remains trainable.
    backbone = tf.keras.applications.MobileNetV2(
        input_tensor=inputs,
        weights='imagenet',
        include_top=False,
    )
    backbone.trainable = False

    pooled = tf.keras.layers.GlobalAveragePooling2D()(backbone.output)
    class_probs = tf.keras.layers.Dense(2, activation='softmax')(pooled)

    model = tf.keras.models.Model(inputs=inputs, outputs=class_probs)
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['acc'],
    )
    return model

In [3]:
# Name of the TFDS dataset and the directory (relative to the working
# directory) that tfds uses as its data_dir.
dataset_name = 'cats_vs_dogs'
filePath = f"{getcwd()}/../tmp2"

# Load the TRAIN split together with the dataset metadata (`info`).
dataset, info = tfds.load(
    name=dataset_name,
    split=tfds.Split.TRAIN,
    with_info=True,
    data_dir=filePath,
)

In [4]:
# Print the dataset version as text; the bare expression on the last line
# additionally shows the Version object's repr as the cell output.
# The version is used further down to build the TFRecord file pattern.
print(info.version)
info.version

2.0.1


<tensorflow_datasets.core.utils.version.Version at 0x7fd4c45e06a0>

# Naively training with tf.data

Let us run through the code in a normal scenario. We will not use any of the new concepts of parallelization we have learnt in this module.

In [5]:
def preprocess(features):
    """Turn one example dict into an (image, label) training pair.

    Args:
        features: mapping with an 'image' entry and a 'label' entry.

    Returns:
        A tuple of the image resized to 224x224 and divided by 255.0,
        and the unchanged label.
    """
    resized = tf.image.resize(features['image'], (224, 224))
    return resized / 255.0, features['label']

In [6]:
# Baseline input pipeline: sequential map followed by batching into groups
# of 32, with no parallelism, caching, or prefetching.
train_dataset = (
    dataset
    .map(preprocess)
    .batch(32)
)

```python
# Build a fresh model and train it for 5 epochs on the current `train_dataset`.
model = create_model()
model.fit(train_dataset, epochs=5)
```

# Exercise

This exercise is about parallelizing the various stages of the Extract, Transform, and Load (ETL) process. You will complete the following tasks:

1.   Parallelize the extraction of the stored TFRecords of the cats_vs_dogs dataset using the interleave operation.
2.   Parallelize the transformation during preprocessing of the raw dataset using the map operation.
3.   Cache the processed dataset in memory using the cache operation for faster retrieval.
4.   Parallelize the loading of the cached dataset during the training cycle using the prefetch operation.



In [7]:
# Glob pattern matching the TFRecord shards of the train split. It is built
# from `filePath` (the data_dir that was passed to tfds.load) so the pattern
# always points at the same directory the dataset was written to.
file_pattern = f'{filePath}/{dataset_name}/{info.version}/{dataset_name}-train.tfrecord*'

# Dataset of the shard file names matching the pattern.
files = tf.data.Dataset.list_files(file_pattern)

## Parallelize Extraction


In [8]:
# EXERCISE: Parallelize the extraction of the stored TFRecords of the
# cats_vs_dogs dataset with the interleave operation, using cycle_length = 4
# and tf.data.experimental.AUTOTUNE for the number of parallel calls.
train_dataset = files.interleave(
    tf.data.TFRecordDataset,
    cycle_length=4,
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
)


In [9]:
def read_tfrecord(serialized_example):
    """Parse one serialized example into an (image, label) pair.

    Args:
        serialized_example: a serialized example holding an 'image' bytes
            feature (JPEG-encoded) and an int64 'label' feature.

    Returns:
        A tuple of the decoded 3-channel image as float32, divided by 255
        and resized to 224x224, and the parsed label.
    """
    # Schema of the stored record; defaults are "" for a missing image
    # and -1 for a missing label.
    schema = {
        'image': tf.io.FixedLenFeature((), tf.string, ""),
        'label': tf.io.FixedLenFeature((), tf.int64, -1),
    }
    parsed = tf.io.parse_single_example(serialized_example, schema)

    # Decode -> float32 -> scale by 1/255 -> resize, in that order.
    pixels = tf.io.decode_jpeg(parsed['image'], channels=3)
    pixels = tf.cast(pixels, tf.float32) / 255.
    pixels = tf.image.resize(pixels, [224, 224])

    return pixels, parsed['label']


## Parallelize Transformation

In [10]:
# Get the number of CPU cores.
cores = multiprocessing.cpu_count()

print(cores)

# Apply the map transformation with the number of parallel calls set to the
# number of cores, so records are parsed and preprocessed concurrently.
train_dataset = train_dataset.map(read_tfrecord, num_parallel_calls=cores)


8


## Cache the dataset

In [11]:
# Cache the processed dataset in memory.
# The cache must be applied to `train_dataset` (the interleaved, parsed and
# resized pipeline), not to the raw `dataset` returned by tfds.load;
# caching `dataset` would discard the extraction and transformation stages.
train_dataset = train_dataset.cache()


## Parallelize Loading

In [12]:
# Shuffle with a 1024-element buffer, group into batches of 32, then
# parallelize loading by prefetching with an AUTOTUNE buffer size.
train_dataset = (
    train_dataset
    .shuffle(1024)
    .batch(32)
    .prefetch(tf.data.experimental.AUTOTUNE)
)


```python
# Build a fresh model and train it for one epoch on the current `train_dataset`.
model = create_model()
model.fit(train_dataset, epochs=1)
```

# Submission Instructions

In [None]:
# Now click the 'Submit Assignment' button above.
# Once that is complete, please run the following two cells to save your work and close the notebook

In [None]:
%%javascript
<!-- Save the notebook -->
// Calls the notebook front end's save_checkpoint() on the current notebook.
IPython.notebook.save_checkpoint();

In [None]:
%%javascript
<!-- Shutdown and close the notebook -->
// Clear any beforeunload handler first (presumably so no leave-page prompt
// interferes with closing the window).
window.onbeforeunload = null
window.close();
// Delete the notebook's session.
IPython.notebook.session.delete();