Tensorflow tf.Data api allows you to build a data input pipeline. Using this you can handle large dataset for your deep learning training by streaming training samples from hard disk or S3 storage. tf.data.Dataset is the main class in tf.data api.tf pipeline allows not only to stream the data for training but you can peform various transformations easily by writing a single line of code.

# benefits

1. handle huge datasets by streaming  them from disk using batching

2. apply transformationds to mae dataset ready for model training


In [1]:
import tensorflow as tf

In [2]:
#  create tf dataset
# a python list
daily_sales_numbers = [21, 22, -108, 31, -1, 32, 34, 31]

# tf dataset
tf_dataset = tf.data.Dataset.from_tensor_slices(daily_sales_numbers)
tf_dataset

<TensorSliceDataset shapes: (), types: tf.int32>

In [3]:
# to view the content
for sales in tf_dataset:
    print(sales.numpy())


21
22
-108
31
-1
32
34
31


In [4]:
for sales in tf_dataset.as_numpy_iterator():
    print(sales)

21
22
-108
31
-1
32
34
31


In [6]:
#  view first three elments
for sales in tf_dataset.take(3):
    print(sales.numpy())

21
22
-108


In [8]:
# rmoves the negative values(invalid data)
tf_dataset = tf_dataset.filter(lambda x: x>0)

for sales in tf_dataset.as_numpy_iterator():
    print(sales)


21
22
31
32
34
31


In [9]:
# convert to local currency
tf_dataset = tf_dataset.map(lambda x:x * 114)
for sales in tf_dataset.as_numpy_iterator():
    print(sales)

2394
2508
3534
3648
3876
3534


In [11]:
# shuffle te elements randomly
tf_dataset = tf_dataset.shuffle(3)
for sales in tf_dataset.as_numpy_iterator():
    print(sales)

3534
3648
2394
3876
3534
2508


In [13]:
# batching
for sales_batch in tf_dataset.batch(2):
    print(sales_batch.numpy())

[2508 2394]
[3534 3648]
[3534 3876]


https://stackoverflow.com/questions/53514495/what-does-batch-repeat-and-shuffle-do-with-tensorflow-da

In [15]:
# do all te aove in one line
tf_dataset = tf.data.Dataset.from_tensor_slices(daily_sales_numbers)

tf_dataset = tf_dataset.filter(lambda x: x>0).map(lambda y: y*114).shuffle(2).batch(2)

for sales in tf_dataset.as_numpy_iterator():
    print(sales)


[2508 3534]
[3648 2394]
[3876 3534]


In [17]:
images_ds = tf.data.Dataset.list_files("images/*/*", shuffle= False)

# print several file paths
for file in images_ds.take(5):
    print(file.numpy())


b'images\\cat\\10 Cat Exercises Your Pet Will Enjoy....jpg'
b'images\\cat\\100 Best Girl Cat Names - Unique and....jpg'
b'images\\cat\\10_000_ Best Cat Photos \xc2\xb7 100_ Free....jpeg'
b'images\\cat\\15 ways to keep your indoor cat happy.jpg'
b'images\\cat\\177_330 Cat Sitting Stock Photos....jpg'


In [21]:
# shuffle omages

images_ds = images_ds.shuffle(200)

for file in images_ds.take(3):
    print(file.numpy())

b'images\\dog\\Friends_ You and Your Dog - HelpGuide.org.jpg'
b'images\\dog\\The dog did not develop symptoms of....jpg'
b'images\\cat\\cat _ Breeds & Facts _ Britannica.jpg'


In [22]:
# create list for class names
class_names = ["cat", "dog"]

# split into train and test  using tensorflow

In [24]:

image_count = len(images_ds)
image_count

80

In [25]:
train_size = int(image_count * 0.8)

# use take and skip to split
train_ds = images_ds.take(train_size)
test_ds = images_ds.skip(train_size)

In [26]:
len(train_ds)

64

In [27]:
len(test_ds)

16

In [29]:
s = 'images\\dog\\The dog did not develop symptoms of....jpg'
s.split("\\")[-2]

'dog'

In [38]:
# retrieve label from image path.
# the Y part
def get_label(file_path):
    import os
    return tf.strings.split(file_path, os.path.sep)[-2]

In [41]:
# the X part
def process_image(file_path):
    label = get_label(file_path)

    img = tf.io.read_file(file_path)  # load the raw data from the file as a string
    img = tf.image.decode_jpeg(img)
    img = tf.image.resize(img, [128,128])

    return img, label

In [36]:
for t in train_ds.take(4):
    print(t.numpy())


b'images\\cat\\cat _ Breeds & Facts _ Britannica.jpg'
b'images\\cat\\10_000_ Best Cat Photos \xc2\xb7 100_ Free....jpeg'
b'images\\dog\\Dog Breeds Banned By Home Insurance....jpg'
b'images\\cat\\Feline philosophy_ what humans can....jpg'


In [37]:
for label in train_ds.map(get_label):
    print(label)

tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=

In [44]:
train_ds = train_ds.map(process_image)
for img, label in train_ds.take(3):
    print("Image: ", img)
    print("Label: ", label)

Image:  tf.Tensor(
[[[ 91.97302  118.97302  135.97302 ]
  [ 95.98718  122.98718  139.98718 ]
  [ 98.96875  125.96875  142.96875 ]
  ...
  [155.97864  150.97864  144.97864 ]
  [154.57141  149.57141  143.57141 ]
  [154.00427  145.00427  140.00427 ]]

 [[ 99.440796 126.440796 143.4408  ]
  [104.184204 131.1842   148.1842  ]
  [106.       133.       150.      ]
  ...
  [158.       153.       147.      ]
  [155.       150.       144.      ]
  [156.90625  147.90625  142.90625 ]]

 [[105.70703  132.70703  149.70703 ]
  [110.84375  137.84375  154.84375 ]
  [112.84375  139.84375  156.84375 ]
  ...
  [158.       153.       147.      ]
  [157.       152.       146.      ]
  [157.84375  148.84375  143.84375 ]]

 ...

 [[196.42786  163.42786  156.42786 ]
  [197.22034  163.22034  154.22034 ]
  [194.57947  155.94666  145.26306 ]
  ...
  [194.15625  170.15625  160.15625 ]
  [197.15625  167.15625  157.15625 ]
  [197.86328  159.86328  150.86328 ]]

 [[200.16187  176.08093  166.32373 ]
  [204.       180.

In [45]:
# scalling
# use a function
def scale(image, label):
    return image/255, label


In [47]:
train_ds = train_ds.map(scale)

for image, label in train_ds.take(5):
    print("****Image: ", image.numpy()[0][0])
    print("****Label: ", label.numpy())

****Image:  [0.9764706  0.98039216 0.9882353 ]
****Label:  b'dog'
****Image:  [0.51222426 0.5043811  0.4455576 ]
****Label:  b'dog'
****Image:  [0.6117647  0.64715075 0.68828124]
****Label:  b'cat'
****Image:  [1. 1. 1.]
****Label:  b'cat'
****Image:  [0.1137602  0.12586598 0.15382558]
****Label:  b'cat'


# Exercise


Movie reviews are present as individual text file (one file per review) in review folder.

Folder structure looks like this,

reviews
    |__ positive
        |__pos_1.txt
        |__pos_2.txt
        |__pos_3.txt
    |__ negative
        |__neg_1.txt
        |__neg_2.txt
        |__neg_3.txt

You need to read these reviews using tf.data.Dataset and perform following transformations,

1.Read text review and generate a label from folder name. your dataset should have review text and label as a tuple

2.Filter blank text review. Two files are blank in this dataset

3. Do all of the above transformations in single line of code. Also shuffle all the reviews

Solution: https://github.com/codebasics/deep-learning-keras-tf-tutorial/tree/master/44_tf_data_pipeline/Exercise/tf_data_pipeline_exercise_solution.ipynb

