# Scalable TensorFlow Dataset Introduction

In [1]:
import os
import shutil

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

## Datasets, Pandas, and Numpy

In [2]:
# One integer column (auto-labelled 0) holding the values 0..9.
df = pd.DataFrame(data=range(10))
df

Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [3]:
# The frame's data as a 2-D array: one row per DataFrame row, a single column.
df.values

array([[0],
       [1],
       [2],
       [3],
       [4],
       [5],
       [6],
       [7],
       [8],
       [9]])

In [4]:
# Confirm that `.values` hands back a plain NumPy array.
type(df.values)

numpy.ndarray

In [5]:
# A DataFrame can be passed in directly; the repr below reports elements of shape (1,).
tf.data.Dataset.from_tensor_slices(df)

<TensorSliceDataset shapes: (1,), types: tf.int64>

In [6]:
# Slice the frame along its first axis: every row becomes one dataset element.
ds_from_tensor_slices = tf.data.Dataset.from_tensor_slices(df)
[element for element in ds_from_tensor_slices.as_numpy_iterator()]

[array([0]),
 array([1]),
 array([2]),
 array([3]),
 array([4]),
 array([5]),
 array([6]),
 array([7]),
 array([8]),
 array([9])]

In [7]:
# No slicing here: the whole frame is stored as the single element of the dataset.
ds_from_tensors = tf.data.Dataset.from_tensors(df)
[element for element in ds_from_tensors.as_numpy_iterator()]

[array([[0],
        [1],
        [2],
        [3],
        [4],
        [5],
        [6],
        [7],
        [8],
        [9]])]

In [8]:
# Pull the first element of each dataset to contrast the two constructors.
first_of_tensor_slices = next(ds_from_tensor_slices.as_numpy_iterator())
first_of_tensors = next(ds_from_tensors.as_numpy_iterator())

print("0th element from ds from tensor slices:", first_of_tensor_slices)
print()
print("0th element from ds from tensors:", first_of_tensors)

0th element from ds from tensor slices: [0]

0th element from ds from tensors: [[0]
 [1]
 [2]
 [3]
 [4]
 [5]
 [6]
 [7]
 [8]
 [9]]


In [9]:
# Built from a plain Python list of ints; the repr below reports scalar int32 elements.
range_ds = tf.data.Dataset.from_tensor_slices([*range(10)])
print(range_ds)
print([value for value in range_ds.as_numpy_iterator()])

<TensorSliceDataset shapes: (), types: tf.int32>
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [10]:
def print_ds(ds: tf.data.Dataset) -> None:
    """Print all elements of ``ds`` as a single Python list.

    The dataset is consumed completely through ``as_numpy_iterator()``
    before anything is printed, so only pass finite datasets.
    """
    elements = [element for element in ds.as_numpy_iterator()]
    print(elements)

In [11]:
# For every element i of range_ds, build the dataset 0..i-1 and interleave the
# resulting sub-datasets into one flat dataset (45 values in total, see below).
# NOTE(review): `range(i)` receives a tensor here - presumably AutoGraph rewrites
# it into a TF op while tracing the lambda; confirm for the TF version in use.
interleaved_ds = range_ds.interleave(lambda i: tf.data.Dataset.from_tensor_slices(range(i)))
print(interleaved_ds)
print_ds(interleaved_ds)

<InterleaveDataset shapes: (), types: tf.int32>
[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 1, 3, 3, 3, 3, 3, 2, 4, 4, 4, 4, 3, 5, 5, 5, 4, 6, 6, 5, 7, 6, 7, 8]


In [12]:
# Same dataset shuffled with two buffer sizes (10, then 100); compare how far
# elements move from their original position in the two printed lists.
for shuffle_buffer in (10, 100):
    print_ds(interleaved_ds.shuffle(buffer_size=shuffle_buffer))

[0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 2, 2, 2, 0, 1, 1, 1, 3, 3, 3, 4, 3, 3, 0, 4, 2, 3, 2, 6, 4, 2, 5, 5, 0, 6, 4, 7, 5, 7, 5, 2, 6, 4, 8]
[3, 5, 1, 5, 2, 0, 3, 6, 2, 1, 6, 3, 0, 8, 1, 5, 2, 2, 2, 0, 4, 7, 1, 3, 6, 0, 1, 1, 3, 3, 1, 0, 0, 4, 5, 4, 4, 2, 0, 2, 0, 7, 0, 1, 4]


In [13]:
# Group consecutive elements into arrays of 10; the last batch holds the 5 leftover values.
print_ds(interleaved_ds.batch(10))

[array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1], dtype=int32), array([1, 1, 1, 1, 1, 0, 2, 2, 2, 2], dtype=int32), array([2, 2, 1, 3, 3, 3, 3, 3, 2, 4], dtype=int32), array([4, 4, 4, 3, 5, 5, 5, 4, 6, 6], dtype=int32), array([5, 7, 6, 7, 8], dtype=int32)]


In [14]:
# The order of the two operations matters:
#   batch -> shuffle reorders whole batches, their contents stay untouched;
#   shuffle -> batch mixes individual elements before they are grouped.
batched_then_shuffled = interleaved_ds.batch(10).shuffle(10)
shuffled_then_batched = interleaved_ds.shuffle(10).batch(10)

print("first batch - then shuffle:")
print_ds(batched_then_shuffled)
print()
print("first shuffle - then batch:")
print_ds(shuffled_then_batched)

first batch - then shuffle:
[array([2, 2, 1, 3, 3, 3, 3, 3, 2, 4], dtype=int32), array([4, 4, 4, 3, 5, 5, 5, 4, 6, 6], dtype=int32), array([5, 7, 6, 7, 8], dtype=int32), array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1], dtype=int32), array([1, 1, 1, 1, 1, 0, 2, 2, 2, 2], dtype=int32)]

first shuffle - then batch:
[array([0, 0, 0, 0, 0, 1, 1, 1, 1, 2], dtype=int32), array([1, 0, 2, 2, 2, 2, 1, 1, 0, 3], dtype=int32), array([0, 3, 2, 3, 3, 0, 1, 3, 4, 2], dtype=int32), array([5, 5, 6, 5, 7, 4, 6, 4, 5, 8], dtype=int32), array([4, 4, 3, 6, 7], dtype=int32)]


# Datasets and Generators: What if the data doesn't fit into memory?

In [15]:
# A Python range is a lazy iterable: the values are handed out one at a time.
for i in range(3):
    print(i)

0
1
2


In [16]:
# Pass the range object itself (not wrapped in list()). The resulting
# TypeError is the point of this cell, so it is caught and its message printed.
try:
    tf.data.Dataset.from_tensor_slices(range(3))
except TypeError as e:
    print(e)

'list' object cannot be interpreted as an integer


In [17]:
# Same experiment with from_tensors: the range object is rejected with the
# same TypeError message, which is caught and printed.
try:
    tf.data.Dataset.from_tensors(range(3))
except TypeError as e:
    print(e)

'list' object cannot be interpreted as an integer


* `tf.data.Dataset.from_tensor_slices` cannot consume a lazy Python iterable such as `range(3)` directly (see the `TypeError` above)
* `tf.data.Dataset.from_tensors` fails in the same way

In [18]:
def int_generator():
    """Yield the integers 0, 1 and 2, one after another."""
    for number in range(3):
        yield number
    
# Wrap the generator function in a dataset; the element dtype is declared up
# front, while the element shape stays unknown (see the repr printed below).
ds_from_int_generator = tf.data.Dataset.from_generator(
    int_generator,
    output_types=tf.int32,
)
print(ds_from_int_generator)
print_ds(ds_from_int_generator)

<FlatMapDataset shapes: <unknown>, types: tf.int32>
[0, 1, 2]


In [19]:
def tuple_generator():
    """Yield the pairs (0, 2), (1, 1), (2, 0): an ascending and a descending counter."""
    ascending = range(3)
    for up, down in zip(ascending, reversed(ascending)):
        yield up, down
    
# Elements are pairs, so output_types is given as a matching pair of dtypes.
ds_from_tuple_generator = tf.data.Dataset.from_generator(
    tuple_generator,
    output_types=(tf.int32, tf.int32),
)
print(ds_from_tuple_generator)
print_ds(ds_from_tuple_generator)

<FlatMapDataset shapes: (<unknown>, <unknown>), types: (tf.int32, tf.int32)>
[(0, 2), (1, 1), (2, 0)]


In [20]:
# Measure the iteration throughput of the generator-backed dataset; the summary
# and the table shown below come from this call.
tfds.benchmark(ds_from_tuple_generator)

1it [00:00, ?it/s]


************ Summary ************

Examples/sec (First included) 71.33 ex/sec (total: 3 ex, 0.04 sec)
Examples/sec (First only) 59.06 ex/sec (total: 1 ex, 0.02 sec)
Examples/sec (First excluded) 79.59 ex/sec (total: 2 ex, 0.03 sec)


Unnamed: 0,duration,num_examples,avg
first+lasts,0.042061,3,71.325357
first,0.016931,1,59.063163
lasts,0.02513,2,79.586941


## Attention: Python code can be really slow

In [21]:
benchmark_size = 2 * 10**4

# Variant 1: all values live in an in-memory DataFrame that is sliced row by row.
large_df = pd.DataFrame(range(benchmark_size))
large_ds_from_tensor_slices = tf.data.Dataset.from_tensor_slices(large_df)


# Variant 2: the same values, produced one at a time by a Python generator.
def large_generator():
    """Yield the integers 0 .. benchmark_size - 1."""
    for number in range(benchmark_size):
        yield number


large_ds_from_generator = tf.data.Dataset.from_generator(
    large_generator,
    output_types=tf.int32,
)

print("from tensor slices:")
tfds.benchmark(large_ds_from_tensor_slices)

# two empty lines between the two reports
print("\n")

print("from generator:")
tfds.benchmark(large_ds_from_generator)

from tensor slices:


  0%|          | 1/20000 [00:00<?, ?it/s]


************ Summary ************

Examples/sec (First included) 20343.32 ex/sec (total: 20000 ex, 0.98 sec)
Examples/sec (First only) 409.58 ex/sec (total: 1 ex, 0.00 sec)
Examples/sec (First excluded) 20392.95 ex/sec (total: 19999 ex, 0.98 sec)


from generator:


1it [00:00, ?it/s]


************ Summary ************

Examples/sec (First included) 3611.40 ex/sec (total: 20000 ex, 5.54 sec)
Examples/sec (First only) 82.60 ex/sec (total: 1 ex, 0.01 sec)
Examples/sec (First excluded) 3619.13 ex/sec (total: 19999 ex, 5.53 sec)


Unnamed: 0,duration,num_examples,avg
first+lasts,5.538025,20000,3611.39602
first,0.012106,1,82.601634
lasts,5.525918,19999,3619.126977


## The generator is ~5.6x slower (≈20,300 vs. ≈3,600 examples/sec).

In [22]:
# Same comparison with 128 elements grouped per batch. The benchmark counts
# every batch as one "example" (157 batches for 20,000 elements), so the rates
# printed below are batches per second.
batched_from_tensor_slices = large_ds_from_tensor_slices.batch(128)
batched_from_generator = large_ds_from_generator.batch(128)

print("batched from tensor slices:")
tfds.benchmark(batched_from_tensor_slices)

# two empty lines between the two reports
print("\n")

print("batched from generator:")
tfds.benchmark(batched_from_generator)

batched from tensor slices:


  1%|          | 1/157 [00:00<?, ?it/s]


************ Summary ************

Examples/sec (First included) 3244.41 ex/sec (total: 157 ex, 0.05 sec)
Examples/sec (First only) 300.53 ex/sec (total: 1 ex, 0.00 sec)
Examples/sec (First excluded) 3461.79 ex/sec (total: 156 ex, 0.05 sec)


batched from generator:


1it [00:00, ?it/s]


************ Summary ************

Examples/sec (First included) 41.09 ex/sec (total: 157 ex, 3.82 sec)
Examples/sec (First only) 28.99 ex/sec (total: 1 ex, 0.03 sec)
Examples/sec (First excluded) 41.20 ex/sec (total: 156 ex, 3.79 sec)


Unnamed: 0,duration,num_examples,avg
first+lasts,3.820738,157,41.091534
first,0.034492,1,28.992076
lasts,3.786246,156,41.201758


## Even batched generators are not that fast...
### You can use generators, but they are not optimal. However, they are extremely flexible: almost anything can be expressed with them!

A more complex generator (a callable class):

In [23]:
# A callable class used as a configurable generator: __init__ stores the
# settings, and calling the instance returns the actual generator.

class MyGenerator:
    """Callable that produces the integers ``0 .. benchmark_size - 1``."""

    def __init__(self, benchmark_size: int) -> None:
        """Remember how many integers each call should yield."""
        self.benchmark_size = benchmark_size

    def __call__(self, msg: str = "Hello"):
        """Return a generator over ``range(self.benchmark_size)``.

        The message line is printed when the generator is first advanced,
        not at the moment the instance is called.
        """
        print(f"I was run with '{msg}'!")
        numbers = range(self.benchmark_size)
        for number in numbers:
            yield number

# Call the instance with a custom message and pull the first value; the message
# is printed at this point because the generator body only starts running now.
next(MyGenerator(1)(msg="world"))

I was run with 'world'!


0

In [24]:
# The instance is handed over as the generator callable. No message argument is
# supplied, which is why the default 'Hello' shows up in the output below.
generator_instance = MyGenerator(10**4)
ds_from_generator_class = tf.data.Dataset.from_generator(
    generator_instance,
    output_types=tf.int32,
)

tfds.benchmark(ds_from_generator_class)

I was run with 'Hello'!


1it [00:00, ?it/s]


************ Summary ************

Examples/sec (First included) 3714.20 ex/sec (total: 10000 ex, 2.69 sec)
Examples/sec (First only) 84.86 ex/sec (total: 1 ex, 0.01 sec)
Examples/sec (First excluded) 3730.15 ex/sec (total: 9999 ex, 2.68 sec)


Unnamed: 0,duration,num_examples,avg
first+lasts,2.692373,10000,3714.195126
first,0.011784,1,84.859827
lasts,2.680589,9999,3730.150048


## Better Option: read Dataset asynchronously from Files
### Dump Dataframes into Dataset Files

In [25]:
# Directory (relative to the current working directory) that receives the dumped dataset files.
DUMP_PATH_PREFIX = 'test'

In [26]:
# Start from a clean slate. ignore_errors=True keeps a fresh run (directory not
# created yet) quiet, where `rm -r` would report a missing directory. Being
# plain Python, this also no longer depends on IPython's automagic shell aliases.
shutil.rmtree(DUMP_PATH_PREFIX, ignore_errors=True)

In [27]:
# Create the dump directory. exist_ok=True makes the cell safe to re-run even
# when the directory is already there.
os.makedirs(DUMP_PATH_PREFIX, exist_ok=True)

In [28]:
# Show the directory content before dumping (expected: an empty list).
sorted(os.listdir(DUMP_PATH_PREFIX))

In [29]:
# 50,000 consecutive integers in a single column, wrapped as a row-wise dataset.
large_df_for_dumping = pd.DataFrame(data=range(50_000))
large_ds_for_dumping = tf.data.Dataset.from_tensor_slices(large_df_for_dumping)

In [30]:
# A dataset built from in-memory slices knows its length (one element per row).
len(large_ds_for_dumping)

50000

In [31]:
# Inspect element shape and dtype; they are needed again when loading the files.
print(large_ds_for_dumping)

<TensorSliceDataset shapes: (1,), types: tf.int64>


In [32]:
dump_paths = []

# Write the same dataset four times (0.dataset .. 3.dataset) so that several
# GZIP-compressed files are available for the loading examples.
for i in range(4):
    dump_path = os.path.join(DUMP_PATH_PREFIX, f"{i}.dataset")
    tf.data.experimental.save(large_ds_for_dumping, dump_path, compression='GZIP')
    print(f"stored to '{dump_path}'")
    dump_paths += [dump_path]

stored to 'test/0.dataset'
stored to 'test/1.dataset'
stored to 'test/2.dataset'
stored to 'test/3.dataset'


In [33]:
# Disk usage of every dumped dataset; the `*` is expanded by the shell that
# os.popen starts. The pipe is opened in a `with` block so that it is closed
# again once the output has been read (the original left it open).
du_command = f"du -h --summarize {os.path.join(DUMP_PATH_PREFIX, '*')}"
with os.popen(du_command) as du_pipe:
    print(du_pipe.read())

2.5K	test/0.dataset
2.5K	test/1.dataset
2.5K	test/2.dataset
2.5K	test/3.dataset



### Load Datasets

In [34]:
# A dataset whose elements are the dump paths themselves (scalar strings);
# the file contents are only read in the next step.
dumped_files_ds = tf.data.Dataset.from_tensor_slices(dump_paths)
print(dumped_files_ds)
print_ds(dumped_files_ds)

<TensorSliceDataset shapes: (), types: tf.string>
[b'test/0.dataset', b'test/1.dataset', b'test/2.dataset', b'test/3.dataset']


In [35]:
# Structure of one stored element as passed to the loader: a 1-tuple holding an
# int64 tensor of shape (1,).
element_spec = (tf.TensorSpec(shape=(1,), dtype=tf.int64),)
element_spec

(TensorSpec(shape=(1,), dtype=tf.int64, name=None),)

In [36]:
# Map every file path to the dataset stored at that path and interleave the
# per-file datasets into a single one. The compression setting mirrors the one
# used when the files were written ('GZIP').
loaded_files_ds = dumped_files_ds.interleave(lambda path:
    tf.data.experimental.load(
        path=path,
        element_spec=element_spec,
        compression='GZIP'
    )
)
print(loaded_files_ds)

<InterleaveDataset shapes: ((1,),), types: (tf.int64,)>


In [37]:
# Unlike the in-memory dataset, the interleaved dataset cannot report its
# length: len() raises a TypeError, whose message is printed here.
try:
    len(loaded_files_ds)
except TypeError as e:
    print(e)

dataset length is unknown.


In [38]:
# The first 12 elements show one element taken from each of the four files in
# turn (round-robin): four 0s, then four 1s, then four 2s.
print_ds(loaded_files_ds.take(12))

[(array([0]),), (array([0]),), (array([0]),), (array([0]),), (array([1]),), (array([1]),), (array([1]),), (array([1]),), (array([2]),), (array([2]),), (array([2]),), (array([2]),)]


In [39]:
# Throughput when reading all four files (4 x 50,000 = 200,000 elements in total).
tfds.benchmark(loaded_files_ds)

1it [00:00, ?it/s]


************ Summary ************

Examples/sec (First included) 13051.46 ex/sec (total: 200000 ex, 15.32 sec)
Examples/sec (First only) 73.85 ex/sec (total: 1 ex, 0.01 sec)
Examples/sec (First excluded) 13062.94 ex/sec (total: 199999 ex, 15.31 sec)


Unnamed: 0,duration,num_examples,avg
first+lasts,15.323951,200000,13051.464704
first,0.013541,1,73.852446
lasts,15.31041,199999,13062.942092


## Reading from Files is 3-4x faster than reading from a Generator

In [40]:
from typing import List

def load_ds_from_files(file_paths: List[str]) -> tf.data.Dataset:
    """Build one dataset that interleaves the datasets stored at ``file_paths``.

    Every path is loaded as a GZIP-compressed dataset using the notebook-level
    ``element_spec``.

    Args:
        file_paths: Locations previously written with ``tf.data.experimental.save``.

    Returns:
        The interleaved dataset over all files.
    """
    def load_single_file(path):
        # `element_spec` is the module-level spec defined earlier in the notebook.
        return tf.data.experimental.load(
            path=path,
            element_spec=element_spec,
            compression='GZIP'
        )

    paths_ds = tf.data.Dataset.from_tensor_slices(file_paths)
    return paths_ds.interleave(load_single_file)

In [41]:
# Shuffle individual elements across all files, then emit full batches of 128
# (the incomplete last batch is dropped). Each benchmarked "example" is a batch.
shuffled_batches = (
    load_ds_from_files(dump_paths)
    .shuffle(buffer_size=10**5)
    .batch(128, drop_remainder=True)
)
tfds.benchmark(shuffled_batches)

1it [00:00, ?it/s]


************ Summary ************

Examples/sec (First included) 1758.26 ex/sec (total: 1562 ex, 0.89 sec)
Examples/sec (First only) 3.19 ex/sec (total: 1 ex, 0.31 sec)
Examples/sec (First excluded) 2716.44 ex/sec (total: 1561 ex, 0.57 sec)


Unnamed: 0,duration,num_examples,avg
first+lasts,0.888376,1562,1758.263737
first,0.313727,1,3.18748
lasts,0.574649,1561,2716.441069


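## >10x Speed Improvement by using Batching

Each benchmarked "example" above is a batch of 128 elements, so ≈1,758 batches/sec corresponds to ≈225,000 elements/sec, compared with ≈13,000 elements/sec for the unbatched pipeline.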

In [42]:
# Same pipeline with prefetching appended, so upcoming batches can be prepared
# while the current one is consumed.
# NOTE(review): the shuffle buffer here is 10**4, whereas the pipeline without
# prefetching used 10**5 - keep that in mind when comparing the two timings.
prefetched_batches = (
    load_ds_from_files(dump_paths)
    .shuffle(buffer_size=10**4)
    .batch(128, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
tfds.benchmark(prefetched_batches)

1it [00:00, ?it/s]


************ Summary ************

Examples/sec (First included) 2039.46 ex/sec (total: 1562 ex, 0.77 sec)
Examples/sec (First only) 18.82 ex/sec (total: 1 ex, 0.05 sec)
Examples/sec (First excluded) 2190.12 ex/sec (total: 1561 ex, 0.71 sec)


Unnamed: 0,duration,num_examples,avg
first+lasts,0.76589,1562,2039.456991
first,0.053143,1,18.817058
lasts,0.712747,1561,2190.118355


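## Another ~16% (first batch included) by using Prefetching

Caveat: in this run the steady-state rate (first batch excluded) is actually lower with prefetching (≈2,190 vs. ≈2,716 batches/sec), and the two pipelines use different shuffle buffer sizes (10**4 vs. 10**5), so treat the gain as indicative only.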