# TensorFlow crash course
### **PART 8**

## The Data API
### This API is used to read and preprocess large data efficiently

### Data slicing

In [1]:
import tensorflow as tf
# Build a rank-1 tensor holding the integers 0..9.
X = tf.range(10)
# from_tensor_slices() slices X along its first dimension, so the dataset
# yields one scalar tensor per element of X (10 items here).
dataset = tf.data.Dataset.from_tensor_slices(X)
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


### Chaining transformations

In [2]:
# repeat(3) yields the dataset's elements three times in a row.
repeated = dataset.repeat(3)
# batch(7) groups consecutive items into tensors of 7 items; the final batch
# is smaller when the item count is not a multiple of 7.
# NOTE: this rebinds `dataset`, so re-running this cell on its own would
# repeat and batch the already-batched dataset.
dataset = repeated.batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


### Custom transformation
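#### Note: If your function performs intensive computations, set `num_parallel_calls` in `map()` to the number of elements to process in parallel (or to `tf.data.AUTOTUNE` to let TensorFlow choose)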

#### Mapping

In [3]:
# map() applies the function to every element of the dataset, much like
# Python's built-in map(). `dataset` is batched at this point, so each
# batch tensor is squared element-wise.
dataset = dataset.map(lambda x: x ** 2)

#### Applying

In [4]:
# apply() differs from map(): the function receives the whole dataset and
# must return a new dataset, instead of being called once per element.
# Dataset.unbatch() is used here because tf.data.experimental.unbatch() is
# deprecated; it splits each batch back into its individual items.
dataset = dataset.apply(lambda ds: ds.unbatch())

Instructions for updating:
Use `tf.data.Dataset.unbatch()`.


#### Filtering

In [5]:
# filter() keeps only the elements for which the predicate is true, much like
# Python's built-in filter(); here, only values below 10 are kept.
dataset = dataset.filter(lambda x: x < 10)
for item in dataset.take(4):
    # take(4) limits iteration to the first 4 items, handy for a quick look.
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


### Data shuffling

In [6]:
# buffer_size is the number of elements held in the shuffle buffer (not an
# amount of RAM): items are drawn at random from this buffer, which is
# refilled from the source. For a uniform shuffle it should be at least as
# large as the dataset. seed=42 is passed so the order is reproducible.
dataset = dataset.shuffle(buffer_size=1000, seed=42).batch(7) 
for item in dataset:
    print(item)
# Note: by default a shuffled dataset is reshuffled on every iteration, so
# calling repeat() after shuffle() yields a new order for each repetition.
# Pass reshuffle_each_iteration=False to shuffle() to keep the same order.

tf.Tensor([1 1 0 4 1 9 4], shape=(7,), dtype=int32)
tf.Tensor([0 4 9 9 0], shape=(5,), dtype=int32)
