In [6]:
#import some libraries
import os
import cv2
import numpy as np
import tensorflow as tf
import keras
import keras.backend as K
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

  return f(*args, **kwds)


## The Data API

The whole Data API revolves around the concept of a *dataset* which represents a sequence of data items

In [2]:
X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

"""
Usually you will use datasets that gradually read from disk, but the dataset you saw above is created entirely in RAM.
The from_tensor_slices() function takes a tensor and creates a tf.data.Dataset whose elements are all the slices of X
"""

for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


## Chaining Transformations

In [3]:
dataset = dataset.repeat(3).batch(7, drop_remainder=True)
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)


**Notes**:

    1. repeat(n) method: it returns a new dataset that will repeat the items of the original dataset n times. You can call this method with no arguments, the new dataset will repeat the source dataset forever, so the code that iterates over the dataset will have to decide when to stop

    2. batch(n) method: it will group the items of the previous dataset in batches of n times, drop_reminder=True will be called if you want to drop the batch don't have the exact same size 

In [4]:
# Creating new dataset with map() method
dataset = dataset.map(lambda x: x * 2)
for item in dataset.take(3):
    print(item)

tf.Tensor([ 0  2  4  6  8 10 12], shape=(7,), dtype=int32)
tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)


In [5]:
# Creating new dataset with apply() method
""" 
The map() method applies a transformation to each item, the apply() method applies a transformation to the dataset as a whole
"""

dataset = dataset.apply(tf.data.experimental.unbatch())
for item in dataset.take(3):
    print(item)

Instructions for updating:
Use `tf.data.Dataset.unbatch()`.
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


## Shuffling the Data

Shuffle() method: it will create a new dataset that will start by filling up a buffer with the first items of the source dataset. Then, whenever it is asked for an item, it will pull one out randomly from the buffer and replace it with a fresh one from the source datatset. 

.The buffer_size must be specified, and it is important to make it large enough, or else shuffling will not be very effective

In [9]:
dataset = tf.data.Dataset.range(10)
dataset = dataset.shuffle(buffer_size=5, seed=42).repeat(3).batch(5)
for item in dataset:
    print(item)

tf.Tensor([0 2 3 6 7], shape=(5,), dtype=int64)
tf.Tensor([9 1 8 4 5], shape=(5,), dtype=int64)
tf.Tensor([3 5 2 1 8], shape=(5,), dtype=int64)
tf.Tensor([4 0 7 9 6], shape=(5,), dtype=int64)
tf.Tensor([2 1 3 5 8], shape=(5,), dtype=int64)
tf.Tensor([9 4 6 0 7], shape=(5,), dtype=int64)


## Interleaving lines from multiple files

In [4]:
#Load California housing dataset
(X_train_full, y_train_full), (X_test, y_test) = tf.keras.datasets.boston_housing.load_data()

In [5]:
# Scale the dataset to normalize the input values
scaler = StandardScaler()
scaler.fit(X_train_full)
X_train_new = scaler.transform(X_train_full)
X_test_new = scaler.transform(X_test)
print(X_train_new[0:10])

[[-0.27224633 -0.48361547 -0.43576161 -0.25683275 -0.1652266  -0.1764426
   0.81306188  0.1166983  -0.62624905 -0.59517003  1.14850044  0.44807713
   0.8252202 ]
 [-0.40342651  2.99178419 -1.33391162 -0.25683275 -1.21518188  1.89434613
  -1.91036058  1.24758524 -0.85646254 -0.34843254 -1.71818909  0.43190599
  -1.32920239]
 [ 0.1249402  -0.48361547  1.0283258  -0.25683275  0.62864202 -1.82968811
   1.11048828 -1.18743907  1.67588577  1.5652875   0.78447637  0.22061726
  -1.30850006]
 [-0.40149354 -0.48361547 -0.86940196 -0.25683275 -0.3615597  -0.3245576
  -1.23667187  1.10717989 -0.51114231 -1.094663    0.78447637  0.44807713
  -0.65292624]
 [-0.0056343  -0.48361547  1.0283258  -0.25683275  1.32861221  0.15364225
   0.69480801 -0.57857203  1.67588577  1.5652875   0.78447637  0.3898823
   0.26349695]
 [-0.37502238 -0.48361547 -0.54747912 -0.25683275 -0.54935658 -0.78865126
   0.18954148  0.48371503 -0.51114231 -0.71552978  0.51145832  0.38669063
  -0.13812828]
 [ 0.58963463 -0.48361547

In [7]:
#Split the dataset
X_train, X_valid, y_train, y_valid = train_test_split(X_train_new, y_train_full, test_size=0.2, random_state=42)

In [11]:
#Save dataset to multiple files
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    housing_dir = os.path.join('datasets', "boston")
    os.makedirs(housing_dir, exist_ok=True)
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")

    filepaths = []
    m = len(data)
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding='utf-8') as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join([repr(col) for col in data[row_idx]]))
                f.write("\n")
    return filepaths

train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]

train_filepaths = save_to_multiple_csv_files(train_data, 'train', None, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, 'valid', None, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, 'test', None, n_parts=10)
print('\nDone writing files. Traning file paths:', train_filepaths)


Done writing files. Traning file paths: ['datasets\\boston\\my_train_00.csv', 'datasets\\boston\\my_train_01.csv', 'datasets\\boston\\my_train_02.csv', 'datasets\\boston\\my_train_03.csv', 'datasets\\boston\\my_train_04.csv', 'datasets\\boston\\my_train_05.csv', 'datasets\\boston\\my_train_06.csv', 'datasets\\boston\\my_train_07.csv', 'datasets\\boston\\my_train_08.csv', 'datasets\\boston\\my_train_09.csv', 'datasets\\boston\\my_train_10.csv', 'datasets\\boston\\my_train_11.csv', 'datasets\\boston\\my_train_12.csv', 'datasets\\boston\\my_train_13.csv', 'datasets\\boston\\my_train_14.csv', 'datasets\\boston\\my_train_15.csv', 'datasets\\boston\\my_train_16.csv', 'datasets\\boston\\my_train_17.csv', 'datasets\\boston\\my_train_18.csv', 'datasets\\boston\\my_train_19.csv']


In [12]:
#Create a dataset containing only these filepaths above:
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

"""
By default, the list_files() function returns a dataset that shuffles the filepaths
"""

In [15]:
n_files_in_1_read = 5
dataset = filepath_dataset.interleave(lambda filepath: tf.data.TextLineDataset(filepath).skip(1), cycle_length=n_files_in_1_read)

"""
The interleave() function takes a function that returns a dataset, and applies it to each filepath in the dataset.
The function is applied to each filepath in the dataset, and the resulting datasets are interleaved in the order of the filepaths.
By default, interleave() does not use parallelism; it just reads one line at a time from each file, squentially.
"""

for line in dataset.take(5):
    print(line.numpy())


b'-0.3919630095351413,0.7801662253156655,-0.9076211063403838,-0.2568327484687563,-1.1042109960227324,0.17762277400333037,-2.193453178811165,1.625106473803,-0.3960355701527182,-0.6372959438833558,-0.8536319333458502,0.21199974653499268,-1.0421300691886186,23.7'
b'0.34856843562344747,-0.4836154708652843,1.0283257954396188,-0.2568327484687563,1.2176413250911147,-0.7815981666935935,1.0029847632968047,-0.8944963977369444,1.6758857724016463,1.5652874992218142,0.7844763709927688,0.42179902963935195,0.6030151782242079,15.1'
b'0.11561228941989592,-0.4836154708652843,1.0283257954396188,-0.2568327484687563,1.3286122080855265,0.6120934353778194,0.7521432207546996,-0.5635796768907313,1.6758857724016463,1.5652874992218142,0.7844763709927688,-1.0591370863914173,0.5077844550098752,16.4'
b'0.4670650820425747,-0.4836154708652843,1.0283257954396188,-0.2568327484687563,0.2274395999102092,-1.185035209398477,0.9456495535728953,-0.646284198628302,1.6758857724016463,1.5652874992218142,0.7844763709927688,-0.02

## Preprocessing the Data

In [16]:
#Preprocess Data
n_inputs = X_train.shape[-1]
@tf.function
def preprocess(line):
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    return x , y

""" 
tf.io.decode_csv() function which takes two arguments:
    1. The line to parse
    2. An array containing the default value for each column in the CSV file. This array tells TensorFlow not inly the default value for each column, but also the number of columns and their types.

The decode__csv() function returns a list of scalar tensors, one for each column in the CSV file., but we need to return 1D tensor arrays.
    --> Call tf.stack() on all tensor except the last one (y): this will stack these tensors into a 1D array
"""    

In [17]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5, n_read_threads=None, shuffle_buffer_size=10000, n_parse_threads=5, batch_size=5):
    #create a dataset of filepaths
    dataset = tf.data.Dataset.list_files(filepaths)
    #Create a datasets with shuffled filepaths
    dataset  = dataset.interleave(lambda filepath: tf.data.TextLineDataset(filepath).skip(1), cycle_length=n_readers, num_parallel_calls=n_read_threads)
    #Preprocess the data
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    #shuffle the data
    dataset = dataset.shuffle(shuffle_buffer_size).repeat(repeat)
    #batch the data
    dataset = dataset.batch(batch_size)
    return dataset.prefetch(1)

## Prefetching

While our training algorithm is working on one batch, the dataset will already be working in parllel on getting the next bacth ready --> can improve performance dramatically.

If the dataset is small enough to fit in memory --> using the dataset's cache() method to cache its content to RAM, do this:

**After** loading and preprocessing the data
**Before** shuffling, repeating, batching and prefetching