In [1]:
#import some libraries
import os
import cv2
import numpy as np
import tensorflow as tf
import keras
import keras.backend as K
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## The Data API

The whole Data API revolves around the concept of a *dataset* which represents a sequence of data items

In [2]:
X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

"""
Usually you will use datasets that gradually read from disk, but the dataset you saw above is created entirely in RAM.
The from_tensor_slices() function takes a tensor and creates a tf.data.Dataset whose elements are all the slices of X
"""

for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


## Chaining Transformations

In [3]:
dataset = dataset.repeat(3).batch(7, drop_remainder=True)
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)


**Notes**:

    1. repeat(n) method: it returns a new dataset that will repeat the items of the original dataset n times. You can call this method with no arguments, the new dataset will repeat the source dataset forever, so the code that iterates over the dataset will have to decide when to stop

    2. batch(n) method: it will group the items of the previous dataset in batches of n times, drop_reminder=True will be called if you want to drop the batch don't have the exact same size 

In [4]:
# Creating new dataset with map() method
dataset = dataset.map(lambda x: x * 2)
for item in dataset.take(3):
    print(item)

tf.Tensor([ 0  2  4  6  8 10 12], shape=(7,), dtype=int32)
tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)


In [5]:
# Creating new dataset with apply() method
""" 
The map() method applies a transformation to each item, the apply() method applies a transformation to the dataset as a whole
"""

dataset = dataset.apply(tf.data.experimental.unbatch())
for item in dataset.take(3):
    print(item)

Instructions for updating:
Use `tf.data.Dataset.unbatch()`.
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


## Shuffling the Data

Shuffle() method: it will create a new dataset that will start by filling up a buffer with the first items of the source dataset. Then, whenever it is asked for an item, it will pull one out randomly from the buffer and replace it with a fresh one from the source datatset. 

.The buffer_size must be specified, and it is important to make it large enough, or else shuffling will not be very effective

In [9]:
dataset = tf.data.Dataset.range(10)
dataset = dataset.shuffle(buffer_size=5, seed=42).repeat(3).batch(5)
for item in dataset:
    print(item)

tf.Tensor([0 2 3 6 7], shape=(5,), dtype=int64)
tf.Tensor([9 1 8 4 5], shape=(5,), dtype=int64)
tf.Tensor([3 5 2 1 8], shape=(5,), dtype=int64)
tf.Tensor([4 0 7 9 6], shape=(5,), dtype=int64)
tf.Tensor([2 1 3 5 8], shape=(5,), dtype=int64)
tf.Tensor([9 4 6 0 7], shape=(5,), dtype=int64)


## Interleaving lines from multiple files

In [4]:
#Load California housing dataset
(X_train_full, y_train_full), (X_test, y_test) = tf.keras.datasets.boston_housing.load_data()

In [5]:
# Scale the dataset to normalize the input values
scaler = StandardScaler()
scaler.fit(X_train_full)
X_train_new = scaler.transform(X_train_full)
X_test_new = scaler.transform(X_test)
print(X_train_new[0:10])

[[-0.27224633 -0.48361547 -0.43576161 -0.25683275 -0.1652266  -0.1764426
   0.81306188  0.1166983  -0.62624905 -0.59517003  1.14850044  0.44807713
   0.8252202 ]
 [-0.40342651  2.99178419 -1.33391162 -0.25683275 -1.21518188  1.89434613
  -1.91036058  1.24758524 -0.85646254 -0.34843254 -1.71818909  0.43190599
  -1.32920239]
 [ 0.1249402  -0.48361547  1.0283258  -0.25683275  0.62864202 -1.82968811
   1.11048828 -1.18743907  1.67588577  1.5652875   0.78447637  0.22061726
  -1.30850006]
 [-0.40149354 -0.48361547 -0.86940196 -0.25683275 -0.3615597  -0.3245576
  -1.23667187  1.10717989 -0.51114231 -1.094663    0.78447637  0.44807713
  -0.65292624]
 [-0.0056343  -0.48361547  1.0283258  -0.25683275  1.32861221  0.15364225
   0.69480801 -0.57857203  1.67588577  1.5652875   0.78447637  0.3898823
   0.26349695]
 [-0.37502238 -0.48361547 -0.54747912 -0.25683275 -0.54935658 -0.78865126
   0.18954148  0.48371503 -0.51114231 -0.71552978  0.51145832  0.38669063
  -0.13812828]
 [ 0.58963463 -0.48361547

In [7]:
#Split the dataset
X_train, X_valid, y_train, y_valid = train_test_split(X_train_new, y_train_full, test_size=0.2, random_state=42)

In [11]:
#Save dataset to multiple files
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    housing_dir = os.path.join('datasets', "boston")
    os.makedirs(housing_dir, exist_ok=True)
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")

    filepaths = []
    m = len(data)
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding='utf-8') as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join([repr(col) for col in data[row_idx]]))
                f.write("\n")
    return filepaths

train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]

train_filepaths = save_to_multiple_csv_files(train_data, 'train', None, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, 'valid', None, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, 'test', None, n_parts=10)
print('\nDone writing files. Traning file paths:', train_filepaths)


Done writing files. Traning file paths: ['datasets\\boston\\my_train_00.csv', 'datasets\\boston\\my_train_01.csv', 'datasets\\boston\\my_train_02.csv', 'datasets\\boston\\my_train_03.csv', 'datasets\\boston\\my_train_04.csv', 'datasets\\boston\\my_train_05.csv', 'datasets\\boston\\my_train_06.csv', 'datasets\\boston\\my_train_07.csv', 'datasets\\boston\\my_train_08.csv', 'datasets\\boston\\my_train_09.csv', 'datasets\\boston\\my_train_10.csv', 'datasets\\boston\\my_train_11.csv', 'datasets\\boston\\my_train_12.csv', 'datasets\\boston\\my_train_13.csv', 'datasets\\boston\\my_train_14.csv', 'datasets\\boston\\my_train_15.csv', 'datasets\\boston\\my_train_16.csv', 'datasets\\boston\\my_train_17.csv', 'datasets\\boston\\my_train_18.csv', 'datasets\\boston\\my_train_19.csv']


In [12]:
#Create a dataset containing only these filepaths above:
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

"""
By default, the list_files() function returns a dataset that shuffles the filepaths
"""

In [15]:
n_files_in_1_read = 5
dataset = filepath_dataset.interleave(lambda filepath: tf.data.TextLineDataset(filepath).skip(1), cycle_length=n_files_in_1_read)

"""
The interleave() function takes a function that returns a dataset, and applies it to each filepath in the dataset.
The function is applied to each filepath in the dataset, and the resulting datasets are interleaved in the order of the filepaths.
By default, interleave() does not use parallelism; it just reads one line at a time from each file, squentially.
"""

for line in dataset.take(5):
    print(line.numpy())


b'-0.3919630095351413,0.7801662253156655,-0.9076211063403838,-0.2568327484687563,-1.1042109960227324,0.17762277400333037,-2.193453178811165,1.625106473803,-0.3960355701527182,-0.6372959438833558,-0.8536319333458502,0.21199974653499268,-1.0421300691886186,23.7'
b'0.34856843562344747,-0.4836154708652843,1.0283257954396188,-0.2568327484687563,1.2176413250911147,-0.7815981666935935,1.0029847632968047,-0.8944963977369444,1.6758857724016463,1.5652874992218142,0.7844763709927688,0.42179902963935195,0.6030151782242079,15.1'
b'0.11561228941989592,-0.4836154708652843,1.0283257954396188,-0.2568327484687563,1.3286122080855265,0.6120934353778194,0.7521432207546996,-0.5635796768907313,1.6758857724016463,1.5652874992218142,0.7844763709927688,-1.0591370863914173,0.5077844550098752,16.4'
b'0.4670650820425747,-0.4836154708652843,1.0283257954396188,-0.2568327484687563,0.2274395999102092,-1.185035209398477,0.9456495535728953,-0.646284198628302,1.6758857724016463,1.5652874992218142,0.7844763709927688,-0.02

## Preprocessing the Data

In [16]:
#Preprocess Data
n_inputs = X_train.shape[-1]
@tf.function
def preprocess(line):
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    return x , y

""" 
tf.io.decode_csv() function which takes two arguments:
    1. The line to parse
    2. An array containing the default value for each column in the CSV file. This array tells TensorFlow not inly the default value for each column, but also the number of columns and their types.

The decode__csv() function returns a list of scalar tensors, one for each column in the CSV file., but we need to return 1D tensor arrays.
    --> Call tf.stack() on all tensor except the last one (y): this will stack these tensors into a 1D array
"""    

In [17]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5, n_read_threads=None, shuffle_buffer_size=10000, n_parse_threads=5, batch_size=5):
    #create a dataset of filepaths
    dataset = tf.data.Dataset.list_files(filepaths)
    #Create a datasets with shuffled filepaths
    dataset  = dataset.interleave(lambda filepath: tf.data.TextLineDataset(filepath).skip(1), cycle_length=n_readers, num_parallel_calls=n_read_threads)
    #Preprocess the data
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    #shuffle the data
    dataset = dataset.shuffle(shuffle_buffer_size).repeat(repeat)
    #batch the data
    dataset = dataset.batch(batch_size)
    return dataset.prefetch(1)

## Prefetching

While our training algorithm is working on one batch, the dataset will already be working in parallel on getting the next bacth ready --> can improve performance dramatically.

If the dataset is small enough to fit in memory --> using the dataset's cache() method to cache its content to RAM, do this:

**After** loading and preprocessing the data

**Before** shuffling, repeating, batching and prefetching

## The TFRecord Format

    1. It is TensorFlow's preferred format for storing large amounts of data and reading it efficiently.

    2. A very simple binar format that just contains a sequence of binary records of varying sizes

In [2]:
# Create a TFRecord file using the tf.io.TFRecordWriter() class

with tf.io.TFRecordWriter('my_data.tfrecord') as f:
    f.write(b'This is the first record')
    f.write(b'This is the second record')

filepaths = ['my_data.tfrecord']
dataset = tf.data.TFRecordDataset(filepaths)
for item in dataset:
    print(item)



tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'This is the second record', shape=(), dtype=string)


## Compressed TFRecord Files

In [4]:
## Create a compressed TFRecord file
options = tf.io.TFRecordOptions(compression_type='GZIP')
with tf.io.TFRecordWriter('my_compressed.tfrecord', options=options) as f:
    f.write(b'This is the first record')
    f.write(b'This is the second record')

# Read a compressed TFRecord file:
dataset = tf.data.TFRecordDataset(['my_compressed.tfrecord'], compression_type='GZIP')

## A brief introduction to Protocol Buffers

TFRecord files usually contain serialized protocol buffers (bộ đệm giao thức được tuần tự hóa) (also called *protobufs*) --> This is a protable, extensible, and efficient binary format developed at Google.

## TensorFlow Protobufs

The main protobuf typically used in a TFRecord file is the Example protobuf, whihch represents one instance in a dataset --> contains a list of named features, where each featurecan either be a list of byte strings, a list of floats, or a list of integers

In [14]:
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Feature, Features, Example

# Create example protobuf
person_example = Example(
    features=Features(
        feature={
            'name': Feature(bytes_list=BytesList(value=[b'Alice'])),
            'age': Feature(int64_list=Int64List(value=[25])),
            'height': Feature(float_list=FloatList(value=[1.75])),
            'weight': Feature(float_list=FloatList(value=[65.4]))
        }
    )
)

# Serialize the example protobuf using SerializeToString() method
with tf.io.TFRecordWriter('my_contact.tfrecord') as f:
    f.write(person_example.SerializeToString())

filepaths = ['my_contact.tfrecord']
dataset = tf.data.TFRecordDataset(filepaths)
for item in dataset:
    print(item)

tf.Tensor(b'\nI\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01\x19\n\x12\n\x06weight\x12\x08\x12\x06\n\x04\xcd\xcc\x82B\n\x11\n\x04name\x12\t\n\x07\n\x05Alice\n\x12\n\x06height\x12\x08\x12\x06\n\x04\x00\x00\xe0?', shape=(), dtype=string)


## Loading and Parsing serialized protobufs\

To load the serialized Example protobufs above, we will use a tf.data.TFRecordDataset and parse each Example using tf.io.parse_single_example()

tf.io.parse_string_example() requires at least to arguments:

    1. A string scalar tensor containing the serialized data
    
    2. A description of eaxh feature

In [18]:
feature_description = {
    'name': tf.io.FixedLenFeature([], tf.string),
    'age': tf.io.FixedLenFeature([], tf.int64),
    'height': tf.io.FixedLenFeature([], tf.float32),
    'weight': tf.io.FixedLenFeature([], tf.float32)
}

"""
A tf.io.FixedLenFeature() function takes two arguments:
    1. The length of the feature (the feature's shape)
    2. The type of the feature
"""

for serialized_example in tf.data.TFRecordDataset(filepaths):
    parsed_example = tf.io.parse_single_example(serialized_example, feature_description)

print(parsed_example['name'].numpy())

b'Alice'


## Handling lists of lists using the SequenceExample Protobuf

In [19]:
FeatureList = tf.train.FeatureList
FeatureLists = tf.train.FeatureLists
SequenceExample = tf.train.SequenceExample

context = Features(feature={
    'author_id': Feature(int64_list=Int64List(value=[123])),
    'title': Feature(bytes_list=BytesList(value=[b"A", b"desert", b"place", b"."])),
    'pub_date': Feature(int64_list=Int64List(value=[1623, 12, 25]))

})
content = [["When", "shall", "we", "three", "meet", "again", "?"],
           ["In", "thunder", ",", "lightning", ",", "or", "in", "rain", "?"]]
comments = [["When", "the", "hurlyburly", "'s", "done", "."],
            ["When", "the", "battle", "'s", "lost", "and", "won", "."]]

def words_to_feature(words):
    return Feature(bytes_list=BytesList(value=[word.encode('utf-8') for word in words]))

content_features = [words_to_feature(sentence) for sentence in content]
comments_features = [words_to_feature(comment)  for comment in comments]

sequence_example = SequenceExample(
    context = context,
    feature_lists=FeatureLists(feature_list={
        'content': FeatureList(feature=content_features),
        'comments': FeatureList(feature=comments_features)
    })
)


In [20]:
sequence_example

context {
  feature {
    key: "author_id"
    value {
      int64_list {
        value: 123
      }
    }
  }
  feature {
    key: "pub_date"
    value {
      int64_list {
        value: 1623
        value: 12
        value: 25
      }
    }
  }
  feature {
    key: "title"
    value {
      bytes_list {
        value: "A"
        value: "desert"
        value: "place"
        value: "."
      }
    }
  }
}
feature_lists {
  feature_list {
    key: "comments"
    value {
      feature {
        bytes_list {
          value: "When"
          value: "the"
          value: "hurlyburly"
          value: "\'s"
          value: "done"
          value: "."
        }
      }
      feature {
        bytes_list {
          value: "When"
          value: "the"
          value: "battle"
          value: "\'s"
          value: "lost"
          value: "and"
          value: "won"
          value: "."
        }
      }
    }
  }
  feature_list {
    key: "content"
    value {
      feature {
      

In [21]:
serialized_sequence_example = sequence_example.SerializeToString()

In [22]:
context_feature_descriptions = {
    "author_id": tf.io.FixedLenFeature([], tf.int64, default_value=0),
    "title": tf.io.VarLenFeature(tf.string),
    "pub_date": tf.io.FixedLenFeature([3], tf.int64, default_value=[0, 0, 0]),
}

sequence_feature_descriptions = {
    "content": tf.io.VarLenFeature(tf.string),
    "comments": tf.io.VarLenFeature(tf.string),
}

parsed_context, parsed_feature_lists = tf.io.parse_single_sequence_example(
    serialized_sequence_example, context_feature_descriptions,
    sequence_feature_descriptions
)

In [23]:
parsed_context

{'title': <tensorflow.python.framework.sparse_tensor.SparseTensor at 0x1f88d53c2c8>,
 'author_id': <tf.Tensor: shape=(), dtype=int64, numpy=123>,
 'pub_date': <tf.Tensor: shape=(3,), dtype=int64, numpy=array([1623,   12,   25], dtype=int64)>}

In [24]:
parsed_context['title'].values

<tf.Tensor: shape=(4,), dtype=string, numpy=array([b'A', b'desert', b'place', b'.'], dtype=object)>

In [25]:
parsed_feature_lists

{'comments': <tensorflow.python.framework.sparse_tensor.SparseTensor at 0x1f88d53cbc8>,
 'content': <tensorflow.python.framework.sparse_tensor.SparseTensor at 0x1f88d53c288>}

In [26]:
tf.RaggedTensor.from_sparse(parsed_feature_lists['comments'])

<tf.RaggedTensor [[b'When', b'the', b'hurlyburly', b"'s", b'done', b'.'], [b'When', b'the', b'battle', b"'s", b'lost', b'and', b'won', b'.']]>

## Preprocessing the Input Features

If your data contains categorical features or text features, they need to be convert to numbers. This can be done ahead of time when preparing your data files, using any tool you like (e.g., Numpy, pandas, or Scikit-Learn)

You can preprocess your data on the fly when loading it with the Data API (e.g. using map() method)

## Encoding Categorical Features Using One-Hot vectors

In [32]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42
)

X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, random_state=42)

scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_


In [34]:
# I will use the California housing dataset to demonstrate, since it contains categorical features and missing values:

import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join('datasets', 'housing')
HOUSING_URL = DOWNLOAD_ROOT + 'datasets/housing/housing.tgz'

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, 'housing.tgz')
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()



In [35]:
fetch_housing_data()

In [36]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)


In [37]:
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [40]:
#Encode categorical feature before we feed it to a neural network
#Since there are very few categories, we ca use one-hot encoding

vocab = ['<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND']
indices = tf.range(len(vocab), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)
num_oov_buckets = 2
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)


In the last two lines we create the lookup table, giving it the initializer and specifying the number of *out-of-vocabulary* (oov) buckets --> if we look up a category that does not exist in the vocabulary, the lookup table, will compute a hash of this category and use it to assign the unknown category to one of the oov buckets.

### Why use oov buckets? 
if the number of categories is large(e.g zip codes, cities, words, products,...) and the dataset is large as well, or it keep chaning, the getting the full list of categories may not be convenient.

-->> You should define the vocabulary based on a data sample (rather than the whole training set) and add some oov buckets for other categories that were not in the data sample. The more unknown categories you expect to find during training, the more oov buckets you should use.

In [41]:
categories = tf.constant(['NEAR BAY', 'DESERT', 'INLAND', 'INLAND'])
cat_indices = table.lookup(categories)
cat_indices


<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1], dtype=int64)>

The unknown category is "DESERT" was mapped to one of the two oov buckets (at index 5)

In [42]:
cat_one_hot = tf.one_hot(cat_indices, depth=len(vocab) + num_oov_buckets)
cat_one_hot

<tf.Tensor: shape=(4, 7), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.]], dtype=float32)>

As a rule of thumb, if the number of categories is lower than 10, then one-hot-encoding is generally the way to go. If the number of categories is grater than 50, the embeddings are usually preferable

## Encoding Categorical Features using Embeddings

An embedding is trainable dense vector that represents a category. By default, embedding are initialized randomly

In [43]:
# Create an ebedding matrix
embedding_dim = 2 
embed_init = tf.random.uniform([len(vocab) + num_oov_buckets, embedding_dim])
embedding_matrix = tf.Variable(embed_init)
embedding_matrix

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.9651364 , 0.6970465 ],
       [0.9913566 , 0.4567325 ],
       [0.428887  , 0.40931153],
       [0.6071029 , 0.48528087],
       [0.80757165, 0.44847035],
       [0.98547685, 0.2156725 ],
       [0.1528318 , 0.336241  ]], dtype=float32)>

In [44]:
categories = tf.constant(['NEAR BAY', 'DESERT', 'INLAND', 'INLAND'])
cat_indices = table.lookup(categories)
cat_indices

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1], dtype=int64)>

In [45]:
tf.nn.embedding_lookup(embedding_matrix, cat_indices)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[0.6071029 , 0.48528087],
       [0.98547685, 0.2156725 ],
       [0.9913566 , 0.4567325 ],
       [0.9913566 , 0.4567325 ]], dtype=float32)>

The lookup table says that the 'INLAND' category is at index 1, so the tf.nn.embedding_lookup() function returns the embedding at row 1 in the embedding maxtrix

In [46]:
#Keras provides a keras.layers.Embedding layer that handles the embedding matrix
embedding = keras.layers.Embedding(input_dim=len(vocab) + num_oov_buckets, output_dim=embedding_dim)
embedding(cat_indices)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-0.00730877,  0.0483962 ],
       [ 0.03553097,  0.02830509],
       [ 0.00728489, -0.02812772],
       [ 0.00728489, -0.02812772]], dtype=float32)>

## Keras Preprocessing Layers

 - keras.layers.Discretization layer: it will chop continuous data into different bins and encode each bin as one-hot vector.
    For example: you could use it to discretize prices into three categories (low, medium, high) which would be encoded as [1, 0, 0], [0, 1, 0], [0, 0, 1]

- The Discretization layer will not be differentiable, and it should be used at the start of your model

- The TextVectorization layer will also have an option to output word-count vector instead of word indices. For example, the vocabulary contains three words ['and', 'basketball' , 'more'] and the text "more and more" will mapped to the vector [1, 0 , 2] --> ['and' * 1, 'basketball' * 0, 'more' * 2] --> this text representation is called *bag_of_words*

- The word counts should be normalized in a way that reduces the importance of frequent words --> do this is to devide each word count by **the log of total number of training instances** in which the word appears --> a.k.a *Term-Frequency x Inverse-Document-Frequency* technique (TF-IDF)

## Exercise

1. Why do we use Data API?

    - Bcuz it offers many features, including loading data from various sources (such as text or binary files), reading data in parallel from multiple sources, transforming it, interleaving the records, shuffling the data, batching it, and prefetching it

2. What are the benefits of splitting a large dataset into multiple files?

    - Makes it possible to shuffle it at a coarse level before shuffling it at a finer level using a shuffling buffer.

    - Handle huge datasets that do not fit on a single machine.

    - Easy to manipulate thousands of small files rather than one huge files

    - If the data is split across multiple files spread across multiple servers, it is possoble to download several files from different servers simultaneously, which improves the bandwidth usage.

3. During training, how can you tell that your input pipeline is the bottleneck? What can you do to fix it?

    - I can use TensorBoard to visualize profiling data: if the GPU is not fully utilized then your input pipeline is likely to be the bottleneck --> fix it by making sure it read and preprocesses the data in multiple threads in parallel, and esuring it prefetches a few batches. 