# Loading and Preprocessing Data with TensorFlow

In [3]:
import tensorflow as tf

## The tf.data API

In [4]:
X = tf.range(36)
X = tf.reshape(X, (6, 6))

In [5]:
X

<tf.Tensor: shape=(6, 6), dtype=int32, numpy=
array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35]], dtype=int32)>

In [6]:
dataset = tf.data.Dataset.from_tensor_slices(X)

In [7]:
dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(6,), dtype=tf.int32, name=None)>

In [8]:
tf.data.Dataset.range(10)

<_RangeDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>

In [9]:
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5], shape=(6,), dtype=int32)
tf.Tensor([ 6  7  8  9 10 11], shape=(6,), dtype=int32)
tf.Tensor([12 13 14 15 16 17], shape=(6,), dtype=int32)
tf.Tensor([18 19 20 21 22 23], shape=(6,), dtype=int32)
tf.Tensor([24 25 26 27 28 29], shape=(6,), dtype=int32)
tf.Tensor([30 31 32 33 34 35], shape=(6,), dtype=int32)


In [10]:
X_nested = {'a': ([1, 2, 3], [4, 5, 6]), 'b': [7, 8, 9]}
nested_dataset = tf.data.Dataset.from_tensor_slices(X_nested)
for item in nested_dataset:
    print(item)

{'a': (<tf.Tensor: shape=(), dtype=int32, numpy=1>, <tf.Tensor: shape=(), dtype=int32, numpy=4>), 'b': <tf.Tensor: shape=(), dtype=int32, numpy=7>}
{'a': (<tf.Tensor: shape=(), dtype=int32, numpy=2>, <tf.Tensor: shape=(), dtype=int32, numpy=5>), 'b': <tf.Tensor: shape=(), dtype=int32, numpy=8>}
{'a': (<tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(), dtype=int32, numpy=6>), 'b': <tf.Tensor: shape=(), dtype=int32, numpy=9>}


## Chaining Transformations

In [11]:
dataset.repeat(2).batch(3)

<_BatchDataset element_spec=TensorSpec(shape=(None, 6), dtype=tf.int32, name=None)>

In [12]:
dataset = dataset.map(lambda x: x * 2)
for item in dataset:
    print(item)
    break

tf.Tensor([ 0  2  4  6  8 10], shape=(6,), dtype=int32)


In [13]:
dataset = dataset.filter(lambda x: tf.reduce_sum(x) < 50)
for item in dataset:
    print(item)

tf.Tensor([ 0  2  4  6  8 10], shape=(6,), dtype=int32)


## Shuffling the Data

In [14]:
dataset = tf.data.Dataset.range(10).repeat(2)
dataset = dataset.shuffle(buffer_size= 4, seed= 42).batch(7)
for item in dataset:
    print(item)

tf.Tensor([1 4 2 3 5 0 6], shape=(7,), dtype=int64)
tf.Tensor([9 8 2 0 3 1 4], shape=(7,), dtype=int64)
tf.Tensor([5 7 9 6 7 8], shape=(6,), dtype=int64)


### Interleaving Lines from Multiple Files

In [15]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

In [16]:
import numpy as np
from pathlib import Path

def save_to_csv_files(data, name_prefix, header=None, n_parts=10, output_dir=None):
    """Split ``data`` row-wise into ``n_parts`` CSV files and return their paths.

    Row indices are divided into contiguous, nearly equal chunks with
    ``np.array_split``; chunk ``i`` is written to
    ``<output_dir>/my_<name_prefix>_<i:02d>.csv``.

    Args:
        data: Indexable collection of rows (e.g. a 2-D NumPy array). Every
            value is written with ``str()`` and values are comma-separated.
        name_prefix: Label embedded in every file name (e.g. ``'train'``).
        header: Optional header line written at the top of every part,
            given without a trailing newline. ``None`` writes no header.
        n_parts: Number of files to create. When it exceeds ``len(data)``
            the surplus files hold only the header (or nothing at all).
        output_dir: Directory that receives the files; it is created when
            missing. Defaults to ``datasets/housing`` relative to the
            current working directory.

    Returns:
        The written file paths as strings, in part order.
    """
    if output_dir is None:
        output_dir = Path() / 'datasets' / 'housing'
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    filepaths = []
    # Split the row indices rather than the data itself so that any indexable
    # row container is accepted, not only NumPy arrays.
    chunks = np.array_split(np.arange(len(data)), n_parts)
    for file_idx, row_indices in enumerate(chunks):
        part_csv = target_dir / 'my_{}_{:02d}.csv'.format(name_prefix, file_idx)
        filepaths.append(str(part_csv))
        # An explicit encoding keeps the output independent of the platform's
        # default locale encoding.
        with open(part_csv, 'w', encoding='utf-8') as f:
            if header is not None:
                f.write(header + '\n')
            f.writelines(
                ','.join(str(col) for col in data[row_idx]) + '\n'
                for row_idx in row_indices)
    return filepaths

# Append the label column to each feature matrix so that every CSV row
# carries its own target value.
train_data, valid_data, test_data = (
    np.c_[features, labels]
    for features, labels in (
        (X_train, y_train), (X_valid, y_valid), (X_test, y_test)))

# Column names for the header line: the dataset's feature names plus the target.
header_cols = [*housing.feature_names, 'MedianHouseValue']
header = ','.join(header_cols)

# Write each split to disk; the training split is spread over more files.
train_filepaths = save_to_csv_files(train_data, 'train', header, n_parts=20)
valid_filepaths = save_to_csv_files(valid_data, 'valid', header, n_parts=10)
test_filepaths = save_to_csv_files(test_data, 'test', header, n_parts=10)

In [17]:
# Preview the first four lines of the first training file. The context
# manager closes the handle, and only the lines that are shown get read.
with open(train_filepaths[0]) as f:
    print(''.join(f.readline() for _ in range(4)))

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
3.5214,15.0,3.0499445061043287,1.106548279689234,1447.0,1.6059933407325193,37.63,-122.43,1.442
5.3275,5.0,6.490059642147117,0.9910536779324056,3464.0,3.4433399602385686,33.69,-117.39,1.687
3.1,29.0,7.5423728813559325,1.5915254237288134,1328.0,2.2508474576271187,38.44,-122.98,1.621



In [18]:
train_filepaths

['datasets/housing/my_train_00.csv',
 'datasets/housing/my_train_01.csv',
 'datasets/housing/my_train_02.csv',
 'datasets/housing/my_train_03.csv',
 'datasets/housing/my_train_04.csv',
 'datasets/housing/my_train_05.csv',
 'datasets/housing/my_train_06.csv',
 'datasets/housing/my_train_07.csv',
 'datasets/housing/my_train_08.csv',
 'datasets/housing/my_train_09.csv',
 'datasets/housing/my_train_10.csv',
 'datasets/housing/my_train_11.csv',
 'datasets/housing/my_train_12.csv',
 'datasets/housing/my_train_13.csv',
 'datasets/housing/my_train_14.csv',
 'datasets/housing/my_train_15.csv',
 'datasets/housing/my_train_16.csv',
 'datasets/housing/my_train_17.csv',
 'datasets/housing/my_train_18.csv',
 'datasets/housing/my_train_19.csv']

In [19]:
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

In [20]:
for filepath in filepath_dataset:
    print(filepath)

tf.Tensor(b'datasets/housing/my_train_05.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_16.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_01.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_17.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_00.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_14.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_10.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_02.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_12.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_19.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_07.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_09.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_13.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_15.csv', shape=(), dtype=string)
tf.Ten

In [21]:
n_readers = 5
dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers)

In [22]:
for line in dataset.take(5):
    print(line)

tf.Tensor(b'4.5909,16.0,5.475877192982456,1.0964912280701755,1357.0,2.9758771929824563,33.63,-117.71,2.418', shape=(), dtype=string)
tf.Tensor(b'2.4792,24.0,3.4547038327526134,1.1341463414634145,2251.0,3.921602787456446,34.18,-118.38,2.0', shape=(), dtype=string)
tf.Tensor(b'4.2708,45.0,5.121387283236994,0.953757225433526,492.0,2.8439306358381504,37.48,-122.19,2.67', shape=(), dtype=string)
tf.Tensor(b'2.1856,41.0,3.7189873417721517,1.0658227848101265,803.0,2.0329113924050635,32.76,-117.12,1.205', shape=(), dtype=string)
tf.Tensor(b'4.1812,52.0,5.701388888888889,0.9965277777777778,692.0,2.4027777777777777,33.73,-118.31,3.215', shape=(), dtype=string)


## Preprocessing the Data

In [23]:
# Per-feature mean and variance of the training set (reduced over axis 0),
# cast to float32 to match the dtype of the parsed CSV fields.
mean, var = tf.nn.moments(X_train, axes=0)
mean = tf.cast(mean, tf.float32)
std = tf.cast(tf.sqrt(var), tf.float32)
# Take the feature count from the data instead of hard-coding it.
n_inputs = X_train.shape[-1]

In [24]:
def parse_csv_line(line: tf.Tensor | bytes | str) -> tuple[tf.Tensor, tf.Tensor]:
    """Parse one CSV record into a feature vector and a one-element label.

    Args:
        line: A single CSV line holding ``n_inputs`` feature columns followed
            by one label column.

    Returns:
        ``(x, y)`` where ``x`` stacks every field except the last and ``y``
        stacks only the last field, so ``y`` has one element.
    """
    # A 0. default lets a missing feature value fall back to zero. The empty
    # float32 tensor supplies no default for the label, which makes that
    # column required: a record without it raises an error.
    feature_defaults = [0.] * n_inputs
    label_default = tf.constant([], dtype=tf.float32)
    fields = tf.io.decode_csv(
        line, record_defaults=feature_defaults + [label_default])
    features, label = fields[:-1], fields[-1:]
    return tf.stack(features), tf.stack(label)

In [25]:
def preprocess(line: tf.Tensor | bytes | str) -> tuple[tf.Tensor, tf.Tensor]:
    """Parse a CSV line and standardize its features.

    The features are shifted by the module-level ``mean`` and divided by the
    module-level ``std``; the label is returned exactly as parsed.

    Args:
        line: A single CSV line accepted by ``parse_csv_line``.

    Returns:
        ``(standardized_features, label)``.
    """
    features, label = parse_csv_line(line)
    standardized = (features - mean) / std
    return standardized, label

In [26]:
preprocess(b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,37.47,-122.2,2.782')

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([ 0.16579159,  1.216324  , -0.05204564, -0.39215982, -0.5277444 ,
        -0.2633488 ,  0.8543046 , -1.3072058 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.782], dtype=float32)>)

## Putting Everything Together + Prefetching

In [27]:
def csv_reader_dataset(
    filepaths: list[str],
    *,
    n_readers: int = 5,
    n_read_threads: int | None = None,
    n_parse_threads: int = 5,
    shuffle_buffer_size: int = 10_000,
    seed: int = 42,
    batch_size: int = 32
) -> tf.data.Dataset:
    """Build a shuffled, batched, prefetched dataset from CSV files.

    Pipeline order: list the files, interleave their lines, run
    ``preprocess`` on every line, shuffle, batch, then prefetch one batch.

    Args:
        filepaths: CSV files to read.
        n_readers: ``cycle_length`` of the interleave step.
        n_read_threads: ``num_parallel_calls`` of the interleave step.
        n_parse_threads: ``num_parallel_calls`` of the ``preprocess`` map.
        shuffle_buffer_size: Buffer size of the shuffle step.
        seed: Seed used for both the file listing and the shuffle step.
        batch_size: Number of elements per batch.

    Returns:
        The assembled ``tf.data.Dataset``.
    """
    def read_data_lines(filepath):
        # Skip the first line of each file; this assumes every file starts
        # with a header row.
        return tf.data.TextLineDataset(filepath).skip(1)

    return (
        tf.data.Dataset.list_files(filepaths, seed=seed)
        .interleave(read_data_lines, cycle_length=n_readers,
                    num_parallel_calls=n_read_threads)
        .map(preprocess, num_parallel_calls=n_parse_threads)
        .shuffle(shuffle_buffer_size, seed=seed)
        .batch(batch_size)
        .prefetch(1)
    )

In [28]:
example_set = csv_reader_dataset(train_filepaths, batch_size= 3)
for X_batch, y_batch in example_set.take(2):
    print('X =', X_batch)
    print('y =', y_batch)
    print()

X = tf.Tensor(
[[-1.3957452  -0.04940685 -0.22830808  0.22648273  2.2593622   0.35200632
   0.9667386  -1.4121602 ]
 [ 2.7112627  -1.0778131   0.69413143 -0.14870553  0.51810503  0.3507294
  -0.82285154  0.80680597]
 [-0.13484643 -1.868895    0.01032507 -0.13787179 -0.12893449  0.03143518
   0.2687057   0.13212144]], shape=(3, 8), dtype=float32)
y = tf.Tensor(
[[1.819]
 [3.674]
 [0.954]], shape=(3, 1), dtype=float32)

X = tf.Tensor(
[[ 0.09031774  0.9789995   0.1327582  -0.13753782 -0.23388447  0.10211545
   0.97610843 -1.4121602 ]
 [ 0.05218809 -2.0271113   0.2940109  -0.02403445  0.16218767 -0.02844518
   1.4117942  -0.93737936]
 [-0.672276    0.02970133 -0.76922584 -0.15086786  0.4962024  -0.02741998
  -0.7853724   0.77182245]], shape=(3, 8), dtype=float32)
y = tf.Tensor(
[[2.725]
 [1.205]
 [1.625]], shape=(3, 1), dtype=float32)



In [29]:
# Print every public member of tf.data.Dataset with the first line of its docstring.
for member_name in dir(tf.data.Dataset):
    if member_name.startswith('_') or member_name.endswith('_'):
        continue
    member = getattr(tf.data.Dataset, member_name)
    # hasattr(obj, '__doc__') is true for every object, so test the value
    # itself: a None docstring would make .split() raise AttributeError.
    doc = getattr(member, '__doc__', None)
    if doc is None:
        continue
    # Build the pieces outside the f-string: nested same-type quotes and
    # backslashes inside replacement fields only parse on Python 3.12+.
    summary = doc.split('\n')[0]
    label = f'{member_name}()'
    print(f'● {label:<27}{summary}')

● apply()                    Applies a transformation function to this dataset.
● as_numpy_iterator()        Returns an iterator which converts all elements of the dataset to numpy.
● batch()                    Combines consecutive elements of this dataset into batches.
● bucket_by_sequence_length()A transformation that buckets elements in a `Dataset` by length.
● cache()                    Caches the elements in this dataset.
● cardinality()              Returns the cardinality of the dataset, if known.
● choose_from_datasets()     Creates a dataset that deterministically chooses elements from `datasets`.
● concatenate()              Creates a `Dataset` by concatenating the given dataset with this dataset.
● counter()                  Creates a `Dataset` that counts from `start` in steps of size `step`.
● element_spec()             The type specification of an element of this dataset.
● enumerate()                Enumerates the elements of this dataset.
● filter()                   Fi

## The TFRecord Format

In [30]:
with tf.io.TFRecordWriter('my_data.tfrecord') as f:
    f.write(b'This is the first record')
    f.write(b'And this is the second record')

In [31]:
filepaths = ['my_data.tfrecord']
dataset = tf.data.TFRecordDataset(filepaths)

In [32]:
for item in dataset:
    print(item)

tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'And this is the second record', shape=(), dtype=string)


In [33]:
options = tf.io.TFRecordOptions(compression_type='GZIP')
with tf.io.TFRecordWriter('my_compressed.tfrecord', options) as f:
    f.write(b'Compress, compress, compress!')

In [34]:
dataset = tf.data.TFRecordDataset(['my_compressed.tfrecord'],
                                  compression_type='GZIP')

## TensorFlow Protobufs

In [35]:
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Feature, Features, Example

In [36]:
person_example = Example(
    features= Features(
        feature= {
            'name': Feature(bytes_list= BytesList(value= [b'Alic'])),
            'id': Feature(int64_list= Int64List(value= [123])),
            'emails': Feature(bytes_list= BytesList(value= [b'a@b.com', b'c@a.com']))
        }
    )
)

In [37]:
with tf.io.TFRecordWriter('my_contacts.tfrecord') as f:
    for _ in range(5):
        f.write(person_example.SerializeToString())

## Loading and Parsing Examples

In [38]:
feature_description = {
    'name': tf.io.FixedLenFeature([], tf.string, default_value= ''),
    'id': tf.io.FixedLenFeature([], tf.int64, default_value= 0),
    'emails': tf.io.VarLenFeature(tf.string)
}

In [39]:
def parse(serialized_example):
    """Decode one serialized ``Example`` into a dict of parsed features.

    The expected features are taken from the module-level
    ``feature_description`` mapping.
    """
    parsed_features = tf.io.parse_single_example(
        serialized_example, feature_description)
    return parsed_features

In [40]:
dataset = tf.data.TFRecordDataset(['my_contacts.tfrecord']).map(parse)

In [41]:
for parsed_example in dataset.take(1):
    print(parsed_example)

{'emails': SparseTensor(indices=tf.Tensor(
[[0]
 [1]], shape=(2, 1), dtype=int64), values=tf.Tensor([b'a@b.com' b'c@a.com'], shape=(2,), dtype=string), dense_shape=tf.Tensor([2], shape=(1,), dtype=int64)), 'id': <tf.Tensor: shape=(), dtype=int64, numpy=123>, 'name': <tf.Tensor: shape=(), dtype=string, numpy=b'Alic'>}


In [42]:
tf.sparse.to_dense(parsed_example['emails'], default_value= b'')

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'a@b.com', b'c@a.com'], dtype=object)>

In [43]:
parsed_example['emails'].values

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'a@b.com', b'c@a.com'], dtype=object)>

In [44]:
def parse(serialized_examples):
    """Decode a whole batch of serialized ``Example`` protos in one call.

    This rebinds the name ``parse`` to a batch-wise version built on
    ``tf.io.parse_example``; the expected features still come from the
    module-level ``feature_description`` mapping.
    """
    parsed_batch = tf.io.parse_example(serialized_examples, feature_description)
    return parsed_batch

In [45]:
dataset = tf.data.TFRecordDataset(['my_contacts.tfrecord']).batch(2).map(parse)

In [46]:
for example in dataset:
    print(example)

{'emails': SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]
 [1 0]
 [1 1]], shape=(4, 2), dtype=int64), values=tf.Tensor([b'a@b.com' b'c@a.com' b'a@b.com' b'c@a.com'], shape=(4,), dtype=string), dense_shape=tf.Tensor([2 2], shape=(2,), dtype=int64)), 'id': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([123, 123])>, 'name': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'Alic', b'Alic'], dtype=object)>}
{'emails': SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]
 [1 0]
 [1 1]], shape=(4, 2), dtype=int64), values=tf.Tensor([b'a@b.com' b'c@a.com' b'a@b.com' b'c@a.com'], shape=(4,), dtype=string), dense_shape=tf.Tensor([2 2], shape=(2,), dtype=int64)), 'id': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([123, 123])>, 'name': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'Alic', b'Alic'], dtype=object)>}
{'emails': SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]], shape=(2, 2), dtype=int64), values=tf.Tensor([b'a@b.com' b'c@a.com'], shape=(2,), dtype=string), dense_shape=tf.Tensor(

## Keras Preprocessing Layers
### Discretization Layer

In [47]:
age = tf.constant([[10.], [93.], [57.], [18.], [37.], [5.]])
discretize_layer = tf.keras.layers.Discretization(bin_boundaries= [18., 50.])
age_categories = discretize_layer(age)
age_categories

<tf.Tensor: shape=(6, 1), dtype=int64, numpy=
array([[0],
       [2],
       [2],
       [1],
       [1],
       [0]])>

In [48]:
discretize_layer = tf.keras.layers.Discretization(num_bins= 3)
discretize_layer.adapt(age)
age_categories = discretize_layer(age)
age_categories

<tf.Tensor: shape=(6, 1), dtype=int64, numpy=
array([[1],
       [2],
       [2],
       [1],
       [2],
       [0]])>

### Category Encoding Layer

In [49]:
onehot_layer = tf.keras.layers.CategoryEncoding(num_tokens=3)
onehot_layer(age_categories)

<tf.Tensor: shape=(6, 3), dtype=float32, numpy=
array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)>

In [50]:
two_age_categories = np.array([[1, 0], [2, 2], [2, 0]])
onehot_layer(two_age_categories)

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[1., 1., 0.],
       [0., 0., 1.],
       [1., 0., 1.]], dtype=float32)>

In [51]:
onehot_layer = tf.keras.layers.CategoryEncoding(num_tokens= 3, output_mode= 'count')
onehot_layer(two_age_categories)

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[1., 1., 0.],
       [0., 0., 2.],
       [1., 0., 1.]], dtype=float32)>

In [52]:
onehot_layer = tf.keras.layers.CategoryEncoding(num_tokens= 3 + 3)
onehot_layer(two_age_categories + [0, 3])  # adds 3 to the second feature

<tf.Tensor: shape=(3, 6), dtype=float32, numpy=
array([[0., 1., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 1., 0., 0.]], dtype=float32)>

### StringLookup Layer

In [53]:
cities = ['Auckland', 'Paris', 'Paris', 'San Francisco']
str_lookup_layer = tf.keras.layers.StringLookup()
str_lookup_layer.adapt(cities)
str_lookup_layer([['Paris'], ['Auckland'], ['Auckland'], ['Montreal']])

<tf.Tensor: shape=(4, 1), dtype=int64, numpy=
array([[1],
       [3],
       [3],
       [0]])>

In [54]:
str_lookup_layer = tf.keras.layers.StringLookup(num_oov_indices= 5)
str_lookup_layer.adapt(cities)
str_lookup_layer([['Paris'], ['Auckland'], ['Foo'], ['Bar'], ['Baz']])

<tf.Tensor: shape=(5, 1), dtype=int64, numpy=
array([[5],
       [7],
       [4],
       [3],
       [4]])>

In [55]:
str_lookup_layer = tf.keras.layers.StringLookup(output_mode= 'one_hot')
str_lookup_layer.adapt(cities)
str_lookup_layer([['Paris'], ['Auckland'], ['Auckland'], ['Montreal']])

<tf.Tensor: shape=(4, 4), dtype=int64, numpy=
array([[0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [1, 0, 0, 0]])>

In [56]:
ids = [123, 456, 789]
int_lookup_layer = tf.keras.layers.IntegerLookup()
int_lookup_layer.adapt(ids)
int_lookup_layer([[123], [456], [123], [111]])

<tf.Tensor: shape=(4, 1), dtype=int64, numpy=
array([[3],
       [2],
       [3],
       [0]])>

### Hashing Layer

In [57]:
hashing_layer = tf.keras.layers.Hashing(num_bins= 10)
hashing_layer([['Paris'], ['Tokyo'], ['Auckland'], ['Montreal']])

<tf.Tensor: shape=(4, 1), dtype=int64, numpy=
array([[0],
       [1],
       [9],
       [1]])>

### Encoding Categorical Features Using Embeddings

In [58]:
tf.random.set_seed(42)
embedding_layer = tf.keras.layers.Embedding(input_dim= 5, output_dim= 2)
embedding_layer(np.array([2, 4, 2]))

<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[ 0.01969345,  0.01732457],
       [-0.04321727,  0.00143442],
       [ 0.01969345,  0.01732457]], dtype=float32)>

In [60]:
tf.random.set_seed(42)
ocean_prox = ['<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND']
str_lookup_layer = tf.keras.layers.StringLookup()
str_lookup_layer.adapt(ocean_prox)
lookup_and_embed = tf.keras.Sequential([
    tf.keras.layers.InputLayer(shape= [], dtype= tf.string),
    str_lookup_layer,
    tf.keras.layers.Embedding(input_dim= str_lookup_layer.vocabulary_size(),
                              output_dim= 2)
])
lookup_and_embed(np.array(['<1H OCEAN', 'ISLAND', '<1H OCEAN']))

<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[-0.04645429,  0.04505834],
       [-0.01421032, -0.00461551],
       [-0.04645429,  0.04505834]], dtype=float32)>

In [61]:
# Seed both TensorFlow and NumPy before generating data and building the model.
tf.random.set_seed(42)
np.random.seed(42)
# Synthetic stand-in data. The draws below consume the seeded NumPy generator
# in order, so reordering these lines would change the generated values.
# NOTE(review): y_train and y_valid rebind names that earlier held the housing
# labels returned by train_test_split.
X_train_num = np.random.rand(10_000, 8)
# NOTE(review): presumably cast to object so the categories are passed as
# Python strings rather than a NumPy unicode array — TODO confirm.
X_train_cat = np.random.choice(ocean_prox, size= 10_000).astype(object)
y_train = np.random.rand(10_000, 1)
X_valid_num = np.random.rand(2_000, 8)
X_valid_cat = np.random.choice(ocean_prox, size= 2_000).astype(object)
y_valid = np.random.rand(2_000, 1)

# Functional model with two inputs: 8 numeric features and one string category.
num_input = tf.keras.layers.Input(shape= [8], name= 'num')
cat_input = tf.keras.layers.Input(shape= [], dtype= tf.string, name= 'cat')
# Reuse the lookup + embedding model defined earlier to encode the category.
cat_embeddings = lookup_and_embed(cat_input) 
# Join the numeric features with the category embedding and feed a single
# linear output unit.
encoded_inputs = tf.keras.layers.concatenate([num_input, cat_embeddings])
outputs = tf.keras.layers.Dense(1)(encoded_inputs)
model = tf.keras.models.Model(inputs= [num_input, cat_input], outputs= [outputs])
model.compile(loss= 'mse', optimizer= 'sgd')
# Inputs are passed as a (numeric, categorical) tuple in the same order as
# the model's `inputs` list.
history = model.fit((X_train_num, X_train_cat), y_train, epochs= 5,
                    validation_data= ((X_valid_num, X_valid_cat), y_valid))

Epoch 1/5
313/313 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 0.2619 - val_loss: 0.1127
Epoch 2/5
313/313 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 0.1074 - val_loss: 0.0934
Epoch 3/5
313/313 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 0.0914 - val_loss: 0.0866
Epoch 4/5
313/313 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 0.0856 - val_loss: 0.0842
Epoch 5/5
313/313 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 0.0833 - val_loss: 0.0833


### Text Preprocessing

In [62]:
# Small corpus used to build the vocabulary.
train_data = ['To be', '!(to be)', 'That\'s the question', 'Be, be, be.']
text_vec_layer = tf.keras.layers.TextVectorization()   # default output: tokenizes and maps each token to an integer index (not one-hot)
text_vec_layer.adapt(train_data)
text_vec_layer(['Be good!', 'Question: be or be?'])

<tf.Tensor: shape=(2, 4), dtype=int64, numpy=
array([[2, 1, 0, 0],
       [6, 2, 1, 2]])>

In [63]:
text_vec_layer = tf.keras.layers.TextVectorization(ragged= True)
text_vec_layer.adapt(train_data)
text_vec_layer(['Be good!', 'Question: be or be?'])

<tf.RaggedTensor [[2, 1], [6, 2, 1, 2]]>

In [64]:
text_vec_layer = tf.keras.layers.TextVectorization(output_mode= 'tf_idf')
text_vec_layer.adapt(train_data)
text_vec_layer(['Be good!', 'Question: be or be?'])

<tf.Tensor: shape=(2, 6), dtype=float32, numpy=
array([[0.96725637, 0.6931472 , 0.        , 0.        , 0.        ,
        0.        ],
       [0.96725637, 1.3862944 , 0.        , 0.        , 0.        ,
        1.0986123 ]], dtype=float32)>

## Using Pretrained Language Model Components

In [65]:
import tensorflow_hub as hub

hub_layer = hub.KerasLayer('https://tfhub.dev/google/nnlm-en-dim50/2')
sentence_embeddings = hub_layer(tf.constant(['To be', 'Not to be']))
sentence_embeddings.numpy().round(2)

array([[-0.25,  0.28,  0.01,  0.1 ,  0.14,  0.16,  0.25,  0.02,  0.07,
         0.13, -0.19,  0.06, -0.04, -0.07,  0.  , -0.08, -0.14, -0.16,
         0.02, -0.24,  0.16, -0.16, -0.03,  0.03, -0.14,  0.03, -0.09,
        -0.04, -0.14, -0.19,  0.07,  0.15,  0.18, -0.23, -0.07, -0.08,
         0.01, -0.01,  0.09,  0.14, -0.03,  0.03,  0.08,  0.1 , -0.01,
        -0.03, -0.07, -0.1 ,  0.05,  0.31],
       [-0.2 ,  0.2 , -0.08,  0.02,  0.19,  0.05,  0.22, -0.09,  0.02,
         0.19, -0.02, -0.14, -0.2 , -0.04,  0.01, -0.07, -0.22, -0.1 ,
         0.16, -0.44,  0.31, -0.1 ,  0.23,  0.15, -0.05,  0.15, -0.13,
        -0.04, -0.08, -0.16, -0.1 ,  0.13,  0.13, -0.18, -0.04,  0.03,
        -0.1 , -0.07,  0.07,  0.03, -0.08,  0.02,  0.05,  0.07, -0.14,
        -0.1 , -0.18, -0.13, -0.04,  0.15]], dtype=float32)

In [66]:
import subprocess
# Archive the Kaggle working directory. check=True makes a failed `zip` raise
# CalledProcessError instead of returning a non-zero exit status that is easy
# to overlook.
subprocess.run(['zip', '-r', 'working_dir.zip', '/kaggle/working'],
               stdout=subprocess.DEVNULL, check=True)

CompletedProcess(args=['zip', '-r', 'working_dir.zip', '/kaggle/working'], returncode=0)