In [1]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Flatten, Dense, Dropout
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Download the IMDB movie database

## Split the test set into a validation set (15,000) and a test set (10,000)

In [2]:
import tensorflow_datasets as tfds

  from .autonotebook import tqdm as notebook_tqdm


**Splitting and Slicing**

https://www.tensorflow.org/datasets/splits



In [3]:
train, val, test = tfds.load(name='imdb_reviews', split=['train', 'test[:15000]', 'test[15000:]'])

2022-09-05 13:20:10.746281: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


look at one sample

## Apply `TextVectorization` layer to data

https://keras.io/guides/preprocessing_layers/

In [4]:
from tensorflow.keras.layers import TextVectorization

In [5]:
max_features = 10000

In [None]:
def custom_standardization(input_data):
    # lowercase
    lowercase = tf.strings.lower(input_data)
    # replace punctuation with spaces
    regex = tf.strings.regex_replace(lowercase, '''[.,?!'\";:-]''')
    # split
    split = tf.strings.split(regex)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [6]:
# create text vectorization layer
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features, 
    output_mode='int'
)

In [7]:
# Make a text-only dataset (without labels), then call adapt
train_text = train.map(lambda items: items['text']) # don't use item['label'] since we're vectorizing
vectorize_layer.adapt(train_text)

In [8]:
vectorize_layer.get_vocabulary()[:25]

['',
 '[UNK]',
 'the',
 'and',
 'a',
 'of',
 'to',
 'is',
 'in',
 'it',
 'i',
 'this',
 'that',
 'br',
 'was',
 'as',
 'for',
 'with',
 'movie',
 'but',
 'film',
 'on',
 'not',
 'you',
 'are']

In [9]:
# list(train.take(1).as_numpy_iterator())

Print an example output of the tokenizer

In [10]:
model = tf.keras.models.Sequential()

# Start by creating an explicit input layer. It needs to have a shape of
# (1,) (because we need to guarantee that there is exactly one string
# input per batch), and the dtype needs to be 'string'.
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))

# The first layer in our model is the vectorization layer. After this
# layer, we have a tensor of shape (batch_size, max_len) containing vocab
# indices.
model.add(vectorize_layer)

# Now, the model can map strings to integers, and you can add an embedding
# layer to map these integers to learned embeddings.
input_data = [["i hate this movie"], ["i love this movie"]]
model.predict(input_data)



array([[ 10, 782,  11,  18],
       [ 10, 116,  11,  18]])

## Create an efficient dataset for each set with `tf.data`

**Configure the dataset for performance** From https://www.tensorflow.org/tutorials/keras/text_classification#configure_the_dataset_for_performance: 


These are two important methods you should use when loading data to make sure that I/O does not become blocking.

.cache() keeps data in memory after it's loaded off disk. This will ensure the dataset does not become a bottleneck while training your model. If your dataset is too large to fit into memory, you can also use this method to create a performant on-disk cache, which is more efficient to read than many small files.

.prefetch() overlaps data preprocessing and model execution while training.

In [11]:
BATCH_SIZE = 32

train = train.shuffle(len(train), reshuffle_each_iteration=True).cache().batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
val = val.shuffle(len(val), reshuffle_each_iteration=True).cache().batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test = test.shuffle(len(test), reshuffle_each_iteration=True).cache().batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Create NN Model

In [12]:
embedding_dim = 16

In [15]:
from tensorflow import keras

In [16]:
model = tf.keras.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorize_layer,
    keras.layers.Embedding(max_features + 1, embedding_dim),
    keras.layers.Dense(300, activation="elu", kernel_initializer='he_normal'),
    keras.layers.Dense(300, activation="elu", kernel_initializer='he_normal'),
    keras.layers.Dense(300, activation="elu", kernel_initializer='he_normal'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1, activation="sigmoid")])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, None)             0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, None, 16)          160016    
                                                                 
 dense (Dense)               (None, None, 300)         5100      
                                                                 
 dense_1 (Dense)             (None, None, 300)         90300     
                                                                 
 dense_2 (Dense)             (None, None, 300)         90300     
                                                                 
 dropout (Dropout)           (None, None, 300)         0         
                                                      

In [17]:
model.compile(
    loss="binary_crossentropy", 
    optimizer="nadam",
    metrics=["accuracy"]
)

In [20]:
model.fit(
     train.take(1).as_numpy_iterator(),
    epochs=5, 
    validation_data=val)

Epoch 1/5


ValueError: in user code:

    File "/Users/naekid/Documents/Documents - Nathan’s MacBook Pro/data-science/oreilly-hands-on-machine-learning/venv/lib/python3.7/site-packages/keras/engine/training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "/Users/naekid/Documents/Documents - Nathan’s MacBook Pro/data-science/oreilly-hands-on-machine-learning/venv/lib/python3.7/site-packages/keras/engine/training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/naekid/Documents/Documents - Nathan’s MacBook Pro/data-science/oreilly-hands-on-machine-learning/venv/lib/python3.7/site-packages/keras/engine/training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "/Users/naekid/Documents/Documents - Nathan’s MacBook Pro/data-science/oreilly-hands-on-machine-learning/venv/lib/python3.7/site-packages/keras/engine/training.py", line 889, in train_step
        y_pred = self(x, training=True)
    File "/Users/naekid/Documents/Documents - Nathan’s MacBook Pro/data-science/oreilly-hands-on-machine-learning/venv/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/naekid/Documents/Documents - Nathan’s MacBook Pro/data-science/oreilly-hands-on-machine-learning/venv/lib/python3.7/site-packages/keras/engine/input_spec.py", line 183, in assert_input_compatibility
        raise ValueError(f'Missing data for input "{name}". '

    ValueError: Missing data for input "input_3". You passed a data dictionary with keys ['label', 'text']. Expected the following keys: ['input_3']
