This example is used to explore data pre-processing techniques.

In [6]:
import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf

tfds.disable_progress_bar()

In [7]:
dataset, info = tfds.load('imdb_reviews', with_info=True,
                          as_supervised=True)
train_dataset, test_dataset = dataset['train'], dataset['test']

train_dataset.element_spec


(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [8]:
for example, label in train_dataset.take(1):
  print('text: ', example.numpy())
  print('label: ', label.numpy())

text:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0


In [9]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [10]:
for example, label in train_dataset.take(1):
  print('texts: ', example.numpy()[:3])
  print()
  print('labels: ', label.numpy()[:3])

texts:  [b'\'P\' (or Club-P) should really be called \'L\' for lame. Every festival has a disappointment and this is the one that fails to live up to its much-hyped logline: "Thai lesbians fighting monsters." Rather, this is the tale of a Khmer country girl who\'s grandmother has taught her a little witchcraft along with a few odd (but specific) rules: "don\'t walk under a clothesline," "don\'t eat raw meat," and "don\'t accept money for your powers." Well, guess what folks, the girl moves to Bangkok to raise some money as a \'bar-girl\' and manages to break all the rules granny taught her which subsequently releases an evil spirit that conveniently kills the \'foreign johns\' who pay for her services.<br /><br />While this film can\'t even be released in Thailand due to it\'s controversial subject matter most American audiences will find this ho-hum horror pic a cross between "Showgirls" and "Interview with the Vampire" as directed by Walt Disney.<br /><br />If not for a few scenes wi

In [11]:
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))

In [12]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but'],
      dtype='<U14')

In [13]:
encoded_example = encoder(example)[:3].numpy()
encoded_example

array([[  1,  42,   1, ...,   0,   0,   0],
       [  2,  86, 383, ...,   0,   0,   0],
       [ 11,   7,   2, ...,   0,   0,   0]])

In [14]:
for n in range(3):
  print("Original: ", example[n].numpy())
  print("Round-trip: ", " ".join(vocab[encoded_example[n]]))
  print()

Original:  b'\'P\' (or Club-P) should really be called \'L\' for lame. Every festival has a disappointment and this is the one that fails to live up to its much-hyped logline: "Thai lesbians fighting monsters." Rather, this is the tale of a Khmer country girl who\'s grandmother has taught her a little witchcraft along with a few odd (but specific) rules: "don\'t walk under a clothesline," "don\'t eat raw meat," and "don\'t accept money for your powers." Well, guess what folks, the girl moves to Bangkok to raise some money as a \'bar-girl\' and manages to break all the rules granny taught her which subsequently releases an evil spirit that conveniently kills the \'foreign johns\' who pay for her services.<br /><br />While this film can\'t even be released in Thailand due to it\'s controversial subject matter most American audiences will find this ho-hum horror pic a cross between "Showgirls" and "Interview with the Vampire" as directed by Walt Disney.<br /><br />If not for a few scenes 

In [15]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [16]:
print([layer.supports_masking for layer in model.layers])

[False, True, True, True, True]


In [23]:
# predict on a sample text without padding.

sample_text = ("I loved the movie")
predictions = model.predict(np.array([sample_text]))
print(predictions[0])

ValueError: Invalid dtype: str544

In [24]:
# predict on a sample text with padding

padding = "the " * 2000
predictions = model.predict(np.array([sample_text, padding]))
print(predictions[0])

ValueError: Invalid dtype: str256000