In [None]:
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
# Turn off the tfds progress bars before loading.
tfds.disable_progress_bar()

# Load the IMDB reviews dataset together with its metadata object.
# `as_supervised=True` is requested so each element is a pair; the
# element_spec shown below reports a (tf.string, tf.int64) tuple.
dataset, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)

# Pull the two splits out of the returned mapping by name.
train_dataset = dataset['train']
test_dataset = dataset['test']

# Last expression of the cell: display the structure of one training element.
train_dataset.element_spec

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...
Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [None]:
# Print the first five (text, label) pairs from the training split.
first_five = train_dataset.take(5)
for example, label in first_five:
  # Convert the tensors to NumPy values before printing.
  text_value = example.numpy()
  label_value = label.numpy()
  print('text: ', text_value)
  print('label: ', label_value)

text:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0
text:  b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. 

In [None]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

# NOTE: both names are rebound to their batched versions here, so this cell
# is not idempotent -- re-running it without re-running the load cell would
# batch the already-batched datasets again.
# Training data: shuffle, then batch, then prefetch.
train_dataset = (
    train_dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
# Test data: batch and prefetch only (no shuffle).
test_dataset = (
    test_dataset
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Show the first three texts and labels of a single training batch.
# `example` and `label` remain bound after the loop; later cells read `example`.
one_batch = train_dataset.take(1)
for example, label in one_batch:
  print('texts: ', example.numpy()[:3])
  print()
  print('labels: ', label.numpy()[:3])

texts:  [b"Major Payne was really not very good at all. Despite being funny here and there, the story was ridiculous and the acting was poor. Major Payne's voice and temperament were especially annoying. The idea was ridiculous and the things that the boys had to do in that film were even more ridiculous. I would not recommend this film to anyone."
 b"I got this film from a private collector and was very curious about it. It had a 7,8 in IMDb (9 votes only) and some external comments were pleasant. But I have to say that it is a very usual and uninteresting giallo. Yes, great cinematography, the film is well directed, but it never freaked me out. It starts well, but although it not bored me at all, the story is so ordinary and the things that occur so normal, that I didn't like it very much.<br /><br />You can make a few laughs. And you can see some little tits. But if you like the kind of giallos I like (bizarre, surreal, nonsenseful, gory, atmospheric, brutal murders...) you won't ap

In [None]:
VOCAB_SIZE = 1000

# Text vectorization layer whose vocabulary is capped at VOCAB_SIZE tokens.
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)

# Adapt the layer on the review text only: the map drops the label from each
# (text, label) element of the training dataset.
train_text = train_dataset.map(lambda text, label: text)
encoder.adapt(train_text)

# Materialize the learned vocabulary as a NumPy array so it can be indexed
# with arrays of token ids, and display its first 20 entries.
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but'],
      dtype='<U14')

In [None]:
# Run the encoder over the whole `example` batch first, then keep only the
# first three encoded rows as a NumPy array. Slicing happens after encoding,
# so the rows keep whatever width the encoder produced for the full batch.
encoded_batch = encoder(example)
encoded_example = encoded_batch[:3].numpy()
encoded_example

array([[648,   1,  14, ...,   0,   0,   0],
       [ 10, 183,  11, ...,   0,   0,   0],
       [ 10,   1,  11, ...,   0,   0,   0]])

In [None]:
# For the first three examples, print the raw text next to the text rebuilt
# by looking each encoded token id up in `vocab`.
for n in range(3):
  original_text = example[n].numpy()
  # Index the vocabulary array with the row of ids, then join with spaces.
  decoded_tokens = vocab[encoded_example[n]]
  print("Original: ", original_text)
  print("Round-trip: ", " ".join(decoded_tokens))
  print()

Original:  b"Major Payne was really not very good at all. Despite being funny here and there, the story was ridiculous and the acting was poor. Major Payne's voice and temperament were especially annoying. The idea was ridiculous and the things that the boys had to do in that film were even more ridiculous. I would not recommend this film to anyone."
Round-trip:  major [UNK] was really not very good at all despite being funny here and there the story was ridiculous and the acting was poor major [UNK] voice and [UNK] were especially annoying the idea was ridiculous and the things that the boys had to do in that film were even more ridiculous i would not recommend this film to anyone                                                                                                                                                                                                                                                                                                                      

In [None]:
# Embedding layer sized to the encoder's vocabulary, producing 64-dim vectors.
embedding = tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=64,
    # Use masking to handle the variable sequence lengths
    mask_zero=True,
)

# Stack: text encoder -> embedding -> bidirectional LSTM -> dense head.
# The final Dense(1) layer is created without an explicit activation.
model = tf.keras.Sequential([
    encoder,
    embedding,
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1),
])

# Report, layer by layer, whether each layer declares masking support.
masking_flags = [layer.supports_masking for layer in model.layers]
print(masking_flags)

[False, True, True, True, True]


In [None]:
# predict on a sample text without padding.

# Adjacent string literals are concatenated into one review string.
sample_text = (
    'The movie was cool. '
    'The animation and the graphics were out of this world. '
    'I would recommend this movie.'
)

# Wrap the single string in a one-element array so the model sees a batch of 1.
single_text_batch = np.array([sample_text])
predictions = model.predict(single_text_batch)
print(predictions[0])

[-0.0107784]


In [None]:
# predict on a sample text with padding

# A much longer second input ("the " repeated 2000 times) is batched with
# `sample_text`, so the shorter sample is predicted alongside a long sequence.
padding = "the " * 2000
padded_text_batch = np.array([sample_text, padding])
predictions = model.predict(padded_text_batch)

# Only the prediction for `sample_text` (row 0) is printed, for comparison
# with the unpadded prediction.
print(predictions[0])

[-0.0107784]
