### Subword Tokenization with the IMDB Reviews Dataset

##### Importing libraries

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import keras_nlp

In [2]:
# Load the IMDB reviews dataset; as_supervised=True makes each example a (review, label) pair
imdb = tfds.load("imdb_reviews", as_supervised=True)


In [3]:
# Split the (review, label) pairs of each split into review-only and label-only datasets

train_reviews = imdb["train"].map(lambda text, _: text)
train_labels = imdb["train"].map(lambda _, target: target)
test_reviews = imdb["test"].map(lambda text, _: text)
test_labels = imdb["test"].map(lambda _, target: target)

In [4]:
# Display the first training review (a scalar string tensor holding the raw review text)
list(train_reviews.take(2))[0]

<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">

##### Subword Tokenization

In [5]:
#parameters for tokenization and padding

vocab_size = 10000          # max_tokens of the full-word TextVectorization layer
max_length = 120            # maxlen passed to pad_sequences in padding_func
padding_type = "pre"        # padding= argument passed to pad_sequences
truncating_type = "post"    # truncating= argument passed to pad_sequences

In [6]:
#instantiating vectorization layer (full-word tokenizer capped at vocab_size tokens)
vectorize_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)

#generating vocabulary based on training reviews
#(only the training split is used, so the test reviews do not influence the vocabulary)
vectorize_layer.adapt(train_reviews)


In [7]:
def padding_func(sequences):
    """Pad or truncate every sequence of a dataset to ``max_length``.

    The whole dataset is collected into one ragged batch, padded with
    ``tf.keras.utils.pad_sequences`` using the module-level ``max_length``,
    ``padding_type`` and ``truncating_type`` settings, and then sliced back
    into a dataset with one padded sequence per element.

    Args:
        sequences: a ``tf.data.Dataset`` of variable-length token sequences.

    Returns:
        A ``tf.data.Dataset`` of the padded sequences.
    """
    # A single ragged batch holding every sequence, so they can be padded in one call
    single_batch = sequences.ragged_batch(batch_size=sequences.cardinality())
    ragged = single_batch.get_single_element()

    padded = tf.keras.utils.pad_sequences(
        ragged.numpy(),
        maxlen=max_length,
        padding=padding_type,
        truncating=truncating_type,
    )

    return tf.data.Dataset.from_tensor_slices(padded)

In [8]:
# Turn each training review into full-word token ids, then pad/truncate to a fixed length
train_sequences = padding_func(train_reviews.map(vectorize_layer))

The cell above uses a `vocab_size` of 10000, but out-of-vocabulary (OOV) tokens still show up easily when you decode a sequence using the lookup vocabulary it created:

In [9]:
# Get the full-word vocabulary; a token id is used as an index into this list.
# It is stored as imdb_vocab_fullword because the later full-word decoding cell
# reads it under that name.
imdb_vocab_fullword = vectorize_layer.get_vocabulary()

# Get a sample integer sequence
sample_sequence = train_sequences.take(1).get_single_element()

# Look up each token id in the vocabulary and join the words back into text
decoded_text = " ".join(imdb_vocab_fullword[index] for index in sample_sequence)
print(decoded_text)

    this was an absolutely terrible movie dont be [UNK] in by christopher walken or michael [UNK] both are great actors but this must simply be their worst role in history even their great acting could not redeem this movies ridiculous storyline this movie is an early nineties us propaganda piece the most pathetic scenes were those when the [UNK] rebels were making their cases for [UNK] maria [UNK] [UNK] appeared phony and her [UNK] affair with walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning i am disappointed that there are movies like this ruining actors like christopher [UNK] good name i could barely sit through it


* For binary classifiers, this might not have a big impact, but you may have other applications that will benefit from avoiding OOV tokens when training the model (e.g. text generation). If you want the tokenizer above to not have OOVs, then you might have to increase the vocabulary size to more than 88k. Right now, it's only at 10k. Such a large vocabulary can slow down training and bloat the model size. The encoder also won't be robust when used on other datasets, which may contain new words and thus result in OOVs again.

* Subword text encoding gets around this problem by using parts of the word to compose whole words. This makes it more flexible when it encounters uncommon words.

Firstly, we will compute the subword vocabulary using the compute_word_piece_vocabulary() function.

- learn from the train_reviews
- set a max vocabulary size of 8k
- reserve special tokens similar to the full word vocabulary
- save the output to a file in the current directory

In [None]:
#computing the subword vocabulary and saving it into a file
#(learned from train_reviews, capped at 8000 entries, with "[PAD]" and "[UNK]" reserved)

keras_nlp.tokenizers.compute_word_piece_vocabulary(
    train_reviews,
    vocabulary_size=8000,
    reserved_tokens=["[PAD]", "[UNK]"],
    vocabulary_output_file='imdb_vocab_subwords.txt'
)

NOTE:
* compute_word_piece_vocabulary requires `tensorflow` and `tensorflow-text` for text processing. Run `pip install tensorflow-text`

In [12]:
# Since we are using Python 3.13, which is not compatible with tensorflow-text>=2.20, we use a pre-downloaded vocabulary file instead.

In [None]:
#initialize the subword tokenizer from the vocabulary file in the current directory
subword_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary="./imdb_vocab_subwords.txt"
)


In [None]:
#printing the subwords (the cell displays the vocabulary loaded by the tokenizer)

subword_tokenizer.get_vocabulary()


If we use it on the previous plain-text review, we will see that it doesn't produce any OOVs even though it has a smaller vocabulary (only around 8k, compared to 10k above).

In [None]:
#show the size of the subword vocabulary (number of entries loaded by the tokenizer)

subword_tokenizer.vocabulary_size()

In [None]:
# Take the first training review as plain text
sample_review = train_reviews.take(1).get_single_element()

# Encode it into subword token ids and show them
tokenized_string = subword_tokenizer.tokenize(sample_review)
print(f'Tokenized string is {tokenized_string}')

# Decode the ids back into text and show the round-trip result
original_string = subword_tokenizer.detokenize(tokenized_string)
print(f'The original string: {original_string}')

Subword encoding can even perform well on words that are not commonly found in movie reviews. First, see the result when using the full-word tokenizer. As expected, it will show many unknown words.

In [None]:
# A sentence made of words that rarely appear in movie reviews
sample_string = 'TensorFlow, from basics to mastery'

# Encode it with the full-word tokenizer and show the token ids
tokenized_string = vectorize_layer(sample_string)
print(f'Tokenized string is {tokenized_string}')

# Map every id back to its vocabulary entry and print the reconstructed text
decoded_text = [imdb_vocab_fullword[token] for token in tokenized_string]
original_string = ' '.join(decoded_text)
print(f'The original string: {original_string}')

In [None]:
# Encode the same sentence with the subword tokenizer and show the token ids
tokenized_string = subword_tokenizer.tokenize(sample_string)
print(f'Tokenized string is {tokenized_string}')

# Decode the ids back to a Python string and print it
original_string = (
    subword_tokenizer.detokenize(tokenized_string)
    .numpy()
    .decode("utf-8")
)
print(f'The original string: {original_string}')


As you may notice, the sentence is correctly decoded. The downside is that the token sequence is much longer: instead of only 5 tokens with the full-word tokenizer, you end up with 12.

In [None]:
# Show which subword each token id stands for
for token_id in tokenized_string:
    subword = subword_tokenizer.detokenize([token_id]).numpy().decode("utf-8")
    print(f'{token_id} ----> {subword}')

##### Training the model

In [None]:
SHUFFLE_BUFFER_SIZE = 10000
PREFETCH_BUFFER_SIZE = tf.data.AUTOTUNE
BATCH_SIZE = 32

# Generate integer sequences using the subword tokenizer
train_sequences_subword = train_reviews.map(lambda review: subword_tokenizer.tokenize(review)).apply(padding_func)
test_sequences_subword = test_reviews.map(lambda review: subword_tokenizer.tokenize(review)).apply(padding_func)

# Combine the integer sequence and labels
train_dataset_vectorized = tf.data.Dataset.zip(train_sequences_subword,train_labels)
test_dataset_vectorized = tf.data.Dataset.zip(test_sequences_subword,test_labels)

# Optimize the datasets for training.
# Order matters:
#   * cache() comes before shuffle() so the examples are cached but a fresh
#     shuffle is drawn every epoch (caching after shuffling would replay the
#     same order each epoch);
#   * prefetch() comes last so whole batches, not single examples, are
#     prepared ahead of the training step.
train_dataset_final = (train_dataset_vectorized
                       .cache()
                       .shuffle(SHUFFLE_BUFFER_SIZE)
                       .batch(BATCH_SIZE)
                       .prefetch(buffer_size=PREFETCH_BUFFER_SIZE)
                       )

# The test set is not shuffled; it is only cached, batched and prefetched
test_dataset_final = (test_dataset_vectorized
                      .cache()
                      .batch(BATCH_SIZE)
                      .prefetch(buffer_size=PREFETCH_BUFFER_SIZE)
                      )

In [None]:
# Define dimensionality of the embedding
EMBEDDING_DIM = 64

# Build the model.
# The input length is max_length, the same value padding_func passes to
# pad_sequences as maxlen, and the embedding table has one row per entry
# of the subword vocabulary.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(subword_tokenizer.vocabulary_size(), EMBEDDING_DIM),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Print the model summary
model.summary()

In [None]:
# Configure the loss, optimizer and reported metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for a fixed number of epochs, validating on the test set after each one
num_epochs = 10
history = model.fit(
    train_dataset_final,
    validation_data=test_dataset_final,
    epochs=num_epochs,
)

##### Visualizing the results

In [None]:
def plot_loss_acc(history):
    """Plot training vs. validation accuracy and loss in two side-by-side panels.

    Args:
        history: object whose ``history`` attribute is a mapping containing
            the per-epoch lists ``'accuracy'``, ``'val_accuracy'``,
            ``'loss'`` and ``'val_loss'``.
    """
    records = history.history

    # Read all four series up front, before any figure is created
    panels = [
        (records['accuracy'], records['val_accuracy'], 'accuracy',
         'Training accuracy', 'Validation accuracy',
         'Training and validation accuracy'),
        (records['loss'], records['val_loss'], 'loss',
         'Training Loss', 'Validation Loss',
         'Training and validation loss'),
    ]

    # The x axis has one point per recorded epoch
    epochs = range(len(panels[0][0]))

    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    for axis, panel in zip(axes, panels):
        train_values, val_values, y_label, train_label, val_label, title = panel
        axis.plot(epochs, train_values, 'bo', label=train_label)
        axis.plot(epochs, val_values, 'b', label=val_label)
        axis.set_title(title)
        axis.set_xlabel('epochs')
        axis.set_ylabel(y_label)
        axis.legend()

    plt.show()

# Plot the accuracy and loss curves recorded by model.fit
plot_loss_acc(history)