### Single Layer LSTM

In [1]:
#importing libraries

import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import keras_nlp

  from pkg_resources import parse_version


In [2]:
# Load every split of the IMDB reviews dataset through TensorFlow Datasets.
# as_supervised=True is requested so that elements can be unpacked as
# (review, label) pairs by the cells that follow.
imdb = tfds.load(name="imdb_reviews", as_supervised=True)

In [3]:
# Pull the training and test splits out of the loaded dataset dictionary.
train_data = imdb["train"]
test_data = imdb["test"]


In [4]:
# Split each (review, label) element into two parallel datasets per split:
# one holding only the review text and one holding only the label.

# Training split
train_reviews = train_data.map(lambda text, _: text)
train_labels = train_data.map(lambda _, target: target)

# Test split
test_reviews = test_data.map(lambda text, _: text)
test_labels = test_data.map(lambda _, target: target)


In [5]:
# Build a WordPiece subword vocabulary from the training reviews and write it
# to a text file; that file is what the tokenizer cell below loads.
keras_nlp.tokenizers.compute_word_piece_vocabulary(
    train_reviews,
    vocabulary_output_file="imdb_vocab_subwords.txt",
    vocabulary_size=8000,
    reserved_tokens=["[PAD]", "[UNK]"],
)

In [6]:
# Create the subword (WordPiece) tokenizer from the vocabulary file on disk.
subword_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary="./imdb_vocab_subwords.txt",
)

In [7]:
# Data pipeline and padding parameters.

shuffle_buffer_size = 10000
prefetch_buffer_size = tf.data.AUTOTUNE
# Backward-compatible alias: the original name was misspelled ("prefecth").
# It is kept so that any cell still referring to the old spelling keeps working.
prefecth_buffer_size = prefetch_buffer_size
batch_size = 256
padding_type = "pre"
truncating_type = "post"

In [8]:
def padding_func(sequences, maxlen=None):
    """Pad (and optionally truncate) a dataset of token sequences.

    All elements of `sequences` are gathered into a single ragged batch,
    padded to a common length with `tf.keras.utils.pad_sequences`, and then
    re-emitted as a dataset with one fixed-length row per input element.
    The signature is compatible with `Dataset.apply(padding_func)`.

    The padding and truncation sides are taken from the notebook-level
    `padding_type` and `truncating_type` settings.

    Args:
        sequences: a `tf.data.Dataset` of variable-length sequences. Its
            cardinality is used as the batch size, and the whole dataset is
            materialized in memory as a NumPy array.
        maxlen: optional length that every row is padded or truncated to.
            With the default `None`, rows are padded to the longest sequence
            and nothing is truncated, so `truncating_type` has no effect.

    Returns:
        A `tf.data.Dataset` yielding the padded sequences one row at a time.
    """
    # Collect the entire dataset into one ragged batch so that all sequences
    # can be padded against each other in a single call.
    ragged_batch = sequences.ragged_batch(batch_size=sequences.cardinality())
    all_sequences = ragged_batch.get_single_element()

    padded_sequences = tf.keras.utils.pad_sequences(
        all_sequences.numpy(),
        maxlen=maxlen,
        padding=padding_type,
        truncating=truncating_type
    )

    # Unbatch again: one padded row per dataset element.
    return tf.data.Dataset.from_tensor_slices(padded_sequences)

In [None]:
# Tokenize every review into subword ids, then pad the resulting sequences to
# a common length via padding_func.
train_sequences_subword = (
    train_reviews
    .map(lambda text: subword_tokenizer.tokenize(text))
    .apply(padding_func)
)
test_sequences_subword = (
    test_reviews
    .map(lambda text: subword_tokenizer.tokenize(text))
    .apply(padding_func)
)

# Pair each padded sequence with its label.

train_dataset_vectorized = tf.data.Dataset.zip(
    train_sequences_subword,
    train_labels,
)
test_dataset_vectorized