### Single Layer LSTM

In [3]:
#importing libraries

import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import keras_nlp

  from pkg_resources import parse_version


In [4]:
#fetch the IMDB reviews dataset; as_supervised=True makes each example a (review, label) pair
imdb = tfds.load(name="imdb_reviews", as_supervised=True)

In [5]:
#pull the train and test splits out of the loaded dataset
train_data = imdb['train']
test_data = imdb['test']


In [6]:
#separate each split into a reviews-only dataset and a labels-only dataset
def _keep_review(review, label):
    """Return only the review from a (review, label) pair."""
    return review


def _keep_label(review, label):
    """Return only the label from a (review, label) pair."""
    return label


train_reviews = train_data.map(_keep_review)
train_labels = train_data.map(_keep_label)

test_reviews = test_data.map(_keep_review)
test_labels = test_data.map(_keep_label)


In [7]:
#learn a WordPiece subword vocabulary from the training reviews and write it to a text file
vocab_options = dict(
    vocabulary_size=8000,
    reserved_tokens=["[PAD]", "[UNK]"],
    vocabulary_output_file='imdb_vocab_subwords.txt',
)
keras_nlp.tokenizers.compute_word_piece_vocabulary(train_reviews, **vocab_options)

In [8]:
#create the subword tokenizer from the vocabulary file
vocab_path = "./imdb_vocab_subwords.txt"
subword_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab_path)

In [9]:
#settings for the input pipeline (batching, shuffling, prefetching)
batch_size = 256
shuffle_buffer_size = 10000
prefetch_buffer_size = tf.data.AUTOTUNE  #let tf.data pick the prefetch depth at runtime

#padding / truncation sides
#NOTE(review): these two do not appear to be referenced by the cells shown here - confirm they are used
padding_type, truncating_type = "pre", "post"

In [10]:
#generate integer sequences using the subword tokenizer
def _tokenize_review(review):
    """Convert one raw review string into its sequence of subword token ids."""
    return subword_tokenizer.tokenize(review)


def _make_sequence_batches(reviews):
    """Tokenize every review in `reviews` and group them into padded batches.

    Each batch holds up to `batch_size` sequences, padded at the end to the
    length of the longest sequence in that batch. Nothing is truncated.
    """
    return (reviews
            .map(_tokenize_review)
            .padded_batch(batch_size=batch_size, padded_shapes=[None]))


train_sequences_subword = _make_sequence_batches(train_reviews)
test_sequences_subword = _make_sequence_batches(test_reviews)

#give every label a trailing axis (shape () -> (1,)).
#Derived from train_data/test_data instead of from train_labels/test_labels
#themselves, so re-running this cell does not keep stacking extra axes.
train_labels = train_data.map(lambda review, label: tf.expand_dims(label, -1))
test_labels = test_data.map(lambda review, label: tf.expand_dims(label, -1))

#batch the labels with the SAME batch size as the sequences.
#Without this, zip would pair each batch of `batch_size` reviews with one
#single label, so the model would train on misaligned targets.
train_label_batches = train_labels.batch(batch_size)
test_label_batches = test_labels.batch(batch_size)

#combine the integer sequence batches with their matching label batches
train_dataset_vectorized = tf.data.Dataset.zip((train_sequences_subword, train_label_batches))
test_dataset_vectorized = tf.data.Dataset.zip((test_sequences_subword, test_label_batches))

#optimizing the datasets for training
#cache first so tokenization runs only once; elements are whole batches here,
#so shuffle changes the order of batches between epochs.
train_dataset_final = (train_dataset_vectorized
                        .cache()
                        .shuffle(shuffle_buffer_size)
                        .prefetch(prefetch_buffer_size)
                        )

test_dataset_final = (test_dataset_vectorized
                        .cache()
                        .prefetch(prefetch_buffer_size)
                        )


In [11]:
#model hyperparameters
embedding_dim = 64
lstm_dim = 64
dense_dim = 64

#one embedding row per entry in the tokenizer vocabulary
vocab_size = subword_tokenizer.vocabulary_size()

#variable-length integer sequences -> embedding -> bidirectional LSTM -> dense -> sigmoid output
model_layers = [
    tf.keras.Input(shape=(None,), dtype=tf.int32),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_dim)),
    tf.keras.layers.Dense(dense_dim, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
]
model = tf.keras.Sequential(model_layers)

#print the layer-by-layer overview
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          488640    
                                                                 
 bidirectional (Bidirectiona  (None, 128)              66048     
 l)                                                              
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 563,009
Trainable params: 563,009
Non-trainable params: 0
_________________________________________________________________


In [12]:
#configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
training_config = {
    "optimizer": "adam",
    "loss": "binary_crossentropy",
    "metrics": ['accuracy'],
}
model.compile(**training_config)

In [None]:
#fit on the training dataset for 3 epochs, using the test dataset as validation data
history = model.fit(
    x=train_dataset_final,
    validation_data=test_dataset_final,
    epochs=3,
)

Epoch 1/3