In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras 
tf.random.set_seed(42)

import tensorflow_datasets as tfds
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

In [2]:
print(datasets.keys())

dict_keys(['test', 'train', 'unsupervised'])


In [3]:
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=1.0.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
      title     = {Learning Word

In [4]:
info.splits["train"].num_examples

25000

In [5]:
train_size = info.splits["train"].num_examples
test_size = info.splits["test"].num_examples
print(train_size , test_size)

25000 25000


In [6]:
# traversing through the batches and showing first 200 characters of review and label of the first batch data samples:
for X_batch, y_batch in datasets["train"].batch(3).take(1):
    for review, label in zip(X_batch.numpy(), y_batch.numpy()):
        print("Review:", review.decode("utf-8")[:200], "...")
        print("Label:", label, "= Positive" if label else "= Negative")
        print()

Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting  ...
Label: 0 = Negative

Review: I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However  ...
Label: 0 = Negative

Review: Mann photographs the Alberta Rocky Mountains in a superb fashion, and Jimmy Stewart and Walter Brennan give enjoyable performances as they always seem to do. <br /><br />But come on Hollywood - a Moun ...
Label: 0 = Negative



In [7]:
'''Truncate the reviews, keeping only the first 300 characters of each since you can generally tell whether a review is positive or not in the first sentence or two.
Then we use regular expressions to replace <br/> tags with spaces and to replace any characters other than letters and quotes with spaces.
Finally, the preprocess() function splits the reviews by the spaces, which returns a ragged tensor(variable length tensor), 
and it converts this ragged tensor to a dense tensor(store values in a contiguous sequential block of memory), 
padding all reviews with the padding token <pad> so that they all have the same length.
'''
# input Tensor => X_batch
def preprocess(X_batch, y_batch):
    X_batch = tf.strings.substr(X_batch, 0, 300)
    X_batch = tf.strings.regex_replace(X_batch, rb"<br\s*/?>", b" ")
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
    X_batch = tf.strings.split(X_batch)
    return X_batch.to_tensor(default_value=b"<pad>"), y_batch

preprocess(X_batch, y_batch)

(<tf.Tensor: shape=(3, 53), dtype=string, numpy=
 array([[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie',
         b"Don't", b'be', b'lured', b'in', b'by', b'Christopher',
         b'Walken', b'or', b'Michael', b'Ironside', b'Both', b'are',
         b'great', b'actors', b'but', b'this', b'must', b'simply', b'be',
         b'their', b'worst', b'role', b'in', b'history', b'Even',
         b'their', b'great', b'acting', b'could', b'not', b'redeem',
         b'this', b"movie's", b'ridiculous', b'storyline', b'This',
         b'movie', b'is', b'an', b'early', b'nineties', b'US',
         b'propaganda', b'pi', b'<pad>', b'<pad>', b'<pad>'],
        [b'I', b'have', b'been', b'known', b'to', b'fall', b'asleep',
         b'during', b'films', b'but', b'this', b'is', b'usually', b'due',
         b'to', b'a', b'combination', b'of', b'things', b'including',
         b'really', b'tired', b'being', b'warm', b'and', b'comfortable',
         b'on', b'the', b'sette', b'and', b'having', b'j

In [8]:
from collections import Counter
vocabulary = Counter()

#make a vocabulary dictionary containing the words and their counts correspondingly
for X_batch, y_batch in datasets["train"].batch(2).map(preprocess):
    for review in X_batch:
        vocabulary.update(list(review.numpy()))

vocabulary.most_common()[:5]

[(b'<pad>', 63155),
 (b'the', 61137),
 (b'a', 38564),
 (b'of', 33983),
 (b'and', 33431)]

In [9]:
type(vocabulary)

collections.Counter

In [10]:
ignore = [b'the',b'a',b'if',b'in',b'it',b'of',b'or',b'the',b'and',b'to']
for word in list(vocabulary):
    if word in ignore:
        del vocabulary[word]

In [11]:
#Deleting the most frequent stop words
vocabulary = Counter({k:vocabulary[k] for k in vocabulary if vocabulary[k] < 5665})

In [12]:
vocabulary.most_common()[:20]

[(b'you', 5500),
 (b'an', 5183),
 (b'at', 4814),
 (b'about', 4799),
 (b'by', 4717),
 (b'all', 4655),
 (b'his', 4625),
 (b'so', 4521),
 (b'like', 4428),
 (b'from', 4321),
 (b'who', 4266),
 (b'has', 4178),
 (b'It', 4038),
 (b'good', 3727),
 (b'my', 3662),
 (b'just', 3636),
 (b'very', 3571),
 (b'out', 3376),
 (b'story', 3211),
 (b'some', 3197)]

In [13]:
print(len(vocabulary))

53864


In [14]:
#Truncating the Vocabulary - keep only top 10k
vocab_size = 10000
truncated_vocabulary = [ word for word, count in vocabulary.most_common()[:vocab_size]]

#Creating a lookup table
#Create a tensor words containing the words of truncated_vocabulary
words  = tf.constant(truncated_vocabulary)

In [15]:
'''Computer can only process numbers but not words. Thus we need to convert the words in truncated_vocabulary into numbers.
So we now need to add a preprocessing step to replace each word with its ID (i.e., its index in the truncated_vocabulary).
We will create a lookup table for this, using 1,000 out-of-vocabulary (oov) buckets.
We shall create the lookup table such that the most frequently occurring words have lower indices than less frequently
occurring words.'''
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)
table.lookup(tf.constant([b"This movie was faaaaaantastic".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[10745, 10345, 10637, 10053]])>

In [16]:
def encode_words(X_batch, y_batch):
    return table.lookup(X_batch), y_batch

#Apply the function preprocess on every batch of data with 32 samples repeatedly on the train data datasets["train"]
train_set = datasets["train"].repeat().batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)
test_set = datasets["test"].batch(1000).map(preprocess)
test_set = test_set.map(encode_words)

In [17]:
for X_batch, y_batch in train_set.take(1):
    print(X_batch)
    print(y_batch)

tf.Tensor(
[[10745 10637     1 ... 10403 10403 10403]
 [10180 10850    42 ... 10403 10403 10403]
 [ 4070  6852 10354 ... 10403 10403 10403]
 ...
 [10745 10345    89 ...   302  1018 10403]
 [ 1728  4072   422 ... 10403 10403 10403]
 [ 3336  4363 10180 ... 10403 10403 10403]], shape=(32, 60), dtype=int64)
tf.Tensor([0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0], shape=(32,), dtype=int64)


In [18]:
#Building the Model
'''The first layer is an Embedding layer, which will convert word IDs into embeddings. 
The embedding matrix needs to have one row per word ID (vocab_size + num_oov_buckets) and one column per embedding 
dimension (this example uses 128 dimensions, but this is a hyperparameter you could tune).
Whereas the inputs of the model will be 2D tensors of shape [batch size, time steps], the output of the 
Embedding layer will be a 3D tensor of shape [batch size, time steps, embedding size].'''

'''Create the model model with Embedding layer,GRU layer with 4 units
GRU layer with 2 units,Dense layer with 1 unit and sigmoid activation'''
embed_size = 128


model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
               mask_zero=True,
               input_shape=[None]),
    keras.layers.GRU(4, return_sequences=True),
    keras.layers.GRU(2),
    keras.layers.Dense(1, activation="sigmoid")
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [19]:
import time
start  = time.time()
model.fit(train_set, steps_per_epoch=train_size // 32, epochs=2)
end  = time.time()
print("Time of execution:", end-start)



Train for 781 steps
Epoch 1/2
Epoch 2/2
Time of execution: 155.60411405563354


In [20]:
model.evaluate(test_set)

     25/Unknown - 5s 189ms/step - loss: 0.5195 - accuracy: 0.7641

[0.5194795954227448, 0.76408]

In [42]:
y_pred = model.predict(test_set)

In [36]:
y_pred

array([[0.173173  ],
       [0.61271054],
       [0.05875764],
       ...,
       [0.04878023],
       [0.90365326],
       [0.882887  ]], dtype=float32)

In [51]:
'''Remember, we mapped the positive outputs to 1 and the negative outputs to 0. 
However, the sigmoid function predicts floating value between 0 and 1. 
If the value is less than 0.5, the sentiment is considered negative where as if the value is greater than 0.5, 
the sentiment is considered as positive.'''
classes = [0 if i <0.5  else 1 for i in y_pred]


In [52]:
c=0
for X_batch, y_batch in datasets["test"].batch(10).take(1):
    for review, label in zip(X_batch.numpy(), y_batch.numpy()):
        print("Review:", review.decode("utf-8")[:200], "...")
        print("Label:", label, "= Positive" if label else "= Negative")
        print("Predicted Label:", y_pred[c], "Predicted Class", classes[c])
        print()
        c = c+1

Review: There are films that make careers. For George Romero, it was NIGHT OF THE LIVING DEAD; for Kevin Smith, CLERKS; for Robert Rodriguez, EL MARIACHI. Add to that list Onur Tukel's absolutely amazing DING ...
Label: 1 = Positive
Predicted Label: [0.173173] Predicted Class 0

Review: A blackly comic tale of a down-trodden priest, Nazarin showcases the economy that Luis Bunuel was able to achieve in being able to tell a deeply humanist fable with a minimum of fuss. As an output fro ...
Label: 1 = Positive
Predicted Label: [0.61271054] Predicted Class 1

Review: Scary Movie 1-4, Epic Movie, Date Movie, Meet the Spartans, Not another Teen Movie and Another Gay Movie. Making "Superhero Movie" the eleventh in a series that single handily ruined the parody genre. ...
Label: 0 = Negative
Predicted Label: [0.05875764] Predicted Class 0

Review: Poor Shirley MacLaine tries hard to lend some gravitas to this mawkish, gag-inducing "feel-good" movie, but she's trampled by the run-away sentiment