## Importing libraries

In [1]:
import tensorflow as tf
from tensorflow import keras
from pathlib import Path
import os
import numpy as np
from collections import Counter

## Loading dataset

In [2]:
DOWNLOAD_ROOT = 'http://ai.stanford.edu/~amaas/data/sentiment/'
FILENAME = 'aclImdb_v1.tar.gz'
filepath = keras.utils.get_file(FILENAME, DOWNLOAD_ROOT + FILENAME, extract = True)

In [3]:
path = Path(filepath).parent / 'aclImdb'

In [4]:
len(path.parts)

6

In [5]:
a = 1

In [6]:
# Displaying the filepaths
for name, subdirs, files in os.walk(path):
    indent = len(Path(name).parts) - len(path.parts)
    print('    ' * indent + Path(name).parts[-1] + os.sep)
    for index, filename in enumerate(sorted(files)):
        if index == 3:
            print('    ' * (indent + 1) + '...')
            break
        print('    ' * (indent + 1) + filename)    

aclImdb/
    README
    imdb.vocab
    imdbEr.txt
    test/
        labeledBow.feat
        urls_neg.txt
        urls_pos.txt
        neg/
            0_2.txt
            10000_4.txt
            10001_1.txt
            ...
        pos/
            0_10.txt
            10000_7.txt
            10001_9.txt
            ...
    train/
        labeledBow.feat
        unsupBow.feat
        urls_neg.txt
        ...
        neg/
            0_3.txt
            10000_4.txt
            10001_4.txt
            ...
        pos/
            0_9.txt
            10000_8.txt
            10001_10.txt
            ...
        unsup/
            0_0.txt
            10000_0.txt
            10001_0.txt
            ...


In [7]:
# Func to store the filepaths as strings
def review_paths(dirpath):
    return [str(path) for path in dirpath.glob('*.txt')]

In [8]:
train_pos = review_paths(path / 'train' / 'pos')
train_neg = review_paths(path / 'train' / 'neg')
test_valid_pos = review_paths(path / 'test' / 'pos')
test_valid_neg = review_paths(path / 'test' / 'neg')

In [9]:
len(train_pos), len(train_neg), len(test_valid_pos), len(test_valid_neg)

(12500, 12500, 12500, 12500)

## Splitting the test set into test and validation sets

In [10]:
np.random.shuffle(test_valid_pos)

In [11]:
test_pos = test_valid_pos[:5000]
valid_pos = test_valid_pos[5000:]
test_neg = test_valid_neg[:5000]
valid_neg = test_valid_neg[5000:]

In [12]:
len(test_pos), len(test_neg), len(valid_pos), len(valid_neg)

(5000, 5000, 7500, 7500)

## Use tf.data for creating a dataset for each set

In [13]:
# Since the dataset fits in memory, we can use a simple python func to create a dataset using the from_tenso_slices() method.
def imdb_dataset(filepaths_pos, filepaths_neg):
    reviews = []
    labels = []
    for filepaths, label in ((filepaths_neg, 0), (filepaths_pos, 1)):
        for filepath in filepaths:
            with open(filepath) as review_file:
                reviews.append(review_file.read())
            labels.append(label)
    return tf.data.Dataset.from_tensor_slices((tf.constant(reviews), tf.constant(labels))) # Creating the dataset from the tensors 

In [14]:
for x, y in imdb_dataset(train_pos, train_neg).take(3):
    print(x)
    print(y)
    print()

tf.Tensor(b'It\'s been a while since seeing this the first time, so I watched it again with the second movie in the series. While I realize there is a 3rd movie out that I haven\'t seen yet, I\'ll review under the original title...<br /><br />Just from the standpoint of production value, screen writing, and movie making, this movie fails on many levels, though it succeeds on a few as well. What can you expect from a low-budget, "B" movie? Not much, and it works from the standpoint of production. However, the writing is certainly disjointed, with little in the way of character development...exactly what I\'d expect when there is an agenda to a film. I didn\'t have a problem with the acting...the cast is solid; however, the screenplay in both movies gives the actors little opportunity to really stretch themselves. Because the film is "Christian," this is predictable, as you can\'t very well portray violent chaos of the "end times" without also breaking some of the ethics which are normal

In [15]:
%timeit -r1 for x, y in imdb_dataset(train_pos, train_neg).repeat(10): pass

23.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


- It took 16 secs to load the dataset and go through it 10 times.

- If the dataset didnt fit in memory we would have to load it using tf.data or convert it into TFRecord file. Luckily each review fits on just one line and is seperated by  `<br />` .

In [16]:
def imdb_dataset(filepaths_pos, filepaths_neg, n_parallel_threads = 5):
    dataset_neg = tf.data.TextLineDataset(filepaths_neg, num_parallel_reads = n_parallel_threads)
    dataset_neg = dataset_neg.map(lambda review : (review, 0))
    dataset_pos = tf.data.TextLineDataset(filepaths_pos, num_parallel_reads = n_parallel_threads)
    dataset_pos = dataset_pos.map(lambda review : (review, 1))
    return tf.data.Dataset.concatenate(dataset_pos, dataset_neg)

In [17]:
%timeit -r1 for x, y in imdb_dataset(train_pos, train_neg).repeat(10) : pass

1min 4s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


- It now takes 30 secs to do the same.This is much slower because the dataset is not cached in RAM and it must be loaded each time.

In [18]:
%timeit -r1 for x, y in imdb_dataset(train_pos, train_neg).cache().repeat(10) : pass

25.2 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


- The time taken has reduced as we cached the dataset in the RAM without loading it each time.

In [19]:
batch_size = 32
train_set = imdb_dataset(train_pos, train_neg).shuffle(25000).batch(batch_size).prefetch(1)
valid_set = imdb_dataset(valid_pos, valid_neg).batch(batch_size).prefetch(1)
test_set = imdb_dataset(test_pos, test_neg).batch(batch_size).prefetch(1)

- We will first define a func to prerpocess the data. It will crop them to 300 characters, converting them to lower cases then replacing the `<br />` and all other non-letter characters to spaces, splitting the reviews into words and finally padding or cropping each review so it ends up with exactly n_words tokens.

In [20]:
def preprocess(x_batch, n_words = 50):
    shape = tf.shape(x_batch) * tf.constant([1, 0]) + tf.constant([0, n_words])
    z = tf.strings.substr(x_batch, 0, 300) # Converting the main review to strings containing 300 characters.
    z = tf.strings.lower(z) # converting the text to lower case
    z = tf.strings.regex_replace(z, b'<br\\s*/?>', b' ') # Converting the non-letter characters to spaces.
    z = tf.strings.regex_replace(z, b'[^a-z]', b' ')
    z = tf.strings.split(z)
    return z.to_tensor(shape = shape, default_value = b'<pad>')

In [82]:
x_example = tf.constant(["It's a great, great movie! I loved it.", "It was terrible, run away!!!"])

In [108]:
x_example

<tf.Tensor: shape=(2,), dtype=string, numpy=
array([b"It's a great, great movie! I loved it.",
       b'It was terrible, run away!!!'], dtype=object)>

In [84]:
preprocess(x_example)

<tf.Tensor: shape=(2, 50), dtype=string, numpy=
array([[b'it', b's', b'a', b'great', b'great', b'movie', b'i', b'loved',
        b'it', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>'],
       [b'it', b'was', b'terrible', b'run', b'away', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'

In [85]:
# Utility func that will take a data sample and will output the list of the top max_size most frequent words.
def get_vocabulary(data_sample, max_size = 1000):
    preprocessed_reviews = preprocess(data_sample).numpy() # Preprocessing the input using the func defined above.
    counter = Counter() # Initializing the counter
    for words in preprocessed_reviews:
        for word in words:
            if word != b'<pad>':
                counter[word] += 1
    return [b'<pad>'] + [word for word, count in counter.most_common(max_size)]

In [86]:
get_vocabulary(x_example)

[b'<pad>',
 b'it',
 b'great',
 b's',
 b'a',
 b'movie',
 b'i',
 b'loved',
 b'was',
 b'terrible',
 b'run',
 b'away']

## Building the custom layers

In [87]:
# Custom layer class to convert the text to vectors by the help of lookup tables.
class Text_vectorization(keras.layers.Layer):
    def __init__(self, max_vocabulary_size = 1000, n_oov_buckets = 100, dtype = tf.string, **kwargs): # Init method
        super().__init__(dtype = dtype, **kwargs)
        self.max_vocabulary_size = max_vocabulary_size # Maximum size of the vocabulary
        self.n_oov_buckets = n_oov_buckets # Size of out-of-vocabulary buckets
    def adapt(self, data_sample):
        self.vocab = get_vocabulary(data_sample, self.max_vocabulary_size) # Creating the vocabulary
        words = tf.constant(self.vocab) # Converting the vocabulary to a tensor
        word_ids = tf.range(len(self.vocab), dtype = tf.int64) # Initializing the indices for the lookup table
        vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids) # Creating teh Key Value Tensor
        self.table = tf.lookup.StaticVocabularyTable(vocab_init, self.n_oov_buckets) # Creating the lookup table
    def call(self, inputs):
        preprocessed_inputs = preprocess(inputs) # Preprocessing the inputs
        return self.table.lookup(preprocessed_inputs) # Returning the vectorized texts.

In [88]:
text_vectorization = Text_vectorization()

In [89]:
text_vectorization.adapt(x_example)

In [90]:
text_vectorization(x_example)

<tf.Tensor: shape=(2, 50), dtype=int64, numpy=
array([[ 1,  3,  4,  2,  2,  5,  6,  7,  1,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 1,  8,  9, 10, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0]])>

- We can see that each review was cleaned up and tokenized, then each word was encoded as its index in the vocabulary. All the 0s correspond to the `<pad>` token.
- Now create another instance of the Text_vectorization() layer and adapt it to the entire IMDB dataset.

In [91]:
max_vocabulary_size = 1000
n_oov_buckets = 100

In [92]:
sample_review_batches = train_set.map(lambda review, label : review) # Extracting only the reviews out the dataset

In [93]:
sample_reviews = np.concatenate(list(sample_review_batches.as_numpy_iterator()), axis = 0) # Creating a numpy array of the reviews

In [109]:
text_vectorization = Text_vectorization(max_vocabulary_size, n_oov_buckets, input_shape = [])

In [110]:
text_vectorization.adapt(sample_reviews)

In [111]:
text_vectorization(x_example)

<tf.Tensor: shape=(2, 50), dtype=int64, numpy=
array([[  9,  14,   2,  64,  64,  12,   5, 257,   9,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  9,  13, 269, 530, 335,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])>

In [112]:
text_vectorization.vocab[:10]

[b'<pad>', b'the', b'a', b'of', b'and', b'i', b'to', b'is', b'this', b'it']

- The above are the top 10 used words in the dataset.

- Now we have to encode these word ids before feeding to the model. One method is to  use bag of words, for each review and for each word in the vocabulary we count the no. of occurences of that word in that review.

In [113]:
simple_example = tf.constant([[1, 3, 1, 0, 0], [2, 2, 0, 0, 0]])
tf.reduce_sum(tf.one_hot(simple_example, 4), axis = 1)

<tf.Tensor: shape=(2, 4), dtype=float32, numpy=
array([[2., 2., 0., 1.],
       [3., 0., 2., 0.]], dtype=float32)>

- The first review has the word zero 2 times, one 2 times, two 0 times, 3 one time. So we have denoted the count respectively. The same applies for the seconf part.

In [114]:
tf.one_hot(simple_example, 4)

<tf.Tensor: shape=(2, 5, 4), dtype=float32, numpy=
array([[[0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.]],

       [[0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.]]], dtype=float32)>

In [115]:
class Bag_of_words(keras.layers.Layer):
    def __init__(self, n_tokens, dtype = tf.int32, **kwargs):
        super().__init__(dtype = dtype, **kwargs)
        self.n_tokens = n_tokens
    def call(self, inputs):
        one_hot = tf.one_hot(inputs, self.n_tokens) # One-hot encoding the review
        return tf.reduce_sum(one_hot, axis = 1)[:, 1:] # Omitting the <pad> as it occurs the most

In [116]:
bag_of_words = Bag_of_words(n_tokens = 4)
bag_of_words(simple_example)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[2., 0., 1.],
       [0., 2., 0.]], dtype=float32)>

- It works fine. Now create another instance with the size required for the train set

In [117]:
n_tokens = max_vocabulary_size + n_oov_buckets + 1

In [118]:
bag_of_words = Bag_of_words(n_tokens)

- Now we can build and train the model

In [119]:
model = keras.models.Sequential()
model.add(text_vectorization)
model.add(bag_of_words)
model.add(keras.layers.Dense(100, activation = 'relu'))
model.add(keras.layers.Dense(1, activation = 'sigmoid'))

In [120]:
model.compile(loss = keras.losses.binary_crossentropy, optimizer = 'nadam', metrics = ['accuracy'])

In [121]:
model.fit(train_set, epochs = 5, validation_data = valid_set)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fe815748e50>

In [107]:
for x, y in train_set.take(1):
    print(x)

tf.Tensor(
[b'I was honestly surprised by Alone in the Dark. It was so bad, I could hardly believe what I was seeing. There are no characters, just a few stereotypes wandering around and getting killed. The extent of the character development was giving each character a name and an occupation, and that\'s about it. There was no real plot, and none of the characters seemed to have any motivation. In fact, many action scenes just began on their own, coming from nowhere with a pounding techno track. While I was watching this movie I kept asking "Where is this happening? What\'s going on?" The acting was high school drama quality, with stiff wooden delivery, as though the actors were reading from cue cards without comprehending their lines. Their trouble delivering lines was made even more obvious by horrible sound design. ADR sounded like it was recorded in an open room. The actors were constantly taking obvious care to hit their marks, looking almost robotic in their movements. So, these