# Importing libraries

In [None]:
import tensorflow as tf
!pip install -q tensorflow_datasets
import tensorflow_datasets as tfds
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Loading the data

In [None]:
# Load the IMDB reviews dataset. as_supervised=True makes each split yield
# (text, label) pairs; with_info=True additionally returns the dataset metadata.
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete9PLC3A/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete9PLC3A/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete9PLC3A/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
# Keep only the 'train' and 'test' splits of the loaded dataset.
train_data, test_data = imdb['train'], imdb['test']

In [None]:
# Pull the review text and its label out of the TF datasets into plain Python lists,
# which is the input format the Keras Tokenizer works with.
training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

# s.numpy() is a bytes object. Decode it to text instead of calling str() on it:
# str(bytes) yields the repr "b'...'" (leading b', trailing quote, backslash
# escapes), which leaks bogus tokens such as "b'this" into the vocabulary.
for s, l in train_data:
    training_sentences.append(s.numpy().decode('utf-8'))
    training_labels.append(l.numpy())

for s, l in test_data:
    testing_sentences.append(s.numpy().decode('utf-8'))
    testing_labels.append(l.numpy())

In [None]:
# Convert the Python lists of labels into NumPy arrays for use as training/validation targets.
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

# Preprocessing the data

In [None]:
# Hyperparameters shared by the tokenizer, the padding step and the model.
# vocab_size: passed as num_words to the Tokenizer and as the Embedding input size.
vocab_size = 10000
# embedding_dim: length of each embedding vector.
embedding_dim = 16
# max_length: length every encoded review is padded/truncated to.
max_length = 120
# trunc_type: passed as `truncating` to pad_sequences.
trunc_type = 'post'
# oov_tok: token the Tokenizer uses for out-of-vocabulary words.
oov_tok = "<OOV>"

In [None]:
# Build the word index from the training reviews only, so the test set stays unseen.
# Words outside the vocabulary are encoded as oov_tok.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Encode every review as a list of integer word indices.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# Pad/truncate both splits to max_length with the SAME truncation side. Previously
# only the training set used trunc_type, so long test reviews were cut from the
# front (the pad_sequences default) while long training reviews were cut from the end.
training_padded = pad_sequences(training_sequences, maxlen=max_length,
                                truncating=trunc_type)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length,
                               truncating=trunc_type)

Things to Note: 
1.   The tokenizer object is first defined by passing the vocabulary size and the out-of-vocabulary token that is substituted for any word absent from the vocabulary.
2.   The training sentences are passed to the `fit_on_texts` method, which creates a word index that is then used to convert words into sequences of numbers.
3.  The sequences of both training and testing data are produced, which are then padded to obtain a matrix with rows of the same size. Different options are available in the arguments of the padding function.







In [None]:
# Inspect the first padded training sequence.
training_padded[0]

array([   0,    0,   59,   12,   14,   35,  439,  400,   18,  174,   29,
          1,    9,   33, 1378, 3401,   42,  496,    1,  197,   25,   88,
        156,   19,   12,  211,  340,   29,   70,  248,  213,    9,  486,
         62,   70,   88,  116,   99,   24, 5740,   12, 3317,  657,  777,
         12,   18,    7,   35,  406, 8228,  178, 2477,  426,    2,   92,
       1253,  140,   72,  149,   55,    2,    1, 7525,   72,  229,   70,
       2962,   16,    1, 2880,    1,    1, 1506, 4998,    3,   40, 3947,
        119, 1608,   17, 3401,   14,  163,   19,    4, 1253,  927, 7986,
          9,    4,   18,   13,   14, 4200,    5,  102,  148, 1237,   11,
        240,  692,   13,   44,   25,  101,   39,   12, 7232,    1,   39,
       1378,    1,   52,  409,   11,   99, 1214,  874,  145,   10],
      dtype=int32)

# Defining the neural network

In [None]:
# Stack the network layer by layer: embedding lookup -> flatten -> small dense
# hidden layer -> single sigmoid unit for the binary sentiment prediction.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

Here, the embedding layer holds a weight matrix of dimension (vocabulary size x embedding dimension), while each input is a single row of size (1 x max length). When a row of training data (a sentence that has been encoded into numbers) is passed through this layer, each index in the input looks up its corresponding embedding vector, and only those vectors are updated for that input.

In [None]:
# Configure training with binary cross-entropy loss, the Adam optimizer and
# accuracy as the reported metric, then print the layer/parameter summary.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 120, 16)           160000    
                                                                 
 flatten_1 (Flatten)         (None, 1920)              0         
                                                                 
 dense_2 (Dense)             (None, 6)                 11526     
                                                                 
 dense_3 (Dense)             (None, 1)                 7         
                                                                 
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


Things to Note :

1.   The output shape is therefore (120 x 16), and not (10000 x 16), even though the layer holds 10000 embedding vectors, one for each word in the vocabulary.
2.   In place of Flatten(), we can use GlobalAveragePooling1D, which averages over the 120 input positions (and not over the 16 embedding features) to give an output layer of size 16. This makes the model simpler and faster to train; however, it can give lower accuracy than using Flatten().



# Training the model

In [None]:
# Train for num_epochs epochs on the padded training reviews, using the padded
# test reviews and their labels as validation data.
num_epochs = 10
model.fit(training_padded,
          training_labels_final,
          epochs=num_epochs,
          validation_data=(testing_padded, testing_labels_final))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f1822ddd890>

In [None]:
# The first layer of the model is the Embedding layer; its first weight array is
# the embedding matrix, with one row per vocabulary index.
e = model.layers[0]
weights = e.get_weights()[0]
weights.shape

(10000, 16)

Here, we can see that there are 10000 weight vectors in the embedding layer, each of size 16. These are the embedding vectors that are unique to each word in the vocabulary.

# Visualizing the trained result

In [None]:
# Invert the tokenizer's word -> index mapping so encoded reviews can be read back.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
  """Turn a sequence of word indices back into a space-separated string.

  Indices missing from reverse_word_index are rendered as '?'.
  """
  words = (reverse_word_index.get(token_id, '?') for token_id in text)
  return ' '.join(words)

# Compare a decoded (tokenized and padded) review with its original text.
print(decode_review(training_padded[3]))
print(training_sentences[3])

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? b'this is the kind of film for a snowy sunday afternoon when the rest of the world can go ahead with its own business as you <OOV> into a big arm chair and <OOV> for a couple of hours wonderful performances from cher and nicolas cage as always gently row the plot along there are no <OOV> to cross no dangerous waters just a warm and witty <OOV> through new york life at its best a family film in every sense and one that deserves the praise it received '
b'This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful performances from Cher and Nicolas Cage (as always) gently row the plot along. There are no rapids to cross, no dangerous waters, just a warm and witty paddle through New York life at its best. A family film in every sense and one that deserves the praise it received.'


In [None]:
import io

# Export the learned embeddings for the TensorFlow Embedding Projector:
# vesc.tsv gets one tab-separated vector per line, meta.tsv the matching word per line.

# Word indices start at 1, so begin there. Stop at the end of the word index as well,
# so a corpus with fewer than vocab_size distinct words does not raise KeyError.
last_word_num = min(vocab_size, len(reverse_word_index) + 1)

# `with` guarantees both files are flushed and closed even if the loop raises.
# The metadata file previously used the misspelled encoding name 'utf=8'.
with io.open('vesc.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  for word_num in range(1, last_word_num):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

Each word and its corresponding embedding vector are written one by one into .tsv files (`meta.tsv` for the words, `vesc.tsv` for the vectors) that can be loaded into the embedding projector provided by [TensorFlow](https://projector.tensorflow.org/).

In [None]:
# Flip to True to download the exported .tsv files when running inside Google Colab.
download = False
if download:
  try:
    from google.colab import files
  except ImportError:
    # Not running in Colab: there is nothing to download through the browser.
    pass
  else:
    for tsv_name in ('vesc.tsv', 'meta.tsv'):
      files.download(tsv_name)