In [1]:
import tensorflow as tf

In [2]:
# Show which TensorFlow version this notebook is running against.
tf.__version__

'2.1.0'

In [3]:
import tensorflow_datasets as tfds

In [4]:
# Load the IMDB reviews dataset together with its metadata.
# as_supervised=True makes every example a (text, label) pair.
imdb, info = tfds.load(
    "imdb_reviews",
    with_info=True,
    as_supervised=True,
)

In [5]:
import numpy as np

In [6]:
# The dataset is split into 25000 samples for training and 25000 for testing.
train_data = imdb['train']
test_data = imdb['test']

In [8]:
# Build plain Python lists of sentences and labels from the dataset splits.

# Re-bind the splits here so this cell does not depend on kernel state
# left behind by an earlier cell.
train_data, test_data = imdb['train'], imdb['test']


def extract_sentences_and_labels(dataset):
    """Split a dataset of (text, label) tensor pairs into two Python lists.

    Args:
        dataset: Iterable yielding (sentence, label) pairs whose elements
            expose ``.numpy()``; the sentence is UTF-8 encoded bytes.

    Returns:
        A ``(sentences, labels)`` tuple: a list of decoded ``str`` sentences
        and a list of the labels converted with ``.numpy()``.
    """
    sentences = []
    labels = []
    for sentence_tensor, label_tensor in dataset:
        # Elements are tensors, so convert them to NumPy values first.
        sentences.append(sentence_tensor.numpy().decode('utf8'))
        labels.append(label_tensor.numpy())
    return sentences, labels


training_sentences, training_labels = extract_sentences_and_labels(train_data)
testing_sentences, testing_labels = extract_sentences_and_labels(test_data)

# Labels are expected to be arrays rather than Python lists.
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [9]:
# Settings for tokenizing and padding our sentences

vocab_size = 10000      # passed to the Tokenizer as num_words and to the Embedding layer
embedding_dim = 16      # length of the vector learned for each word
max_length = 120        # every review is padded/truncated to this many tokens
trunc_type = 'post'     # value for pad_sequences' truncating argument
oov_tok = "<OOV>"       # placeholder token for out-of-vocabulary words

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
# Tokenizer limited to vocab_size words; unknown words map to oov_tok.
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
)

In [13]:
# Build the vocabulary from the training sentences only.
tokenizer.fit_on_texts(training_sentences)

In [14]:
# Mapping of word -> integer id produced by fitting the tokenizer.
word_index = tokenizer.word_index

In [15]:
# Printing the whole vocabulary floods the notebook output, so show its
# size and only the first few entries instead.
print(len(word_index))
print(dict(list(word_index.items())[:10]))



In [16]:
# texts_to_sequences converts each sentence into a list of integer ids,
# using the ids stored in word_index.

sequences = tokenizer.texts_to_sequences(training_sentences)

In [17]:
# pad_sequences brings every sentence to the same length (max_length = 120)
# and stacks them into a single matrix. Reviews longer than max_length are
# cut according to trunc_type.

padded = pad_sequences(
    sequences,
    maxlen=max_length,
    truncating=trunc_type,
)

In [18]:
# Display the padded training matrix (one row per review).
padded

array([[   0,    0,    0, ..., 1748,   31,  116],
       [ 379,   86,   11, ...,    2,   18,   16],
       [3097,    5,    2, ...,   12,  703,   23],
       ...,
       [  12,   20,    7, ...,   14,    1,  198],
       [  16,    4,   91, ...,   93,  113, 1555],
       [   4,   76,   29, ...,  381, 1269,    3]])

In [19]:
# Apply the same preprocessing to the testing data.
# Testing sentences are encoded with the word_index generated from the
# training data, and must be truncated the same way as the training set.
# Without truncating=trunc_type, pad_sequences falls back to its default
# 'pre' truncation, so long test reviews would keep a different part of the
# text than the training reviews did.

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    truncating=trunc_type,
)


In [20]:
# Invert word_index so that integer ids map back to words.
#
#   word_index:          hello -> 1, world -> 2
#   reverse_word_index:  1 -> hello, 2 -> world
#
# With the inverted mapping we can turn an encoded sequence back into a
# readable sentence.

reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text, index_to_word=None):
    """Turn a sequence of token ids back into a space-separated string.

    Args:
        text: Iterable of integer token ids (for example one row of
            ``padded``).
        index_to_word: Optional mapping of id -> word. When omitted, the
            notebook-level ``reverse_word_index`` is used.

    Returns:
        The decoded words joined by single spaces. Ids that are missing
        from the mapping (such as the padding value 0) are shown as '?'.
    """
    if index_to_word is None:
        index_to_word = reverse_word_index
    return ' '.join(index_to_word.get(i, '?') for i in text)

# Show the decoded (tokenized and padded) review, then the raw text it came from.
print(decode_review(padded[3]), training_sentences[3], sep='\n')

? ? ? ? ? for me this is a story that starts with some funny jokes regarding <OOV> <OOV> when he is travelling with a <OOV> and when he is sitting in business <OOV> the problem is that when you have been watching this movie for an hour you will see the same fantasies funny situations again and again and again it is to predictable it is more done as a tv story where you can go away and come back without missing anything br br i like felix <OOV> as frank but that is not enough even when it is a comedy it has to have more variations and some kind of message to it's audience br br
For me this is a story that starts with some funny jokes regarding Franks fanatasies when he is travelling with a staircase and when he is sitting in business meetings... The problem is that when you have been watching this movie for an hour you will see the same fantasies/funny situations again and again and again. It is to predictable. It is more done as a TV story where you can go away and come back without mi

In [21]:
# Define the neural network, starting with the embedding layer.

model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_length,
    )
)



USE OF THE EMBEDDING LAYER

Words often have similar meanings and should therefore sit close to each other, like "dull" and "boring", or "fun" and "exciting".
So we can represent each word as a vector in a higher-dimensional space (say 16 dimensions), where words that have the same meaning are given similar vectors.
Then, over time, such words begin to cluster together.
The meaning of the words comes from the labels of the dataset.

The Embedding layer takes these words and assigns similar vectors to words associated with similar sentiments. For each sentence it produces a 2D array whose dimensions are the length of the sentence and the embedding dimension, i.e. 16.



In [22]:
# The embedding layer produces a 2D array per review, so we flatten it into
# a 1D vector before it can be sent to the dense layers.
# NOTE: model.add mutates the model, so re-running this cell appends another layer.

model.add(tf.keras.layers.Flatten())

# A GlobalAveragePooling1D() layer could be used here instead.

In [23]:
# Hidden dense layer with 6 units and ReLU activation.
model.add(tf.keras.layers.Dense(6, activation = 'relu'))

In [24]:
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# There are only two possible outcomes, negative or positive,
# hence we use 1 neuron with sigmoid as the activation function.


In [25]:
# Binary classification: binary cross-entropy loss, Adam optimizer, track accuracy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [26]:
model.summary()

# Embedding layer output shape = 120 x 16 = max sentence length x embedding dimension (a 2D array per review).
# Flatten turns that into a 1D array of size 120 * 16 = 1920.

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Train for 10 epochs, using the testing data as the validation set.
# Here the accuracy reached 1, which is a sign of overfitting.
model.fit(
    x=padded,
    y=training_labels_final,
    epochs=10,
    validation_data=(testing_padded, testing_labels_final),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x206f61f5f08>

In [28]:
# Inspect the learned embeddings.

e = model.layers[0]  # first layer of the model, i.e. the Embedding layer
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)


# Each vocabulary entry is represented by a 16-dimensional vector.

(10000, 16)


In [29]:
# Export the embeddings so the vectors can be visualized.
# vecs.tsv gets one line per word holding its tab-separated embedding values;
# meta.tsv gets the matching word on the same line number.

import io

# Context managers make sure both files are flushed and closed even if a
# lookup or write raises part-way through the loop.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    # Start at 1: id 0 is the padding value and has no word in reverse_word_index.
    for word_num in range(1, vocab_size):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")