# C3W2L1

IMBD dataset: http://ai.stanford.edu/~amaas/data/sentiment/

In [0]:
# !pip install tensorflow==2.0.0
# should beta0, not alpha0

# !pip install -q tensorflow-datasets

Collecting tensorflow==2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/46/0f/7bd55361168bb32796b360ad15a25de6966c9c1beb58a8e30c01c8279862/tensorflow-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl (86.3MB)
[K     |████████████████████████████████| 86.3MB 55kB/s 
Collecting tensorboard<2.1.0,>=2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/76/54/99b9d5d52d5cb732f099baaaf7740403e83fe6b0cedde940fabd2b13d75a/tensorboard-2.0.2-py3-none-any.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 42.2MB/s 
[?25hCollecting tensorflow-estimator<2.1.0,>=2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/fc/08/8b927337b7019c374719145d1dceba21a8bb909b93b1ad6f8fb7d22c1ca1/tensorflow_estimator-2.0.1-py2.py3-none-any.whl (449kB)
[K     |████████████████████████████████| 450kB 60.2MB/s 
Collecting google-auth<2,>=1.6.3
[?25l  Downloading https://files.pythonhosted.org/packages/8d/5f/a1a02695b96d0e09c38abf7d1576b137979cea3d060d60891622cf61276d/goog

In [1]:
import tensorflow as tf
print(tf.__version__)

2.0.0


In [0]:
# Get over with tf 1.x
# tf.enable_eager_execution()

### Using tensorflow data services (TFDS) to get built in datasets

In [0]:
import tensorflow_datasets as tfds
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

In [0]:
import numpy as np

# 25k train 25k test
train_data, test_data = imdb['train'], imdb['test']

training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

# str(s.tonumpy()) is needed in Python3 instead of just s.numpy()
for s,l in train_data:
  # Note that the values of s & l are tensors thats why  we need s.numpy() to extract their values
  training_sentences.append(str(s.numpy()))
  training_labels.append(l.numpy())
  
for s,l in test_data:
  testing_sentences.append(str(s.numpy()))
  testing_labels.append(l.numpy())
  
# Note that when training, we would expect a numpy array
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)


In [4]:
print(training_sentences[7])
print(training_labels[7])

b"I absolutely LOVED this movie when I was a kid. I cried every time I watched it. It wasn't weird to me. I totally identified with the characters. I would love to see it again (and hope I wont be disappointed!). Pufnstuf rocks!!!! I was really drawn in to the fantasy world. And to me the movie was loooong. I wonder if I ever saw the series and have confused them? The acting I thought was strong. I loved Jack Wilde. He was so dreamy to an 10 year old (when I first saw the movie, not in 1970. I can still remember the characters vividly. The flute was totally believable and I can still 'feel' the evil woods. Witchy poo was scary - I wouldn't want to cross her path."
1


### TOKENIZATION

In [0]:
# Hyper parameters:
vocab_size = 10000 # interested in the top 10000 most popular vocabs
embedding_dim = 16
max_length = 120 # length of a the longest sentence
trunc_type='post'
oov_tok = "<OOV>"


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)

# Since the tokenizer is fitted on the training set,
# we would expect to have a lot more OOV here when it is being applied on the test test
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length)

### Neural Network with Word Embedding

In [0]:
model = tf.keras.Sequential([
                             # THE KEY DIFFERNENCE IN NLP MODEL IS THE EMBEDDING LAYER
                             tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
                             # The rest looks very similar to before
                             # The results of the embedding will be a 2D array with the length of the sentence (120)
                             # and the embedding dimension (for example 16 as its size). So we need to flatten it 
                             # out in much the same way as we needed to flatten out our images.
                             # Flatten
                             tf.keras.layers.Flatten(),
                             # NOTE:
                            #  In NLP, instead of using Flatten layer, we can use GlobalAveragePooling1D()
                            #  to flatten the embedding out
                             # tf.keras.layers.GlobalAveragePooling1D(),
                             tf.keras.layers.Dense(6, activation='relu'),
                             # Binary Classication: pos or neg
                             tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
flatten_3 (Flatten)          (None, 1920)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 6)                 11526     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [6]:
# USE THIS MODEL WITH GLOBAL AVERAGE POOLING 1D
model = tf.keras.Sequential([
                             # THE KEY DIFFERNENCE IN NLP MODEL IS THE EMBEDDING LAYER
                             tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
                             # The rest looks very similar to before
                             # The results of the embedding will be a 2D array with the length of the sentence (120)
                             # and the embedding dimension (for example 16 as its size). So we need to flatten it 
                             # out in much the same way as we needed to flatten out our images.
                             # Flatten
                             # tf.keras.layers.Flatten(),
                             # NOTE:
                            #  In NLP, instead of using Flatten layer, we can use GlobalAveragePooling1D()
                            #  to flatten the embedding out
                             tf.keras.layers.GlobalAveragePooling1D(), # This is simpler which makes training faster than Flatten
                             tf.keras.layers.Dense(6, activation='relu'),
                             # Binary Classication: pos or neg
                             tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 6)                 102       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 160,109
Trainable params: 160,109
Non-trainable params: 0
_________________________________________________________________


In [7]:
# training begins:
num_epochs = 10
model.fit(padded, training_labels_final, epochs=num_epochs, 
          validation_data=(testing_padded, testing_labels_final))

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f67ef4c6470>

### IN ORDER TO PLOT EMBEDDED VECTOR:

In [8]:
# To be able to plot it. Reverse the order of word_index dictionary
# before it was (word, index), now we want (index, word)
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

# decode_reviewthis will lower case and remove all punctuation
print(decode_review(padded[1]))
print(training_sentences[1])

b oh yeah jenna jameson did it again yeah baby this movie rocks it was one of the 1st movies i saw of her and i have to say i feel in love with her she was great in this move br br her performance was outstanding and what i liked the most was the scenery and the wardrobe it was amazing you can tell that they put a lot into the movie the girls cloth were amazing br br i hope this comment helps and u can buy the movie the storyline is awesome is very unique and i'm sure u are going to like it jenna amazed us once more and no wonder the movie won so many
b"Oh yeah! Jenna Jameson did it again! Yeah Baby! This movie rocks. It was one of the 1st movies i saw of her. And i have to say i feel in love with her, she was great in this move.<br /><br />Her performance was outstanding and what i liked the most was the scenery and the wardrobe it was amazing you can tell that they put a lot into the movie the girls cloth were amazing.<br /><br />I hope this comment helps and u can buy the movie, the

In [9]:
reverse_word_index.items()



In [0]:
reverse_word_index.get(1, '?')

'<OOV>'

In [10]:
# check the first embedding
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim) # there are 10k words with dim of 16

(10000, 16)


### Now it's time to write the vectors and their metadata auto files. The TensorFlow Projector reads this file type and uses it to plot the vectors in 3D space so we can visualize them.

In [0]:
import io

# read files
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  # for the meta data, we just write out the word
  out_m.write(word + "\n")
  """
  To the vectors file, we simply write out the value of each of the items in 
  the array of embeddings, i.e, the co-efficient of each dimension on the 
  vector for this word.
  """
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

In [13]:
sentence = "I really think this is amazing. honest."
sequence = tokenizer.texts_to_sequences(sentence)
print(sequence)

[[11], [], [1430], [968], [4], [1537], [1537], [4738], [], [790], [2015], [11], [2922], [2191], [], [790], [2015], [11], [579], [], [11], [579], [], [4], [1783], [4], [4508], [11], [2922], [1277], [], [], [2015], [1005], [2922], [968], [579], [790], []]


### Projector
https://projector.tensorflow.org/

In [0]:
# DOWND LOAD THE 2 FILEs:
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')