# Word Embedding: A quick intro

In [1]:
import tensorflow as tf
tf.__version__

'2.2.0-rc4'

## Download Data

In [0]:
import tensorflow_datasets as tfds
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

## Prepare Data

In [0]:
import numpy as np
train_data, test_data = imdb['train'], imdb['test']

In [4]:
train_sentences=[]
train_labels=[]
test_sentences=[]
test_labels=[]

for sentence, label in train_data:
  train_sentences.append(str(sentence.numpy()))
  train_labels.append(label.numpy())

for sentence, label in test_data:
  test_sentences.append(str(sentence.numpy()))
  test_labels.append(label.numpy())

#Example
print(train_labels[0])
print(train_sentences[0])

0
b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."


In [0]:
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

In [0]:
VOCAB_SIZE = 10000
EMBEDDING_DIM = 16
MAX_LENGTH = 120
TRUNC_TYPE = 'post'
OOV_TOKEN = '<OOV>'

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(train_sentences)
padded = pad_sequences(sequences=sequences, maxlen=MAX_LENGTH)

test_sequencies = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(sequences=test_sequencies, maxlen=MAX_LENGTH)

## Fit the Model

### Model A: Baseline

In [7]:
model_a = tf.keras.Sequential([
                             tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LENGTH),
                             tf.keras.layers.Flatten(), 
                            #  tf.keras.layers.GlobalAveragePooling1D(),  #Used with embedding. It averages across the vector to flatten it out. 
                             tf.keras.layers.Dense(1, activation='sigmoid')
])

model_a.summary()

model_a.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model_a.fit(x=padded, y=train_labels, epochs=10, verbose=2, validation_data=(test_padded, test_labels))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 1)                 1921      
Total params: 161,921
Trainable params: 161,921
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
782/782 - 3s - loss: 0.4987 - accuracy: 0.7562 - val_loss: 0.3356 - val_accuracy: 0.8564
Epoch 2/10
782/782 - 3s - loss: 0.2624 - accuracy: 0.8958 - val_loss: 0.3193 - val_accuracy: 0.8603
Epoch 3/10
782/782 - 3s - loss: 0.1809 - accuracy: 0.9371 - val_loss: 0.3230 - val_accuracy: 0.8591
Epoch 4/10
782/782 - 3s - loss: 0.1172 - accuracy: 0.9678 - val_loss: 0.3586 - v

<tensorflow.python.keras.callbacks.History at 0x7f8d3753d898>

#### Comment 

Overfitting!

### Model B: Global Average

In [8]:
model_b = tf.keras.Sequential([
                             tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LENGTH),
                            #  tf.keras.layers.Flatten(), 
                             tf.keras.layers.GlobalAveragePooling1D(),  #Used with embedding. It averages across the vector to flatten it out. 
                             tf.keras.layers.Dense(1, activation='sigmoid')
])

model_b.summary()

model_b.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model_b.fit(x=padded, y=train_labels, epochs=10, verbose=2, validation_data=(test_padded, test_labels))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 160,017
Trainable params: 160,017
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
782/782 - 3s - loss: 0.6223 - accuracy: 0.7402 - val_loss: 0.5284 - val_accuracy: 0.7999
Epoch 2/10
782/782 - 3s - loss: 0.4451 - accuracy: 0.8378 - val_loss: 0.4076 - val_accuracy: 0.8365
Epoch 3/10
782/782 - 3s - loss: 0.3534 - accuracy: 0.8650 - val_loss: 0.3591 - val_accuracy: 0.8498
Epoch 4/10
782/782 - 3s - loss: 0.3068 - accuracy: 0.8799 - val_loss: 0.3351 -

<tensorflow.python.keras.callbacks.History at 0x7f8d356e7208>

## Visualize Embedding

In [14]:
e = model_b.layers[0]
weights = e.get_weights()[0]
print('(vocab_size, embedding_data)')
print(weights.shape)

(vocab_size, embedding_data)
(10000, 16)


In [0]:
reverse_word_index = dict([(v,k) for (k,v) in word_index.items()])

In [0]:
# Now it's time to write the vectors and their metadata auto files. 
# The TensorFlow Projector reads this file type and uses it to plot 
#the vectors in 3D space so we can visualize them.

import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, VOCAB_SIZE):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

In [0]:
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

## Test

In [18]:
sentence = "I really think this is amazing. honest."
sequence = tokenizer.texts_to_sequences(sentence)
print(sequence)

[[11], [], [1431], [966], [4], [1537], [1537], [4715], [], [790], [2019], [11], [2929], [2184], [], [790], [2019], [11], [579], [], [11], [579], [], [4], [1782], [4], [4517], [11], [2929], [1275], [], [], [2019], [1003], [2929], [966], [579], [790], []]
