In [None]:
# REPRESENTING TEXT AS NUMBERS
##################################

# 1. One-hot encodings
######################
# 2. Encode each word with a unique number
###########################################
# 3. Word embeddings
####################
# - A 4-dimensional embedding

In [3]:
# Setup
#######
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
tf.enable_eager_execution()

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_datasets as tfds
tfds.disable_progress_bar()


In [4]:
# Using the Embedding layer
###########################
embedding_layer = layers.Embedding(1000, 5)

result = embedding_layer(tf.constant(1,2,3))
print(result.numpy())

[[ 0.02724034  0.04226258  0.03123068 -0.00364912 -0.01445278]
 [ 0.02724034  0.04226258  0.03123068 -0.00364912 -0.01445278]
 [ 0.02724034  0.04226258  0.03123068 -0.00364912 -0.01445278]]


In [5]:
# For text or sequence problems, the Embedding layer takes a 2D
# tensor of intergers, of shape (samples, sequence_length), 
# where each entry is a sequence of intergers.
result = embedding_layer(tf.constant([[0,1,2],[3,4,5]]))
result.shape


TensorShape([Dimension(2), Dimension(3), Dimension(5)])

In [8]:
# Learning embeddings from scratch
####################################
(train_data, test_data), info = tfds.load(
    'imdb_reviews/subwords8k',
    split = (tfds.Split.TRAIN, tfds.Split.TEST),
    with_info=True,
    as_supervised=True
)

In [10]:
# Get the encoder (tfds.features.text.SubwordTextEncoder) and 
# have a quick look at the vocabulary

encoder = info.features['text'].encoder
encoder.subwords[:20]

In [None]:
# Use (padded_batch) method to standardize the lengths of the
# review

padded_shapes = ([None],())
train_batches = train_data.shuffle(1000).padded_batch(
    10, padded_shapes=padded_shapes)
test_batches = test_data.shuffle(1000).padded_batch(
    10, padded_shapes=padded_shapes)

In [None]:
train_batch, train_labels = next(iter(train_batches))
train_batch.numpy()

In [None]:
# Create a simple model
#########################
embedding_dim=16

model = keras.Sequential([
    layers.Embedding(encoder.vocab_size, embedding_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.summary()

In [None]:
# Compile and train the model
##############################
model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['accuracy'])

history = model.fit(
    train_batches,
    epochs=10,
    validation_data=test_batches, validation_steps=20
)

In [None]:
import matplotlib.pyplot as plt

history_dict = history.history

acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)

plt.figure(figsize=(12,9))
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.figure(figsize=(12,9))
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim((0.5,1))
plt.show()


In [None]:
# Retrieve the learned embeddings
# Will be a matrix of shape (vocab_size, embedding_dimension)

e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

In [None]:
# To use  embedding Projector

import io
encoder = info.features['text'].encoder

out_v = io.open('vecs.tsv','w',encoding='utf-8')
out_m = io.open('meta.tsv','w',encoding='utf-8')

for num, word in enumerate(encoder.subwords):
    vec =  weights[num+1] # skip 0 =, it's padding.
    out_m.write(word + '\n')
    out_v.write('\t'.join([str(x) for x in vec]) + '\n')
out_v.close()
out_m.close()