In [None]:
!pip install tensorboard

In [28]:
import io
import os
import re
import shutil
import string
import tensorflow as tf
import tensorboard

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

In [2]:
# Download the IMDb review archive into the current directory and unpack it.
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

dataset = tf.keras.utils.get_file(
    fname='aclImdb_v1.tar.gz',
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# The unpacked data is looked up in an 'aclImdb' folder located in the same
# directory as the path returned by get_file.
download_parent = os.path.dirname(dataset)
dataset_dir = os.path.join(download_parent, 'aclImdb')
os.listdir(dataset_dir)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


['README', 'test', 'imdbEr.txt', 'imdb.vocab', 'train']

In [3]:
# Path of the 'train' folder inside the unpacked dataset; the listing is shown
# as the cell output so the sub-folders can be inspected.
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['labeledBow.feat',
 'urls_unsup.txt',
 'pos',
 'unsupBow.feat',
 'neg',
 'unsup',
 'urls_neg.txt',
 'urls_pos.txt']

In [4]:
# Delete train/unsup so that only the 'pos' and 'neg' folders remain as class
# directories when the training data is loaded below.
remove_dir = os.path.join(train_dir, 'unsup')
# Guard the removal so the cell can be re-run (or run after a kernel restart)
# without raising FileNotFoundError once the folder is already gone.
if os.path.isdir(remove_dir):
  shutil.rmtree(remove_dir)

In [5]:
batch_size = 1024
seed = 123

# Both subsets are carved out of the same directory (train_dir): with an
# identical seed and validation_split the 'training' and 'validation' calls
# select complementary 80% / 20% partitions of the files. Reusing train_dir
# also avoids a hard-coded relative path that depends on the working
# directory, and leaves the 'test' folder untouched as held-out data.
train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, batch_size=batch_size, validation_split=0.2,
    subset='training', seed=seed
)
val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, batch_size=batch_size, validation_split=0.2,
    subset='validation', seed=seed
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [22]:
# Show the label and raw text of the first five reviews in one training batch.
for text_batch, label_batch in train_ds.take(1):
  texts = text_batch.numpy()  # convert the batch once, index it below
  for i in range(5):
    label = label_batch[i].numpy()
    print(label, texts[i])

1 b'I believe this is the most powerful film HBO Pictures has made to date. This film should have been released in theaters for the public to view on the big screen. It is available on video so make sure you look for it and check it out. Chris Gerolmo did a great job with the direction and the screenplay. The performances from Stephen Rea, Donald Sutherland and Jeffery DeMunn are flawless. A masterpiece of the genre.'
0 b'*** THIS CONTAINS MANY, MANY SPOILERS, NOT THAT IT MATTERS, SINCE EVERYTHING IS SO PATENTLY OBVIOUS ***<br /><br />Oh my God, where do I start? Well, here - this is the first time I have ever come home from a movie and said "I have to get on IMDb and write a review of this NOW. It is my civic duty." Such is the badness of this flick. <br /><br />*begin digression* But let me just state one thing before I start. I\'m not some Harvard-art-major-film-noir-weenie (in fact, I went to the college at the other end of Mass. Ave in Cambridge, the one where the actual smart peo

In [10]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache each dataset and prefetch with an automatically tuned buffer size.
train_ds, val_ds = (
    ds.cache().prefetch(buffer_size=AUTOTUNE) for ds in (train_ds, val_ds)
)

In [11]:
# Embedding table with 1,000 rows (one per vocabulary index), each a
# 5-dimensional vector.
embedding_layer = Embedding(input_dim=1000, output_dim=5)

In [12]:
# Look up the vectors for three integer indices; the array is the cell output.
token_ids = tf.constant([1, 2, 3])
result = embedding_layer(token_ids)
result.numpy()

array([[-0.02095643, -0.0419461 , -0.03078711,  0.04141687, -0.0284851 ],
       [-0.00714672, -0.03461951,  0.01894302, -0.04981021, -0.02386205],
       [-0.02369664,  0.02954148, -0.01014899,  0.03782015,  0.00205079]],
      dtype=float32)

In [14]:
# A (2, 3) batch of indices gains a trailing embedding axis in the result.
token_id_batch = tf.constant([[0, 1, 2], [3, 4, 5]])
result = embedding_layer(token_id_batch)
result.shape

TensorShape([2, 3, 5])

In [15]:
# create a custom standardization function to strip HTML break tags '<br />'
def custom_standardization(input_data):
  """Normalize raw text for vectorization.

  Applies, in order: lowercasing, replacement of every '<br />' tag with a
  single space, and removal of every character in `string.punctuation`.

  Args:
    input_data: the text to normalize, as accepted by `tf.strings.lower`.

  Returns:
    The normalized text produced by the final `tf.strings.regex_replace`.
  """
  # Character class matching any single punctuation character.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)

  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  return tf.strings.regex_replace(text, punctuation_pattern, '')
  
# Vocabulary size and number of words in a sequence
vocab_size = 10000
sequence_length = 100

# The text vectorization layer normalizes, splits, and maps strings to
# integers, using the custom standardization function defined above.
# output_sequence_length fixes the length of every output sequence, since the
# samples are not all the same length.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# Build the vocabulary from a text-only view of the training data (labels
# dropped).
text_ds = train_ds.map(lambda text, label: text)
vectorize_layer.adapt(text_ds)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [17]:
embedding_dim = 16

# Raw strings -> integer token ids -> per-token vectors -> mean over the
# sequence -> hidden layer -> single output unit (no activation).
model = Sequential([
    vectorize_layer,
    Embedding(input_dim=vocab_size, output_dim=embedding_dim,
              name='embedding'),
    GlobalAveragePooling1D(),
    Dense(units=19, activation='relu'),
    Dense(units=1),
])

In [18]:
# Keras callback that writes TensorBoard logs to the 'logs' directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='logs')

In [19]:
# The model's last layer is Dense(1) with no activation, so it outputs raw
# logits rather than probabilities. BinaryCrossentropy defaults to
# from_logits=False, which would treat those logits as probabilities and
# compute the wrong loss; from_logits=True applies the sigmoid inside the loss.
# NOTE(review): the 'accuracy' metric still sees raw logits — confirm the
# threshold it uses is the one you want.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [20]:
# Train for 15 epochs, validating on val_ds after each epoch and logging to
# TensorBoard through the callback.
model.fit(
    x=train_ds,
    epochs=15,
    validation_data=val_ds,
    callbacks=[tensorboard_callback],
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fd781c416a0>

In [23]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 100)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 19)                323       
                                                                 
 dense_1 (Dense)             (None, 1)                 20        
                                                                 
Total params: 160,343
Trainable params: 160,343
Non-trai

In [None]:
# Upload the logs under ./logs as a one-shot experiment with the given name
# and description.
# NOTE(review): the hosted TensorBoard.dev service is believed to have been
# shut down — confirm this command still works; otherwise view the logs
# locally with `%tensorboard --logdir logs`.
!tensorboard dev upload --logdir ./logs \
  --name "tf_word_embeddings" \
  --description "tf word embeddings documentation" \
  --one_shot

* Done. View your TensorBoard at https://tensorboard.dev/experiment/563vImm5TDCMF80zi84XFg/

In [None]:
# NOTE(review): `list` may not be a valid top-level tensorboard subcommand
# (experiments were listed with `tensorboard dev list`) — confirm the intent.
!tensorboard list

In [33]:
# # docs_infra: no_execute
# %load_ext tensorboard 
# %tensorboard --logdir logs

In [32]:
# Vocabulary learned by the vectorization layer, and the first weight array of
# the layer named 'embedding'; the export cell indexes both by token position.
vocab = vectorize_layer.get_vocabulary()
embedding_weights = model.get_layer(name='embedding').get_weights()
weights = embedding_weights[0]

In [36]:
# Export the embeddings as two parallel TSV files: vecs.tsv gets one
# tab-separated vector per line and meta.tsv gets the matching token per line.
# The `with` block guarantees both files are closed even if writing fails
# part-way through.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + '\n')
    out_m.write(word + '\n')

In [37]:
# Best-effort download of the exported files when running in Google Colab.
# Any failure (including google.colab not being importable) is deliberately
# ignored so the notebook still runs elsewhere.
try:
  from google.colab import files
  for tsv_name in ('vecs.tsv', 'meta.tsv'):
    files.download(tsv_name)
except Exception:
  pass

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>