In [22]:
import os
import shutil
import re
import string
import io

import tensorflow as tf
from keras import Sequential
from keras.layers import Embedding, TextVectorization, GlobalAveragePooling1D, Dense

In [2]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file(
    "aclImdb_v1.tar.gz", url, untar=True, cache_dir=".", cache_subdir=""
)

dataset_dir = os.path.join(os.path.dirname(dataset), "aclImdb")

In [3]:
train_dir = os.path.join(dataset_dir, "train")

The train directory also has additional folders which should be removed before creating training dataset.

In [4]:
remove_dir = os.path.join(train_dir, "unsup")
shutil.rmtree(remove_dir)

In [5]:
batch_size = 1024
seed = 123
train_ds, val_ds = tf.keras.utils.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=batch_size,
    validation_split=0.2,
    subset="both",
    seed=seed,
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Using 5000 files for validation.


In [6]:
for text_batch, label_batch in train_ds.take(1):
    for i in range(5):
        print(label_batch[i].numpy(), text_batch[i].numpy())

0 b"Oh My God! Please, for the love of all that is holy, Do Not Watch This Movie! It it 82 minutes of my life I will never get back. Sure, I could have stopped watching half way through. But I thought it might get better. It Didn't. Anyone who actually enjoyed this movie is one seriously sick and twisted individual. No wonder us Australians/New Zealanders have a terrible reputation when it comes to making movies. Everything about this movie is horrible, from the acting to the editing. I don't even normally write reviews on here, but in this case I'll make an exception. I only wish someone had of warned me before I hired this catastrophe"
1 b'This movie is SOOOO funny!!! The acting is WONDERFUL, the Ramones are sexy, the jokes are subtle, and the plot is just what every high schooler dreams of doing to his/her school. I absolutely loved the soundtrack as well as the carefully placed cynicism. If you like monty python, You will love this film. This movie is a tad bit "grease"esk (without

To make sure that I/O does not become blocking:
- `.cache()` keeps data in memory after it's loaded off disk. This will ensure the dataset does not become a bottleneck while training your model. If your dataset is too large to fit into memory, you can also use this method to create a performant on-disk cache, which is more efficient to read than many small files.
- `.prefetch()` overlaps data preprocessing and model execution while training.

In [7]:
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

Keras Embedding layer: https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding

In [8]:
# input_dim=1000 means one-hot encoding of len 1000, which in turn means len(vocab)==1000
embedding_layer = Embedding(input_dim=1000, output_dim=5)

In [9]:
result = embedding_layer(tf.constant([1]))
result.numpy()

array([[-0.03836682, -0.03401257, -0.0006951 , -0.04927964, -0.00022895]],
      dtype=float32)

In [10]:
result = embedding_layer(tf.constant([[1, 2], [3, 4]]))
print(result.shape)
result.numpy()

(2, 2, 5)


array([[[-0.03836682, -0.03401257, -0.0006951 , -0.04927964,
         -0.00022895],
        [-0.01645837,  0.02244296,  0.01662347,  0.04492356,
         -0.03390533]],

       [[ 0.03883637,  0.01537195,  0.01466027,  0.02512241,
          0.00198326],
        [ 0.02502811, -0.00169842, -0.03799127,  0.03314408,
         -0.03792956]]], dtype=float32)

In [11]:
# strip HTML break tags '<br />' and punctuations
def custom_standardization(input_data):
    stripped_html = tf.strings.regex_replace(
        tf.strings.lower(input_data), "<br />", " "
    )
    return tf.strings.regex_replace(
        stripped_html, "[%s]" % re.escape(string.punctuation), ""
    )

TextVectorization layer: https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization

Maps text features to integer sequences.

In [12]:
# vocabulary size
vocab_size = 10000

# number of words in a sequence
sequence_length = 100

# set maximum_sequence length as all samples are not of the same length.
text_vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

In [13]:
# print(list(train_ds.as_numpy_iterator())[0][0])
# print(list(train_ds.as_numpy_iterator())[0][1])

In [14]:
# # use a text-only dataset without labels
text_ds = train_ds.map(lambda x, y: x)
# call `adapt` to build the vocabulary
text_vectorize_layer.adapt(text_ds)
text_vectorize_layer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'and',
 'a',
 'of',
 'to',
 'is',
 'in',
 'it',
 'i',
 'this',
 'that',
 'was',
 'as',
 'with',
 'for',
 'movie',
 'but',
 'film',
 'on',
 'not',
 'you',
 'are',
 'his',
 'have',
 'be',
 'he',
 'one',
 'its',
 'at',
 'all',
 'by',
 'an',
 'they',
 'from',
 'who',
 'so',
 'like',
 'her',
 'just',
 'or',
 'about',
 'has',
 'out',
 'if',
 'some',
 'there',
 'what',
 'good',
 'more',
 'when',
 'very',
 'even',
 'she',
 'up',
 'my',
 'no',
 'would',
 'time',
 'only',
 'which',
 'really',
 'story',
 'their',
 'see',
 'were',
 'had',
 'can',
 'me',
 'we',
 'than',
 'much',
 'well',
 'been',
 'will',
 'get',
 'people',
 'bad',
 'also',
 'other',
 'do',
 'into',
 'great',
 'first',
 'because',
 'how',
 'most',
 'him',
 'dont',
 'made',
 'movies',
 'then',
 'them',
 'way',
 'films',
 'make',
 'could',
 'any',
 'after',
 'too',
 'characters',
 'think',
 'watch',
 'being',
 'two',
 'many',
 'seen',
 'character',
 'plot',
 'little',
 'never',
 'acting',
 'where',
 'best',
 '

In [15]:
text_vectorize_layer("you fighting political solders")

<tf.Tensor: shape=(100,), dtype=int64, numpy=
array([ 22, 996, 999,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0])>

5-layer sequential model for classification:

1. TextVectorization: strings to vocabulary indices, feeding transformed strings into the Embedding layer
2. Embedding: takes the integer-encoded vocabulary and looks up the embedding vector for each word-index. These vectors are learned as the model trains. The vectors add a dimension to the output array. The resulting dimensions are: (batch, sequence, embedding). See previous cells.
3. [GlobalAveragePooling1D](https://www.tensorflow.org/api_docs/python/tf/keras/layers/GlobalAveragePooling1D): returns a fixed-length output vector for each example by averaging over the sequence dimension. This allows the model to handle input of variable length, in the simplest way possible.
4. The fixed-length output vector is piped through a fully-connected (Dense) layer with 16 hidden units.
5. Output: a single output node.

In [74]:
embedding_dim = 16

model = Sequential(
    [
        text_vectorize_layer,
        Embedding(vocab_size, embedding_dim, name="embedding"),
        GlobalAveragePooling1D(),
        Dense(16, activation="relu"),
        Dense(1),
    ]
)

In [75]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [76]:
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

In [77]:
model.fit(train_ds, validation_data=val_ds, epochs=15, callbacks=[tensorboard_callback])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x2c7155dd0>

In [78]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVe  (None, 100)               0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d_1  (None, 16)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_2 (Dense)             (None, 16)                272       
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160289 (626.13 KB)
Trainable params: 160

In [None]:
#docs_infra: no_execute
%load_ext tensorboard
%tensorboard --logdir logs

In [79]:
weights = model.get_layer("embedding").get_weights()[0]
vocab = text_vectorize_layer.get_vocabulary()
out_v = io.open("vectors.tsv", "w", encoding="utf-8")
out_m = io.open("metadata.tsv", "w", encoding="utf-8")

for index, word in enumerate(vocab):
    if index == 0:
        continue  # skip 0, it's padding
    vec = weights[index]
    out_v.write("\t".join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")
out_v.close()
out_m.close()

Visualize in: https://projector.tensorflow.org/

In [80]:
import numpy as np


# Function to calculate cosine similarity between two vectors
def cosine_similarity(vector1, vector2):
    dot_product = np.dot(vector1, vector2)
    norm1 = np.linalg.norm(vector1)
    norm2 = np.linalg.norm(vector2)
    similarity = dot_product / (norm1 * norm2)
    return similarity


# Function to find closest words to a given word
def find_closest_words(word, top_k=10):
    word_index = text_vectorize_layer.get_vocabulary().index(word)
    word_vector = weights[word_index]
    similarities = []

    for index, vec in enumerate(weights):
        if index == 0:
            continue  # skip padding
        similarity = cosine_similarity(word_vector, vec)
        similarities.append((index, similarity))

    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)
    closest_words = []

    for index, similarity in similarities[:top_k]:
        closest_words.append(text_vectorize_layer.get_vocabulary()[index])

    return closest_words

In [82]:
given_word = "winter"
closest_words = find_closest_words(given_word)

print(f"Closest words to '{given_word}':")
for word in closest_words:
    print(word)

Closest words to 'winter':
winter
referred
west
iti
bruce
excellence
hollywood
caretaker
notch
collecting


In [83]:
# Function to find the word that completes an analogy
def find_analogy(word_a, word_b, word_c):
    word_a_index = text_vectorize_layer.get_vocabulary().index(word_a)
    word_b_index = text_vectorize_layer.get_vocabulary().index(word_b)
    word_c_index = text_vectorize_layer.get_vocabulary().index(word_c)

    word_a_vector = weights[word_a_index]
    word_b_vector = weights[word_b_index]
    word_c_vector = weights[word_c_index]

    analogy_vector = word_b_vector - word_a_vector + word_c_vector

    similarities = []
    for index, vec in enumerate(weights):
        if index == 0:
            continue  # skip padding
        similarity = cosine_similarity(analogy_vector, vec)
        similarities.append((index, similarity))

    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)
    closest_word_index = similarities[0][0]
    closest_word = text_vectorize_layer.get_vocabulary()[closest_word_index]

    return closest_word

In [98]:
find_analogy("fantastic", "beautiful", "embarrassing")

'sadly'