In [1]:
# https://www.tensorflow.org/tutorials/text/word2vec

import io
import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

In [2]:
import nltk
nltk.download('gutenberg')
nltk.download('punkt')
from nltk.lm import Vocabulary
import string 

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
SEED = 42
num_ns = 4 # number of negative samples
CHECKPOINT_PATH = '/content/gdrive/My Drive/Colab/checkpoints/word2vec.ckpt' # for ModelCheckpoint
log_dir = "/content/gdrive/My Drive/Colab/logs/fit/word2vec" # for tensorboard


In [4]:
### enable google drive access
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Preprocess text using nltk:

In [17]:
words = nltk.corpus.gutenberg.words('austen-emma.txt')
sentences = nltk.corpus.gutenberg.sents('austen-emma.txt')
sentences[10]

['The',
 'danger',
 ',',
 'however',
 ',',
 'was',
 'at',
 'present',
 'so',
 'unperceived',
 ',',
 'that',
 'they',
 'did',
 'not',
 'by',
 'any',
 'means',
 'rank',
 'as',
 'misfortunes',
 'with',
 'her',
 '.']

In [18]:
my_vocab, index = {}, 1  # start indexing from 1
my_vocab['<pad>'] = 0  # add a padding token
for w in words:
  w = w.lower()
  if w in string.punctuation: continue

  if w not in my_vocab:
    my_vocab[w] = index
    index += 1

my_inverse_vocab = {index: token for token, index in my_vocab.items()}

In [19]:
ind_sentences = []
for sentence in sentences:
  ind_sentence = []
  for w in sentence:
    w = w.lower()
    if w not in my_vocab: continue

    ind_sentence.append(my_vocab[w])
  ind_sentences.append(ind_sentence)
    

In [20]:
my_sent = ind_sentences[10]
[my_inverse_vocab[index] for index in my_sent]

['the',
 'danger',
 'however',
 'was',
 'at',
 'present',
 'so',
 'unperceived',
 'that',
 'they',
 'did',
 'not',
 'by',
 'any',
 'means',
 'rank',
 'as',
 'misfortunes',
 'with',
 'her']

--------------------------------------------------------------------------------

In [None]:
sentence = "The wide road shimmered in the hot sun"
tokens = list(sentence.lower().split())
tokens

['the', 'wide', 'road', 'shimmered', 'in', 'the', 'hot', 'sun']

In [None]:
vocab, index = {}, 1  # start indexing from 1
vocab['<pad>'] = 0  # add a padding token
for token in tokens:
  if token not in vocab:
    vocab[token] = index
    index += 1
vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'the': 1, 'wide': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}


In [None]:
inverse_vocab = {index: token for token, index in vocab.items()}
print(inverse_vocab)

{0: '<pad>', 1: 'the', 2: 'wide', 3: 'road', 4: 'shimmered', 5: 'in', 6: 'hot', 7: 'sun'}


In [None]:
example_sequence = [vocab[word] for word in tokens]
print(example_sequence)

[1, 2, 3, 4, 5, 1, 6, 7]


In [11]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      context = tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [None]:
# example
#targets, contexts, labels = generate_training_data([example_sequence], window_size=2, num_ns=4, vocab_size=vocab_size, seed=SEED)
#targets

100%|██████████| 1/1 [00:00<00:00, 112.98it/s]


[5, 5, 1, 5, 3, 4, 1, 1, 2, 4, 5, 1, 2, 3, 4, 2, 3, 6, 4, 7, 7, 6, 6, 1, 1, 3]

In [None]:
# takes time
targets, contexts, labels = generate_training_data(ind_sentences, window_size=2, num_ns=4, vocab_size=len(my_vocab), seed=SEED)

In [15]:
targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

targets.shape: (622400,)
contexts.shape: (622400, 5)
labels.shape: (622400, 5)


In [18]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
print(dataset)

<PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


`dataset` is what we will use to train our word2vec model below.

---------------------------------------------------

In [19]:
class Word2Vec(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    # each word, represented by an integer from 0 to vocab_size-1, is assigned a trainable vector of dimension `embedding_dim`=128. 
    # That is what the following layers.Embedding thing is doing
    self.target_embedding = layers.Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=num_ns+1)

  def call(self, pair):
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots

In [10]:
# try embedding layer
vocab_size_example=1000
emb_mod = tf.keras.Sequential()
emb_mod.add(tf.keras.layers.Embedding(input_dim=vocab_size_example, output_dim=128, input_length=1, name='my_emb'))
input_array = np.random.randint(vocab_size_example, size=(32, 1))
emb_mod.compile('rmsprop', 'mse')
output_array = emb_mod.predict(input_array)
output_array.shape




(32, 1, 128)

In [14]:
wei = emb_mod.get_layer('my_emb').get_weights()
wei[0].shape

(1000, 128)

In [22]:
embedding_dim = 128
word2vec = Word2Vec(vocab_size=len(my_vocab), embedding_dim=embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

fit the model:

In [24]:

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=CHECKPOINT_PATH, 
    verbose=1,
    save_freq = 'epoch')

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

word2vec.fit(dataset, epochs=20, callbacks=[cp_callback, tensorboard_callback])

Epoch 1/20
Epoch 1: saving model to /content/gdrive/My Drive/Colab/checkpoints/word2vec.ckpt
Epoch 2/20
Epoch 2: saving model to /content/gdrive/My Drive/Colab/checkpoints/word2vec.ckpt
Epoch 3/20
Epoch 3: saving model to /content/gdrive/My Drive/Colab/checkpoints/word2vec.ckpt
Epoch 4/20
Epoch 4: saving model to /content/gdrive/My Drive/Colab/checkpoints/word2vec.ckpt
Epoch 5/20
Epoch 5: saving model to /content/gdrive/My Drive/Colab/checkpoints/word2vec.ckpt
Epoch 6/20
Epoch 6: saving model to /content/gdrive/My Drive/Colab/checkpoints/word2vec.ckpt
Epoch 7/20
Epoch 7: saving model to /content/gdrive/My Drive/Colab/checkpoints/word2vec.ckpt
Epoch 8/20
Epoch 8: saving model to /content/gdrive/My Drive/Colab/checkpoints/word2vec.ckpt
Epoch 9/20
Epoch 9: saving model to /content/gdrive/My Drive/Colab/checkpoints/word2vec.ckpt
Epoch 10/20
Epoch 10: saving model to /content/gdrive/My Drive/Colab/checkpoints/word2vec.ckpt
Epoch 11/20
Epoch 11: saving model to /content/gdrive/My Drive/Colab

<keras.callbacks.History at 0x7fb687751850>

---------------------------------------------------------------------------------------------------------

Check if can reload model from drive:

In [9]:
loaded_model = tf.keras.models.load_model(CHECKPOINT_PATH)


In [16]:
# Obtain the weights from the model using Model.get_layer and Layer.get_weights. 

weights = loaded_model.get_layer('w2v_embedding').get_weights()[0]
weights.shape # indeed, len(my_vocab) is also 7328

(7328, 128)

Given a word, show the closest words to this word in terms of the vector representation of the model:

In [49]:
ind = my_vocab['emma']
dist = np.ones(len(my_vocab))*(-1) # stores the euclidean distance between each word vector and the word vector specified by index `ind`
for i in range(len(my_vocab)):
  dist[i] = np.linalg.norm(weights[i] - weights[ind]) # euclidean distance


In [50]:
sorted_indices = np.argsort(dist)
print([my_inverse_vocab[sorted_indices[i]] for i in range(20)]) # the closest words shown here don't seem to be semantically close???

['emma', 'meditations', 'middling', 'repeat', 'brewing', 'assert', 'ungraciously', 'sanguinely', 'grammatical', 'softly', '_bride_', 'sacred', 'dirt', 'exult', 'conditionally', 'alas', 'fight', 'unexpensively', 'begging', '_court_']


-----------------------------------------------------------------------------------------------------

In [None]:
window_size = 2
my_skipgrams, labels = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=4.0)
my_skipgrams = np.array(my_skipgrams); labels = np.array(labels)
my_pos_skipgrams = my_skipgrams[labels==1]
my_neg_skipgrams = my_skipgrams[labels==0]

In [None]:
'''
labels_mask = np.array([bool(labels[i]) for i in range(len(labels))])
labels_mask_negated = np.array([bool(1-labels[i]) for i in range(len(labels))])
my_pos_skipgrams = tf.boolean_mask(my_skipgrams, labels_mask)
'''

<tf.Tensor: shape=(26, 2), dtype=int32, numpy=
array([[4, 5],
       [6, 1],
       [7, 6],
       [1, 4],
       [5, 3],
       [1, 5],
       [4, 1],
       [3, 2],
       [3, 4],
       [6, 7],
       [1, 3],
       [5, 4],
       [6, 5],
       [7, 1],
       [2, 1],
       [1, 7],
       [1, 6],
       [4, 3],
       [1, 2],
       [5, 6],
       [3, 5],
       [2, 3],
       [5, 1],
       [2, 4],
       [3, 1],
       [4, 2]], dtype=int32)>

In [None]:
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0)


for target, context in positive_skip_grams:
  print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")
# WHY IS THERE (1,5)? ISN'T THE WINDOW SIZE 2?

(3, 2): (road, wide)
(4, 3): (shimmered, road)
(4, 1): (shimmered, the)
(6, 7): (hot, sun)
(1, 2): (the, wide)
(5, 4): (in, shimmered)
(7, 1): (sun, the)
(3, 5): (road, in)
(2, 3): (wide, road)
(1, 5): (the, in)
(3, 1): (road, the)
(5, 1): (in, the)
(6, 1): (hot, the)
(2, 4): (wide, shimmered)
(4, 2): (shimmered, wide)
(7, 6): (sun, hot)
(1, 6): (the, hot)
(5, 3): (in, road)
(2, 1): (wide, the)
(1, 3): (the, road)
(1, 7): (the, sun)
(3, 4): (road, shimmered)
(4, 5): (shimmered, in)
(5, 6): (in, hot)
(1, 4): (the, shimmered)
(6, 5): (hot, in)


In [None]:
# Get target and context words for one positive skip-gram.
target_word, context_word = positive_skip_grams[0]
print(inverse_vocab[target_word])
print(inverse_vocab[context_word])

shimmered
road


In [None]:
# NEGATIVE SAMPLES ARE WEIRD

# Set the number of negative samples per positive context.
num_ns = 4

context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # class that should be sampled as 'positive'
    num_true=1,  # each positive skip-gram has 1 positive context class
    num_sampled=num_ns,  # number of negative context words to sample
    unique=True,  # all the negative samples should be unique
    range_max=vocab_size,  # pick index of the samples from [0, vocab_size]
    seed=SEED,  # seed for reproducibility
    name="negative_sampling"  # name of this operation
)
print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates]) # why is there still "shimmered" or "road" in my negative sample???

tf.Tensor([2 0 3 1], shape=(4,), dtype=int64)
['wide', '<pad>', 'road', 'the']
