In [29]:
import io
import os
import re
import shutil
import string
import tensorflow as tf
from tensorflow.keras import layers
import tqdm
import numpy as np

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

In [5]:
# Pure-Python cleaner for the raw answers: lowercases one line and strips the
# HTML markup from it before the text is written to the corpus file.
def custom_standardization(input_data):
    """Lowercase one raw line and remove HTML markup from it.

    Args:
        input_data: A Python ``str`` (the caller passes one line of the raw
            file at a time).

    Returns:
        The cleaned, lowercased ``str``. Newline characters are preserved.
        Any line that still contains a ``<...>`` tag once ``<p>``, ``</p>``
        and ``</a>`` have been removed is blanked completely (only its
        newline, if any, is left).
    """
    text = input_data.lower()

    # Drop whatever precedes the first '<p>' on the first line of the string
    # (the non-greedy '.' does not cross a newline).
    text = re.sub(r'^.*?<p>', '<p>', text)
    for tag in ('<p>', '</p>', '</a>'):
        text = text.replace(tag, '')

    # NOTE(review): this pattern is greedy on both sides, so it replaces the
    # WHOLE line containing any remaining tag with '<a>' (removed just below),
    # not only the tag itself. Presumably only the opening '<a ...>' tag was
    # meant to be stripped -- confirm before narrowing it, because doing so
    # changes the generated corpus.
    text = re.sub(r'.*\<(.*)\>.*', '<a>', text)

    # After the pattern above no '<...>' pair can remain on any line, so the
    # last three removals only take effect if that pattern is ever narrowed.
    for tag in ('<a>', '<pre><code>', '</code></pre>', '<br>'):
        text = text.replace(tag, '')

    return text

In [None]:
# Clean every line of Answers.csv and write the result to ./stackoverflow.txt.
# Both files are managed by `with`, so they are closed even if cleaning or
# writing raises part-way through. The input is opened first so that a missing
# Answers.csv does not truncate an existing output file.
with open('Answers.csv', encoding="ISO-8859-1") as raw_file, \
        open(os.path.join("./", "stackoverflow.txt"), 'w', encoding='utf-8') as f:
    for line in raw_file:
        line = custom_standardization(line)
        # A line reduced to just "\n" is still truthy and is written as a
        # blank line; only a completely empty string is skipped here.
        if line:
            f.write(line)

In [15]:
# Stream the cleaned corpus one line at a time, keeping only non-empty lines.
text_ds = tf.data.TextLineDataset("./stackoverflow.txt").filter(
    lambda line: tf.strings.length(line) > 0)

In [16]:
# Display the dataset's repr to check its element spec.
text_ds

<_FilterDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [17]:
# Standardization callable for the `TextVectorization` layer: lowercases the
# text and deletes every ASCII punctuation character.
# NOTE: this rebinds the name `custom_standardization`, replacing the
# pure-Python HTML cleaner defined earlier in the notebook.
def custom_standardization(input_data):
  punctuation_class = '[%s]' % re.escape(string.punctuation)
  lowered = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowered, punctuation_class, '')


# Upper bound on the vocabulary size, and the fixed number of tokens kept for
# each line of text.
vocab_size = 5096
sequence_length = 10

# `TextVectorization` normalizes, splits and maps strings to integers.
# `output_sequence_length` forces every encoded sample to the same length.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization,
    output_mode='int',
    output_sequence_length=sequence_length,
)

In [18]:
# Build the layer's vocabulary from the corpus, fed in batches of 1024 lines.
vectorize_layer.adapt(text_ds.batch(1024))

In [21]:
# Save the created vocabulary for reference. A token's position in this list
# is its integer id, so the list maps ids back to tokens.
inverse_vocab = vectorize_layer.get_vocabulary()
# Peek at a small slice of the vocabulary.
print(inverse_vocab[1000:1020])

['dictionary', 'validation', 'together', 'named', 'ex', 'ios', 'initialize', 'days', 'eclipse', 'special', 'removed', 'browsers', 'longer', 'shared', 'future', 'sent', 'shouldnt', 'visible', 'num', 'unsigned']


In [19]:
# Seed handed to the negative sampler when the training data is generated.
SEED = 42
# Sentinel used below as the prefetch buffer size, leaving the choice to tf.data.
AUTOTUNE = tf.data.AUTOTUNE

In [20]:
# Vectorize the data in text_ds: batch the lines, run each batch through
# `vectorize_layer`, then unbatch so every element is one encoded line.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

In [22]:
# Materialize every encoded line as a NumPy array inside one Python list.
# NOTE(review): this keeps the whole encoded corpus in memory at once.
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

18648272


In [45]:
# Show a handful of encoded lines next to their decoded tokens.
for seq in sequences[999:1005]:
  print("{} => {}".format(seq, [inverse_vocab[i] for i in seq]))

[0 0 0 0 0 0 0 0 0 0] => ['', '', '', '', '', '', '', '', '', '']
[ 179  342   12   87 2800  482  139  734    5 1128] => ['another', 'option', 'i', 'was', 'considering', 'rather', 'than', 'writing', 'a', 'native']
[  59    5    1   15 4498  514 1082  465  432    8] => ['its', 'a', '[UNK]', 'that', 'sun', 'never', 'included', 'anything', 'similar', 'in']
[0 0 0 0 0 0 0 0 0 0] => ['', '', '', '', '', '', '', '', '', '']
[364 452  93 432  56 118  12  24   5 241] => ['ive', 'got', 'something', 'similar', 'set', 'up', 'i', 'have', 'a', 'main']
[1 1 0 0 0 0 0 0 0 0] => ['[UNK]', '[UNK]', '', '', '', '', '', '', '', '']


In [46]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  """Build word2vec training examples from int-encoded sentences.

  Args:
    sequences: Iterable of int-encoded sentences (one sequence of token ids
      per sentence).
    window_size: Window size forwarded to the Keras `skipgrams` helper.
    num_ns: Number of negative samples drawn for each positive pair.
    vocab_size: Vocabulary size; used for the sampling table and as the
      exclusive upper bound of the negative sampler.
    seed: Seed forwarded to the negative sampler.

  Returns:
    A tuple `(targets, contexts, labels)` of equal-length Python lists. For
    each positive skip-gram pair: `targets` holds the target word id,
    `contexts` an int64 tensor of `num_ns + 1` ids (the true context word
    first, then the negative samples) and `labels` an int64 tensor
    `[1, 0, ..., 0]` of the same length.
  """
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # The label vector is the same for every example (one positive followed by
  # `num_ns` negatives), so build it once instead of once per pair. Tensors
  # are immutable, which makes sharing the same object across the list safe.
  label = tf.constant([1] + [0] * num_ns, dtype="int64")

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Build the context vector (for one target word): the true context id
      # first, followed by the sampled negatives.
      context = tf.concat([tf.squeeze(context_class, 1), negative_sampling_candidates], 0)

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [47]:
# Build the skip-gram training set from every encoded line, using
# window_size=2 and 4 negative samples per positive pair.
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED)

100%|██████████| 18648272/18648272 [30:38<00:00, 10143.00it/s] 


In [48]:
# Stack the per-example lists into dense NumPy arrays.
targets, contexts, labels = map(np.array, (targets, contexts, labels))

# Two blank lines, then the shape of each array on its own line.
print(
    '\n',
    f"targets.shape: {targets.shape}",
    f"contexts.shape: {contexts.shape}",
    f"labels.shape: {labels.shape}",
    sep='\n')



targets.shape: (17396955,)
contexts.shape: (17396955, 5)
labels.shape: (17396955, 5)


In [49]:
# Shuffle the ((target, context), label) examples through a fixed-size buffer
# and group them into full batches; the last partial batch is dropped.
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = (
    tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
print(dataset)

<_BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [50]:
# Cache the batched dataset and prefetch batches while the model trains.
# NOTE(review): the cache sits after shuffle(), so later epochs presumably
# replay the first epoch's batch order -- confirm this is intended.
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<_PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [51]:
class Word2Vec(tf.keras.Model):
  """Skip-gram word2vec model with separate target and context embeddings.

  The model scores a target word against each of its candidate context words
  by taking the dot product of their embeddings.
  """

  def __init__(self, vocab_size, embedding_dim):
    """Create the two embedding tables.

    Args:
      vocab_size: Number of rows in each embedding table.
      embedding_dim: Size of each embedding vector.
    """
    super(Word2Vec, self).__init__()
    # `input_length` is deliberately not passed: it is only a shape hint that
    # this subclassed model does not need, and supplying it made the class
    # depend on a module-level `num_ns` defined in a later notebook cell.
    self.target_embedding = layers.Embedding(vocab_size,
                                      embedding_dim,
                                      name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                       embedding_dim)

  def call(self, pair):
    """Return the logits for one batch of (target, context) pairs.

    Args:
      pair: Tuple `(target, context)`; `target` is `(batch,)` (or
        `(batch, 1)`, which is squeezed) and `context` is `(batch, context)`.

    Returns:
      Tensor of shape `(batch, context)` holding the dot product between each
      target embedding and each of its context embeddings.
    """
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots

In [52]:
def custom_loss(x_logit, y_true):
  """Element-wise sigmoid cross-entropy between logits and labels.

  NOTE(review): Keras invokes loss callables as `loss(y_true, y_pred)`, so
  passing this function directly to `compile(loss=...)` would bind the labels
  to `x_logit`. It is not used by the `compile` call in this notebook.
  """
  return tf.nn.sigmoid_cross_entropy_with_logits(labels=y_true, logits=x_logit)

In [53]:
# Model hyperparameters.
embedding_dim = 256
num_ns = 4

# Build and compile the model.
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(
    optimizer='adam',
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Keep only the checkpoint with the highest training accuracy seen so far.
checkpoint_filepath = './checkpoint.model.keras'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_best_only=True,
    monitor='accuracy',
    mode='max',
)

# Stop once an epoch fails to lower the training loss by at least 0.005.
earlystop_callback = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    min_delta=0.005,
    patience=0,
)

In [54]:
# Write TensorBoard logs for the training run to the "logs" directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [55]:
# Train for at most 20 epochs; the early-stopping callback may end the run sooner.
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback,model_checkpoint_callback, earlystop_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


<keras.src.callbacks.History at 0xa0230db10>

In [56]:
# Load the TensorBoard notebook extension (needed for the %tensorboard magic).
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [57]:
#docs_infra: no_execute
# Open TensorBoard inline on the "logs" directory written during training.
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 62463), started 1 day, 2:28:05 ago. (Use '!kill 62463' to kill it.)

In [58]:
# Trained target-embedding matrix (row i belongs to token id i) and the
# vocabulary list that maps those ids to tokens.
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [59]:
# Export the embeddings as two line-aligned TSV files: vectors.tsv holds one
# tab-separated embedding vector per line and metadata.tsv the matching token.
# Both files are managed by `with`, so they are flushed and closed even if
# the loop raises part-way through.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")