In [2]:
import io
import os
import re
import shutil
import string
import tensorflow as tf
from tensorflow.keras import layers
import tqdm
import numpy as np

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

In [2]:
# A custom standardization function that mainly strips HTML tags.
def custom_standardization(input_data):
    """Lower-case ``input_data`` and remove a fixed set of HTML tags from it.

    Only the tag markup is removed; the text between the tags is kept and
    newlines are left in place.

    Args:
        input_data: The raw text (a Python ``str``).

    Returns:
        The lower-cased text without ``<p>``, ``</p>``, ``<a ...>``, ``</a>``,
        ``<pre><code>``, ``</code></pre>`` and ``<br>`` markup.
    """
    input_data = input_data.lower()
    # Drop whatever precedes the first <p> when it occurs on the first line
    # ('^' anchors at the start of the string and '.' does not cross newlines).
    input_data = re.sub(r'^.*?<p>', '<p>', input_data)
    input_data = re.sub(r'<p>', '', input_data)
    input_data = re.sub(r'</p>', '', input_data)
    input_data = re.sub(r'</a>', '', input_data)
    # Remove opening anchor tags together with their attributes. The pattern
    # is limited to the tag itself so the surrounding text on the line
    # survives; '\b' keeps tags such as <article> from matching.
    input_data = re.sub(r'<a\b[^>]*>', '', input_data)
    input_data = re.sub(r'<pre><code>', '', input_data)
    input_data = re.sub(r'</code></pre>', '', input_data)
    input_data = re.sub(r'<br>', '', input_data)

    return input_data

In [3]:
# gensim's preprocess_documents function automatically stems the tokens,
# which is not desired here,
# hence this customized gensim_strip function

#from gensim.parsing.preprocessing import preprocess_documents
from gensim.parsing.preprocessing import strip_tags
from gensim.parsing.preprocessing import strip_short
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.parsing.preprocessing import strip_multiple_whitespaces

def gensim_strip(input_data):
    """Clean one string with gensim's individual filters (no stemming).

    The filters run in this order: lower-casing, tag stripping, removal of
    tokens shorter than three characters, punctuation stripping, digit
    stripping, non-alphanumeric stripping and whitespace collapsing.
    """
    text = strip_tags(input_data.lower())
    text = strip_punctuation(strip_short(text, minsize = 3))
    text = strip_non_alphanum(strip_numeric(text))
    return strip_multiple_whitespaces(text)

In [4]:
import spacy

# Load spaCy's small English pipeline with the parser and NER components
# disabled; this notebook only reads token lemmas from the resulting docs.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)

In [5]:
# Preprocess the data and store the cleaned lines in "./stackoverflow.txt".
# Both files are opened in a `with` block so that they are closed even if
# cleaning or lemmatization raises part-way through the corpus.
with open('Answers.csv', encoding = "ISO-8859-1") as raw_file, \
        open(os.path.join("./", "stackoverflow.txt"), 'w', encoding='utf-8') as f:
    for line in raw_file:
        line = gensim_strip(line)
        doc = nlp(line)
        line = " ".join([token.lemma_ for token in doc])
        # Re-apply the short-token filter to the lemmatized text.
        out = strip_short(line, minsize = 3)
        # Lines that end up empty after cleaning are not written.
        if out:
            f.write(f"{out}\n")

In [6]:
# tf-idf model to find the top 500 frequent words
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the cleaned corpus back as UTF-8 (the encoding it was written with)
# instead of the platform default; the `with` block closes the file handle.
with open("stackoverflow.txt", "r", encoding="utf-8") as my_file:
    data = my_file.read()
data_into_list = data.split("\n")

# max_features=500 restricts the vocabulary to 500 terms; English stop words
# are excluded before the vocabulary is built.
vectorizer = TfidfVectorizer(max_features = 500, stop_words = 'english')
X = vectorizer.fit_transform(data_into_list)
common = vectorizer.get_feature_names_out()

# print out the frequent words
common

array(['able', 'access', 'account', 'action', 'activity', 'actually',
       'add', 'address', 'ajax', 'alert', 'allow', 'amp', 'android',
       'answer', 'api', 'app', 'append', 'application', 'apply',
       'approach', 'args', 'argument', 'arr', 'array', 'article', 'ask',
       'asp', 'assign', 'assume', 'attr', 'attribute', 'auto',
       'available', 'avoid', 'background', 'bar', 'base', 'begin', 'bind',
       'bit', 'block', 'body', 'bool', 'boolean', 'border', 'box',
       'break', 'browser', 'buffer', 'build', 'button', 'byte', 'cache',
       'callback', 'case', 'cast', 'catch', 'category', 'cause', 'cell',
       'center', 'change', 'char', 'character', 'check', 'child', 'class',
       'clear', 'click', 'client', 'close', 'code', 'col', 'collection',
       'color', 'column', 'com', 'come', 'command', 'comment', 'common',
       'compare', 'compile', 'compiler', 'complete', 'component',
       'condition', 'config', 'configuration', 'connect', 'connection',
       'consi

In [7]:
# Save the word list to the 'Top 500.txt' file, one word per line. The
# encoding is given explicitly so the output does not depend on the platform
# default, matching the UTF-8 used for the cleaned corpus file.
with open('Top 500.txt', 'w', encoding='utf-8') as f:
    for line in common:
        f.write(f"{line}\n")

In [3]:
# Stream the cleaned corpus one line per element, keeping only lines whose
# string length is non-zero.
text_ds = (
    tf.data.TextLineDataset("./stackoverflow.txt")
    .filter(lambda text_line: tf.cast(tf.strings.length(text_line), bool))
)

In [4]:
# Display the dataset object (its element spec) as the cell output.
text_ds

<_FilterDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [5]:
# Define the vocabulary size and the number of words in a sequence.
# vocab_size is passed to the TextVectorization layer as `max_tokens` and
# sequence_length as its `output_sequence_length`.
vocab_size = 5096
sequence_length = 10

# Use the `TextVectorization` layer to normalize, split, and map strings to
# integers. Set the `output_sequence_length` length to pad all samples to the
# same length.

def custom_standardization1(input_data):
  """Lower-case the input strings and delete every `string.punctuation` character."""
  # Character class matching any single punctuation character, regex-escaped.
  punctuation_class = '[%s]' % re.escape(string.punctuation)
  lowered = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowered, punctuation_class, '')

# Text-to-integer layer: each string is standardized with
# custom_standardization1, mapped to integer ids from a vocabulary capped at
# `vocab_size` entries, and output as `sequence_length` ids per sample.
vectorize_layer = layers.TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization1,
    output_sequence_length=sequence_length,
    output_mode='int')

In [6]:
# Fit the layer's vocabulary on the corpus, fed in batches of 1024 lines.
vectorize_layer.adapt(text_ds.batch(1024))

In [7]:
# Save the created vocabulary for reference. The list is indexed by integer
# token id, so inverse_vocab[i] is the token for id i.
inverse_vocab = vectorize_layer.get_vocabulary()
# Print a 20-entry slice as a spot check.
print(inverse_vocab[1000:1020])

['exec', 'sys', 'minute', 'protocol', 'live', 'timeout', 'invalid', 'student', 'nbsp', 'native', 'graph', 'city', 'complex', 'pos', 'quick', 'together', 'shouldn', 'hit', 'company', 'employee']


In [8]:
# Seed handed to the negative-sample generator when building training data.
SEED = 42
# Lets tf.data pick prefetch buffer sizes itself.
AUTOTUNE = tf.data.AUTOTUNE

In [9]:
# Vectorize the data in text_ds: batch the lines, run every batch through
# vectorize_layer, then unbatch back to one integer sequence per element.
text_vector_ds = (
    text_ds
    .batch(1024)
    .prefetch(AUTOTUNE)
    .map(vectorize_layer)
    .unbatch()
)

In [10]:
# Display the vectorized dataset object (its element spec) as the cell output.
text_vector_ds

<_UnbatchDataset element_spec=TensorSpec(shape=(10,), dtype=tf.int64, name=None)>

In [11]:
# Materialize every vectorized sequence in memory as a list of NumPy arrays
# and report how many there are.
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

18366634


In [13]:
# Decode a handful of sequences back to tokens to sanity-check the encoding.
for seq in sequences[990:995]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

[  39  188    3   75    6 2059  322 1233    2  348] => ['one', 'thing', 'you', 'find', 'that', 'selenium', 'seem', 'restart', 'the', 'browser']
[   7    1  596 2580   23    2  348 1096  571    2] => ['use', '[UNK]', 'far', 'effective', 'but', 'the', 'browser', 'selection', 'limit', 'the']
[   1    4 1695 1299  163    5  103  376 2567  123] => ['[UNK]', 'and', 'hopefully', 'temporary', 'solution', 'this', 'problem', 'python', 'cgi', 'script']
[930 653 376   0   0   0   0   0   0   0] => ['usr', 'bin', 'python', '', '', '', '', '', '', '']
[2567  123  922 3350 1440  291  397 1970 2355    0] => ['cgi', 'script', 'produce', 'rss', 'feed', 'top', 'level', 'gallery', 'album', '']


In [14]:
# Builds skip-gram training examples with negative sampling from a list of
# int-encoded sentences.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  """Turn int-encoded sequences into (target, context, label) training examples.

  Args:
    sequences: Iterable of int-encoded sentences.
    window_size: Skip-gram window size passed to `skipgrams`.
    num_ns: Number of negative samples drawn per positive pair.
    vocab_size: Vocabulary size; used for the sampling table, the skip-gram
      generator and as the upper bound of the negative sampler.
    seed: Seed passed to the negative candidate sampler.

  Returns:
    Three parallel lists `(targets, contexts, labels)`. Each `contexts` entry
    holds the positive context id followed by `num_ns` sampled ids, and each
    `labels` entry is a 1 followed by `num_ns` zeros.
  """
  target_ids, context_rows, label_rows = [], [], []

  # Sampling table for `vocab_size` tokens, consumed by `skipgrams` below.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  for sequence in tqdm.tqdm(sequences):

    # Positive (target, context) pairs only; negatives are drawn separately.
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    for target_word, context_word in positive_skip_grams:
      # The positive context id as a length-1 vector; the sampler wants it
      # with an extra axis, i.e. shape (1, 1).
      positive = tf.constant([context_word], dtype="int64")
      true_class = tf.expand_dims(positive, 1)
      negatives, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=true_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # One training example: the positive id first, then the sampled ids,
      # with a matching label vector.
      target_ids.append(target_word)
      context_rows.append(tf.concat([positive, negatives], 0))
      label_rows.append(tf.constant([1] + [0]*num_ns, dtype="int64"))

  return target_ids, context_rows, label_rows

In [15]:
# Build the training examples from every sequence, using a skip-gram window
# size of 2 and 4 negative samples per positive pair.
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED)

100%|██████████| 18366634/18366634 [45:37<00:00, 6708.81it/s]  


In [17]:
# Number of generated training examples.
len(targets)

33258821

In [18]:
# Convert the three example lists to NumPy arrays (rebinding the same names)
# and report their shapes.
targets, contexts, labels = (
    np.array(column) for column in (targets, contexts, labels)
)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")



targets.shape: (33258821,)
contexts.shape: (33258821, 5)
labels.shape: (33258821, 5)


In [19]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
# Pair every (target, contexts) input with its label row, shuffle with a
# BUFFER_SIZE-element buffer and emit batches of exactly BATCH_SIZE examples
# (a trailing partial batch is dropped).
dataset = (
    tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
print(dataset)

<_BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [20]:
# Cache the batched dataset and prefetch upcoming batches.
# NOTE(review): cache() is applied after shuffle(), so the cached batch order
# may simply be replayed on later epochs - confirm this is intended.
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<_PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [21]:
class Word2Vec(tf.keras.Model):
  """Skip-gram model that scores a target word against candidate context words.

  Two embedding tables of identical shape are kept: one looked up for the
  target word (named "w2v_embedding") and one for the context words. The
  output for each candidate is the dot product of the two embeddings.

  Args:
    vocab_size: Number of rows in each embedding table.
    embedding_dim: Width of each embedding vector.
  """

  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    # No `input_length` is passed to the embeddings: nothing in this model
    # needs a static sequence length, and omitting it keeps the class
    # independent of notebook-level globals such as `num_ns`.
    self.target_embedding = layers.Embedding(vocab_size,
                                      embedding_dim,
                                      name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                       embedding_dim)

  def call(self, pair):
    """Return one logit per candidate context word.

    Args:
      pair: A `(target, context)` tuple of integer id tensors, with `target`
        shaped (batch,) or (batch, 1) and `context` shaped (batch, context).

    Returns:
      A (batch, context) tensor of target/context dot products.
    """
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots

In [22]:
def custom_loss(x_logit, y_true):
  """Element-wise sigmoid cross-entropy between logits and labels.

  The logits are the first argument and the labels the second.
  NOTE(review): Keras invokes loss callables as (y_true, y_pred), the reverse
  of this order - confirm before passing this function to `compile`.
  """
  per_element_loss = tf.nn.sigmoid_cross_entropy_with_logits(
      labels=y_true, logits=x_logit)
  return per_element_loss

In [43]:
# Model hyperparameters: embedding width and the number of negative samples
# per positive context word.
embedding_dim = 200
num_ns = 4

# Build the model and train it with Adam, scoring the (batch, context) logits
# against the one-hot label rows via categorical cross-entropy.
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(
    optimizer='adam',
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Checkpoint callback: only overwrite the saved model when the monitored
# 'accuracy' metric reaches a new maximum.
checkpoint_filepath = './checkpoint.model.keras'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_best_only=True,
    monitor='accuracy',
    mode='max')

# Early stopping: watch 'loss' and stop when it has not improved by at least
# 0.001, with a patience of one epoch.
earlystop_callback = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    min_delta=0.001,
    patience=1)

In [44]:
# Log training progress to the "logs" directory for TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [45]:
# Train for at most 30 epochs; the early-stopping callback can end it sooner.
word2vec.fit(dataset, epochs=30, callbacks=[tensorboard_callback,model_checkpoint_callback, earlystop_callback])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30


<keras.src.callbacks.History at 0x3ddab1660>

In [46]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [47]:
#docs_infra: no_execute
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 3603), started 0:52:29 ago. (Use '!kill 3603' to kill it.)

In [48]:
# Fetch the trained target-embedding matrix (the layer named 'w2v_embedding')
# and the vocabulary list; vocabulary positions index rows of the matrix.
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [49]:
# Export the embeddings: vectors.tsv gets one tab-separated vector per line
# and metadata.tsv the matching token per line. Opening both files in a
# `with` block guarantees they are flushed and closed even if a write fails.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")