In [1]:
import numpy as np
import tensorflow as tf
import os
import string
import re
import tqdm
from tensorflow.keras import layers

In [2]:
vocab_size = 4000
sequence_length = 15

In [3]:
with open(r'dataset\Sonnet.txt') as f:
    lines = f.read().splitlines()
for line in lines[: 20]:
    print(line)

THE SONNETS

by William Shakespeare


From fairest creatures we desire increase,
That thereby beauty’s rose might never die,
But as the riper should by time decease,
His tender heir might bear his memory:
But thou, contracted to thine own bright eyes,
Feed’st thy light’s flame with self-substantial fuel,
Making a famine where abundance lies,
Thyself thy foe, to thy sweet self too cruel:
Thou that art now the world’s fresh ornament,
And only herald to the gaudy spring,
Within thine own bud buriest thy content,
And tender churl mak’st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world’s due, by the grave and thee.



In [4]:
def create_dataset(sequence, window_size, num_ns_per_pos, vocab_size, seed = 42):
    target, context, label = [], [], []
    #used to process stopwords in vocabulary
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    for seq in tqdm.tqdm(sequence):
        skip_pair, _ = tf.keras.preprocessing.sequence.skipgrams(
            seq,
            vocabulary_size = vocab_size,
            sampling_table = sampling_table,
            negative_samples = 0,
            window_size = window_size
        )

        for t, c in skip_pair:
            context_class = tf.expand_dims(
          tf.constant([c], dtype="int64"), 1)
            negative_sample, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes = context_class,
                unique = True,
                num_true = 1,
                num_sampled = num_ns_per_pos,
                range_max = vocab_size,
                seed = seed,
                name = 'negative_sampling'
            )
            context_ar = tf.concat([tf.squeeze(context_class, 1), negative_sample], 0)
            label_ar = tf.constant([1] + [0] * num_ns_per_pos, dtype = tf.int64)

            target.append(t)
            context.append(context_ar)
            label.append(label_ar)
    
    return target, context, label

In [5]:
text_data = tf.data.TextLineDataset(r'dataset\Sonnet.txt').filter(lambda x: tf.cast(tf.strings.length(x), bool))

In [6]:
def custom_standard(inp):
    inp = tf.strings.lower(inp)
    return tf.strings.regex_replace(inp, '[%s]' % re.escape(string.punctuation), '')

vectorize_layer = tf.keras.layers.TextVectorization(
    standardize = custom_standard,
    max_tokens = vocab_size,
    output_mode = 'int',
    output_sequence_length= sequence_length
)

vectorize_layer.adapt(text_data.batch(1024))

In [7]:
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[: 20])

['', '[UNK]', np.str_('and'), np.str_('the'), np.str_('to'), np.str_('my'), np.str_('of'), np.str_('i'), np.str_('that'), np.str_('in'), np.str_('thy'), np.str_('thou'), np.str_('with'), np.str_('for'), np.str_('is'), np.str_('not'), np.str_('me'), np.str_('but'), np.str_('a'), np.str_('thee')]


In [8]:
text_vector_ds = text_data.batch(1024).prefetch(tf.data.AUTOTUNE).map(vectorize_layer).unbatch()

In [9]:
inp_data = list(text_vector_ds.as_numpy_iterator())

In [10]:
targets, contexts, labels = create_dataset(
    sequence = inp_data,
    window_size = 2,
    num_ns_per_pos = 4,
    vocab_size = vocab_size,
)

targets = np.array(targets)
contexts = np.array([np.array(ctx) for ctx in contexts])
labels = np.array([np.array(lbl) for lbl in labels])

print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")


100%|██████████| 2157/2157 [00:01<00:00, 2120.87it/s]

targets.shape: (6584,)
contexts.shape: (6584, 5)
labels.shape: (6584, 5)





In [11]:
def word2vec_model(vocab_size, embedding_dim):
    target_input = tf.keras.Input(shape=(1,), name="target")
    context_input = tf.keras.Input(shape=(None,), name="context")
    target_embedding = layers.Embedding(vocab_size, embedding_dim, name="w2v_embedding")
    context_embedding = layers.Embedding(vocab_size, embedding_dim)
    target_emb = layers.Lambda(lambda x: tf.squeeze(target_embedding(x), axis=1))(target_input)
    context_emb = context_embedding(context_input)
    dots = layers.Lambda(lambda inputs: tf.einsum('be,bce->bc', inputs[0], inputs[1]))([target_emb, context_emb])

    model = tf.keras.Model(inputs=[target_input, context_input], outputs=dots)
    return model

In [12]:
word2vec = word2vec_model(vocab_size, 128)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])
word2vec.summary()




In [13]:
if contexts.ndim == 3:
    contexts = contexts.reshape(-1, contexts.shape[1])
if labels.ndim == 3:
    labels = labels.reshape(-1, labels.shape[1])

word2vec.fit([targets, contexts], labels, epochs=20)

Epoch 1/20




[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.2133 - loss: 1.6094
Epoch 2/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6744 - loss: 1.5917
Epoch 3/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7969 - loss: 1.5730
Epoch 4/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8292 - loss: 1.5545
Epoch 5/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8498 - loss: 1.5361
Epoch 6/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8560 - loss: 1.5179
Epoch 7/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8525 - loss: 1.4995
Epoch 8/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8515 - loss: 1.4817
Epoch 9/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x2331e8d93d0>