In [9]:
import io
import re
import string
import tqdm
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

In [2]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Sentence
0,0,The Cardiff Roller Collective (CRoC) are a rol...
1,1,"""Go! Pack Go!"" is the fight song of the Green ..."
2,2,Al-Machriq (English translation: The East) was...
3,3,Ajman International Airport (Arabic: مطار عجما...
4,4,Kapla is a construction set for children and a...


In [3]:
text = df['Sentence'].values
print(text[:5])

['The Cardiff Roller Collective (CRoC) are a roller sports league based in Cardiff, Wales. Founded'
 '"Go! Pack Go!" is the fight song of the Green Bay Packers, and the first'
 'Al-Machriq (English translation: The East) was a journal founded in 1898 by Jesuit and Chaldean'
 'Ajman International Airport (Arabic: مطار عجمان الدولي) was an upcoming airport which is currently a'
 'Kapla is a construction set for children and adults. The sets consist only of identical']


In [10]:
def clean_text(sentence):
    pattern = "^[a-zA-Z0-9 ]+$"
    tokens = word_tokenize(sentence)
    tokens = [token.lower() for token in tokens if re.match(pattern, token)]

    return tokens

In [11]:
import collections

tokens2int = collections.defaultdict(lambda: len(tokens2int))


In [13]:
def tokens_2_int(text):
    token_list = clean_text(text)
    return [tokens2int[token] for token in token_list]

def text_list_2_int(text_list):
    return [tokens_2_int(text) for text in text_list]

In [19]:
df['Vectors'] = text_list_2_int(df['Sentence'])
df.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Vectors
0,0,The Cardiff Roller Collective (CRoC) are a rol...,"[0, 1, 2, 3, 4, 5, 6, 2, 7, 8, 9, 10, 1, 11, 12]"
1,1,"""Go! Pack Go!"" is the fight song of the Green ...","[13, 14, 13, 15, 0, 16, 17, 18, 0, 19, 20, 21,..."
2,2,Al-Machriq (English translation: The East) was...,"[24, 25, 0, 26, 27, 6, 28, 12, 10, 29, 30, 31,..."
3,3,Ajman International Airport (Arabic: مطار عجما...,"[33, 34, 35, 36, 27, 37, 38, 35, 39, 15, 40, 6]"
4,4,Kapla is a construction set for children and a...,"[41, 15, 6, 42, 43, 44, 45, 22, 46, 0, 47, 48,..."


In [15]:
tokens2int

defaultdict(<function __main__.<lambda>()>,
            {'the': 0,
             'cardiff': 1,
             'roller': 2,
             'collective': 3,
             'croc': 4,
             'are': 5,
             'a': 6,
             'sports': 7,
             'league': 8,
             'based': 9,
             'in': 10,
             'wales': 11,
             'founded': 12,
             'go': 13,
             'pack': 14,
             'is': 15,
             'fight': 16,
             'song': 17,
             'of': 18,
             'green': 19,
             'bay': 20,
             'packers': 21,
             'and': 22,
             'first': 23,
             'english': 24,
             'translation': 25,
             'east': 26,
             'was': 27,
             'journal': 28,
             '1898': 29,
             'by': 30,
             'jesuit': 31,
             'chaldean': 32,
             'ajman': 33,
             'international': 34,
             'airport': 35,
             'arabic': 36,

In [20]:
int2tokens = {value: key for key, value in tokens2int.items()}
int2tokens

{0: 'the',
 1: 'cardiff',
 2: 'roller',
 3: 'collective',
 4: 'croc',
 5: 'are',
 6: 'a',
 7: 'sports',
 8: 'league',
 9: 'based',
 10: 'in',
 11: 'wales',
 12: 'founded',
 13: 'go',
 14: 'pack',
 15: 'is',
 16: 'fight',
 17: 'song',
 18: 'of',
 19: 'green',
 20: 'bay',
 21: 'packers',
 22: 'and',
 23: 'first',
 24: 'english',
 25: 'translation',
 26: 'east',
 27: 'was',
 28: 'journal',
 29: '1898',
 30: 'by',
 31: 'jesuit',
 32: 'chaldean',
 33: 'ajman',
 34: 'international',
 35: 'airport',
 36: 'arabic',
 37: 'an',
 38: 'upcoming',
 39: 'which',
 40: 'currently',
 41: 'kapla',
 42: 'construction',
 43: 'set',
 44: 'for',
 45: 'children',
 46: 'adults',
 47: 'sets',
 48: 'consist',
 49: 'only',
 50: 'identical',
 51: 'akmal',
 52: 'ikramovich',
 53: 'ikramov',
 54: 'russia',
 55: 'uzbek',
 56: 'ikromovich',
 57: 'ikromov',
 58: '13',
 59: 'march',
 60: 'alabama',
 61: 'gang',
 62: 'nickname',
 63: 'group',
 64: 'nascar',
 65: 'drivers',
 66: 'subsequently',
 67: 'their',
 68: 'lori',

In [6]:
import string

In [46]:
vocabulary = list(tokens2int.keys())
vocabulary


['the',
 'cardiff',
 'roller',
 'collective',
 'croc',
 'are',
 'a',
 'sports',
 'league',
 'based',
 'in',
 'wales',
 'founded',
 'go',
 'pack',
 'is',
 'fight',
 'song',
 'of',
 'green',
 'bay',
 'packers',
 'and',
 'first',
 'english',
 'translation',
 'east',
 'was',
 'journal',
 '1898',
 'by',
 'jesuit',
 'chaldean',
 'ajman',
 'international',
 'airport',
 'arabic',
 'an',
 'upcoming',
 'which',
 'currently',
 'kapla',
 'construction',
 'set',
 'for',
 'children',
 'adults',
 'sets',
 'consist',
 'only',
 'identical',
 'akmal',
 'ikramovich',
 'ikramov',
 'russia',
 'uzbek',
 'ikromovich',
 'ikromov',
 '13',
 'march',
 'alabama',
 'gang',
 'nickname',
 'group',
 'nascar',
 'drivers',
 'subsequently',
 'their',
 'lori',
 'lemaris',
 'fictional',
 'mermaid',
 'dc',
 'comics',
 'romantic',
 'interest',
 'superman',
 'pasties',
 'singular',
 'pasty',
 'or',
 'pastie',
 'patches',
 'that',
 'cover',
 'person',
 'nipples',
 'areolae',
 'typically',
 'shinto',
 'japanese',
 'romanized',

In [26]:
vocab_size = 4096
sequence_length = 10

vectorize_layer = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [32]:
# Construct tf dataset
text = df['Sentence'].tolist()
# text = tf.data.Dataset.from_tensor_slices(text).filter(lambda x: tf.cast(tf.strings.length(x), bool))
print(text[:5])

['The Cardiff Roller Collective (CRoC) are a roller sports league based in Cardiff, Wales. Founded', '"Go! Pack Go!" is the fight song of the Green Bay Packers, and the first', 'Al-Machriq (English translation: The East) was a journal founded in 1898 by Jesuit and Chaldean', 'Ajman International Airport (Arabic: مطار عجمان الدولي) was an upcoming airport which is currently a', 'Kapla is a construction set for children and adults. The sets consist only of identical']


In [47]:
# Vecotorisation
vectorize_layer.adapt(text)

vectorize_sequences = vectorize_layer(text)
vectorize_sequences_np = vectorize_sequences.numpy()

for seq in vectorize_sequences[:5]:
   print(seq)

tf.Tensor([  4 224 145 930 901  45   3 145  76 722], shape=(10,), dtype=int64)
tf.Tensor([194 602 194   2   4 839 464   6   4 192], shape=(10,), dtype=int64)
tf.Tensor([1041   54  398    4  207    9    3  747  100    5], shape=(10,), dtype=int64)
tf.Tensor([1066   63   24   12  288  292  301    9    8  383], shape=(10,), dtype=int64)
tf.Tensor([ 739    2    3  918  491   17  945    7 1084    4], shape=(10,), dtype=int64)


In [37]:
print(vectorize_layer.get_vocabulary())

['', '[UNK]', 'is', 'a', 'the', 'in', 'of', 'and', 'an', 'was', 'born', 'by', 'arabic', 'american', 'al', 'to', 'or', 'for', 'from', '–', 'that', 'published', 'fictional', 'as', 'airport', 'newspaper', 'jazeera', 'former', 'university', 'on', 'ngc', 'located', 'german', 'company', 'comic', 'college', 'appearing', 'who', 'which', 'town', 'politician', 'one', 'its', 'it', 'books', 'are', 'also', 'team', 'swedish', 'professional', 'october', 'november', 'march', 'january', 'english', 'district', 'comics', 'character', 'august', 'us', 'uk', 'network', 'known', 'international', 'artist', 'akira', 'air', '20', '13', 'with', 'when', 'western', 'weekly', 'used', 'urdu', 'superhero', 'sports', 'served', 'september', 'russian', 'richard', 'region', 'pronunciation', 'painter', 'nebula', 'name', 'mozzarella', 'media', 'meaning', 'may', 'marvel', 'lit', 'june', 'james', 'including', 'he', 'has', 'government', 'given', 'general', 'founded', 'film', 'december', 'dc', 'daily', 'czech', 'chinese', 'car

In [43]:
# Generate training data
# Generates skip-gram pairs with negative sampling for a list of sequences
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):

  targets, contexts, labels = [], [], []

  # Build the sampling table for vocab_size tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

 
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      context = tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [48]:
targets, contexts, labels = generate_training_data(
    sequences=vectorize_sequences_np,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=42)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print(targets.shape, contexts.shape, labels.shape)


100%|██████████| 160/160 [00:00<00:00, 706.09it/s]

(627,) (627, 5) (627, 5)





In [49]:
# Configure the dataset for performance
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<_BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [51]:
# to improve performance 
AUTOTUNE = tf.data.AUTOTUNE
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<_PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [56]:
# Model trainig
embedding_dim = 128
class Word2Vec(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.target_embedding = layers.Embedding(vocab_size, 
                                                 embedding_dim, 
                                                 input_length=1, 
                                                 name='w2v_embedding')
        self.context_embedding = layers.Embedding(vocab_size, 
                                                 embedding_dim)
        
    def call(self, pair):
        target, context = pair

        # context: batch, context
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)

        # target: (batch,)
        word_emb = self.target_embedding(target)
        # word_emb: (batch, embed)
        context_emb = self.context_embedding(context)
        # context_emb: (batch, context, embed)
        dots = tf.einsum('be,bce->bc', word_emb, context_emb)
        return dots

In [63]:
# loss function and model compile
def custom_loss(x_logit, y_true):
      return tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=y_true)

In [64]:
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

In [65]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [66]:
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

Epoch 1/20


2024-07-03 13:52:27.027376: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


TypeError: 'NoneType' object is not iterable