In [33]:
import pandas as pd
from sklearn.datasets import load_files

In [34]:
data = load_files('wkp_sorted')

sentences = []

for file in data.filenames:
    with open(file, 'r') as f:
        words = []
        for line in f:
            words.extend(line.split())
        first_15 = words[:15]
        sentence = ' '.join(first_15)
        sentences.append(sentence)


df = pd.DataFrame(sentences, columns=['Sentence'])
df.to_csv('data.csv')

print(df.head())


                                            Sentence
0  The Cardiff Roller Collective (CRoC) are a rol...
1  "Go! Pack Go!" is the fight song of the Green ...
2  Al-Machriq (English translation: The East) was...
3  Ajman International Airport (Arabic: مطار عجما...
4  Kapla is a construction set for children and a...


In [35]:
# Word2Vec
# from scipy.linalg.special_matrices import triu
from gensim.models import Word2Vec

# Load data
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Sentence
0,0,The Cardiff Roller Collective (CRoC) are a rol...
1,1,"""Go! Pack Go!"" is the fight song of the Green ..."
2,2,Al-Machriq (English translation: The East) was...
3,3,Ajman International Airport (Arabic: مطار عجما...
4,4,Kapla is a construction set for children and a...


In [36]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import numpy as np

In [37]:

# Remove stopwords and punctuation
def remove_stopwords(sentence):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(sentence)
    filtered_sentence = [w for w in word_tokens if not w in stop_words and w.isalpha()]
    # remove numbers
    filtered_sentence = [w for w in filtered_sentence if not w.isdigit()]
    return ' '.join(filtered_sentence)


data['Sentence'] = data['Sentence'].apply(remove_stopwords)
data.head()




d = []
for sentence in data['Sentence']:
    tok = []
    for word in word_tokenize(sentence):
        tok.append(word.lower())
    d.append(tok)


data['Tokenized'] = d
data.head()


Unnamed: 0.1,Unnamed: 0,Sentence,Tokenized
0,0,The Cardiff Roller Collective CRoC roller spor...,"[the, cardiff, roller, collective, croc, rolle..."
1,1,Go Pack Go fight song Green Bay Packers first,"[go, pack, go, fight, song, green, bay, packer..."
2,2,English translation The East journal founded J...,"[english, translation, the, east, journal, fou..."
3,3,Ajman International Airport Arabic مطار عجمان ...,"[ajman, international, airport, arabic, مطار, ..."
4,4,Kapla construction set children adults The set...,"[kapla, construction, set, children, adults, t..."


In [38]:
model = Word2Vec(data['Tokenized'], window=5, min_count=1, workers=4)
word_vectors = model.wv
model.save('word2vec.bin')
print(model.wv)

KeyedVectors<vector_size=100, 975 keys>


In [39]:
model = Word2Vec.load('word2vec.bin')
print(model)

Word2Vec<vocab=975, vector_size=100, alpha=0.025>


In [41]:
# Mean of word vectors
def mean_vector(words, model):
    # remove out of vocabulary words
    words = [word for word in words if word in model.wv]
    if len(words) >= 1:
        return np.mean(model.wv[words], axis=0)
    else:
        return []
    
data['Mean'] = data['Tokenized'].apply(lambda x: mean_vector(x, model))
data.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Tokenized,Mean
0,0,The Cardiff Roller Collective CRoC roller spor...,"[the, cardiff, roller, collective, croc, rolle...","[-0.0012059944, 0.00057655765, 0.0004546229, 0..."
1,1,Go Pack Go fight song Green Bay Packers first,"[go, pack, go, fight, song, green, bay, packer...","[0.0023387286, 0.0013184368, 0.0018480197, 0.0..."
2,2,English translation The East journal founded J...,"[english, translation, the, east, journal, fou...","[-0.0027212065, 0.0029319907, 0.002298687, -0...."
3,3,Ajman International Airport Arabic مطار عجمان ...,"[ajman, international, airport, arabic, مطار, ...","[0.00021810588, 0.001563381, 0.0033868733, -0...."
4,4,Kapla construction set children adults The set...,"[kapla, construction, set, children, adults, t...","[0.00047397893, 0.0011637352, 0.003106683, -0...."


## Skip Grams

In [5]:
import io
import re
import string
import tqdm

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

2024-07-03 09:44:25.183322: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-03 09:44:25.872984: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-03 09:44:26.901204: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-03 09:44:27.577377: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-03 09:44:27.581029: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-03 09:44:29.040701: I tensorflow/core/platform/cpu_feature_guard.cc:

In [6]:
%reload_ext tensorboard

In [7]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [9]:
#  Vetorize a sample
sen = "The wide road shimmered in the hot sun"
tokens = list(sen.lower().split())
print(len(tokens), tokens)

8 ['the', 'wide', 'road', 'shimmered', 'in', 'the', 'hot', 'sun']


In [17]:
# Save mapping fron token to integer indices
import collections

tokens2int = collections.defaultdict(lambda: len(tokens2int))
tokens2int['<pad>'] # add a padding token
token_ids = [tokens2int[token] for token in tokens]
vocab_size = (len(tokens2int))

In [12]:
vocab, index = {}, 1  # start indexing from 1
vocab['<pad>'] = 0  # add a padding token
for token in tokens:
  if token not in vocab:
    vocab[token] = index
    index += 1
vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'the': 1, 'wide': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}


In [13]:
int2tokens = {index: token for token, index in vocab.items()}
print(int2tokens)

{0: '<pad>', 1: 'the', 2: 'wide', 3: 'road', 4: 'shimmered', 5: 'in', 6: 'hot', 7: 'sun'}


In [18]:
# Generate skip-grams
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      token_ids, 
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0)
print(len(positive_skip_grams))

26


In [23]:
for target, context in positive_skip_grams[:5]:
  print(f"({target}, {context}): ({int2tokens[target]}, {int2tokens[context]})")

(6, 7): (hot, sun)
(1, 3): (the, road)
(5, 1): (in, the)
(5, 3): (in, road)
(3, 2): (road, wide)


In [26]:
# Negative
# Get target and context words for one positive skip-gram
target_word, context_word = positive_skip_grams[0]

# Set the number of negative samples per positive context
num_ns = 4

context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  
    num_true=1, 
    num_sampled=num_ns,  
    unique=True,  
    range_max=vocab_size,  
    seed=SEED,  
    name="negative_sampling"  
)

print(negative_sampling_candidates)
print([int2tokens[index.numpy()] for index in negative_sampling_candidates])

tf.Tensor([2 1 4 3], shape=(4,), dtype=int64)
['wide', 'the', 'shimmered', 'road']


2024-07-03 10:36:33.765235: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-03 10:36:33.766990: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [27]:
# Reduce a dimension so you can use concatenation (in the next step).
squeezed_context_class = tf.squeeze(context_class, 1)

# Concatenate a positive context word with negative sampled words.
context = tf.concat([squeezed_context_class, negative_sampling_candidates], 0)

# Label the first context word as `1` (positive) followed by `num_ns` `0`s (negative).
label = tf.constant([1] + [0]*num_ns, dtype="int64")
target = target_word

In [29]:
print(f"target_index    : {target}")
print(f"target_word     : {int2tokens[target_word]}")
print(f"context_indices : {context}")
print(f"context_words   : {[int2tokens[c.numpy()] for c in context]}")
print(f"label           : {label}")

target_index    : 6
target_word     : hot
context_indices : [7 2 1 4 3]
context_words   : ['sun', 'wide', 'the', 'shimmered', 'road']
label           : [1 0 0 0 0]


In [30]:
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=10)
print(sampling_table)

[0.00315225 0.00315225 0.00547597 0.00741556 0.00912817 0.01068435
 0.01212381 0.01347162 0.01474487 0.0159558 ]
