# Create word2vec model with negative sampling

### Install packages and adjust setting

In [None]:
!pip install hazm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hazm
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[K     |████████████████████████████████| 316 kB 26.8 MB/s 
[?25hCollecting nltk==3.3
  Downloading nltk-3.3.0.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 71.2 MB/s 
[?25hCollecting libwapiti>=0.2.1
  Downloading libwapiti-0.2.1.tar.gz (233 kB)
[K     |████████████████████████████████| 233 kB 62.0 MB/s 
Building wheels for collected packages: nltk, libwapiti
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.3-py3-none-any.whl size=1394485 sha256=74f67cd8301f81a7f7cd7cf5b5856c8112813365760119a7b216dff93cf95b25
  Stored in directory: /root/.cache/pip/wheels/9b/fd/0c/d92302c876e5de87ebd7fc0979d82edb93e2d8d768bf71fac4
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
  Created wheel for libwapiti: filename=libwapiti-0.2.1-cp37-cp37m-linux_x86

In [None]:
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from hazm import word_tokenize, Lemmatizer, Stemmer, Normalizer
from keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers
from tqdm.notebook import tqdm_notebook

AUTOTUNE = tf.data.AUTOTUNE

In [None]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
# InteractiveShell.ast_node_interactivity = "last_expr"

In [None]:
np.set_printoptions(suppress=True)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

### Import corpus and persian stop words

In [None]:
with open("data.txt", "r") as file:
    raw_text = file.read()

with open("persian_stopw.txt", "r") as file:
    raw_stop_words = file.read()

stop_words = word_tokenize(raw_stop_words)


def remove_persian_stopword(tokens):
    # return [word for word in tokens if not word in stop_words and word and word not in proned]
    return [word for word in tokens if not word in stop_words and word]

In [None]:
normalizer = Normalizer()
lemmatizer = Lemmatizer()
stemmer = Stemmer()


def normalize_text(text):
    return normalizer.normalize(text)


def lemma_tokenizer(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]
    # return [lemmatizer.lemmatize(token).split("#")[0] for token in tokens]


def stem_tokenizer(tokens):
    return [stemmer.stem(token) for token in tokens]

In [None]:
def text_normalization(text):
    raw_text = re.sub(r"-+|\d+|\s+", " ", text)
    raw_text = normalize_text(raw_text)

    return raw_text


def tokenize_text(text, type="lemma"):
    tokens = word_tokenize(text)
    tokens = remove_persian_stopword(tokens)
    if type == "lemma":
        tokens = remove_persian_stopword(lemma_tokenizer(tokens))
    elif type == "stem":
        tokens = remove_persian_stopword(stem_tokenizer(tokens))

    return tokens

### Create tokens dataframe (normalized + lemmatized + removed persian stop words)

In [None]:
data = pd.read_csv("data.txt", names=["sentence"])
data["normalized_sent"] = data["sentence"].apply(lambda x: text_normalization(x))
data["tokens"] = data["normalized_sent"].apply(lambda x: tokenize_text(x))
data.head(10)

Unnamed: 0,sentence,normalized_sent,tokens
0,1,,[]
1,ای رستخیز ناگهان، وی رحمت بی منتها\tای آتشی افروخته، در بیشه ی اندیشه ها,ای رستخیز ناگهان، وی رحمت بی منتها ای آتشی افروخته، در بیشه‌ی اندیشه‌ها,"[رستخیز, ناگهان, رحمت, منتها, آتش, افروخته, بیشه, اندیشه]"
2,امروز خندان آمدی، مفتاح زندان آمدی\tبر مستمندان آمدی، چون بخشش و فضل خدا,امروز خندان آمدی، مفتاح زندان آمدی بر مستمندان آمدی، چون بخشش و فضل خدا,"[خندان, مفتاح, زندان, مستمند, بخشش, فضل, خدا]"
3,خورشید را حاجب تویی، امید را واجب تویی\tمطلب تویی طالب تویی، هم منتها، هم مبتدا,خورشید را حاجب تویی، امید را واجب تویی مطلب تویی طالب تویی، هم منتها، هم مبتدا,"[خورشید, حاجب, امید, واجب, مطلب, طالب, منتها, مبتدا]"
4,در سینه ها برخاسته، اندیشه را آراسته\tهم خویش حاجت خواسته، هم خویشتن کرده روا,در سینه‌ها برخاسته، اندیشه را آراسته هم خویش حاجت خواسته، هم خویشتن کرده روا,"[سینه, برخاسته, اندیشه, آراسته, خویش, حاجت, خواسته, خویشتن, کرده, روا]"
5,ای روح بخش بی بَدَل، وی لذتِ علم و عمل\tباقی بهانه ست و دغل، کاین علت آمد، وآن دوا,ای روح بخش بی بدل، وی لذت علم و عمل باقی بهانه ست و دغل، کاین علت آمد، وآن دوا,"[روح, بخش, بدل, لذت, علم, عمل, باقی, بهانه, دغل, کاین, علت, وآن, دوا]"
6,ما زان دغل کژ بین شده، با بی گنه در کین شده\tگه مست حورالعین شده، گه مست نان و شوربا,ما زان دغل کژ بین شده، با بی گنه در کین شده گه مست حورالعین شده، گه مست نان و شوربا,"[دغل, کژ, بین, گنه, کین, گه, مست, حورالعین, گه, مست, نان, شوربا]"
7,این سُکر بین، هل عقل را، وین ُنقل بین، هل َنقل را\tکز بهر نان و بقل را، چندین نشاید ماجرا,این سکر بین، هل عقل را، وین نقل بین، هل نقل را کز بهر نان و بقل را، چندین نشاید ماجرا,"[سکر, بین, هل, عقل, وین, نقل, بین, هل, نقل, بهر, نان, بقل, نشاید, ماجرا]"
8,تدبیر صد رنگ افکنی، بر روم و بر زنگ افکنی\tواندر میان جنگ افکنی، فی اصطناع لا یری,تدبیر صد رنگ افکنی، بر روم و بر زنگ افکنی واندر میان جنگ افکنی، فی اصطناع لا یری,"[تدبیر, صد, رنگ, افکنی, روم, زنگ, افکنی, واندر, میان, جنگ, افکنی, اصطناع, لا, یری]"
9,میمال پنهان گوش جان، مینه بهانه بر کسان\tجان رب خلصنی زنان، والله که لاغست ای کیا,میمال پنهان گوش جان، مینه بهانه بر کسان جان رب خلصنی زنان، والله که لاغست ای کیا,"[میمال, پنهان, گوش, جان, مینه, بهانه, جان, رب, خلصنی, زن, والله, لاغست, کیا]"


In [None]:
token_df = data["tokens"]
# del data
tokens = token_df.explode().dropna().tolist()
len(tokens)
tokens[:10]


44674

['رستخیز',
 'ناگهان',
 'رحمت',
 'منتها',
 'آتش',
 'افروخته',
 'بیشه',
 'اندیشه',
 'خندان',
 'مفتاح']

### Create word to id and id to word with keras tokenizer

In [None]:
t = Tokenizer(filters="")
t.fit_on_texts(tokens)

sorted_count_list = sorted(t.word_counts.items(), key=lambda x: x[1], reverse=True)
word_to_id, id_to_word = t.word_index, t.index_word

In [None]:
len(word_to_id)

8920

### Generating training data with number of negative samples and window size

In [None]:
def generate_training_data(sentences, window_size, num_negative_s, vocab_size):
    # Elements of each training example are appended to these lists.
    centers, contexts, labels = [], [], []

    # Build the sampling table for vocab_size tokens.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    # Iterate over all sentences in corpus
    for sequence in tqdm_notebook(sentences, desc='Sentenses', colour="MAGENTA"):

        # Generate positive skip-gram pairs for a sequence (sentence).
        positive_samples, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0
        )

        # Iterate over each positive skip-gram pair to produce training examples
        # with positive context word and negative samples.
        for center_word, context_word in positive_samples:
            context_class = tf.expand_dims(tf.constant([context_word], dtype="int64"), 1)

            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_negative_s,
                unique=True,
                range_max=vocab_size,
                seed=42,
                name="negative_sampling"
            )

            # Build context and label vectors (for one center word)
            negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)

            # Concat negative samples with true context word (positive sample)
            context = tf.concat([context_class, negative_sampling_candidates], 0)

            # label 1 for positive sample and 0 for negative samples.
            label = tf.constant([1] + [0] * num_negative_s, dtype="int64")

            # Append each element from the training example to global lists.
            centers.append(center_word)
            contexts.append(context)
            labels.append(label)

    return centers, contexts, labels

### Convert word tokens to their id with word_to_id

In [None]:
tokenss = []
for row in data["tokens"].dropna().values:
    if row:
        tokenss.append([word_to_id[token] for token in row])
    # print(row)
tokenss[:10]

[[2564, 976, 386, 1424, 24, 1666, 1094, 229],
 [256, 1234, 452, 3763, 870, 410, 22],
 [90, 2565, 664, 2011, 977, 345, 1424, 3764],
 [165, 3765, 229, 1425, 53, 779, 3766, 612, 236, 871],
 [37, 499, 1667, 978, 326, 2566, 247, 527, 1668, 103, 1095, 2567, 346],
 [1668, 613, 84, 2012, 665, 50, 9, 3767, 50, 9, 257, 2568],
 [780, 84, 614, 26, 283, 615, 84, 614, 615, 91, 257, 3768, 1235, 528],
 [1096, 10, 159, 1669, 529, 723, 1669, 3769, 69, 291, 1669, 3770, 67, 872],
 [3771, 124, 75, 1, 3772, 527, 1, 292, 3773, 111, 411, 3774, 1097],
 [171, 32, 3775, 5, 116, 326, 2569, 412, 214, 1098, 64, 724, 284]]

In [None]:
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# max_length = 10
# padded_tokens = pad_sequences(tokenss, padding='post')
# padded_tokens.shape
# padded_tokens = padded_tokens.tolist()
# padded_tokens = [np.array(lst) for lst in padded_tokens]

tokenss = [np.array(lst) for lst in tokenss]

In [None]:
len(tokenss)

5316

### Create ((center words, contexts words,), labels) for feed to network

In [None]:
window_size = 5
num_negative_samples = 200
vocab_size = len(word_to_id) + 1

centers, contexts, labels = generate_training_data(
    sentences=tokenss,
    window_size=window_size,
    num_negative_s=num_negative_samples,
    vocab_size=vocab_size
)

centers = np.array(centers)
contexts = np.array(contexts)[:, :, 0]
labels = np.array(labels)

print('\n')
print(f"targets.shape: {centers.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

Sentenses:   0%|          | 0/5316 [00:00<?, ?it/s]



targets.shape: (69682,)
contexts.shape: (69682, 201)
labels.shape: (69682, 201)


In [None]:
centers[:10]
print()
contexts[:10]

array([ 976,  976, 2564,  976, 2564, 1424,  976, 1424,  976, 2564])




array([[  24,  275,   26, ...,   35, 1784,  364],
       [1666,  106,  232, ...,  278,  123, 8581],
       [  24,  184, 1844, ...,  386, 1030,   23],
       ...,
       [2564, 1930, 1398, ...,  421, 6991,  479],
       [1424, 1080,  527, ...,  571,  200,  981],
       [ 976,  342,  245, ..., 6658,   55,  671]])

### Use tensorflow caching feature and set batch size

In [None]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((centers, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
for i in dataset.take(1):
    print(i)

((<tf.Tensor: shape=(1024,), dtype=int64, numpy=array([ 286, 4185, 2674, ..., 4098, 4356, 3936])>, <tf.Tensor: shape=(1024, 201), dtype=int64, numpy=
array([[   8,   66,   26, ..., 1215,  305,  106],
       [1504, 1015,    1, ...,  300,  567,  221],
       [ 676,    1,    0, ...,  363, 1103,   13],
       ...,
       [4097,    2,    8, ..., 1067, 8002, 1121],
       [4355,  221,   22, ..., 2533, 3480,  169],
       [   2,   57, 1770, ...,  127, 1163,  274]])>), <tf.Tensor: shape=(1024, 201), dtype=int64, numpy=
array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])>)


### Build customize model with keras

In [None]:
class Word2Vec(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.target_embedding = layers.Embedding(
            vocab_size,
            embedding_dim,
            input_length=1,
            name="center_embedding"
        )
        self.context_embedding = layers.Embedding(
            vocab_size,
            embedding_dim,
            input_length=num_negative_samples + 1,
            name="context_embedding"
        )

    def call(self, pair):
        target, context = pair
        print()
        print(f"target = {target}")
        print(f"context = {context}")
        # target: (batch, dummy)
        # context: (batch, context)
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)
        # target: (batch,)
        word_emb = self.target_embedding(target)
        print(f"word_emb = {word_emb}")

        # word_emb: (batch, embed)
        context_emb = self.context_embedding(context)
        print(f"context_emb = {context_emb}")

        # context_emb: (batch, context, embed)
        # Einstein summation:
        # define element-wise computation: sum(word_emb * context_emb)
        # computes the dot product of target and context embeddings from a training pair
        dots = tf.einsum('be,bce->bc', word_emb, context_emb)
        print(f"dots = {dots}")

        # dots: (batch, context)
        return dots

In [None]:
embedding_dim = 200
epochs_ = 50
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(
    optimizer='adam',
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)
word2vec.fit(dataset, epochs=epochs_)

Epoch 1/50

target = Tensor("IteratorGetNext:0", shape=(1024,), dtype=int64)
context = Tensor("IteratorGetNext:1", shape=(1024, 201), dtype=int64)
word_emb = Tensor("word2_vec/center_embedding/embedding_lookup/Identity_1:0", shape=(1024, 200), dtype=float32)
context_emb = Tensor("word2_vec/context_embedding/embedding_lookup/Identity_1:0", shape=(1024, 201, 200), dtype=float32)
dots = Tensor("word2_vec/einsum/Einsum:0", shape=(1024, 201), dtype=float32)

target = Tensor("IteratorGetNext:0", shape=(1024,), dtype=int64)
context = Tensor("IteratorGetNext:1", shape=(1024, 201), dtype=int64)
word_emb = Tensor("word2_vec/center_embedding/embedding_lookup/Identity_1:0", shape=(1024, 200), dtype=float32)
context_emb = Tensor("word2_vec/context_embedding/embedding_lookup/Identity_1:0", shape=(1024, 201, 200), dtype=float32)
dots = Tensor("word2_vec/einsum/Einsum:0", shape=(1024, 201), dtype=float32)
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/

<keras.callbacks.History at 0x7f39d5c5bfd0>

In [None]:
word2vec.summary()

Model: "word2_vec"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 center_embedding (Embedding  multiple                 1784200   
 )                                                               
                                                                 
 context_embedding (Embeddin  multiple                 1784200   
 g)                                                              
                                                                 
Total params: 3,568,400
Trainable params: 3,568,400
Non-trainable params: 0
_________________________________________________________________


In [None]:
weights = word2vec.get_layer('center_embedding').get_weights()[0]

In [None]:
weights.shape

(8921, 200)

## Create 3d/2d dimentions dataframe for plot interactive scaterplot by plotly library 

In [71]:
from sklearn.decomposition import PCA

three_dim = PCA(random_state=0).fit_transform(weights)[:,:3]
dims3d = pd.DataFrame(three_dim, columns=['x', 'y', 'z'])

two_dim = PCA(random_state=0).fit_transform(weights)[:,:2]
dims2d = pd.DataFrame(two_dim, columns=['x', 'y'])

dims3d

Unnamed: 0,x,y,z
0,-0.038840,-0.140010,-0.062936
1,-0.072194,-0.142408,0.023426
2,-1.116818,-0.100878,0.054325
3,-0.059786,0.143165,-0.471768
4,-0.794100,0.223348,1.273033
...,...,...,...
8916,0.188877,-0.066858,-0.807359
8917,-0.023224,-0.130359,-0.069666
8918,-0.050306,-0.014373,0.337730
8919,-0.059316,-0.347420,0.103774


In [72]:
words = list(word_to_id.keys())
words.append('end')
dims2d['token'] = words
dims3d['token'] = words
dims2d

Unnamed: 0,x,y,token
0,-0.038840,-0.140010,جان
1,-0.072194,-0.142408,دل
2,-1.116818,-0.100878,عشق
3,-0.059786,0.143165,سر
4,-0.794100,0.223348,سو
...,...,...,...
8916,0.188877,-0.066858,خیالاتست
8917,-0.023224,-0.130359,آکند
8918,-0.050306,-0.014373,یکایک
8919,-0.059316,-0.347420,ترکانست


In [73]:
dims2d.head(4000).to_pickle("dim2d.pkl")
dims3d.head(4000).to_pickle("dim3d.pkl")

### Save weights of center_embedding layer (word embeddings) to numpy array

In [None]:
file_name = f'weights_nneg{num_negative_samples}_em{embedding_dim}_ep{epochs_}_vocs{vocab_size}_ws{window_size}'
np.save(file_name, weights)

In [65]:
# load_file_name = f'weights_nneg{num_negative_samples}_em{embedding_dim}_ep{epochs_}_vocs{vocab_size}_ws{window_size}'
# loaded_weights = np.load(load_file_name)

### Find nearest neighbor word with cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_matrix = cosine_similarity(weights, weights)
print(cosine_matrix)


[[ 1.0000002   0.05818215  0.01750027 ... -0.00045787  0.07242609
  -0.03293831]
 [ 0.05818215  1.          0.05692578 ...  0.04686942 -0.10650274
   0.01246404]
 [ 0.01750027  0.05692578  0.9999997  ...  0.07201505 -0.03514342
   0.07119064]
 ...
 [-0.00045787  0.04686942  0.07201505 ...  0.9999994   0.01036097
   0.08479095]
 [ 0.07242609 -0.10650274 -0.03514342 ...  0.01036097  0.9999998
   0.02170496]
 [-0.03293831  0.01246404  0.07119064 ...  0.08479095  0.02170496
   0.99999976]]


In [None]:
cosine_matrix.shape


def cosine_similarity_word(words, cosine_matrix, n=10):
    for word in words:
        similars = []
        for id in cosine_matrix[word_to_id[word]].argsort()[::-1][0:n]:
            similars.append(id_to_word[id])
        print(word, '=', similars, '\n')


(8921, 8921)

In [None]:
sorted_count_list[30:50]

[('مه', 114),
 ('بس', 111),
 ('دانست#دان', 111),
 ('ماه', 110),
 ('کان', 109),
 ('پر', 109),
 ('روح', 107),
 ('عالم', 106),
 ('نی', 103),
 ('ره', 102),
 ('باد', 100),
 ('تن', 100),
 ('تبریز', 100),
 ('کار', 98),
 ('آنک', 98),
 ('گرفت#گیر', 95),
 ('خون', 93),
 ('پا', 92),
 ('رخ', 92),
 ('گه', 91)]

### Test Model

In [61]:
cosine_similarity_word(['حسین', 'یوسف', 'خسرو', 'فروغ' ,'دجله'], cosine_matrix, 10)

حسین = ['حسین', 'آبدارست', 'حشرگاه', 'کربلا', 'کربلایی', 'ریاضت', 'مری', 'می\u200cنهان', 'مجنبان', 'میهمان'] 

یوسف = ['یوسف', 'فریب', 'مقلتی', 'تجری', 'نتیجه', 'بالولا', 'بین', 'الولا', 'غرق', 'زلیخا'] 

خسرو = ['خسرو', 'درمکش', 'خسروان', 'دردمید', 'شه', 'ماتست', 'برآور', 'بگفتی', 'الکبرست', 'ساقیست'] 

فروغ = ['فروغ', 'خجلت', 'صواب', 'فکنده', 'خطا', 'تصرف', 'روا', 'برفروخت', 'دفتر', 'پریدستی'] 

دجله = ['دجله', 'جیحون', 'صما', 'مقیم', 'پرنم', 'جای', 'کوثر', 'هیبت', 'موش', 'ندامت'] 



In [None]:
word_to_id

In [None]:
for i, j in word_to_id.items():
    if 'کم' in i[-2:]:
        print(i)

In [None]:
for i, j in word_to_id.items():
    if '_' in i:
        print(i)

In [None]:
def generate_training_data(sentences, window_size, num_negative_s, vocab_size):
    # Elements of each training example are appended to these lists.
    centers, contexts, labels = [], [], []

    # Build the sampling table for vocab_size tokens.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    # Iterate over all sentences in corpus
    for sequence in tqdm_notebook(sentences, desc='Sentenses', colour="MAGENTA"):

        # Generate positive skip-gram pairs for a sequence (sentence).
        positive_samples, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0
        )

        # Iterate over each positive skip-gram pair to produce training examples
        # with positive context word and negative samples.
        for center_word, context_word in positive_samples:
            context_class = tf.expand_dims(tf.constant([context_word], dtype="int64"), 1)

            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_negative_s,
                unique=True,
                range_max=vocab_size,
                seed=42,
                name="negative_sampling"
            )

            # Build context and label vectors (for one center word)
            negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)

            # Concat negative samples with true context word (positive sample)
            context = tf.concat([context_class, negative_sampling_candidates], 0)

            # label 1 for positive sample and 0 for negative samples.
            label = tf.constant([1] + [0] * num_negative_s, dtype="int64")

            # Append each element from the training example to global lists.
            centers.append(center_word)
            contexts.append(context)
            labels.append(label)

    return centers, contexts, labels

### Convert word tokens to their id with word_to_id

In [None]:
tokenss = []
for row in data["tokens"].dropna().values:
    if row:
        tokenss.append([word_to_id[token] for token in row])
    # print(row)
tokenss[:10]

[[2565, 976, 386, 1424, 24, 1666, 1094, 229],
 [256, 1234, 452, 3764, 870, 410, 22],
 [90, 2566, 664, 2012, 977, 345, 1424, 3765],
 [165, 3766, 229, 1425, 53, 779, 3767, 612, 236, 871],
 [37, 499, 1667, 978, 326, 2567, 247, 527, 1668, 103, 1095, 2568, 346],
 [1668, 613, 84, 2013, 665, 50, 9, 3768, 50, 9, 257, 2569],
 [780, 84, 614, 26, 283, 615, 84, 614, 615, 91, 257, 3769, 1235, 528],
 [1096, 10, 159, 1669, 529, 723, 1669, 3770, 69, 291, 1669, 3771, 67, 872],
 [3772, 124, 75, 1, 3773, 527, 1, 292, 3774, 111, 411, 3775, 1097],
 [171, 32, 3776, 5, 116, 326, 2570, 412, 214, 1098, 64, 724, 284]]

In [None]:
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# max_length = 10
# padded_tokens = pad_sequences(tokenss, padding='post')
# padded_tokens.shape
# padded_tokens = padded_tokens.tolist()
# padded_tokens = [np.array(lst) for lst in padded_tokens]

tokenss = [np.array(lst) for lst in tokenss]

In [None]:
len(tokenss)

5316

### Create ((center words, contexts words,), labels) for feed to network

In [None]:
window_size = 5
num_negative_samples = 200
vocab_size = len(word_to_id) + 1

centers, contexts, labels = generate_training_data(
    sentences=tokenss,
    window_size=window_size,
    num_negative_s=num_negative_samples,
    vocab_size=vocab_size
)

centers = np.array(centers)
contexts = np.array(contexts)[:, :, 0]
labels = np.array(labels)

print('\n')
print(f"targets.shape: {centers.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

Sentenses:   0%|          | 0/5316 [00:00<?, ?it/s]



targets.shape: (69457,)
contexts.shape: (69457, 201)
labels.shape: (69457, 201)


In [None]:
centers[:10]
print()
contexts[:10]

array([1424, 1424, 1424, 1424, 1424, 1424, 1424,  452,  452,  452])




array([[1666,   14, 1273, ..., 2898, 4240,   71],
       [2565,  613,   13, ...,  875, 5675, 2930],
       [ 976, 2536, 1029, ...,  151, 1438,  563],
       ...,
       [ 870,    1,  531, ..., 2068,  608, 2819],
       [3764, 5778,  531, ...,   29,  332,  120],
       [ 256,    7,  626, ...,   15,  237, 1664]])

### Use tensorflow caching feature and set batch size

In [None]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((centers, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
for i in dataset.take(1):
    print(i)

((<tf.Tensor: shape=(1024,), dtype=int64, numpy=array([  22, 3917, 4230, ..., 4160, 1442, 2860])>, <tf.Tensor: shape=(1024, 201), dtype=int64, numpy=
array([[ 131,  271,  406, ..., 1324,   48,   70],
       [ 185,  762, 2120, ..., 4952,  175,  228],
       [4232,  987,    3, ...,   33,  165, 6089],
       ...,
       [1758,   43,   20, ...,  266, 3292,  458],
       [1542,    1,   12, ..., 2382,  152,  536],
       [ 295,    6, 3240, ...,  977,  978,  168]])>), <tf.Tensor: shape=(1024, 201), dtype=int64, numpy=
array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])>)


### Build customize model with keras

In [None]:
class Word2Vec(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.target_embedding = layers.Embedding(
            vocab_size,
            embedding_dim,
            input_length=1,
            name="center_embedding"
        )
        self.context_embedding = layers.Embedding(
            vocab_size,
            embedding_dim,
            input_length=num_negative_samples + 1,
            name="context_embedding"
        )

    def call(self, pair):
        target, context = pair
        print()
        print(f"target = {target}")
        print(f"context = {context}")
        # target: (batch, dummy)
        # context: (batch, context)
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)
        # target: (batch,)
        word_emb = self.target_embedding(target)
        print(f"word_emb = {word_emb}")

        # word_emb: (batch, embed)
        context_emb = self.context_embedding(context)
        print(f"context_emb = {context_emb}")

        # context_emb: (batch, context, embed)
        # Einstein summation:
        # define element-wise computation: sum(word_emb * context_emb)
        # computes the dot product of target and context embeddings from a training pair
        dots = tf.einsum('be,bce->bc', word_emb, context_emb)
        print(f"dots = {dots}")

        # dots: (batch, context)
        return dots

In [None]:
embedding_dim = 200
epochs_ = 50
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(
    optimizer='adam',
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)
word2vec.fit(dataset, epochs=epochs_)

Epoch 1/50

target = Tensor("IteratorGetNext:0", shape=(1024,), dtype=int64)
context = Tensor("IteratorGetNext:1", shape=(1024, 201), dtype=int64)
word_emb = Tensor("word2_vec_7/center_embedding/embedding_lookup/Identity_1:0", shape=(1024, 200), dtype=float32)
context_emb = Tensor("word2_vec_7/context_embedding/embedding_lookup/Identity_1:0", shape=(1024, 201, 200), dtype=float32)
dots = Tensor("word2_vec_7/einsum/Einsum:0", shape=(1024, 201), dtype=float32)

target = Tensor("IteratorGetNext:0", shape=(1024,), dtype=int64)
context = Tensor("IteratorGetNext:1", shape=(1024, 201), dtype=int64)
word_emb = Tensor("word2_vec_7/center_embedding/embedding_lookup/Identity_1:0", shape=(1024, 200), dtype=float32)
context_emb = Tensor("word2_vec_7/context_embedding/embedding_lookup/Identity_1:0", shape=(1024, 201, 200), dtype=float32)
dots = Tensor("word2_vec_7/einsum/Einsum:0", shape=(1024, 201), dtype=float32)
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/

<keras.callbacks.History at 0x7f59a3e4a690>

In [None]:
word2vec.summary()

Model: "word2_vec_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 center_embedding (Embedding  multiple                 1784400   
 )                                                               
                                                                 
 context_embedding (Embeddin  multiple                 1784400   
 g)                                                              
                                                                 
Total params: 3,568,800
Trainable params: 3,568,800
Non-trainable params: 0
_________________________________________________________________


In [None]:
weights = word2vec.get_layer('center_embedding').get_weights()[0]

In [None]:
weights.shape

(8922, 200)

### Save weights of center_embedding layer (word embeddings) to numpy array

In [None]:
file_name = f'weights_nneg{num_negative_samples}_em{embedding_dim}_ep{epochs_}_vocs{vocab_size}_ws{window_size}'
np.save(file_name, weights)

In [None]:
load_file_name = f'weights_nneg{num_negative_samples}_em{embedding_dim}_ep{epochs_}_vocs{vocab_size}_ws{window_size}'
loaded_weights = np.load(load_file_name)

### Find nearest neighbor word with cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_matrix = cosine_similarity(weights, weights)
print(cosine_matrix)


[[ 1.0000001  -0.0833027  -0.02161321 ...  0.03165701  0.00011451
  -0.06165543]
 [-0.0833027   1.          0.08032579 ...  0.01039312 -0.05719611
  -0.01495304]
 [-0.02161321  0.08032579  1.         ... -0.0136863   0.07858521
   0.01053756]
 ...
 [ 0.03165701  0.01039312 -0.0136863  ...  0.9999998  -0.19599128
   0.08485357]
 [ 0.00011451 -0.05719611  0.07858521 ... -0.19599128  0.99999994
  -0.08715412]
 [-0.06165543 -0.01495304  0.01053756 ...  0.08485357 -0.08715412
   0.9999998 ]]


In [None]:
cosine_matrix.shape


def cosine_similarity_word(words, cosine_matrix, n=10):
    for word in words:
        similars = []
        for id in cosine_matrix[word_to_id[word]].argsort()[::-1][0:n]:
            similars.append(id_to_word[id])
        print(word, '=', similars, '\n')


(8922, 8922)

In [None]:
sorted_count_list[30:50]

[('مه', 114),
 ('بس', 111),
 ('دانست#دان', 111),
 ('ماه', 110),
 ('کان', 109),
 ('پر', 109),
 ('روح', 107),
 ('عالم', 106),
 ('نی', 103),
 ('ره', 102),
 ('باد', 100),
 ('تن', 100),
 ('تبریز', 100),
 ('کار', 98),
 ('آنک', 98),
 ('گرفت#گیر', 95),
 ('خون', 93),
 ('پا', 92),
 ('رخ', 92),
 ('گه', 91)]

### Test Model

In [None]:
cosine_similarity_word(['شاه', 'یوسف', 'خسرو', 'گل','دجله'], cosine_matrix, 10)

شاه = ['شاه', 'فتادست', 'فساق', 'همامست', 'مباش', 'خسروان', 'شهرست', 'بندگان', 'مکافات', 'گردد\u200cگر'] 

یوسف = ['یوسف', 'سیماست', 'زلیخا', 'بریدند', 'نتیجه', 'اندرنگر', 'تمامست', 'دهل', 'اعلا', 'بیچاره'] 

خسرو = ['خسرو', 'ما\u200cگر', 'خورشیدروی', 'قباد', 'بشاید', 'بازبیاریم', 'مخدوم', 'انس', 'سرور', 'حسرت'] 

گل = ['گل', 'هامونست', 'بدریده\u200cای', 'میوه', 'گرینده', 'سامریست', 'اندرفتد', 'بخندان', 'مپیچان', 'گوارد'] 

دجله = ['دجله', 'فرات', 'جیحون', 'مقیم', 'صخره', 'صما', 'پرنم', 'بدندی', 'زهر', 'ویس'] 



In [None]:
word_to_id

In [None]:
for i, j in word_to_id.items():
    if 'کم' in i[-2:]:
        print(i)

کم
حکم
شکم
حاکم
هواکم
فقدکم
اصحابکم
اعقابکم
جنبکم
ذنبکم
ربکم
تفاحکم
اصباحکم
ارواحکم
اریاحکم
یعقوبکم
قدامکم
دونکم
لحظکم
لقیاکم
لقائکم
شدکم
بهواکم
عنکم
فناکم
رایناکم
بضیاکم
بلاکم
غیرکم
سواکم
حورکم
احیاکم
حیاتکم
یترککم
ودکم
خلاکم
فدیتکم
قتیلکم
فاتکم
بدتکم
ایبکم
می‌کم


In [None]:
for i, j in word_to_id.items():
    if '_' in i:
        print(i)