# Word2Vec implementation skip-gram with keras

In [1]:
!pip install hazm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import re

import numpy as np
import pandas as pd
from hazm import word_tokenize, Lemmatizer, Stemmer, Normalizer
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [3]:
np.set_printoptions(suppress=True)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

In [4]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
# InteractiveShell.ast_node_interactivity = "last_expr"

## Import dataset and persian stop words

In [5]:
with open("Shams_Corpus_Paper3.txt", "r") as file:
    raw_text = file.read()

with open("persian_stopw.txt", "r") as file:
    raw_stop_words = file.read()

stop_words = word_tokenize(raw_stop_words)


def remove_persian_stopword(tokens):
    # return [word for word in tokens if not word in stop_words and word and word not in proned]
    return [word for word in tokens if not word in stop_words and word]

## Preprocessing Part
### Create Lemmatizer and Stemmer functions

In [6]:
normalizer = Normalizer()


def normalize_text(text):
    return normalizer.normalize(text)


lemmatizer = Lemmatizer()


def lemma_tokenizer(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]
    # return [lemmatizer.lemmatize(token).split("#")[0] for token in tokens]


stemmer = Stemmer()


def stem_tokenizer(tokens):
    return [stemmer.stem(token) for token in tokens]

In [7]:
def text_normalization(text):
    raw_text = re.sub(r"-+|\d+|\s+", " ", text)
    raw_text = normalize_text(raw_text)

    return raw_text


def tokenize_text(text, type="lemma"):
    tokens = word_tokenize(text)
    tokens = remove_persian_stopword(tokens)
    if type == "lemma":
        tokens = remove_persian_stopword(lemma_tokenizer(tokens))
    elif type == "stem":
        tokens = remove_persian_stopword(stem_tokenizer(tokens))

    return tokens

### Word tokenize:

In [17]:
data = pd.read_csv("Shams_Corpus_Paper3.txt", names=["sentence"])
data["normalized_sent"] = data["sentence"].apply(lambda x: text_normalization(x))
data["tokens"] = data["normalized_sent"].apply(lambda x: tokenize_text(x))
data.head(10)

Unnamed: 0,sentence,normalized_sent,tokens
0,دیوان شمس تبریزی (غزلیات),دیوان شمس تبریزی (غزلیات),"[دیوان, شمس, تبریزی]"
1,1001 - 1500,,[]
2,--------------------------------------------------------,,[]
3,1001,,[]
4,آه در آن شمع منور چه بود\tکآتش زد در دل و دل را ربود,آه در آن شمع منور چه بود کآتش زد در دل و دل را ربود,"[آه, شمع, منور, کآتش, زد#زن, دل, دل, ربود]"
5,ای زده اندر دل من آتشی\tسوختم ای دوست بیا زود زود,ای زده اندر دل من آتشی سوختم ای دوست بیا زود زود,"[زده, دل, آتش, سوخت#سوز, دوست, زود, زود]"
6,صورت دل صورت مخلوق نیست\tکز رخ دل حسن خدا رو نمود,صورت دل صورت مخلوق نیست کز رخ دل حسن خدا رو نمود,"[دل, رخ, دل, حسن, خدا, رو, نمود]"
7,جز شکرش نیست مرا چاره ای\tجز لب او نیست مرا هیچ سود,جز شکرش نیست مرا چاره‌ای جز لب او نیست مرا هیچ سود,"[شکر, چاره, لب, سود]"
8,یاد کن آن را که یکی صبحدم\tاین دلم از زلف تو بندی گشود,یاد کن آن را که یکی صبحدم این دلم از زلف تو بندی گشود,"[یاد, صبحدم, دل, زلف, بست#بند, گشود#گشا]"
9,جان من اول که بدیدم تو را\tجان من از جان تو چیزی شنود,جان من اول که بدیدم تو را جان من از جان تو چیزی شنود,"[جان, بدیدم, جان, جان, شنود]"


In [18]:
token_df = data["tokens"]
del data
tokens = token_df.explode().dropna().tolist()
len(tokens)
tokens[:10]


37914

['دیوان', 'شمس', 'تبریزی', 'آه', 'شمع', 'منور', 'کآتش', 'زد#زن', 'دل', 'دل']

### Save tokens with pickle serializer

In [19]:
import pickle

with open("tokens_df_moreth2.pkl", "wb") as f:
    pickle.dump(token_df, f)

In [20]:
def concat(*iterables):
    for iterable in iterables:
        yield from iterable

In [21]:
# def generate_training_data(tokens, word_to_id, window):
#     X = []
#     y = []
#     n_tokens = len(tokens)
#     unique_tokens = len(word_to_id)
#     for i in range(n_tokens):
#         idx = concat(
#             range(max(0, i - window), i), range(i, min(n_tokens, i + window + 1))
#         )
#         for j in idx:
#             if i == j:
#                 continue
#             X.append(word_to_id[tokens[i]] - 1)
#             y.append(word_to_id[tokens[j]] - 1)

#     return np.asarray(X), np.asarray(y)

In [22]:
def generate_data(series, word_to_id, window):
    X = []
    y = []
    for index, tokens in series.items():
        n_tokens = len(tokens)
        for i in range(n_tokens):
            idx = concat(
                range(max(0, i - window), i),
                range(i, min(n_tokens, i + window + 1))
            )
            for j in idx:
                if i == j:
                    continue

                X.append(word_to_id[tokens[i]] - 1)
                y.append(word_to_id[tokens[j]] - 1)

    return np.asarray(X), np.asarray(y)

### generate training data with specified window size
#### create word to id and id to word list

In [23]:
window_size = 2

t = Tokenizer(filters="")
t.fit_on_texts(tokens)

sorted_count_list = sorted(t.word_counts.items(), key=lambda x: x[1], reverse=True)
word_to_id, id_to_word = t.word_index, t.index_word

# X, y = generate_training_data(tokens, word_to_id, window_size)
X_sen, y_sen = generate_data(token_df, word_to_id, window_size)

### delete useless variables cause of lack memory :((

In [24]:
# del X_onehot_encoded
# del y_onehot_encoded
X_sen.shape
y_sen.shape
del t
del X_onehot_encoded
del y_onehot_encoded

(120272,)

(120272,)

NameError: ignored

In [25]:
# for i in sorted_count_list:
#     if '#' in i[0]:
#         print(i)
# sorted_count_list[:40]

### Find less frequent words in corpus

In [26]:
proned = []
for i in sorted_count_list:
    if i[1] < 3:
        proned.append(i[0])
stop_words.extend(proned)

In [27]:
# X.shape
X_sen.shape

(120272,)

### Onehot train and test tokens

In [28]:
# echo 1 > /proc/sys/vm/overcommit_memory

label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(X_sen)
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
one_hotter = onehot_encoder.fit(integer_encoded)
X_onehot_encoded = one_hotter.transform(integer_encoded)
del X_sen

integer_encoded = label_encoder.fit_transform(y_sen)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
y_onehot_encoded = one_hotter.transform(integer_encoded)
del y_sen

In [29]:
# import pickle
# with open("one_hot_encoder.pkl", "wb") as f: 
#     pickle.dump(one_hotter, f)

In [30]:
y_onehot_encoded.shape

(120272, 2575)

In [31]:
X_onehot_encoded.shape

(120272, 2575)

In [32]:
del integer_encoded

### Create network model

In [33]:

from keras.models import Input, Model
from keras.layers import Dense

opt = 'adam'
embed_size = 100
vocab_size = X_onehot_encoded.shape[1]

input_layer = Input(shape=(vocab_size,))
embed_layer = Dense(units=embed_size, activation="linear")(input_layer)
output_layer = Dense(units=vocab_size, activation="softmax")(embed_layer)
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=['accuracy', 'mse'])

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 2575)]            0         
                                                                 
 dense (Dense)               (None, 100)               257600    
                                                                 
 dense_1 (Dense)             (None, 2575)              260075    
                                                                 
Total params: 517,675
Trainable params: 517,675
Non-trainable params: 0
_________________________________________________________________


### Learn network :

In [34]:
epochs_ = 100
model.fit(x=X_onehot_encoded, y=y_onehot_encoded, batch_size=128, epochs=epochs_, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fc77785f910>

### Test model:

In [41]:
def n_similar(words, model_, n=10):
    for word in words:
      word_id = word_to_id[word]
      one_hot = one_hotter.transform(np.array([[word_id]]))
      result = model_.predict([one_hot]).squeeze()
      similars = []
      for similar in (id_to_word[id + 1] for id in np.argsort(result)[::-1][:n]):
          similars.append(similar)
      print(word, '=', similars)

In [53]:
n_similar(['خلق', 'آتش', 'شادی'], model, n=10)

خلق = ['رنگ', 'گل', 'دم', 'دانست#دان', 'جان', 'جگر', 'بو', 'یار', 'سفر', 'یافت#یاب']
آتش = ['شکر', 'دل', 'لب', 'تنگ', 'عشق', 'جان', 'شهد', 'کان', 'پر', 'خوش']
شادی = ['مست', 'دل', 'میان', 'خویش', 'حقست', 'زیر', 'موسی', 'آفتاب', 'قسمت', 'التیه']


### Save model:

In [None]:
model.save(f'mdl_em{embed_size}_ep{epochs_}_vocs{vocab_size}_ws{window_size}_opt{opt}.h5')

## Create word2vec model with pure python:

In [None]:
def init_network(vocab_size, n_embedding):
    model = {
        "w1": np.random.uniform(-1, 1, (vocab_size, n_embedding)),
        "w2": np.random.uniform(-1, 1, (n_embedding, vocab_size))
    }
    return model

In [None]:
model = init_network(len(word_to_id), 10)
model["w1"].shape

In [None]:
model["w2"].shape

In [None]:
def softmax(X):
    #     e_x = np.exp(x - np.max(x))
    #     return e_x / e_x.sum(axis=0)

    res = []
    for x in X:
        exp = np.exp(x)
        res.append(exp / exp.sum())
    return res

In [None]:
def stable_sigmoid(x):
    sig = np.where(x < 0, np.exp(x) / (1 + np.exp(x)), 1 / (1 + np.exp(-x)))
    return sig

In [None]:
X.shape

In [None]:
(X @ model["w1"]).shape

In [None]:
(X @ model["w1"] @ model["w2"]).shape


In [None]:
def forward(model, X, return_cache=True):
    cache = {}

    cache["a1"] = X @ model["w1"]
    cache["a2"] = cache["a1"] @ model["w2"]
    print(f"a2 = {cache['a2']}")
    cache["z"] = softmax(cache["a2"])
    #     cache["z"] = stable_sigmoid(cache["a2"])

    if not return_cache:
        return cache["z"]
    return cache

In [None]:
def cross_entropy(z, y):
    return - np.sum(np.log(z) * y)

In [None]:
def backward(model, X, y, alpha):
    cache = forward(model, X)
    #     dl_weight_inp_hidden = np.outer(target_word_vector, np.dot(weight_hidden_output, total_error.T))
    #     dl_weight_hidden_output = np.outer(hidden_layer, total_error)
    da2 = cache["z"] - y
    dw2 = cache["a1"].T @ da2
    da1 = da2 @ model["w2"].T
    dw1 = X.T @ da1
    assert (dw2.shape == model["w2"].shape)
    assert (dw1.shape == model["w1"].shape)
    model["w1"] -= alpha * dw1
    model["w2"] -= alpha * dw2

    return cross_entropy(cache["z"], y)

In [None]:
import matplotlib.pyplot as plt
% matplotlib inline
% config InlineBackend.figure_format = 'svg'
plt.style.use("seaborn")

model = init_network(len(word_to_id), 10)

n_iter = 100

learning_rate = 0.01

history = [backward(model, X, y, learning_rate) for _ in range(n_iter)]

plt.clf()
plt.plot(range(len(history)), history, color="skyblue")
plt.show()

In [None]:
history

In [None]:
model

In [None]:
learning = one_hot_encode(word_to_id["گرم"], len(word_to_id))
result = forward(model, [learning], return_cache=False)[0]
result

In [None]:
np.argsort(result)[::-1][0:5]

In [None]:
for word in (id_to_word[id] for id in np.argsort(result)[::-1][0:10]):
    print(word)


In [None]:
def get_word_similarities(word, model, n_similars=10):
    try:
        learning = one_hot_encode(word_to_id[word] - 1, len(word_to_id))
    except KeyError:
        print(f"Word = {word} is not in corpus")
        exit()
    result = forward(model, [learning], return_cache=False)[0]
    for word in (id_to_word[id + 1] for id in np.argsort(result)[::-1][0:n_similars]):
        print(word)



In [None]:
def get_embedding(model, word):
    try:
        idx = word_to_id[word] - 1
    except KeyError:
        print("`word` not in corpus")
    one_hot = one_hot_encode(idx, len(word_to_id))
    return forward(model, one_hot)["a1"]

In [None]:
get_embedding(model, "دیو")


In [None]:
get_word_similarities('عیش', model, 10)

In [None]:
get_word_similarities('میخانه', model, 10)

In [None]:
get_word_similarities('بشر', model, 10)

In [None]:
get_word_similarities('ویرانه', model, 10)

In [None]:
get_word_similarities('حلال', model, 10)