# Preprocessing

In [20]:
# Read the dataset and the stop-word list, one entry per line.
# The encoding is given explicitly so the result does not depend on the
# platform default; "utf-8-sig" also drops a leading byte-order mark, which
# otherwise ends up inside the first line that is read.

with open("hafez.txt", "r", encoding="utf-8-sig") as dataset:
    data = dataset.read().splitlines()

with open("fa_stop_words.txt", "r", encoding="utf-8-sig") as fa_stop_words:
    stop_words = fa_stop_words.read().splitlines()

In [21]:
# Show the first three dataset lines and three entries from the stop-word list.
data_preview = ("data:", "\n1: ", data[0], "\n2: ", data[1], "\n3: ", data[2])
stop_words_preview = (
    "\nstop_words:",
    "\n1: ", stop_words[18],
    "\n2: ", stop_words[19],
    "\n3: ", stop_words[20],
)
print(*data_preview)
print(*stop_words_preview)

data: 
1:  ﻿ 
2:  الا يا ايها الساقي ادر كاسا و ناولها 
3:  كه عشق آسان نمود اول ولي افتاد مشكل‌ها

stop_words: 
1:  ؟ 
2:  آباد 
3:  آخ




---



In [22]:
!pip install hazm

from hazm import *
import numpy as np
import random
import pandas as pd
import tensorflow as tf

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [23]:
normalizer = Normalizer()
stemmer = Stemmer()

# A set gives constant-time membership checks for the per-token stop-word test.
# `stop_words` itself stays a list because other cells index into it.
stop_word_set = set(stop_words)

#### Normalize (skip empty lines and a line that holds only the byte-order mark)
normalized_data = [
    normalizer.normalize(line)
    for line in data
    if line and line != '\ufeff'
]

#### Tokenize
tokenized_data = [word_tokenize(line) for line in normalized_data]

#### Remove stop words
clear_data = [
    [word for word in line if word not in stop_word_set]
    for line in tokenized_data
]

#### Stem
stemmed_data = [
    [stemmer.stem(word) for word in line]
    for line in clear_data
]

#### Create bag of words (all stemmed tokens, flattened, duplicates kept)
words = [word for line in stemmed_data for word in line]

#### Convert words to sentences, dropping sentences that ended up empty
clear_sentences = [
    sentence
    for sentence in (" ".join(line) for line in stemmed_data)
    if sentence
]

# Remove duplicate words; dict.fromkeys keeps first-occurrence order, so the
# result matches a manual "append if not seen" loop without the quadratic scan.
clear_words = list(dict.fromkeys(words))


print("Normalized: ", normalized_data[0])
print("Tokenized: ", tokenized_data[0])
print("Cleared: ", clear_data[0])
print("Words: ", words[0], "size: ", len(words))
print("Non Duplicate Words: ", clear_words[0], "size: ", len(clear_words))
print("Sentence: ", clear_sentences[0])

Normalized:  الا یا ایها الساقی ادر کاسا و ناولها
Tokenized:  ['الا', 'یا', 'ایها', 'الساقی', 'ادر', 'کاسا', 'و', 'ناولها']
Cleared:  ['ایها', 'الساقی', 'ادر', 'کاسا', 'ناولها']
Words:  ای size:  34482
Non Duplicate Words:  ای size:  5773
Sentence:  ای الساق ادر کاسا ناول


# Process: Word2Vec training

In [24]:
import io
import re
import string
import tqdm
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import save_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Seed handed to the negative-sampling op in generate_training_data.
SEED = 42
# Number of negative context words sampled for each positive (target, context) pair.
NUM_NS = 4
# Passed as the prefetch buffer size when the tf.data pipeline is built.
AUTOTUNE = tf.data.AUTOTUNE

In [25]:
# Upper bound given to the Keras tokenizer as `num_words`.
max_words = len(clear_words)

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(clear_sentences)
sequences = tokenizer.texts_to_sequences(clear_sentences)

# Lookup tables in both directions: word -> id and id -> word.
word2id = tokenizer.word_index
id2word = dict(zip(word2id.values(), word2id.keys()))

# Despite its name, `inverse_vocab` holds the id sequences left-padded to
# `max_len` entries.
max_len = 20
inverse_vocab = pad_sequences(sequences, maxlen=max_len, padding='pre')
print(inverse_vocab)

[[   0    0    0 ... 2998 2143 2999]
 [   0    0    0 ...  591  152  389]
 [   0    0    0 ...   41  217  952]
 ...
 [   0    0    0 ...    0   29  103]
 [   0    0    0 ... 2791  451   45]
 [   0    0    0 ...   96 5713    2]]


In [26]:
def generate_training_data(sequences, window_size, vocab_size, num_ns=None, seed=None):
  """Build skip-gram training examples with negative sampling.

  For every positive (target, context) skip-gram pair found in `sequences`,
  `num_ns` negative context ids are drawn and one training example is emitted.

  Args:
    sequences: iterable of integer-encoded sentences (sequences of word ids).
    window_size: skip-gram window size forwarded to `skipgrams`.
    vocab_size: vocabulary size; used to build the sampling table, forwarded
      to `skipgrams`, and used as `range_max` of the negative sampler.
    num_ns: number of negative samples per positive pair. Defaults to the
      module-level `NUM_NS` when omitted.
    seed: seed for the negative sampler. Defaults to the module-level `SEED`
      when omitted.

  Returns:
    A `(targets, contexts, labels)` tuple of equally long lists:
      targets: the target word id of each positive pair.
      contexts: int64 tensors of shape (num_ns + 1, 1); the true context id
        comes first, followed by the sampled negatives.
      labels: int64 tensors of shape (num_ns + 1,) holding 1 for the true
        context followed by zeros for the negatives.
  """
  # Fall back to the notebook-level configuration so existing calls keep working.
  if num_ns is None:
    num_ns = NUM_NS
  if seed is None:
    seed = SEED

  targets, contexts, labels = [], [], []
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # The label vector is the same for every example, so build it once.
  label = tf.constant([1] + [0]*num_ns, dtype="int64")

  for sequence in tqdm.tqdm(sequences):
    # negative_samples=0: only positive pairs here; negatives are drawn below.
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    for target_word, context_word in positive_skip_grams:
      # Shape (1, 1): one true class for a single example.
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Give the negatives a trailing axis so they stack under the true context.
      negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)

      context = tf.concat([context_class, negative_sampling_candidates], 0)

      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [27]:
# Vocabulary size used by the sampler and later by the embedding layers.
vocab_size = len(clear_words)

targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    vocab_size=vocab_size,
)

# Convert the collected examples to arrays; the trailing singleton axis of each
# context tensor is dropped by taking index 0 along the last dimension.
targets = np.array(targets)
contexts = np.array(contexts)[:, :, 0]
labels = np.array(labels)

print('\n')
for shape_report in (
    f"targets.shape: {targets.shape}",
    f"contexts.shape: {contexts.shape}",
    f"labels.shape: {labels.shape}",
):
    print(shape_report)


100%|██████████| 8384/8384 [00:04<00:00, 1819.94it/s]




targets.shape: (17216,)
contexts.shape: (17216, 5)
labels.shape: (17216, 5)


In [28]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000

# Each element is ((target, context), label); shuffle first, then form
# fixed-size batches, discarding the final incomplete one.
training_examples = ((targets, contexts), labels)
dataset = tf.data.Dataset.from_tensor_slices(training_examples)
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [29]:
# Cache the batched dataset, then prefetch with the AUTOTUNE buffer size.
cached_dataset = dataset.cache()
dataset = cached_dataset.prefetch(buffer_size=AUTOTUNE)
print(dataset)

<PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [30]:
class Word2Vec(tf.keras.Model):
  """Skip-gram model that scores a target word against candidate context words.

  Holds two embedding tables of identical size: one for target words (named
  "w2v_embedding" so it can be fetched by name) and one for context words.
  """

  def __init__(self, vocab_size, embedding_dim):
    """Create the target and context embedding layers.

    Args:
      vocab_size: number of rows in each embedding table.
      embedding_dim: length of each embedding vector.
    """
    super().__init__()
    self.target_embedding = layers.Embedding(
        vocab_size,
        embedding_dim,
        input_length=1,
        name="w2v_embedding")
    self.context_embedding = layers.Embedding(
        vocab_size,
        embedding_dim,
        input_length=NUM_NS+1)

  def call(self, pair):
    """Return dot-product scores between each target and its context candidates.

    Args:
      pair: a (target, context) tuple of id tensors. A rank-2 target has its
        axis 1 squeezed away before the lookup.

    Returns:
      Tensor indexed (batch, context) holding the dot product of the target
      embedding with each context embedding.
    """
    target, context = pair

    target_is_2d = len(target.shape) == 2
    if target_is_2d:
      target = tf.squeeze(target, axis=1)

    target_vectors = self.target_embedding(target)
    context_vectors = self.context_embedding(context)

    # 'be,bce->bc': contract the embedding axis for every (batch, context) entry.
    return tf.einsum('be,bce->bc', target_vectors, context_vectors)

In [31]:
def custom_loss(x_logit, y_true):
  """Element-wise sigmoid cross-entropy between logits and labels.

  Args:
    x_logit: raw (pre-sigmoid) scores.
    y_true: target labels matching `x_logit`.

  Returns:
    The tensor produced by `tf.nn.sigmoid_cross_entropy_with_logits`.

  NOTE(review): Keras invokes loss callables as fn(y_true, y_pred), while this
  signature takes the logits first. Confirm the argument order before passing
  this function to `compile`.
  """
  per_element_loss = tf.nn.sigmoid_cross_entropy_with_logits(
      labels=y_true,
      logits=x_logit)
  return per_element_loss

In [32]:
# Embedding vector length for both the target and the context tables.
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)

# The model outputs raw scores, so the loss is told to expect logits.
softmax_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
word2vec.compile(
    optimizer='adam',
    loss=softmax_loss,
    metrics=['accuracy'],
)

In [33]:
# TensorBoard callback configured with the "logs" directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [34]:
# Train for 80 epochs on the batched skip-gram dataset with the TensorBoard callback attached.
word2vec.fit(dataset, epochs=80, callbacks=[tensorboard_callback])

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


<keras.callbacks.History at 0x7f0251873790>

In [35]:
from sklearn.metrics.pairwise import euclidean_distances

# Learned target-word embeddings: one row per word id.
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
# Pairwise Euclidean distances between all embedding rows.
distance_matrix = euclidean_distances(weights)


def nearest_words(search_term, top_k=5):
    """Return the `top_k` words whose embeddings are closest to `search_term`.

    The query word itself is excluded explicitly rather than by assuming it is
    ranked first. Embedding rows with no word in `id2word` (such as the
    padding id 0) are skipped instead of raising KeyError.

    Args:
      search_term: a word present in `word2id`.
      top_k: number of neighbours to return.

    Returns:
      List of up to `top_k` words ordered from nearest to farthest.
    """
    term_id = word2id[search_term]
    ranked_ids = distance_matrix[term_id].argsort()
    neighbours = [
        id2word[idx]
        for idx in ranked_ids
        if idx != term_id and idx in id2word
    ]
    return neighbours[:top_k]


similar_words = {
    search_term: nearest_words(search_term)
    for search_term in ['عشق', 'حافظ', 'می', 'جام', 'دیوانه']
}

similar_words

{'جام': ['زمن', 'شکس', 'حضوری\u200cگر', 'مناز', 'دوخته'],
 'حافظ': ['می\u200cسوز', 'ننگرد', 'العین', 'بیارا', 'مقدر'],
 'دیوانه': ['لاابال', 'آشناس', 'گزارند', 'ننگرد', 'توشه'],
 'عشق': ['پاکباز', 'رمیدن', 'بنمود', 'معماییس', 'فرهادک'],
 'می': ['دیرگاه', 'گهربار', 'شرمسار', 'جاندار', 'خاتم']}

In [36]:
# Save the trained model under the "hafez" path.
word2vec.save('hafez')

INFO:tensorflow:Assets written to: hafez/assets
