# PreProcess

### Read DataSet

In [1]:
with open ("hafez.txt", "r") as dataset:
    data = dataset.read().splitlines()

In [None]:
data

### Read Stop Words

In [4]:
with open ("fa_stop_words.txt", "r") as fa_stop_words:
    stop_words = fa_stop_words.read().splitlines()

In [None]:
stop_words

### Normalize with Hazm

In [None]:
!pip install hazm

In [7]:
from hazm import *

In [8]:
normalizer = Normalizer()
normalized_data = []
for line in data:
  normalized_data.append(normalizer.normalize(line))

In [None]:
normalized_data

### Tokenize with Hazm

In [10]:
tokenized_data = []
for line in normalized_data:
  tokenized_data.append(word_tokenize(line))

In [None]:
tokenized_data

### Remove Stop Words

In [13]:
clear_data = []
for line in tokenized_data:
  clear_line = []
  for word in line:
    if word not in stop_words:
      clear_line.append(word)
  clear_data.append(clear_line)

In [None]:
clear_data

### Stemming with Hazm
    Finding the root of words

In [20]:
stemmer = Stemmer()
stemmed_data = []
for line in clear_data:
  stemmed_line = []
  for word in line:
    stemmed_word = stemmer.stem(word)
    if len(stemmed_word) > 1:
      stemmed_line.append(stemmed_word)

  stemmed_data.append(stemmed_line)  

In [None]:
stemmed_data

### convert line arrays to sentence

In [21]:
clear_sentences = []
for line in stemmed_data:
  clear_sentences.append(" ".join(line))

clear_sentences = [item for item in clear_sentences if item]  

In [None]:
clear_sentences

# Process

### Build the corpus vocabulary

In [None]:
from keras.preprocessing import text

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(clear_sentences)

word2id = tokenizer.word_index
id2word = {v:k for k, v in word2id.items()}

vocab_size = len(word2id) + 1 
embed_size = 100

wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in clear_sentences]

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

### Build a skip-gram generator

In [None]:
from keras.preprocessing.sequence import skipgrams

skip_grams = [skipgrams(wid, vocabulary_size=vocab_size, window_size=10) for wid in wids]

#Remove empty elements
for i, element in enumerate(skip_grams):
  skip_grams[i] = [item for item in element if item]  
skip_grams = [item for item in skip_grams if item]  

pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
          id2word[pairs[i][0]], pairs[i][0], 
          id2word[pairs[i][1]], pairs[i][1], 
          labels[i]))


### Build the skip-gram model architecture

In [None]:
from keras.layers import *
from keras.layers.core import Dense, Reshape
from keras.layers.embeddings import Embedding
from keras.models import Model, Sequential

word_model = Sequential()
word_model.add(Embedding(vocab_size, embed_size,
                         embeddings_initializer="glorot_uniform",
                         input_length=1))
word_model.add(Reshape((embed_size, )))

context_model = Sequential()
context_model.add(Embedding(vocab_size, embed_size,
                  embeddings_initializer="glorot_uniform",
                  input_length=1))
context_model.add(Reshape((embed_size,)))

merged_output = add([word_model.output, context_model.output]) 

model = Sequential()
model.add(Dense(1, kernel_initializer="glorot_uniform", activation="sigmoid"))

final_model = Model([word_model.input, context_model.input], model(merged_output))
final_model.compile(loss="mean_squared_error", optimizer="rmsprop")
final_model.summary()

# visualize model structure
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
SVG(model_to_dot(final_model, show_shapes=True, show_layer_names=False, rankdir='TB').create(prog='dot', format='svg')) 

### Train the model

In [None]:
skip_grams

In [None]:
import numpy as np

for epoch in range(1, 100):
     loss = 0
     for i, element in enumerate(skip_grams):
         pair_first_element = np.array(list(zip(*element[0]))[0], dtype='int32')
         pair_second_element = np.array(list(zip(*element[0]))[1], dtype='int32')
         labels = np.array(element[1], dtype='int32')
         X = [pair_first_element, pair_second_element]
         Y = labels
         if i % 10000 == 0:
             print('Processed {} (skip_first, skip_second, relevance) pairs'.format(i))
         loss += final_model.train_on_batch(X,Y)  
     print('Epoch:', epoch, 'Loss:', loss) 

Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 1 Loss: 1621.2176669342443
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 2 Loss: 1434.3298737863079


### Get word embeddings

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

word_embed_layer = word_model.layers[0]
weights = word_embed_layer.get_weights()[0][1:]
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] for search_term in ['حافظ', 'خط','دل', 'عشق', 'خدا']}
similar_words 

(5690, 5690)


{'حافظ': ['استظهار', 'رخنه', 'کشفته', 'بخواند', 'مونس'],
 'خدا': ['کلید', 'کاکل', 'جفاک', 'شاهراه', 'بارد'],
 'خط': ['ناله', 'هوادار', 'عال', 'نمی\u200cآید', 'میان'],
 'دل': ['ندیده', 'چشم', 'بخیل', 'بیزار', 'بازآ'],
 'عشق': ['وآنچه', 'صب', 'خطرهاس', 'مهرگیاه', 'همی\u200cبند']}