<a href="https://colab.research.google.com/github/M-H-Amini/NLP/blob/master/PersianEmbeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  In The Name Of ALLAH
#  Persian Word Embeddings
#  Mohammad Hossein Amini (mhamini@aut.ac.ir)

In this project, I use a 2-layer neural network to learn word embeddings for Persian words, following the **CBOW** approach: the network predicts the center word from its surrounding context words. After training, the weights of either layer can serve as an embedding, so I take the average of the weights of the two layers (with the second layer's weights transposed) as the final embedding.

In [None]:
import numpy as np
import keras
from google.colab import drive
import os
from matplotlib import pyplot as plt
import shutil
# Mount Google Drive at /content/gdrive (requires the Colab runtime imported above).
drive.mount('/content/gdrive')

In [None]:
def createDict(files, min_freq=3):
  '''
    Builds the vocabulary of the corpus.

    Args:
      files: iterable of paths to UTF-8 text files. Every line is split on
        whitespace to obtain its words.
      min_freq: minimum number of occurrences a word needs in order to be
        kept in the vocabulary.

    Returns:
      A dict mapping each kept word to a two-element list
      [index, frequency]. Indices are consecutive integers starting at 0,
      assigned in order of the word's first appearance in the corpus.
  '''
  counts = {}
  for file in files:
    # The corpus is Persian text, so decode explicitly as UTF-8 instead of
    # relying on the platform's default encoding.
    with open(file, encoding='utf-8') as f:
      for line in f:
        for word in line.split():
          counts[word] = counts.get(word, 0) + 1

  # Drop rare words first so that the indices of the kept words are contiguous.
  kept = [(word, freq) for word, freq in counts.items() if freq >= min_freq]
  return {word: [index, freq] for index, (word, freq) in enumerate(kept)}

# Corpus files; the paths are relative to the current working directory.
files = ['Hafez.txt', 'Saadi.txt']
# min_freq=1 keeps every word that occurs in the corpus.
my_dict = createDict(files, 1)
# Vocabulary size (number of words kept in my_dict).
dict_len = len(my_dict)

In [None]:
# Show the vocabulary size.
print(len(my_dict))

In [None]:
def word2onehotvec(word, my_dict):
  '''
    Returns the one-hot column vector of a word.

    Args:
      word: the word to encode.
      my_dict: vocabulary mapping each word to [index, frequency].

    Returns:
      A float array of shape (len(my_dict), 1) that is 1 at the word's
      index and 0 everywhere else.

    Raises:
      KeyError: if the word is not in my_dict.
  '''
  # Size the vector from the dictionary that was passed in rather than from
  # the module-level dict_len, so the result always matches my_dict.
  res = np.zeros((len(my_dict), 1))
  res[my_dict[word][0]] = 1
  return res

# Sanity check: print the one-hot vector of a sample word
# (raises KeyError if the word is not in my_dict).
print(word2onehotvec('آشفتگان', my_dict))

In [None]:
def createTrainingset(files, my_dict, N=2):
  '''
    Builds the CBOW training samples as lists of words (not vectors).

    Args:
      files: iterable of paths to UTF-8 text files. Context windows never
        cross a line boundary.
      my_dict: vocabulary mapping words to [index, frequency]. It is not
        used here; it is kept so that existing callers keep working.
      N: number of context words taken on each side of the center word.

    Returns:
      A list of samples. Each sample is a list of 2N + 1 words: the center
      word first, followed by the N words before it and the N words after
      it, in the order they appear in the line.
  '''
  ds = []
  for file in files:
    # The corpus is Persian text, so decode explicitly as UTF-8.
    with open(file, encoding='utf-8') as f:
      for line in f:
        words = line.split()
        # A center at index i needs words[i-N] .. words[i+N], so the valid
        # centers are N .. len(words)-N-1 inclusive. The previous upper
        # bound of len(words)-N-1 (exclusive) dropped the last valid window.
        for i in range(N, len(words) - N):
          ds.append([words[i]] + words[i-N:i] + words[i+1:i+N+1])
  return ds

# Build the word-level samples with the default window of N=2 words per side.
ds = createTrainingset(files, my_dict)

In [None]:
def createVectorizedTrainingset(ds, my_dict, max_samples=30000):
  '''
    Turns the dataset from the createTrainingset function
    into a vectorized version.

    Args:
      ds: iterable of samples; each sample is a sequence whose first
        element is the center word and whose remaining elements are the
        context words.
      my_dict: vocabulary mapping each word to [index, frequency].
      max_samples: only the first max_samples samples are vectorized
        (the arrays are dense, so this bounds memory use). Pass None to
        use every sample.

    Returns:
      A tuple (X, y) of float arrays of shape (n_samples, len(my_dict)).
      Row i of X is the average of the one-hot vectors of the context
      words of sample i; row i of y is the one-hot vector of its center
      word. Both arrays have zero rows when ds is empty.

    Raises:
      KeyError: if a sample contains a word that is not in my_dict.
  '''
  from itertools import islice

  vocab_size = len(my_dict)
  samples = list(islice(ds, max_samples))
  X = np.zeros((len(samples), vocab_size))
  y = np.zeros((len(samples), vocab_size))
  for row, sample in enumerate(samples):
    context = sample[1:]
    # Adding 1 at each context word's index is the sum of their one-hot
    # vectors, without allocating a full vector per word.
    for word in context:
      X[row, my_dict[word][0]] += 1
    if len(context):
      X[row] /= len(context)
    y[row, my_dict[sample[0]][0]] = 1
  if len(samples):
    # Debug aid kept from the original notebook: show the first input
    # vector as a column.
    print(X[0].reshape(-1, 1))
  return X, y
    
# X holds the averaged context vectors, y the one-hot center words.
X, y = createVectorizedTrainingset(ds, my_dict)

In [None]:
# Inspect the array shapes and the first input row.
# NOTE(review): the author's "Problem" remark below is unexplained -- confirm what was being debugged.
print(X.shape, y.shape)
print(X[0:1])  #  Problem

In [None]:
def saveModel(model, name='persian'):
  '''
    Saves the model locally and mirrors it to Google Drive.

    The model is written to the path `name` (relative to the current
    working directory) with model.save. Any existing copy at
    gdrive/My Drive/<name> is then removed and replaced by a copy of the
    freshly saved directory.

    Args:
      model: object exposing a save(path) method.
      name: local path of the saved model and name of its copy on Drive.
  '''
  model.save(name)
  drive_copy = os.path.join('gdrive', 'My Drive', name)
  # Clear the previous backup first; copytree requires that the destination
  # does not exist yet.
  shutil.rmtree(drive_copy, ignore_errors=True)
  shutil.copytree(name, drive_copy)
  print('Done')

In [None]:
def loadModel(name):
  '''
    Returns the model stored on Google Drive, or a new one if none exists.

    If gdrive/My Drive/<name> is a directory, it is copied over the local
    path `name` (replacing whatever is there) and loaded with
    keras.models.load_model. Otherwise a new Sequential network is built:
    a 100-unit sigmoid Dense layer followed by a softmax Dense layer with
    one unit per vocabulary word. The vocabulary size is read from the
    module-level my_dict, and the new network is returned without being
    compiled here.

    Args:
      name: local path of the model and name of its copy on Drive.
  '''
  drive_copy = os.path.join('gdrive', 'My Drive', name)

  if not os.path.isdir(drive_copy):
    # Nothing saved yet: build the network from scratch.
    vocab_size = len(my_dict)
    fresh = keras.models.Sequential()
    fresh.add(keras.layers.Dense(100, activation='sigmoid', input_shape=(vocab_size,)))
    fresh.add(keras.layers.Dense(vocab_size, activation='softmax'))
    print('Model not found...Created a new one!')
    return fresh

  # Refresh the local copy from Drive before loading it.
  shutil.rmtree(name, ignore_errors=True)
  shutil.copytree(drive_copy, name)
  restored = keras.models.load_model(name)
  print('Model loaded!')
  return restored

# Reuse the model stored on Drive under 'Persian' if there is one, otherwise start fresh.
model = loadModel('Persian')

In [None]:
# Adam optimizer with categorical cross-entropy loss; accuracy is tracked as a metric.
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 10 epochs with mini-batches of 16 samples.
model.fit(X, y, batch_size=16, epochs=10)

In [None]:
# Save the trained model locally and copy it to Drive under 'Persian'.
saveModel(model, 'Persian')

In [None]:
def extractEmbeddings(model):
  '''
    Builds the embedding matrix from the trained model.

    Reads entries 0 and 2 of model.trainable_weights, transposes the
    latter so that both matrices have the same orientation, and returns
    their element-wise mean.
    NOTE(review): presumably entries 0 and 2 are the kernels of the first
    and second Dense layers (entries 1 and 3 being the biases) -- confirm
    against the Keras weight ordering.

    Args:
      model: object whose trainable_weights entries expose .numpy().

    Returns:
      The averaged weight matrix as a NumPy array.
  '''
  weights = model.trainable_weights
  first_matrix = weights[0].numpy()
  second_matrix = np.transpose(weights[2].numpy())
  return (first_matrix + second_matrix) / 2

# Embedding matrix; its rows are looked up by the word indices stored in my_dict further below.
embd = extractEmbeddings(model)

Let's visualize the embeddings in a 2-D plot.

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a 2-component PCA on the embeddings and project them to 2-D for plotting.
pca = PCA(2)
pca.fit(embd)
embd2d = pca.transform(embd)

In [None]:
# Words to show in the plot; each one must be in my_dict, otherwise the lookup raises KeyError.
chosen_words = ['دوش', 'سحر', 'ملکوت', 'اعلی', 'عشق', 'عاشق', 'معشوق']
# Row index of each chosen word (first element of its my_dict entry).
chosen_indxs = [my_dict[word][0] for word in chosen_words]
# 2-D coordinates of the chosen words, one row per word.
chosen_2dvecs = embd2d[chosen_indxs]

In [None]:
# Mark every chosen word with a green cross at its 2-D position and
# write the word itself next to the marker.
plt.figure()
for (px, py), label in zip(chosen_2dvecs, chosen_words):
  plt.plot(px, py, 'gx')
  plt.text(px, py, label)
plt.show()