# **Neural Word Embedding**

> **Word2Vec, Continuous Bag of Word (CBOW)**

> **Word2Vec, Skip-gram with negative sampling (SGNS)**

> **Main key point: Distributional Hypothesis**

> Goal: Predict the context words from a given word

# **How to implement SGNS Algorithm:**


1.   Data preprocessing
2.   Hyperparameters
3.   Training Data
4.   Model Fitting
5.   Inference/Prediction the testing samples




### **Main Class**

In [0]:
from collections import defaultdict
import numpy as np
class word2vec():

  def __init__(self):
    self.n = hyperparameters['n']
    self.learningrate = hyperparameters['learning_rate']
    self.epochs = hyperparameters['epochs']
    self.windowsize = hyperparameters['window_size']

 
  def word2onehot(self, word):
    word_vector =  np.zeros(self.vocabulary_count)
    word_index = self.word_index[word]
    word_vector[word_index] = 1
    return word_vector

  def generate_training_data(self, setting, corpus):
    word_counts = defaultdict(int)
    # print(word_counts)
    for row in corpus:
      for token in row:
        word_counts[token] +=1 
    #print(word_counts)
    self.vocabulary_count = len(word_counts.keys())
    #print(self.vocabulary_count)
    self.words_list = list(word_counts.keys())
    #print(self.words_list)
    self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
    #print(self.word_index)
    self.index_word = dict((i, word) for i, word in enumerate(self.words_list))
    #print(self.index_word)

    training_data = []
    for sentence in corpus:
      sentence_length = len(sentence)
      for i , word in enumerate(sentence):
        word_target = self.word2onehot(sentence[i])
        #print(word_target)
        word_context = []
        for j in range(i - self.windowsize, i + self.windowsize + 1):
          if j !=i and  j <= sentence_length - 1 and j >= 0:
            word_context.append(self.word2onehot(sentence[j]))
            # print(word_context)
        training_data.append([word_target, word_context])
                              
      return np.array(training_data)
    
  def model_training(self, training_data):
      self.w1 = np.random.uniform(-1, 1, (self.vocabulary_count, self.n))
      self.w2 = np.random.uniform(-1, 1, (self.n, self.vocabulary_count))
      for i in range(0, self.epochs):
        # self.loss = 0
        for word_target, word_context in training_data:
          h, u, y_pred= self.forward_pass(word_target)
          # print(y_pred)
   
  def forward_pass(self, x):
      h = np.dot(self.w1.T, x)
      u = np.dot(self.w2.T, h)
      y_pred= self.softmax(u)
      return h, u, y_pred
    
    
  def softmax(self, x):
      e = np.exp(x - np.max(x))
      return e / e.sum(axis=0)

  def word_vector(self, word):
    word_index = self.word_index[word]
    word_vector = self.w1[word_index]
    return word_vector

  def similar_vectors(self, word, n):
    vw1 = self.word_vector(word)
    word_similar={}
    for i in range(self.vocabulary_count):
      vw2 = self.w1[i]
      theta_nom= np.dot(vw1, vw2)
      theta_denom = np.linalg.norm(vw1) * np.linalg.norm(vw2)
      theta = theta_nom / theta_denom
      # print(theta)

      word = self.index_word[i]
      word_similar[word] = theta
    # {k: v for k, v in sorted(x.items(), key=lambda item: item[1])}
    words_sorted = sorted(word_similar.items(), key=lambda ss: ss[1], reverse=True)
    for word, similar in words_sorted[:n]:
      print(word, similar)



### **1.Data PreProcessing**

In [0]:
# Define the mini corpus
document = "A combination of Machine Learning and Natural Language Processing works well"

# Tokenizing and build a vocabulary
corpus = [[]]
for token in document.split():
  corpus[0].append(token.lower())

print(corpus)

[['a', 'combination', 'of', 'machine', 'learning', 'and', 'natural', 'language', 'processing', 'works', 'well']]


### **2. Hyperparameters**

In [0]:
hyperparameters = {
    'window_size': 2, #it covers two words left and two words right
    'n': 11, # dimension of word embedding
    'epochs': 40, # number of training epochs
    'learning_rate': 0.01, # a coefficient for updating weights
}

### **3. Generate Training Data**

In [0]:
# we need to create one-hot vector based on our given corpus
# 1 [target(a)], [context(combination, of)] == [10000000000],[01000000000][00100000000]
# instance
w2v = word2vec()

training_data = w2v.generate_training_data(hyperparameters, corpus)
# print(training_data)

### **4. Model Training**




In [0]:
w2v.model_training(training_data)

[0.08779638 0.07481873 0.02720081 0.07741055 0.00744272 0.16597957
 0.02244375 0.03065301 0.24696449 0.05698785 0.20230215]
[0.05535867 0.00701134 0.03824704 0.10045965 0.56325283 0.01725363
 0.0265237  0.05966504 0.09830109 0.0269263  0.00700071]
[0.04181769 0.01066172 0.11196532 0.21611837 0.09291857 0.06353542
 0.12186928 0.09201719 0.00719568 0.10265939 0.13924137]
[0.01979228 0.67167761 0.0380796  0.00334896 0.01721451 0.03192899
 0.10938238 0.05316565 0.02946898 0.01481168 0.01112936]
[0.08594459 0.01955307 0.03806679 0.20510115 0.00741567 0.1290254
 0.00654433 0.01746104 0.087872   0.22842949 0.17458647]
[0.09563497 0.0609889  0.12708249 0.11587498 0.02070406 0.07517313
 0.07438113 0.10863157 0.08416487 0.03121457 0.20614931]
[0.05032016 0.23525726 0.16200512 0.01933368 0.09044005 0.02026146
 0.06624078 0.18744993 0.0542594  0.08477761 0.02965456]
[0.09318229 0.04413759 0.24420036 0.10517933 0.12382943 0.06460056
 0.0371188  0.0105303  0.0077964  0.15646752 0.11295743]
[0.045164

### **5. Model Prediction**




In [0]:
vector = w2v.word_vector("works")
print(vector)

[-0.5965974   0.59358364  0.49175356  0.59782454 -0.10149338  0.5909372
 -0.4941789   0.73069452 -0.13549471 -0.7486393   0.16786503]


### **Finding Similar Words**




In [0]:
w2v.similar_vectors("works", 5)

works 1.0
language 0.34217254302544925
machine 0.20539544566784484
natural 0.16382679527923805
a 0.13091314242232238
