In [36]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F

In [25]:
# Toy training corpus: seven short, lowercase, whitespace-separated sentences.
corpus = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',
]

In [26]:
def tokenize_corpus(corpus):
  """Split every sentence in ``corpus`` on whitespace.

  Args:
    corpus: iterable of sentences; each item must provide ``.split()``.

  Returns:
    A list holding one list of tokens per input sentence, in input order.
  """
  tokenized = []
  for sentence in corpus:
    tokenized.append(sentence.split())
  return tokenized

# Tokenize the corpus once; the vocabulary and the training pairs are built from this list.
tokenized_corpus = tokenize_corpus(corpus)

In [27]:
# Bare expression: the notebook displays the per-sentence token lists.
tokenized_corpus

[['he', 'is', 'a', 'king'],
 ['she', 'is', 'a', 'queen'],
 ['he', 'is', 'a', 'man'],
 ['she', 'is', 'a', 'woman'],
 ['warsaw', 'is', 'poland', 'capital'],
 ['berlin', 'is', 'germany', 'capital'],
 ['paris', 'is', 'france', 'capital']]

In [28]:
# Distinct tokens in first-seen order. dict.fromkeys de-duplicates in one pass
# and keeps insertion order, so the result matches the list-scan version
# without rescanning the whole vocabulary for every token.
vocabulary = list(dict.fromkeys(
    token for sentence in tokenized_corpus for token in sentence
))

# Token -> position in `vocabulary`, and the inverse position -> token mapping.
word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = dict(enumerate(vocabulary))

# Number of distinct tokens.
vocabulary_size = len(vocabulary)

In [29]:
# Show the vocabulary and both lookup tables, one per line.
print(vocabulary, word2idx, idx2word, sep='\n')

['he', 'is', 'a', 'king', 'she', 'queen', 'man', 'woman', 'warsaw', 'poland', 'capital', 'berlin', 'germany', 'paris', 'france']
{'he': 0, 'is': 1, 'a': 2, 'king': 3, 'she': 4, 'queen': 5, 'man': 6, 'woman': 7, 'warsaw': 8, 'poland': 9, 'capital': 10, 'berlin': 11, 'germany': 12, 'paris': 13, 'france': 14}
{0: 'he', 1: 'is', 2: 'a', 3: 'king', 4: 'she', 5: 'queen', 6: 'man', 7: 'woman', 8: 'warsaw', 9: 'poland', 10: 'capital', 11: 'berlin', 12: 'germany', 13: 'paris', 14: 'france'}


In [None]:
# Build the skip-gram training pairs: every (center word, context word) index
# pair where the context word lies at most `window_size` positions from the center.
window_size = 2
idx_pairs = []
for sentence in tokenized_corpus:
  indices = [word2idx[word] for word in sentence]
  # Treat each word in turn as the center word.
  for center_word_pos, center_word_idx in enumerate(indices):
    # Clamp the window to the sentence so it never reaches outside it.
    window_start = max(0, center_word_pos - window_size)
    window_stop = min(len(indices), center_word_pos + window_size + 1)
    for context_word_pos in range(window_start, window_stop):
      # A word is not its own context.
      if context_word_pos == center_word_pos:
        continue
      idx_pairs.append((center_word_idx, indices[context_word_pos]))

# One row per pair: column 0 is the center index, column 1 the context index.
# reshape(-1, 2) keeps the two-column shape even when no pairs were produced.
idx_pairs = np.array(idx_pairs, dtype=np.int64).reshape(-1, 2)

In [34]:
def get_input_layer(word_idx, vocab_size=None):
  """Build the one-hot input vector for a single vocabulary index.

  Args:
    word_idx: position in the vector to set to 1.0.
    vocab_size: length of the returned vector. When omitted, the
      module-level ``vocabulary_size`` is used (the original behaviour).

  Returns:
    A 1-D float32 tensor of length ``vocab_size`` that is 0.0 everywhere
    except at ``word_idx``, where it is 1.0.
  """
  if vocab_size is None:
    # Fall back to the notebook-level vocabulary size.
    vocab_size = vocabulary_size
  x = torch.zeros(vocab_size).float()
  x[word_idx] = 1.0
  return x

In [42]:
# Hyper-parameters for the toy skip-gram model.
embedding_dims = 5
num_epochs = 100
learning_rate = 0.001

# Seed the RNG so the random initial weights, and therefore the printed
# losses, are the same on every run.
torch.manual_seed(0)

# W1 projects a one-hot word vector into the embedding space; W2 maps the
# embedding back to one score per vocabulary entry. Plain tensors with
# requires_grad=True replace the deprecated Variable wrapper.
W1 = torch.randn(embedding_dims, vocabulary_size, dtype=torch.float32, requires_grad=True)
W2 = torch.randn(vocabulary_size, embedding_dims, dtype=torch.float32, requires_grad=True)

for epo in range(num_epochs):
  loss_val = 0
  # Plain SGD: one update per (center, context) pair.
  for data, target in idx_pairs:
    x = get_input_layer(data)
    y_true = torch.tensor([int(target)], dtype=torch.long)

    z1 = torch.matmul(W1, x)   # embedding of the center word
    z2 = torch.matmul(W2, z1)  # one score per vocabulary entry

    log_softmax = F.log_softmax(z2, dim=0)

    # nll_loss expects a (batch, classes) input, hence the view to one row.
    loss = F.nll_loss(log_softmax.view(1, -1), y_true)
    loss_val += loss.item()
    loss.backward()

    # Update the weights outside autograd tracking instead of through .data.
    with torch.no_grad():
      W1 -= learning_rate * W1.grad
      W2 -= learning_rate * W2.grad

    # Gradients accumulate across backward() calls, so clear them each step.
    W1.grad.zero_()
    W2.grad.zero_()
  if epo % 10 == 0:
    # Mean loss per training pair over this epoch.
    print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

Loss at epo 0: 4.71808587695871
Loss at epo 10: 4.166711593951498
Loss at epo 20: 3.7977089405059816
Loss at epo 30: 3.5320591398647854
Loss at epo 40: 3.32631641796657
Loss at epo 50: 3.158679756096431
Loss at epo 60: 3.017600643634796
Loss at epo 70: 2.896435400417873
Loss at epo 80: 2.791146412917546
Loss at epo 90: 2.699245287690844
