# Lab 6 - word-2-vec with pytorch and gensim

 "A word is characterized by the company it keeps" - Firth (1957)
 

# Exercise 1 (2pt)


- Train word2vec skip-gram model on sentence "the quick brown fox jumps over the lazy dog". Assume context window = 2, embedding_dim = 5. No preprocessing apart from tokenization.
- Compute model output probabilities for words "lazy" and "dog". If you have trained the model correctly, the output probabilities for word "lazy" should be higher for words "over", "the", "dog" (close to 1/3 each) and lower for other words (close to 0 each). For word "dog", the output probabilities should be higher for words, "the", "dog" (close to 1/2 each) and lower for other words (close to 0 each). 
- Compute dot product between the vector of word "dog" and the vector of word "lazy" (could be representation of center vector and representation of context vector) and between "dog" and "brown". Which one is higher? Why?


You can use this tutorial https://towardsdatascience.com/implementing-word2vec-in-pytorch-skip-gram-model-e6bae040d2fb

Use pytorch (or tensorflow).

In [3]:
# http://pytorch.org/
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

tcmalloc: large alloc 1073750016 bytes == 0x57a44000 @  0x7f1fba1282a4 0x591a07 0x5b5d56 0x502e9a 0x506859 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x504c28 0x502540 0x502f3d 0x507641


<torch._C.Generator at 0x7f6e54fbd710>

In [0]:
sentence = "the quick brown fox jumps over the lazy dog"

In [11]:
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

vocab = (word_tokenize(sentence))
word2idx = {w: idx for (idx, w) in enumerate(vocab)}
idx2word = {idx: w for (idx, w) in enumerate(vocab)}

WINDOW_SIZE = 2
VOCAB_SIZE = len(vocab)
EMBEDDING_DIMS = 5
EPOCHS = 35
idx_pairs = []
indices = [word2idx[word] for word in vocab]
# for each word, treated as center word
for center_word_pos in range(len(indices)):
    # for each window position
    for w in range(-WINDOW_SIZE, WINDOW_SIZE + 1):
        context_word_pos = center_word_pos + w
        # make sure not jump out sentence
        if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
            continue
        context_word_idx = indices[context_word_pos]
        idx_pairs.append((indices[center_word_pos], context_word_idx))

idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array


class Network(nn.Module):
  def __init__(self, vocab_size, embedding_dims):
    super(Network, self).__init__()
    self.linear = nn.Linear(vocab_size, embedding_dims)
    self.inner = nn.Linear(embedding_dims, vocab_size)
    
  def forward(self, input_vec):
    return F.log_softmax(
        (self.inner(
            self.linear(input_vec)
            )
        ), dim=0
    )


def get_input_one_hot(word_idx):
    x = torch.zeros(VOCAB_SIZE).float()
    x[word_idx] = 1.0
    return x
  
model = Network(VOCAB_SIZE, EMBEDDING_DIMS)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(EPOCHS):
  loss_val = 0
  for data, target in idx_pairs:
    model.zero_grad()
    y_true = torch.autograd.Variable(torch.from_numpy(np.array([target])).long())
    x = get_input_one_hot(data).float()
    logits = model(x)
    loss = loss_function(logits.view(1,-1), y_true)
    loss_val += loss.data.item()
    loss.backward()
    optimizer.step()
  with torch.no_grad():
    if epoch % 5 == 0:
      print(f'Loss at epoch {epoch}: {loss_val/len(idx_pairs)}')
with torch.no_grad():
  lazy = get_input_one_hot(word2idx['lazy'])
  probs = torch.exp(model(lazy))
  print('\n Probs for word \'lazy\'')
  for word, prob in zip(vocab,probs.tolist()):
    print('\t{} : {} '.format(word, format(prob, '.3f')))
  print('Sum = {}'.format(format(probs.sum(), '.1f')))
  print('\n Probs for word \'dog\'')
  dog = get_input_one_hot(word2idx['dog'])
  probs = torch.exp(model(dog))
  for word, prob in zip(vocab,probs.tolist()):
    print('\t{} : {} '.format(word, format(prob, '.3f')))
  print('Sum = {}'.format(format(probs.sum(), '.1f')))
  
  brown = get_input_one_hot(word2idx['brown'])
  print
  print('dot(\'dog\', \'lazy\') =  {}'.format(torch.dot(torch.exp(model(dog)), torch.exp(model(lazy)))))
  print('dot(\'dog\', \'brown\') =  {}'.format(torch.dot(torch.exp(model(dog)), torch.exp(model(brown)))))
  

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Loss at epoch 0: 2.2149279634157817
Loss at epoch 5: 2.053186905384064
Loss at epoch 10: 1.935728371143341
Loss at epoch 15: 1.82925044298172
Loss at epoch 20: 1.7955500602722168
Loss at epoch 25: 1.7913446108500162
Loss at epoch 30: 1.7913328210512798

 Probs for word 'lazy'
	the : 0.009 
	quick : 0.012 
	brown : 0.052 
	fox : 0.018 
	jumps : 0.004 
	over : 0.210 
	the : 0.272 
	lazy : 0.034 
	dog : 0.389 
Sum = 1.0

 Probs for word 'dog'
	the : 0.002 
	quick : 0.011 
	brown : 0.009 
	fox : 0.036 
	jumps : 0.023 
	over : 0.007 
	the : 0.317 
	lazy : 0.552 
	dog : 0.044 
Sum = 1.0
dot('dog', 'lazy') =  0.12468066811561584
dot('dog', 'brown') =  0.09756961464881897


   Dot('dog', 'lazy') is bigger than dot('dog', 'brown') because they are more similar to each other ('the' is the context for both 'lazy' and 'dog', wheareas 'dog' and 'brown' contexts are disjoint)

If our vocabulary is bigger, the word2vec model needs a LOT of data to obtain reasonable results. With this amount of data, the code needs to be optimized very well. Writing such code will be more suitable for a project instead of a simple exercise, therefore in the next exercise we will use [gensim](https://radimrehurek.com/gensim/), a library made for efficient training of word vectors.

# * Exercise 2 (2pt)

- Use [gensim](https://radimrehurek.com/gensim/) to train a word2vec model on [OpinRank](http://kavita-ganesan.com/entity-ranking-data/). You can follow this [tutorial](https://medium.freecodecamp.org/how-to-get-started-with-word2vec-and-then-how-to-make-it-work-d0a2fca9dad3), but make sure you have used negative sampling.
- Find 10 similar words to word "dirty" and "canada"
- Check if similarity between "dirty" and "dusty" is bigger than between "dirty" and "clean"

In [1]:
!pip install --upgrade gensim


Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/27/a4/d10c0acc8528d838cda5eede0ee9c784caa598dbf40bd0911ff8d067a7eb/gensim-3.6.0-cp36-cp36m-manylinux1_x86_64.whl (23.6MB)
[K    100% |████████████████████████████████| 23.6MB 1.0MB/s 
Collecting smart-open>=1.2.1 (from gensim)
  Downloading https://files.pythonhosted.org/packages/4b/1f/6f27e3682124de63ac97a0a5876da6186de6c19410feab66c1543afab055/smart_open-1.7.1.tar.gz
Collecting boto>=2.32 (from smart-open>=1.2.1->gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/23/10/c0b78c27298029e4454a472a1919bde20cb182dab1662cec7f2ca1dcc523/boto-2.49.0-py2.py3-none-any.whl (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 13.2MB/s 
[?25hCollecting bz2file (from smart-open>=1.2.1->gensim)
  Downloading https://files.pythonhosted.org/packages/61/39/122222b5e85cd41c391b68a99ee296584b2a2d1d233e7ee32b4532384f2d/bz2file-0.98.tar.gz
Collecting boto3 (from smart-open>=1.2.1->gensim)
[?25l  Downlo

In [0]:
# imports needed and logging
import gzip
import gensim 
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving OpinRankDatasetWithJudgments.zip to OpinRankDatasetWithJudgments.zip


In [0]:
def read_input(input_file):
    """This method reads the input file which is in gzip format"""

    print("reading file {0}...this may take a while".format(input_file))
    with gzip.open(input_file, 'rb') as f:
        for i, line in enumerate(f):

            if (i % 10000 == 0):
                print("read {0} reviews".format(i))
            # do some pre-processing and return list of words for each review
            # text
            yield gensim.utils.simple_preprocess(line)