# Word2Vec practice (PyTorch)

If you don't have data for word2vec, you can download the dataset
from https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip,  
or you can download it with urllib.request as follows.

```python
import urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/GaoleMeng/RNN-and-FFNN-textClassification/master/ted_en-20160408.xml", filename="ted_en-20160408.xml")
```

In [1]:
# Packages for preprocessing
import re
import math
import random
from lxml import etree
from collections import Counter
from numpy.random import multinomial

# Packages for training
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

## Preprocess dataset  
I follow the .xml preprocessing steps described at the following site.  
https://wikidocs.net/60855  
  
1. Load the dataset: open()  
2. Extract the text between the `<content>` and `</content>` tags
3. Replace non-text elements (parenthesised spans) with ' '
4. Split the text into sentences.
5. Remove the punctuation marks, replacing them with blanks,
   and convert capital letters to lowercase
6. Tokenize the preprocessed sentences

In [2]:
# Parse the TED XML dump. The `with` block closes the file handle as soon as
# lxml has finished reading it (the handle was previously left open).
with open('dataset/ted_en-20160408.xml', 'r', encoding='UTF8') as dataset:
    # Join the text of every <content> element into one newline-separated string.
    text = '\n'.join(etree.parse(dataset).xpath('//content/text()'))

# Replace every parenthesised span with a single space.
text = re.sub(r'\([^)]*\)', ' ', text)
print("*Print one sentence in text:\n\n{}".format(text[:95]))

# Naive sentence split on full stops.
sentences = text.split('.')
print("\n*Print one sentence in sentences:\n\n{}".format(sentences[0]))

# Lower-case each sentence and collapse every run of characters outside
# [a-z0-9] into one space.
pre_sentences = [
    re.sub(r"[^a-z0-9]+", " ", sentence.lower()) for sentence in sentences
]

print("\n*Print one sentence in pre_sentences:\n\n{}".format(pre_sentences[0]))

# Split on single spaces; the raw split may still contain empty strings.
Tokenized_sentence = [sentence.split(" ") for sentence in pre_sentences]
tokenized_sentence = []
for sentence in Tokenized_sentence:
    # The length check is applied to the raw split, i.e. before the empty
    # strings are removed on the next line.
    if len(sentence) < 5:
        continue
    tokenized_sentence.append([w for w in sentence if w != ''])
print("\n*Print one sentence in tokenized_sentence:\n\n{}".format(tokenized_sentence[0]))

*Print one sentence in text:

Here are two reasons companies fail: they only do more of the same, or they only do what's new.

*Print one sentence in sentences:

Here are two reasons companies fail: they only do more of the same, or they only do what's new

*Print one sentence in pre_sentences:

here are two reasons companies fail they only do more of the same or they only do what s new

*Print one sentence in tokenized_sentence:

['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new']


## Word to index & Index to Word
I follow the instruction from https://towardsdatascience.com/implementing-word2vec-in-pytorch-skip-gram-model-e6bae040d2fb    

1. Make vocabulary from tokenized_sentence
2. Count the word frequencies and drop words that appear fewer than min_freq times
3. Subsampling frequent words
4. Create dictionaries for mapping between word and index 

### Min frequency
  
Words below the minimum frequency are dropped before training occurs.
So, before starting the training, I drop the words that appear fewer than 'min_freq' times.

### Sub sampling

The Word2Vec authors reduce, in a probabilistic way, how often frequently occurring words are used for training. Frequent words already get many opportunities to be updated, in proportion to how often they appear.  
They define the probability of excluding the i-th word $w_i$ from training as follows, where $f(w_i)$ is the relative frequency of $w_i$ in the corpus and $t$ is a threshold.  
  
$$ P(w_i) = 1 - \sqrt{\frac{t}{f(w_i)}}$$  
  
They recommend setting $t$ to 0.00001

In [3]:
# Minimum number of occurrences a word needs to be kept in the vocabulary.
min_freq = 5

# Token -> number of occurrences over all tokenized sentences.
# Counter is a dict subclass and keeps first-seen insertion order, so code
# that treats `vocabulary` as a plain dict keeps working unchanged.
vocabulary = Counter(
    token for sentence in tokenized_sentence for token in sentence
)

In [4]:
# Drop rare words: keep only the entries whose count reaches min_freq.
VOCAB = {
    word: vocabulary[word]
    for word in vocabulary
    if vocabulary[word] >= min_freq
}

In [5]:
# Sub Sampling
# Relative frequency of every vocabulary word.
sum_word_counts = sum(VOCAB.values())
words_prob = {}
for word, cnt in VOCAB.items():
    words_prob[word] = cnt / float(sum_word_counts)

# Rebuild every sentence, skipping out-of-vocabulary tokens and randomly
# discarding frequent ones. One random draw is made per in-vocabulary token.
filtered = []
for sentence in tokenized_sentence:
    kept = []
    for token in sentence:
        if token in VOCAB:
            # Probability of discarding this token: 1 - sqrt(t / f(w)).
            discard_prob = 1 - math.sqrt(0.00001 / words_prob[token])
            if random.random() >= discard_prob:
                kept.append(token)
    filtered.append(kept)

In [6]:
# Keep only the subsampled sentences that still have at least 5 tokens.
SENTENCE = [sentence for sentence in filtered if len(sentence) >= 5]

# Number the vocabulary words in VOCAB's iteration order and build both
# lookup directions from that single numbering.
index2word = dict(enumerate(VOCAB))
word2index = {word: idx for idx, word in index2word.items()}

vocab_size = len(VOCAB)

## Get pairs of words that exist within the window size

1. Negative Sampling
2. Get pairs of words that exist within the window size.  
We will use them to train the word2vec embedding model.
  
### Negative Sampling

Negative words are drawn at random, with probability proportional to each word's count raised to the power 0.75.

You can change the window size; here we set it to 5.

In [7]:
# Negative Sampling
def sample_negative(sample_size, sentences=None):
    """Yield endless batches of randomly drawn negative-sample words.

    Each word is drawn with probability proportional to its count in the
    corpus raised to the power 0.75.

    Args:
        sample_size: Number of words in every yielded batch.
        sentences: Iterable of token lists to count words from. Defaults to
            the module-level ``filtered`` corpus.

    Yields:
        A list of ``sample_size`` words. Repeated draws of the same word are
        adjacent, and words appear in first-seen corpus order.

    Raises:
        ValueError: On the first ``next()`` if the corpus contains no words.
    """
    if sentences is None:
        sentences = filtered

    # One linear pass over the corpus; avoids the quadratic sum(lists, []).
    word_counts = Counter(word for sentence in sentences for word in sentence)
    if not word_counts:
        raise ValueError("cannot draw negative samples from an empty corpus")

    words = list(word_counts)
    weights = [count ** 0.75 for count in word_counts.values()]
    normalizing_factor = sum(weights)
    sample_prob = [weight / normalizing_factor for weight in weights]

    while True:
        word_list = []
        # sampled_counts[i] is how many times words[i] was drawn this round.
        sampled_counts = multinomial(sample_size, sample_prob)
        for index, count in enumerate(sampled_counts):
            # int() keeps this a list repetition instead of a numpy multiply.
            word_list.extend([words[index]] * int(count))
        yield word_list

In [None]:
window_size = 5      # positions on each side of the center that count as context
negative_size = 10   # negative samples drawn for every (center, context) pair
word_pairs = []

# Use the configured size instead of repeating the literal 10.
negative_samples = sample_negative(negative_size)

for sentence in SENTENCE:
    indices = [word2index[word] for word in sentence]

    for center_idx, center in enumerate(indices):
        # Clamp the window to the sentence so no out-of-range position is visited.
        first = max(0, center_idx - window_size)
        last = min(len(indices), center_idx + window_size + 1)
        for context_idx in range(first, last):
            if context_idx == center_idx:
                continue
            # Store the negatives as vocabulary indices, like the center and
            # context entries, rather than as raw words.
            negatives = [word2index[word] for word in next(negative_samples)]
            word_pairs.append((center, indices[context_idx], negatives))
    # NOTE(review): this break stops after the first sentence, so word_pairs
    # only covers SENTENCE[0]. Remove it to build pairs for the whole corpus.
    break

In [None]:
# Example: print the first sentence and the first 30 entries of word_pairs.
sentence_len = len(SENTENCE[0])
print(SENTENCE[0])

# Entries of word_pairs carry the negative samples after the two indices, so
# only the first two fields are unpacked (a two-name unpack of the full
# 3-tuple raised ValueError).
for pair in word_pairs[:30]:
    center_idx, context_idx = pair[:2]

    print("Center: {}, Context: {}".format(index2word[center_idx], index2word[context_idx]))

## Skip gram
  
We will use Skip gram, not CBOW.  
This is the probability distribution for a single pair. 
  
$$ P(context|center;\theta) $$  
  
Then, maximize the product of this probability over all center/context pairs.  
  
$$ \max_\theta \prod_{center} \prod_{context} P(context|center;\theta) $$  
  
Next, take the negative log-likelihood to turn this into a minimization problem, where $T$ is the number of center words.  
  
$$ \min_\theta -\frac{1}{T} \sum_{center} \sum_{context} \log P(context|center;\theta) $$  
  
### Define P
  
We have to define the probability distribution. Assume each word is represented by two vectors.  
1. v : used when the word is the center word
2. u : used when the word is the context word
  
Then, we can write P as follows:  
  
$$ P(context|center;\theta) = \frac{\exp(u^T_{context} v_{center})}{\sum_{w \in vocab} \exp(u^T_{w} v_{center})}$$

## Train Word2Vec model

Now, we are ready to train the word2vec model.

In [None]:
# Training hyper-parameters.
lr = 0.01        # learning rate handed to the optimizer
emb_dim = 50     # embedding dimension passed to the model
n_epochs = 10    # number of training passes over the pairs

In [None]:
#min_freq, vocab_size, window_size, word_pairs
class Word2Vec(nn.Module):
    """Embedding layer followed by a linear projection back onto the vocabulary.

    Args:
        vocab_size: Number of distinct words (rows of the embedding table and
            width of the output logits).
        emb_dim: Size of each word embedding.
    """

    def __init__(self, vocab_size, emb_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.output = nn.Linear(emb_dim, vocab_size)

    def forward(self, x):
        """Return the vocabulary logits for one center word.

        Args:
            x: Either an integer tensor holding the word index, or a
                floating-point one-hot vector of shape [vocab_size].

        Returns:
            Logits reshaped with ``view(1, -1)``; shape [1, vocab_size] for a
            single word.
        """
        if x.is_floating_point():
            # nn.Embedding only accepts integer indices. A one-hot vector
            # times the embedding matrix selects the same row.
            hidden = x @ self.embedding.weight
        else:
            hidden = self.embedding(x)
        # hidden : [emb_dim]
        out = self.output(hidden)
        # out : [vocab_size] -> [1, vocab_size]
        return out.view(1, -1)
    

In [None]:
# Loss function: cross-entropy applied to the model's logits.
criterion = F.cross_entropy

# Model and a plain SGD optimizer over all of its parameters.
model = Word2Vec(vocab_size, emb_dim)
optimizer = optim.SGD(params=model.parameters(), lr=lr)

def get_one_hot(index):
    """Return a float vector of length ``vocab_size`` with 1.0 at ``index``.

    ``vocab_size`` is read from module scope.
    """
    one_hot = torch.zeros(vocab_size, dtype=torch.float)
    one_hot[index] = 1.0
    return one_hot

def train(model, data):
    """Run one pass over ``data``, taking an optimizer step after every pair.

    Uses the module-level ``optimizer`` and ``criterion``.

    Args:
        model: Network that maps a center-word index tensor to logits of
            shape [1, vocab_size].
        data: Iterable of sequences whose first two items are the center-word
            index and the context-word index. Any further items (such as the
            negative samples stored in ``word_pairs``) are ignored, because
            the loss is a softmax over the whole vocabulary.

    Returns:
        The sum of the per-pair losses as a Python number.
    """
    model.train()
    total_loss = 0

    for pairs in data:
        center, context = pairs[0], pairs[1]

        # nn.Embedding needs an integer index, not a float one-hot vector.
        x = torch.tensor(center, dtype=torch.long)
        y_true = torch.tensor([context], dtype=torch.long)

        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y_true)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss

In [None]:
import time

# Train for n_epochs passes, reporting wall-clock time and loss after each one.
for epoch in range(n_epochs):
    started_at = time.time()
    train_loss = train(model, word_pairs)
    elapsed = time.time() - started_at

    print("Epoch : {0}\tTime: {1:.4f}s".format(epoch, elapsed))
    print("Train loss: {:.4f}".format(train_loss))