# Word2Vec and GloVe

## 0. Import and download

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import reuters
import time

In [2]:
nltk.download("reuters")
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## 1. Load documents from nltk reuter news fuel category

I chose the the category "cpu" from Reuters news in th nltk to train the models

In [184]:
cpu_docs = reuters.fileids('cpu')

In [216]:
[reuters.raw(doc) for doc in reuters.fileids(reuters.categories())]

10788

In [185]:
corpus = [reuters.raw(doc) for doc in cpu_docs]

In [186]:
len(corpus)

4

In [6]:
from nltk.tokenize import word_tokenize

flatten = lambda l: [item for sublist in l for item in sublist]

# creating a function to prepare corpus and get vocab, word2index, index2word
def prepare_corpus(corpus):
    corpus = [word_tokenize(sent) for sent in corpus]
    vocab = list(set(flatten(corpus)))
    word2index = {w: i+1 for i, w in enumerate(vocab)}
    vocab.append('<UNK>')
    voc_size = len(vocab)
    word2index['<UNK>'] = 0
    index2word = {v:k for k, v in word2index.items()}
    
    return corpus, vocab, voc_size, word2index, index2word

In [188]:
corpus, vocab, voc_size, word2index, index2word = prepare_corpus(corpus)

## 2. Prepare data for Skipgram

In [8]:
for c in corpus:
    print(c)

['U.S.', 'INDUSTRIAL', 'CAPACITY', 'USE', 'RATE', '81.2', 'PCT', 'IN', 'SEPTEMBER', ',', 'UNCHANGED', 'FROM', 'AUGUST', 'U.S.', 'INDUSTRIAL', 'CAPACITY', 'USE', 'RATE', '81.2', 'PCT', 'IN', 'SEPTEMBER', ',', 'UNCHANGED', 'FROM', 'AUGUST']
['CANADA', 'MANUFACTURING', 'UTILIZATION', 'RATE', 'RISES', 'Utilization', 'of', 'Canadian', 'manufacturing', 'capacity', 'rose', 'to', '77.2', 'pct', 'in', 'the', 'fourth', 'quarter', 'of', '1986', 'from', '77', 'pct', 'in', 'the', 'third', 'quarter', ',', 'Statistics', 'Canada', 'said', '.', '``', 'Although', 'the', 'change', 'was', 'small', ',', 'this', 'marked', 'the', 'first', 'quarter', 'since', 'the', 'third', 'quarter', 'of', '1985', 'in', 'which', 'the', 'utilization', 'rates', 'for', 'manufacturing', 'as', 'a', 'whole', 'rose', ',', "''", 'the', 'federal', 'agency', 'said', '.', 'Increased', 'residential', 'construction', 'led', 'to', 'strong', 'increases', 'in', 'the', 'building', 'materials', 'sector', ',', 'led', 'by', 'a', '3.3', 'pct', 

The below function will take corpus and window size to populate skipgrams. This function returns skip_grams for Skipgram model and skip_gram_pair for glove model

In [9]:
def skip_gram(corpus,window_size=2):
    skip_grams, skip_grams_pair = [], []
    for sent in corpus:
        for i in range(window_size, len(sent) - window_size):
            target = word2index[sent[i]]
            context = []
            for j in range(window_size):
                context.append(word2index[sent[i - (1+j)]] )
                context.append(word2index[sent[i + (1+j)]] )
            for w in context:
                skip_grams.append([target, w])
                skip_grams_pair.append((index2word[target], index2word[w]))
    return skip_grams,skip_grams_pair

In [10]:
def random_batch_skipgram(batch_size, word_sequence,window_size=2):
    
    # Make skip gram 
    skip_grams,_= skip_gram(word_sequence,window_size=window_size)
                
    # print(len(skip_grams))
    
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False) #randomly pick without replacement
        
    for i in random_index:
        random_inputs.append([skip_grams[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams[i][1]])  # context word, e.g., 3
            
    return np.array(random_inputs), np.array(random_labels)

### Testing the method

In [11]:
#testing the method
batch_size = 2 # mini-batch size
input_batch, target_batch = random_batch_skipgram(batch_size, corpus,window_size=1)

print("Input: ", input_batch)
print("Target: ", target_batch)

#we will convert them to tensor during training, so don't worry...

Input:  [[ 56]
 [168]]
Target:  [[ 87]
 [147]]


## 3. Skipgram

$$J(\theta) = -\frac{1}{T}\sum_{t=1}^{T}\sum_{\substack{-m \leq j \leq m \\ j \neq 0}}\log P(w_{t+j} | w_t; \theta)$$

where $P(w_{t+j} | w_t; \theta) = $

$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

where $o$ is the outside words and $c$ is the center word

In [12]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)
    
    def forward(self, center_words, target_words, all_vocabs):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        all_embeds    = self.embedding_u(all_vocabs) #   [batch_size, voc_size, emb_size]
        
        scores      = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]

        norm_scores = all_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, voc_size, emb_size] @ [batch_size, emb_size, 1] = [batch_size, voc_size, 1] = [batch_size, voc_size]

        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1))) # log-softmax
        # scalar (loss must be scalar)    
            
        return nll # negative log likelihood

## 4. Negative Sampling

### Unigram distribution

$$P(w)=U(w)^{3/4}/Z$$

In [13]:
from collections import Counter
def unigram(corpus,vocab,Z=0.001):
    
    unigram_table = []
    word_count = Counter(flatten(corpus))
    num_total_words = sum([c for w, c in word_count.items()])
    for vo in vocab:
        unigram_table.extend([vo] * int(((word_count[vo]/num_total_words)**0.75)/Z))
    
    return unigram_table

In [14]:
unigram_table = unigram(corpus,vocab,Z=0.001)

### Negative Sampling

In [15]:
import random

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.size(0)
    neg_samples = []
    for i in range(batch_size):
        nsample = []
        target_index = targets[i].item()
        while len(nsample) < k: # num of sampling
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).view(1, -1))
    return torch.cat(neg_samples)

### Testing the negative sampling

In [16]:
input_batch  = torch.Tensor(input_batch)
target_batch = torch.LongTensor(target_batch)

In [17]:
target_batch.shape

torch.Size([2, 1])

In [18]:
input_batch

tensor([[ 56.],
        [168.]])

In [19]:
num_neg = 3
negative_sampling(target_batch, unigram_table, num_neg)

#{'grapes': 0, 'apple': 1, 'animal': 2, 'cat': 3, 'ice': 4, 'orange': 5, 'dog': 6, 'monkey': 7, 'conda': 8, 'fruit': 9, 'banana': 10}

tensor([[ 73, 151, 150],
        [109, 100,  15]])

In [20]:
target_batch[1]

tensor([147])

## 5. Skipgram(Negative Sampling)

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [21]:
class SkipgramNegSampling(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(SkipgramNegSampling, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, emb_size) # out embedding
        self.logsigmoid = nn.LogSigmoid()
                    
    def forward(self, center_words, target_words, negative_words):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        neg_embeds    = -self.embedding_u(negative_words) # [batch_size, num_neg, emb_size]
        
        positive_score = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]
        
        negative_score = neg_embeds.bmm(center_embeds.transpose(1, 2))
        #[batch_size, k, emb_size] @ [batch_size, emb_size, 1] = [batch_size, k, 1]
        
        loss = self.logsigmoid(positive_score) + torch.sum(self.logsigmoid(negative_score), 1)
                
        return -torch.mean(loss)
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

## 6. Build Co-occurence Matrix X

Here, we need to count the co-occurence of two words given some window size.  We gonna use window size of 1.

In [22]:
from collections import Counter
from itertools import combinations_with_replacement

X_i = Counter(flatten(corpus)) # X_i
X_i

Counter({'pct': 28,
         'in': 27,
         'the': 19,
         ',': 16,
         '.': 16,
         'from': 12,
         'January': 12,
         'to': 11,
         'of': 10,
         'February': 8,
         'PCT': 7,
         'IN': 7,
         'said': 7,
         'and': 7,
         'U.S.': 6,
         'RATE': 6,
         'CAPACITY': 5,
         'USE': 5,
         'manufacturing': 5,
         'capacity': 5,
         'was': 5,
         'for': 5,
         'materials': 5,
         'rate': 5,
         'INDUSTRIAL': 4,
         'FROM': 4,
         'rose': 4,
         'quarter': 4,
         '1986': 4,
         'a': 4,
         '79.8': 4,
         '79.6': 4,
         'Fed': 4,
         '81.2': 3,
         'December': 3,
         'The': 3,
         'metals': 3,
         'month': 3,
         'use': 3,
         'SEPTEMBER': 2,
         'UNCHANGED': 2,
         'AUGUST': 2,
         'third': 2,
         'utilization': 2,
         'as': 2,
         'led': 2,
         'ROSE': 2,
         'TO': 2

### Weighting function

GloVe includes a weighting function to scale down too frequent words.

<img src = "figures/glove_weighting_func.png" width=400>

In [23]:
#simply a normalized function...don't worry too much
def weighting(w_i, w_j, X_ik):
        
    #check whether the co-occurrences exist between these two words
    try:
        x_ij = X_ik[(w_i, w_j)]
    except:
        x_ij = 1  #if does not exist, set it to 1
                
    x_max = 100 #100 # fixed in paper  #cannot exceed 100 counts
    alpha = 0.75
    
    #if co-occurrence does not exceed 100, scale it based on some alpha
    if x_ij < x_max:
        result = (x_ij/x_max)**alpha  #scale it
    else:
        result = 1  #if is greater than max, set it to 1 maximum
    
    return result

Need to build co-occurence matrix for negative sampling

In [24]:
def build_cooc_matrix(word_seq,window_size=2):
    _, skip_grams_pair = skip_gram(word_seq,window_size=window_size)
    X_ik_skipgram = Counter(skip_grams_pair)
    X_ik = {}  #for keeping the co-occurences
    weighting_dic = {} #scaling the percentage of sampling

    for bigram in combinations_with_replacement(vocab, 2):
        if X_ik_skipgram.get(bigram) is not None:  #matches 
            co_occer = X_ik_skipgram[bigram]  #get the count from what we already counted
            X_ik[bigram] = co_occer + 1 # + 1 for stability issue
            X_ik[(bigram[1],bigram[0])] = co_occer+1   #count also for the opposite
        else:
            pass
            
        weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
        weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)
        
    return X_ik, weighting_dic, skip_grams_pair

In [25]:
X_ik,weighting_dic,skip_grams_pair = build_cooc_matrix(corpus,window_size=2)

In [26]:
X_ik,weighting_dic

({('primary', 'gain'): 2,
  ('gain', 'primary'): 2,
  ('primary', 'in'): 2,
  ('in', 'primary'): 2,
  ('primary', 'metals'): 2,
  ('metals', 'primary'): 2,
  ('primary', 'production'): 2,
  ('production', 'primary'): 2,
  ('helped', 'raise'): 2,
  ('raise', 'helped'): 2,
  ('helped', 'metals'): 2,
  ('metals', 'helped'): 2,
  ('helped', 'production'): 2,
  ('production', 'helped'): 2,
  ('helped', 'manufacturing'): 2,
  ('manufacturing', 'helped'): 2,
  ('1986', 'February'): 4,
  ('February', '1986'): 4,
  ('1986', 'quarter'): 2,
  ('quarter', '1986'): 2,
  ('1986', '77'): 2,
  ('77', '1986'): 2,
  ('1986', ','): 2,
  (',', '1986'): 2,
  ('1986', '.'): 2,
  ('.', '1986'): 2,
  ('1986', 'rate'): 3,
  ('rate', '1986'): 3,
  ('1986', 'of'): 4,
  ('of', '1986'): 4,
  ('1986', 'from'): 2,
  ('from', '1986'): 2,
  ('1986', 'Fabricated'): 2,
  ('Fabricated', '1986'): 2,
  ('1986', 'the'): 3,
  ('the', '1986'): 3,
  ('for', 'decline'): 2,
  ('decline', 'for'): 2,
  ('for', 'use'): 2,
  ('use',

## 7. Prepare data for GloVe

In [27]:
import math

def random_batch_glove(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    #convert to id since our skip_grams is word, not yet id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    random_inputs = []
    random_labels = []
    random_coocs  = []
    random_weightings = []
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False) #randomly pick without replacement
        
    for i in random_index:
        random_inputs.append([skip_grams_id[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams_id[i][1]])  # context word, e.g., 3
        
        #get cooc
        pair = skip_grams[i]
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
        
        #get weighting
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
                    
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

### Testing the method

In [28]:
#testing the method
batch_size = 2 # mini-batch size
input_batch, target_batch, cooc_batch, weighting_batch = random_batch_glove(batch_size, corpus, skip_grams_pair, X_ik, weighting_dic)

print("Input: ", input_batch)
print("Target: ", target_batch)
print("Cooc: ", cooc_batch)
print("Weighting: ", weighting_batch)

#we will convert them to tensor during training, so don't worry...

Input:  [[ 3]
 [21]]
Target:  [[10]
 [59]]
Cooc:  [[1.38629436]
 [0.69314718]]
Weighting:  [[0.08944272]
 [0.05318296]]


## 8. GloVe

<img src ="figures/glove.png">

In [29]:
class GloVe(nn.Module):
    
    def __init__(self, vocab_size,embed_size):
        super(GloVe,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, embed_size) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, embed_size) # out embedding
        
        self.v_bias = nn.Embedding(vocab_size, 1)
        self.u_bias = nn.Embedding(vocab_size, 1)
        
    def forward(self, center_words, target_words, coocs, weighting):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        
        center_bias = self.v_bias(center_words).squeeze(1)
        target_bias = self.u_bias(target_words).squeeze(1)
        
        inner_product = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]
        
        #note that coocs already got log
        loss = weighting*torch.pow(inner_product +center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

## 9. Training

In [30]:
batch_size     = 10 # mini-batch size
embedding_size = 2 
num_epochs = 5000

prepare training functions for both glove and skipgram models

In [31]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [32]:
def elapsed_time(elapsed_time):
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [33]:
def train_skipgram(corpus,epoch,model,random_batch,optimizer,samples,neg_sampling=False,num_neg=10,window_size=2):
    train_time = 0
    for epoch in range(num_epochs):
    
        start = time.time()

        input_batch, target_batch = random_batch(batch_size, corpus,window_size=window_size)
        input_batch  = torch.LongTensor(input_batch)  #[batch_size, 1]
        target_batch = torch.LongTensor(target_batch) #[batch_size, 1]
        
        if neg_sampling == True:
            samples = negative_sampling(target_batch, unigram_table, num_neg)

        optimizer.zero_grad()
        loss = model(input_batch, target_batch, samples)

        loss.backward()
        optimizer.step()

        end = time.time()

        # epoch_mins, epoch_secs = epoch_time(start, end)
        
        train_time += (end - start)

        if (epoch + 1) % 1000 == 0:
            train_min, train_sec = elapsed_time(train_time)
            print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {train_min}m {train_sec}s")
            
    return train_time,loss.item()

In [34]:
def train_glove(corpus,epoch,model,random_batch,optimizer,skip_grams, X_ik, weighting_dic):
    train_time = 0
    for epoch in range(num_epochs):
    
        start = time.time()

        input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)
        input_batch  = torch.LongTensor(input_batch)         #[batch_size, 1]
        target_batch = torch.LongTensor(target_batch)        #[batch_size, 1]
        cooc_batch   = torch.FloatTensor(cooc_batch)         #[batch_size, 1]
        weighting_batch = torch.FloatTensor(weighting_batch) #[batch_size, 1]
        
        optimizer.zero_grad()
        loss = model(input_batch, target_batch, cooc_batch, weighting_batch)

        loss.backward()
        optimizer.step()

        end = time.time()

        # epoch_mins, epoch_secs = epoch_time(start, end)
        
        train_time += (end - start)

        if (epoch + 1) % 1000 == 0:
            train_min, train_sec = elapsed_time(train_time)
            print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {train_min}m {train_sec}s")

    return train_time,loss.item()

### Training Skipgram(without negative sampling)

In [35]:
skip_gram_model = Skipgram(voc_size, embedding_size)

optimizer = optim.Adam(skip_gram_model.parameters(), lr=0.001)

In [36]:
#use for the normalized term in the probability calculation
all_vocabs = prepare_sequence(list(vocab), word2index).expand(batch_size, len(vocab))  # [batch_size, voc_size]
all_vocabs.shape

torch.Size([10, 170])

In [37]:
train_time_sg,train_loss_sg = train_skipgram(corpus,num_epochs,skip_gram_model,random_batch_skipgram,optimizer,all_vocabs,window_size=2)

Epoch: 1000 | cost: 5.004533 | time: 0m 8s
Epoch: 2000 | cost: 4.787563 | time: 0m 14s
Epoch: 3000 | cost: 4.955553 | time: 0m 21s
Epoch: 4000 | cost: 4.896844 | time: 0m 28s
Epoch: 5000 | cost: 4.192191 | time: 0m 34s


In [38]:
train_time_sg,train_loss_sg

(34.58736801147461, 4.192191123962402)

### Training Skipgram(negative sampling)

In [39]:
skip_gram_neg_model = SkipgramNegSampling(voc_size, embedding_size)

optimizer = optim.Adam(skip_gram_neg_model.parameters(), lr=0.001)

num_neg        = 10 # num of negative sampling

In [40]:
unigram_table = unigram(corpus,vocab,Z=0.001)

In [41]:
train_time_sg_neg,train_loss_sg_neg = train_skipgram(corpus,num_epochs,skip_gram_neg_model,random_batch_skipgram,optimizer,all_vocabs,
                                   neg_sampling=True,num_neg=10,window_size=2)

Epoch: 1000 | cost: 9.527617 | time: 0m 9s
Epoch: 2000 | cost: 7.867122 | time: 0m 17s
Epoch: 3000 | cost: 7.742731 | time: 0m 25s
Epoch: 4000 | cost: 6.746655 | time: 0m 34s
Epoch: 5000 | cost: 6.416509 | time: 0m 40s


In [42]:
train_time_sg_neg,train_loss_sg_neg

(40.499459743499756, 6.416508674621582)

### Training GloVe

In [43]:
glove_model = GloVe(voc_size, embedding_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(glove_model.parameters(), lr=0.001)

In [44]:
X_ik, weighting_dic, skip_grams_pair = build_cooc_matrix(corpus,window_size=2)

In [45]:
train_time_gv, train_loss_gv = train_glove(corpus,num_epochs,glove_model,random_batch_glove,optimizer,skip_grams_pair, X_ik, weighting_dic)

Epoch: 1000 | cost: 4.125558 | time: 0m 3s
Epoch: 2000 | cost: 3.848725 | time: 0m 7s
Epoch: 3000 | cost: 0.567919 | time: 0m 11s
Epoch: 4000 | cost: 0.819312 | time: 0m 15s
Epoch: 5000 | cost: 0.368629 | time: 0m 19s


In [46]:
train_time_gv, train_loss_gv

(19.605977535247803, 0.3686288595199585)

### Loading GloVe(Gensim)

In [47]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

#you have to put this file in some python/gensim directory; just run it and it will inform where to put....
glove_file = datapath('glove.6B.100d.txt')
glove_gensim_model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

## 10. Model Comparison and Analysis

### Sematic Accuracy on common-capital-countries and Syntactic Accuracy on past tense datasets

https://www.fit.vut.cz/person/imikolov/public/rnnlm/word-test.v1.txt

In [48]:
import urllib
url = 'https://www.fit.vut.cz/person/imikolov/public/rnnlm/word-test.v1.txt'
uh = urllib.request.urlopen(url)
data = uh.read().decode()

In [49]:
common_capital_countries = data.split(":")[1]
past_tense = data.split(":")[-3]

In [50]:
test_ccc, _, _, _, _ = prepare_corpus(common_capital_countries.split("\n"))
test_past, _, _, _, _ = prepare_corpus(past_tense.split("\n"))

test_ccc = test_ccc[1:-1]
test_past = test_past[1:-1]

In [51]:
#let's write a function to get embedding given a word
def get_embed(word,model):
    
    if word not in vocab:
        word = '<UNK>'
    id_tensor = torch.LongTensor([word2index[word]])
    v_embed = model.embedding_v(id_tensor)
    u_embed = model.embedding_u(id_tensor) 
    word_embed = (v_embed + u_embed) / 2 
    
    return word_embed

In [52]:
#numpy version
from numpy import dot
from numpy.linalg import norm

def cos_sim(a, b):
    cos_sim = dot(a, b.T)/(norm(a)*norm(b.T))
    return cos_sim

In [58]:
def test_accuracy(corpus,model,vocab,gensim=False):
    count = []
    predicted_word = {}
    if gensim == True:
        for sent in corpus:
            # print(sent)
            vocab = model.index_to_key
            if sent[0] in vocab and sent[1] in vocab and sent[2] in vocab:
                pw = model.most_similar(positive=[sent[0], sent[2]], negative=[sent[1]])[0][0]
            else:
                pw = '<UNK>'
            # break
            predicted_word[sent[3]] = pw
        
    else:
        embedding_matrix = np.array([get_embed(word,model).detach().numpy() for word in vocab]).squeeze(1)
        for sent in corpus:
            # print(sent)
            vec = get_embed(sent[0],model) - get_embed(sent[1],model) + get_embed(sent[2],model)
            # print(vec)
            # break
            predicted_word[sent[3]] = vocab[cos_sim(embedding_matrix,vec.detach().numpy()).argmax(0).item()]
        
    count = [1 if y_true == y_predicted else 0 for y_true, y_predicted in predicted_word.items()]
    print(predicted_word)
    return (sum(count) / len(corpus)) * 100
        

In [54]:
sem_acc , syn_acc = [] , []

sem_acc.append(test_accuracy(test_ccc,skip_gram_model,vocab))
syn_acc.append(test_accuracy(test_past,skip_gram_model,vocab))

sem_acc.append(test_accuracy(test_ccc,skip_gram_neg_model,vocab))
syn_acc.append(test_accuracy(test_past,skip_gram_neg_model,vocab))

sem_acc.append(test_accuracy(test_ccc,glove_model,vocab))
syn_acc.append(test_accuracy(test_past,glove_model,vocab))

sem_acc.append(test_accuracy(test_ccc,glove_gensim_model,vocab,gensim=True))
syn_acc.append(test_accuracy(test_past,glove_gensim_model,vocab,gensim=True))

In [55]:
sem_acc, syn_acc

([0.0, 0.0, 0.0, 0.0], [0.0641025641025641, 0.0, 0.0, 0.0])

In [78]:
result_table = {"Model" : ["Skipgram", "Skipgram (Neg)", "GloVe", "GloVe (Gensim)"],
                "Window Size" : [2,2,2,"-"],
                "Training Loss" :[train_loss_sg,train_loss_sg_neg,train_loss_gv,"-"],
                "Training Time" :[train_time_sg,train_time_sg_neg,train_time_gv,"-"],
                "Semantic Accuracy" : sem_acc,
                "Syntactic accuracy" : syn_acc
                }

In [79]:
import pandas as pd 
pd.DataFrame(result_table)

Unnamed: 0,Model,Window Size,Training Loss,Training Time,Semantic Accuracy,Syntactic accuracy
0,Skipgram,2,4.192191,34.587368,0.0,0.064103
1,Skipgram (Neg),2,6.416509,40.49946,0.0,0.0
2,GloVe,2,0.368629,19.605978,0.0,0.0
3,GloVe (Gensim),-,-,-,0.0,0.0


It can be seen that GloVe model is the best in terms of training time and training loss while the skipgram models is better without negative sampling. The reason why training time is better in skipgram without negative sampling might be that the corpus trained is small and using all vocab is more straightfoward and faster than sampling for each batch. 

On the other hand, The accuracy for both sematic and syntactic can be said zero. This is because most of the test words are out out vocab.

### Testing on wordsim353

http://alfonseca.org/eng/research/wordsim353.html

In [144]:
word_sim = open("./wordsim353_sim_rel/wordsim_relatedness_goldstandard.txt","r")
word_sim = [sublist.split("\t") for sublist in word_sim.read().split("\n")]
word_sim.pop()

['']

In [166]:
def test_word_sim(corpus,model,vocab,gensim=False):

    if gensim == True:
        r_corpus = [sent+[((model[sent[0]] @ model[sent[1]].T).item())]
                    for sent in corpus if  sent[0] in vocab or sent[1] in vocab]
    else:
        r_corpus = [sent+[((get_embed(sent[0],model) @ get_embed(sent[1],model).T).item())]
                    for sent in corpus if  sent[0] in vocab or sent[1] in vocab]
            
    return r_corpus

In [173]:
from scipy.stats import spearmanr

In [176]:
df_sg = pd.DataFrame(np.array(test_word_sim(word_sim,skip_gram_model,vocab,gensim=False)),columns=["word1","word2","human(mean)","dotproduct"])
print(spearmanr(df_sg['human(mean)'],df_sg['dotproduct']))
df_sg

SignificanceResult(statistic=0.07175961192160948, pvalue=0.7993878023812299)


Unnamed: 0,word1,word2,human(mean),dotproduct
0,production,crew,6.25,0.205778881907463
1,energy,crisis,5.94,-0.3523871898651123
2,reason,criterion,5.91,-0.0883221924304962
3,change,attitude,5.44,1.163570761680603
4,energy,laboratory,5.09,-0.3523871898651123
5,consumer,energy,4.75,-0.3523871898651123
6,start,year,4.06,0.1289408504962921
7,report,gain,3.63,-0.0809737667441368
8,five,month,3.38,-0.3914070129394531
9,announcement,production,3.38,0.205778881907463


In [177]:
df_sg_neg = pd.DataFrame(np.array(test_word_sim(word_sim,skip_gram_neg_model,vocab,gensim=False)),columns=["word1","word2","human(mean)","dotproduct"])
print(spearmanr(df_sg_neg['human(mean)'],df_sg_neg['dotproduct']))
df_sg_neg

SignificanceResult(statistic=0.144427573361214, pvalue=0.6075705087134136)


Unnamed: 0,word1,word2,human(mean),dotproduct
0,production,crew,6.25,-0.3486137390136719
1,energy,crisis,5.94,-0.6441017389297485
2,reason,criterion,5.91,0.4922899603843689
3,change,attitude,5.44,-0.0848661288619041
4,energy,laboratory,5.09,-0.6441017389297485
5,consumer,energy,4.75,-0.6441017389297485
6,start,year,4.06,-0.6634553074836731
7,report,gain,3.63,-0.1207262948155403
8,five,month,3.38,-0.2055964320898056
9,announcement,production,3.38,-0.3486137390136719


In [178]:
df_gv = pd.DataFrame(np.array(test_word_sim(word_sim,glove_model,vocab,gensim=False)),columns=["word1","word2","human(mean)","dotproduct"])
print(spearmanr(df_gv['human(mean)'],df_gv['dotproduct']))
df_gv

SignificanceResult(statistic=-0.1698613598650756, pvalue=0.5450314731003615)


Unnamed: 0,word1,word2,human(mean),dotproduct
0,production,crew,6.25,0.5257719159126282
1,energy,crisis,5.94,-1.2882895469665527
2,reason,criterion,5.91,-0.6485856771469116
3,change,attitude,5.44,0.305195540189743
4,energy,laboratory,5.09,-1.2882895469665527
5,consumer,energy,4.75,-1.2882895469665527
6,start,year,4.06,0.4470191895961761
7,report,gain,3.63,-0.7052314281463623
8,five,month,3.38,0.5614281296730042
9,announcement,production,3.38,0.5257719159126282


In [179]:
df_gv_g = pd.DataFrame(np.array(test_word_sim(word_sim,glove_gensim_model,vocab,gensim=True)),columns=["word1","word2","human(mean)","dotproduct"])
print(spearmanr(df_gv_g['human(mean)'],df_gv_g['dotproduct']))
df_gv_g

SignificanceResult(statistic=0.2683367442131824, pvalue=0.33354558967402226)


Unnamed: 0,word1,word2,human(mean),dotproduct
0,production,crew,6.25,11.914396286010742
1,energy,crisis,5.94,19.712059020996094
2,reason,criterion,5.91,6.579436779022217
3,change,attitude,5.44,17.136171340942383
4,energy,laboratory,5.09,17.056880950927734
5,consumer,energy,4.75,22.46796989440918
6,start,year,4.06,24.8997859954834
7,report,gain,3.63,15.000954627990724
8,five,month,3.38,24.38193130493164
9,announcement,production,3.38,13.073078155517578


To test the correlation, there are only 14 pairs of words that are not out of vocab. The correlation score can be summarized as follows

skipgram < skipgram(neg) < Glove < Glove(gensim)

In [190]:
import pickle

# save the model to disk
filename = 'app/model/'
pickle.dump(skip_gram_model, open(filename + "skip_gram_model.model", 'wb'))
pickle.dump(skip_gram_neg_model, open(filename + "skip_gram_neg_model.model", 'wb'))
pickle.dump(glove_model, open(filename + "glove_model.model", 'wb'))
pickle.dump(glove_gensim_model, open(filename + "glove_gensim_model.model", 'wb'))

In [191]:
loaded_model = pickle.load(open(filename + "glove_gensim_model.model", 'rb'))

In [194]:
loaded_model.key_to_index

{'the': 0,
 ',': 1,
 '.': 2,
 'of': 3,
 'to': 4,
 'and': 5,
 'in': 6,
 'a': 7,
 '"': 8,
 "'s": 9,
 'for': 10,
 '-': 11,
 'that': 12,
 'on': 13,
 'is': 14,
 'was': 15,
 'said': 16,
 'with': 17,
 'he': 18,
 'as': 19,
 'it': 20,
 'by': 21,
 'at': 22,
 '(': 23,
 ')': 24,
 'from': 25,
 'his': 26,
 "''": 27,
 '``': 28,
 'an': 29,
 'be': 30,
 'has': 31,
 'are': 32,
 'have': 33,
 'but': 34,
 'were': 35,
 'not': 36,
 'this': 37,
 'who': 38,
 'they': 39,
 'had': 40,
 'i': 41,
 'which': 42,
 'will': 43,
 'their': 44,
 ':': 45,
 'or': 46,
 'its': 47,
 'one': 48,
 'after': 49,
 'new': 50,
 'been': 51,
 'also': 52,
 'we': 53,
 'would': 54,
 'two': 55,
 'more': 56,
 "'": 57,
 'first': 58,
 'about': 59,
 'up': 60,
 'when': 61,
 'year': 62,
 'there': 63,
 'all': 64,
 '--': 65,
 'out': 66,
 'she': 67,
 'other': 68,
 'people': 69,
 "n't": 70,
 'her': 71,
 'percent': 72,
 'than': 73,
 'over': 74,
 'into': 75,
 'last': 76,
 'some': 77,
 'government': 78,
 'time': 79,
 '$': 80,
 'you': 81,
 'years': 82,
 'i

In [211]:
[word[0] for word in loaded_model.most_similar(["fuel","price"])]

['prices',
 'gasoline',
 'demand',
 'supply',
 'cost',
 'costs',
 'drop',
 'gas',
 'low',
 'oil']

In [198]:
"see".split(" ")

['see']

In [219]:
loaded_model.most_similar_to_given(["tea"],["cola","drink"])

'drink'

In [227]:
loaded_model[["cola","drink"]].mean(0).shape

(100,)

In [243]:
# docs = [reuters.raw(doc) for doc in reuters.fileids(reuters.categories())]
docs = [reuters.raw(doc) for doc in reuters.fileids("fuel")]
docs_dict = {}
for i,d in enumerate(docs):
    docs_dict[i] = d
    
Corpus, _, _, _, _ = prepare_corpus(docs)

In [355]:
def remove_oov(corpus,model):
    for word in corpus:
        if word not in model.index_to_key:
            # print(word)
            corpus.remove(word)
    return corpus

In [356]:
for _ in range(10):
    Corpus_dict = {}
    for i,sublist in enumerate(Corpus):
        c_list = remove_oov(sublist,loaded_model)
        Corpus_dict[i] = c_list
        # print(c_list)

In [374]:
def Corpus_vec(Corpus_dict,query,model):
    
    Corpus_vec_dict = []
    query = remove_oov(query.split(" "),model)
    query_vec = model[query].mean(0).T
    for k,v in Corpus_dict.items():
        sim = model[v].mean(0) @ query_vec
        Corpus_vec_dict.append(sim)
    return Corpus_vec_dict

In [384]:
top_paras_index = np.argsort(Corpus_vec(Corpus_dict,"fuel price",loaded_model))[-11:-1]

In [389]:
def retrieve(docs_dict,index):
    output = ""
    for i,ind in enumerate(index):
        output += f"{i+1}" + "\n-----------------------\n" + docs_dict[ind]
        
    return output

In [390]:
print(retrieve(docs_dict,top_paras_index))

1
-----------------------
PACIFIC RESOURCES &lt;PRI> INSTALLS OIL MOORING
  Pacific Resources Inc said it has
  installed a CALM (Catenary Anchor Leg Mooring) single-point
  mooring terminal off the southwest coast of Oahu at its
  Hawaiian Independent Refinery at a cost of 3.5 mln dlrs.
      The system transfers crude and fuel oils from tankers to
  the refinery's tank farm and carries refined products to ships
  for export, PRI said.
      Company chairman Robert G. Reed said the new mooring system
  will permit 24-hour service in most kinds of weather and will
  reduce ship turnaround time. He said the mooring is the first
  of its kind in the U.S.
      The new system can accomodate vessels up to 150,000
  deadweight tons, or one mln barrels of cargo, PRI said.
  

2
-----------------------
ECUADOR ADOPTS AUSTERITY MEASURES AFTER QUAKE
  OPEC member Ecuador adopted austerity
  measures to conserve fuel after oil production was paralyzed by
  a strong earthquake.
      Energy and M

In [308]:
Corpus_vec = {}
for i,sublist in enumerate(Corpus):
    # print(sublist)
    try:
        vec = loaded_model[sublist].mean(0)
        Corpus_vec[i] = vec
    except:
        for word in sublist:
            if word not in set(loaded_model.index_to_key):
                Corpus[i].remove(word)
    # print(sublist)

In [349]:
Corpus_dict

{0: ['(',
  ')',
  ',',
  'a',
  'subsidiary',
  'of',
  'group',
  ',',
  'said',
  'it',
  'will',
  'raise',
  'contract',
  'prices',
  'for',
  'heavy',
  'fuel',
  '50',
  'cts',
  'to',
  'one',
  'dlr',
  'a',
  'barrel',
  ',',
  'effective',
  'tomorrow',
  '.',
  'increase',
  'brings',
  'the',
  'price',
  'for',
  '0.5',
  'pct',
  'sulphur',
  'fuel',
  'to',
  '21.50',
  'dlrs',
  ',',
  'up',
  '50',
  'cts',
  ',',
  '0.7',
  'pct',
  'sulphur',
  'to',
  '21',
  'dlrs',
  ',',
  'up',
  '75',
  'cts',
  ',',
  '0.7',
  'pct',
  'sulphur',
  'to',
  '21',
  'dlrs',
  ',',
  'up',
  '75',
  'cts',
  ',',
  'one',
  'pct',
  'sulphur',
  'to',
  '20.25',
  'dlrs',
  ',',
  'up',
  '75',
  'cts',
  ',',
  'two',
  'pct',
  'sulphur',
  'to',
  '19.75',
  'dlrs',
  ',',
  'up',
  'one',
  'dlr',
  ',',
  '2.2',
  'pct',
  'sulphur',
  'to',
  '19.50',
  'dlrs',
  ',',
  'up',
  'one',
  'dlr',
  ',',
  '2.8',
  'pct',
  'sulphur',
  '19',
  'dlrs',
  ',',
  'up',
  'one',

13.788231

In [274]:
Corpus_dict = {}
for i,sublist in enumerate(Corpus):
    for word in loaded_model.index_to_key:
        if word not in sublist:
            sublist.remove(word)
    Corpus_dict[i] = sublist


ValueError: list.remove(x): x not in list

In [246]:
len(Corpus)

23

In [236]:
len(docs_dict)

10788

In [234]:
len(Corpus)

10788

In [None]:
loaded_model[Corpus_dict[0]]

KeyError: "Key 'DUTCH' not present"

In [269]:
Corpus_dict[0]

['DUTCH',
 '(',
 ')',
 'TO',
 'HEAVY',
 'PRICES',
 'Petroleum',
 ',',
 'a',
 'subsidiary',
 'of',
 'Dutch/Shell',
 'group',
 ',',
 'said',
 'it',
 'will',
 'raise',
 'contract',
 'prices',
 'for',
 'heavy',
 'fuel',
 '50',
 'cts',
 'to',
 'one',
 'dlr',
 'a',
 'barrel',
 ',',
 'effective',
 'tomorrow',
 '.',
 'increase',
 'brings',
 'the',
 'price',
 'for',
 '0.5',
 'pct',
 'sulphur',
 'fuel',
 'to',
 '21.50',
 'dlrs',
 ',',
 'up',
 '50',
 'cts',
 ',',
 '0.7',
 'pct',
 'sulphur',
 'to',
 '21',
 'dlrs',
 ',',
 'up',
 '75',
 'cts',
 ',',
 '0.7',
 'pct',
 'sulphur',
 'to',
 '21',
 'dlrs',
 ',',
 'up',
 '75',
 'cts',
 ',',
 'one',
 'pct',
 'sulphur',
 'to',
 '20.25',
 'dlrs',
 ',',
 'up',
 '75',
 'cts',
 ',',
 'two',
 'pct',
 'sulphur',
 'to',
 '19.75',
 'dlrs',
 ',',
 'up',
 'one',
 'dlr',
 ',',
 '2.2',
 'pct',
 'sulphur',
 'to',
 '19.50',
 'dlrs',
 ',',
 'up',
 'one',
 'dlr',
 ',',
 '2.8',
 'pct',
 'sulphur',
 '19',
 'dlrs',
 ',',
 'up',
 'one',
 'dlr',
 ',',
 'the',
 'company',
 'said',