## 1. Load Data

In [1]:
!pip install nltk



In [2]:
import nltk

In [3]:
#nltk.download()

In [4]:
from nltk.corpus import reuters
reuters_corpus = reuters.raw()
reuters_corpus = reuters_corpus.lower()
reuters_corpus = reuters_corpus.split("\n      ")
reuters_corpus = [sent.replace("\n", "").split(" ") for sent in reuters_corpus]

In [5]:
reuters_corpus = reuters_corpus[:1000]

In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [7]:
np.__version__, torch.__version__

('1.26.0', '2.1.0')

In [8]:
import matplotlib
matplotlib.__version__

'3.8.1'

In [9]:
#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(flatten(reuters_corpus))) #all the words we have in the system - <UNK>
vocabs.append('<UNK>')

In [10]:
#create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}
word2index['fear']

1841

In [11]:
index2word = {v:k for k, v in word2index.items()}

## 2.Build Co-occurence Matrix X

In [12]:
from collections import Counter

X_i = Counter(flatten(reuters_corpus))

In [13]:
skip_grams = []

for doc in reuters_corpus:
    for i in range(2, len(doc)-2):
        center = doc[i]
        outside = [doc[i-2], doc[i-1], doc[i+1], doc[i+2]]
        for each_out in outside:
            skip_grams.append((center, each_out))
skip_grams

[('fear', 'asian'),
 ('fear', 'exporters'),
 ('fear', 'damage'),
 ('fear', 'from'),
 ('damage', 'exporters'),
 ('damage', 'fear'),
 ('damage', 'from'),
 ('damage', 'u.s.-japan'),
 ('from', 'fear'),
 ('from', 'damage'),
 ('from', 'u.s.-japan'),
 ('from', 'rift'),
 ('u.s.-japan', 'damage'),
 ('u.s.-japan', 'from'),
 ('u.s.-japan', 'rift'),
 ('u.s.-japan', ''),
 ('rift', 'from'),
 ('rift', 'u.s.-japan'),
 ('rift', ''),
 ('rift', 'mounting'),
 ('', 'u.s.-japan'),
 ('', 'rift'),
 ('', 'mounting'),
 ('', 'trade'),
 ('mounting', 'rift'),
 ('mounting', ''),
 ('mounting', 'trade'),
 ('mounting', 'friction'),
 ('trade', ''),
 ('trade', 'mounting'),
 ('trade', 'friction'),
 ('trade', 'between'),
 ('friction', 'mounting'),
 ('friction', 'trade'),
 ('friction', 'between'),
 ('friction', 'the'),
 ('between', 'trade'),
 ('between', 'friction'),
 ('between', 'the'),
 ('between', ''),
 ('the', 'friction'),
 ('the', 'between'),
 ('the', ''),
 ('the', 'u.s.'),
 ('', 'between'),
 ('', 'the'),
 ('', 'u.s.'

In [14]:
X_ik_skipgrams = Counter(skip_grams)
X_ik_skipgrams

Counter({('the', ''): 399,
         ('', 'the'): 390,
         ('the', 'of'): 279,
         ('of', 'the'): 279,
         ('of', ''): 241,
         ('to', ''): 241,
         ('', 'to'): 238,
         ('', 'of'): 237,
         ('in', ''): 218,
         ('', 'in'): 218,
         ('and', ''): 208,
         ('', 'and'): 207,
         ('', 'a'): 151,
         ('a', ''): 150,
         ('vs', 'cts'): 150,
         ('in', 'the'): 149,
         ('the', 'in'): 149,
         ('vs', 'mln'): 137,
         ('', ''): 132,
         ('to', 'the'): 114,
         ('the', 'to'): 112,
         ('cts', 'vs'): 98,
         ('for', ''): 93,
         ('', 'for'): 92,
         ('mln', 'vs'): 91,
         ('of', 'a'): 90,
         ('a', 'of'): 87,
         ('mln', ''): 74,
         ('', 'mln'): 73,
         ('it', 'said'): 72,
         ('the', 'said'): 71,
         ('for', 'the'): 70,
         ('said', ''): 68,
         ('the', 'for'): 68,
         ('', 'said'): 67,
         ('dlrs', ''): 67,
         ('', 'dlrs'

In [15]:
def weighting(w_i, w_j, X_ik):
    
    #check whether the co-occurences between w_i and w_j is available
    try:
        x_ij = X_ik[(w_i, w_j)]
        #if not exist, then set to 1 "laplace smoothing"
    except:
        x_ij = 1
        
    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75
    
    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1
    
    return result

In [16]:
from itertools import combinations_with_replacement

X_ik = {} #keeping the co-occurences
weighting_dic = {} #already scale the co-occurences using the weighting function

for bigram in combinations_with_replacement(vocabs, 2):
    if X_ik_skipgrams.get(bigram):  #if the pair exists in our corpus
        co = X_ik_skipgrams[bigram]
        X_ik[bigram] = co + 1 #for stability
        X_ik[(bigram[1], bigram[0])] = co + 1 #basically apple, banana = banana, apple
    else:
        pass
    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

## 3. Prepare train data

In [17]:
import math

def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    
    #convert our skipgrams to id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    #randomly choose indexes based on batch size
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False)
    
    #get the random input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])
        #coocs
        pair = skip_grams[index] #e.g., ('banana', 'fruit')
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
    
        #weightings
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

### Testing the method

In [18]:
batch_size = 2
x, y, cooc, weighting = random_batch(batch_size, reuters_corpus, skip_grams, X_ik, weighting_dic)

In [19]:
x

array([[   0],
       [5864]])

In [20]:
y

array([[   0],
       [5736]])

In [21]:
cooc

array([[4.89034913],
       [1.09861229]])

In [22]:
weighting

array([[1.        ],
       [0.07208434]])

## 4.Model

In [39]:
class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.embedding_center(center) #(batch_size, 1, emb_size)
        outside_embeds = self.embedding_outside(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

In [40]:
#test our system
voc_size = len(vocabs)
emb_size = 2
model = Glove(voc_size, emb_size)

In [41]:
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)
cooc_tensor = torch.FloatTensor(cooc)
weighting_tensor = torch.FloatTensor(weighting)

In [26]:
loss = model(x_tensor, y_tensor, cooc_tensor, weighting_tensor)

In [42]:
loss

tensor(4.3050, grad_fn=<SumBackward0>)

## 5. Training

In [43]:
batch_size     = 10 # mini-batch size
embedding_size = 2 #so we can later plot
model          = Glove(voc_size, embedding_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [44]:
def training_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [45]:
import time

# Training
num_epochs = 10
start = time.time()
for epoch in range(num_epochs):
    
    
    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, reuters_corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch) #[batch_size, 1]
    
    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()
    
    #if (epoch + 1) % 1000 == 0:
    print(f"Epoch: {epoch + 1} | cost: {loss:.6f}")

end = time.time()
mins, secs = training_time(start, end)
print(f"time: {mins}m {secs}s")

Epoch: 1 | cost: 0.902794
Epoch: 2 | cost: 5.132260
Epoch: 3 | cost: 4.131680
Epoch: 4 | cost: 1.442448
Epoch: 5 | cost: 45.676178
Epoch: 6 | cost: 5.058868
Epoch: 7 | cost: 7.586196
Epoch: 8 | cost: 11.795411
Epoch: 9 | cost: 42.060669
Epoch: 10 | cost: 80.799538
time: 0m 1s


## 6.Semantic and Syntatic

In [46]:
import pandas as pd

pd.read_csv("word-test.v1.txt", skiprows=0)

# analogies_path = datapath('word-test.v1.txt')
# word_analogies_data = open(analogies_path, 'r').readlines()

Unnamed: 0,// Copyright 2013 Google Inc. All Rights Reserved.
0,: capital-common-countries
1,Athens Greece Baghdad Iraq
2,Athens Greece Bangkok Thailand
3,Athens Greece Beijing China
4,Athens Greece Berlin Germany
...,...
19553,write writes talk talks
19554,write writes think thinks
19555,write writes vanish vanishes
19556,write writes walk walks


In [47]:
with open("word-test.v1.txt") as f:
    data = f.read()

In [48]:
data = data.replace("\t", "")
data = data.lower()
data = data.split(": ")

In [49]:
capital = data[1]
capital = capital.split('\n')[1:-1]
capital_row = [line.split(" ") for line in capital]

In [50]:
past = data[12]
past = past.split('\n')[1:-1]
past_row = [line.split(" ") for line in past]

In [51]:
def get_embed(word):
    try:
        index = word2index[word]
    except:
        word = '<UNK>'
        
    word = torch.LongTensor([word2index[word]])
    
    embed_c = model.embedding_center(word)
    embed_o = model.embedding_outside(word)
    embed   = (embed_c + embed_o) / 2
    embed = embed[0][0].item(), embed[0][1].item()
    
    return np.array(embed)

In [52]:
#more formally is to divide by its norm
def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

In [53]:
glove_embeddings = {}

for vocab in vocabs:
    glove_embeddings[vocab] = get_embed(vocab)

In [54]:
import pickle

with open("embeddings_glove.pkl", 'wb') as f:
    pickle.dump(glove_embeddings, f)

In [55]:
past_correct = 0
for row in past_row:
   row = [word.lower() for word in row]
   w1, w2, w3, w4 = row
   
   embedding = get_embed(w3) - get_embed(w1) + get_embed(w2)
   similarities = {}

   for vocab in glove_embeddings.keys():
       similarities[vocab] = cosine_similarity(embedding, glove_embeddings[vocab])
       
   predicted_word = max(similarities, key=similarities.get)
   
   if predicted_word == w4:
       past_correct += 1
   
past_accuracy = past_correct / len(past_row)
print(past_accuracy)

0.0


In [56]:
capital_correct = 0
for row in capital_row:
   row = [word.lower() for word in row]
   w1, w2, w3, w4 = row
   
   embedding = get_embed(w3) - get_embed(w1) + get_embed(w2)
   similarities = {}

   for vocab in glove_embeddings.keys():
       similarities[vocab] = cosine_similarity(embedding, glove_embeddings[vocab])
       
   predicted_word = max(similarities, key=similarities.get)
   
   if predicted_word == w4:
       capital_correct += 1
   
capital_accuracy = capital_correct / len(capital_row)
print(capital_accuracy)

0.0


In [57]:
import numpy as np
import pandas as pd

df = pd.read_csv("wordsim_similarity_goldstandard.txt", sep='\t', header=None)
df

Unnamed: 0,0,1,2
0,tiger,cat,7.35
1,tiger,tiger,10.00
2,plane,car,5.77
3,train,car,6.31
4,television,radio,6.77
...,...,...,...
198,rooster,voyage,0.62
199,noon,string,0.54
200,chord,smile,0.54
201,professor,cucumber,0.31


In [58]:
def get_word_vector(word, embs):
    try:
        return embs[word]
    except:
        return embs['<UNK>']

In [59]:
df['dot'] = df.apply(lambda row: np.dot(get_word_vector(row[0].lower(), glove_embeddings), get_word_vector(row[0].lower(), glove_embeddings)), axis=1)
df

Unnamed: 0,0,1,2,dot
0,tiger,cat,7.35,0.00879
1,tiger,tiger,10.00,0.00879
2,plane,car,5.77,0.00879
3,train,car,6.31,0.00879
4,television,radio,6.77,0.65180
...,...,...,...,...
198,rooster,voyage,0.62,0.00879
199,noon,string,0.54,0.00879
200,chord,smile,0.54,0.00879
201,professor,cucumber,0.31,0.00879


In [60]:
df.corr(numeric_only=True, method='spearman')

Unnamed: 0,2,dot
2,1.0,-0.156839
dot,-0.156839,1.0


## Model comparation between Skipgram, Skipgram (Neg), Glove and Glove (Gensim) model
| Model          | Window Size | Training Loss | Training Time | Syntactic Accuracy | Semantic Accuracy |
|----------------|-------------|---------------|---------------|--------------------|-------------------|
| Skipgram       |      2      |    9.948281   |     3.15 s    |         0 %        |        0 %        |
| Skipgram (NEG) |      2      |    1.497438   |     3.1 s     |        0.2 %       |        0 %        |
| Glove          |      2      |   80.799538   |      1 s      |         0 %        |        0 %        |
| Glove (Gensim) |      -      |       -       |       -       |       93.87 %      |      55.49 %      |
##  
##  
|   Model  | Skipgram |  NEG | Glove | Glove (gensim) |
|:--------:|----------|:----:|:-----:|----------------|
| SpearMan |   0.17   | 0.07 | -0.16 |      0.53      |