## 1. Load Data

In [1]:
!pip install nltk



In [2]:
import nltk

In [3]:
# nltk.download()

In [4]:
from nltk.corpus import reuters
reuters_corpus = reuters.raw()
reuters_corpus = reuters_corpus.lower()
reuters_corpus = reuters_corpus.split("\n      ")
reuters_corpus = [sent.replace("\n", "").split(" ") for sent in reuters_corpus]

In [5]:
reuters_corpus = reuters_corpus[:1000]

In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [7]:
np.__version__, torch.__version__

('1.26.0', '2.1.0')

In [8]:
import matplotlib
matplotlib.__version__

'3.8.1'

In [9]:
#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(flatten(reuters_corpus))) #all the words we have in the system - <UNK>
vocabs.append('<UNK>')

In [10]:
#create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}
word2index['fear']

3181

In [11]:
index2word = {v:k for k, v in word2index.items()}

In [12]:
word2index["he"]

3702

## 2. Prepare train data

In [13]:
#create pairs of center word, and outside word

def random_batch(batch_size, corpus):

    skipgrams = []

    #loop each corpus
    for doc in corpus:
        #look from the 2nd word until second last word
        for i in range(2, len(doc)-2):
            #center word
            center = word2index[doc[i]]
            #outside words = 2 words
            outside = (word2index[doc[i-2]], word2index[doc[i-1]], word2index[doc[i+1]], word2index[doc[i+2]])
            #for each of these two outside words, we gonna append to a list
            for each_out in outside:
                skipgrams.append([center, each_out])
                #center, outside1;   center, outside2
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)
            
x, y = random_batch(2, reuters_corpus)

In [14]:
x.shape

(2, 1)

In [15]:
x

array([[ 533],
       [3605]])

In [16]:
y.shape

(2, 1)

## 3.Model

In [17]:
len(vocabs)

6034

In [18]:
class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center)  #(batch_size, 1, emb_size)
        outside_embedding    = self.embedding_center(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_center(all_vocabs) #(batch_size, voc_size, emb_size)
        
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        #batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1) 

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        #batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size) 
        
        lower_term_sum = torch.sum(torch.exp(lower_term), 1)  #(batch_size, 1)
        
        loss = -torch.mean(torch.log(top_term / lower_term_sum))  #scalar
        
        return loss

In [19]:
#prepare all vocabs

batch_size = 2
voc_size   = len(vocabs)

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size)
all_vocabs

tensor([[   0,    1,    2,  ..., 6031, 6032, 6033],
        [   0,    1,    2,  ..., 6031, 6032, 6033]])

In [20]:
model = Skipgram(voc_size, 2)
model

Skipgram(
  (embedding_center): Embedding(6034, 2)
  (embedding_outside): Embedding(6034, 2)
)

In [21]:
input_tensor = torch.LongTensor(x)
label_tensor = torch.LongTensor(y)

In [22]:
loss = model(input_tensor, label_tensor, all_vocabs)

In [23]:
loss

tensor(9.9904, grad_fn=<NegBackward0>)

## 4.Training

In [24]:
batch_size = 2
emb_size   = 2
model      = Skipgram(voc_size, emb_size)
optimizer  = optim.Adam(model.parameters(), lr=0.001)

In [25]:
import time

In [26]:
num_epochs = 10

start = time.time()

for epoch in range(num_epochs):
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, reuters_corpus)
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    #predict
    loss = model(input_tensor, label_tensor, all_vocabs)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print the loss
    # if (epoch + 1) % 1000 == 0:
    print(f"Epoch {epoch+1:6.0f} | Loss: {loss:2.6f}")

print(f"Time to train: {time.time() - start}")

Epoch      1 | Loss: 8.322208
Epoch      2 | Loss: 8.256674
Epoch      3 | Loss: 10.050245
Epoch      4 | Loss: 9.194986
Epoch      5 | Loss: 9.520333
Epoch      6 | Loss: 8.439960
Epoch      7 | Loss: 9.632087
Epoch      8 | Loss: 9.160061
Epoch      9 | Loss: 9.639753
Epoch     10 | Loss: 9.948281
Time to train: 3.1521451473236084


## 5. Semantic and Syntatic

In [27]:
import pandas as pd

pd.read_csv("word-test.v1.txt", skiprows=0)

# analogies_path = datapath('word-test.v1.txt')
# word_analogies_data = open(analogies_path, 'r').readlines()

Unnamed: 0,// Copyright 2013 Google Inc. All Rights Reserved.
0,: capital-common-countries
1,Athens Greece Baghdad Iraq
2,Athens Greece Bangkok Thailand
3,Athens Greece Beijing China
4,Athens Greece Berlin Germany
...,...
19553,write writes talk talks
19554,write writes think thinks
19555,write writes vanish vanishes
19556,write writes walk walks


In [28]:
with open("word-test.v1.txt") as f:
    data = f.read()

In [29]:
data = data.replace("\t", "")
data = data.lower()
data = data.split(": ")

In [30]:
capital = data[1]
capital = capital.split('\n')[1:-1]
capital_row = [line.split(" ") for line in capital]

In [31]:
past = data[12]
past = past.split('\n')[1:-1]
past_row = [line.split(" ") for line in past]

In [32]:
def get_embed(word):
    try:
        index = word2index[word]
    except:
        word = '<UNK>'
        
    word = torch.LongTensor([word2index[word]])
    
    embed_c = model.embedding_center(word)
    embed_o = model.embedding_outside(word)
    embed   = (embed_c + embed_o) / 2
    embed = embed[0][0].item(), embed[0][1].item()
    
    return np.array(embed)

In [33]:
#more formally is to divide by its norm
def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

In [34]:
skipgram_embeddings = {}

for vocab in vocabs:
    skipgram_embeddings[vocab] = get_embed(vocab)



In [35]:
import pickle

with open("embeddings_skipgram.pkl", 'wb') as f:
    pickle.dump(skipgram_embeddings, f)

In [36]:
past_correct = 0
for row in past_row:
   row = [word.lower() for word in row]
   w1, w2, w3, w4 = row
   
   embedding = get_embed(w3) - get_embed(w1) + get_embed(w2)
   similarities = {}

   for vocab in skipgram_embeddings.keys():
       similarities[vocab] = cosine_similarity(embedding, skipgram_embeddings[vocab])
       
   predicted_word = max(similarities, key=similarities.get)
   
   if predicted_word == w4:
       past_correct += 1
   
past_accuracy = past_correct / len(past_row)
print(past_accuracy)

0.0


In [37]:
capital_correct = 0
for row in capital_row:
   row = [word.lower() for word in row]
   w1, w2, w3, w4 = row
   
   embedding = get_embed(w3) - get_embed(w1) + get_embed(w2)
   similarities = {}

   for vocab in skipgram_embeddings.keys():
       similarities[vocab] = cosine_similarity(embedding, skipgram_embeddings[vocab])
       
   predicted_word = max(similarities, key=similarities.get)
   
   if predicted_word == w4:
       capital_correct += 1
   
capital_accuracy = capital_correct / len(capital_row)
print(capital_accuracy)

0.0


In [38]:
import numpy as np
import pandas as pd

df = pd.read_csv("wordsim_similarity_goldstandard.txt", sep='\t', header=None)
df

Unnamed: 0,0,1,2
0,tiger,cat,7.35
1,tiger,tiger,10.00
2,plane,car,5.77
3,train,car,6.31
4,television,radio,6.77
...,...,...,...
198,rooster,voyage,0.62
199,noon,string,0.54
200,chord,smile,0.54
201,professor,cucumber,0.31


In [39]:
def get_word_vector(word, embs):
    try:
        return embs[word]
    except:
        return embs['<UNK>']

In [41]:
df['dot'] = df.apply(lambda row: np.dot(get_word_vector(row[0].lower(), skipgram_embeddings), get_word_vector(row[0].lower(), skipgram_embeddings)), axis=1)
df

Unnamed: 0,0,1,2,dot
0,tiger,cat,7.35,3.250181
1,tiger,tiger,10.00,3.250181
2,plane,car,5.77,3.250181
3,train,car,6.31,3.250181
4,television,radio,6.77,0.838233
...,...,...,...,...
198,rooster,voyage,0.62,3.250181
199,noon,string,0.54,3.250181
200,chord,smile,0.54,3.250181
201,professor,cucumber,0.31,3.250181


In [42]:
df.corr(numeric_only=True, method='spearman')

Unnamed: 0,2,dot
2,1.0,0.172489
dot,0.172489,1.0


## Model comparation between Skipgram, Skipgram (Neg), Glove and Glove (Gensim) model
| Model          | Window Size | Training Loss | Training Time | Syntactic Accuracy | Semantic Accuracy |
|----------------|-------------|---------------|---------------|--------------------|-------------------|
| Skipgram       |      2      |    9.948281   |     3.15 s    |         0 %        |        0 %        |
| Skipgram (NEG) |      2      |    1.497438   |     3.1 s     |        0.2 %       |        0 %        |
| Glove          |      2      |   80.799538   |      1 s      |         0 %        |        0 %        |
| Glove (Gensim) |      -      |       -       |       -       |       93.87 %      |      55.49 %      |
##  
##  
|   Model  | Skipgram |  NEG | Glove | Glove (gensim) |
|:--------:|----------|:----:|:-----:|----------------|
| SpearMan |   0.17   | 0.07 | -0.16 |      0.53      |