In [1]:
# Importing necessary libraries
import numpy as np  # For numerical operations (e.g., matrices and arrays)
import time  # For measuring time
import pickle  # For loading and saving models
import torch  # PyTorch, a popular machine learning library
import torch.nn as nn  # For defining neural networks
from collections import Counter  # To count occurrences of elements
import nltk  # Natural Language Toolkit for text processing
from nltk.corpus import reuters
import torch.optim as optim  # For optimization algorithms in PyTorch
import matplotlib.pyplot as plt  # For plotting graphs
from itertools import combinations_with_replacement  # For generating combinations
import math  # For mathematical functions
from gensim.test.utils import datapath  # For Gensim utilities
from gensim.models import KeyedVectors  # For working with word embeddings (Word2Vec, GloVe, etc.)
from gensim.scripts.glove2word2vec import glove2word2vec  # For converting GloVe model to Word2Vec format
import pandas as pd  # For data manipulation (e.g., DataFrames)

# Downloading NLTK corpora
nltk.download('reuters')  
nltk.download('punkt')  # Download Punkt tokenizer models for text tokenization
nltk.download('punkt_tab')  # Downloads tokenization models for tab-delimited text (used by other NLTK functionalities)

[nltk_data] Downloading package reuters to C:\Users\Mir
[nltk_data]     Ali\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Mir
[nltk_data]     Ali\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Mir
[nltk_data]     Ali\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Set the device to GPU if available, otherwise fallback to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Optionally, log the device being used
print(f"Using device: {device}")

Using device: cuda


In [3]:
corpus = reuters.sents() 
corpus = corpus[:5000] # todo
corpus = [[word.lower() for word in sent] for sent in corpus]

# Print the first sentence in the preprocessed corpus
print("First sentence of the preprocessed corpus:", corpus[0])

First sentence of the preprocessed corpus: ['asian', 'exporters', 'fear', 'damage', 'from', 'u', '.', 's', '.-', 'japan', 'rift', 'mounting', 'trade', 'friction', 'between', 'the', 'u', '.', 's', '.', 'and', 'japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.']


In [4]:
#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]

# Assign unique integer
vocabs = list(set(flatten(corpus))) # All the words we have in the system - <UNK>

In [5]:
vocabs.append('<UNK>') #append unknown token to vocab

In [6]:
print(vocabs)



In [7]:
len(vocabs)

9761

In [8]:
# Create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}
word2index['support']

379

In [9]:
index2word = {v:k for k, v in word2index.items()}
index2word[10]

'injection'

In [10]:
torch.manual_seed(42)
voc_size   = len(vocabs)
emb_size = batch_size = window_size = 2

## Word2Vec

### 1. Prepare train data

In [11]:
#create pairs of center word, and outside word

def random_batch(batch_size, corpus, windows_size):

    skipgrams = []

    #loop each corpus
    for doc in corpus:
        #look from the 2nd word until second last word
        for i in range(windows_size, len(doc)-windows_size):
            #center word
            center = word2index[doc[i]]
            #outside words = 2 words
            outside = []
            for j in range(i-windows_size, i+windows_size+1):
                outside.append(word2index[doc[j]])
            #for each of these two outside words, we gonna append to a list
            for each_out in outside:
                skipgrams.append([center, each_out])
                #center, outside1;   center, outside2
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)

## Model

In [12]:
class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center)  #(batch_size, 1, emb_size)
        outside_embedding    = self.embedding_center(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_center(all_vocabs) #(batch_size, voc_size, emb_size)
        
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        #batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1) 

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        #batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size) 
        
        lower_term_sum = torch.sum(torch.exp(lower_term), 1)  #(batch_size, 1)
        
        loss = -torch.mean(torch.log(top_term / lower_term_sum))  #scalar
        
        return loss

In [13]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

In [14]:
#prepare all vocabs
all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size).to(device)
all_vocabs

tensor([[   0,    1,    2,  ..., 9758, 9759, 9760],
        [   0,    1,    2,  ..., 9758, 9759, 9760]], device='cuda:0')

In [15]:
skipgram_model  = Skipgram(voc_size, emb_size).to(device)
optimizer  = optim.Adam(skipgram_model.parameters(), lr=0.001)
num_epochs = 5
start_time = time.time()

for epoch in range(num_epochs):
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus, window_size)
    input_tensor = torch.LongTensor(input_batch).to(device)
    label_tensor = torch.LongTensor(label_batch).to(device)
    
    #predict
    loss = skipgram_model(input_tensor, label_tensor, all_vocabs)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print the loss
    # if (epoch + 1) % 100 == 0: #todo
    print(f"Epoch {epoch+1:6.0f}   |   Loss: {loss:2.4f}")

print(f"Train time: {time.time()-start_time}")

Epoch      1   |   Loss: 10.0207
Epoch      2   |   Loss: 9.1517
Epoch      3   |   Loss: 9.0348
Epoch      4   |   Loss: 10.7632
Epoch      5   |   Loss: 9.3899
Train time: 8.23994517326355


## 3. Negative Sampling

### Unigram distribution

$$P(w)=U(w)^{3/4}/Z$$

In [16]:
z = 0.001

In [17]:
#count
from collections import Counter

word_count = Counter(flatten(corpus))
word_count

#get the total number of words
num_total_words = sum([c for w, c in word_count.items()])
num_total_words

161358

In [18]:
unigram_table = []

for v in vocabs:
    uw = word_count[v] / num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([v] * uw_alpha)

Counter(unigram_table)

Counter({'.': 125,
         ',': 92,
         'the': 84,
         'of': 53,
         'to': 50,
         'in': 44,
         'and': 40,
         'said': 39,
         'a': 38,
         'mln': 34,
         '-': 30,
         's': 28,
         '/': 27,
         'vs': 26,
         'for': 25,
         '1': 24,
         'dlrs': 23,
         "'": 22,
         'it': 20,
         'pct': 19,
         '2': 19,
         '000': 19,
         'on': 18,
         'lt': 17,
         '&': 17,
         'cts': 17,
         ';': 17,
         'at': 16,
         'its': 16,
         'that': 16,
         'from': 16,
         'year': 15,
         '>': 15,
         'by': 15,
         'net': 15,
         '"': 15,
         'is': 15,
         '1986': 14,
         'billion': 14,
         '0': 14,
         '4': 14,
         'be': 14,
         'u': 14,
         'was': 13,
         '3': 13,
         '5': 13,
         'with': 13,
         '87': 13,
         'has': 12,
         'will': 12,
         '7': 12,
         'as': 11

## Model

In [19]:
import random

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):  #(1, k)
        target_index = targets[i].item()
        nsample      = []
        while (len(nsample) < k):
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))
        
    return torch.cat(neg_samples) #batch_size, k

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [20]:
class SkipgramNegSampling(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(SkipgramNegSampling, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative):
        #center, outside:  (bs, 1)
        #negative       :  (bs, k)
        
        center_embed   = self.embedding_center(center) #(bs, 1, emb_size)
        outside_embed  = self.embedding_outside(outside) #(bs, 1, emb_size)
        negative_embed = self.embedding_outside(negative) #(bs, k, emb_size)
        
        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, 1)
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, k)
        ukvc_sum       = torch.sum(ukvc, 1).reshape(-1, 1) #(bs, 1)
        
        loss           = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)
        
        return -torch.mean(loss)

### 3. Training

In [21]:
skipgram_neg_model   = SkipgramNegSampling(voc_size, emb_size).to(device)
optimizer  = optim.Adam(skipgram_neg_model.parameters(), lr=0.001)
num_epochs = 10 
k = 5
start_time = time.time()

for epoch in range(num_epochs):
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus, window_size)
    input_tensor = torch.LongTensor(input_batch).to(device)
    label_tensor = torch.LongTensor(label_batch).to(device)
    
    #predict
    neg_samples = negative_sampling(label_tensor, unigram_table, k).to(device)
    loss = skipgram_neg_model(input_tensor, label_tensor, neg_samples)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print the loss
    # if (epoch + 1) % 1000 == 0: todo
    print(f"Epoch {epoch+1:6.0f} | Loss: {loss:2.6f}")

print(f"Training time: {time.time()-start_time}")

Epoch      1 | Loss: 6.321082
Epoch      2 | Loss: 4.033810
Epoch      3 | Loss: 1.528757
Epoch      4 | Loss: 3.479053
Epoch      5 | Loss: 1.895015
Epoch      6 | Loss: 1.612695
Epoch      7 | Loss: 3.429874
Epoch      8 | Loss: 0.395954
Epoch      9 | Loss: 0.689170
Epoch     10 | Loss: 2.031862
Training time: 14.946703910827637


## 2. Build Co-occurence Matrix X

Here, we need to count the co-occurence of two words given some window size.  We gonna use window size of 2.

In [22]:
from collections import Counter

X_i = Counter(flatten(corpus))
X_i

Counter({'.': 10173,
         ',': 6728,
         'the': 5959,
         'of': 3235,
         'to': 3016,
         'in': 2560,
         'and': 2239,
         'said': 2182,
         'a': 2078,
         'mln': 1814,
         '-': 1552,
         's': 1417,
         '/': 1342,
         'vs': 1300,
         'for': 1229,
         '1': 1176,
         'dlrs': 1091,
         "'": 1028,
         'it': 921,
         '000': 861,
         'pct': 844,
         '2': 823,
         'on': 799,
         ';': 721,
         'lt': 720,
         '&': 719,
         'cts': 711,
         'at': 702,
         'from': 690,
         'its': 657,
         'that': 654,
         'by': 646,
         'is': 636,
         'net': 632,
         '>': 622,
         'year': 610,
         '"': 598,
         '4': 596,
         '1986': 560,
         '0': 559,
         'be': 548,
         'u': 546,
         'billion': 545,
         '87': 544,
         'was': 543,
         'with': 516,
         '5': 513,
         '3': 494,
         '

In [23]:
skip_grams = []

for doc in corpus:
    for i in range(2, len(doc)-2):
        center = doc[i]
        outside = [doc[i-2], doc[i-1], doc[i+1], doc[i+2]]
        for each_out in outside:
            skip_grams.append((center, each_out))

skip_grams

[('fear', 'asian'),
 ('fear', 'exporters'),
 ('fear', 'damage'),
 ('fear', 'from'),
 ('damage', 'exporters'),
 ('damage', 'fear'),
 ('damage', 'from'),
 ('damage', 'u'),
 ('from', 'fear'),
 ('from', 'damage'),
 ('from', 'u'),
 ('from', '.'),
 ('u', 'damage'),
 ('u', 'from'),
 ('u', '.'),
 ('u', 's'),
 ('.', 'from'),
 ('.', 'u'),
 ('.', 's'),
 ('.', '.-'),
 ('s', 'u'),
 ('s', '.'),
 ('s', '.-'),
 ('s', 'japan'),
 ('.-', '.'),
 ('.-', 's'),
 ('.-', 'japan'),
 ('.-', 'rift'),
 ('japan', 's'),
 ('japan', '.-'),
 ('japan', 'rift'),
 ('japan', 'mounting'),
 ('rift', '.-'),
 ('rift', 'japan'),
 ('rift', 'mounting'),
 ('rift', 'trade'),
 ('mounting', 'japan'),
 ('mounting', 'rift'),
 ('mounting', 'trade'),
 ('mounting', 'friction'),
 ('trade', 'rift'),
 ('trade', 'mounting'),
 ('trade', 'friction'),
 ('trade', 'between'),
 ('friction', 'mounting'),
 ('friction', 'trade'),
 ('friction', 'between'),
 ('friction', 'the'),
 ('between', 'trade'),
 ('between', 'friction'),
 ('between', 'the'),
 ('be

In [24]:
X_ik_skipgrams = Counter(skip_grams)
X_ik_skipgrams

Counter({(',', ','): 1625,
         ('of', 'the'): 1236,
         ('the', 'of'): 1201,
         ('mln', '.'): 1199,
         ('.', 'mln'): 1187,
         ('.', '.'): 1131,
         ('s', '.'): 959,
         ('.', 's'): 920,
         (',', '000'): 888,
         ('s', "'"): 886,
         ('000', ','): 879,
         ('1', '.'): 828,
         ('.', '1'): 827,
         ('87', '/'): 816,
         ('/', '87'): 813,
         ("'", 's'): 803,
         ('.', '0'): 765,
         ('0', '.'): 765,
         ('vs', ','): 729,
         (',', 'vs'): 723,
         (';', 'lt'): 720,
         (';', '&'): 719,
         ('lt', ';'): 700,
         ('lt', '&'): 699,
         (',', 'the'): 688,
         ('the', ','): 679,
         ('/', '/'): 669,
         ('the', 'in'): 666,
         ('in', 'the'): 655,
         ('nil', 'nil'): 644,
         ('2', '.'): 616,
         ('&', 'lt'): 614,
         ('&', ';'): 614,
         ('.', '2'): 610,
         ('09', '/'): 557,
         ('/', '09'): 556,
         ('the', 'to

### Weighting function

GloVe includes a weighting function to scale down too frequent words.

<img src = "../figures/glove_weighting_func.png" width=400>

In [25]:
def weighting(w_i, w_j, X_ik):
    
    #check whether the co-occurences between w_i and w_j is available
    try:
        x_ij = X_ik[(w_i, w_j)]
        #if not exist, then set to 1 "laplace smoothing"
    except:
        x_ij = 1
        
    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75
    
    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1
    
    return result

In [26]:
X_ik = {} # Keeping the co-occurences
weighting_dic = {} #already scale the co-occurences using the weighting function

for bigram in combinations_with_replacement(vocabs, 2):
    if X_ik_skipgrams.get(bigram):  #if the pair exists in our corpus
        co = X_ik_skipgrams[bigram]
        X_ik[bigram] = co + 1 #for stability
        X_ik[(bigram[1], bigram[0])] = co + 1 #basically apple, banana = banana, apple
    else:
        pass
    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

## 3. Prepare train data

In [27]:
def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    
    #convert our skipgrams to id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    #randomly choose indexes based on batch size
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False)
    
    #get the random input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])
        #coocs
        pair = skip_grams[index] #e.g., ('banana', 'fruit')
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
    
        #weightings
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

## 4. Model

<img src ="../figures/glove.png" width=400>

In [28]:
class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.embedding_center(center) #(batch_size, 1, emb_size)
        outside_embeds = self.embedding_outside(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

## 5. Training

In [29]:
batch_size     = 10 # mini-batch size
embedding_size = 2 #so we can later plot
glove_model = Glove(voc_size, embedding_size).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(glove_model.parameters(), lr=0.001)

In [30]:
# Training
num_epochs = 10
start_time = time.time()

for epoch in range(num_epochs):    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch).to(device)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch).to(device)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch).to(device)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch).to(device) #[batch_size, 1]
    
    optimizer.zero_grad()
    loss = glove_model(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()
    
    end = time.time()

    # if (epoch + 1) % 1000 == 0:
    print(f"Epoch: {epoch + 1} | cost: {loss:.6f}")
    
print(f"Training time: {time.time()-start_time}")

Epoch: 1 | cost: 100.898018
Epoch: 2 | cost: 37.905354
Epoch: 3 | cost: 48.818531
Epoch: 4 | cost: 103.964569
Epoch: 5 | cost: 37.448692
Epoch: 6 | cost: 127.827110
Epoch: 7 | cost: 10.039527
Epoch: 8 | cost: 12.910316
Epoch: 9 | cost: 107.135010
Epoch: 10 | cost: 127.018555
Training time: 7.925745964050293


## GloVe (Gensim)

For looking at word vectors, we'll use **Gensim**. **Gensim** isn't really a deep learning package. It's a package for for word and text similarity modeling, which started with (LDA-style) topic models and grew into SVD and neural word representations. But its efficient and scalable, and quite widely used.   We gonna use **GloVe** embeddings, downloaded at [the Glove page](https://nlp.stanford.edu/projects/glove/). They're inside [this zip file](https://nlp.stanford.edu/data/glove.6B.zip)

In [31]:
glove_file = 'glove.6B/glove.6B.100d.txt'  #search on the google
gensim_model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

In [32]:
#return the vectors
gensim_model['coffee'].shape

(100,)

### Compute Embeddings

In [33]:
# Function to compute the word embeddings for each word in the vocabulary using the provided model

def compute_embeddings(model, vocabs):
    embeds = {}
    # Iterate over each word in the vocabulary
    for word in vocabs:
        try:
            # Try to get the index of the word from the word2index dictionary
            index = word2index[word]
        except:
            index = word2index['<UNK>']
        # Convert the word index to a tensor and send it to the appropriate device (GPU/CPU)
        word_idx = torch.LongTensor([word2index[word]]).to(device)
        
        # Get the "center" and "outside" embeddings from the model (SkipGram architecture)
        embed_c = model.embedding_center(word_idx)
        embed_o = model.embedding_outside(word_idx)
        
        # Compute the average of the center and outside embeddings
        embed   = (embed_c + embed_o) / 2
        
        # Convert the embedding to a tuple (taking the two values from the embedding) and store it
        embed = embed[0][0].item(), embed[0][1].item()
        embeds[word] = np.array(embed)
    
    return embeds

In [34]:
# Function to retrieve the embedding for a given word from the embeddings dictionary
def get_embed(embeddings, word):
    try:
        index = word2index[word]
    except:        
         # If the word is not in word2index, set it to '<UNK>' (Unknown token)
        word = '<UNK>'
        
    return embeddings[word]

In [35]:
# Find the embeddings from each of our models
skipgram_embeds = compute_embeddings(skipgram_model, vocabs)
neg_embeds = compute_embeddings(skipgram_neg_model, vocabs)
glove_embeds = compute_embeddings(glove_model, vocabs)

In [36]:
embeds_dict = {
    "skipgram_embeds": skipgram_embeds,
    "neg_embeds": neg_embeds,
    "glove_embeds": glove_embeds
}

for embeds in embeds_dict.items():
    with open(f"embedding/{embeds[0]}.pickle", "wb") as f:
        pickle.dump(embeds[1], f)

In [37]:
get_embed(neg_embeds, 'china')

array([-0.61311311, -0.95424384])

In [38]:
# read the analogy dataset
with open("word-test.v1.txt", "r") as f:
    data = f.read()

data = data.replace("\t", "")
# split the dataset based on their categories
analogy = data.split(': ')

In [56]:
analogy[12]

'gram7-past-tense\ndancing danced decreasing decreased\ndancing danced describing described\ndancing danced enhancing enhanced\ndancing danced falling fell\ndancing danced feeding fed\ndancing danced flying flew\ndancing danced generating generated\ndancing danced going went\ndancing danced hiding hid\ndancing danced hitting hit\ndancing danced implementing implemented\ndancing danced increasing increased\ndancing danced jumping jumped\ndancing danced knowing knew\ndancing danced listening listened\ndancing danced looking looked\ndancing danced moving moved\ndancing danced paying paid\ndancing danced playing played\ndancing danced predicting predicted\ndancing danced reading read\ndancing danced running ran\ndancing danced saying said\ndancing danced screaming screamed\ndancing danced seeing saw\ndancing danced selling sold\ndancing danced shrinking shrank\ndancing danced singing sang\ndancing danced sitting sat\ndancing danced sleeping slept\ndancing danced slowing slowed\ndancing dan

In [40]:
# select the 'capital-common-countries' section of the dataset

capital = analogy[1].split('\n')[1:-1]
capital = [x.split(" ") for x in capital]
capital[:5]

[['Athens', 'Greece', 'Baghdad', 'Iraq'],
 ['Athens', 'Greece', 'Bangkok', 'Thailand'],
 ['Athens', 'Greece', 'Beijing', 'China'],
 ['Athens', 'Greece', 'Berlin', 'Germany'],
 ['Athens', 'Greece', 'Bern', 'Switzerland']]

In [41]:
# Select the 'gram7-past-tense' section of the dataset
past_tense = analogy[12].split('\n')[1:-1]
past_tense = [x.split(" ") for x in past_tense]
past_tense[:5]

[['dancing', 'danced', 'decreasing', 'decreased'],
 ['dancing', 'danced', 'describing', 'described'],
 ['dancing', 'danced', 'enhancing', 'enhanced'],
 ['dancing', 'danced', 'falling', 'fell'],
 ['dancing', 'danced', 'feeding', 'fed']]

In [42]:
capital[6]

['Athens', 'Greece', 'Canberra', 'Australia']

In [43]:
# China - Beijing + Baghdad
# ground-truth == y_true == 'Iraq'
i = 1
y_pred = get_embed(neg_embeds, capital[i][1].lower()) - get_embed(neg_embeds, capital[i][0].lower()) + get_embed(neg_embeds, capital[i][2].lower())
y_pred

array([-0.34053925, -0.52838098])

In [44]:
# Function to calculate cosine similarity between two vectors
def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

In [45]:
# Function to find the most similar word to the input vector based on cosine similarity
def get_most_similar(vector, embeddings):
    # retrieve all words in our embeddings vocabs
    try:
        words = list(embeddings.keys())
    except:
        words = list(embeddings.key_to_index.keys())
    
    similarities = {}

    # for each word in the vocabs, find the cosine similarities between word vectors in our embeddings and the input vector
    for word in words:
        similarities[word] = cosine_similarity(vector, embeddings[word])

     # Find and return the word with the highest cosine similarity (most similar to the input vector)
    return max(similarities, key=similarities.get)

In [46]:
# Function to find the most similar word to the input vector based on cosine similarity
def cosine_ranking(vector, embeddings):
    # retrieve all words in our embeddings vocabulary
    try:
        words = list(embeddings.keys())
    except:
        words = list(embeddings.key_to_index.keys())
    
    similarities = {}

    # For each word in the vocabulary, compute the cosine similarity between the word's embedding and the input vector
    for word in words:
        # Compute cosine similarity for the word's embedding and the input vector
        similarities[word] = cosine_similarity(vector, embeddings[word])

    # Sort the similarities in descending order and return the words along with their similarity scores
    # Return a dictionary with words sorted by similarity (highest first)
    return dict(sorted(similarities.items(), key=lambda item: item[1], reverse=True))

### Defining accuracy function

In [47]:
# Function for finding semantic and syntactic accuracies
def find_accuracy(dataset, embeddings):
    matched_count = 0                   # Initialize a variable to keep track of the number of correct predictions

    for data in dataset:
        # Convert each word in the current sentence (data) to lowercase
        row = [word.lower() for word in data]
        
        # Try to predict the word based on word embeddings
        try:
            # Predict the word by performing a vector arithmetic operation: word2 - word1 + word3
            # The idea here is that embeddings are trained such that word2 - word1 + word3 gives us a vector that is closest to the target word (semantic analogy)
            pred_y = get_embed(embeddings, row[1]) - get_embed(embeddings, row[0]) + get_embed(embeddings, row[2])
            
            # Find the word whose embedding is closest to the predicted vector
            pred_word = get_most_similar(pred_y, embeddings)
        except:
            # If an error occurs (e.g., a word is missing or not in the embeddings), fall back on the Gensim method to find the most similar word
            pred_word = embeddings.most_similar(positive=[row[1], row[2]], negative=[row[0]])[0][0]

        # Check if the predicted word matches the ground-truth word (row[3])
        # If the predicted word is the same as the ground-truth word, increase the matched_count
        if row[3] == pred_word:
            matched_count += 1

    # Return the accuracy as the ratio of matched predictions to total dataset size
    return matched_count / len(dataset)

### Accuracy for the Models: Comparison and Analysis

In [48]:
skipgram_sem = find_accuracy(capital, skipgram_embeds)
skipgram_syn = find_accuracy(past_tense, skipgram_embeds)

In [49]:
neg_sem = find_accuracy(capital, neg_embeds)
neg_syn = find_accuracy(past_tense, neg_embeds)

In [50]:
glove_sem = find_accuracy(capital, glove_embeds)
glove_syn = find_accuracy(past_tense, glove_embeds)

In [51]:
gensim_sem = find_accuracy(capital, gensim_model)
gensim_syn = find_accuracy(past_tense, gensim_model)

In [52]:
print("=== Word2Vec ===")
print(f"Semantic accuracy: {skipgram_sem}")
print(f"Syntatic accuracy: {skipgram_syn}\n")

print("=== Word2Vec (Negative Sampling) ===")
print(f"Semantic accuracy: {neg_sem}")
print(f"Syntatic accuracy: {neg_syn}\n")

print("=== GloVe from Scratch ===")
print(f"Semantic accuracy: {glove_sem}")
print(f"Syntatic accuracy: {glove_syn}\n")

print("=== GloVe (Gensim) ===")
print(f"Semantic accuracy: {gensim_sem}")
print(f"Syntatic accuracy: {gensim_syn}")

=== Word2Vec ===
Semantic accuracy: 0.0
Syntatic accuracy: 0.0

=== Word2Vec (Negative Sampling) ===
Semantic accuracy: 0.0
Syntatic accuracy: 0.0

=== GloVe from Scratch ===
Semantic accuracy: 0.0
Syntatic accuracy: 0.0

=== GloVe (Gensim) ===
Semantic accuracy: 0.9031620553359684
Syntatic accuracy: 0.41794871794871796


## Similarity Correlation using Similarity Dataset

In [53]:
# load word similarity dataset as pandas dataframe
wordsim = pd.read_csv('wordsim_similarity_goldstandard.txt', sep="\t", header=None, names=['word_1', 'word_2', 'similarities'])
wordsim

Unnamed: 0,word_1,word_2,similarities
0,tiger,cat,7.35
1,tiger,tiger,10.00
2,plane,car,5.77
3,train,car,6.31
4,television,radio,6.77
...,...,...,...
198,rooster,voyage,0.62
199,noon,string,0.54
200,chord,smile,0.54
201,professor,cucumber,0.31


In [54]:
wordsim['SKIP_dot_product'] = wordsim.apply(lambda row: np.dot(
    get_embed(skipgram_embeds, row['word_1'].lower()), get_embed(skipgram_embeds, row['word_2'].lower())
    ), axis=1)

wordsim['NEG_dot_product'] = wordsim.apply(lambda row: np.dot(
    get_embed(neg_embeds, row['word_1'].lower()), get_embed(neg_embeds, row['word_2'].lower())
    ), axis=1)

wordsim['glove_dot_product'] = wordsim.apply(lambda row: np.dot(
    get_embed(glove_embeds, row['word_1'].lower()), get_embed(glove_embeds, row['word_1'].lower())
    ), axis=1)

wordsim['gensim_dot_product'] = wordsim.apply(lambda row: np.dot(
    gensim_model[row['word_1'].lower()], gensim_model[row['word_2'].lower()]
    ), axis=1)

wordsim

Unnamed: 0,word_1,word_2,similarities,SKIP_dot_product,NEG_dot_product,glove_dot_product,gensim_dot_product
0,tiger,cat,7.35,-0.038295,-0.639163,0.401860,15.629377
1,tiger,tiger,10.00,2.650121,0.359589,0.401860,32.800148
2,plane,car,5.77,-1.192696,-0.095099,0.401860,24.047298
3,train,car,6.31,-1.192696,-0.095099,0.401860,25.472923
4,television,radio,6.77,0.303345,-0.029856,2.008640,34.689987
...,...,...,...,...,...,...,...
198,rooster,voyage,0.62,0.081380,-0.344292,0.401860,1.683646
199,noon,string,0.54,2.650121,0.359589,0.401860,1.070593
200,chord,smile,0.54,2.650121,0.359589,0.401860,6.762520
201,professor,cucumber,0.31,-0.706180,0.604041,0.686342,-0.230552


In [55]:
from scipy.stats import spearmanr

# finding spearman correlations between wordsim353 similarities and our embeddings dot products
wordsim_sim = wordsim['similarities'].to_numpy()
skipgram_sim = wordsim['SKIP_dot_product'].to_numpy()
neg_sim = wordsim['NEG_dot_product'].to_numpy()
glove_sim = wordsim['glove_dot_product'].to_numpy()
gensim_sim = wordsim['gensim_dot_product'].to_numpy()

print("=== Spearman correlations ===")
print(f"Word2Vec (Skipgram): {spearmanr(wordsim_sim, skipgram_sim).statistic}")
print(f"Word2Vec (Negative Sampling): {spearmanr(wordsim_sim, neg_sim).statistic}")
print(f"GloVe from Scratch: {spearmanr(wordsim_sim, glove_sim).statistic}")
print(f"GloVe (Gensim): {spearmanr(wordsim_sim, gensim_sim).statistic}")

=== Spearman correlations ===
Word2Vec (Skipgram): 0.1716951828130863
Word2Vec (Negative Sampling): -0.03186228945819189
GloVe from Scratch: -0.057911795204087395
GloVe (Gensim): 0.5430870624672256


<h4>Model Accuracies and Training Time Comparison</h4>

| **Model**          | **Window Size** | **Training Loss** | **Training time** | **Syntactic Accuracy** | **Semantic accuracy** |
|--------------------|:---------------:|:-----------------:|:-----------------:|:----------------------:|:---------------------:|
| **Skipgram**       |        2        |      8.24      |       3.18 s      |          0.        |           0          |
| **Skipgram (NEG)** |        2        |       14.95      |       2.99 s      |           0           |           0         |
| **GloVe**          |        2        |       7.93      |      59.40 s      |           0           |          0.2        |
| **GloVe (Gensim)** |        -        |         -         |         -         |         41.79%         |         90.32        |

<h4>Correlation between Model Dot Product and Score by Human Judgement</h4>

| **Model**                | **Skipgram** | **NEG** | **GloVe** | **GloVe (gensim)** |
|--------------------------|--------------|---------|-----------|--------------------|
| **Spearman Correlation** |    0.172    |  -0.032 |   -0.058  |       0.5431       |