# GloVE

Let's work on implementation of GloVE.

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

## 1. Load data

In [2]:
import nltk
from nltk.corpus import reuters

nltk.download('reuters')

[nltk_data] Downloading package reuters to /home/koala/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [3]:
# Getting all file IDs from the Reuters corpus
file_ids = reuters.fileids()

# Creating a corpus where each document is a separate item
reuter = [reuters.raw(fileid) for fileid in file_ids]

# Print the first document
print("First Document:")
print(reuter[0])

print("\n")

# Print the second document
print("Second Document:")
print(reuter[1])

print(len(reuter))

First Document:
ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT
  Mounting trade friction between the
  U.S. And Japan has raised fears among many of Asia's exporting
  nations that the row could inflict far-reaching economic
  damage, businessmen and officials said.
      They told Reuter correspondents in Asian capitals a U.S.
  Move against Japan might boost protectionist sentiment in the
  U.S. And lead to curbs on American imports of their products.
      But some exporters said that while the conflict would hurt
  them in the long-run, in the short-term Tokyo's loss might be
  their gain.
      The U.S. Has said it will impose 300 mln dlrs of tariffs on
  imports of Japanese electronics goods on April 17, in
  retaliation for Japan's alleged failure to stick to a pact not
  to sell semiconductors on world markets at below cost.
      Unofficial Japanese estimates put the impact of the tariffs
  at 10 billion dlrs and spokesmen for major electronics firms
  said they would virtua

In [4]:
# tokenization
corpus = reuters.sents()
corpus = corpus[0:2500] 

# flattening
flatten = lambda l: [item for sublist in l for item in sublist]

# unique words
vocabs = list(set(flatten(corpus))) 

# mapping between word and integer
word2index = {v: idx for idx, v in enumerate(vocabs)}

In [5]:
total_word = 0
for sen in corpus:
    for w in sen:
        total_word += 1
total_word

75698

In [6]:
len(vocabs)

8557

In [7]:
# if unkown word is seen, return 'UNK'
vocabs.append('<UNK>')
word2index['<UNK>'] = 8557

In [8]:
#just in case we need to use
index2word = {v:k for k, v in word2index.items()} 

In [9]:
index2word[8557]

'<UNK>'

In [10]:
len(vocabs)

8558

## 2. Build Co-occurence Matrix X

Here, we need to count the co-occurence of two words given some window size.  We gonna use window size of 1.

In [11]:
from collections import Counter

X_i = Counter(flatten(corpus))
X_i

Counter({'.': 4084,
         ',': 3022,
         'the': 2675,
         'of': 1682,
         'to': 1512,
         'in': 1186,
         'said': 1137,
         'and': 1106,
         'a': 994,
         'mln': 754,
         '-': 628,
         "'": 548,
         'for': 529,
         'vs': 510,
         'The': 488,
         's': 457,
         'dlrs': 413,
         'on': 385,
         '1': 369,
         '000': 369,
         'pct': 359,
         '"': 357,
         'is': 354,
         'it': 352,
         'its': 351,
         ';': 340,
         '&': 339,
         'lt': 339,
         'that': 331,
         'by': 328,
         'at': 321,
         'from': 315,
         'was': 311,
         'S': 286,
         'be': 285,
         'year': 284,
         'U': 283,
         '>': 278,
         'cts': 277,
         'has': 253,
         'with': 251,
         '2': 245,
         'will': 237,
         'billion': 231,
         '1986': 224,
         'company': 212,
         'an': 205,
         'not': 203,
        

In [12]:
skip_grams = []

for doc in corpus:
    for i in range(1, len(doc)-1):
        center = doc[i]
        outside = [doc[i-1], doc[i+1]]
        for each_out in outside:
            skip_grams.append((center, each_out))
skip_grams

[('EXPORTERS', 'ASIAN'),
 ('EXPORTERS', 'FEAR'),
 ('FEAR', 'EXPORTERS'),
 ('FEAR', 'DAMAGE'),
 ('DAMAGE', 'FEAR'),
 ('DAMAGE', 'FROM'),
 ('FROM', 'DAMAGE'),
 ('FROM', 'U'),
 ('U', 'FROM'),
 ('U', '.'),
 ('.', 'U'),
 ('.', 'S'),
 ('S', '.'),
 ('S', '.-'),
 ('.-', 'S'),
 ('.-', 'JAPAN'),
 ('JAPAN', '.-'),
 ('JAPAN', 'RIFT'),
 ('RIFT', 'JAPAN'),
 ('RIFT', 'Mounting'),
 ('Mounting', 'RIFT'),
 ('Mounting', 'trade'),
 ('trade', 'Mounting'),
 ('trade', 'friction'),
 ('friction', 'trade'),
 ('friction', 'between'),
 ('between', 'friction'),
 ('between', 'the'),
 ('the', 'between'),
 ('the', 'U'),
 ('U', 'the'),
 ('U', '.'),
 ('.', 'U'),
 ('.', 'S'),
 ('S', '.'),
 ('S', '.'),
 ('.', 'S'),
 ('.', 'And'),
 ('And', '.'),
 ('And', 'Japan'),
 ('Japan', 'And'),
 ('Japan', 'has'),
 ('has', 'Japan'),
 ('has', 'raised'),
 ('raised', 'has'),
 ('raised', 'fears'),
 ('fears', 'raised'),
 ('fears', 'among'),
 ('among', 'fears'),
 ('among', 'many'),
 ('many', 'among'),
 ('many', 'of'),
 ('of', 'many'),
 ('of

In [13]:
X_ik_skipgrams = Counter(skip_grams)
X_ik_skipgrams

Counter({('S', '.'): 504,
         ('.', 'S'): 478,
         ("'", 's'): 454,
         ('s', "'"): 454,
         (',', '000'): 376,
         ('000', ','): 370,
         ('said', '.'): 363,
         ('lt', '&'): 339,
         ('lt', ';'): 339,
         (';', 'lt'): 339,
         ('of', 'the'): 335,
         ('the', 'of'): 335,
         ('&', 'lt'): 329,
         ('in', 'the'): 288,
         ('the', 'in'): 288,
         ('.', 'U'): 283,
         ('U', '.'): 265,
         ('1', '.'): 261,
         ('.', '1'): 259,
         ('said', 'it'): 178,
         ('it', 'said'): 178,
         (',', 'the'): 166,
         ('the', ','): 166,
         ('2', '.'): 164,
         ('.', '2'): 162,
         ('the', 'said'): 161,
         ('said', 'the'): 160,
         ('mln', 'dlrs'): 153,
         ('dlrs', 'mln'): 153,
         ('5', '.'): 141,
         ('.', '5'): 140,
         ('4', '.'): 120,
         ('.', '4'): 118,
         ('9', '.'): 118,
         ('.', '3'): 118,
         ('3', '.'): 118,
         

### Weighting function

GloVe includes a weighting function to scale down too frequent words.

<img src = "../figures/glove_weighting_func.png" width=400>

In [14]:
def weighting(w_i, w_j, X_ik):
    
    #check whether the co-occurences between w_i and w_j is available
    try:
        x_ij = X_ik[(w_i, w_j)]
        #if not exist, then set to 1 "laplace smoothing"
    except:
        x_ij = 1
        
    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75
    
    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1
    
    return result

In [15]:
from itertools import combinations_with_replacement

X_ik = {} #keeping the co-occurences
weighting_dic = {} #already scale the co-occurences using the weighting function

for bigram in combinations_with_replacement(vocabs, 2):
    if X_ik_skipgrams.get(bigram):  #if the pair exists in our corpus
        co = X_ik_skipgrams[bigram]
        X_ik[bigram] = co + 1 #for stability
        X_ik[(bigram[1], bigram[0])] = co + 1 #basically apple, banana = banana, apple
    else:
        pass
    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

## 3. Prepare train data

In [16]:
import math

def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    
    #convert our skipgrams to id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    #randomly choose indexes based on batch size
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False)
    
    #get the random input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])
        #coocs
        pair = skip_grams[index] #e.g., ('banana', 'fruit')
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
    
        #weightings
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

### Testing the method

In [17]:
batch_size = 2
x, y, cooc, weighting = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)

In [18]:
x

array([[5132],
       [1804]])

In [19]:
y

array([[2369],
       [8203]])

In [20]:
cooc

array([[0.69314718],
       [0.69314718]])

In [21]:
weighting

array([[0.05318296],
       [0.05318296]])

## 4. Model

<img src ="../figures/glove.png" width=400>

In [22]:
class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.embedding_center(center) #(batch_size, 1, emb_size)
        outside_embeds = self.embedding_outside(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

In [23]:
#test our system
voc_size = len(vocabs)
emb_size = 30
model = Glove(voc_size, emb_size)

In [24]:
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)
cooc_tensor = torch.FloatTensor(cooc)
weighting_tensor = torch.FloatTensor(weighting)

In [25]:
loss = model(x_tensor, y_tensor, cooc_tensor, weighting_tensor)

In [26]:
loss

tensor(3.3862, grad_fn=<SumBackward0>)

## 5. Training

In [27]:
batch_size     = 2 # mini-batch size
embedding_size = 30 #so we can later plot
model          = Glove(voc_size, embedding_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [28]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [36]:
import time
import torch

# Check if GPU with CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move your model to the chosen device
model.to(device)

num_epochs = 1000
print_interval = 100  # Interval to print the loss

# Start the timer
start_time = time.time()

for epoch in range(num_epochs):
    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch).to(device)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch).to(device)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch).to(device)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch).to(device) #[batch_size, 1]
    
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)

    # backpropagate
    optimizer.zero_grad()

    loss.backward()
    # update weight
    optimizer.step()
    
    # Print the loss every 100 epochs
    if (epoch + 1) % print_interval == 0:
        current_time = time.time()
        elapsed_time = current_time - start_time
        mins, secs = divmod(elapsed_time, 60)
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss.item():2.6f} | Time: {int(mins):02d}m {int(secs):02d}s")

# End of training
end_time = time.time()
total_time = end_time - start_time
total_mins, total_secs = divmod(total_time, 60)
print(f"Training completed in {int(total_mins):02d}m {int(total_secs):02d}s")

Using device: cuda
Epoch    100 | Loss: 1.132968 | Time: 00m 02s
Epoch    200 | Loss: 0.232313 | Time: 00m 04s
Epoch    300 | Loss: 2.897296 | Time: 00m 05s
Epoch    400 | Loss: 2.982105 | Time: 00m 07s
Epoch    500 | Loss: 30.608322 | Time: 00m 09s
Epoch    600 | Loss: 3.885939 | Time: 00m 11s
Epoch    700 | Loss: 9.878235 | Time: 00m 13s
Epoch    800 | Loss: 1.843969 | Time: 00m 15s
Epoch    900 | Loss: 0.481872 | Time: 00m 17s
Epoch   1000 | Loss: 2.528146 | Time: 00m 19s
Training completed in 00m 19s


In [30]:
# save the model
torch.save(model.state_dict(), '/home/koala/assignment/NPL-A1/models/GloVe.pth')