# GloVE

Let's work on implementation of GloVE.

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [3]:
np.__version__, torch.__version__

('1.26.4', '2.5.1+cu121')

In [5]:
# Select the GPU with the most free memory
def get_free_gpu():
    # Check if CUDA is available before getting device count
    if torch.cuda.is_available():
        free_mem = [torch.cuda.memory_reserved(i) for i in range(torch.cuda.device_count())]
        return free_mem.index(min(free_mem))
    # If CUDA is not available, return -1 (or another appropriate value)
    else:
        return -1

best_gpu = get_free_gpu()

# Use the best GPU if available, otherwise use CPU
if best_gpu != -1:
    torch.cuda.set_device(best_gpu)
    print(f"Using GPU: {best_gpu}")
else:
    print("No CUDA-enabled GPUs found. Using CPU.")

No CUDA-enabled GPUs found. Using CPU.


In [6]:
# Set the device variable
device = torch.device(f'cuda:{best_gpu}' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

## 1. Load data

In [7]:
import nltk
nltk.download('reuters')
nltk.download('punkt_tab')

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [8]:
from nltk.corpus import reuters

# Get sentences from the Reuters corpus
corpus = reuters.sents()

# Limit the corpus to the first 20,000 sentences
corpus = corpus[:10000]
len(corpus)

10000

In [17]:
#get word sequences and unique words
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(corpus)))
vocab

['ending',
 'JORDAN',
 'Sight',
 'impact',
 'scratch',
 'well',
 'shared',
 'graphics',
 'Foothill',
 'Baldwin',
 '1965',
 'channels',
 'speeds',
 'WPP',
 'proportion',
 'Scallop',
 'attorney',
 'WOODSTOCK',
 'juice',
 'Distilling',
 '441',
 'wound',
 'Southeast',
 'Farmer',
 'raises',
 'referred',
 'law',
 'assessed',
 'reign',
 'parities',
 'multiples',
 'revealed',
 'multinational',
 'CAPE',
 'GRANTS',
 'contaminated',
 'definite',
 'Hutton',
 'PARTICIPATING',
 '367',
 'According',
 'programmable',
 'emphasized',
 'ALRN',
 'struggle',
 'stretch',
 'MI',
 'DETAILS',
 'Stcks',
 'Fluorocarbon',
 'parcel',
 'COATINGS',
 'CR',
 'renewal',
 'UNPREPARED',
 'whom',
 'PZA',
 'separating',
 'gauges',
 'priced',
 'successive',
 'delivery',
 'contended',
 'March',
 'ASSETS',
 'Light',
 'stark',
 'SHARE',
 'CEREAL',
 'Under',
 'saturated',
 'dispositions',
 'fire',
 'Renato',
 'emerge',
 'Clarke',
 'Piezo',
 'Electronics',
 'Postal',
 'EXTENDS',
 '553',
 'nonprofit',
 'sowing',
 'gangs',
 'plate

In [18]:
#numericalization
word2index = {w: i for i, w in enumerate(vocab)}
print(word2index)



In [19]:
#vocab size
voc_size = len(vocab)
print(voc_size)

18045


In [20]:
#append UNK
vocab.append('<UNK>')

In [21]:
vocab[:100]

['ending',
 'JORDAN',
 'Sight',
 'impact',
 'scratch',
 'well',
 'shared',
 'graphics',
 'Foothill',
 'Baldwin',
 '1965',
 'channels',
 'speeds',
 'WPP',
 'proportion',
 'Scallop',
 'attorney',
 'WOODSTOCK',
 'juice',
 'Distilling',
 '441',
 'wound',
 'Southeast',
 'Farmer',
 'raises',
 'referred',
 'law',
 'assessed',
 'reign',
 'parities',
 'multiples',
 'revealed',
 'multinational',
 'CAPE',
 'GRANTS',
 'contaminated',
 'definite',
 'Hutton',
 'PARTICIPATING',
 '367',
 'According',
 'programmable',
 'emphasized',
 'ALRN',
 'struggle',
 'stretch',
 'MI',
 'DETAILS',
 'Stcks',
 'Fluorocarbon',
 'parcel',
 'COATINGS',
 'CR',
 'renewal',
 'UNPREPARED',
 'whom',
 'PZA',
 'separating',
 'gauges',
 'priced',
 'successive',
 'delivery',
 'contended',
 'March',
 'ASSETS',
 'Light',
 'stark',
 'SHARE',
 'CEREAL',
 'Under',
 'saturated',
 'dispositions',
 'fire',
 'Renato',
 'emerge',
 'Clarke',
 'Piezo',
 'Electronics',
 'Postal',
 'EXTENDS',
 '553',
 'nonprofit',
 'sowing',
 'gangs',
 'plate

In [22]:
word2index['<UNK>'] = voc_size

In [23]:
#just in case we need to use
index2word = {v:k for k, v in word2index.items()}

In [24]:
#vocab size
voc_size = len(vocab)
print(voc_size)

18046


## 2. Build Co-occurence Matrix X

Here, we need to count the co-occurence of two words given some window size.  We gonna use window size of 1.

In [25]:
from collections import Counter

X_i = Counter(flatten(corpus))
X_i

Counter({'ASIAN': 2,
         'EXPORTERS': 14,
         'FEAR': 1,
         'DAMAGE': 1,
         'FROM': 24,
         'U': 1115,
         '.': 18486,
         'S': 1138,
         '.-': 42,
         'JAPAN': 67,
         'RIFT': 1,
         'Mounting': 1,
         'trade': 397,
         'friction': 8,
         'between': 188,
         'the': 10442,
         'And': 44,
         'Japan': 373,
         'has': 934,
         'raised': 64,
         'fears': 11,
         'among': 35,
         'many': 41,
         'of': 6525,
         'Asia': 14,
         "'": 2087,
         's': 1709,
         'exporting': 12,
         'nations': 66,
         'that': 1347,
         'row': 3,
         'could': 283,
         'inflict': 1,
         'far': 45,
         '-': 2741,
         'reaching': 7,
         'economic': 189,
         'damage': 27,
         ',': 12956,
         'businessmen': 14,
         'and': 4478,
         'officials': 156,
         'said': 4626,
         'They': 124,
         'told': 237,

In [26]:
skip_grams = []
window_size = 2

for doc in corpus:
    for i in range(window_size, len(doc)-window_size):
        center = doc[i]
        # outside = [doc[i-1], doc[i+1]]
        outside = tuple(doc[j] for j in range(i - window_size, i + window_size + 1) if j != i)
        for each_out in outside:
            skip_grams.append((center, each_out))
skip_grams

[('FEAR', 'ASIAN'),
 ('FEAR', 'EXPORTERS'),
 ('FEAR', 'DAMAGE'),
 ('FEAR', 'FROM'),
 ('DAMAGE', 'EXPORTERS'),
 ('DAMAGE', 'FEAR'),
 ('DAMAGE', 'FROM'),
 ('DAMAGE', 'U'),
 ('FROM', 'FEAR'),
 ('FROM', 'DAMAGE'),
 ('FROM', 'U'),
 ('FROM', '.'),
 ('U', 'DAMAGE'),
 ('U', 'FROM'),
 ('U', '.'),
 ('U', 'S'),
 ('.', 'FROM'),
 ('.', 'U'),
 ('.', 'S'),
 ('.', '.-'),
 ('S', 'U'),
 ('S', '.'),
 ('S', '.-'),
 ('S', 'JAPAN'),
 ('.-', '.'),
 ('.-', 'S'),
 ('.-', 'JAPAN'),
 ('.-', 'RIFT'),
 ('JAPAN', 'S'),
 ('JAPAN', '.-'),
 ('JAPAN', 'RIFT'),
 ('JAPAN', 'Mounting'),
 ('RIFT', '.-'),
 ('RIFT', 'JAPAN'),
 ('RIFT', 'Mounting'),
 ('RIFT', 'trade'),
 ('Mounting', 'JAPAN'),
 ('Mounting', 'RIFT'),
 ('Mounting', 'trade'),
 ('Mounting', 'friction'),
 ('trade', 'RIFT'),
 ('trade', 'Mounting'),
 ('trade', 'friction'),
 ('trade', 'between'),
 ('friction', 'Mounting'),
 ('friction', 'trade'),
 ('friction', 'between'),
 ('friction', 'the'),
 ('between', 'trade'),
 ('between', 'friction'),
 ('between', 'the'),
 ('be

In [27]:
X_ik_skipgrams = Counter(skip_grams)
X_ik_skipgrams

Counter({('FEAR', 'ASIAN'): 1,
         ('FEAR', 'EXPORTERS'): 1,
         ('FEAR', 'DAMAGE'): 1,
         ('FEAR', 'FROM'): 1,
         ('DAMAGE', 'EXPORTERS'): 1,
         ('DAMAGE', 'FEAR'): 1,
         ('DAMAGE', 'FROM'): 1,
         ('DAMAGE', 'U'): 1,
         ('FROM', 'FEAR'): 1,
         ('FROM', 'DAMAGE'): 1,
         ('FROM', 'U'): 1,
         ('FROM', '.'): 5,
         ('U', 'DAMAGE'): 1,
         ('U', 'FROM'): 1,
         ('U', '.'): 940,
         ('U', 'S'): 877,
         ('.', 'FROM'): 5,
         ('.', 'U'): 979,
         ('.', 'S'): 1807,
         ('.', '.-'): 38,
         ('S', 'U'): 911,
         ('S', '.'): 1801,
         ('S', '.-'): 17,
         ('S', 'JAPAN'): 4,
         ('.-', '.'): 41,
         ('.-', 'S'): 17,
         ('.-', 'JAPAN'): 1,
         ('.-', 'RIFT'): 1,
         ('JAPAN', 'S'): 1,
         ('JAPAN', '.-'): 1,
         ('JAPAN', 'RIFT'): 1,
         ('JAPAN', 'Mounting'): 1,
         ('RIFT', '.-'): 1,
         ('RIFT', 'JAPAN'): 1,
         ('RIF

### Weighting function

GloVe includes a weighting function to scale down too frequent words.

<img src = "../figures/glove_weighting_func.png" width=400>

In [28]:
def weighting(w_i, w_j, X_ik):

    #check whether the co-occurences between w_i and w_j is available
    try:
        x_ij = X_ik[(w_i, w_j)]
        #if not exist, then set to 1 "laplace smoothing"
    except:
        x_ij = 1

    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75

    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1

    return result

In [29]:
from itertools import combinations_with_replacement

X_ik = {} #keeping the co-occurences
weighting_dic = {} #already scale the co-occurences using the weighting function

for bigram in combinations_with_replacement(vocab, 2):
    if X_ik_skipgrams.get(bigram):  #if the pair exists in our corpus
        co = X_ik_skipgrams[bigram]
        X_ik[bigram] = co + 1 #for stability
        X_ik[(bigram[1], bigram[0])] = co + 1 #basically apple, banana = banana, apple
    else:
        pass

    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

## 3. Prepare train data

In [30]:
import math

def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):

    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []

    #convert our skipgrams to id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]

    #randomly choose indexes based on batch size
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False)

    #get the random input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])
        #coocs
        pair = skip_grams[index] #e.g., ('banana', 'fruit')
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])

        #weightings
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])

    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

### Testing the method

In [31]:
batch_size = 2
x, y, cooc, weighting = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)

In [32]:
x

array([[11731],
       [ 1366]])

In [33]:
y

array([[11175],
       [  349]])

In [34]:
cooc

array([[4.30406509],
       [2.19722458]])

In [35]:
weighting

array([[0.79785467],
       [0.16431677]])

## 4. Model

<img src ="../figures/glove.png" width=400>

In [60]:
class Glove(nn.Module):

    def __init__(self, voc_size, emb_size, word2index):
        super(Glove, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)

        self.center_bias       = nn.Embedding(voc_size, 1)
        self.outside_bias      = nn.Embedding(voc_size, 1)

        self.word2index        = word2index

    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.center_embedding(center) #(batch_size, 1, emb_size)
        outside_embeds = self.outside_embedding(outside) #(batch_size, 1, emb_size)

        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)

        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)

        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)

        return torch.sum(loss)

    def get_embed(self, word):
      word2index = self.word2index
      try:
        index = word2index[word]
      except:
        index = word2index['<UNK>']

      word = torch.LongTensor([index]).to(device)

      embed_c = self.center_embedding(word)
      embed_o = self.outside_embedding(word)
      embed   = (embed_c + embed_o) / 2

      return embed[0][0].item(), embed[0][1].item()

In [37]:
#test our system
voc_size = len(vocab)
emb_size = 2
model = Glove(voc_size, emb_size, word2index)

In [38]:
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)
cooc_tensor = torch.FloatTensor(cooc)
weighting_tensor = torch.FloatTensor(weighting)

In [39]:
loss = model(x_tensor, y_tensor, cooc_tensor, weighting_tensor)

In [40]:
loss

tensor(21.1065, grad_fn=<SumBackward0>)

## 5. Training

In [41]:
batch_size     = 10 # mini-batch size
embedding_size = 2 #so we can later plot
model          = Glove(voc_size, embedding_size, word2index).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [42]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_time, elapsed_mins, elapsed_secs

In [43]:
import time

# Training
num_epochs = 1000
total_start_time = time.time()
for epoch in range(num_epochs):

    start = time.time()

    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch) #[batch_size, 1]

    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)

    loss.backward()
    optimizer.step()

    end = time.time()

    total, epoch_mins, epoch_secs = epoch_time(start, end)

    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

total_end_time = time.time()
total_time, mins, secs = epoch_time(total_start_time, total_end_time)

Epoch: 100 | cost: 9.174536 | time: 0m 0s
Epoch: 200 | cost: 36.420441 | time: 0m 0s
Epoch: 300 | cost: 319.926636 | time: 0m 0s
Epoch: 400 | cost: 37.827198 | time: 0m 0s
Epoch: 500 | cost: 31.364248 | time: 0m 0s
Epoch: 600 | cost: 144.001297 | time: 0m 0s
Epoch: 700 | cost: 21.758560 | time: 0m 0s
Epoch: 800 | cost: 89.913231 | time: 0m 0s
Epoch: 900 | cost: 87.280876 | time: 0m 0s
Epoch: 1000 | cost: 3.063797 | time: 0m 0s


In [44]:
# print total train loss and total training time
print(f"Total train loss: {loss:.6f}")
print(f"Total training time: {total_time:.2f} seconds")

Total train loss: 3.063797
Total training time: 251.27 seconds


## 6. Plotting the embeddings

In [45]:
#list of vocabs
vocab[:10]

['ending',
 'JORDAN',
 'Sight',
 'impact',
 'scratch',
 'well',
 'shared',
 'graphics',
 'Foothill',
 'Baldwin']

In [46]:
word = vocab[0]

## 7. Save model

In [56]:
torch.save(model.state_dict(), '../content/app/code/models/glove.pt')

In [57]:
import pickle

glove_args = {
    'word2index': word2index,
    'voc_size': voc_size,
    'emb_size': emb_size
}

pickle.dump(glove_args, open('../content/app/code/models/glove.pkl', 'wb'))

In [61]:
load_glove_args = pickle.load(open('../content/app/code/models/glove.pkl', 'rb'))
load_model = Glove(**load_glove_args).to(device)
load_model.load_state_dict(torch.load('../content/app/code/models/glove.pt'))

  load_model.load_state_dict(torch.load('../content/app/code/models/glove.pt'))


<All keys matched successfully>

In [62]:
load_model.get_embed('impact')

(-0.06003697216510773, 0.8854387998580933)