# Word2Vec (Negative Sampling)

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
np.__version__, torch.__version__

('1.24.2', '2.1.2+cu121')

## 1. Load data

In [3]:
import nltk
from nltk.corpus import reuters
nltk.download('reuters')

[nltk_data] Downloading package reuters to /home/koala/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [33]:
# Getting all file IDs from the Reuters corpus
file_ids = reuters.fileids()

# Creating a corpus where each document is a separate item
reuter = [reuters.raw(fileid) for fileid in file_ids]

# Print the first document
print("First Document:")
print(reuter[0])

print("\n")

# Print the second document
print("Second Document:")
print(reuter[1])

print(len(reuter))

First Document:
ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT
  Mounting trade friction between the
  U.S. And Japan has raised fears among many of Asia's exporting
  nations that the row could inflict far-reaching economic
  damage, businessmen and officials said.
      They told Reuter correspondents in Asian capitals a U.S.
  Move against Japan might boost protectionist sentiment in the
  U.S. And lead to curbs on American imports of their products.
      But some exporters said that while the conflict would hurt
  them in the long-run, in the short-term Tokyo's loss might be
  their gain.
      The U.S. Has said it will impose 300 mln dlrs of tariffs on
  imports of Japanese electronics goods on April 17, in
  retaliation for Japan's alleged failure to stick to a pact not
  to sell semiconductors on world markets at below cost.
      Unofficial Japanese estimates put the impact of the tariffs
  at 10 billion dlrs and spokesmen for major electronics firms
  said they would virtua

In [34]:
# tokenization
corpus = reuters.sents()
corpus = corpus[0:2500] 

# flattening
flatten = lambda l: [item for sublist in l for item in sublist]

# unique words
vocabs = list(set(flatten(corpus))) 

# mapping between word and integer
word2index = {v: idx for idx, v in enumerate(vocabs)}

In [35]:
total_word = 0
for sen in corpus:
    for w in sen:
        total_word += 1
total_word

75698

In [36]:
len(vocabs)

8557

In [37]:
# if unkown word is seen, return 'UNK'
vocabs.append('<UNK>')
word2index['<UNK>'] = 9489

In [9]:
# just in case we need to use
index2word = {v: k for k,v in word2index.items()}

In [10]:
index2word[9489]

'<UNK>'

In [11]:
len(vocabs)

8558

#### 2. Training data

In [12]:
i = 0
for c in corpus:
    print(c)
    # Stop after ten
    i+=1
    if i > 10:
        break

['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'Asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.']
['They', 'told', 'Reuter', 'correspondents', 'in', 'Asian', 'capitals', 'a', 'U', '.', 'S', '.', 'Move', 'against', 'Japan', 'might', 'boost', 'protectionist', 'sentiment', 'in', 'the', 'U', '.', 'S', '.', 'And', 'lead', 'to', 'curbs', 'on', 'American', 'imports', 'of', 'their', 'products', '.']
['But', 'some', 'exporters', 'said', 'that', 'while', 'the', 'conflict', 'would', 'hurt', 'them', 'in', 'the', 'long', '-', 'run', ',', 'in', 'the', 'short', '-', 'term', 'Tokyo', "'", 's', 'loss', 'might', 'be', 'their', 'gain', '.']
['The', 'U', '.', 'S', '.', 'Has', 'said', 'it', 'will', 'impo

In [13]:
#window size 2

#create pairs of center word, and outside word
def random_batch(batch_size, corpus):

    skipgrams = []

    #loop each corpus
    for doc in corpus:
        #look from the 3rd word until third last word
        for i in range(2, len(doc)-2):
            #center word
            center = word2index[doc[i]]
            #outside words = 2 words
            outside = (word2index[doc[i-2]], word2index[doc[i-1]], word2index[doc[i+1]], word2index[doc[i+2]])
            #for each of these four outside words, we gonna append to a list
            for each_out in outside:
                skipgrams.append([center, each_out])
                #center, outside1;   center, outside2
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)
            
x, y = random_batch(2, corpus)

In [14]:
x.shape  #batch_size, 1

(2, 1)

In [15]:
y.shape  #batch_size 1

(2, 1)

## 3. Negative Sampling

### Unigram distribution

$$P(w)=U(w)^{3/4}/Z$$

In [16]:
z = 0.001

In [17]:
#count
from collections import Counter

word_count = Counter(flatten(corpus))
word_count

#get the total number of words
num_total_words = sum([c for w, c in word_count.items()])
num_total_words

75698

$$P(w)=U(w)^{3/4}/Z$$

In [18]:
unigram_table = []

for v in vocabs:
    uw = word_count[v] / num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([v] * uw_alpha)
    
Counter(unigram_table)

Counter({'.': 111,
         ',': 89,
         'the': 81,
         'of': 57,
         'to': 53,
         'in': 44,
         'and': 42,
         'said': 42,
         'a': 38,
         'mln': 31,
         '-': 27,
         'for': 24,
         "'": 24,
         'vs': 23,
         'The': 22,
         's': 21,
         'dlrs': 20,
         'on': 19,
         '000': 18,
         '1': 18,
         'pct': 18,
         '"': 17,
         '&': 17,
         'that': 17,
         'lt': 17,
         'is': 17,
         ';': 17,
         'it': 17,
         'its': 17,
         'by': 16,
         'from': 16,
         'was': 16,
         'at': 16,
         'be': 15,
         'S': 15,
         'year': 15,
         'U': 15,
         '>': 14,
         'cts': 14,
         'will': 13,
         'has': 13,
         'with': 13,
         '2': 13,
         '1986': 12,
         'billion': 12,
         'company': 12,
         'not': 11,
         'as': 11,
         'an': 11,
         'would': 11,
         '3': 10,
    

## 4. Model

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [19]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index['<UNK>'], seq))
    return torch.LongTensor(idxs)

In [20]:
import random

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):  #(1, k)
        target_index = targets[i].item()
        nsample      = []
        while (len(nsample) < k):
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))
        
    return torch.cat(neg_samples) #batch_size, k

In [21]:
batch_size = 2
x, y = random_batch(batch_size, corpus)
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)

In [22]:
k = 15
neg_samples = negative_sampling(y_tensor, unigram_table, k)

In [23]:
y_tensor[1]

tensor([7408])

In [24]:
neg_samples[1]

tensor([6257, 4702, 7232, 7153, 1914,  300, 5352, 1177, 6397, 5774, 3867, 6352,
        4601, 1914, 3922])

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [25]:
class SkipgramNeg(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative):
        #center, outside:  (bs, 1)
        #negative       :  (bs, k)
        
        center_embed   = self.embedding_center(center) #(bs, 1, emb_size)
        outside_embed  = self.embedding_outside(outside) #(bs, 1, emb_size)
        negative_embed = self.embedding_outside(negative) #(bs, k, emb_size)
        
        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, 1)
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, k)
        ukvc_sum       = torch.sum(ukvc, 1).reshape(-1, 1) #(bs, 1)
        
        loss           = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)
        
        return -torch.mean(loss)

In [26]:
# test the model

emb_size = 30
voc_size = len(vocabs)
model = SkipgramNeg(voc_size, emb_size)

In [27]:
loss = model(x_tensor, y_tensor, neg_samples)

In [28]:
loss

tensor(5.5299, grad_fn=<NegBackward0>)

## 5. Training

In [29]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [30]:
import time
import torch

# Check if GPU with CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move your model to the chosen device
model.to(device)

num_epochs = 1000
print_interval = 100  # Interval to print the loss

# Start the timer
start_time = time.time()

for epoch in range(num_epochs):
    
    # Get batch
    input_batch, label_batch = random_batch(batch_size, corpus)
    
    # Move data to the GPU
    input_tensor = torch.LongTensor(input_batch).to(device)
    label_tensor = torch.LongTensor(label_batch).to(device)
    
    # Predict
    neg_samples = negative_sampling(label_tensor, unigram_table, k).to(device)
    loss = model(input_tensor, label_tensor, neg_samples)
    
    # Backpropagate
    optimizer.zero_grad()
    loss.backward()
    
    # Update weights
    optimizer.step()
    
    # Print the loss every 100 epochs
    if (epoch + 1) % print_interval == 0:
        current_time = time.time()
        elapsed_time = current_time - start_time
        mins, secs = divmod(elapsed_time, 60)
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss.item():2.6f} | Time: {int(mins):02d}m {int(secs):02d}s")

# End of training
end_time = time.time()
total_time = end_time - start_time
total_mins, total_secs = divmod(total_time, 60)
print(f"Training completed in {int(total_mins):02d}m {int(total_secs):02d}s")


Using device: cuda
Epoch    100 | Loss: 22.400711 | Time: 00m 27s
Epoch    200 | Loss: 17.025345 | Time: 00m 55s
Epoch    300 | Loss: 3.511977 | Time: 01m 22s
Epoch    400 | Loss: 9.364348 | Time: 01m 49s
Epoch    500 | Loss: 2.557838 | Time: 02m 17s
Epoch    600 | Loss: 5.657587 | Time: 02m 44s
Epoch    700 | Loss: 0.004698 | Time: 03m 11s
Epoch    800 | Loss: 13.506144 | Time: 03m 39s
Epoch    900 | Loss: 18.277500 | Time: 04m 07s
Epoch   1000 | Loss: 3.522112 | Time: 04m 34s
Training completed in 04m 34s


In [32]:
# save the model
torch.save(model.state_dict(), 'models/skipgramNEG.pth')