# Word2Vec (Skipgram )

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [2]:
np.__version__, torch.__version__

('2.2.0', '2.5.1+cu124')

In [3]:
import matplotlib
matplotlib.__version__

'3.10.0'

In [4]:
torch.cuda.is_available()

True

In [5]:
torch.cuda.device_count()

4

In [6]:
# Select the GPU with the most free memory
def get_free_gpu():
    # Check if CUDA is available before getting device count
    if torch.cuda.is_available():
        free_mem = [torch.cuda.memory_reserved(i) for i in range(torch.cuda.device_count())]
        return free_mem.index(min(free_mem))
    # If CUDA is not available, return -1 (or another appropriate value)
    else:
        return -1  

best_gpu = get_free_gpu()

# Use the best GPU if available, otherwise use CPU
if best_gpu != -1:
    torch.cuda.set_device(best_gpu)
    print(f"Using GPU: {best_gpu}")
else:
    print("No CUDA-enabled GPUs found. Using CPU.")

Using GPU: 0


In [7]:
# Set the device variable
device = torch.device(f'cuda:{best_gpu}' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

## 1. Load data

In [8]:
# import nltk
# nltk.download('reuters', download_dir='c:\\Users\\Acer\\Documents\\AIT\\MsDSAI\\January2025\\Python-fo-Natural-Language-Processing\\Code\\01 - Fundamental\\code-along\\')

In [9]:
# import os
# os.getcwd()

In [10]:
# nltk.data.path.append("c:\\Users\\Acer\\Documents\\AIT\\MsDSAI\\January2025\\Python-fo-Natural-Language-Processing\\Code\\01 - Fundamental\\code-along\\")

In [11]:
import nltk
nltk.download('reuters')
nltk.download('punkt_tab')

[nltk_data] Downloading package reuters to /home/jupyter-
[nltk_data]     st125469/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jupyter-
[nltk_data]     st125469/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [12]:
from nltk.corpus import reuters

In [13]:
reuters.words()

['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', ...]

In [14]:
# corpus = ["apple banana fruit", "banana apple fruit", "banana fruit apple",
#           "dog cat animal", "cat animal dog", "cat dog animal"]
corpus = reuters.sents()
corpus = corpus[:10000]
len(corpus)

10000

In [15]:
#1. tokenization
# corpus = [sent.split(" ") for sent in corpus]
# corpus

In [16]:
#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(flatten(corpus))) #all the words we have in the system - <UNK>

In [17]:
#create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}
word2index['pattern']

6328

In [18]:
last_vocab_index = len(vocabs)
last_vocab_index

18045

In [19]:
vocabs.append('<UNK>')
word2index['<UNK>'] = last_vocab_index

In [20]:
index2word = {v:k for k, v in word2index.items()}
index2word[5]

'blunts'

## 2. Prepare train data

In [21]:
#create pairs of center word, and outside word

def random_batch(batch_size, corpus, window_size=2):

    skipgrams = []

    #loop each corpus
    for doc in corpus:
        #look from the 2nd word until second last word
        for i in range(window_size, len(doc)-window_size):
            #center word
            center = word2index[doc[i]]
            #outside words = 2 words
            outside = tuple(word2index[doc[j]] for j in range(i - window_size, i + window_size + 1) if j != i)
            # outside = (word2index[doc[i-2]], word2index[doc[i-1]], word2index[doc[i+1]], word2index[doc[i+2]])
            #for each of these two outside words, we gonna append to a list
            for each_out in outside:
                skipgrams.append([center, each_out])
                #center, outside1;   center, outside2
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)
            
x, y = random_batch(2, corpus)

In [22]:
x.shape  #batch_size, 1

(2, 1)

In [23]:
x

array([[15966],
       [ 8240]])

In [24]:
y.shape  #batch_size 1

(2, 1)

In [25]:
y

array([[ 607],
       [8543]])

## 3. Model

$$J(\theta) = -\frac{1}{T}\sum_{t=1}^{T}\sum_{\substack{-m \leq j \leq m \\ j \neq 0}}\log P(w_{t+j} | w_t; \theta)$$

where $P(w_{t+j} | w_t; \theta) = $

$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

where $o$ is the outside words and $c$ is the center word

In [26]:
latest_vocab_index = len(vocabs)
latest_vocab_index

18046

In [27]:
embedding = nn.Embedding(latest_vocab_index, 2)

In [28]:
x_tensor = torch.LongTensor(x)
embedding(x_tensor).shape  #(batch_size, 1, emb_size)

torch.Size([2, 1, 2])

$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

In [29]:
class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size, word2index):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.word2index = word2index
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center)  #(batch_size, 1, emb_size)
        outside_embedding    = self.embedding_center(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_center(all_vocabs) #(batch_size, voc_size, emb_size)

        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        #batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1) 

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        #batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size) 
        
        lower_term_sum = torch.sum(torch.exp(lower_term), 1)  #(batch_size, 1)
        
        loss = -torch.mean(torch.log(top_term / lower_term_sum))  #scalar
        
        return loss

    def get_embed(self, word):
      word2index = self.word2index
      try:
        index = word2index[word]
      except:
        index = word2index['<UNK>']

      word = torch.LongTensor([index]).to(device)

      embed_c = self.embedding_center(word)
      embed_o = self.embedding_outside(word)
      embed   = (embed_c + embed_o) / 2
    
      return embed[0][0].item(), embed[0][1].item()

In [30]:
#prepare all vocabs

batch_size = 2
voc_size   = len(vocabs)

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size)
all_vocabs

tensor([[    0,     1,     2,  ..., 18043, 18044, 18045],
        [    0,     1,     2,  ..., 18043, 18044, 18045]])

In [31]:
model = Skipgram(voc_size, 2, word2index)
model

Skipgram(
  (embedding_center): Embedding(18046, 2)
  (embedding_outside): Embedding(18046, 2)
)

In [32]:
input_tensor = torch.LongTensor(x)
label_tensor = torch.LongTensor(y)

In [33]:
loss = model(input_tensor, label_tensor, all_vocabs)

In [34]:
loss

tensor(8.5788, grad_fn=<NegBackward0>)

## 4. Training

In [35]:
batch_size = 2
emb_size   = 2
model      = Skipgram(voc_size, emb_size, word2index).to(device)
optimizer  = optim.Adam(model.parameters(), lr=0.001)
all_vocabs = all_vocabs.to(device)

In [36]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_time, elapsed_mins, elapsed_secs

In [37]:
import time

num_epochs = 100
total_time = 0
start_time = time.time()

for epoch in range(num_epochs):
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus)
    input_tensor = torch.LongTensor(input_batch).to(device)
    label_tensor = torch.LongTensor(label_batch).to(device)
    
    #predict
    loss = model(input_tensor, label_tensor, all_vocabs)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print the loss
    if (epoch + 1) % 10 == 0:
        end_time = time.time()
        total, epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        total_time += total
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss:2.6f} | time: {epoch_mins}m {epoch_secs}s")
        start_time = time.time()

Epoch     10 | Loss: 10.360737 | time: 0m 25s
Epoch     20 | Loss: 10.983431 | time: 0m 25s
Epoch     30 | Loss: 11.234663 | time: 0m 24s
Epoch     40 | Loss: 10.896197 | time: 0m 25s
Epoch     50 | Loss: 9.951131 | time: 0m 25s
Epoch     60 | Loss: 10.747758 | time: 0m 25s
Epoch     70 | Loss: 8.882209 | time: 0m 24s
Epoch     80 | Loss: 9.856417 | time: 0m 22s
Epoch     90 | Loss: 9.816429 | time: 0m 22s
Epoch    100 | Loss: 10.466488 | time: 0m 22s


In [38]:
# print total train loss and total training time
print(f"Total train loss: {loss:.6f}")
print(f"Total training time: {total_time:.2f} seconds")

Total train loss: 10.466488
Total training time: 242.98 seconds


## 5. Plot the embeddings

Is fruit really near to banana?
Is fruit really far from cat?

In [39]:
vocabs[:10]

['accept',
 'letup',
 '91',
 'explained',
 'flotation',
 'blunts',
 'headlined',
 'brands',
 'political',
 'WITH']

In [40]:
fact = torch.LongTensor([word2index['fact']]).to(device)
fact

tensor([1758], device='cuda:0')

In [41]:
fact_embed_c = model.embedding_center(fact)
fact_embed_o = model.embedding_outside(fact)
fact_embed   = (fact_embed_c + fact_embed_o) / 2
fact_embed

tensor([[0.5252, 0.4102]], device='cuda:0', grad_fn=<DivBackward0>)

In [42]:
fact_embed_o

tensor([[1.3644, 0.3220]], device='cuda:0', grad_fn=<EmbeddingBackward0>)

In [43]:
# def get_embed_external(word):
#     try:
#         index = word2index[word]
#     except:
#         index = word2index['<UNK>']
        
#     word = torch.LongTensor([word2index[word]])
    
#     embed_c = model.embedding_center(word).to(device)
#     embed_o = model.embedding_outside(word).to(device)
#     embed   = (embed_c + embed_o) / 2
    
#     return embed[0][0].item(), embed[0][1].item()

In [44]:
# get_embed_external('fruit')

In [45]:
# get_embed('cat')

In [46]:
# get_embed('dog')

In [47]:
# get_embed('banana')

In [48]:
# plt.figure(figsize=(6, 3))
# for i, word in enumerate(vocabs):
#     x, y = get_embed(word)
#     plt.scatter(x, y)
#     plt.annotate(word, xy=(x, y), xytext=(5, 2), textcoords='offset points')
# plt.show()

## 5. Save model

In [49]:
torch.save(model.state_dict(), 'app/code/models/skipgram.pt')

In [50]:
import pickle

skipgram_args = {
    'word2index': word2index,
    'voc_size': voc_size,
    'emb_size': emb_size
}

pickle.dump(skipgram_args, open('app/code/models/skipgrams.pkl', 'wb'))

In [51]:
load_skipgram_args = pickle.load(open('app/code/models/skipgrams.pkl', 'rb'))
load_model = Skipgram(**load_skipgram_args).to(device)
load_model.load_state_dict(torch.load('app/code/models/skipgram.pt'))

  load_model.load_state_dict(torch.load('app/code/models/skipgram.pt'))


<All keys matched successfully>

In [52]:
load_model.get_embed('fact')

(0.5251746773719788, 0.41022220253944397)

### 6. Testing

In [53]:
# with open(path_to_file, 'r') as file:
#     content = file.readlines()


## 6. Cosine similarity

In [54]:
# banana = get_embed('banana')
# banana

In [55]:
# fruit = get_embed('fruit')
# fruit

In [56]:
# unk = get_embed('<UNK>')
# unk

In [57]:
# np.array(banana) @ np.array(unk)

In [58]:
# #more formally is to divide by its norm
# def cosine_similarity(A, B):
#     dot_product = np.dot(A, B)
#     norm_a = np.linalg.norm(A)
#     norm_b = np.linalg.norm(B)
#     similarity = dot_product / (norm_a * norm_b)
#     return similarity

# print(cosine_similarity(np.array(banana), np.array(unk)))
# print(cosine_similarity(np.array(banana), np.array(fruit)))