# A1 : Search Engine (Negative Sampling)

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib
import matplotlib.pyplot as plt
import pickle

In [2]:
np.__version__, torch.__version__, matplotlib.__version__

('1.25.2', '2.1.0', '3.7.2')

## 1. Data Loader

In [3]:
# read the nltk dataset = rural
txt_file = './abc/rural.txt'

with open(txt_file, 'r', encoding = 'utf-8') as file:
        text = file.read()
        
# Split the dataset into paragraphs based on double line breaks : to get one paragraph in a list item ['paragraph1', 'paragraph']
paragraphs = [paragraph.strip() for paragraph in text.split('\n\n')]

In [4]:
for i in range (len(paragraphs)):
    # Replace newline characters with spaces
    paragraphs[i] = paragraphs[i].replace('\n', ' ')

In [5]:
# paragraphs

### 1.1 Tokenization

In [6]:
# 1. tokenization
corpus = [sent.split(" ") for sent in paragraphs]
# corpus

In [7]:
# limit the corpus only accept 100 documents
corpus = corpus[:100]

### 1.2 numericalization

In [8]:
# get unique word

# list comprehension for getting words
flatten = lambda l: [item for sublist in l for item in sublist]

# getting unique word and store as a list
vocab = list(set(flatten(corpus)))
# vocab

In [9]:
# add <UNK> to a dictionary vocab
vocab.append('<UNK>')

In [10]:
# numericalization: assign index to each word
word2index = {w:idx for idx, w in enumerate(vocab)}
# word2index

In [11]:
# index2word
index2word = {k:v for v,k in word2index.items()}
# index2word

## 2. Preparation for the training data

In [12]:
def random_batch(batch_size, corpus):

    # define a list for storing [center,outside] pair
    skipgrams = []

    # loop each word sequence
    for sent in corpus:
        
        for i in range(2, len(sent)-2):
            
            # assign center word
            center_word = word2index[sent[i]]
            
            # assign outside word=4 (ws = 2)
            outside_word = [word2index[sent[i-2]], word2index[sent[i-1]], word2index[sent[i+1]], word2index[sent[i+2]]]
            
            # for each of these two outside words, we gonna pair (center,outside) and append to a list
            for each_outside in outside_word:
                skipgrams.append([center_word, each_outside])
                
    # randomly select 2 pair among the data
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace = False)
    
    random_inputs = []
    random_labels = []
    
    for i in random_index:
        random_inputs.append([skipgrams[i][0]]) # center_word
        random_labels.append([skipgrams[i][1]]) # outside_word
        
    return np.array(random_inputs), np.array(random_labels)

## 3. Negative Sampling

### Unigram Distribution

$$P(w) = U(w)^{3/4} / Z

In [13]:
z = 0.001 # according to the papaer

In [14]:
# count
from collections import Counter

word_count = Counter(flatten(corpus))
word_count # {'apple': 10, 'orange' : 5}

# get the total number of words
num_total_words = sum([c for w,c in word_count.items()])
print(num_total_words)

14716


In [15]:
# U(w)

unigram_table = []

for v in vocab:
    uw = word_count[v] / num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    # print(v, uw)
    # print(v, uw_alpha)
    # print([v] * uw_alpha)
    # print("---")
    unigram_table.extend([v] * uw_alpha) 
    
# Counter(unigram_table)

## 4. Model

In [16]:
def prepare_sequence (seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index['<UNK>'],seq))
    return torch.LongTensor(idxs)

In [17]:
import random

def negative_sampling(targets, unigram_table, k):
    
    batch_size  = targets.shape[0]
    neg_samples = []
    
    for i in range(batch_size): # (1,k) # (batch_size, k)
        target_index = targets[i].item()
        nsample = []
        while (len(nsample) < k):
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1,-1))
            
        
    return torch.cat(neg_samples) # [batch_size,k]
    

#### testing

In [18]:
batch_size = 2
x,y = random_batch(batch_size, corpus)
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)

In [19]:
k = 5
neg_samples = negative_sampling(y_tensor, unigram_table, k)

In [20]:
print(neg_samples.shape)
print(neg_samples[0])
print(neg_samples[1])
print(y)

torch.Size([2, 5])
tensor([3738,  368, 1626, 3698, 4119])
tensor([1181, 3841, 1208, 2747, 2226])
[[ 936]
 [1414]]


#### Model

In [21]:
class SkipgramNeg (nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative_words):
        
        # center, outside : (bs, 1)
        # negative : (bs, k)
        
        center_embed  = self.embedding_center(center) # (bs, 1, emb_size)
        outside_embed = self.embedding_outside(outside) #(bs, 1, emb_size)
        neg_embed     = self.embedding_outside(negative_words) # (bs, k, emb_size)
        
        uovc          = outside_embed.bmm(center_embed.transpose(1,2)).squeeze(2) # (bs,1)
        ukvc          = -neg_embed.bmm(center_embed.transpose(1,2)).squeeze(2) #(bs,k)
        ukvc_sum      = torch.sum(ukvc, 1).reshape(-1,1) # sum across k , reshape>> (batch_size,1)
        
        loss = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)
        
        return -torch.mean(loss)        
        

#### test the model

In [22]:
emb_size = 2
voc_size = len(vocab)
model = SkipgramNeg(voc_size, emb_size)

In [23]:
neg_samples 

tensor([[3738,  368, 1626, 3698, 4119],
        [1181, 3841, 1208, 2747, 2226]])

In [24]:
loss = model(x_tensor, y_tensor, neg_samples)

In [25]:
loss

tensor(2.1676, grad_fn=<NegBackward0>)

## 5.Train

In [26]:
import time

# for recording the training time for each epoch
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time # get the total taken timestamp
    elapsed_mins = int(elapsed_time / 60) # get the min
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60)) # get the sec
    return elapsed_mins, elapsed_secs

In [27]:
k = 5
emb_size = 2
voc_size = len(vocab)
model    = SkipgramNeg(voc_size, emb_size)
optimizer = optim.Adam(model.parameters(), lr = 0.0001)

In [28]:
# num_epochs = 5
num_epochs = 5000
loss_ar = []
starttime = time.time()

for epoch in range(num_epochs):
    
    # get batch
    input_batch, label_batch = random_batch(batch_size, corpus)
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    # predict
    neg_samples = negative_sampling(label_tensor, unigram_table, k)
    loss = model (input_tensor, label_tensor, neg_samples)
    
    # backpropagate
    optimizer.zero_grad()
    loss.backward()
    
    # update alpha
    optimizer.step()
    
    # record loss
    loss_ar.append(loss)
    
    # print loss
    if (epoch+1) % 1000 == 0:
        print(f"Epoch {epoch+1:6.0f} | Loss {loss:2.6f}")

endtime = time.time()
    

Epoch   1000 | Loss 1.647907
Epoch   2000 | Loss 0.489159
Epoch   3000 | Loss 4.145045
Epoch   4000 | Loss 3.187397
Epoch   5000 | Loss 0.798258


In [29]:
total_training_time = epoch_time(starttime,endtime)
print(f"total_training_time: {total_training_time[0]}m : {total_training_time[1]}s")

total_training_time: 3m : 1s


In [30]:
print(loss)

tensor(0.7983, grad_fn=<NegBackward0>)


## 6. Plotting the embeddings

In [31]:
def get_embed (word):
    try:
        index = word2index[word]
    except:
        index = word2index['<UNK>']
    
    word = torch.LongTensor([word2index[word]])
    
    embed_c = model.embedding_center(word)
    embed_o = model.embedding_outside(word)
    embed = (embed_c + embed_o) / 2
    
    return embed[0][0].item(), embed[0][1].item()

## 7. Cosine Similarity

## Save the model

In [32]:
# Save the model
torch.save(model.state_dict(), './model/A1-NegSampling.pt')

# save the data
Data = {
    'corpus': corpus,
    'vocab': vocab,
    'word2index': word2index,
    'voc_size': voc_size,
    'embedding_size': emb_size
}

pickle.dump(Data,open('./model/Data.pkl', 'wb'))
