# A1 : Search Engine (GloVe)

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib
import matplotlib.pyplot as plt
import pickle

In [None]:
# Display the versions of NumPy, PyTorch and Matplotlib in use (handy for reproducing results).
np.__version__, torch.__version__, matplotlib.__version__

## 1. Load Data

In [None]:
# Load the raw text of the NLTK "abc" corpus file rural.txt.
txt_file = './abc/rural.txt'

with open(txt_file, mode='r', encoding='utf-8') as handle:
    text = handle.read()

# A blank line (two consecutive newlines) separates paragraphs, so splitting on
# it gives one list item per paragraph: ['paragraph1', 'paragraph2', ...].
# Surrounding whitespace is trimmed from every item.
paragraphs = []
for chunk in text.split('\n\n'):
    paragraphs.append(chunk.strip())

In [None]:
# Put each paragraph on a single line: every embedded newline becomes a space.
# Slice assignment updates the existing list object in place.
paragraphs[:] = [para.replace('\n', ' ') for para in paragraphs]

### 1.1 word segmentation

In [None]:
# 1. tokenization
# Split on any run of whitespace. `str.split()` with no argument never produces
# empty-string tokens, whereas `split(" ")` emits '' for every doubled space
# (and [''] for an empty paragraph), which would end up in the vocabulary.
corpus = [sent.split() for sent in paragraphs]
corpus

#### Check the original corpus size (documents and tokens)

In [None]:
# Number of documents (one tokenized paragraph each) in the corpus.
len(corpus)

In [None]:
# Total number of tokens, summed over every document in the corpus.
wc = sum(len(doc) for doc in corpus)

In [None]:
# Display the total token count computed in the previous cell.
wc

In [None]:
# Because of machine resource limitations, keep only the first MAX_DOCS
# documents as the working corpus. Change MAX_DOCS to train on more or less data.
MAX_DOCS = 100
corpus = corpus[:MAX_DOCS]

In [None]:
# Number of documents currently in `corpus`.
len(corpus)

In [None]:
# Total token count of the corpus: add up the length of every document.
wc = sum(map(len, corpus))

wc

### 1.2 numerization

In [None]:
# get unique words

def flatten(l):
    """Return one flat list holding every item of every sub-list in `l`."""
    return [item for sublist in l for item in sublist]

# Sort the unique tokens so that the vocabulary order, and therefore every
# index assigned from it, is the same on every run. The order of a plain
# `list(set(...))` depends on string hashing, which Python randomizes per
# process.
vocab = sorted(set(flatten(corpus)))
vocab

In [None]:
# add the <UNK> token to the vocabulary; the membership check keeps this cell
# safe to re-run (a second append would create a duplicate entry and inflate
# the vocabulary size)
if '<UNK>' not in vocab:
    vocab.append('<UNK>')

In [None]:
# numericalization: map every vocabulary word to its position in `vocab`
word2index = dict(zip(vocab, range(len(vocab))))
word2index

In [None]:
# index2word: the inverse lookup of word2index (index -> word)
index2word = {index: word for word, index in word2index.items()}
index2word

## 2. Build Co-occurrence Matrix X

In [None]:
# Co-occurrences of two words are counted within a window; a window size of 2 is used here.

from collections import Counter

# X_i: how many times each individual word occurs in the corpus
X_i = Counter()
for doc in corpus:
    X_i.update(doc)
X_i

In [None]:
# find co-occurrences: collect a (center, outside) pair for every neighbour of
# every word that has a full window of neighbours on both sides

window_size = 2  # number of neighbours taken on each side of the center word

# relative positions of the outside words, e.g. [-2, -1, 1, 2] for window_size = 2
offsets = [offset for offset in range(-window_size, window_size + 1) if offset != 0]

skip_grams = []

for doc in corpus:
    # NOTE: the first and last `window_size` tokens of a document are never
    # used as center words (they only appear as outside words), and a document
    # shorter than 2 * window_size + 1 tokens contributes no pairs at all.
    for i in range(window_size, len(doc) - window_size):
        center = doc[i]

        for offset in offsets:
            skip_grams.append((center, doc[i + offset])) # tuple

In [None]:
# Display the collected (center, outside) pairs.
skip_grams

In [None]:
# X_ik_skipgrams: how many times each (center, outside) pair was collected
X_ik_skipgrams = Counter()
X_ik_skipgrams.update(skip_grams)
X_ik_skipgrams

### Weighting function

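GloVe includes a weighting function $f(X_{ij})$ that down-weights rare co-occurrences and caps the weight of very frequent ones at 1, so that frequent pairs do not dominate the loss.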

In [None]:
def weighting(w_i, w_j, X_ik, x_max=100, alpha=0.75):
    """Return the GloVe weighting-function value for the word pair (w_i, w_j).

    Args:
        w_i: first word of the pair.
        w_j: second word of the pair.
        X_ik: mapping from (word, word) tuples to co-occurrence counts.
        x_max: count at or above which the weight saturates at 1
            (default 100, the value used in the GloVe paper).
        alpha: exponent applied to the scaled count below ``x_max``
            (default 0.75, the value used in the GloVe paper).

    Returns:
        ``(x_ij / x_max) ** alpha`` when the pair's count ``x_ij`` is below
        ``x_max``, otherwise ``1``. A pair missing from ``X_ik`` is treated as
        having a count of 1.
    """
    # Only a missing key means "pair never observed"; any other error (for
    # example X_ik not being a mapping at all) is a real bug and must surface.
    try:
        x_ij = X_ik[(w_i, w_j)]
    except KeyError:
        # not observed: fall back to a count of 1 ("laplace smoothing")
        x_ij = 1

    # below the cap the weight grows with the count; at or above it the weight is 1
    if x_ij < x_max:
        return (x_ij / x_max) ** alpha
    return 1

In [None]:
# co-occurrence matrix

from itertools import combinations_with_replacement

# a dict for keeping the (smoothed) co-occurrence counts of the observed pairs
X_ik = {}

# the weighting-function value of every ordered pair of vocabulary words
weighting_dic = {}

# NOTE: this loop visits every unordered pair of vocabulary words, so its time
# and the size of weighting_dic grow quadratically with the vocabulary size.
# (The per-pair debug print was removed for that reason.)
for bigram in combinations_with_replacement(vocab, 2):
    reverse = (bigram[1], bigram[0])

    # Skip-gram counts are stored per ordered (center, outside) pair, and a
    # pair next to a document boundary can be recorded in one orientation only.
    # Check the orientation yielded here first and fall back to the reversed
    # one, so that no observed pair is missed.
    co = X_ik_skipgrams.get(bigram) or X_ik_skipgrams.get(reverse)

    # if the pair exists in our corpus
    if co:
        X_ik[bigram]  = co + 1 # for stability (unseen pairs count as 1, so seen pairs get +1)
        X_ik[reverse] = co + 1 # symmetric: (apple, banana) == (banana, apple)

    weighting_dic[bigram]  = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[reverse] = weighting(bigram[1], bigram[0], X_ik)

## 3. Prepare train data

In [None]:
import math

In [None]:
def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    """Draw a random mini-batch of skip-gram pairs with their GloVe targets.

    Args:
        batch_size: number of pairs to draw (sampled without replacement, so
            it must not exceed ``len(skip_grams)``).
        word_sequence: unused; kept so existing calls keep working.
        skip_grams: list of (center_word, outside_word) tuples.
        X_ik: mapping from word pairs to co-occurrence counts; a pair that is
            missing is treated as a count of 1.
        weighting_dic: mapping from word pairs to weighting-function values.

    Returns:
        Four NumPy arrays, each built from ``batch_size`` single-element rows:
        center-word ids, outside-word ids, log co-occurrence counts and
        weighting values.

    Raises:
        KeyError: if a sampled word is missing from the module-level
            ``word2index`` or a sampled pair is missing from ``weighting_dic``.
    """
    random_inputs, random_labels, random_coos, random_weightings = [], [], [], []

    # randomly choose indexes based on batch_size
    random_index = np.random.choice(len(skip_grams), batch_size, replace=False)

    for index in random_index:
        pair = skip_grams[index]  # e.g. ('banana', 'fruit')

        # Convert only the sampled pair to ids; converting the whole skip-gram
        # list on every call would cost time proportional to the corpus size
        # for each mini-batch.
        random_inputs.append([word2index[pair[0]]])  # center word, x
        random_labels.append([word2index[pair[1]]])  # outside word, y

        # co-occurrence count (1 when the pair was not recorded), in log space
        cooc = X_ik.get(pair, 1)
        random_coos.append([math.log(cooc)])

        # weighting-function value for the pair
        pair_weight = weighting_dic[pair]
        random_weightings.append([pair_weight])

    return np.array(random_inputs), np.array(random_labels), np.array(random_coos), np.array(random_weightings)

## 4. Model

In [None]:
class Glove(nn.Module):
    """GloVe model: two embedding tables plus a scalar bias per word for each role.

    ``forward`` returns the summed weighted squared error between the model's
    prediction (dot product plus both biases) and the supplied log
    co-occurrence values.
    """

    def __init__(self, voc_size, emb_size):
        super().__init__()
        # word vectors for the center and the outside role
        self.center_embedding = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)

        # one scalar bias per word for each role
        self.center_bias = nn.Embedding(voc_size, 1)
        self.outside_bias = nn.Embedding(voc_size, 1)

    def forward(self, center, outside, cooc, weighting):
        """Return the scalar GloVe loss for one batch.

        The shape comments below assume ``center`` and ``outside`` are
        (batch_size, 1) index tensors and ``cooc`` / ``weighting`` are
        (batch_size, 1) float tensors.
        """
        v_center = self.center_embedding(center)      # (batch_size, 1, emb_size)
        u_outside = self.outside_embedding(outside)   # (batch_size, 1, emb_size)

        b_center = self.center_bias(center).squeeze(1)     # (batch_size, 1)
        b_outside = self.outside_bias(outside).squeeze(1)  # (batch_size, 1)

        # (batch_size, 1, emb_size) @ (batch_size, emb_size, 1) -> (batch_size, 1, 1) -> (batch_size, 1)
        dot = torch.bmm(u_outside, v_center.transpose(1, 2)).squeeze(2)

        residual = dot + b_center + b_outside - cooc
        weighted_sq_error = weighting * torch.pow(residual, 2)

        return torch.sum(weighted_sq_error)

## 5. Training

In [None]:
# Hyper-parameters, model and optimizer used by the training loop.
batch_size     = 10 # mini-batch size
embedding_size = 2 # so we can later plot
voc_size       = len(vocab)

model          = Glove(voc_size, embedding_size)

# NOTE(review): `criterion` is not referenced anywhere else in the visible code;
# the Glove model's forward() already returns the loss that is optimized.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)

In [None]:
# time estimation
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into minutes and seconds.

    Returns a ``(minutes, seconds)`` tuple of ints; both parts are truncated
    toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
import time
loss_ar = []
starttime = time.time()

# Training
# NOTE: each "epoch" here is one optimization step on a single random
# mini-batch, not a full pass over the data.

num_epochs = 5000

for epoch in range(num_epochs):

    # record the starting time for each epoch
    start = time.time()

    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)
    input_batch     = torch.LongTensor(input_batch) #[batch_size, 1]
    target_batch    = torch.LongTensor(target_batch) #[batch_size, 1]
    cooc_batch      = torch.FloatTensor(cooc_batch) #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch) #[batch_size, 1]

    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)

    loss.backward()
    optimizer.step()

    # record the ending time for each epoch
    end = time.time()

    # compute the training time for this epoch
    epoch_mins, epoch_secs = epoch_time(start, end)

    # Record the loss as a plain Python float. Appending the tensor itself
    # would keep every step's tensor (and its reference to the autograd graph)
    # alive for the whole run and fill the history with tensors, not numbers.
    loss_value = loss.item()
    loss_ar.append(loss_value)

    # print loss
    if (epoch + 1) % 1000 == 0:
        print(f"Epoch: {epoch+1:6.0f} | cost: {loss_value:.6f} | time: {epoch_mins}m {epoch_secs}s")


endtime = time.time()
    

In [None]:
# Report the wall-clock duration of the whole training run.
total_training_time = epoch_time(starttime, endtime)
total_mins, total_secs = total_training_time
print(f"total_training_time: {total_mins}m : {total_secs}s")

In [None]:
# Print the loss from the last training step.
print(loss)

## 6. Plotting the embeddings

In [None]:
# let's write a function to get embedding given a word
def get_embed(word):
    """Return the first two embedding coordinates of `word` as an (x, y) tuple.

    The embedding is the average of the word's center and outside vectors from
    the module-level ``model``. A word that is not in ``word2index`` is mapped
    to the '<UNK>' entry when the vocabulary has one.

    Raises:
        KeyError: if `word` is unknown and ``word2index`` has no '<UNK>' entry.
    """
    try:
        index = word2index[word]
    except KeyError:
        if '<UNK>' not in word2index:
            raise
        index = word2index['<UNK>']

    id_tensor = torch.LongTensor([index])

    # a pure lookup: no gradient tracking is needed
    with torch.no_grad():
        v_embed    = model.center_embedding(id_tensor)
        u_embed    = model.outside_embedding(id_tensor)
        word_embed = (v_embed + u_embed) / 2

    x, y = word_embed[0][0].item(), word_embed[0][1].item()

    return x, y

## 7. Cosine similarity

## Save the model

In [None]:
import os

# make sure the output directory exists before writing into it
os.makedirs('./model', exist_ok=True)

# Save the model weights
torch.save(model.state_dict(), './model/A1-Glove.pt')

# save the data needed to rebuild the model and look words up again
Data = {
    'corpus': corpus,
    'vocab': vocab,
    'word2index': word2index,
    'voc_size': voc_size,
    'embedding_size': embedding_size
}

# the context manager closes (and flushes) the file even if pickling fails
with open('./model/Data.pkl', 'wb') as data_file:
    pickle.dump(Data, data_file)
