# A1 : Search Engine (Skipgram)

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib
import matplotlib.pyplot as plt
import pickle

In [5]:
np.__version__, torch.__version__, matplotlib.__version__

('1.25.2', '2.1.0', '3.7.2')

## 1. Data Loader

For the dataset, I use the rural.txt from NLTK.net.

In [6]:
# corpus = ["apple banana grape orange fruit", "banana apple orange grape fruit", "banana fruit orange grape apple",
        #   "dog cat mouse rabbit animal", "cat mouse rabbit animal dog", "cat dog animal rabbit mouse"]

# read the nltk dataset = rural
txt_file = './abc/rural.txt'

with open(txt_file, 'r', encoding = 'utf-8') as file:
        text = file.read()
        
# Split the dataset into paragraphs based on double line breaks : to get one paragraph in a list item ['paragraph1', 'paragraph']
paragraphs = [paragraph.strip() for paragraph in text.split('\n\n')]

In [7]:
len(paragraphs)

2424

In [8]:
paragraphs[0]

'PM denies knowledge of AWB kickbacks\nThe Prime Minister has denied he knew AWB was paying kickbacks to Iraq despite writing to the wheat exporter asking to be kept fully informed on Iraq wheat sales.\nLetters from John Howard and Deputy Prime Minister Mark Vaile to AWB have been released by the Cole inquiry into the oil for food program.\nIn one of the letters Mr Howard asks AWB managing director Andrew Lindberg to remain in close contact with the Government on Iraq wheat sales.\nThe Opposition\'s Gavan O\'Connor says the letter was sent in 2002, the same time AWB was paying kickbacks to Iraq though a Jordanian trucking company.\nHe says the Government can longer wipe its hands of the illicit payments, which totalled $290 million.\n"The responsibility for this must lay may squarely at the feet of Coalition ministers in trade, agriculture and the Prime Minister," he said.\nBut the Prime Minister says letters show he was inquiring about the future of wheat sales in Iraq and do not prov

In [9]:
for i in range (len(paragraphs)):
    # Replace newline characters with spaces
    paragraphs[i] = paragraphs[i].replace('\n', ' ')

In [10]:
paragraphs[0]

'PM denies knowledge of AWB kickbacks The Prime Minister has denied he knew AWB was paying kickbacks to Iraq despite writing to the wheat exporter asking to be kept fully informed on Iraq wheat sales. Letters from John Howard and Deputy Prime Minister Mark Vaile to AWB have been released by the Cole inquiry into the oil for food program. In one of the letters Mr Howard asks AWB managing director Andrew Lindberg to remain in close contact with the Government on Iraq wheat sales. The Opposition\'s Gavan O\'Connor says the letter was sent in 2002, the same time AWB was paying kickbacks to Iraq though a Jordanian trucking company. He says the Government can longer wipe its hands of the illicit payments, which totalled $290 million. "The responsibility for this must lay may squarely at the feet of Coalition ministers in trade, agriculture and the Prime Minister," he said. But the Prime Minister says letters show he was inquiring about the future of wheat sales in Iraq and do not prove the G

### tokenization

In [11]:
# 1. tokenization
corpus = [sent.split(" ") for sent in paragraphs]
corpus

[['PM',
  'denies',
  'knowledge',
  'of',
  'AWB',
  'kickbacks',
  'The',
  'Prime',
  'Minister',
  'has',
  'denied',
  'he',
  'knew',
  'AWB',
  'was',
  'paying',
  'kickbacks',
  'to',
  'Iraq',
  'despite',
  'writing',
  'to',
  'the',
  'wheat',
  'exporter',
  'asking',
  'to',
  'be',
  'kept',
  'fully',
  'informed',
  'on',
  'Iraq',
  'wheat',
  'sales.',
  'Letters',
  'from',
  'John',
  'Howard',
  'and',
  'Deputy',
  'Prime',
  'Minister',
  'Mark',
  'Vaile',
  'to',
  'AWB',
  'have',
  'been',
  'released',
  'by',
  'the',
  'Cole',
  'inquiry',
  'into',
  'the',
  'oil',
  'for',
  'food',
  'program.',
  'In',
  'one',
  'of',
  'the',
  'letters',
  'Mr',
  'Howard',
  'asks',
  'AWB',
  'managing',
  'director',
  'Andrew',
  'Lindberg',
  'to',
  'remain',
  'in',
  'close',
  'contact',
  'with',
  'the',
  'Government',
  'on',
  'Iraq',
  'wheat',
  'sales.',
  'The',
  "Opposition's",
  'Gavan',
  "O'Connor",
  'says',
  'the',
  'letter',
  'was',
 

In [None]:
# limit the corpus only accept 100 documents
corpus = corpus[:100]

In [None]:
# check the total word count in my corpus
wc = 0
for i in range(len(corpus)):
    wc += len(corpus[i])

In [12]:
corpus[0]

['PM',
 'denies',
 'knowledge',
 'of',
 'AWB',
 'kickbacks',
 'The',
 'Prime',
 'Minister',
 'has',
 'denied',
 'he',
 'knew',
 'AWB',
 'was',
 'paying',
 'kickbacks',
 'to',
 'Iraq',
 'despite',
 'writing',
 'to',
 'the',
 'wheat',
 'exporter',
 'asking',
 'to',
 'be',
 'kept',
 'fully',
 'informed',
 'on',
 'Iraq',
 'wheat',
 'sales.',
 'Letters',
 'from',
 'John',
 'Howard',
 'and',
 'Deputy',
 'Prime',
 'Minister',
 'Mark',
 'Vaile',
 'to',
 'AWB',
 'have',
 'been',
 'released',
 'by',
 'the',
 'Cole',
 'inquiry',
 'into',
 'the',
 'oil',
 'for',
 'food',
 'program.',
 'In',
 'one',
 'of',
 'the',
 'letters',
 'Mr',
 'Howard',
 'asks',
 'AWB',
 'managing',
 'director',
 'Andrew',
 'Lindberg',
 'to',
 'remain',
 'in',
 'close',
 'contact',
 'with',
 'the',
 'Government',
 'on',
 'Iraq',
 'wheat',
 'sales.',
 'The',
 "Opposition's",
 'Gavan',
 "O'Connor",
 'says',
 'the',
 'letter',
 'was',
 'sent',
 'in',
 '2002,',
 'the',
 'same',
 'time',
 'AWB',
 'was',
 'paying',
 'kickbacks',
 

### numericalization

In [13]:
# get unique word

# list comprehension for getting words
flatten = lambda l: [item for sublist in l for item in sublist]

# getting unique word and store as a list
vocab = list(set(flatten(corpus)))
vocab

['',
 'reap',
 "'not",
 'walks',
 'rise:',
 'concern.',
 '130,000',
 '$4.00',
 'Changing',
 'favour.',
 'farms,"',
 'Rivett',
 'terms,',
 'Kaniva',
 'imported.',
 '"Frankly,',
 'confusing',
 "supermarket's",
 'all,"',
 'shot.',
 'teacher',
 'rising,',
 'Dyason',
 'regionality',
 'percent',
 'conform',
 'infected',
 'forecaster',
 'aspect,"',
 'urban/rural',
 "Redoutes'",
 'excessive',
 'founded',
 'three-month',
 'Conservation',
 'governor',
 'politicians,',
 'chairman,',
 'Midlands',
 'four-wheel',
 'ocean,',
 'suspect',
 'racketeering,',
 'Organised',
 'liberation',
 'Lack',
 'expected,"',
 'finding,',
 'Christian',
 'reprieve',
 'Holland-Kennedy',
 'solar,',
 'Newfishing',
 'mountain',
 'Kalgoorlie',
 'vetoes',
 'so-forth',
 "y'know,",
 'stringently',
 'voluntarily',
 'Fires',
 'intercepted',
 'efficient,"',
 'sick',
 'embroiled',
 "'botched'",
 'restaurant',
 '24-year-old',
 'Canna',
 'neglect',
 'Lockshin,',
 'argues',
 'General',
 'staff,',
 "Creasy's",
 'First',
 'lap',
 'emerge

In [14]:
# len(vocab)

In [15]:
# add <UNK> to a dictionary vocab
vocab.append('<UNK>')

In [16]:
# numericalization: assign index to each word
word2index = {w:idx for idx, w in enumerate(vocab)}
word2index

{'': 0,
 'reap': 1,
 "'not": 2,
 'walks': 3,
 'rise:': 4,
 'concern.': 5,
 '130,000': 6,
 '$4.00': 7,
 'Changing': 8,
 'favour.': 9,
 'farms,"': 10,
 'Rivett': 11,
 'terms,': 12,
 'Kaniva': 13,
 'imported.': 14,
 '"Frankly,': 15,
 'confusing': 16,
 "supermarket's": 17,
 'all,"': 18,
 'shot.': 19,
 'teacher': 20,
 'rising,': 21,
 'Dyason': 22,
 'regionality': 23,
 'percent': 24,
 'conform': 25,
 'infected': 26,
 'forecaster': 27,
 'aspect,"': 28,
 'urban/rural': 29,
 "Redoutes'": 30,
 'excessive': 31,
 'founded': 32,
 'three-month': 33,
 'Conservation': 34,
 'governor': 35,
 'politicians,': 36,
 'chairman,': 37,
 'Midlands': 38,
 'four-wheel': 39,
 'ocean,': 40,
 'suspect': 41,
 'racketeering,': 42,
 'Organised': 43,
 'liberation': 44,
 'Lack': 45,
 'expected,"': 46,
 'finding,': 47,
 'Christian': 48,
 'reprieve': 49,
 'Holland-Kennedy': 50,
 'solar,': 51,
 'Newfishing': 52,
 'mountain': 53,
 'Kalgoorlie': 54,
 'vetoes': 55,
 'so-forth': 56,
 "y'know,": 57,
 'stringently': 58,
 'volunta

In [17]:
# index2word
index2word = {k:v for v,k in word2index.items()}
index2word

{0: '',
 1: 'reap',
 2: "'not",
 3: 'walks',
 4: 'rise:',
 5: 'concern.',
 6: '130,000',
 7: '$4.00',
 8: 'Changing',
 9: 'favour.',
 10: 'farms,"',
 11: 'Rivett',
 12: 'terms,',
 13: 'Kaniva',
 14: 'imported.',
 15: '"Frankly,',
 16: 'confusing',
 17: "supermarket's",
 18: 'all,"',
 19: 'shot.',
 20: 'teacher',
 21: 'rising,',
 22: 'Dyason',
 23: 'regionality',
 24: 'percent',
 25: 'conform',
 26: 'infected',
 27: 'forecaster',
 28: 'aspect,"',
 29: 'urban/rural',
 30: "Redoutes'",
 31: 'excessive',
 32: 'founded',
 33: 'three-month',
 34: 'Conservation',
 35: 'governor',
 36: 'politicians,',
 37: 'chairman,',
 38: 'Midlands',
 39: 'four-wheel',
 40: 'ocean,',
 41: 'suspect',
 42: 'racketeering,',
 43: 'Organised',
 44: 'liberation',
 45: 'Lack',
 46: 'expected,"',
 47: 'finding,',
 48: 'Christian',
 49: 'reprieve',
 50: 'Holland-Kennedy',
 51: 'solar,',
 52: 'Newfishing',
 53: 'mountain',
 54: 'Kalgoorlie',
 55: 'vetoes',
 56: 'so-forth',
 57: "y'know,",
 58: 'stringently',
 59: 'vol

## 2. Preparation for the train data

In [18]:
def random_batch(batch_size, corpus):

    # define a list for storing [center,outside] pair
    skipgrams = []

    # loop each word sequence
    for sent in corpus:
        
        for i in range(2, len(sent)-2):
            
            # assign center word
            center_word = word2index[sent[i]]
            
            # assign outside word=4 (ws = 2)
            outside_word = [word2index[sent[i-2]], word2index[sent[i-1]], word2index[sent[i+1]], word2index[sent[i+2]]]
            
            # for each of these two outside words, we gonna pair (center,outside) and append to a list
            for each_outside in outside_word:
                skipgrams.append([center_word, each_outside])
                
    # randomly select 2 pair among the data
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace = False)
    
    random_inputs = []
    random_labels = []
    
    for i in random_index:
        random_inputs.append([skipgrams[i][0]]) # center_word
        random_labels.append([skipgrams[i][1]]) # outside_word
        
    return np.array(random_inputs), np.array(random_labels)

#### testing

In [19]:
batch_size = 4
input_batch, label_batch = random_batch(4, corpus)
input_batch, label_batch

(array([[ 6467],
        [23143],
        [ 7553],
        [ 9752]]),
 array([[16588],
        [ 5535],
        [11764],
        [16990]]))

## 3. Model

### 3.1 Skipgram model

In [20]:
class Skipgram (nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center_word, outside_word, all_vocabs):
        
        # embedding for all: result as a vector for each 
        center_embed  = self.embedding_center(center_word)   # (batch_size, 1, emb_size)
        outside_embed = self.embedding_outside(outside_word) # (batch_size, 1, emb_size)
        all_embed     = self.embedding_outside(all_vocabs)   # (batch_size, voc_size, emb_size)
        
        # write the equation
        top_term = torch.exp(outside_embed.bmm(center_embed.transpose(1,2)).squeeze(2))
        # (batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (bacth_size, 1)
        
        lower_term = all_embed.bmm(center_embed.transpose(1,2)).squeeze(2)
        # (batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size)
        lower_term_sum = torch.sum(torch.exp(lower_term), 1)
        # (batch_size, 1)
        
        loss = - torch.mean(torch.log(top_term / lower_term_sum))
        
        return loss

#### test the skipgram model

In [21]:
voc_size = len(vocab)
emb_size = 2
model = Skipgram(voc_size, emb_size)
model

Skipgram(
  (embedding_center): Embedding(26911, 2)
  (embedding_outside): Embedding(26911, 2)
)

In [22]:
batch_size = 2
x,y = random_batch(batch_size, corpus)

In [23]:
x,y

(array([[13195],
        [20704]]),
 array([[4630],
        [7033]]))

In [24]:
x.shape, y.shape

((2, 1), (2, 1))

In [25]:
input_tensor = torch.LongTensor(x)
label_tensor = torch.LongTensor(y)

In [26]:
# prepare for all_vocabs
def prepare_sequence (seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocab), word2index).expand(batch_size, len(vocab)) #[batch_size, voc_size]

In [27]:
all_vocabs = prepare_sequence(list(vocab), word2index).expand(batch_size, len(vocab)) #[batch_size, voc_size]

In [28]:
all_vocabs

tensor([[    0,     1,     2,  ..., 26908, 26909, 26910],
        [    0,     1,     2,  ..., 26908, 26909, 26910]])

In [29]:
loss = model(input_tensor, label_tensor, all_vocabs)
loss

tensor(9.8213, grad_fn=<NegBackward0>)

### 3.2 Skip-gram negative sampling model

### 3.3 GloVe model

### 3.4 GloVe (Gensim) model

## 4.Train

In [30]:
voc_size       = len(vocab) # total vocab size
batch_size     = 2 # mini-batch size
embedding_size = 2

skipgram_model = Skipgram(voc_size, embedding_size)

skipgram_optimizer = optim.Adam(skipgram_model.parameters(), lr = 0.001)

In [31]:
# prepare for all_vocabs
def prepare_sequence (seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocab), word2index).expand(batch_size, len(vocab)) #[batch_size, voc_size]

In [32]:
print(all_vocabs)
print(all_vocabs.shape)

tensor([[    0,     1,     2,  ..., 26908, 26909, 26910],
        [    0,     1,     2,  ..., 26908, 26909, 26910]])
torch.Size([2, 26911])


In [33]:
# for recording the training time for each epoch
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time # get the total taken timestamp
    elapsed_mins = int(elapsed_time / 60) # get the min
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60)) # get the sec
    return elapsed_mins, elapsed_secs

In [34]:
# training
import time
loss_ar = []

starttime = time.time()

num_epochs = 3
# num_epochs = 50

# loop for each epoch
for epoch in range(num_epochs):
    
    # record the start time
    start = time.time()
    
    # get the random training batch
    input_batch, output_batch = random_batch(batch_size, corpus)
    
    # to Tensor for embedding purpose
    input_tensor = torch.LongTensor(input_batch)   #[batch_size, 1]
    output_tensor = torch.LongTensor(output_batch) #[batch_size, 1]
    
    # train
    skipgram_optimizer.zero_grad()
    loss = skipgram_model(input_tensor, output_tensor, all_vocabs)
    
    # update loss
    loss.backward()
    skipgram_optimizer.step()
    
    # record the end time
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)
    
    # record loss
    loss_ar.append(loss)
    
    if (epoch+1) % 1 == 0:
        print(f"Epoch: {epoch+1} | loss: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")
        
endtime = time.time()

Epoch: 1 | loss: 10.465573 | time: 0m 0s
Epoch: 2 | loss: 11.204954 | time: 0m 0s
Epoch: 3 | loss: 9.780531 | time: 0m 0s


In [35]:
total_training_time = epoch_time(starttime,endtime)
print(f"total_training_time: {total_training_time[0]}m : {total_training_time[1]}s")

total_training_time: 0m : 1s


In [36]:
print(loss)

tensor(9.7805, grad_fn=<NegBackward0>)


## 5.Plotting the embeddings

In [37]:
# function to get the embedding given a word
def get_embed(word):
    id_tensor = torch.LongTensor([word2index[word]])
    c_embed = skipgram_model.embedding_center(id_tensor)
    o_embed = skipgram_model.embedding_outside(id_tensor)
    word_embed = (c_embed + o_embed) / 2
    x,y = word_embed[0][0].item(), word_embed[0][1].item()
    
    return x,y

In [38]:
# plt.figure(figsize=(6,3))

# # loop each unique vocab
# for i, word in enumerate(vocab):
#     x,y = get_embed(word)
#     plt.scatter(x,y)
#     # plt.annotate(word, xy=(x,y), xytest =(5,2), textcoords='offset points')
#     plt.annotate(word, xy = (x,y), xytext = (5,2), textcoords='offset points')
    
# plt.show()

## 6.Cosine similarity

## 7. Save the Model

In [None]:
# Save the model
torch.save(model.state_dict(), './model/A1-Skipgram.pt')

NameError: name 'torch' is not defined

In [None]:
# save the data
Data = {
    'corpus': corpus,
    'vocab': vocab,
    'word2index': word2index,
    'voc_size': voc_size,
    'embedding_size': embedding_size
}

In [None]:
pickle.dump(Data,open('./model/Data.pkl', 'wb'))