In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

#### 1. Load Data

In [2]:
import nltk
from nltk.corpus import reuters

nltk.download('reuters')

[nltk_data] Downloading package reuters to /home/koala/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [3]:
# Getting all file IDs from the Reuters corpus
file_ids = reuters.fileids()

# Creating a corpus where each document is a separate item
reuter = [reuters.raw(fileid) for fileid in file_ids]

# Print the first document
print("First Document:")
print(reuter[0])

print("\n")

# Print the second document
print("Second Document:")
print(reuter[1])

print(len(reuter))

First Document:
ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT
  Mounting trade friction between the
  U.S. And Japan has raised fears among many of Asia's exporting
  nations that the row could inflict far-reaching economic
  damage, businessmen and officials said.
      They told Reuter correspondents in Asian capitals a U.S.
  Move against Japan might boost protectionist sentiment in the
  U.S. And lead to curbs on American imports of their products.
      But some exporters said that while the conflict would hurt
  them in the long-run, in the short-term Tokyo's loss might be
  their gain.
      The U.S. Has said it will impose 300 mln dlrs of tariffs on
  imports of Japanese electronics goods on April 17, in
  retaliation for Japan's alleged failure to stick to a pact not
  to sell semiconductors on world markets at below cost.
      Unofficial Japanese estimates put the impact of the tariffs
  at 10 billion dlrs and spokesmen for major electronics firms
  said they would virtua

In [4]:
# tokenization
corpus = reuters.sents()
corpus = corpus[0:2500] 

# flattening
flatten = lambda l: [item for sublist in l for item in sublist]

# unique words
vocabs = list(set(flatten(corpus))) 
print(len(vocabs))

# mapping between word and integer
word2index = {v: idx for idx, v in enumerate(vocabs)}
print(len(word2index))

8557
8557


In [5]:
total_word = 0
for sen in corpus:
    for w in sen:
        total_word += 1
total_word

75698

In [6]:
# if unkown word is seen, return 'UNK'
vocabs.append('<UNK>')

In [7]:
word2index['<UNK>'] = 8557

In [8]:
len(vocabs)

8558

In [9]:
# just in case we need to use
index2word = {v: k for k,v in word2index.items()}

In [10]:
index2word[8557]

'<UNK>'

In [11]:
len(vocabs)

8558

#### 2 . Training data

In [12]:
i = 0
for c in corpus:
    print(c)
    # Stop after ten
    i+=1
    if i > 10:
        break

['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'Asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.']
['They', 'told', 'Reuter', 'correspondents', 'in', 'Asian', 'capitals', 'a', 'U', '.', 'S', '.', 'Move', 'against', 'Japan', 'might', 'boost', 'protectionist', 'sentiment', 'in', 'the', 'U', '.', 'S', '.', 'And', 'lead', 'to', 'curbs', 'on', 'American', 'imports', 'of', 'their', 'products', '.']
['But', 'some', 'exporters', 'said', 'that', 'while', 'the', 'conflict', 'would', 'hurt', 'them', 'in', 'the', 'long', '-', 'run', ',', 'in', 'the', 'short', '-', 'term', 'Tokyo', "'", 's', 'loss', 'might', 'be', 'their', 'gain', '.']
['The', 'U', '.', 'S', '.', 'Has', 'said', 'it', 'will', 'impo

In [13]:
#window size 2

#create pairs of center word, and outside word
def random_batch(batch_size, corpus):

    skipgrams = []

    #loop each corpus
    for doc in corpus:
        #look from the 3rd word until third last word
        for i in range(2, len(doc)-2):
            #center word
            center = word2index[doc[i]]
            #outside words = 2 words
            outside = (word2index[doc[i-2]], word2index[doc[i-1]], word2index[doc[i+1]], word2index[doc[i+2]])
            #for each of these four outside words, we gonna append to a list
            for each_out in outside:
                skipgrams.append([center, each_out])
                #center, outside1;   center, outside2
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)
            
x, y = random_batch(2, corpus)

In [14]:
print(x.shape)
print(y.shape)

(2, 1)
(2, 1)


#### 3. Model

In [15]:
all_vocabs = len(vocabs)
emb_size = 30

In [16]:
embedding = nn.Embedding(all_vocabs, emb_size)

In [17]:
a = torch.LongTensor([1])

embedding(a).shape

torch.Size([1, 30])

In [18]:
x_tensor = torch.LongTensor(x)

embedding(x_tensor).shape

torch.Size([2, 1, 30])

In [19]:
print(embedding(x_tensor))

tensor([[[-6.2948e-01, -1.2965e+00, -2.9147e-01, -1.3440e-01,  3.7328e-01,
          -8.2109e-01,  1.2141e+00,  6.8356e-01, -1.5652e+00,  1.4925e+00,
           1.3388e-01,  1.3843e+00,  1.5201e+00,  3.9130e-01, -5.9786e-01,
          -9.7921e-01, -4.4298e-02, -2.1049e-01,  1.3406e+00,  4.6913e-01,
          -3.8105e-02, -1.8151e+00,  1.2973e+00,  1.3796e+00, -1.6454e+00,
          -1.2017e-01,  1.1927e+00,  3.7965e-02, -1.4467e-01, -7.0912e-01]],

        [[-6.2999e-01,  1.0493e+00,  5.6145e-01,  1.3875e+00,  1.0629e+00,
           6.1920e-01, -2.5575e+00, -1.4297e+00, -2.1945e+00,  3.7377e-02,
           6.1118e-01,  3.3675e-01, -1.9051e+00, -1.2329e-01,  8.9478e-01,
          -1.9199e+00, -2.1698e+00, -8.0637e-01, -1.2002e+00, -1.7894e+00,
          -1.2800e+00,  1.0466e-01,  5.6365e-01,  1.9682e-03, -3.7445e-02,
           1.4580e+00,  9.0957e-01, -4.1888e-01,  6.4385e-01, -3.7139e-01]]],
       grad_fn=<EmbeddingBackward0>)


In [20]:
class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center)  #(batch_size, 1, emb_size)
        outside_embedding    = self.embedding_center(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_center(all_vocabs) #(batch_size, voc_size, emb_size)
        
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        #batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1) 

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        #batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size) 
        
        lower_term_sum = torch.sum(torch.exp(lower_term), 1)  #(batch_size, 1)

        loss = -torch.mean(torch.log(top_term / lower_term_sum))  #scalar
        
        return loss

In [21]:
batch_size = 2
voc_size   = len(vocabs)

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size)
all_vocabs

tensor([[   0,    1,    2,  ..., 8555, 8556, 8557],
        [   0,    1,    2,  ..., 8555, 8556, 8557]])

In [22]:
model = Skipgram(voc_size, emb_size)
model

Skipgram(
  (embedding_center): Embedding(8558, 30)
  (embedding_outside): Embedding(8558, 30)
)

In [23]:
input_tensor = torch.LongTensor(x)
label_tensor = torch.LongTensor(y)
print(input_tensor.shape)
print(label_tensor.shape)
print(all_vocabs.shape)

torch.Size([2, 1])
torch.Size([2, 1])
torch.Size([2, 8558])


In [24]:
# testing
loss = model(input_tensor, label_tensor, all_vocabs)

In [25]:
loss

tensor(24.3161, grad_fn=<NegBackward0>)

#### 4. Training

In [26]:
model      = Skipgram(voc_size, emb_size)
optimizer  = optim.Adam(model.parameters(), lr=0.001)

In [27]:
import time
import torch

# Check if GPU with CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move your model to the chosen device
model.to(device)

num_epochs = 1000
print_interval = 100  # Interval to print the loss

# Start the timer
start_time = time.time()

for epoch in range(num_epochs):
    
    # Get batch
    input_batch, label_batch = random_batch(batch_size, corpus)
    
    # Move data to the GPU
    input_tensor = torch.LongTensor(input_batch).to(device)
    label_tensor = torch.LongTensor(label_batch).to(device)
    all_vocabs = all_vocabs.to(device)
    
    # Predict
    loss = model(input_tensor, label_tensor, all_vocabs)
    
    # Backpropagate
    optimizer.zero_grad()
    loss.backward()
    
    # Update weights
    optimizer.step()
    
    # Print the loss every 100 epochs
    if (epoch + 1) % print_interval == 0:
        current_time = time.time()
        elapsed_time = current_time - start_time
        mins, secs = divmod(elapsed_time, 60)
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss.item():2.6f} | Time: {int(mins):02d}m {int(secs):02d}s")

# End of training
end_time = time.time()
total_time = end_time - start_time
total_mins, total_secs = divmod(total_time, 60)
print(f"Training completed in {int(total_mins):02d}m {int(total_secs):02d}s")

Using device: cuda
Epoch    100 | Loss: 35.472145 | Time: 00m 28s
Epoch    200 | Loss: 33.157692 | Time: 00m 55s
Epoch    300 | Loss: 31.314760 | Time: 01m 23s
Epoch    400 | Loss: 32.280010 | Time: 01m 51s
Epoch    500 | Loss: 31.401640 | Time: 02m 19s
Epoch    600 | Loss: 26.410089 | Time: 02m 47s
Epoch    700 | Loss: 20.925999 | Time: 03m 16s
Epoch    800 | Loss: 31.999252 | Time: 03m 45s
Epoch    900 | Loss: 29.734051 | Time: 04m 14s
Epoch   1000 | Loss: 35.739899 | Time: 04m 43s
Training completed in 04m 43s


In [33]:
# save the model
torch.save(model.state_dict(), 'models/skipgram.pth')

In [37]:
Data = {
    'vocab': vocabs,
    'word2index': word2index,
    'voc_size': voc_size,
    'emb_size': emb_size
}

In [38]:
import pickle
pickle.dump(Data,open('./data/data.pkl', 'wb'))