In [53]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import tqdm
import os

In [9]:
import wikipedia

articles = []
num_texts = 1000
print(wikipedia.random(pages=10))
for title in wikipedia.random(pages=num_texts):
    try:
        page = wikipedia.page(title)
        articles.append(page.content)
    except wikipedia.exceptions.DisambiguationError as e:
        print(e.options)
    except wikipedia.exceptions.PageError as e:
        print(e)
        
print("articles: ", articles)


file_name = 'wiki_output.txt'
with open(file_name, 'wt') as f:
    for article in articles:
        f.write(article + '\n')

In [None]:
file_name = 'wiki_output.txt'
vocab_size = 10000
#1 Tokenize the whole text
import sentencepiece as spm
spm.SentencePieceTrainer.train(input=file_name, model_prefix='wikimodel', vocab_size=vocab_size)

sp=spm.SentencePieceProcessor()
sp.load("wikimodel.model")
with open('wiki_output.txt', 'r') as file:
    # Read the contents of the file into a variable
    file_contents = file.read()
tokens=sp.encode_as_ids(file_contents)

In [11]:
print(tokens)

[2261, 19, 11, 78, 275, 887, 10, 4, 4648, 474, 7, 2740, 145, 3059, 5, 2261, 19, 59, 4, 346, 7, 4, 1056, 31, 13, 53, 4, 887, 5239, 5, 14, 1572, 31, 14, 17, 887, 19, 465, 12, 235, 216, 7820, 2861, 166, 25, 1698, 7, 5036, 610, 2632, 7, 1543, 1015, 283, 183, 3293, 216, 20, 3845, 166, 25, 704, 7, 4, 1559, 7, 4648, 3, 9, 21, 4, 1926, 1301, 7, 4, 3059, 1028, 5, 14, 281, 14, 196, 1203, 8, 372, 2437, 62, 18, 1473, 24, 29, 11, 34, 6971, 5579, 39, 20, 6606, 25, 28, 4, 331, 7, 757, 31, 258, 2091, 5524, 196, 412, 8, 372, 2777, 77, 907, 18, 2794, 12, 13, 34, 6599, 39, 20, 2180, 25, 28, 4, 331, 7, 757, 31, 258, 412, 5711, 7519, 14, 6230, 675, 14, 43, 687, 2261, 70, 11, 356, 7, 1426, 1138, 5932, 2442, 38, 6859, 99, 6516, 6779, 87, 1988, 564, 5500, 748, 51, 87, 3059, 14, 66, 90, 14, 14, 108, 96, 85, 6, 14, 98, 4806, 1390, 195, 7, 2261, 6066, 4008, 7659, 2753, 19, 11, 926, 10, 409, 30, 1235, 10, 4, 1271, 260, 7, 988, 3950, 5, 67, 6, 356, 19, 369, 7495, 14, 66, 90, 14, 5530, 3837, 3659, 6742, 1601, 11, 4

In [19]:
#CBOW Word2Vec model
#Firstly, we need to split the data into examples, e.g. (word, context word) in SKipGram and (context words, word) in CBOW
window=2
pairs=[]
for index in range(window, len(tokens)-window):
    input_tokens=tokens[index-window:index]+tokens[index+1:index+window+1]
    output_token=tokens[index]
    pair=[torch.tensor(input_tokens, dtype=torch.long), torch.tensor(output_token, dtype=torch.long)]
    pairs.append(pair)


In [47]:
class CBOW(nn.Module):
    def __init__(self, embedding_size, vocab_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear = nn.Linear(embedding_size, vocab_size)
        
    def forward(self, context_words):
        embedded=self.embeddings(context_words)
        projection=torch.mean(embedded, dim=0)
        out_layer=self.linear(projection)
        log_probs=nn.functional.log_softmax(out_layer, dim=0)
        return log_probs
            
    def embed(self, word):
        self.eval()
        embedding=self.embeddings(word)
        return embedding

In [48]:
#DEFINE A LOSS FUNCTION
embedding_size = 128

cbow_model = CBOW(embedding_size, vocab_size)  # Assuming CBOWModel is defined as before
cbow_loss_function = nn.NLLLoss()
cbow_optimizer = torch.optim.SGD(cbow_model.parameters(), lr=0.01)

In [55]:
cbow_dataloader = DataLoader(pairs, batch_size=32, shuffle=True)
epochs=5
for epoch in range(epochs):
    total_loss = 0
    with tqdm(cbow_dataloader, unit="batch") as tepoch:
        for context, target in pairs:
            cbow_model.zero_grad()
            log_probs = cbow_model(context)
            loss = cbow_loss_function(log_probs, target)
            loss.backward()
            cbow_optimizer.step()
            total_loss += loss.item()
        
    print(f"Epoch {epoch+1} finished with total loss: {total_loss:.4f}")
    torch.save(cbow_model.state_dict(), os.path.join("cbow_weights", f"cbow_epoch_{epoch+1}.pth"))     

Epoch 1 finished with total loss: 307478.3096
Epoch 2 finished with total loss: 295777.8947
Epoch 3 finished with total loss: 285184.1053
Epoch 4 finished with total loss: 275484.5555
Epoch 5 finished with total loss: 266522.2783
Loss:  266522.2782687312


In [58]:
cbow_model = CBOW(embedding_size, vocab_size)  # Assuming CBOWModel is defined as before
cbow_model.load_state_dict(torch.load(os.path.join("cbow_weights", f"cbow_epoch_{epochs}.pth")))
cbow_model.eval()

#Get the word embeddings
word_embeddings = cbow_model.embeddings.weight.data

#test closest words
def closest_words(embedding, embeddings, n=5):
    distances = torch.norm(embeddings-embedding, dim=1)
    _, indices = torch.topk(distances, n+1, largest=False)
    return indices

word = "king"
word_id = sp.piece_to_id(word)
word_embedding = cbow_model.embeddings(torch.tensor(word_id))
closest = closest_words(word_embedding, word_embeddings)
closest_word = [sp.id_to_piece(idx.item()) for idx in closest]

print(closest_word)

['king', '▁forms', '▁Mei', 'wh', 'was', 'ential']
