In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x1eddcc1b7b0>

In [2]:
torch.__version__

'2.1.2+cu121'

In [3]:
with open("../data/sample_text.txt", "r") as file:
    raw_text = file.read().split()

In [4]:
CONTEXT_SIZE = 2 # 2 từ bên trái, 2 từ bên phải

vocab = set(raw_text)
vocab_size = len(vocab)

word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []

# mất các chữ đầu và cuối
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = ([raw_text[i - j - 1] for j in range(CONTEXT_SIZE)] +
               [raw_text[i + j + 1] for j in range(CONTEXT_SIZE)])
    
    target = raw_text[i]
    data.append((context, target))

print(data[:5])


[(['Great', 'The', 'Reef', 'is'], 'Barrier'), (['Barrier', 'Great', 'is', 'the'], 'Reef'), (['Reef', 'Barrier', 'the', "world's"], 'is'), (['is', 'Reef', "world's", 'largest'], 'the'), (['the', 'is', 'largest', 'coral'], "world's")]


In [5]:
class CBOWModeler(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(CBOWModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size*embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)
        
    def forward(self, x):
        embeds = self.embeddings(x).view(1,-1)
        output = self.linear1(embeds)
        output = F.relu(output)
        output = self.linear2(output)
        log_probs = F.log_softmax(output, dim=1)
        return log_probs

In [6]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)


make_context_vector(data[0][0], word_to_ix)

tensor([ 1, 18, 41,  7])