In [1]:

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Lowercase
import torch

In [2]:
samp1 = "I code want to code python"
samp2 = "I don't know how to code worldable"
corpus = [samp1, samp2]

# Special tokens and the fixed sequence length are named once so that the model,
# the trainer and the padding/truncation settings cannot drift apart.
UNK_TOKEN = "<unk>"
PAD_TOKEN = "<pad>"
MAX_LEN = 5

# Word-level model: each pre-tokenized piece is looked up as a whole token;
# pieces that are not in the vocabulary are mapped to UNK_TOKEN.
tokenizer = Tokenizer(WordLevel(unk_token=UNK_TOKEN)) # type: ignore
tokenizer.normalizer = Lowercase() # type: ignore

# Split the raw text into words and punctuation runs before the vocabulary lookup.
tokenizer.pre_tokenizer = Whitespace() # type: ignore

# Train on the corpus. The vocabulary is capped at vocab_size entries (special
# tokens included); when the corpus has more distinct words than fit, words are
# kept by frequency of appearance.
trainer = WordLevelTrainer(vocab_size=8,
                           special_tokens=[UNK_TOKEN, PAD_TOKEN])
tokenizer.train_from_iterator(corpus, trainer)

# Padding is configured after training so that pad_id is read from the learned
# vocabulary instead of being hard-coded; it stays correct even if the
# special_tokens list above is reordered.
tokenizer.enable_padding(pad_id=tokenizer.token_to_id(PAD_TOKEN),
                         pad_token=PAD_TOKEN,
                         length=MAX_LEN)  # shorter sequences are padded up to MAX_LEN
tokenizer.enable_truncation(max_length=MAX_LEN)  # longer sequences are cut to MAX_LEN

In [3]:
# get_vocab() returns the token -> id mapping in arbitrary order, so print it
# sorted by id: the displayed vocabulary is then stable across runs and the
# special tokens appear first.
vocab = tokenizer.get_vocab()
print(dict(sorted(vocab.items(), key=lambda item: item[1])))

{'<unk>': 0, 'how': 7, 'i': 3, 'code': 2, 'don': 6, '<pad>': 1, "'": 5, 'to': 4}


In [4]:
# Encode one sentence and show its tokens followed by the matching ids.
samp1 = "I code want to code python"
output = tokenizer.encode(samp1)
print(output.tokens, output.ids, sep="\n")

['i', 'code', '<unk>', 'to', 'code']
[3, 2, 0, 4, 2]


### Embedding

In [5]:
import torch.nn as nn

# Seed the RNG so the randomly initialised embedding table printed below is
# identical after every Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

vocab_size = 8
embed_dim = 5
# Lookup table holding one trainable vector of size embed_dim per token id.
# A larger embed_dim gives the model more capacity; it does not by itself
# guarantee better accuracy.
embedding = nn.Embedding(vocab_size, embed_dim)
print(embedding)
print(embedding.weight)

Embedding(8, 5)
Parameter containing:
tensor([[ 0.6999, -0.2689, -1.1578, -0.5795,  0.8071],
        [-0.9599,  0.7591,  0.2670,  1.1619, -0.5097],
        [-0.1164, -1.7598,  0.1754, -0.3817,  0.4707],
        [ 0.4648,  0.7810,  0.5419,  1.5069,  0.8053],
        [-2.1672, -0.2699,  1.1021,  0.7687,  1.3596],
        [-2.3238, -1.7918, -0.3241,  0.9930, -0.7920],
        [-1.9819, -1.1270,  0.3325, -1.0603, -0.4810],
        [ 0.9047, -1.5250, -1.3843, -2.4264,  1.1843]], requires_grad=True)


### Create RNN & MLP
+ RNN: to understand context from the encoded text
+ MLP: for classification

In [8]:
import torch.nn.functional as F

class MLP(nn.Module):
    """Embedding followed by a two-layer perceptron over a fixed-length sequence.

    Token ids are embedded, the per-token vectors are flattened into a single
    vector of size ``seq_length * embed_dim`` and passed through
    ``Linear -> ReLU -> Linear``.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each token embedding.
        seq_length: number of tokens per input sequence; inputs must have
            exactly this many tokens because the first linear layer has a
            fixed input size.
        hidden_size: width of the hidden linear layer.
        output_size: number of output values per sequence.
    """

    def __init__(self, vocab_size, embed_dim, seq_length, hidden_size, output_size) -> None:
        super().__init__()
        self.flatten_dim = seq_length * embed_dim  # input size of the first Linear layer

        # The container must not be named `forward`: nn.Module stores submodules
        # in `_modules`, so `self.forward` would still resolve to the method
        # below and calling the model would recurse until RecursionError.
        self.net = nn.Sequential(
            nn.Embedding(vocab_size, embed_dim),
            nn.Flatten(),
            nn.Linear(self.flatten_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        )

    def forward(self, x):
        """Run a batch of token ids of shape (batch, seq_length) through the network.

        Returns a tensor of shape (batch, output_size).
        """
        return self.net(x)

# Hyper-parameters for the classifier. seq_length is 5, the same value the
# tokenizer pads/truncates every sequence to.
vocab_size, embed_dim = 8, 5
seq_length = 5
hidden_size, output_size = 16, 2

model = MLP(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    seq_length=seq_length,
    hidden_size=hidden_size,
    output_size=output_size,
)
print(model)

MLP(
  (forward): Sequential(
    (0): Embedding(8, 5)
    (1): Flatten(start_dim=1, end_dim=-1)
    (2): Linear(in_features=25, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=2, bias=True)
  )
)


Note: plot the points to see how they are distributed in space.

In [None]:
# Training setup: plain SGD over every parameter of the model, with
# cross-entropy as the loss function.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.1)
loss = nn.CrossEntropyLoss()