In [1]:
import re
import tqdm
import torch
import pickle
import numpy as np
import pandas as pd
from datasets import load_dataset
from typing import Any, Iterable, Callable
from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizer, PreTrainedTokenizerFast, pipeline

# Training a BPE for PT-PT

Using the data we already collected for the benchmarks, we train a BPE tokenizer on it.

In [None]:
import sys
# Put the parent directory on the import path so the local `src` package
# can be imported from this notebook's location.
sys.path.insert(1, '../')

from src import BENCHMARKS, BPE

# Training corpus taken from the benchmarks, requested in 'list' form
# (presumably a list of document strings -- confirm against `src`).
document_data = BENCHMARKS.get_training_data('list')

# Custom BPE implementation capped at a 1,000-entry vocabulary.
encodings = BPE(document_data, vocab_size=1_000)
encodings.train()

In [None]:
from tokenizers import Tokenizer, trainers, models, pre_tokenizers

# Reference byte-level BPE from the `tokenizers` library, trained on the same
# corpus as the custom implementation so their outputs can be compared.
# NOTE: the name `tokenizer` is reassigned further down when the pretrained
# model tokenizer is loaded; this object is only used inside this cell.
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

trainer = trainers.BpeTrainer(vocab_size=1_000)

# Generate the file for the tokenizer to train on.
# - Explicit UTF-8: the corpus is Portuguese (accented characters), and the
#   platform default encoding is not guaranteed to be UTF-8.
# - One newline per document: `writelines` adds no separators on its own, so
#   the end of one document would otherwise be glued to the start of the next.
with open('trainer.txt', 'w', encoding='utf-8') as f:
    f.writelines(
        document if document.endswith('\n') else document + '\n'
        for document in document_data
    )

tokenizer.train(files=['trainer.txt'], trainer=trainer)
tokenizer.encode('Olá o meu nome é Duarte').tokens

In [None]:
# Round-trip a sample sentence through the custom BPE: show the input text,
# then the tokens its ids map back to.
raw_text = 'Olá o meu nome é Duarte'
print(raw_text)

sample_ids = encodings.tokenize(raw_text)
sample_tokens = encodings.from_id_to_tokens(sample_ids, byte_decode=True)
print(sample_tokens)

# Finding tokens not in the model tokenizer

In [None]:
# Hub identifier of the pretrained checkpoint; both the model and its
# tokenizer are loaded from it.
MODEL = 'HuggingFaceTB/SmolLM2-135M'
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without a visible CUDA device instead of failing on `.to('cuda')`.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Weights are loaded from safetensors in bfloat16 and moved to DEVICE.
model = AutoModelForCausalLM.from_pretrained(MODEL, use_safetensors=True, torch_dtype=torch.bfloat16).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
# Show the vocabulary size and a handful of (token -> id) entries. Displaying
# the whole mapping floods the cell output and bloats the saved notebook.
model_vocab = tokenizer.vocab
len(model_vocab), dict(list(model_vocab.items())[:10])

In [None]:
# Token ids the model tokenizer produces for ' Augusta' (leading space included).
tokenizer.encode(' Augusta')

In [None]:
# Direct lookup of the id stored under the vocabulary key 'ĠAugusta'
# ('Ġ' is how this tokenizer spells a leading space); raises KeyError if
# no such entry exists.
tokenizer.vocab['ĠAugusta']

In [None]:
def _byte_level_alphabet():
    """Build the byte -> printable-character table of GPT-2 style byte-level BPE.

    Printable Latin-1 bytes map to themselves; every other byte (control
    characters, space, ...) is shifted to code point 256 and above, which is
    why a space (0x20) is spelled "Ġ" (U+0120) in the model vocabulary.

    Returns:
        dict[int, str]: one single-character string for each of the 256 byte values.
    """
    printable = (
        list(range(ord('!'), ord('~') + 1))
        + list(range(ord('¡'), ord('¬') + 1))
        + list(range(ord('®'), ord('ÿ') + 1))
    )
    byte_values = printable[:]
    code_points = printable[:]
    shift = 0
    for byte in range(256):
        if byte not in printable:
            byte_values.append(byte)
            code_points.append(256 + shift)
            shift += 1
    return {byte: chr(code_point) for byte, code_point in zip(byte_values, code_points)}


BYTE_TO_CHAR = _byte_level_alphabet()

model_tokens = set(tokenizer.vocab.keys())

# Spell every custom-BPE token in the same byte-level notation as the model
# vocabulary. Translating only " " -> "Ġ" on UTF-8 decoded text is not enough:
# accented characters and newlines are also stored byte-by-byte in the model
# vocabulary (e.g. "é" is "Ã©"), so such tokens would never match and would
# always be reported as missing. Mapping raw bytes also copes with tokens that
# end in the middle of a multi-byte UTF-8 sequence.
# NOTE(review): assumes the model tokenizer follows the GPT-2 byte-level
# convention, which its use of "Ġ" for spaces indicates.
bpe_tokens = set(
    ''.join(BYTE_TO_CHAR[byte] for byte in b''.join(encodings.from_id_to_tokens([token])))
    for token in encodings.vocab
)

# Obtain the tokens in BPE not present in the model.
# Sorted so the token -> id assignment below is identical on every run; plain
# set iteration order for strings changes between kernel restarts.
tokens_to_add = sorted(bpe_tokens.difference(model_tokens))

# Remove digits from tokens_to_add
tokens_to_add = [token for token in tokens_to_add if not token.isdigit()]


# Replace the "last" tokens from the model tokenizer: the highest ids are
# handed to the new tokens, every lower id keeps its original token.
vocab_revers = {v: k for k, v in tokenizer.vocab.items()}
last_token_id = max(vocab_revers)
vocab_size = last_token_id + 1

# First id that gets overwritten; clamped at 0 in case there are more new
# tokens than ids (surplus new tokens are then left unassigned).
first_replaced_id = max(vocab_size - len(tokens_to_add), 0)

new_vocab = {vocab_revers[token_id]: token_id for token_id in range(first_replaced_id)}
# `zip` stops at the shorter input, so `tokens_to_add` is left untouched and
# never over-assigned.
new_vocab.update(zip(tokens_to_add, range(first_replaced_id, vocab_size)))

# Preview: number of replaced entries and the tail of the rebuilt vocabulary,
# instead of dumping the whole mapping into the cell output.
len(tokens_to_add), list(new_vocab.items())[-10:]

In [33]:
# Persist the rebuilt vocabulary, one "<zero-padded id> <token>" entry per line.
with open('token_list', 'w', encoding='utf-8') as f:
    f.writelines(
        '{:05d} {}\n'.format(token_id, token)
        for token, token_id in new_vocab.items()
    )


# Alternative kept for reference: dump the model's original vocabulary instead.
# with open('token_list', 'w', encoding='utf-8') as f:
#     for token_id in range(len(tokenizer.vocab)):
#         f.write('{:05d} {}\n'.format(token_id, tokenizer.convert_ids_to_tokens(token_id)))

In [None]:
# Token ids the model tokenizer produces for the English word ' afterwards'
# (leading space included).
tokenizer.encode(' afterwards')

In [None]:
# Tokenize 'depois' (Portuguese for "afterwards") with the custom BPE and show
# the tokens its ids map back to (byte_decode=True presumably returns them as
# decoded text -- confirm against `src`).
encodings.from_id_to_tokens(encodings.tokenize('depois'), byte_decode=True)

In [None]:
# Map every learned merge pair to the text of the token it produces, with
# spaces shown as "Ġ".
# Byte-level merges can end in the middle of a multi-byte UTF-8 character, so
# decode leniently: a strict decode raises UnicodeDecodeError on the first such
# token and the cell shows nothing.
{
    pair: b''.join(encodings.from_id_to_tokens([merged_id])).decode('utf-8', errors='backslashreplace').replace(' ', 'Ġ')
    for pair, merged_id in encodings.merges.items()
}

In [None]:
# Display the loaded model (its repr) to inspect the architecture.
model

In [None]:
# Fetch the input embedding layer through the public accessor instead of
# indexing into `children()`, which depends on the order in which submodules
# happen to be registered.
embeds = model.get_input_embeddings()

# The embedding matrix holds one row per token id, so
# `embeds(torch.tensor([57]).to(DEVICE))` returns the same vector as
# `params[57]` below.
params = embeds.weight
params[57]

In [None]:
# Prompt the pretrained model in Portuguese and decode its continuation.
input_tokens = tokenizer('Olá, sabes falar português?', return_tensors='pt')

# Move both prompt tensors to the model's device before generating.
prompt_ids = input_tokens['input_ids'].to(DEVICE)
prompt_mask = input_tokens['attention_mask'].to(DEVICE)

output = model.generate(
    prompt_ids,
    attention_mask=prompt_mask,
    pad_token_id=tokenizer.eos_token_id,  # the EOS id is reused as the padding id
    max_new_tokens=100,
)
tokenizer.decode(output[0])