In [1]:
import re
import tqdm
import torch
import pickle
import numpy as np
import pandas as pd
from datasets import load_dataset
from typing import Any, Iterable, Callable
from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizer, PreTrainedTokenizerFast, pipeline

# Training a BPE for PT-PT

Using the data we already collected for the benchmarks, we train a BPE tokenizer on it.

In [2]:
import sys
sys.path.insert(1, '../')

from src import BENCHMARKS, BPE

# Stack the DataFrame of every registered benchmark into one frame.
benchmark_frames = [benchmark.df for benchmark in BENCHMARKS.benchmarks]
corpus_frame = pd.concat(benchmark_frames)

# Keep only object-dtype columns, replace missing values with empty strings and
# drop the 'Dataset Type' and 'prediction_prompts' columns.
text_frame = (
    corpus_frame
    .select_dtypes(include='object')
    .fillna('')
    .drop(columns=['Dataset Type', 'prediction_prompts'])
)

# One document per row: the remaining fields of the row joined by single spaces.
text_frame['DATA'] = text_frame.apply(lambda row: ' '.join(row), axis=1)
document_data = text_frame['DATA'].to_list()

# Train a BPE with a 1,000-entry vocabulary on the collected documents.
encodings = BPE(document_data, vocab_size=1_000)
encodings.train()

<BPE.train> Computing word frequencies: 100%|██████████| 18018/18018 [00:04<00:00, 3863.97it/s]
<BPE.train> Computing Merges: 100%|██████████| 807/807 [03:09<00:00,  4.27it/s]


In [3]:
# Sanity check: print a sample sentence, then the tokens the trained BPE
# splits it into.
raw_text = 'Olá o meu nome é Duarte'
print(raw_text)

sample_token_ids = encodings.tokenize(raw_text)
print(encodings.from_id_to_tokens(sample_token_ids, byte_decode=True))

Olá o meu nome é Duarte
['O', 'l', 'á', ' ', 'o', ' ', 'me', 'u', ' ', 'nome', ' ', 'é', ' ', 'D', 'u', 'ar', 'te']


# Finding tokens not in the model's tokenizer

In [4]:
MODEL = 'HuggingFaceTB/SmolLM2-135M'
# Use the GPU when torch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA instead of failing in `.to(...)`.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the causal LM in bfloat16 from safetensors weights and move it to DEVICE,
# then load the tokenizer published under the same model name.
model = AutoModelForCausalLM.from_pretrained(MODEL, use_safetensors=True, torch_dtype=torch.bfloat16).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [32]:
# Read the model vocabulary (token -> id) once and reuse it for every step below.
model_vocab = tokenizer.vocab
model_tokens = set(model_vocab.keys())

# Render every BPE vocabulary entry as text: join its bytes, decode as UTF-8
# (undecodable bytes become backslash escapes) and write spaces as "Ġ".
# NOTE(review): only the space -> "Ġ" substitution is applied here. If the model
# tokenizer also remaps non-ASCII bytes to other characters, accented tokens will
# never match `model_tokens` and will always be treated as new - TODO confirm.
bpe_tokens = set(
    b''.join(
        encodings.from_id_to_tokens([token])
    ).decode('utf-8', errors='backslashreplace').replace(' ', 'Ġ')  # "Ġ" was used as " " in the tokenizer of the model
    for token in encodings.vocab
)

# Tokens known to the BPE but absent from the model tokenizer, digit-only tokens
# excluded. They are sorted because the iteration order of a set of strings can
# change between interpreter runs (hash randomisation); without a fixed order
# the id given to each new token would differ from one kernel restart to the next.
tokens_to_add = sorted(
    token for token in bpe_tokens.difference(model_tokens) if not token.isdigit()
)

# Replace the "last" tokens from the model tokenizer: ids below
# `first_replaced_id` keep their original token, the remaining (highest) ids are
# handed to the new tokens in order.
vocab_revers = {token_id: token for token, token_id in model_vocab.items()}
last_token_id = max(model_vocab.values())
first_replaced_id = last_token_id + 1 - len(tokens_to_add)
assert first_replaced_id >= 0, 'more tokens to add than ids available in the model vocabulary'

new_vocab = {}
for token_id in range(last_token_id + 1):
    if token_id >= first_replaced_id:
        # Index instead of popping so `tokens_to_add` is left intact for later use.
        new_vocab[tokens_to_add[token_id - first_replaced_id]] = token_id
    else:
        new_vocab[vocab_revers[token_id]] = token_id
new_vocab

{'<|endoftext|>': 0,
 '<|im_start|>': 1,
 '<|im_end|>': 2,
 '<repo_name>': 3,
 '<reponame>': 4,
 '<file_sep>': 5,
 '<filename>': 6,
 '<gh_stars>': 7,
 '<issue_start>': 8,
 '<issue_comment>': 9,
 '<issue_closed>': 10,
 '<jupyter_start>': 11,
 '<jupyter_text>': 12,
 '<jupyter_code>': 13,
 '<jupyter_output>': 14,
 '<jupyter_script>': 15,
 '<empty_output>': 16,
 '!': 17,
 '"': 18,
 '#': 19,
 '$': 20,
 '%': 21,
 '&': 22,
 "'": 23,
 '(': 24,
 ')': 25,
 '*': 26,
 '+': 27,
 ',': 28,
 '-': 29,
 '.': 30,
 '/': 31,
 '0': 32,
 '1': 33,
 '2': 34,
 '3': 35,
 '4': 36,
 '5': 37,
 '6': 38,
 '7': 39,
 '8': 40,
 '9': 41,
 ':': 42,
 ';': 43,
 '<': 44,
 '=': 45,
 '>': 46,
 '?': 47,
 '@': 48,
 'A': 49,
 'B': 50,
 'C': 51,
 'D': 52,
 'E': 53,
 'F': 54,
 'G': 55,
 'H': 56,
 'I': 57,
 'J': 58,
 'K': 59,
 'L': 60,
 'M': 61,
 'N': 62,
 'O': 63,
 'P': 64,
 'Q': 65,
 'R': 66,
 'S': 67,
 'T': 68,
 'U': 69,
 'V': 70,
 'W': 71,
 'X': 72,
 'Y': 73,
 'Z': 74,
 '[': 75,
 '\\': 76,
 ']': 77,
 '^': 78,
 '_': 79,
 '`': 80,

In [33]:
# Dump the remapped vocabulary to the file 'token_list', one entry per line in
# the form "<id zero-padded to 5 digits> <token>".
with open('token_list', 'w', encoding='utf-8') as token_file:
    token_file.writelines(
        f'{token_id:05d} {token}\n' for token, token_id in new_vocab.items()
    )


# with open('token_list', 'w', encoding='utf-8') as f:
#     for token_id in range(len(tokenizer.vocab)):
#         f.write('{:05d} {}\n'.format(token_id, tokenizer.convert_ids_to_tokens(token_id)))

In [None]:
# Show the ids the model tokenizer produces for ' afterwards' (note the leading space).
tokenizer.encode(' afterwards')

In [None]:
# Show how the trained BPE splits the Portuguese word 'depois'; tokens are
# requested with byte_decode=True.
encodings.from_id_to_tokens(encodings.tokenize('depois'), byte_decode=True)

In [None]:
# Map every key of `encodings.merges` to a readable form of the token its merge
# produces (presumably keys are token-id pairs - verify against the BPE class).
# The decoding matches the one used when comparing against the model vocabulary:
# bytes that are not valid UTF-8 are shown as backslash escapes instead of
# raising UnicodeDecodeError, and spaces are written as "Ġ".
{
    pair: (
        b''.join(encodings.from_id_to_tokens([merged_id]))
        .decode('utf-8', errors='backslashreplace')
        .replace(' ', 'Ġ')
    )
    for pair, merged_id in encodings.merges.items()
}

In [None]:
# Bare expression: displays the loaded model's repr as the cell output.
model

In [None]:
# Walk down to the first child of the model's first child; the note below treats
# it as the embedding layer.
first_child = list(model.children())[0]
embeds = list(first_child.children())[0]

# Its first parameter tensor, indexed at row 57.
# Calling embeds(torch.Tensor([57]).int().to(DEVICE)) gives the same result as
# `params[57]` below.
params = list(embeds.parameters())[0]
params[57]

In [None]:
# Tokenise a Portuguese prompt, generate up to 100 new tokens with the model and
# decode the first returned sequence back to text.
input_tokens = tokenizer('Olá, sabes falar português?', return_tensors='pt')
prompt_ids = input_tokens['input_ids'].to(DEVICE)
prompt_mask = input_tokens['attention_mask'].to(DEVICE)

output = model.generate(
    prompt_ids,
    attention_mask=prompt_mask,
    pad_token_id=tokenizer.eos_token_id,  # the EOS token id is passed as the padding id
    max_new_tokens=100,
)
tokenizer.decode(output[0])