# Introduction


Calculating benchmarks results for a bigger model (Qwen2.5-1.5B) before and after adding the new tokens

Initial imports and variable initialization

In [1]:
import torch
import sys
import pathlib
import tqdm
import os
import re
os.chdir('/home/yali/MEGA/Hack The Tockenizer/tests')
sys.path.insert(1, pathlib.Path('..').resolve().as_posix())
from src import utils, loader, hack
from src.DatasetClass import ListDataset, TextDataset
from torch.utils.data import DataLoader


DEVICE                  = 'cuda'
GENERATION_BATCH_SIZE   = 8
MODEL                   = 'Qwen/Qwen2.5-1.5B-Instruct'
MODEL_GEN_KWARGS = dict(top_p=None, top_k=None, temperature=None, do_sample=False)

Loading the model

In [2]:
model, tokenizer = loader.load_model_and_tokenizer(
    model_name=MODEL,
    device=DEVICE,
    model_kwargs = { 'torch_dtype': torch.bfloat16},
    tokenizer_kwargs={'padding_side': 'left'}
)
original_tokenizer = loader.load_model_and_tokenizer(
    model_name=MODEL,
    device='cpu',
    model_kwargs = { 'torch_dtype': torch.bfloat16},
    tokenizer_kwargs={'padding_side': 'left'}
)[1]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


# CalamePT Benchmark

Considering **ONLY** CalamePT, currently not considering the SuperGluePTPT

In [3]:
import src.benchmark as Benchmark
from src.benchmark.CalamePT import CalamePT

# Removing "SuperGluePTPT" from the Benchmarks
benchmark = Benchmark.Benchmarks([CalamePT()])

# Adding the Batch Size (to generate in parallel)
benchmark.config['parallel_batch_size'] = GENERATION_BATCH_SIZE
benchmark.config['max_new_tokens']      = max(len(tokenizer.encode(x)) for x in CalamePT().df['last_word'].values) + 1   # Maximum tokenization of predicted words

## Base Model

In [4]:
benchmark_results = benchmark.run(model, tokenizer, generation_kwargs=MODEL_GEN_KWARGS, store_generation_data=False)
print(f"`CalamePT` Accuracy for Baseline Model `{model.name_or_path}` = {benchmark_results['CalamePT']['result']:.2%}")

<Qwen/Qwen2.5-1.5B-Instruct> Calculating inferences for inputs: 100%|██████████| 260/260 [01:29<00:00,  2.91it/s]

`CalamePT` Accuracy for Baseline Model `Qwen/Qwen2.5-1.5B-Instruct` = 49.52%





## Model with additional Tokens

### Adding new tokens to model

1. Fetching the new tokens

In [None]:
# ----------------------------------------------------
#           Train Tokenizer with PT Dataset               
# ----------------------------------------------------
# Step 1. Train a new portuguese vocabulary
# TODO: Find a way to fix the training... Maybe use numpy random to set a seed?
#   TODO: Verify that the `new_tokens` list is always the same (ignoring order)
pt_tokenizer = hack.TokenizerHack(device=DEVICE).train_tokenizer(trainer_kwargs={'vocab_size': 10000})

# Step 2. Find tokens in `pt_tokenizer` not in 
new_tokens = set(pt_tokenizer.get_vocab().keys())
new_tokens = new_tokens.difference(set(tokenizer.vocab.keys()))

# Removing the 'Ġ' tokens and fixing maybe some others
new_tokens = set([tokenizer.decoder.decode([new_token]) for new_token in new_tokens])

# Remove the tokens which may be "contained" in any of the original tokens (for instance, "publ" is contained in "publico" so "publ" will be removed)
__new_tokens = []
tokenizer_vocab_keys = list(tokenizer.decode(x) for x in range(len(tokenizer)))
for new_token in tqdm.tqdm(new_tokens, total=len(new_tokens)):
    add_new_token = True
    for token in tokenizer_vocab_keys:
        if token.startswith(new_token):
            add_new_token = False
            break
    if add_new_token: __new_tokens.append(new_token)
new_tokens = set(__new_tokens)

######### BELLOW SECTION IS THE LAST LOOP BUT RUNNING IN PARALLEL USING joblib
# import joblib as jb
# import tqdm

# def should_add_token(new_token, vocab_keys):
#     for token in vocab_keys:
#         if token.startswith(new_token):
#             return False
#     return True

# def filter_new_tokens(new_tokens, tokenizer_vocab_keys):
#     vocab_keys = list(tokenizer_vocab_keys)  # Avoid repeated conversions in workers
    
#     # Parallel processing with generator output
#     results = jb.Parallel(n_jobs=7, backend="loky", return_as="generator")(
#         jb.delayed(should_add_token)(new_token, vocab_keys)
#         for new_token in new_tokens
#     )
    
#     # Wrap results in tqdm for progress tracking
#     for new_token, should_add in tqdm.tqdm(
#         zip(new_tokens, results),
#         total=len(new_tokens),
#         desc="Filtering tokens"
#     ):
#         if should_add:
#             yield new_token

# # Usage:
# tokenizer_vocab_keys = list(tokenizer.decode(x) for x in range(len(tokenizer)))
# new_tokens_filtered_gen = filter_new_tokens(new_tokens, tokenizer_vocab_keys)
# new_tokens = set(new_tokens_filtered_gen)  # Consume the generator and convert to set






 71%|███████   | 3572/5040 [00:32<00:13, 107.15it/s]

2. Update the model Vocabulary

In [None]:
# ----------------------------------------------------
#               Update Model Vocabulary               
# ----------------------------------------------------
# Save the original tokenizations
original_tokenization = {t: tokenizer.encode(t) for t in new_tokens}    # Necessary for the training bellow
tokenizer.add_tokens(list(new_tokens))
model.resize_token_embeddings(len(tokenizer))

3. Initialize the embeddings using the Weighted average $w_i = w_{i+1} \times K$ with $K=5$

In [None]:
# Step 4. Calculate the new embeddings for the new tokens
embed = model.get_input_embeddings().weight.clone().to('cpu')
new_embed = model.get_input_embeddings()

# Initialize the embedding using the weighted average model
K = 1.5 # Tested for values [1, 2, 3, 4, 5, 0.9, 0.8, 1.1, 1.2, ..., 1.6] and the best was 1.5 with (3.13%)
with torch.no_grad():
    for new_token in tqdm.tqdm(new_tokens, desc='Initializing the embeddings for the new_tokens'):
        new_token_id = tokenizer.encode(new_token)[0]
        # Find the old embedding for the token
        tokenization = original_tokenization[new_token]
        token_embed = torch.stack([embed[t_id] for t_id in tokenization]).to(DEVICE)
        # Calculating the embedding weights
        embedding_weights = torch.asarray([K**i if K**i < 2**64 else 0 for i in range(token_embed.shape[0], 0, -1)]).to(DEVICE)
        # embedding_weights = torch.asarray([K**i for i in range(token_embed.shape[0], 0, -1)]).to(DEVICE)
        embedding_weights = embedding_weights / embedding_weights.sum()

        # Create a new token embed using the weighted average of the embeddings
        new_token_embed = torch.sum(token_embed * embedding_weights[:, None], dim=0)
        # new_token_embed = token_embed[0]
        # Update embedding of the new_token in the hacked_model
        _ = new_embed.weight[new_token_id].data.copy_(new_token_embed)



### Benchmark Computation

Calculating the results with the added tokens.

In [None]:
# Update model name
model.name_or_path = f'{MODEL}-ADDED_TOKENS_INIT_K={K}'

# Update the "max_new_tokens" (since we added new_tokens, we may have a different value than previously)
benchmark.config['max_new_tokens'] = max(len(tokenizer.encode(x)) for x in CalamePT().df['last_word'].values) + 1   # Maximum tokenization of predicted words
benchmark_results_new_tokens = benchmark.run(model, tokenizer, original_tokenizer, generation_kwargs=MODEL_GEN_KWARGS, store_generation_data=False)
print(f"`CalamePT` Accuracy for Baseline  Model `{model.name_or_path}` = {benchmark_results_new_tokens['CalamePT']['result']:.2%}")

In [None]:
phrase = ' Ela correu durante horas para alcançar a linha de'
max_new_tokens = 10
generation = tokenizer.decode(utils.generate(
    model, tokenizer,
    phrase,
    DEVICE, 
    False, False,
    max_new_tokens=max_new_tokens,
    **MODEL_GEN_KWARGS
)[0])
print(f'Generation with new tokenizer = `{generation}`')


input_ids = tokenizer.encode(phrase)
input_ids = [token for x in input_ids for token in original_tokenization.get(tokenizer.convert_ids_to_tokens(x), [x])]

for n in range(10):
    generation = model.generate(torch.Tensor([input_ids]).long().to(DEVICE), max_new_tokens=1, **MODEL_GEN_KWARGS)
    generated_id = generation[0, -1].item()
    input_ids.extend(
        [token for token in original_tokenization.get(tokenizer.convert_ids_to_tokens(generated_id), [generated_id])]
    )
print(f'Generation with "old tokenizer" = `{tokenizer.decode(input_ids)}`')

## Model with additional Tokens + "Training"

Training the model (already has the added tokens)

In [None]:
# ----------------------------------------------------
#     Update weights of Embeddings of new_tokens      
# ----------------------------------------------------
# Step 4.2 Using the training phrases to update the embedding weights
learning_rate = 1e-6
training_phrases: list[str] = CalamePT().prediction_prompts.to_list() # CalamePT dataset 
for new_token in tqdm.tqdm(new_tokens, desc='Updating the embeddings for the new tokens'):
    new_token_id = tokenizer.convert_tokens_to_ids(new_token)
    new_token = tokenizer.decode(new_token_id)
    phrases_to_generate_new_token = [p for phrase in training_phrases for p in phrase.split(new_token)[:-1] if new_token in phrase and len(p) > 0]

    if len(phrases_to_generate_new_token) == 0: continue
    # Creating the Batched dataset (to run generation for multiple phrases at the same time)
    dataloader = DataLoader(
        TextDataset(phrases_to_generate_new_token, tokenizer, max_length=max(len(tokenizer.tokenize(x)) for x in phrases_to_generate_new_token)),
        batch_size=GENERATION_BATCH_SIZE,
        shuffle=False
    )
    # Process the batches
    for batch in tqdm.tqdm(dataloader,  desc=f'  Generating tokens for new_token=`{new_token}` ', leave=False):
        # Move batch tensors to the correct device
        input_ids = batch['input_ids'].squeeze(1).to(DEVICE)
        attention_mask = batch['attention_mask'].squeeze(1).to(DEVICE)

        # Generate text
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=1,
            num_beams=1,
            num_return_sequences=1,
            return_dict_in_generate=True,
            output_logits=True,
            output_scores=True,
            output_hidden_states=True,
            pad_token_id=tokenizer.pad_token_id,
            **MODEL_GEN_KWARGS
        )

        # Extract the generated sequences and their scores
        generated_sequences = outputs.sequences
        predicted_logits = outputs.logits

        # Decode the input and generated sequences
        input_texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
        generated_texts = tokenizer.batch_decode(generated_sequences, skip_special_tokens=True)
        with torch.no_grad():
            for i in range(len(input_texts)):
                logits = predicted_logits[0][i]
                logit_gradient = logits.max() - logits[new_token_id]
                embed_out = outputs.hidden_states[0][-1][i][-1]
                # normalize embed_out
                embed_out = embed_out / embed_out.norm()

                embed_in = new_embed.weight[new_token_id]

                # Update the embedding table
                _ = new_embed.weight[new_token_id].data.copy_((embed_in + logit_gradient * embed_out * learning_rate).to(DEVICE))

### Update model name

Changing the model name to include the "added_tokens" to distinguish between the older model

In [None]:
model.name_or_path = f'{model.name_or_path} [TRAINED]'

### Run the benchmark

In [None]:
benchmark_results_new_tokens_trained = benchmark.run(model, tokenizer, original_tokenizer,  generation_kwargs=MODEL_GEN_KWARGS)
print(f"`CalamePT` Accuracy for Baseline Model `{model.name_or_path}` = {benchmark_results_new_tokens_trained['CalamePT']['result']:.2%}")

In [None]:
# Finding out which "new_tokens" were generated
tmp = [x['generated_ids'].max().item() for x in benchmark_results_new_tokens_trained['CalamePT']['results'][0]['benchmark_predictions']]
# len([t for t in tmp if t>len(original_tokenizer)])

# Adding flag of "new_token_generated" in benchmark results
original_tokenizer_size = len(original_tokenizer)
for i, test in tqdm.tqdm(enumerate(benchmark_results_new_tokens_trained['CalamePT']['results'][0]['benchmark_predictions'])):
    test['has_new_token'] = len([t for t in test['generated_ids'] if t>original_tokenizer_size]) > 0

    

In [None]:
results_w_new_tokens = [a.copy() for a in benchmark_results_new_tokens_trained['CalamePT']['results'][0]['benchmark_predictions'] if a['has_new_token']]
def word_has_new_token(word: str, token_ids, new_token_start_id: int):
    i = 1
    cur_tokens = [token_ids[0]]
    while tokenizer.decode(cur_tokens).strip().lower() != word.lower() and i<len(token_ids):
        cur_tokens.append(token_ids[i])
        i+=1
    return len([t for t in cur_tokens if new_token_start_id < t]) > 0

new_token_start_id = len(original_tokenizer)
for i, r in enumerate(results_w_new_tokens):
    # Find out if "predicted_word" is made up of any "new_token"
    results_w_new_tokens[i]['new_token_in_word'] = word_has_new_token(r['prediction'], r['generated_ids'][-8:], new_token_start_id)
    results_w_new_tokens[i]['correct_prediction'] = r['prediction'].strip().lower() == r['correct_word'].strip().lower()

In [None]:
tokenizer.decode([153052])

In [None]:
tokenizer.decode([153665])

In [None]:
tokenizer.encode(" público")

In [None]:
benchmark.benchmarks[0].df[(' '+benchmark.benchmarks[0].df['last_word']).apply(lambda x: tokenizer.encode(x)[0] >= 151665 and len(tokenizer.tokenize(x)) == 1)]

In [None]:
[r for r in results_w_new_tokens if r['new_token_in_word'] and r['correct_prediction']]
# results_w_new_tokens

SAVE ALL Benchmarks in a JSON file

In [None]:
import json
with open('/home/yali/MEGA/Hack The Tockenizer/tests/qwen2.5-benchmark-results_V2_run2.json', 'w', encoding='utf-8') as f:
    json.dump(benchmark.get_results(), f, indent=2, ensure_ascii=False)


# Analysing the Results

In [None]:
import json
import pandas as pd

with open('/home/yali/MEGA/Hack The Tockenizer/tests/qwen2.5-benchmark-results_V2_run2.json', 'r') as f:
    results: dict[str, dict[str, dict]] = json.load(f)

data = []
for model_name in results.keys():
    for benchmark_name in results[model_name].keys():
        for epoch_n, result in enumerate(results[model_name][benchmark_name]['results']):
            result: dict
            data.append(result.copy())
            data[-1]['model_name'] = model_name
            data[-1]['epoch_number'] = epoch_n

            r = data[-1].pop('benchmark_predictions')
            data[-1]['original_text']   = [gen["text"]          for gen in r]
            data[-1]['expected_word']   = [gen["correct_word"]  for gen in r]
            data[-1]['predicted_word']  = [gen["prediction"]    for gen in r]

df = pd.DataFrame(data).explode(['original_text', 'expected_word', 'predicted_word'])
# Changing the model name to only 2 letters, the first specifies if the model has been initialized or not, and the second one specifies if it has been trained or not
df['model_name'] = df['model_name'].map({
    'Qwen/Qwen2.5-1.5B-Instruct':                                   'BB',   # Baseline,    Baseline  
    'Qwen/Qwen2.5-1.5B-Instruct [TRAINED]':                         'BT',   # Baseline,    Trained  (currently not existing)
    'Qwen/Qwen2.5-1.5B-Instruct-ADDED_TOKENS_INIT_K=1.5':           'IB',   # Initialized, Baseline
    'Qwen/Qwen2.5-1.5B-Instruct-ADDED_TOKENS_INIT_K=1.5 [TRAINED]': 'IT',   # Initialized, Trained
})


df = df.pivot(
    index='original_text', columns=['model_name'], values=['predicted_word', 'expected_word']
)
df.columns = ['_'.join(col) for col in df.columns]
df = df[['predicted_word_BB', 'predicted_word_IB', 'predicted_word_IT', 'expected_word_BB']].reset_index().rename(columns={'expected_word_BB': 'expected_word'})

for model in [f'{init}{train}' for init in ['B', 'I'] for train in ['B', 'T']]:
    # There is no "BaselineTrained" model, YET
    if model == 'BT':
        continue
    df[f'correct_prediction_{model}'] = df[f'predicted_word_{model}'] == df['expected_word']

df

Saving to csv file

In [None]:
df.to_csv('/home/yali/MEGA/Hack The Tockenizer/tests/qwen2.5-benchmark-results_V2_analysis.csv', index=False)

In [None]:
# Next STEP: TRAIN the baseline model with a "normal" approach for the same amount of time and compare the results.

In [None]:
# Next STEP: Remover os tokens que sao "contidos" em algum dos tokens no tokenizador original + Usar "palavras" do corpus como "new_tokens"