# Introduction

Exploring how to add new tokens, with newly initialised embeddings, to a different (bigger) model.

In [1]:
import torch
import sys
import pathlib
import tqdm
import os
import re
os.chdir('/home/yali/MEGA/Hack The Tockenizer/tests')
sys.path.insert(1, pathlib.Path('..').resolve().as_posix())
from src import utils, loader, hack
from src.DatasetClass import ListDataset, TextDataset
from torch.utils.data import DataLoader


# Device passed to the model loader and to every generation call in this notebook.
DEVICE = 'cuda'
# Model name handed to `loader.load_model_and_tokenizer`.
MODEL = 'Qwen/Qwen2.5-1.5B-Instruct'
# Prompts used for all generations below: the prediction prompts of the second
# benchmark registered in `hack.BENCHMARKS`, materialised as a plain list.
phrases: list[str] = hack.BENCHMARKS.benchmarks[1].prediction_prompts.to_list() # CalamePT dataset 

In [2]:
# Load the model and its tokenizer through the project loader.
# The model is requested in bfloat16 and the tokenizer is configured to pad on the left.
# NOTE(review): left padding presumably keeps the last position of every padded row
# a real token, which the batched generation further down relies on — confirm.
model, tokenizer = loader.load_model_and_tokenizer(
    model_name=MODEL,
    device=DEVICE,
    model_kwargs = { 'torch_dtype': torch.bfloat16},
    tokenizer_kwargs={'padding_side': 'left'}
)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


## Find words generated

Using the datasets we have, find which tokens the model originally generates.

We will later create a `new_token` with the first word that was generated.

In [None]:
# Use the first benchmark prompt (without surrounding spaces) as the input phrase.
phrase = hack.BENCHMARKS.benchmarks[1].prediction_prompts[0].strip(' ')     # Strip "spaces"
print('Input phrase: `{}`'.format(phrase))

# Let the model continue the phrase by 10 tokens.
generation = utils.generate(
    model, tokenizer,
    phrase=phrase,
    device=DEVICE,
    return_dict_in_generate=False,
    output_logits=False,
    max_new_tokens=10
)
generated_phrase = tokenizer.decode(generation[0])
print('Generated Phrase: `{}`'.format(generated_phrase))

# Finding the first word generated.
# The pattern matches one optional leading space followed by a run of Unicode
# letters (`[^\W\d_]` = word characters minus digits and underscore), so accented
# Portuguese words such as "público" are captured whole. The previous character
# class `[a-z|A-Z| ]` treated `|` as a literal, stopped at accented letters and
# could swallow several space-separated words.
generated_continuation = generated_phrase.replace(phrase, '')
first_word_match = re.search(r' ?[^\W\d_]+', generated_continuation)
if first_word_match is None:
    # Fail with an explicit message instead of an IndexError on an empty result.
    raise ValueError('No word found in the generated text: `{}`'.format(generated_continuation))
new_token = first_word_match.group(0)
# Only the word's own pieces are wanted here, never BOS/EOS-style special tokens.
old_tokenization = tokenizer.encode(new_token, add_special_tokens=False)
print('New token generated: `{}`(={})'.format(new_token, old_tokenization))

## Adding the new token to the model

Using the word found in the previous step, add it as a new_token to the tokenizer and model

In [None]:
# Register the word as a single vocabulary entry and look up the id it received.
tokenizer.add_tokens(new_token)
new_token_id = tokenizer.convert_tokens_to_ids(new_token)
print('New Token: `{}`(ID={})'.format(new_token, new_token_id))

# Grow the model's embedding table so that it has a row for the new id.
model.resize_token_embeddings(len(tokenizer))

# Seed the new row with the mean of the embeddings of the pieces the word
# was previously split into (`old_tokenization`).
embed = model.get_input_embeddings()

with torch.no_grad():
    old_piece_rows = [embed.weight[t] for t in old_tokenization]
    new_embed = torch.stack(old_piece_rows).mean(dim=0).to(DEVICE)
    _ = embed.weight[new_token_id].data.copy_(new_embed)

## Calculate the new generation with the new added token

Validate the logits of the new added token

In [None]:
# Generate again for the same `phrase`, this time asking for the full output
# structure (logits, scores and hidden states) so the new token's logits can be inspected.
new_generation = utils.generate(
    model, tokenizer,
    phrase=phrase,
    device=DEVICE,
    return_dict_in_generate=True,
    output_logits=True,
    output_scores=True,
    output_hidden_states=True,
)

## Calculate new embeddings

Use the "gradient" approach to calculate new embeddings

### Generation Baseline

Generating for the first 10 phrases to validate the baseline

In [3]:
# Baseline: continue each of the first 10 prompts by 10 tokens with the
# unmodified model and print the prompt next to its continuation.
original_generation = []
for i in range(10):
    baseline_output = utils.generate(
        model, tokenizer,
        phrase=phrases[i],
        device=DEVICE,
        max_new_tokens=10,
        return_dict_in_generate=True,
        output_logits=True,
        output_scores=True,
        output_hidden_states=True,
    )
    original_generation.append(baseline_output)
    # The last 10 ids of the returned sequence are decoded as the continuation.
    baseline_text = tokenizer.decode(baseline_output.sequences[0, -10:])
    print('<Phrase {}>[{}]{}'.format(i, phrases[i], baseline_text))

<Phrase 0>[ Ela correu durante horas para alcançar a linha de] chegada, mas acabou derrotada.
<Phrase 1>[Ela cantou tão bem no concerto que emocionou o] público.
Aqui está a tradução para ingl
<Phrase 2>[Os ventos fortes causaram com que algumas árvores] caíssem, e isso resultou em um
<Phrase 3>[O Jorge trabalhava numa padaria. Todos os dias ele vendia] 120 pães e 6
<Phrase 4>[As equipas de futebol têm vários jogadores, e todos têm que respeitar o seu] lugar na equipe. Por exemplo, um jogador deve
<Phrase 5>[Os pássaros voaram alto no] final de semana, mas o tempo começou a ch
<Phrase 6>[O sol brilhou intensamente durante o] dia, e os olhos do jovem p
<Phrase 7>[A chuva caiu suavemente sobre as folhas das] árvores. O som dos rai
<Phrase 8>[A comida estava deliciosa e deixou todos] felizes. Apenas um pequeno problema
<Phrase 9>[Era noite de Natal, e a criança sorriu ao receber o] presente que havia recebido do pai.


### Train Tokenizer

Obtain the new tokens given the phrases we have

In [4]:
# Step 1. Train a new Portuguese vocabulary (10000 entries requested)
pt_hack = hack.TokenizerHack(device=DEVICE)
pt_tokenizer = pt_hack.train_tokenizer(trainer_kwargs={'vocab_size': 10000})

# Step 2. Keep the entries of `pt_tokenizer` that are not in the model tokenizer's vocabulary
pt_vocab = set(pt_tokenizer.get_vocab())
model_vocab = set(tokenizer.vocab)
new_tokens = pt_vocab - model_vocab






### Update Vocab + Embeddings

Add the new tokens to the tokenizer and update the embedding table to include them

In [5]:
# Save the original tokenizations
original_tokenization = {t: tokenizer.encode(t) for t in new_tokens}
tokenizer.add_tokens(list(new_tokens))
model.resize_token_embeddings(len(tokenizer))

# Step 4. Calculate the new embeddings for the new tokens
embed = model.get_input_embeddings().weight.clone().to('cpu')
new_embed = model.get_input_embeddings()

# Initialize the embedding using the weighted average model
K = 5
with torch.no_grad():
    for new_token in tqdm.tqdm(new_tokens, desc='Initializing the embeddings for the new_tokens'):
        new_token_id = tokenizer.encode(new_token)[0]
        # Find the old embedding for the token
        tokenization = original_tokenization[new_token]
        token_embed = torch.stack([embed[t_id] for t_id in tokenization]).to(DEVICE)
        # Calculating the embedding weights
        embedding_weights = torch.asarray([K**i if K**i < 2**64 else 0 for i in range(token_embed.shape[0], 0, -1)]).to(DEVICE)
        # embedding_weights = torch.asarray([K**i for i in range(token_embed.shape[0], 0, -1)]).to(DEVICE)
        embedding_weights = embedding_weights / embedding_weights.sum()

        # Create a new token embed using the weighted average of the embeddings
        new_token_embed = torch.sum(token_embed * embedding_weights[:, None], dim=0)
        # Update embedding of the new_token in the hacked_model
        _ = new_embed.weight[new_token_id].data.copy_(new_token_embed)


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Initializing the embeddings for the new_tokens: 100%|██████████| 5040/5040 [00:00<00:00, 7892.35it/s]


### Generation After Adding Tokens

Generating for the first 10 phrases after adding the tokens without any "training"

In [6]:
# Same 10 prompts as the baseline, generated with the enlarged vocabulary and
# the freshly initialised embedding rows (no training yet).
generation_with_new_tokens = []
for i in range(10):
    generate_kwargs = dict(
        phrase=phrases[i],
        device=DEVICE,
        max_new_tokens=10,
        return_dict_in_generate=True,
        output_logits=True,
        output_scores=True,
        output_hidden_states=True,
    )
    generation_with_new_tokens.append(utils.generate(model, tokenizer, **generate_kwargs))
    # Decode the last 10 ids of the sequence as the continuation of the prompt.
    last_ten_ids = generation_with_new_tokens[i].sequences[0, -10:]
    print('<Phrase {}>[{}]{}'.format(i, phrases[i], tokenizer.decode(last_ten_ids)))

<Phrase 0>[ Ela correu durante horas para alcançar a linha de] chegada, ganhando o prêmio
<Phrase 1>[Ela cantou tão bem no concerto que emocionou o] público - O Jornal Económico

<Phrase 2>[Os ventos fortes causaram com que algumas árvores] caíssem e a estrada ficasse inter
<Phrase 3>[O Jorge trabalhava numa padaria. Todos os dias ele vendia] o seu trabalho e ganhava um salário
<Phrase 4>[As equipas de futebol têm vários jogadores, e todos têm que respeitar o seu]u trabalho. A nossa equipa é uma das
<Phrase 5>[Os pássaros voaram alto no] céu, enquanto os animais de estima
<Phrase 6>[O sol brilhou intensamente durante o] dia. O vento soprava com for em
<Phrase 7>[A chuva caiu suavemente sobre as folhas das] árvores. O vento levou
<Phrase 8>[A comida estava deliciosa e deixou todos] os pratos, incluindo o pão
<Phrase 9>[Era noite de Natal, e a criança sorriu ao receber o] presente da avó. O que ela fez


### Train the embedding

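Refine the new embeddings with a "gradient"-style update: for every prompt that should be followed by a new token, nudge that token's embedding along the model's normalised final hidden state, scaled by how far the token's logit is below the highest logit.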

In [None]:
# Step 4.2 Using the training phrases to update the embedding weights
learning_rate = 1e-6
BATCH_SIZE = 8


def _prefixes_before_token(text: str, token: str) -> list[str]:
    """Return every non-empty prefix of `text` that ends right before an occurrence of `token`.

    Occurrences are scanned left to right without overlap. Each prefix keeps the
    whole text that precedes the occurrence, including earlier occurrences of
    `token`, so it is the real context after which `token` appears.
    """
    prefixes: list[str] = []
    if not token:
        # An empty token occurs "everywhere"; there is nothing meaningful to predict.
        return prefixes
    start = text.find(token)
    while start != -1:
        if start > 0:
            prefixes.append(text[:start])
        start = text.find(token, start + len(token))
    return prefixes


for new_token in tqdm.tqdm(new_tokens, desc='Updating the embeddings for the new tokens'):
    new_token_id = tokenizer.convert_tokens_to_ids(new_token)
    # From here on `new_token` is the decoded text form, used to search the phrases.
    new_token = tokenizer.decode(new_token_id)
    # Prompts after which the model is expected to produce `new_token`. Splitting on
    # the token only yielded the fragment BETWEEN two occurrences for the second and
    # later occurrences; the full prefix is used so the prompt matches the phrase.
    phrases_to_generate_new_token = [
        prefix for phrase in phrases for prefix in _prefixes_before_token(phrase, new_token)
    ]

    if len(phrases_to_generate_new_token) == 0: continue
    # Creating the Batched dataset (to run generation for multiple phrases at the same time)
    dataloader = DataLoader(
        TextDataset(phrases_to_generate_new_token, tokenizer, max_length=max(len(tokenizer.tokenize(x)) for x in phrases_to_generate_new_token)),
        batch_size=BATCH_SIZE,
        shuffle=False
    )
    # Process the batches
    for batch in tqdm.tqdm(dataloader,  desc=f'  Generating tokens for new_token=`{new_token}` ', leave=False):
        # Move batch tensors to the correct device
        input_ids = batch['input_ids'].squeeze(1).to(DEVICE)
        attention_mask = batch['attention_mask'].squeeze(1).to(DEVICE)

        # Generate exactly one token per prompt and keep logits and hidden states
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=1,
            num_beams=1,
            num_return_sequences=1,
            return_dict_in_generate=True,
            output_logits=True,
            output_scores=True,
            output_hidden_states=True,
            pad_token_id=tokenizer.pad_token_id
        )

        # Extract the generated sequences and their scores
        generated_sequences = outputs.sequences
        predicted_logits = outputs.logits

        # Decode the input and generated sequences
        input_texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
        generated_texts = tokenizer.batch_decode(generated_sequences, skip_special_tokens=True)
        with torch.no_grad():
            for i in range(len(input_texts)):
                # Logits of the single generated step for prompt `i`.
                logits = predicted_logits[0][i]
                # How far the new token's logit is below the best logit (0 if it already wins).
                logit_gradient = logits.max() - logits[new_token_id]
                # Hidden state of the first generation step, last layer, prompt `i`,
                # last position (layout as returned by Hugging Face `generate`).
                embed_out = outputs.hidden_states[0][-1][i][-1]
                # normalize embed_out
                embed_out = embed_out / embed_out.norm()

                embed_in = new_embed.weight[new_token_id]

                # Update the embedding table: move the row towards the hidden state,
                # proportionally to the logit gap and the learning rate.
                _ = new_embed.weight[new_token_id].data.copy_((embed_in + logit_gradient * embed_out * learning_rate).to(DEVICE))

Updating the embeddings for the new tokens: 100%|██████████| 5040/5040 [2:10:48<00:00,  1.56s/it]  


In [83]:
# Generate once more for the first 10 prompts with the current embedding table
# and print each prompt next to its 10-token continuation.
generation_with_new_tokens = []
for i in range(10):
    current_output = utils.generate(
        model, tokenizer,
        phrase=phrases[i],
        device=DEVICE,
        max_new_tokens=10,
        return_dict_in_generate=True,
        output_logits=True,
        output_scores=True,
        output_hidden_states=True,
    )
    generation_with_new_tokens.append(current_output)
    # Decode only the last 10 ids of the returned sequence.
    current_text = tokenizer.decode(current_output.sequences[0, -10:])
    print('<Phrase {}>[{}]{}'.format(i, phrases[i], current_text))

<Phrase 0>[ Ela correu durante horas para alcançar a linha de] gravideztaneamenteórida 1992 semáfor registada provenientes doze estimada criados
<Phrase 1>[Ela cantou tão bem no concerto que emocionou o] estimada leite 1992 Foram bolas provenientes salarial 2001 Broo 193
<Phrase 2>[Os ventos fortes causaram com que algumas árvores] esquerdoamá 1992 vivem estimada provenientes registada doze semáfor cobre
<Phrase 3>[O Jorge trabalhava numa padaria. Todos os dias ele vendia] 1992 semáfor semelhança ciclos provenientes estimada registada castan Foram salarial
<Phrase 4>[As equipas de futebol têm vários jogadores, e todos têm que respeitar o seu]órida 1992 semelhança provenientes registada doze estimada Broo cobre salarial
<Phrase 5>[Os pássaros voaram alto no] chumbo 1992 Foram semáfor estimada provenientes doze registada bolas salarial
<Phrase 6>[O sol brilhou intensamente durante o] 1992óridataneamente estimada Foram semáfor provenientes registada doze 2001
<Phrase 7>[A chuva caiu suave

In [81]:
# Inspect the raw logits of one generation.
# The prompt index is pinned explicitly: it previously came from a loop variable
# `i` left behind by whichever cell happened to run last, so the inspected phrase
# changed with the execution order. Index 0 matches the recorded output.
PHRASE_INDEX = 0
new_generation = utils.generate(
    model, tokenizer,
    phrase=phrases[PHRASE_INDEX],
    device=DEVICE,
    max_new_tokens=10,
    return_dict_in_generate=True,
    output_logits=True,
    output_scores=True,
    output_hidden_states=True,
)
# One logits tensor per generated step.
new_generation.logits

(tensor([[  7.6250,   6.0625,   1.0078,  ..., 208.0000, 194.0000, 204.0000]],
        device='cuda:0'),
 tensor([[  9.5000,   7.7500,   6.2812,  ..., 191.0000, 171.0000, 181.0000]],
        device='cuda:0'),
 tensor([[  9.8125,   9.6875,   7.0312,  ..., 215.0000, 195.0000, 207.0000]],
        device='cuda:0'),
 tensor([[ 11.3750,  11.6250,   8.5625,  ..., 256.0000, 236.0000, 249.0000]],
        device='cuda:0'),
 tensor([[ 10.1875,  10.5000,   8.8125,  ..., 270.0000, 248.0000, 262.0000]],
        device='cuda:0'),
 tensor([[  9.6250,  10.5625,   8.1875,  ..., 286.0000, 262.0000, 278.0000]],
        device='cuda:0'),
 tensor([[  9.6250,  10.4375,   8.3125,  ..., 286.0000, 262.0000, 278.0000]],
        device='cuda:0'),
 tensor([[  9.3125,  10.1250,   8.9375,  ..., 288.0000, 264.0000, 278.0000]],
        device='cuda:0'),
 tensor([[  9.3125,   9.8125,   8.9375,  ..., 292.0000, 268.0000, 284.0000]],
        device='cuda:0'),
 tensor([[  9.3125,   9.6875,   8.6875,  ..., 294.0000, 268.0000

I MUST BE MISSING NORMALIZATION

In [None]:
# Value range of the rows that existed before the new tokens were appended
# (the new tokens occupy the last `len(new_tokens)` ids).
original_rows = new_embed.weight[:len(tokenizer) - len(new_tokens)]
min_val = original_rows.min().item()
delta_val = original_rows.max().item() - min_val

# Normalize the added tokens: min-max scale every new row to [0, 1] and map it
# onto the original range [min_val, min_val + delta_val].
with torch.no_grad():
    for new_token_id in range(len(tokenizer) - len(new_tokens), len(tokenizer)):
        embed_in = new_embed.weight[new_token_id]
        row_min = embed_in.min()
        row_range = embed_in.max() - row_min
        if row_range == 0:
            # A constant row cannot be min-max scaled (division by zero); leave it as is.
            continue
        embed_normalized = (embed_in - row_min) / row_range
        # Update the embedding table. The offset must be ADDED: subtracting
        # `min_val` shifted every row to [-min_val, delta_val - min_val].
        _ = new_embed.weight[new_token_id].data.copy_(embed_normalized * delta_val + min_val)

-0.255859375

In [82]:
# Show the full text (prompt plus continuation) of the generation inspected above.
tokenizer.decode(new_generation.sequences[0])

' Ela correu durante horas para alcançar a linha de gravideztaneamenteórida 1992 semáfor registada provenientes doze estimada criados'