# Introduction

Exploring how to add new tokens, together with newly initialized embeddings, to a different/bigger model.

In [1]:
import torch
import sys
import pathlib
import tqdm
import os
import re
os.chdir('/home/yali/MEGA/Hack The Tockenizer/tests')
sys.path.insert(1, pathlib.Path('..').resolve().as_posix())
from src import utils, loader, hack
from src.DatasetClass import ListDataset, TextDataset
from torch.utils.data import DataLoader


# Run on the GPU when torch can see one; otherwise fall back to the CPU so the
# notebook still executes (slowly) on machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Model identifier handed to `loader.load_model_and_tokenizer`.
MODEL = 'Qwen/Qwen2.5-1.5B-Instruct'
# Prediction prompts of the benchmark at index 1 (CalamePT dataset), as a plain list.
phrases: list[str] = hack.BENCHMARKS.benchmarks[1].prediction_prompts.to_list()

In [2]:
# Build the model/tokenizer pair used by every later cell.
# The kwargs ask for bfloat16 weights and left-side padding; they are presumably
# forwarded to the underlying model/tokenizer constructors -- verify in `src.loader`.
model, tokenizer = loader.load_model_and_tokenizer(
    model_name=MODEL,
    device=DEVICE,
    model_kwargs=dict(torch_dtype=torch.bfloat16),
    tokenizer_kwargs=dict(padding_side='left'),
)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


## Find words generated

Using the datasets we have, find which tokens the model originally generates.

We will later create a `new_token` with the first word that was generated.

In [3]:
# Prompt under study: first CalamePT prediction prompt, without surrounding spaces.
phrase = hack.BENCHMARKS.benchmarks[1].prediction_prompts[0].strip(' ')     # Strip "spaces"
print('Input phrase: `{}`'.format(phrase))

# Let the unmodified model continue the prompt for a few tokens.
generation = utils.generate(
    model, tokenizer,
    phrase=phrase,
    device=DEVICE,
    return_dict_in_generate=False,
    output_logits=False,
    max_new_tokens=10
)
generated_phrase = tokenizer.decode(generation[0])
print('Generated Phrase: `{}`'.format(generated_phrase))

# Finding the first word generated.
# Only the first occurrence of the prompt is removed, so a continuation that happens
# to repeat the prompt is left intact.
continuation = generated_phrase.replace(phrase, '', 1)
# One optional leading space followed by a run of letters. `[^\W\d_]` matches any
# Unicode letter, so accented Portuguese words ("público", "árvores") are kept whole.
# The previous pattern `[a-z|A-Z| ]+` was ASCII-only, treated `|` as a literal
# character and, because it allowed inner spaces, could return several words or a lone space.
first_word = re.search(r' ?[^\W\d_]+', continuation)
if first_word is None:
    raise ValueError('No word found in the generated continuation: `{}`'.format(continuation))
new_token = first_word.group(0)
# `add_special_tokens=False` keeps BOS/EOS ids out of the tokenization on models whose
# tokenizer adds them by default.
old_tokenization = tokenizer.encode(new_token, add_special_tokens=False)
print('New token generated: `{}`(={})'.format(new_token, old_tokenization))

Input phrase: `Ela correu durante horas para alcançar a linha de`
Generated Phrase: `Ela correu durante horas para alcançar a linha de chegada, mas acabou perdendo por apenas`
New token generated: ` chegada`(=[96940, 2584])


## Adding the new token to the model

Using the word found in the previous step, add it as a new_token to the tokenizer and model

In [4]:
# Register the word as one vocabulary entry and read back the id assigned to it.
tokenizer.add_tokens(new_token)
new_token_id = tokenizer.convert_tokens_to_ids(new_token)
print('New Token: `{}`(ID={})'.format(new_token, new_token_id))

# Grow the embedding table so it has a row for the freshly added id.
model.resize_token_embeddings(len(tokenizer))

# The new row starts as the plain mean of the rows of the sub-tokens that
# previously spelled the word.
embed = model.get_input_embeddings()

with torch.no_grad():
    previous_rows = embed.weight[old_tokenization]      # one row per old sub-token
    new_embed = previous_rows.mean(dim=0).to(DEVICE)
    _ = embed.weight[new_token_id].data.copy_(new_embed)

New Token: ` chegada`(ID=151665)


## Calculate the new generation with the newly added token

Validate the logits of the newly added token

In [5]:
# Re-run generation for the same prompt now that the tokenizer/model contain the new
# token, asking for the full generation record (logits, scores, hidden states).
new_generation = utils.generate(
    model, tokenizer,
    phrase=phrase,
    device=DEVICE,
    **dict.fromkeys(
        ('return_dict_in_generate', 'output_logits', 'output_scores', 'output_hidden_states'),
        True,
    ),
)

## Calculate new embeddings

Use the "gradient" approach to calculate new embeddings

### Generation Baseline

Generating for the first 10 phrases to validate the baseline

In [6]:
# Baseline: continuations for the first 10 prompts, collected before the bulk
# vocabulary expansion below.
original_generation = []
for i in range(10):
    prompt = phrases[i]
    outcome = utils.generate(
        model, tokenizer,
        phrase=prompt,
        device=DEVICE,
        max_new_tokens=10,
        return_dict_in_generate=True,
        output_logits=True,
        output_scores=True,
        output_hidden_states=True,
    )
    original_generation.append(outcome)
    # Show the last 10 ids of the returned sequence; presumably these are exactly the
    # newly generated tokens when generation uses all `max_new_tokens` -- TODO confirm.
    continuation = tokenizer.decode(outcome.sequences[0, -10:])
    print('<Phrase {}>[{}]{}'.format(i, prompt, continuation))

<Phrase 0>[ Ela correu durante horas para alcançar a linha de] chegada e ganhou o prémio.
<Phrase 1>[Ela cantou tão bem no concerto que emocionou o] público.
Aqui está uma tradução alternativa
<Phrase 2>[Os ventos fortes causaram com que algumas árvores] caíssem e danificassem o cas
<Phrase 3>[O Jorge trabalhava numa padaria. Todos os dias ele vendia] 120 biscoitos, e gan
<Phrase 4>[As equipas de futebol têm vários jogadores, e todos têm que respeitar o seu] papel na equipe. Quem é o jogador mais
<Phrase 5>[Os pássaros voaram alto no] dia 10 de dezembro, quando o
<Phrase 6>[O sol brilhou intensamente durante o] dia, mas quando a noite chegou,
<Phrase 7>[A chuva caiu suavemente sobre as folhas das] árvores, refletindo o br
<Phrase 8>[A comida estava deliciosa e deixou todos] felizes. Como a comida era doce,
<Phrase 9>[Era noite de Natal, e a criança sorriu ao receber o] presente. Eram dois pães com re


### Train Tokenizer

Obtain the new tokens given the phrases we have

In [7]:
# Step 1. Train a fresh Portuguese vocabulary (10k entries requested).
pt_tokenizer = hack.TokenizerHack(device=DEVICE).train_tokenizer(trainer_kwargs={'vocab_size': 10000})

# Step 2. Keep the entries of `pt_tokenizer` that the model's tokenizer does not already have.
new_tokens = set(pt_tokenizer.get_vocab()) - set(tokenizer.vocab)
# Run every surviving entry through the model tokenizer's decoder so markers such as
# 'Ġ' become real characters; the set collapses any duplicates this produces.
new_tokens = {tokenizer.decoder.decode([raw_token]) for raw_token in new_tokens}






### Update Vocab + Embeddings

Add the new tokens to the tokenizer and resize the embedding table to include them

In [8]:
# Save the original tokenizations
original_tokenization = {t: tokenizer.encode(t) for t in new_tokens}
tokenizer.add_tokens(list(new_tokens))
model.resize_token_embeddings(len(tokenizer))

# Step 4. Calculate the new embeddings for the new tokens
embed = model.get_input_embeddings().weight.clone().to('cpu')
new_embed = model.get_input_embeddings()

# Initialize the embedding using the weighted average model
K = 5
with torch.no_grad():
    for new_token in tqdm.tqdm(new_tokens, desc='Initializing the embeddings for the new_tokens'):
        new_token_id = tokenizer.encode(new_token)[0]
        # Find the old embedding for the token
        tokenization = original_tokenization[new_token]
        token_embed = torch.stack([embed[t_id] for t_id in tokenization]).to(DEVICE)
        # Calculating the embedding weights
        embedding_weights = torch.asarray([K**i if K**i < 2**64 else 0 for i in range(token_embed.shape[0], 0, -1)]).to(DEVICE)
        # embedding_weights = torch.asarray([K**i for i in range(token_embed.shape[0], 0, -1)]).to(DEVICE)
        embedding_weights = embedding_weights / embedding_weights.sum()

        # Create a new token embed using the weighted average of the embeddings
        new_token_embed = torch.sum(token_embed * embedding_weights[:, None], dim=0)
        # Update embedding of the new_token in the hacked_model
        _ = new_embed.weight[new_token_id].data.copy_(new_token_embed)


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Initializing the embeddings for the new_tokens: 100%|██████████| 5040/5040 [00:00<00:00, 8284.31it/s]


### Generation After Adding Tokens

Generating for the first 10 phrases after adding the tokens without any "training"

In [9]:
# Same 10 prompts as the baseline, now with the expanded vocabulary and the
# initialised (not yet trained) embeddings.
generation_with_new_tokens = []
for i in range(10):
    prompt = phrases[i]
    outcome = utils.generate(
        model, tokenizer,
        phrase=prompt,
        device=DEVICE,
        max_new_tokens=10,
        return_dict_in_generate=True,
        output_logits=True,
        output_scores=True,
        output_hidden_states=True,
    )
    generation_with_new_tokens.append(outcome)
    # Decode the last 10 ids of the returned sequence (assumed to be the new tokens).
    continuation = tokenizer.decode(outcome.sequences[0, -10:])
    print('<Phrase {}>[{}]{}'.format(i, prompt, continuation))

<Phrase 0>[ Ela correu durante horas para alcançar a linha de] meta. 2850. A mena es
<Phrase 1>[Ela cantou tão bem no concerto que emocionou o] público e foi um dos grandes destaques da no
<Phrase 2>[Os ventos fortes causaram com que algumas árvores]siasem aço cãorença de queb o p
<Phrase 3>[O Jorge trabalhava numa padaria. Todos os dias ele vendia] 41500000000 kg de peixes
<Phrase 4>[As equipas de futebol têm vários jogadores, e todos têm que respeitar o seu]u tempo para jogar. A equipa tem
<Phrase 5>[Os pássaros voaram alto no] céu, enquanto o pãoeregrino estava
<Phrase 6>[O sol brilhou intensamente durante o] mto dois anos. E oeste mandate
<Phrase 7>[A chuva caiu suavemente sobre as folhas das]. 
Sobre aspeto folas da semente
<Phrase 8>[A comida estava deliciosa e deixou todos] impressionados com a qualidade. O que isso significa
<Phrase 9>[Era noite de Natal, e a criança sorriu ao receber o] aniversário do seu irmão mais velho


### Train the embedding

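For every prompt that should be followed by a new token, nudge that token's embedding along the normalized final hidden state, scaled by the gap between the top logit and the new token's logit (a hand-written "gradient" step).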

In [10]:
# Step 4.2 Using the training phrases to update the embedding weights
learning_rate = 1e-5
BATCH_SIZE = 8
for new_token in tqdm.tqdm(new_tokens, desc='Updating the embeddings for the new tokens'):
    new_token_id = tokenizer.convert_tokens_to_ids(new_token)
    new_token = tokenizer.decode(new_token_id)
    phrases_to_generate_new_token = [p for phrase in phrases for p in phrase.split(new_token)[:-1] if new_token in phrase and len(p) > 0]

    if len(phrases_to_generate_new_token) == 0: continue
    # Creating the Batched dataset (to run generation for multiple phrases at the same time)
    dataloader = DataLoader(
        TextDataset(phrases_to_generate_new_token, tokenizer, max_length=max(len(tokenizer.tokenize(x)) for x in phrases_to_generate_new_token)),
        batch_size=BATCH_SIZE,
        shuffle=False
    )
    # Process the batches
    for batch in tqdm.tqdm(dataloader,  desc=f'  Generating tokens for new_token=`{new_token}` ', leave=False):
        # Move batch tensors to the correct device
        input_ids = batch['input_ids'].squeeze(1).to(DEVICE)
        attention_mask = batch['attention_mask'].squeeze(1).to(DEVICE)

        # Generate text
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=1,
            num_beams=1,
            num_return_sequences=1,
            return_dict_in_generate=True,
            output_logits=True,
            output_scores=True,
            output_hidden_states=True,
            pad_token_id=tokenizer.pad_token_id
        )

        # Extract the generated sequences and their scores
        generated_sequences = outputs.sequences
        predicted_logits = outputs.logits

        # Decode the input and generated sequences
        input_texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
        generated_texts = tokenizer.batch_decode(generated_sequences, skip_special_tokens=True)
        with torch.no_grad():
            for i in range(len(input_texts)):
                # generation.append({
                #     'generated_sequences': generated_sequences[i].to('cpu'),
                #     'prediction_scores': predicted_logits[0][i].to('cpu'),
                #     'input_texts': input_texts[i],
                #     'generated_texts': generated_texts[i],
                #     'hidden_states': [hidden_state[i].to('cpu') for hidden_state in outputs.hidden_states[0]]
                # })

                logits = predicted_logits[0][i]
                logit_gradient = logits.max() - logits[new_token_id]
                embed_out = outputs.hidden_states[0][-1][i][-1]
                # normalize embed_out
                embed_out = embed_out / embed_out.norm()

                embed_in = new_embed.weight[new_token_id]

                # Update the embedding table
                _ = new_embed.weight[new_token_id].data.copy_((embed_in + logit_gradient * embed_out * learning_rate).to(DEVICE))

Updating the embeddings for the new tokens: 100%|██████████| 5040/5040 [44:27<00:00,  1.89it/s]  


In [11]:
# Same 10 prompts once more, this time after the embedding update loop has run.
# NOTE: this rebinds `generation_with_new_tokens`, replacing the pre-training results.
generation_with_new_tokens = []
for i in range(10):
    prompt = phrases[i]
    outcome = utils.generate(
        model, tokenizer,
        phrase=prompt,
        device=DEVICE,
        max_new_tokens=10,
        return_dict_in_generate=True,
        output_logits=True,
        output_scores=True,
        output_hidden_states=True,
    )
    generation_with_new_tokens.append(outcome)
    # Decode the last 10 ids of the returned sequence (assumed to be the new tokens).
    continuation = tokenizer.decode(outcome.sequences[0, -10:])
    print('<Phrase {}>[{}]{}'.format(i, prompt, continuation))

<Phrase 0>[ Ela correu durante horas para alcançar a linha de] meta.
 12 324. La carrera dnt
<Phrase 1>[Ela cantou tão bem no concerto que emocionou o] público - 102015500/0015/;22
<Phrase 2>[Os ventos fortes causaram com que algumas árvores]sas vezes tenham queim usar mais de 19915
<Phrase 3>[O Jorge trabalhava numa padaria. Todos os dias ele vendia] 1512000000000 gramas de pão
<Phrase 4>[As equipas de futebol têm vários jogadores, e todos têm que respeitar o seu]u espaço no fundo do campo. Acho
<Phrase 5>[Os pássaros voaram alto no] céu, e oitava párado foi o
<Phrase 6>[O sol brilhou intensamente durante o] soluções de lhesuna, oeste que faz comem quei
<Phrase 7>[A chuva caiu suavemente sobre as folhas das] da árvore, que está em flores
<Phrase 8>[A comida estava deliciosa e deixou todos] felizes. Apenas um detalhe:
<Phrase 9>[Era noite de Natal, e a criança sorriu ao receber o] pão com aleg mãe. A crã 1


### Calculating the rankings of all new tokens in the phrases where they SHOULD appear.


In [34]:
# For every new token, measure how the model ranks it as the next token on each prompt
# prefix that is actually followed by that token in the benchmark phrases.
results = []
for new_token in tqdm.tqdm(list(new_tokens), 'Calculating new logits after "training"'):
    if not new_token:
        # An empty pattern would match at every position of every phrase.
        continue
    # Resolve the id of THIS token. Without this lookup the loop silently reuses
    # whatever `new_token_id` an earlier cell left behind, so every row would be
    # scored against the same, unrelated token.
    new_token_id = tokenizer.convert_tokens_to_ids(new_token)
    # Full text that precedes each occurrence of the token (whole prefix, so repeated
    # occurrences keep their earlier context).
    phrases_to_generate_new_token = [
        phrase[:occurrence.start()]
        for phrase in phrases
        for occurrence in re.finditer(re.escape(new_token), phrase)
        if occurrence.start() > 0
    ]

    if len(phrases_to_generate_new_token) == 0: continue
    # Creating the Batched dataset (to run generation for multiple phrases at the same time)
    dataloader = DataLoader(
        TextDataset(phrases_to_generate_new_token, tokenizer, max_length=max(len(tokenizer.tokenize(x)) for x in phrases_to_generate_new_token)),
        batch_size=BATCH_SIZE,
        shuffle=False
    )
    # Process the batches
    for batch in tqdm.tqdm(dataloader,  desc=f'  Generating tokens for new_token=`{new_token}` ', leave=False):
        # Move batch tensors to the correct device
        input_ids = batch['input_ids'].squeeze(1).to(DEVICE)
        attention_mask = batch['attention_mask'].squeeze(1).to(DEVICE)

        # Generate text (a single step: only the next-token logits are scored)
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=1,
            num_beams=1,
            num_return_sequences=1,
            return_dict_in_generate=True,
            output_logits=True,
            output_scores=True,
            output_hidden_states=True,
            pad_token_id=tokenizer.pad_token_id
        )

        # Extract the generated sequences and the logits of the single generated step
        generated_sequences = outputs.sequences.to('cpu')
        predicted_logits = outputs.logits[0].to('cpu')

        # Decode prompt + generated token for the report
        generated_texts = tokenizer.batch_decode(generated_sequences, skip_special_tokens=True)

        # One record per prompt in the batch
        results.extend(
            [
                {
                    'new_token': new_token,
                    'new_token_id': new_token_id,
                    # Number of vocabulary entries scored strictly higher (0 == top choice)
                    'rank': (logits>logits[new_token_id]).sum().item(),
                    'logit': logits[new_token_id].item(),
                    'maximum_logit': logits.max().item(),
                    'generated_sequence': sequence
                }
                for logits, sequence in zip(predicted_logits, generated_texts)
            ]
        )

Calculating new logits after "training": 100%|██████████| 5040/5040 [56:06<00:00,  1.50it/s]  


In [36]:
import pandas as pd

# Turn the list of per-prompt records into a table and persist it next to the notebook.
# NOTE: `results` is rebound from a list of dicts to a DataFrame here.
results = pd.DataFrame(data=results)
results.to_csv(path_or_buf='./gwen1.5_trained_model_logits.csv')