# Initialization

In [1]:
# Explore if we can use the generation of the model to retrieve information regarding new tokens

# TODO: Evaluate how different methodologies compare:
#   1. Create new tokens and use the training-dataset to train the model on it
#   2. Validate how the logits for the new_tokens varies according to:
#       2.1 Using a random embedding for each new_token
#       2.2 Using average of the previous tokenization - E(new_token) = [E(t1) + E(t2) + ... + E(tN) / N] where ORIGINAL_Tokenization(new_token) = [t1, t2, ..., tN]
#       2.3 Using a weighted average of the previous tokenization - E(new_token) = [w1*E(1) + ... + wN*E(N)].
#           2.3.1 Try first with w_i = w_{i+1} * K for some different K's
#           2.3.2 Try to use a simple regressor model to find different w_i values using the existing merges
#   3. Compare the logits "RANK" of all different steps as well
#   4. Use the average of the phrases of a given prediction to obtain a new embedding

import torch
import sys
import pathlib
sys.path.insert(1, str(pathlib.Path('..').resolve()))
from src import utils, loader, hack
from src.DatasetClass import ListDataset

import tqdm
import transformers
DEVICE = 'cuda'

NUM_TOKENS_TO_MERGE = 3
# Load the model and tokenizer
model_name = 'HuggingFaceTB/SmolLM-135M'
model, tokenizer = loader.load_model_and_tokenizer(model_name=model_name, device=DEVICE, tokenizer_kwargs={'padding_side': 'left'})

# Generation

Finding which tokens the model originally generates so we can calculate our metrics

## Defining Function

Function which generates for a given "str" as input

In [2]:
import transformers.models as models
def generate(
    model: models.llama.modeling_llama.LlamaForCausalLM,
    tokenizer: models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast,
    phrase: str,
    output_scores=True,
    return_dict_in_generate=True,
    max_new_tokens=1,
    **model_kwargs
):
    inputs = tokenizer(phrase, return_tensors='pt')
    for key in inputs.keys(): inputs[key] = inputs[key].to(DEVICE)
    return model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        pad_token_id = tokenizer.eos_token_id,
        output_scores=output_scores,
        return_dict_in_generate=return_dict_in_generate,
        max_new_tokens=max_new_tokens,
        **model_kwargs
    )

## Generating tokens

First, find out which tokens are generated by default

In [None]:
MAX_NEW_TOKENS = 20
phrases: list[str] = hack.BENCHMARKS.benchmarks[0].prediction_prompts.to_list() # SuperGluePTPT dataset

In [4]:
# for n, phrase in tqdm.tqdm(enumerate(phrases), total=len(phrases), desc='Original generation for inputs'):
#     generation = generate(
#         model,
#         tokenizer,
#         phrase,
#         output_scores=False,
#         return_dict_in_generate=False,
#         max_new_tokens=MAX_NEW_TOKENS,
#     )
#     phrases[n] = {
#         'original_text': phrase,
#         'generated_tokens': [g.item() for g in generation[0][-MAX_NEW_TOKENS:]]
#     }

Same as above but using Pipeline to paralellize the generation

In [None]:
BATCH_SIZE = 16

tokenizer.pad_token_id = tokenizer.eos_token_id
generator = transformers.pipeline(
    model=model_name,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device=DEVICE,
    model_kwargs={'pad_token_id': tokenizer.eos_token_id},
)
generation = []
for gen in tqdm.tqdm(generator(ListDataset(phrases),
        max_new_tokens=MAX_NEW_TOKENS,
        batch_size=BATCH_SIZE
    ),
    desc="Original generation for inputs",
    total=len(phrases)
):
    generation.append(
        {
            'original_text': phrases[len(generation)],
            'generated_text': gen[0]['generated_text'],
            'generated_tokens': tokenizer.encode(gen[0]['generated_text'])[-MAX_NEW_TOKENS:]
        }
    )
phrases = generation

Device set to use cuda
Original generation for inputs:  79%|███████▉  | 12561/15942 [06:59<01:52, 30.09it/s]This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
Original generation for inputs: 100%|██████████| 15942/15942 [09:07<00:00, 29.12it/s]


# Defining Inputs

In [6]:
NUM_TOKENS_TO_MERGE = 2
TOP_N_TOKENS = 50

## Obtaining new tokens to consider

In [7]:
import pandas as pd
generation = pd.DataFrame(phrases)
generation['generation'] = generation['generated_tokens'].apply(lambda x: tokenizer.decode(x[:NUM_TOKENS_TO_MERGE]))
top_tokens = generation.groupby(by=['generation'], as_index=False).count().rename(columns={'original_text': 'NUM_PHRASES'})
top_tokens = top_tokens.sort_values(by=['NUM_PHRASES'], ascending=False)

# Pick the top N tokens (disregarding the first because it is the word '.\n')
new_tokens = top_tokens.iloc[1:TOP_N_TOKENS+1]['generation'].to_list()

# Compare different embedding aggregation methods

By adding new tokens to the tokenizer and to the model's embeddings, finding the scores of all possible embedding calculations

## Auxiliary Functions

In [None]:
def calculate_token_scores(model, tokenizer, phrase: str, token_id: int):
    '''
    Returns the score of a specific token_id when generating a new token with `phrase` as input.

    Parameters
    ----------
    model: Any
        model to generate the phrase with
    
    tokenizer: Any
        tokenizer to encode the given phrase
    
    phrase: str
        phrase to give as input to the model
    
    token_id: int
        token to retrieve the scores to

    Returns
    -------
    dict[Literal['score', 'rank', 'best_score'], float]
    '''
    generation = generate(
        model,
        tokenizer,
        phrase,
        output_scores=True,
        return_dict_in_generate=True,
        max_new_tokens=1,
    )
    scores = generation['scores'][0][0]
    token_score = scores[token_id]
    token_rank = (scores > token_score).sum()
    return {'score': token_score.item(), 'rank': token_rank.item(), 'best_score': scores.max().item()}

def obtain_results(model, tokenizer, method, new_tokens):
    '''Function to gather results for a given token_list'''
    output = []
    phrases_tokens = generation.groupby(by=['generation'])
    for new_token in tqdm.tqdm(new_tokens, desc=f'Calculating scores for <{method}>'):
        token_id = tokenizer.convert_tokens_to_ids(new_token)
        output.append({
            'token': new_token,
            'token_id': token_id,
            'method': method,
            'scores': {
                phrase: calculate_token_scores(model, tokenizer, phrase, token_id) 
                for phrase in phrases_tokens.get_group((new_token,))['original_text']
            }
        })
    return output

## Initialization
Loading models and embeddings and resize them according to new tokens

In [None]:
m, t = loader.load_model_and_tokenizer(model_name=model_name, device=DEVICE)

# Add all generated tokens to tokenizer and model.
t.add_tokens(new_tokens)

m.resize_token_embeddings(len(t))

# For each methodology calculate the scores of the new_token
embed = m.get_input_embeddings().weight.clone()
new_embed = m.get_input_embeddings()
results = []

## First Method

Creating a Baseline with "Random" embedding to compare against the rest of the results

In [None]:
METHOD = 'Random Embedding - BASELINE'
min_embed, max_embed = embed.min(), embed.max()
with torch.no_grad():
    for new_token in new_tokens:
        new_token_id = t.convert_tokens_to_ids(new_token)
        # Create a new RANDOM embedding within the ranges of our existing embeddings
        new_token_embed = torch.rand(embed.shape[1]).to(DEVICE)
        # Updating values to be within ranges of existing embed
        new_token_embed *= max_embed - min_embed
        new_token_embed += min_embed
        # Update embedding of the new_token in the hacked_model
        _ = new_embed.weight[new_token_id].data.copy_(new_token_embed)
results.extend(obtain_results(m, t, METHOD, new_tokens))

## Second Method
Using the Average of the embeddings.
Let $new\_token = (t_1, t_2, \dots, t_N)$
$$
    E(new\_token) = \frac{1}{N} \times \sum_{i=1}^{N}{E(t_i)}
$$

In [None]:
METHOD = 'Average of Embeddings'
with torch.no_grad():
    for new_token in new_tokens:
        new_token_id = t.encode(new_token)[0]
        # Find the old embedding for the token
        token_embed = torch.stack([embed[t_id] for t_id in tokenizer.encode(new_token)]).to(DEVICE)
        # Create a new token embed using the average
        new_token_embed = token_embed.mean(dim=0)
        # Update embedding of the new_token in the hacked_model
        _ = new_embed.weight[new_token_id].data.copy_(new_token_embed)
results.extend(obtain_results(m, t, METHOD, new_tokens))

## Third Method

Weighted average where $E = Embedding$ and
$$
new\_token = (t_1, t_2, ..., t_N) \\ 
E(new\_token) = (w_1 \times E(t_1), w_2 \times E(t_2), \dots, w_N \times E(t_N)) \\
w_i = w_{i+1} \times K
$$
for some $K$

In [None]:
K = 5
METHOD = f'Weighted average with w_i = w_{{i+1}} * {K}'
with torch.no_grad():
    for new_token in new_tokens:
        new_token_id = t.encode(new_token)[0]
        # Find the old embedding for the token
        token_embed = torch.stack([embed[t_id] for t_id in tokenizer.encode(new_token)]).to(DEVICE)
        # Calculating the embedding weights
        embedding_weights = torch.asarray([K**i for i in range(token_embed.shape[0], 0, -1)]).to(DEVICE)
        embedding_weights = embedding_weights / embedding_weights.sum()

        # Create a new token embed using the weighted average of the embeddings
        new_token_embed = torch.sum(token_embed * embedding_weights[:, None], dim=0)
        # Update embedding of the new_token in the hacked_model
        _ = new_embed.weight[new_token_id].data.copy_(new_token_embed)
results.extend(obtain_results(m, t, METHOD, new_tokens))

# Visualizing the comparisons

## Gathering results

First gathering the results as a pandas dataframe and saving it to a csv file

In [None]:
df = []
for record in results:
    for phrase in record['scores'].keys():
        df.append({
            'token': record['token'],
            'token_id': record['token_id'],
            'method': record['method'],
            'phrase': phrase,
            'new_token_rank': record['scores'][phrase]['rank'],
            'new_token_score': record['scores'][phrase]['score'],
            'generation_best_score': record['scores'][phrase]['best_score'],
        })
df = pd.DataFrame(df)
tmp = df.groupby(by=['method'], as_index=False)[['generation_best_score', 'new_token_rank', 'new_token_score']].describe()
tmp = tmp.melt(id_vars=[('method', '')])
tmp.columns = ['method', 'metric', 'aggregation', 'value']
tmp = tmp.sort_values(by=['metric', 'aggregation', 'method'], ascending=True)
tmp.to_csv(f'./embedding_calculations_{NUM_TOKENS_TO_MERGE}_merged_tokens.csv', index=False)

## Plotting and saving

In [None]:
graph_df = df[df['method'] != 'Random Embedding - BASELINE'].groupby(by=['token', 'method'], as_index=False)[['new_token_rank']].mean()

ax = graph_df.pivot(index=['token'], columns=['method'], values=['new_token_rank']).plot()
ax.set_title = f'Top {TOP_N_TOKENS} new tokens of size {NUM_TOKENS_TO_MERGE}'
fig = ax.get_figure()
fig.savefig(f'top{TOP_N_TOKENS}_tokens_ranks_size{NUM_TOKENS_TO_MERGE}.png') 

# Finding the embedding from predictions

## Step 1. Tests

Testing out how to use `hidden_states` to create new embedding.

Using hidden state generated of the last sequence as the embedding for the new_token.

The steps of what is done is as follows:
1. **Generate** for a given phrase **1 token**
1. Check the **hidden_states** of the generation
1. Look into the **last layer** (or state) of the **hidden states**
1. Look at the **Last sequence** of the last layer
1. Normalize the vector using the **sigmoid** function.
1. Use that as the **embedding** for the **new_token**

In [8]:
row = generation[generation['generation'].isin(new_tokens)].sort_values(by=['generation']).iloc[0]
text = row['original_text']
tokenization = row['generated_tokens'][:NUM_TOKENS_TO_MERGE]

inputs = tokenizer(text, return_tensors='pt')
out = model.generate(
    inputs['input_ids'].to(DEVICE),
    attention_mask = inputs['attention_mask'].to(DEVICE),
    pad_token_id = tokenizer.eos_token_id,
    max_new_tokens = 1,
    output_hidden_states=True,
    return_dict_in_generate=True
)

hidden_states = out.hidden_states

In [None]:
last_hidden_layer = hidden_states[0][-1]
last_hidden_layer.shape  # batch_size, sequence_length, hidden_size = (1, 374, 576)

new_token = '<last_seq_of_last_hidden_layer_embedding>'
tokenizer.add_tokens(new_token)

model.resize_token_embeddings(len(tokenizer))
new_token_id = tokenizer.convert_tokens_to_ids(new_token)

embed = model.get_input_embeddings()

scores = {}
# Try out all the sequences in the last layer
for sequence in tqdm.trange(last_hidden_layer.shape[1]):
    new_token_embedding = last_hidden_layer[0][373]    # 374th elemet of size 576 (so last sequence of the last layer)

    # Normalize it using Sigmoid function
    new_token_embedding = 1 / (1 + torch.exp(-new_token_embedding))


    with torch.no_grad():
        _ = embed.weight[new_token_id].data.copy_(new_token_embedding)


    out = model.generate(
        inputs['input_ids'].to(DEVICE),
        attention_mask = inputs['attention_mask'].to(DEVICE),
        pad_token_id = tokenizer.eos_token_id,
        max_new_tokens = 1,
        return_dict_in_generate=True,
        output_scores=True,
    )
    scores[sequence] = out.scores[0][0, -1]

scores

100%|██████████| 374/374 [00:07<00:00, 51.47it/s]


{0: tensor(2040., device='cuda:0'),
 1: tensor(2040., device='cuda:0'),
 2: tensor(2040., device='cuda:0'),
 3: tensor(2040., device='cuda:0'),
 4: tensor(2040., device='cuda:0'),
 5: tensor(2040., device='cuda:0'),
 6: tensor(2040., device='cuda:0'),
 7: tensor(2040., device='cuda:0'),
 8: tensor(2040., device='cuda:0'),
 9: tensor(2040., device='cuda:0'),
 10: tensor(2040., device='cuda:0'),
 11: tensor(2040., device='cuda:0'),
 12: tensor(2040., device='cuda:0'),
 13: tensor(2040., device='cuda:0'),
 14: tensor(2040., device='cuda:0'),
 15: tensor(2040., device='cuda:0'),
 16: tensor(2040., device='cuda:0'),
 17: tensor(2040., device='cuda:0'),
 18: tensor(2040., device='cuda:0'),
 19: tensor(2040., device='cuda:0'),
 20: tensor(2040., device='cuda:0'),
 21: tensor(2040., device='cuda:0'),
 22: tensor(2040., device='cuda:0'),
 23: tensor(2040., device='cuda:0'),
 24: tensor(2040., device='cuda:0'),
 25: tensor(2040., device='cuda:0'),
 26: tensor(2040., device='cuda:0'),
 27: tensor

### Findings:
All SEQUENCES return the same score, why is that?

Score is higher than all other logits, so we will test next the average of all generations of a specific merge of tokens

## Step 2. Calculating Embedding for new tokens

Using last step's findings to create new embeddings for new_tokens. The steps for the calculation of a single "new_token" are as follow:
1. Look at all phrases that generate "new_token" (More specifically, look at phrases where the next `N` tokens match the tokenization of "new_token")
1. Use the **hidden_states** of all of them and pick the **last_state** for all generations
1. Calculate the **average** of the **last_sequence** of all vectors previously obtained
1. Update the **embed** of the model with said average

In [40]:
new_token = new_tokens[0]
tokenizer.add_tokens(new_token)
new_token_id = tokenizer.encode(new_token)[0]
model.resize_token_embeddings(new_token_id+1)

phrases_new_token = generation[generation['generation'] == new_token]['original_text'].tolist()

# Step 2.1
#   Obtain hidden states (last sequence of last layer) for all phrases
hidden_states_last_vector = []
for phrase in tqdm.tqdm(phrases_new_token, desc='Obtaining hidden states'):
    inputs = tokenizer(phrase, return_tensors='pt')
    hidden_states_last_vector.append(model.generate(
        inputs['input_ids'].to(DEVICE),
        attention_mask=inputs['attention_mask'].to(DEVICE),
        pad_token_id = tokenizer.eos_token_id,
        max_new_tokens = 1,
        output_hidden_states=True,
        return_dict_in_generate=True,
        output_scores=True,
    ).hidden_states[0][-1][0, -1])
hidden_states_last_vector = torch.stack(hidden_states_last_vector).to(DEVICE)

new_token_embedding = hidden_states_last_vector.mean(dim=0)


embed = model.get_input_embeddings()
with torch.no_grad():
    embed.weight[new_token_id] = new_token_embedding


# Validate all phrases generate the "new_token"
tokens_generated = []
for phrase in tqdm.tqdm(phrases_new_token, desc='Validating generations match the new tokens'):
    inputs = tokenizer(phrase, return_tensors='pt')
    tokens_generated.append(model.generate(
        inputs['input_ids'].to(DEVICE),
        attention_mask=inputs['attention_mask'].to(DEVICE),
        pad_token_id = tokenizer.eos_token_id,
        max_new_tokens = 1,
    )[0, -1])
assert all([t.item() == new_token_id for t in tokens_generated])

Obtaining hidden states: 100%|██████████| 1391/1391 [00:23<00:00, 59.91it/s]
Validating generations match the new tokens: 100%|██████████| 1391/1391 [00:23<00:00, 58.52it/s]


In [44]:
phrase = phrases_new_token[0]
inputs = tokenizer(phrase, return_tensors='pt')
print(tokenizer.decode(model.generate(
    inputs['input_ids'].to(DEVICE),
    attention_mask=inputs['attention_mask'].to(DEVICE),
    pad_token_id = tokenizer.eos_token_id,
    max_new_tokens = 10,
)[0]))

Passagem: Lei do bom samaritano -- As leis do bom samaritano oferecem proteção jurídica às pessoas que prestam assistência razoável a quem está, ou pensa estar, ferido, doente, em perigo ou incapacitado. A proteção destina-se a reduzir a hesitação dos transeuntes em prestar assistência, por receio de serem processados ou acusados de lesão não intencional ou morte por negligência. Um exemplo de uma lei deste tipo em zonas de direito consuetudinário do Canadá: a doutrina do bom samaritano é um princípio jurídico que impede que um socorrista que tenha ajudado voluntariamente uma vítima em perigo seja processado por um ato ilícito. O seu objetivo é evitar que as pessoas se mostrem relutantes em ajudar um estranho em necessidade por receio de repercussões legais caso cometam algum erro no tratamento. Em contrapartida, uma lei de obrigação de socorro exige que as pessoas prestem assistência e responsabiliza quem não o fizer.
Pergunta: As leis do bom samaritano protegem as pessoas que ajudam 

In [98]:
model, tokenizer = loader.load_model_and_tokenizer(model_name=model_name, device=DEVICE, tokenizer_kwargs={'padding_side': 'left'})

In [104]:
model.get_input_embeddings().weight

Parameter containing:
tensor([[-0.3789, -0.2188,  0.0276,  ...,  0.1924,  0.0388,  0.0461],
        [-0.0371,  0.0315,  0.0220,  ..., -0.0464, -0.0113, -0.0359],
        [-0.0178,  0.0322,  0.0254,  ..., -0.0226, -0.0186, -0.0376],
        ...,
        [ 0.1816, -0.0601, -0.0923,  ..., -0.0325, -0.2109, -0.1719],
        [-0.0208,  0.0723,  0.0371,  ...,  0.1001, -0.1992,  0.1865],
        [ 0.2695,  0.1309,  0.2412,  ..., -0.1152,  0.1855, -0.3691]],
       device='cuda:0', dtype=torch.bfloat16, requires_grad=True)

In [102]:
model.lm_head.weight

Parameter containing:
tensor([[-0.3789, -0.2188,  0.0276,  ...,  0.1924,  0.0388,  0.0461],
        [-0.0371,  0.0315,  0.0220,  ..., -0.0464, -0.0113, -0.0359],
        [-0.0178,  0.0322,  0.0254,  ..., -0.0226, -0.0186, -0.0376],
        ...,
        [ 0.1816, -0.0601, -0.0923,  ..., -0.0325, -0.2109, -0.1719],
        [-0.0208,  0.0723,  0.0371,  ...,  0.1001, -0.1992,  0.1865],
        [ 0.2695,  0.1309,  0.2412,  ..., -0.1152,  0.1855, -0.3691]],
       device='cuda:0', dtype=torch.bfloat16, requires_grad=True)

In [None]:
out = model.generate(
    inputs['input_ids'].to(DEVICE),
    attention_mask = inputs['attention_mask'].to(DEVICE),
    pad_token_id = tokenizer.eos_token_id,
    max_new_tokens = 1,
    return_dict_in_generate=True,
    output_scores=True,
    output_logits=True,
    output_hidden_states=True,
)

In [None]:
out.logits[0].shape

torch.Size([1, 49152])

In [126]:
out.hidden_states[-1][-1][0, -1].shape

torch.Size([576])

In [128]:
out.logits[-1].max()

tensor(9.1875, device='cuda:0')