In [None]:
from tqdm.auto import tqdm

In [None]:
from models.madlib import MadlibModel

## Setting up the Model

In [None]:
import torch

# Report which device PyTorch can use; GPU details are printed only when CUDA is present.
if not torch.cuda.is_available():
    print("CUDA is NOT available. PyTorch will run on CPU.")
else:
    print("CUDA is available! PyTorch can use the GPU.")
    # Extra details: how many GPUs are visible, and the name of the GPU at index 0.
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    print(f"Current GPU device name: {torch.cuda.get_device_name(0)}")

In [None]:
# Passed to MadlibModel(epsilon=...) below and embedded in the output CSV file name.
epsilon = 1_000_000

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the model with two labels and the epsilon configured above,
# and keep a handle on the tokenizer it exposes.
model = MadlibModel(num_labels=2, epsilon=epsilon)
tokenizer = model.tokenizer

# Move the model to the chosen device and switch it to eval mode.
# Left as the last expression so the notebook still displays the result.
model.to(device).eval()

In [None]:
# Weight tensor of the model's `original_emb` layer; it is passed to
# compute_closest_embeddings below as the reference matrix for the nearest-token search.
# NOTE(review): presumably an nn.Embedding weight (a Parameter that requires grad) — confirm.
embedding_matrix = model.original_emb.weight  # Shape: (vocab_size, hidden_size)

## Setting up the Data

In [None]:
from dataset import MovieDataset  # Make sure dataset.py is in the same directory as this notebook
from torch.utils.data import DataLoader

# Build the train split first, then the test split, both with max_length=128.
train_dataset, test_dataset = (
    MovieDataset(train=is_train, max_length=128) for is_train in (True, False)
)

# The label count is read from the dataset itself.
num_labels = train_dataset.num_labels

# One loader per split; shuffle=False keeps the dataset order, so later
# sentence indices follow the order of the underlying dataset.
train_loader, test_loader = (
    DataLoader(split, batch_size=256, shuffle=False)
    for split in (train_dataset, test_dataset)
)

In [None]:
import torch

def get_word_embeddings_distilbert_batch(input_ids, attention_mask, tokenizer, model):
    """
    Look up token embeddings for a batch of already-tokenized inputs.

    The lookup is delegated to ``model.get_embeddings`` and runs with gradient
    tracking disabled. Padding positions are NOT filtered out here: every
    position of ``input_ids`` gets an embedding.

    Args:
        input_ids (torch.Tensor): Token ids, expected shape (batch_size, seq_len).
        attention_mask (torch.Tensor): Accepted for interface compatibility; not used.
        tokenizer: Accepted for interface compatibility; not used.
        model: Any object exposing ``get_embeddings(input_ids)``.

    Returns:
        tuple: ``(input_ids, token_embeddings)`` where ``input_ids`` is the very
        same tensor that was passed in and ``token_embeddings`` is whatever
        ``model.get_embeddings`` returned (expected (batch_size, seq_len, hidden_dim)).
    """
    # Inference only: no autograd graph is needed for an embedding lookup.
    with torch.no_grad():
        embedded_batch = model.get_embeddings(input_ids)

    return (input_ids, embedded_batch)


In [None]:
def collect_token_embeddings(data_loader, model, device):
    """
    Run the model's embedding lookup over a whole loader and keep only attended tokens.

    Args:
        data_loader: Iterable yielding ``(input_ids, labels, attention_mask)`` batches
            whose three items all support ``.to(device)``.
        model: Object exposing ``eval()`` and ``get_embeddings(input_ids)``.
        device: Device every batch item is moved to before the lookup.

    Returns:
        tuple: ``(token_ids, token_embeddings)`` — two parallel lists with one
        entry per example. ``token_ids[k]`` is a Python list of the ids at the
        positions where the attention mask is non-zero; ``token_embeddings[k]``
        is a CPU tensor holding the embeddings of those same positions.
    """
    token_id_rows = []
    embedding_rows = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="epoch"):
            # Labels are moved along with the rest of the batch but are not used here.
            input_ids, _labels, attention = (part.to(device) for part in batch)
            batch_embeddings = model.get_embeddings(input_ids)  # (batch_size, seq_len, hidden_dim)
            keep = attention.bool()

            # Walk the batch example by example; the boolean mask selects the
            # attended positions in their original order.
            for ids_row, emb_row, keep_row in zip(input_ids, batch_embeddings, keep):
                token_id_rows.append(ids_row[keep_row].cpu().tolist())
                embedding_rows.append(emb_row[keep_row].cpu())

    return token_id_rows, embedding_rows

# Collect per-example token ids and embeddings for both splits (test first, then train).
# Each call iterates over its whole loader, so this cell can take a while.
idx_test, embeddings_test = collect_token_embeddings(test_loader, model, device)
idx_train, embeddings_train = collect_token_embeddings(train_loader, model, device)

## For each token, find the closest token in `embedding_matrix`

In [None]:
# NOTE(review): stray expression — it only displays the built-in `len` function.
# Looks like an unfinished `len(...)` check left over from exploration; safe to delete.
len

In [None]:
# Set batch size parameter
# NOTE(review): no visible cell reads this variable — the DataLoaders hard-code
# batch_size=256 and the compute_closest_embeddings call passes batch_size=1024.
# Either wire it into those calls or drop it.
batch_size = 256

In [None]:
import torch.nn.functional as F
def _iter_token_batches(embedding_tensors, batch_size):
    """Yield consecutive chunks of at most ``batch_size`` token embeddings, spanning sentence boundaries."""
    pending = []
    pending_rows = 0
    for sentence_embeddings in embedding_tensors:
        start = 0
        total = sentence_embeddings.shape[0]
        while start < total:
            take = min(batch_size - pending_rows, total - start)
            # Slicing gives a view; data is only copied by the torch.cat below.
            pending.append(sentence_embeddings[start:start + take])
            pending_rows += take
            start += take
            if pending_rows == batch_size:
                yield torch.cat(pending, dim=0)
                pending = []
                pending_rows = 0
    if pending:
        yield torch.cat(pending, dim=0)


def compute_closest_embeddings(idx_list, embedding_list, embedding_matrix, tokenizer, batch_size=1024):
    """
    Find, for every token embedding, the most cosine-similar row of a reference matrix.

    Tokens of all sentences are processed in order, in chunks of ``batch_size``
    tokens (a chunk may span several sentences). The search runs under
    ``torch.no_grad()`` so no autograd graph is built even when
    ``embedding_matrix`` is a trainable parameter. The closest row may be the
    token's own row; nothing is excluded from the search.

    Args:
        idx_list (list[list[int]]): List of lists, each containing token IDs for a sentence.
        embedding_list (list[Tensor]): One (num_tokens, D) tensor per sentence, aligned
            with ``idx_list``. A sequence of per-token (D,) tensors is also accepted.
        embedding_matrix (Tensor): Tensor of shape (V, D) with reference embeddings.
            The computation runs on this tensor's device.
        tokenizer: Object exposing ``convert_ids_to_tokens(list_of_ids)``
            (e.g. a HuggingFace tokenizer).
        batch_size (int): Number of individual tokens compared per step.

    Returns:
        pd.DataFrame: One row per token, with columns:
            - sentence_id: position of the sentence in ``idx_list``
            - original_token_id
            - closest_token_id: row index into ``embedding_matrix``
            - similarity: cosine similarity to that row
            - original_token
            - closest_token

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or a sentence has a
            different number of token ids and embeddings.
    """
    # Imported here because no visible notebook cell imports pandas; without
    # this the function raises NameError on a fresh kernel.
    import pandas as pd

    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    device = embedding_matrix.device

    # Flatten ids and sentence indices; embeddings stay as per-sentence tensors
    # so no Python object is created per token.
    all_token_ids = []
    all_sentence_ids = []
    sentence_tensors = []
    for sent_id, (token_ids, embeddings) in enumerate(zip(idx_list, embedding_list)):
        if len(embeddings) != len(token_ids):
            # A silent mismatch would misalign every following row of the result.
            raise ValueError(
                f"sentence {sent_id}: {len(token_ids)} token ids but {len(embeddings)} embeddings"
            )
        if len(token_ids) == 0:
            continue
        if not torch.is_tensor(embeddings):
            embeddings = torch.stack(list(embeddings))
        all_token_ids.extend(token_ids)
        all_sentence_ids.extend([sent_id] * len(token_ids))
        sentence_tensors.append(embeddings)

    num_tokens = len(all_token_ids)
    num_batches = (num_tokens + batch_size - 1) // batch_size  # ceil division

    all_closest_token_ids = []
    all_similarities = []

    with torch.no_grad():
        embedding_matrix_norm = F.normalize(embedding_matrix, p=2, dim=1)  # (V, D)

        batches = _iter_token_batches(sentence_tensors, batch_size)
        for batch_embeddings in tqdm(batches, total=num_batches, desc="Processing token batches"):
            emb_norm = F.normalize(batch_embeddings.to(device), p=2, dim=1)  # (batch, D)

            # Cosine similarity: (batch, D) x (D, V) -> (batch, V)
            similarities = torch.matmul(emb_norm, embedding_matrix_norm.T)

            # Best match (value and row index) for each token in the chunk.
            closest_similarities, closest_indices = torch.max(similarities, dim=1)

            all_closest_token_ids.extend(closest_indices.cpu().tolist())
            all_similarities.extend(closest_similarities.cpu().tolist())

    df_results = pd.DataFrame({
        "sentence_id": all_sentence_ids,
        "original_token_id": all_token_ids,
        "closest_token_id": all_closest_token_ids,
        "similarity": all_similarities,
    })

    # Add token strings (plain lists are passed to the tokenizer, not pandas Series).
    df_results["original_token"] = tokenizer.convert_ids_to_tokens(all_token_ids)
    df_results["closest_token"] = tokenizer.convert_ids_to_tokens(all_closest_token_ids)

    return df_results


In [None]:
# Run the nearest-token search over both splits at once. Train sentences come
# first, so sentence_id values below len(idx_train) refer to train examples and
# the rest to test examples (offset by len(idx_train)).
df_results = compute_closest_embeddings(
    idx_list=idx_train + idx_test,
    embedding_list=embeddings_train + embeddings_test,
    embedding_matrix=embedding_matrix,
    tokenizer=tokenizer,
    batch_size=1024  # tokens per similarity step; independent of the DataLoader batch size
)

In [None]:
# Preview the first rows of the nearest-token table.
df_results.head()

In [None]:
from pathlib import Path

# Persist the nearest-token table; epsilon is part of the file name so runs
# with different settings do not overwrite each other.
output_path = Path("data") / f"closest_tokens_distilbert_epsilon{epsilon}.csv"
# to_csv does not create missing directories, so make sure data/ exists first.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_results.to_csv(output_path, index=False)