In [None]:
from models.madlib import MadlibModel

## Setting up the Model

In [None]:
import torch

# Report which device PyTorch will be able to use for the rest of the notebook.
if not torch.cuda.is_available():
    print("CUDA is NOT available. PyTorch will run on CPU.")
else:
    print("CUDA is available! PyTorch can use the GPU.")
    # Extra details about the visible GPU hardware
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    # Index 0 refers to the first GPU
    print(f"Current GPU device name: {torch.cuda.get_device_name(0)}")

In [None]:
# Passed to MadlibModel as `epsilon` and embedded in the output CSV filename.
# NOTE(review): presumably the privacy/noise budget of the Madlib mechanism — confirm against models.madlib.
epsilon = 10

In [None]:
from transformers import DistilBertModel, DistilBertTokenizer

# Checkpoint shared by the tokenizer and the Madlib-wrapped model
model_name = "distilbert/distilbert-base-uncased"

# Prefer the GPU when one is present, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer for the pre-trained DistilBERT base uncased checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# Build the model with the configured epsilon and place it on the chosen device
model = MadlibModel(model_name=model_name, epsilon=epsilon)
model.to(device)

# Switch to evaluation mode (kept as the last expression so the cell still displays the model)
model.eval()

In [None]:
# Weight of the model's `original_emb` layer; later cells use its rows as the
# candidate set for the nearest-token (cosine similarity) search.
embedding_matrix = model.original_emb.weight  # assumed shape: (vocab_size, hidden_size) — TODO confirm

## Setting up the Data

In [None]:
import pandas as pd
# Load the IMDB dataset from a local CSV (path is relative to the notebook's working directory).
# Later cells read its 'review' column.
df = pd.read_csv("data/IMDB Dataset.csv")

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

## Tokenize the text data

In [None]:
def _tokenize_batch(texts, tokenizer, device):
    """
    Tokenizes a batch of text strings and moves the input tensors to the specified device.
    """
    encoded_input = tokenizer(
        texts,
        return_tensors='pt',
        padding=True,
        truncation=True,
        add_special_tokens=True
    ).to(device)
    return encoded_input

In [None]:
def _get_model_hidden_states(model, input_ids, attention_mask):
    """
    Passes input through the model to get the last hidden states (token embeddings).
    """
    with torch.no_grad():
        output = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )
    return output.last_hidden_state

In [None]:
def get_word_embeddings_distilbert_batch(texts, tokenizer, model, device=None):
    """
    Generates per-token embeddings for a batch of texts from the model's last hidden states.

    No subword aggregation is performed: keys are the tokenizer's tokens exactly as
    returned by `convert_ids_to_tokens` (special tokens included, padding excluded).
    Because each text is returned as a dict keyed by token string, a token that occurs
    several times in one text appears once, with the embedding of its LAST occurrence.

    Args:
        texts (list of str): A list of text strings to process.
        tokenizer: The loaded tokenizer (callable, with `convert_ids_to_tokens`).
        model: The loaded model; its output must expose `last_hidden_state`.
        device (torch.device, optional): Device the encoded inputs are moved to.
            Defaults to the device of the model's first parameter (CPU if it has none),
            so the function no longer depends on a notebook-global `device`.

    Returns:
        list of dict: A list, where each element is a dictionary mapping
                      tokens to their embeddings (CPU tensors) for a single text in the batch.
    """
    if device is None:
        # Inputs must live where the model lives; derive it from the model itself.
        first_param = next(iter(model.parameters()), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # 1. Tokenize the batch and move to device
    encoded_input = _tokenize_batch(texts, tokenizer, device)
    input_ids = encoded_input['input_ids']
    attention_mask = encoded_input['attention_mask']

    # 2. Get model outputs (last hidden states)
    token_embeddings_batch = _get_model_hidden_states(model, input_ids, attention_mask)

    # 3. Move everything to the CPU once per batch. Doing `.item()` / `.cpu()` per token
    #    forces a device synchronisation for every single token.
    token_embeddings_cpu = token_embeddings_batch.cpu()
    input_ids_cpu = input_ids.cpu()
    keep_mask = attention_mask.cpu() == 1  # True for real tokens, False for padding

    all_token_embedding_maps = []

    # 4. Build one token -> embedding map per text in the batch
    for ids_row, emb_row, keep_row in zip(input_ids_cpu, token_embeddings_cpu, keep_mask):
        # Convert the non-padding input IDs back to token strings
        kept_tokens = tokenizer.convert_ids_to_tokens(ids_row[keep_row].tolist())
        kept_embeddings = emb_row[keep_row]  # (num_kept_tokens, embedding_dim)

        # clone() gives each entry its own storage, so the dict does not keep the
        # whole per-text tensor alive for rows that were overwritten by duplicates.
        token_map_for_single_text = {
            token: embedding.clone()
            for token, embedding in zip(kept_tokens, kept_embeddings)
        }
        all_token_embedding_maps.append(token_map_for_single_text)

    return all_token_embedding_maps


In [None]:
from tqdm.auto import tqdm # Import tqdm

# Number of reviews sent through the model per forward pass
batch_size = 256

# Accumulates one token -> embedding dict per review, in DataFrame order
all_df_word_embeddings = []

# Walk the DataFrame in fixed-size windows with a tqdm progress bar.
# total is the number of batches, i.e. the ceiling of len(df) / batch_size.
num_batches = (len(df) + batch_size - 1) // batch_size
batch_starts = range(0, len(df), batch_size)
for start in tqdm(batch_starts, total=num_batches, desc="Processing text batches"):
    stop = start + batch_size
    batch_texts = df['review'][start:stop].tolist()

    # Embed this batch and append its per-review maps to the running list
    all_df_word_embeddings.extend(
        get_word_embeddings_distilbert_batch(batch_texts, tokenizer, model)
    )

## For each token, find the closest token in `embedding_matrix`

In [None]:
import torch
import torch.nn.functional as F
import pandas as pd

device = embedding_matrix.device  # run the search on the device that holds the embedding matrix
# detach(): this lookup never back-propagates, so keep the (presumably trainable) weight out of autograd.
embedding_matrix_norm = F.normalize(embedding_matrix.detach(), p=2, dim=1)  # (V, D), unit-length rows

all_tokens = []            # closest vocabulary token for every original token
all_closest_tokens = []    # NOTE(review): never populated in this cell; kept only so the name still exists
all_similarities = []      # cosine similarity between each token and its closest vocabulary token
all_sentence_ids = []      # index of the source entry in all_df_word_embeddings
all_original_tokens = []   # the token whose embedding was looked up

for sent_id, sentence in enumerate(tqdm(all_df_word_embeddings, desc="Processing sentences")):
    if not sentence:
        # torch.stack raises on an empty list; an empty map simply contributes no rows.
        continue

    tokens = list(sentence.keys())
    # The stored values are already tensors. torch.tensor(<tensor>) warns on every call,
    # so reuse them via as_tensor and move the stacked batch to the device in one go.
    embeddings = torch.stack([torch.as_tensor(sentence[tok]) for tok in tokens]).to(device)  # (N, D)
    emb_norm = F.normalize(embeddings, p=2, dim=1)  # (N, D)

    # Cosine similarity: (N, D) x (D, V) = (N, V)
    similarities = torch.matmul(emb_norm, embedding_matrix_norm.T)  # (N, V)

    # Best match per token: max over the vocabulary axis yields values and indices together
    closest_similarities, closest_indices = similarities.max(dim=1)  # both (N,)

    # Convert closest indices to tokens
    closest_tokens = tokenizer.convert_ids_to_tokens(closest_indices.cpu().tolist())

    # Append results
    all_sentence_ids.extend([sent_id] * len(tokens))
    all_original_tokens.extend(tokens)
    all_tokens.extend(closest_tokens)
    all_similarities.extend(closest_similarities.cpu().tolist())

# Create DataFrame: one row per (sentence, token) pair
df_results = pd.DataFrame({
    "sentence_id": all_sentence_ids,
    "original_token": all_original_tokens,
    "closest_token": all_tokens,
    "similarity": all_similarities
})


In [None]:
# Preview the first rows of the nearest-token results table.
df_results.head()

In [None]:
# Persist the results; the filename records the epsilon used for this run.
output_path = f"data/closest_tokens_distilbert_epsilon{epsilon}.csv"
df_results.to_csv(output_path, index=False)