## Calculate the chosen–rejected embedding distance for the UltraFeedback dataset

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from datasets import load_dataset, Dataset
import random
import numpy as np
from tqdm import tqdm
import os

# Seed every RNG used in this notebook (Python, NumPy, PyTorch) with the same value
seed = 3
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)

# Expose GPUs 0-7 to this process
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"

# Prefer CUDA when PyTorch can see a GPU; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Cosine distance between one chosen embedding and one rejected embedding
def cosDistance(chosen_embedding, rejected_embedding):
    """Return ``1 - matmul(chosen_embedding, rejected_embedding)``.

    The product is a plain inner product, so the result is a true cosine
    distance only when both inputs are already L2-normalised (the embedding
    functions in this file normalise before storing their output).

    Args:
        chosen_embedding: Embedding tensor of the chosen response.
        rejected_embedding: Embedding tensor of the rejected response.

    Returns:
        Tensor holding one minus the ``torch.matmul`` product of the inputs.
    """
    return 1.0 - torch.matmul(chosen_embedding, rejected_embedding)

# Attention-masked mean over the token axis, giving one vector per sequence
def mean_pooling(model_output, attention_mask):
    """Average token embeddings, counting only positions where the mask is set.

    Args:
        model_output: Indexable model output; element ``0`` is used as the
            per-token embedding tensor.
        attention_mask: Mask tensor; it is given a trailing axis and expanded
            to the shape of the token embeddings before use.

    Returns:
        Sum of the masked token embeddings along dimension 1, divided by the
        mask sum along dimension 1 (clamped to at least ``1e-9`` so an all-zero
        mask does not divide by zero).
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = torch.sum(hidden_states * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / token_counts

# Process dialog messages to convert into a text format
def process_dialog(dialog):
    """Flatten the assistant turns of a preference pair into two text fields.

    Args:
        dialog: Mapping with ``'chosen'`` and ``'rejected'`` entries, each an
            iterable of message dicts that carry ``'role'`` and ``'content'``.

    Returns:
        Dict with ``"chosen_features"`` and ``"rejected_features"``. Each value
        is the concatenation of ``"### assistant: <content>\\n"`` for every
        assistant message of the corresponding conversation, or ``""`` when
        that conversation has no assistant message.
    """
    def _assistant_text(messages):
        # Only assistant turns are kept; other roles are skipped.
        return "".join(
            f"### {message['role']}: {message['content']}\n"
            for message in messages
            if message['role'] == 'assistant'
        )

    # Each side is built from its own conversation so the rejected response
    # ends up in "rejected_features" rather than appended to the chosen text.
    return {
        "chosen_features": _assistant_text(dialog['chosen']),
        "rejected_features": _assistant_text(dialog['rejected']),
    }

# Embed the chosen-response text of a batch
def embed_chosen_text(batch):
    """Add L2-normalised sentence embeddings of ``batch['chosen_features']``.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The texts
    are tokenised with padding and truncation, run through the model without
    gradient tracking, mean-pooled with the attention mask and normalised.

    Args:
        batch: Batch mapping that contains a ``'chosen_features'`` entry.

    Returns:
        The same ``batch`` object with ``'chosen_embeddings'`` set to the
        embeddings as nested Python lists.
    """
    tokens = tokenizer(
        batch['chosen_features'],
        padding=True,
        truncation=True,
        return_tensors="pt",
    ).to(device)

    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        outputs = model(**tokens)

    pooled = mean_pooling(outputs, tokens['attention_mask'])
    unit_vectors = F.normalize(pooled, p=2, dim=1)

    # Move to CPU and convert to plain lists before storing in the batch.
    batch['chosen_embeddings'] = unit_vectors.cpu().numpy().tolist()
    return batch

def embed_rejected_text(batch):
    """Add L2-normalised sentence embeddings of ``batch['rejected_features']``.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The texts
    are tokenised with padding and truncation, run through the model without
    gradient tracking, mean-pooled with the attention mask and normalised.

    Args:
        batch: Batch mapping that contains a ``'rejected_features'`` entry.

    Returns:
        The same ``batch`` object with ``'rejected_embeddings'`` set to the
        embeddings as nested Python lists.
    """
    tokens = tokenizer(
        batch['rejected_features'],
        padding=True,
        truncation=True,
        return_tensors="pt",
    ).to(device)

    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        outputs = model(**tokens)

    pooled = mean_pooling(outputs, tokens['attention_mask'])
    unit_vectors = F.normalize(pooled, p=2, dim=1)

    # Move to CPU and convert to plain lists before storing in the batch.
    batch['rejected_embeddings'] = unit_vectors.cpu().numpy().tolist()
    return batch

##############################################################################################################################

# Set dataset parameters
# NOTE(review): k_near and dataset_size are not referenced anywhere in the
# visible cells — presumably leftovers from a KNN experiment; confirm before removing.
k_near = 10
dataset_size = 1000


# Output file and embedding model. The commented-out pair is the alternative
# configuration (BGE model with its own parquet file).
# embedding_data_name = 'ultrafeedback_embeddings.parquet'
embedding_data_name = 'ultrafeedback_embeddings_llama8b.parquet'
# embedding_model_name = "BAAI/bge-large-en-v1.5"
embedding_model_name = 'princeton-nlp/Llama-3-Base-8B-SFT'

# The parquet file acts as a cache: embeddings are only computed when it does not exist yet.
if not os.path.exists(embedding_data_name):

    dataset = load_dataset('json', data_files="ultrafeedback_with_learning_order.json")

    # Adds the 'chosen_features' / 'rejected_features' text columns, one example at a time.
    dataset['train'] = dataset['train'].map(process_dialog, batched=False)


    # tokenizer and model are module-level names read by embed_chosen_text / embed_rejected_text.
    tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
    model = AutoModel.from_pretrained(embedding_model_name).to(device)
    # NOTE(review): with batch_size=1 below, each forward pass carries a single
    # example, so DataParallel likely has nothing to split across GPUs — confirm.
    model = torch.nn.DataParallel(model)

    # Two separate passes over the split: first the chosen texts, then the rejected texts.
    dataset['train'] = dataset['train'].map(embed_chosen_text, batched=True, batch_size=1)
    dataset['train'] = dataset['train'].map(embed_rejected_text, batched=True, batch_size=1)

    # Persist the split, including both embedding columns, so later runs skip this branch.
    dataset['train'].to_parquet(embedding_data_name)
    print(f"Embeddings saved to {embedding_data_name}")




  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.70it/s]
Map:   2%|▏         | 1255/61135 [06:29<5:09:51,  3.22 examples/s] 


KeyboardInterrupt: 

In [2]:
# Load the stored embeddings and move both embedding tensors onto the device.
# NOTE(review): this reads 'ultrafeedback_embeddings.parquet', not the file named by
# `embedding_data_name` in the embedding cell — confirm which embeddings are intended.
embedding_dataset = load_dataset('parquet', data_files='ultrafeedback_embeddings.parquet')['train']
rejected_embeddings = torch.tensor(embedding_dataset['rejected_embeddings']).to(device)
chosen_embeddings = torch.tensor(embedding_dataset['chosen_embeddings']).to(device)

# One distance per (chosen, rejected) row pair, stored as plain Python floats.
embedding_pairs = tqdm(
    zip(chosen_embeddings, rejected_embeddings),
    desc="Processing high-quality samples:",
)
distances = [
    cosDistance(chosen_row, rejected_row).item()
    for chosen_row, rejected_row in embedding_pairs
]
    

Processing high-quality samples:: 61135it [00:01, 45069.93it/s]


In [9]:
# Reload the source JSON and keep only its 'train' split.
dataset = load_dataset('json', data_files="ultrafeedback_with_learning_order.json")
dataset = dataset['train']


# One-off write-back of the computed distances, kept disabled.
# NOTE(review): the captured output below already lists an 'embedding_distance'
# column, so these lines appear to have been run once — confirm before re-enabling.
# dataset = dataset.add_column("embedding_distance", distances)
# dataset.to_json("ultrafeedback_with_learning_order.json")

Generating train split: 61135 examples [00:01, 56113.76 examples/s]


Dataset({
    features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected', 'score_diff', 'gemma_learning_order', 'llama_learning_order', 'mistral_learning_order', 'qwen-2.5-7b_learning_order', 'embedding_distance'],
    num_rows: 61135
})

Creating json from Arrow format: 100%|██████████| 62/62 [00:02<00:00, 20.76ba/s]


445746213