In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Define paths
base_dir = "./groupedSpeeches/speeches_097"  # Update this path if needed
model_path = "./local_llama3b"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)
model.eval()

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Ensure pad token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
import os

def calculate_perplexity(data_folder, file_list, prompt=None):
    """
    Calculate the perplexity of text files with a given prompt.

    Args:
        data_folder (str): Path to the folder containing the text files.
        file_list (list): List of file names to calculate perplexity for.
        prompt (str): Optional prompt to prepend to the text.

    Returns:
        dict: A dictionary with file names as keys and their perplexity as values.
    """
    perplexity_results = {}

    for file_name in file_list:
        file_path = os.path.join(data_folder, file_name)
        
        # Read the content of the file
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        
        # Prepend the prompt if provided
        if prompt:
            text = prompt + " " + text
        
        # Tokenize the text
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
        
        # Get model outputs
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
        
        # Calculate perplexity
        loss = outputs.loss.item()
        perplexity = torch.exp(torch.tensor(loss)).item()
        
        # Store the result
        perplexity_results[file_name] = perplexity

    return perplexity_results

# Example usage
data_folder = 'groupedSpeeches/speeches_097'
file_list = [
    'chunk_26023.txt', 'chunk_51871.txt', 'chunk_14249.txt', 'chunk_9103.txt',
    'chunk_49389.txt', 'chunk_9528.txt', 'chunk_57457.txt(1)', 'chunk_6830.txt(v)',
    'chunk_9335.txt', 'chunk_29789.txt'
]
prompt = "write a speech for a senator in the 97th congress"

# Calculate perplexity with the prompt
perplexity_with_prompt = calculate_perplexity(data_folder, file_list, prompt=prompt)

# Calculate perplexity without the prompt
perplexity_without_prompt = calculate_perplexity(data_folder, file_list, prompt=None)

print("Perplexity with prompt:", perplexity_with_prompt)
print("Perplexity without prompt:", perplexity_without_prompt)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
