In [35]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
import torch

In [36]:
def load_model_and_tokenizer(model_name):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device).eval()

    # Ensure the tokenizer has a padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # Use EOS as padding token

    return model, tokenizer


In [46]:
import torch
import time

def benchmark_latency(model, tokenizer, batch_size=8, fp16=False, max_new_tokens=50, runs=4):
    """Benchmark inference speed with batched inputs."""

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Ensure tokenizer has a padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as padding

    # Convert model to FP16 if specified
    if fp16:
        model.half()

    # Prepare batched dummy input (same text repeated for batch size)
    text = "Hugging Face models are great because"
    batch = [text] * batch_size  # Create a batch of the same text
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)

    # Benchmark inference time
    times = []
    with torch.no_grad():
        for _ in range(runs):
            start_time = time.perf_counter()
            output = model.generate(**inputs, max_new_tokens=max_new_tokens)
            end_time = time.perf_counter()
            times.append(end_time - start_time)

    generated_texts = tokenizer.batch_decode(output, skip_special_tokens=True)

    print(f"Model: {model.name_or_path} | Batch Size: {batch_size} | FP16: {fp16} | "
          f"First Inference Time: {times[0]:.4f}s | Average Inference Time: {sum(times[1:])/len(times[1:]):.4f}s")
    # print(f"Generated Texts: {generated_texts}\n")


In [47]:
# Test with different models
model, tokenizer = load_model_and_tokenizer("gpt2")
benchmark_latency(model, tokenizer, fp16=False)  # FP32 baseline
# benchmark_latency(model, tokenizer, fp16=True)  # FP16 optimization

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model: gpt2 | Batch Size: 8 | FP16: False | First Inference Time: 1.4246s | Average Inference Time: 1.4147s


### Eval

In [57]:
from datasets import load_dataset

# Load Wikitext-2 dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
texts = dataset["text"]  # Extract text samples

In [53]:
import math
from torch.utils.data import DataLoader

def calculate_perplexity(model, tokenizer, texts, batch_size=8, max_length=512):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    total_loss = 0
    total_tokens = 0
    dataloader = DataLoader(texts, batch_size=batch_size)

    with torch.no_grad():
        for batch in dataloader:
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            total_loss += loss.item() * inputs["input_ids"].numel()
            total_tokens += inputs["input_ids"].numel()

    perplexity = math.exp(total_loss / total_tokens)
    return perplexity


In [54]:
# Run the evaluation
perplexity = calculate_perplexity(model, tokenizer, texts[:100])  # Sample 100 texts
print(f"GPT-2 Perplexity: {perplexity:.2f}")

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


GPT-2 Perplexity: 3635.74


In [59]:
# Load the MMLU dataset
dataset = load_dataset("hendrycks_test", "college_computer_science")
print(dataset[0])  # Print an example

TypeError: Expected str 'name', but got: NoneType

In [51]:
import torch
from tqdm import tqdm
from datasets import load_dataset

def score_batch(model, tokenizer, questions, choices_list, format_prompt):
    """
    Computes log-probabilities of each answer choice for a batch of questions.
    """
    prompts = [format_prompt(question, choices) for question, choices in zip(questions, choices_list)]

    # Tokenize prompts once
    prompt_inputs = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt").to("cuda")
    prompt_len = prompt_inputs.input_ids.shape[1]

    scores = []
    for i in range(len(choices_list[0])):  # Iterate over answer choices
        full_inputs = tokenizer([p + choices[i] for p, choices in zip(prompts, choices_list)], 
                                padding=True, truncation=True, return_tensors="pt").to("cuda")

        with torch.no_grad():
            outputs = model(**full_inputs)

        # Get log-probabilities for each choice
        logits = outputs.logits[:, prompt_len - 1 : -1, :]
        token_ids = full_inputs.input_ids[:, prompt_len:]

        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
        choice_log_probs = log_probs.gather(dim=-1, index=token_ids.unsqueeze(-1)).squeeze(-1)
        scores.append(choice_log_probs.sum(dim=-1))  # Sum log probabilities per choice

    return torch.stack(scores, dim=1).argmax(dim=1).tolist()  # Get index of max log-prob for each question


def evaluate_multiple_choice_dataset(
    model,
    tokenizer,
    dataset_name="hendrycks_test",
    subject="all",
    split="test",
    num_samples=500,
    batch_size=16,
    question_key="question",
    choices_key="choices",
    answer_key="answer",
    format_prompt=None
):
    """
    Evaluates a preloaded Hugging Face model on a multiple-choice dataset.

    Parameters:
    - model (torch.nn.Module): Preloaded Hugging Face model.
    - tokenizer (AutoTokenizer): Preloaded tokenizer.
    - dataset_name (str): Hugging Face dataset name.
    - subject (str): Subject within the dataset (if applicable).
    - split (str): Dataset split to evaluate (e.g., 'test', 'validation').
    - num_samples (int): Number of questions to evaluate.
    - batch_size (int): Number of questions to process in parallel.
    - question_key (str): Key for accessing the question text.
    - choices_key (str): Key for accessing answer choices.
    - answer_key (str): Key for accessing the correct answer.
    - format_prompt (function): A function that formats the prompt based on dataset requirements.

    Returns:
    - accuracy (float): Accuracy of the model on the dataset.
    """

    # Load dataset
    dataset = load_dataset(dataset_name, subject, split=split).select(range(num_samples))

    # Ensure format_prompt is defined
    if format_prompt is None:
        raise ValueError("A format_prompt function must be provided to specify how prompts should be structured.")

    # Evaluate dataset in batches
    correct = 0
    for i in tqdm(range(0, num_samples, batch_size), desc=f"Evaluating {dataset_name}"):
        batch = dataset.select(range(i, min(i + batch_size, num_samples)))
        batch_questions = [sample[question_key] for sample in batch]
        batch_choices = [sample[choices_key] for sample in batch]
        batch_answers = [sample[answer_key] for sample in batch]

        predicted = score_batch(model, tokenizer, batch_questions, batch_choices, format_prompt)
        correct += sum([1 for p, a in zip(predicted, batch_answers) if p == a])

    # Final Accuracy Score
    final_accuracy = correct / num_samples
    print(f"\nFinal Accuracy on '{dataset_name} ({split})': {final_accuracy:.2%}")

    return final_accuracy


In [56]:
def format_prompt_mmlu(question, choices):
    return f"{question}\n" + "\n".join([f"{i}. {choice}" for i, choice in enumerate(choices)]) + "\nAnswer: "

evaluate_multiple_choice_dataset(
    model, tokenizer,
    dataset_name="hendrycks_test",
    subject="college_computer_science",
    split="test",
    num_samples=500,
    format_prompt=format_prompt_mmlu
)

TypeError: Expected str 'name', but got: NoneType