<a href="https://colab.research.google.com/github/GianlucaRapaglia/LLM-training/blob/main/01%20-%20LLM%20validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🤖 Evaluate a Fine-Tuned DistilGPT-2 Model on WikiText-2

This notebook demonstrates how to evaluate a **fine-tuned DistilGPT-2 model** on the held-out test split of the dataset it was fine-tuned on: `wikitext/wikitext-2-raw-v1`.

We will:
- Load the model and tokenizer from the Hugging Face Hub
- Load and preprocess the test set of WikiText-2
- Evaluate the model using **perplexity**, a standard metric in language modeling


In [None]:
from huggingface_hub import notebook_login

# Log in to your Hugging Face account so this session is authenticated with the Hub
notebook_login()

In [None]:
!pip install -U transformers datasets evaluate fsspec

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hub repository holding the weights fine-tuned on WikiText-2
model_name = "lucarapaglia/distilgpt2-finetuned-wikitext2"

# The tokenizer is taken from the stock distilgpt2 checkpoint,
# not from the fine-tuned repository
tokenizer_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Fetch the fine-tuned causal language model itself
model = AutoModelForCausalLM.from_pretrained(model_name)


Let's load the test split of the dataset:

In [None]:
from datasets import load_dataset

# Fetch only the test split of the WikiText-2 (raw) corpus the model was trained on
dataset = load_dataset(
    path="wikitext",
    name="wikitext-2-raw-v1",
    split="test",
)

# Show the first three records as a quick sanity check
dataset[:3]


In [None]:
def tokenize(example):
    """Run the notebook-level ``tokenizer`` over the ``"text"`` field of ``example``.

    The special-tokens mask is requested alongside the regular tokenizer
    output, and the tokenizer's result is returned unchanged.
    """
    texts = example["text"]
    encoded = tokenizer(texts, return_special_tokens_mask=True)
    return encoded

# Tokenize in batches and drop the raw "text" column from the result
tokenized_dataset = dataset.map(
    function=tokenize,
    batched=True,
    remove_columns=["text"],
)


In [None]:
from itertools import chain

# Language models need contiguous sequences (not single sentences)
block_size = 128

def group_texts(examples):
    """Concatenate every column of a tokenized batch and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name to a list of per-example lists
            (one inner list per example in the batch).

    Returns:
        A dict with the same keys as ``examples``. Each value is a list of
        chunks of exactly ``block_size`` items taken from the concatenated
        column. The number of chunks is derived from the ``"input_ids"``
        column; trailing items that do not fill a whole block are dropped.
    """
    # chain.from_iterable flattens in linear time, whereas sum(lists, [])
    # re-copies the growing accumulator on every addition (quadratic).
    concatenated = {k: list(chain.from_iterable(examples[k])) for k in examples}

    # Round the usable length down to a whole number of blocks.
    total_length = len(concatenated["input_ids"])
    total_length = (total_length // block_size) * block_size

    return {
        k: [concatenated[k][i:i + block_size] for i in range(0, total_length, block_size)]
        for k in concatenated
    }

# Regroup the tokenized examples, batch by batch, into fixed-length blocks
lm_dataset = tokenized_dataset.map(function=group_texts, batched=True)


In [None]:
import torch
#from evaluate import load
#
## Load the evaluation metric
#perplexity = load("perplexity")
#
## Compute perplexity using the raw test data (not tokenized)
#texts = [t for t in dataset["text"] if t.strip() != ""]
#results = perplexity.compute(
#    model_id=model_name,
#    add_start_token=True,
#    predictions=texts,
#)
#
#print(f"📊 Perplexity of the fine-tuned model: {results['perplexity']:.2f}")

# Run on the GPU when CUDA is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Put the already-loaded model into evaluation mode and move it to that device
model.eval()
model.to(device)

def compute_perplexity_on_dataset(dataset, batch_size=16):
    """Compute the perplexity of the notebook-level ``model`` over a tokenized dataset.

    Perplexity is ``exp`` of the mean cross-entropy per scored token. Each
    batch's mean loss is weighted by the number of tokens it actually scored,
    so the result stays a true per-token average even when batches contain a
    different number of targets (short last batch, padded rows).

    Args:
        dataset: Sized, sliceable collection. Slicing it must return a dict
            with an ``"input_ids"`` entry (a rectangular list of token-id
            lists) and, optionally, a matching ``"attention_mask"``.
        batch_size: Number of sequences sent through the model per forward pass.

    Returns:
        The perplexity as a Python float.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if the dataset
            contains no token that can be scored (e.g. it is empty).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    total_nll = 0.0    # running sum of per-token negative log-likelihoods
    total_targets = 0  # number of tokens that contributed to that sum

    for start in range(0, len(dataset), batch_size):
        batch = dataset[start : start + batch_size]

        # Convert batch to tensors and move to device
        input_ids = torch.tensor(batch["input_ids"]).to(device)
        if "attention_mask" in batch:
            attention_mask = torch.tensor(batch["attention_mask"]).to(device)
        else:
            # No mask supplied: every position is a real token
            attention_mask = torch.ones_like(input_ids)

        # Labels are the input ids for causal LM; positions masked out by the
        # attention mask get -100 so they are excluded from the loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        # Hugging Face causal-LM heads shift the labels internally, so
        # column 0 is never a prediction target.
        n_targets = int((labels[:, 1:] != -100).sum())
        if n_targets == 0:
            # Nothing to score in this batch (its mean loss would be undefined)
            continue

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # outputs.loss is the mean over the scored tokens of this batch;
        # multiply back by their count to accumulate a sum.
        total_nll += outputs.loss.item() * n_targets
        total_targets += n_targets

    if total_targets == 0:
        raise ValueError("dataset contains no tokens to score; cannot compute perplexity")

    # Average loss over all scored tokens
    avg_loss = total_nll / total_targets
    perplexity = torch.exp(torch.tensor(avg_loss))
    return perplexity.item()

# Score the grouped test blocks and report the perplexity to two decimals
ppl = compute_perplexity_on_dataset(dataset=lm_dataset)
print(f"Perplexity on tokenized grouped dataset: {ppl:.2f}")
