# Evaluating Text Generation with Perplexity, BLEU, ROUGE, METEOR, and BERTScore
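We compare a simple prediction (`"the cat is on mat"`) against a reference (`"the cat is on the mat"`) to illustrate how each reference-based metric (BLEU, ROUGE, METEOR, BERTScore) behaves with minor lexical differences. Perplexity is the exception: it needs no prediction/reference pair and is computed here with GPT-2 on the reference sentence alone.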




In [1]:
import evaluate
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import math

In [3]:
# Example data
# One reference sentence and one prediction that omits the second "the".
references = ["the cat is on the mat"]
predictions = ["the cat is on mat"]

# ---------- 1. Perplexity ----------
# The same checkpoint name is used to load both the causal language model and
# its tokenizer; compute_perplexity reads `model` and `tokenizer` as
# module-level names.
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def compute_perplexity(texts):
    """Return the mean per-text perplexity of ``texts``.

    Each text is tokenized on its own with the module-level ``tokenizer`` and
    passed to the module-level ``model`` with its own input ids as labels.
    The perplexity of one text is ``exp(loss)`` of the loss the model returns;
    the result is the arithmetic mean of those per-text values.

    Args:
        texts: Iterable of strings to score.

    Returns:
        float: Mean of the per-text perplexities.

    Raises:
        ValueError: If ``texts`` is empty (there is nothing to average).
    """
    texts = list(texts)
    if not texts:
        raise ValueError("compute_perplexity() needs at least one text")

    ppl_list = []
    # Scoring never needs gradients, so autograd is disabled once for the
    # whole loop instead of once per text.
    with torch.no_grad():
        for text in texts:
            encodings = tokenizer(text, return_tensors="pt")
            outputs = model(**encodings, labels=encodings["input_ids"])
            ppl_list.append(math.exp(outputs.loss.item()))
    return sum(ppl_list) / len(ppl_list)

# Score the reference sentence(s) — not the predictions — and print the mean
# perplexity rounded to two decimals.
ppl = compute_perplexity(references)
print(f"Perplexity: {ppl:.2f}")


Perplexity: 121.58


In [4]:
# ---------- 2. BLEU ----------
bleu = evaluate.load("bleu")
bleu_result = bleu.compute(
    predictions=predictions,
    # Each reference is wrapped in its own single-item list, giving one
    # reference list per prediction.
    references=[[ref] for ref in references],
)
print("BLEU:", bleu_result)



BLEU: {'bleu': 0.5789300674674098, 'precisions': [1.0, 0.75, 0.6666666666666666, 0.5], 'brevity_penalty': 0.8187307530779819, 'length_ratio': 0.8333333333333334, 'translation_length': 5, 'reference_length': 6}


In [5]:
# ---------- 3. ROUGE ----------
# Load the ROUGE metric, score the predictions against the flat list of
# references, and print the resulting dict.
rouge = evaluate.load("rouge")
rouge_result = rouge.compute(
    references=references,
    predictions=predictions,
)
print("ROUGE:", rouge_result)


ROUGE: {'rouge1': np.float64(0.9090909090909091), 'rouge2': np.float64(0.6666666666666665), 'rougeL': np.float64(0.9090909090909091), 'rougeLsum': np.float64(0.9090909090909091)}


In [6]:
# ---------- 4. METEOR ----------
# Load the METEOR metric, score the predictions against the flat list of
# references, and print the resulting dict.
meteor = evaluate.load("meteor")
meteor_result = meteor.compute(
    references=references,
    predictions=predictions,
)
print("METEOR:", meteor_result)


METEOR: {'meteor': np.float64(0.7559322033898305)}


[nltk_data] Downloading package wordnet to /home/lily/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/lily/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/lily/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [7]:
# ---------- 5. BERTScore ----------
# Load the BERTScore metric and score the predictions against the references,
# passing lang="en".
bertscore = evaluate.load("bertscore")
bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")
# The label has no placeholders, so it is a plain string rather than an f-string.
print("BERTScore:", bertscore_result)
# 'f1' is a list of scores; report its arithmetic mean to three decimals.
print(f"BERTScore F1 mean: {sum(bertscore_result['f1'])/len(bertscore_result['f1']):.3f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore: {'precision': [0.9516574740409851], 'recall': [0.9524093866348267], 'f1': [0.9520333409309387], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.53.2)'}
BERTScore F1 mean: 0.952
