# Evaluate an automatic summarization model using the ROUGE metric

This notebook computes the ROUGE scores (ROUGE-1, ROUGE-2, ROUGE-L) of a summarization model and saves them to a dedicated text file.
Set the model you want to evaluate in the first code cell (`model_folder`).

For more information on the ROUGE metric, see the [Hugging Face documentation](https://huggingface.co/spaces/evaluate-metric/rouge).


## Notes
- [Comparison between BLEU and ROUGE](https://www.geeksforgeeks.org/nlp/understanding-bleu-and-rouge-score-for-nlp-evaluation/) (ROUGE is better suited for summaries)
- [Guidance](https://pub.aimind.so/unveiling-the-power-of-rouge-metrics-in-nlp-b6d3f96d3363) on what counts as a low/moderate/good ROUGE score

In [3]:
# Load the checkpoint to evaluate together with its tokenizer.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Folder of the model under evaluation; edit this to score a different checkpoint.
# The evaluation cell also derives the output file name from this value.
model_folder = "./t5_small_finetuned"

# Both objects are read from the same folder; the two loads are independent.
tokenizer = AutoTokenizer.from_pretrained(model_folder)
model = AutoModelForSeq2SeqLM.from_pretrained(model_folder)

In [4]:
# evaluate model
# Generates a summary for a fixed sample of test articles, scores the summaries
# against the dataset's reference highlights with ROUGE, prints the scores and
# writes them to "rouge_scores_<model name>.txt".
# Relies on `model`, `tokenizer` and `model_folder` from the model-loading cell.
from pathlib import Path

from datasets import load_from_disk
import evaluate
from tqdm import tqdm

# load dataset
dataset = load_from_disk("cnn_dailymail_full")
# shuffle with a fixed seed and keep the first 1000 examples so the evaluated
# sample is reproducible across runs
test_dataset = dataset["test"].shuffle(seed=42).select(range(1000))

# load rouge metric
rouge = evaluate.load("rouge")

# generate summaries (one article per generate() call)
predictions = []
references = []
for example in tqdm(test_dataset):
    # prepend the "summarize: " task prefix; inputs are truncated to 512 tokens
    input_text = "summarize: " + example["article"]
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)

    # beam search with 4 beams; generated summaries are capped at 128 tokens
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=128,
        num_beams=4,
        length_penalty=2.0,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )

    # decode the first returned sequence, dropping special tokens
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    predictions.append(summary)
    # the "highlights" field is used as the reference summary
    references.append(example["highlights"])

# calculate and print rouge scores
results = rouge.compute(predictions=predictions, references=references)
print("ROUGE-1:", results["rouge1"], "(overlap of individual words (unigrams))")
print("ROUGE-2:", results["rouge2"], "(overlap of two-word phrases (bigrams))")
print("ROUGE-L:", results["rougeL"], "(longest common subsequence (LCS))")

# save rouge scores
# Path(...).name is the last path component and ignores a trailing slash, so
# "./my_model/" yields "my_model" (splitting on "/" would give an empty name
# and write to "rouge_scores_.txt").
model_name = Path(model_folder).name
with open(f"rouge_scores_{model_name}.txt", "w", encoding="utf-8") as f:
    f.write(f"ROUGE scores for model: {model_name}\n")
    f.write(f"ROUGE-1: {results['rouge1']}\n")
    f.write(f"ROUGE-2: {results['rouge2']}\n")
    f.write(f"ROUGE-L: {results['rougeL']}\n")

100%|██████████| 1000/1000 [1:05:27<00:00,  3.93s/it]


ROUGE-1: 0.3862081747462096 (overlap of individual words (unigrams))
ROUGE-2: 0.1678682721324621 (overlap of two-word phrases (bigrams))
ROUGE-L: 0.25375539221607096 (longest common subsequence (LCS))
