# [ROUGE-N (Recall-Oriented Understudy for Gisting Evaluation)](https://docs.kolena.io/metrics/rouge-n/)

- Complimentary to BLEU, ROUGE-N can be thought of as an analog to recall for text comparisons.



In [1]:
import math
from collections import Counter

import nltk

nltk.download("punkt")
from nltk.util import ngrams

[nltk_data] Downloading package punkt to /Users/yifanwu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
references = [
    "A fast brown dog jumps over a sleeping fox",
    "A quick brown dog jumps over the fox",
]
candidate = "The quick brown fox jumps over the lazy dog"

In [3]:
def rouge_n_similarity(candidate: str, references: list[str], n: int):
    candidate = nltk.word_tokenize(candidate)
    references = [nltk.word_tokenize(ref) for ref in references]

    candidate = ngrams(candidate, n)
    can_counter = Counter(candidate)
    ref_counters = []
    overlap = 0

    for ref in references:
        ref = ngrams(ref, n)
        ref_counter = Counter(ref)
        for token in can_counter:
            token_count_can = can_counter.get(token, 0)
            token_count_ref = ref_counter.get(token, 0)
            overlap += min(token_count_can, token_count_ref)
        ref_counters.append(ref_counter)

    precision = overlap / sum(can_counter.values())
    recall = overlap / sum([sum(ref_counter.values()) for ref_counter in ref_counters])

    print(f"{precision=}, {recall=}")
    rouge_n_score = (
        2 * (precision * recall) / (precision + recall)
        if precision + recall != 0
        else 0
    )

    return rouge_n_score

In [4]:
rouge_n_similarity(candidate, references, 1)

precision=1.3333333333333333, recall=0.7058823529411765


0.9230769230769231

In [5]:
rouge_n_similarity(candidate, references, 2)

precision=0.5, recall=0.26666666666666666


0.3478260869565218