add Rouge score #399

Merged

Commits (26)
3a23b94
Adding new metric ROUGE Metric for text
karthikrangasai Jul 23, 2021
a398b6f
Added tests for the ROUGE metric
karthikrangasai Jul 23, 2021
bca2f12
Updated docs and imports, added types
karthikrangasai Jul 23, 2021
586e82a
Apply suggestions from code review
Borda Jul 23, 2021
5169561
Applied changes suggested in code review
karthikrangasai Jul 23, 2021
2dfcd9f
Merge branch 'master' into feature/51_add_rouge_score
karthikrangasai Jul 25, 2021
7e9fed1
Updated text dependencies and CHANGELOG
karthikrangasai Jul 25, 2021
8027712
Fix typing issues
karthikrangasai Jul 25, 2021
067c7f0
Updated docs dependencies
karthikrangasai Jul 25, 2021
cba098a
pkg
Borda Jul 26, 2021
506cfc7
pkg
Borda Jul 26, 2021
b0f28bd
set jiwer
Borda Jul 26, 2021
5b5872b
Merge branch 'master' into feature/51_add_rouge_score
karthikrangasai Jul 26, 2021
d5782fe
Merge branch 'master' into feature/51_add_rouge_score
karthikrangasai Jul 27, 2021
49f40dd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 27, 2021
a0ef6e7
Simplified the implementation for batches and added more tests.
karthikrangasai Jul 27, 2021
e9ecc62
Updated docs requirements and removed unused imports.
karthikrangasai Jul 27, 2021
8fc553a
Merge branch 'master' into feature/51_add_rouge_score
karthikrangasai Jul 28, 2021
8d2e102
Merge branch 'master' into feature/51_add_rouge_score
Borda Jul 28, 2021
36e3a7d
Merge branch 'master' into feature/51_add_rouge_score
karthikrangasai Jul 28, 2021
af2c103
Fix typing, rigorously check rouge_keys, add tests for rouge_keys err…
karthikrangasai Jul 28, 2021
34f8394
Remove unused imports
karthikrangasai Jul 28, 2021
5fd1160
Apply suggestions from code review
Borda Jul 28, 2021
1439497
Fixed typing and added docstrings for update and compute method
karthikrangasai Jul 29, 2021
71d17f4
Merge branch 'master' into feature/51_add_rouge_score
karthikrangasai Jul 29, 2021
218418f
Changes based on review
karthikrangasai Jul 29, 2021
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Added Symmetric Mean Absolute Percentage error (SMAPE) ([#375](https://github.com/PyTorchLightning/metrics/issues/375))

- Added ROUGE Metric ([#399](https://github.com/PyTorchLightning/metrics/issues/399))

- Allowed passing labels in (n_samples, n_classes) to `AveragePrecision` ([#386](https://github.com/PyTorchLightning/metrics/issues/386))

4 changes: 4 additions & 0 deletions docs/source/references/functional.rst
@@ -355,6 +355,10 @@ bleu_score [func]
.. autofunction:: torchmetrics.functional.bleu_score
:noindex:

rouge_score [func]
~~~~~~~~~~~~~~~~~~

.. autofunction:: torchmetrics.functional.rouge_score
    :noindex:

wer [func]
~~~~~~~~~~
6 changes: 6 additions & 0 deletions docs/source/references/modules.rst
@@ -517,6 +517,12 @@ BLEUScore
.. autoclass:: torchmetrics.BLEUScore
:noindex:

ROUGEScore
~~~~~~~~~~

.. autoclass:: torchmetrics.ROUGEScore
:noindex:


WER
~~~
3 changes: 3 additions & 0 deletions requirements/docs.txt
@@ -13,3 +13,6 @@ sphinx-copybutton>=0.3

# integrations
pytorch-lightning>=1.1

# add extra requirements
-r text.txt
1 change: 0 additions & 1 deletion requirements/test.txt
@@ -15,7 +15,6 @@ phmdoctest>=1.1.1
cloudpickle>=1.3
scikit-learn>=0.24
scikit-image>0.17.1
nltk>=3.6

# add extra requirements
-r image.txt
4 changes: 3 additions & 1 deletion requirements/text.txt
@@ -1 +1,3 @@
jiwer==2.2.0
jiwer>=2.2.0
nltk>=3.6
rouge-score>=0.0.4
51 changes: 51 additions & 0 deletions tests/text/test_rouge.py
@@ -0,0 +1,51 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
import torch
from torch import tensor

from torchmetrics.functional.text.rouge import rouge_score
from torchmetrics.text.rouge import ROUGEScore
from torchmetrics.utilities.imports import _NLTK_AVAILABLE, _ROUGE_SCORE_AVAILABLE

PREDS = "My name is John".split()
TARGET = "Is your name John".split()


@pytest.mark.skipif(not (_NLTK_AVAILABLE and _ROUGE_SCORE_AVAILABLE), reason='test requires nltk and rouge-score')
@pytest.mark.parametrize("rouge_metric, expected", [("rouge1_recall", 0.25)])
def test_rouge_metric_functional(rouge_metric, expected):
pl_output = tensor(rouge_score(PREDS, TARGET)[rouge_metric]).float()
assert torch.allclose(pl_output, tensor(expected), 1e-4)


@pytest.mark.skipif(not (_NLTK_AVAILABLE and _ROUGE_SCORE_AVAILABLE), reason='test requires nltk and rouge-score')
@pytest.mark.parametrize("rouge_metric, expected", [("rouge1_recall", 0.25)])
def test_rouge_metric_class(rouge_metric, expected):
rouge = ROUGEScore()
pl_output = tensor(rouge(PREDS, TARGET)[rouge_metric]).float()
assert torch.allclose(pl_output, tensor(expected), 1e-4)


def test_rouge_metric_raises_errors_and_warnings():
    """Test that expected warnings and errors are raised."""
    if not (_NLTK_AVAILABLE and _ROUGE_SCORE_AVAILABLE):
        with pytest.raises(
            ValueError,
            match='ROUGE metric requires that both nltk and rouge-score are installed.'
            ' Either as `pip install torchmetrics\\[text\\]` or `pip install nltk rouge-score`'
        ):
            ROUGEScore()
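
For reference, a minimal sketch of exercising the class-based metric over several batches (this assumes ROUGEScore follows the standard torchmetrics forward/compute accumulation API, consistent with the tests above):

    from torchmetrics.text.rouge import ROUGEScore

    rouge = ROUGEScore()
    # forward() updates the internal state and returns scores for the current inputs
    rouge("My name is John".split(), "Is your name John".split())
    rouge("The cat sat".split(), "A cat sat".split())
    # compute() aggregates over everything seen so far
    print(rouge.compute()["rouge1_fmeasure"])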
2 changes: 1 addition & 1 deletion torchmetrics/__init__.py
@@ -60,5 +60,5 @@
RetrievalPrecision,
RetrievalRecall,
)
from torchmetrics.text import WER, BLEUScore # noqa: E402, F401
from torchmetrics.text import WER, BLEUScore, ROUGEScore # noqa: E402, F401
from torchmetrics.wrappers import BootStrapper # noqa: E402, F401
1 change: 1 addition & 0 deletions torchmetrics/functional/__init__.py
@@ -58,4 +58,5 @@
from torchmetrics.functional.retrieval.reciprocal_rank import retrieval_reciprocal_rank # noqa: F401
from torchmetrics.functional.self_supervised import embedding_similarity # noqa: F401
from torchmetrics.functional.text.bleu import bleu_score # noqa: F401
from torchmetrics.functional.text.rouge import rouge_score # noqa: F401
from torchmetrics.functional.text.wer import wer # noqa: F401
158 changes: 158 additions & 0 deletions torchmetrics/functional/text/rouge.py
@@ -0,0 +1,158 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import Dict, List, Tuple

import numpy as np
from torch import Tensor, tensor

from torchmetrics.utilities.imports import _NLTK_AVAILABLE, _ROUGE_SCORE_AVAILABLE

if _ROUGE_SCORE_AVAILABLE:
from rouge_score.rouge_scorer import RougeScorer
from rouge_score.scoring import AggregateScore, BootstrapAggregator, Score
else:
RougeScorer, AggregateScore, Score, BootstrapAggregator = object, object, object, object


def add_newline_to_end_of_each_sentence(x: str) -> str:
    """This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS."""
    assert _NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. (pip install nltk)"
    import nltk
    nltk.download("punkt", quiet=True, force=False)
    x = re.sub("<n>", "", x)  # remove pegasus newline char
    return "\n".join(nltk.sent_tokenize(x))


def format_rouge_results(result: Dict[str, AggregateScore], decimal_places: int = 4) -> Dict[str, Tensor]:
    """Flatten the aggregated scores into a `{rouge_key}_{statistic}` to scalar-tensor mapping."""
    flattened_result: Dict[str, Tensor] = {}
for rouge_key, rouge_aggregate_score in result.items():
for stat in ["precision", "recall", "fmeasure"]:
mid = rouge_aggregate_score.mid
score = round(getattr(mid, stat), decimal_places)
flattened_result[f"{rouge_key}_{stat}"] = tensor(score)
return flattened_result
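
# Illustrative sketch of the flattening (`Score` and `AggregateScore` are
# namedtuples from `rouge_score.scoring`, so this assumes rouge-score is installed):
#   mid = Score(precision=0.25, recall=0.25, fmeasure=0.25)
#   format_rouge_results({"rouge1": AggregateScore(low=mid, mid=mid, high=mid)})
#   returns {'rouge1_precision': tensor(0.2500),
#            'rouge1_recall': tensor(0.2500),
#            'rouge1_fmeasure': tensor(0.2500)}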


class RougeBatchAggregator(BootstrapAggregator):
"""
Aggregates rouge scores and provides confidence intervals.
"""

def aggregate(self) -> Dict[str, AggregateScore]:
"""
Override function to wrap the final results in `Score` objects.
This is due to the scores being replaced with a list of torch tensors.
"""
result = {}
for score_type, scores in self._scores.items():
# Stack scores into a 2-d matrix of (sample, measure).
score_matrix = np.vstack(tuple(scores))
# Percentiles are returned as (interval, measure).
percentiles = self._bootstrap_resample(score_matrix)
# Extract the three intervals (low, mid, high).
intervals = tuple(Score(*percentiles[j, :]) for j in range(3))
result[score_type] = AggregateScore(low=intervals[0], mid=intervals[1], high=intervals[2])
return result

def add_scores(self, scores: Dict[str, List[Tensor]]) -> None:
self._scores = scores


def _rouge_score_update(
preds: List[str],
targets: List[str],
scores: Dict[str, List[Tensor]],
scorer: RougeScorer,
newline_sep: bool = False,
) -> None:
    """Update the ``scores`` dictionary in-place with a (precision, recall, fmeasure) tensor per sample."""
for pred, target in zip(preds, targets):
# rougeLsum expects "\n" separated sentences within a summary
if newline_sep:
pred = add_newline_to_end_of_each_sentence(pred)
target = add_newline_to_end_of_each_sentence(target)
results = scorer.score(pred, target)
for key, score in results.items():
score = tensor([score.precision, score.recall, score.fmeasure])
scores[key].append(score)


def _rouge_score_compute(scores: Dict[str, List[Tensor]], aggregator: RougeBatchAggregator) -> Dict[str, Tensor]:
aggregator.add_scores(scores)
result = aggregator.aggregate()
return format_rouge_results(result)
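
# Taken together, these two helpers form the update/compute pipeline that
# `rouge_score` wires up below. A minimal sketch (assuming rouge-score is installed):
#   scorer = RougeScorer(("rouge1",), use_stemmer=False)
#   scores = {"rouge1": []}
#   _rouge_score_update(["My name is John"], ["Is your name John"], scores=scores, scorer=scorer)
#   _rouge_score_compute(scores, aggregator=RougeBatchAggregator())
#   returns {'rouge1_precision': tensor(...), 'rouge1_recall': tensor(...), 'rouge1_fmeasure': tensor(...)}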


def rouge_score(
preds: List[str],
targets: List[str],
newline_sep: bool = False,
use_stemmer: bool = False,
    rouge_keys: Tuple[str, ...] = ("rouge1", "rouge2", "rougeL", "rougeLsum"),
) -> Dict[str, Tensor]:
"""
Calculate `ROUGE score <https://en.wikipedia.org/wiki/ROUGE_(metric)>`_, used for automatic summarization.

Args:
preds:
An iterable of predicted sentences.
targets:
An iterable of target sentences.
        newline_sep:
            Insert a newline between sentences in each input; ``rougeLsum`` expects sentences to be newline-separated.
use_stemmer:
Use Porter stemmer to strip word suffixes to improve matching.
rouge_keys:
A list of rouge types to calculate.

Return:
Python dictionary of rouge scores for each input rouge key.

Example:
>>> targets = "Is your name John".split()
>>> preds = "My name is John".split()
>>> from pprint import pprint
>>> pprint(rouge_score(preds, targets)) # doctest: +NORMALIZE_WHITESPACE +SKIP
{'rouge1_fmeasure': 0.25,
'rouge1_precision': 0.25,
'rouge1_recall': 0.25,
'rouge2_fmeasure': 0.0,
'rouge2_precision': 0.0,
'rouge2_recall': 0.0,
'rougeL_fmeasure': 0.25,
'rougeL_precision': 0.25,
'rougeL_recall': 0.25,
'rougeLsum_fmeasure': 0.25,
'rougeLsum_precision': 0.25,
'rougeLsum_recall': 0.25}

References:
[1] ROUGE: A Package for Automatic Evaluation of Summaries by Chin-Yew Lin https://aclanthology.org/W04-1013/
"""

    if not (_NLTK_AVAILABLE and _ROUGE_SCORE_AVAILABLE):
        raise ValueError(
            'ROUGE metric requires that both nltk and rouge-score are installed.'
            ' Either as `pip install torchmetrics[text]` or `pip install nltk rouge-score`'
        )

aggregator = RougeBatchAggregator()
scorer = RougeScorer(rouge_keys, use_stemmer=use_stemmer)
scores: Dict[str, List[Tensor]] = {key: [] for key in rouge_keys}

_rouge_score_update(preds, targets, scores=scores, scorer=scorer, newline_sep=newline_sep)
return _rouge_score_compute(scores, aggregator=aggregator)
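
As a usage note, restricting `rouge_keys` limits the computation to the ROUGE variants you actually need; a short sketch against the functional API added above:

    from torchmetrics.functional import rouge_score

    preds = ["My name is John"]
    targets = ["Is your name John"]
    # compute only ROUGE-1 and ROUGE-L for this pair
    scores = rouge_score(preds, targets, rouge_keys=("rouge1", "rougeL"))
    print(scores["rouge1_fmeasure"], scores["rougeL_fmeasure"])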
1 change: 1 addition & 0 deletions torchmetrics/text/__init__.py
@@ -12,4 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from torchmetrics.text.bleu import BLEUScore # noqa: F401
from torchmetrics.text.rouge import ROUGEScore # noqa: F401
from torchmetrics.text.wer import WER # noqa: F401