add Rouge score (#399)
* Adding new ROUGE metric for text

* Added tests for the ROUGE metric

* Updated docs and imports, added types

* Apply suggestions from code review

* Applied changes suggested in code review

* Updated text dependencies and CHANGELOG

* Fix typing issues

* Updated docs dependencies

* pkg

* pkg

* set jiwer

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Simplified the implementation for batches and added more tests.

* Updated docs requirements and removed unused imports.

* Fix typing, rigorously check rouge_keys, add tests for rouge_keys error, update doc strings

* Remove unused imports

* Apply suggestions from code review

* Fixed typing and added docstrings for update and compute method

* Changes based on review

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
Co-authored-by: Jirka <jirka.borovec@seznam.cz>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
4 people committed Jul 29, 2021
1 parent b1062c9 commit bb083f2
Showing 12 changed files with 575 additions and 5 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -20,6 +20,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added Symmetric Mean Absolute Percentage error (SMAPE) ([#375](https://github.com/PyTorchLightning/metrics/issues/375))


- Added ROUGE Metric ([#399](https://github.com/PyTorchLightning/metrics/issues/399))


- Allowed passing labels in (n_samples, n_classes) to `AveragePrecision` ([#386](https://github.com/PyTorchLightning/metrics/issues/386))


4 changes: 4 additions & 0 deletions docs/source/references/functional.rst
@@ -355,6 +355,10 @@ bleu_score [func]
.. autofunction:: torchmetrics.functional.bleu_score
    :noindex:

rouge_score [func]
~~~~~~~~~~~~~~~~~~

.. autofunction:: torchmetrics.functional.rouge_score

wer [func]
~~~~~~~~~~
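As a quick illustration of the functional interface documented above, a minimal sketch of a call (illustrative, not part of the diff; it assumes nltk and rouge-score are installed and reuses the example sentences from the new tests further down — exact defaults may differ):

from torchmetrics.functional import rouge_score

preds = ["The quick brown fox jumps over the lazy dog"]
targets = ["The quick brown dog jumps on the log."]

# Returns a dict of scalar tensors keyed per ROUGE variant and statistic,
# e.g. "rouge1_precision", "rouge1_recall", "rouge1_fmeasure", ..., "rougeLsum_fmeasure".
scores = rouge_score(preds, targets)
print(scores["rouge1_fmeasure"], scores["rougeL_fmeasure"])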
6 changes: 6 additions & 0 deletions docs/source/references/modules.rst
@@ -517,6 +517,12 @@ BLEUScore
.. autoclass:: torchmetrics.BLEUScore
    :noindex:

ROUGEScore
~~~~~~~~~~

.. autoclass:: torchmetrics.ROUGEScore
    :noindex:


WER
~~~
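The module interface documented above accumulates state across batches via update/compute, mirroring the usage in the new tests below; a minimal sketch under the same assumptions (nltk and rouge-score installed, illustrative rather than taken from the diff):

from torchmetrics import ROUGEScore

# rouge_keys, use_stemmer, newline_sep and decimal_places are configurable, per the diff below.
rouge = ROUGEScore()
rouge.update(["The quick brown fox jumps over the lazy dog"],
             ["The quick brown dog jumps on the log."])
rouge.update(["My name is John"], ["Is your name John"])
scores = rouge.compute()  # dict of aggregated scalar tensors, e.g. scores["rouge2_fmeasure"]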
1 change: 0 additions & 1 deletion requirements/test.txt
@@ -15,7 +15,6 @@ phmdoctest>=1.1.1
cloudpickle>=1.3
scikit-learn>=0.24
scikit-image>0.17.1
nltk>=3.6

# add extra requirements
-r image.txt
4 changes: 3 additions & 1 deletion requirements/text.txt
@@ -1 +1,3 @@
jiwer==2.2.0
jiwer>=2.2.0
nltk>=3.6
rouge-score>=0.0.4
205 changes: 205 additions & 0 deletions tests/text/test_rouge.py
@@ -0,0 +1,205 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List

import pytest
import torch
from torch import tensor

from torchmetrics.functional.text.rouge import rouge_score
from torchmetrics.text.rouge import ROUGEScore
from torchmetrics.utilities.imports import _NLTK_AVAILABLE, _ROUGE_SCORE_AVAILABLE

if _ROUGE_SCORE_AVAILABLE:
    from rouge_score.rouge_scorer import RougeScorer
    from rouge_score.scoring import BootstrapAggregator
else:
    RougeScorer, BootstrapAggregator = object, object

ROUGE_KEYS = ("rouge1", "rouge2", "rougeL", "rougeLsum")

PRECISION = 0
RECALL = 1
F_MEASURE = 2

SINGLE_SENTENCE_EXAMPLE_PREDS = 'The quick brown fox jumps over the lazy dog'
SINGLE_SENTENCE_EXAMPLE_TARGET = 'The quick brown dog jumps on the log.'

PREDS = "My name is John".split()
TARGETS = "Is your name John".split()

BATCHES_RS_PREDS = [SINGLE_SENTENCE_EXAMPLE_PREDS]
BATCHES_RS_PREDS.extend(PREDS)
BATCHES_RS_TARGETS = [SINGLE_SENTENCE_EXAMPLE_TARGET]
BATCHES_RS_TARGETS.extend(TARGETS)

BATCHES = [
    dict(preds=[SINGLE_SENTENCE_EXAMPLE_PREDS], targets=[SINGLE_SENTENCE_EXAMPLE_TARGET]),
    dict(preds=PREDS, targets=TARGETS)
]


def _compute_rouge_score(preds: List[str], targets: List[str], use_stemmer: bool):
    scorer = RougeScorer(ROUGE_KEYS, use_stemmer=use_stemmer)
    aggregator = BootstrapAggregator()
    for pred, target in zip(preds, targets):
        aggregator.add_scores(scorer.score(pred, target))
    return aggregator.aggregate()


@pytest.mark.skipif(not (_NLTK_AVAILABLE or _ROUGE_SCORE_AVAILABLE), reason='test requires nltk and rouge-score')
@pytest.mark.parametrize(
    ["pl_rouge_metric_key", "rouge_score_key", "metric", "decimal_places", "use_stemmer", "newline_sep"],
    [
        pytest.param("rouge1_precision", "rouge1", PRECISION, 1, True, True),
        pytest.param("rouge1_recall", "rouge1", RECALL, 2, True, False),
        pytest.param("rouge1_fmeasure", "rouge1", F_MEASURE, 3, False, True),
        pytest.param("rouge2_precision", "rouge2", PRECISION, 4, False, False),
        pytest.param("rouge2_recall", "rouge2", RECALL, 5, True, True),
        pytest.param("rouge2_fmeasure", "rouge2", F_MEASURE, 6, True, False),
        pytest.param("rougeL_precision", "rougeL", PRECISION, 6, False, True),
        pytest.param("rougeL_recall", "rougeL", RECALL, 5, False, False),
        pytest.param("rougeL_fmeasure", "rougeL", F_MEASURE, 3, True, True),
        pytest.param("rougeLsum_precision", "rougeLsum", PRECISION, 2, True, False),
        pytest.param("rougeLsum_recall", "rougeLsum", RECALL, 1, False, True),
        pytest.param("rougeLsum_fmeasure", "rougeLsum", F_MEASURE, 8, False, False),
    ],
)
def test_rouge_metric_functional_single_sentence(
    pl_rouge_metric_key, rouge_score_key, metric, decimal_places, use_stemmer, newline_sep
):
    scorer = RougeScorer(ROUGE_KEYS)
    rs_scores = scorer.score(SINGLE_SENTENCE_EXAMPLE_PREDS, SINGLE_SENTENCE_EXAMPLE_TARGET)
    rs_output = round(rs_scores[rouge_score_key][metric], decimal_places)

    pl_output = rouge_score([SINGLE_SENTENCE_EXAMPLE_PREDS], [SINGLE_SENTENCE_EXAMPLE_TARGET],
                            newline_sep=newline_sep,
                            use_stemmer=use_stemmer,
                            decimal_places=decimal_places)

    assert torch.allclose(pl_output[pl_rouge_metric_key], tensor(rs_output, dtype=torch.float32))


@pytest.mark.skipif(not (_NLTK_AVAILABLE or _ROUGE_SCORE_AVAILABLE), reason='test requires nltk and rouge-score')
@pytest.mark.parametrize(
    ["pl_rouge_metric_key", "rouge_score_key", "metric", "decimal_places", "use_stemmer", "newline_sep"],
    [
        pytest.param("rouge1_precision", "rouge1", PRECISION, 1, True, True),
        pytest.param("rouge1_recall", "rouge1", RECALL, 2, True, False),
        pytest.param("rouge1_fmeasure", "rouge1", F_MEASURE, 3, False, True),
        pytest.param("rouge2_precision", "rouge2", PRECISION, 4, False, False),
        pytest.param("rouge2_recall", "rouge2", RECALL, 5, True, True),
        pytest.param("rouge2_fmeasure", "rouge2", F_MEASURE, 6, True, False),
        pytest.param("rougeL_precision", "rougeL", PRECISION, 6, False, True),
        pytest.param("rougeL_recall", "rougeL", RECALL, 5, False, False),
        pytest.param("rougeL_fmeasure", "rougeL", F_MEASURE, 3, True, True),
        pytest.param("rougeLsum_precision", "rougeLsum", PRECISION, 2, True, False),
        pytest.param("rougeLsum_recall", "rougeLsum", RECALL, 1, False, True),
        pytest.param("rougeLsum_fmeasure", "rougeLsum", F_MEASURE, 8, False, False),
    ],
)
def test_rouge_metric_functional(
    pl_rouge_metric_key, rouge_score_key, metric, decimal_places, use_stemmer, newline_sep
):
    rs_scores = _compute_rouge_score(PREDS, TARGETS, use_stemmer=use_stemmer)
    rs_output = round(rs_scores[rouge_score_key].mid[metric], decimal_places)

    pl_output = rouge_score(
        PREDS, TARGETS, newline_sep=newline_sep, use_stemmer=use_stemmer, decimal_places=decimal_places
    )

    assert torch.allclose(pl_output[pl_rouge_metric_key], tensor(rs_output, dtype=torch.float32))


@pytest.mark.skipif(not (_NLTK_AVAILABLE or _ROUGE_SCORE_AVAILABLE), reason='test requires nltk and rouge-score')
@pytest.mark.parametrize(
    ["pl_rouge_metric_key", "rouge_score_key", "metric", "decimal_places", "use_stemmer", "newline_sep"],
    [
        pytest.param("rouge1_precision", "rouge1", PRECISION, 1, True, True),
        pytest.param("rouge1_recall", "rouge1", RECALL, 2, True, False),
        pytest.param("rouge1_fmeasure", "rouge1", F_MEASURE, 3, False, True),
        pytest.param("rouge2_precision", "rouge2", PRECISION, 4, False, False),
        pytest.param("rouge2_recall", "rouge2", RECALL, 5, True, True),
        pytest.param("rouge2_fmeasure", "rouge2", F_MEASURE, 6, True, False),
        pytest.param("rougeL_precision", "rougeL", PRECISION, 6, False, True),
        pytest.param("rougeL_recall", "rougeL", RECALL, 5, False, False),
        pytest.param("rougeL_fmeasure", "rougeL", F_MEASURE, 3, True, True),
        pytest.param("rougeLsum_precision", "rougeLsum", PRECISION, 2, True, False),
        pytest.param("rougeLsum_recall", "rougeLsum", RECALL, 1, False, True),
        pytest.param("rougeLsum_fmeasure", "rougeLsum", F_MEASURE, 8, False, False),
    ],
)
def test_rouge_metric_class(pl_rouge_metric_key, rouge_score_key, metric, decimal_places, use_stemmer, newline_sep):
    scorer = RougeScorer(ROUGE_KEYS)
    rs_scores = scorer.score(SINGLE_SENTENCE_EXAMPLE_PREDS, SINGLE_SENTENCE_EXAMPLE_TARGET)
    rs_output = round(rs_scores[rouge_score_key][metric], decimal_places)

    rouge = ROUGEScore(newline_sep=newline_sep, use_stemmer=use_stemmer, decimal_places=decimal_places)
    pl_output = rouge([SINGLE_SENTENCE_EXAMPLE_PREDS], [SINGLE_SENTENCE_EXAMPLE_TARGET])

    assert torch.allclose(pl_output[pl_rouge_metric_key], tensor(rs_output, dtype=torch.float32))


@pytest.mark.skipif(not (_NLTK_AVAILABLE or _ROUGE_SCORE_AVAILABLE), reason='test requires nltk and rouge-score')
@pytest.mark.parametrize(
    ["pl_rouge_metric_key", "rouge_score_key", "metric", "decimal_places", "use_stemmer", "newline_sep"],
    [
        pytest.param("rouge1_precision", "rouge1", PRECISION, 1, True, True),
        pytest.param("rouge1_recall", "rouge1", RECALL, 2, True, False),
        pytest.param("rouge1_fmeasure", "rouge1", F_MEASURE, 3, False, True),
        pytest.param("rouge2_precision", "rouge2", PRECISION, 4, False, False),
        pytest.param("rouge2_recall", "rouge2", RECALL, 5, True, True),
        pytest.param("rouge2_fmeasure", "rouge2", F_MEASURE, 6, True, False),
        pytest.param("rougeL_precision", "rougeL", PRECISION, 6, False, True),
        pytest.param("rougeL_recall", "rougeL", RECALL, 5, False, False),
        pytest.param("rougeL_fmeasure", "rougeL", F_MEASURE, 3, True, True),
        pytest.param("rougeLsum_precision", "rougeLsum", PRECISION, 2, True, False),
        pytest.param("rougeLsum_recall", "rougeLsum", RECALL, 1, False, True),
        pytest.param("rougeLsum_fmeasure", "rougeLsum", F_MEASURE, 8, False, False),
    ],
)
def test_rouge_metric_class_batches(
    pl_rouge_metric_key, rouge_score_key, metric, decimal_places, use_stemmer, newline_sep
):
    rs_scores = _compute_rouge_score(BATCHES_RS_PREDS, BATCHES_RS_TARGETS, use_stemmer=use_stemmer)
    rs_output = round(rs_scores[rouge_score_key].mid[metric], decimal_places)

    rouge = ROUGEScore(newline_sep=newline_sep, use_stemmer=use_stemmer, decimal_places=decimal_places)
    for batch in BATCHES:
        rouge.update(batch['preds'], batch['targets'])
    pl_output = rouge.compute()

    assert torch.allclose(pl_output[pl_rouge_metric_key], tensor(rs_output, dtype=torch.float32))


def test_rouge_metric_raises_errors_and_warnings():
    """ Test that expected warnings and errors are raised """
    if not (_NLTK_AVAILABLE and _ROUGE_SCORE_AVAILABLE):
        with pytest.raises(
            ValueError,
            match='ROUGE metric requires that both nltk and rouge-score is installed.'
            'Either as `pip install torchmetrics[text]` or `pip install nltk rouge-score`'
        ):
            ROUGEScore()


def test_rouge_metric_wrong_key_value_error():
    key = ("rouge1", "rouge")

    with pytest.raises(ValueError):
        ROUGEScore(rouge_keys=key)

    with pytest.raises(ValueError):
        rouge_score(PREDS, TARGETS, rouge_keys=key)
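The tests above assert parity with the reference rouge-score package; a condensed, standalone version of that cross-check (an illustrative sketch, not part of the diff, assuming nltk and rouge-score are installed — values should agree up to the configured rounding):

from rouge_score.rouge_scorer import RougeScorer
from torchmetrics.functional.text.rouge import rouge_score

pred = "The quick brown fox jumps over the lazy dog"
target = "The quick brown dog jumps on the log."

# Reference scores from the rouge-score package.
reference = RougeScorer(("rouge1", "rouge2", "rougeL", "rougeLsum")).score(pred, target)

# Scores from the implementation added in this commit.
tm_scores = rouge_score([pred], [target])

print(reference["rouge1"].fmeasure, tm_scores["rouge1_fmeasure"])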
2 changes: 1 addition & 1 deletion torchmetrics/__init__.py
@@ -60,5 +60,5 @@
    RetrievalPrecision,
    RetrievalRecall,
)
from torchmetrics.text import WER, BLEUScore # noqa: E402, F401
from torchmetrics.text import WER, BLEUScore, ROUGEScore # noqa: E402, F401
from torchmetrics.wrappers import BootStrapper # noqa: E402, F401
1 change: 1 addition & 0 deletions torchmetrics/functional/__init__.py
@@ -58,4 +58,5 @@
from torchmetrics.functional.retrieval.reciprocal_rank import retrieval_reciprocal_rank # noqa: F401
from torchmetrics.functional.self_supervised import embedding_similarity # noqa: F401
from torchmetrics.functional.text.bleu import bleu_score # noqa: F401
from torchmetrics.functional.text.rouge import rouge_score # noqa: F401
from torchmetrics.functional.text.wer import wer # noqa: F401
