Skip to content

Commit

Permalink
text: reduce lang. specific mecab-ko (#2456)
Browse files Browse the repository at this point in the history
* text: reduce lang. specific `mecab-ko`
* mecab-ko for testing
* Add skips for TestSacreBLEUScore for ko-mecab

---------

Co-authored-by: daniel.stancl <daniel.stancl@rossum.ai>
  • Loading branch information
Borda and daniel.stancl committed Apr 19, 2024
1 parent d03ca5e commit 9d04667
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 2 deletions.
2 changes: 0 additions & 2 deletions requirements/text.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,5 @@ tqdm >=4.41.0, <4.67.0
regex >=2021.9.24, <=2023.12.25
transformers >4.4.0, <4.40.0
mecab-python3 >=1.0.6, <1.1.0
mecab-ko >=1.0.0, <1.1.0
mecab-ko-dic >=1.0.0, <1.1.0
ipadic >=1.0.0, <1.1.0
sentencepiece >=0.2.0, <0.3.0
3 changes: 3 additions & 0 deletions requirements/text_test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@ rouge-score >0.1.0, <=0.1.2
bert_score ==0.3.13
huggingface-hub <0.23
sacrebleu >=2.3.0, <2.5.0

mecab-ko >=1.0.0, <1.1.0
mecab-ko-dic >=1.0.0, <1.1.0
14 changes: 14 additions & 0 deletions tests/unittests/text/test_sacre_bleu.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from typing import Sequence

import pytest
from lightning_utilities.core.imports import RequirementCache
from torch import Tensor, tensor
from torchmetrics.functional.text.sacre_bleu import AVAILABLE_TOKENIZERS, _TokenizersLiteral, sacre_bleu_score
from torchmetrics.text.sacre_bleu import SacreBLEUScore
Expand Down Expand Up @@ -51,6 +52,8 @@ class TestSacreBLEUScore(TextTester):
@pytest.mark.parametrize("ddp", [pytest.param(True, marks=pytest.mark.DDP), False])
def test_bleu_score_class(self, ddp, preds, targets, tokenize, lowercase):
"""Test class implementation of metric."""
if _should_skip_tokenizer(tokenize):
pytest.skip(reason="`ko-mecab` tokenizer requires `mecab-ko` package to be installed")
if tokenize == "flores200":
pytest.skip("flores200 tests are flaky") # TODO: figure out why

Expand All @@ -68,6 +71,9 @@ def test_bleu_score_class(self, ddp, preds, targets, tokenize, lowercase):

def test_bleu_score_functional(self, preds, targets, tokenize, lowercase):
"""Test functional implementation of metric."""
if _should_skip_tokenizer(tokenize):
pytest.skip(reason="`ko-mecab` tokenizer requires `mecab-ko` package to be installed")

metric_args = {"tokenize": tokenize, "lowercase": lowercase}
original_sacrebleu = partial(_reference_sacre_bleu, tokenize=tokenize, lowercase=lowercase)

Expand All @@ -81,6 +87,9 @@ def test_bleu_score_functional(self, preds, targets, tokenize, lowercase):

def test_bleu_score_differentiability(self, preds, targets, tokenize, lowercase):
"""Test the differentiability of the metric, according to its `is_differentiable` attribute."""
if _should_skip_tokenizer(tokenize):
pytest.skip(reason="`ko-mecab` tokenizer requires `mecab-ko` package to be installed")

metric_args = {"tokenize": tokenize, "lowercase": lowercase}

self.run_differentiability_test(
Expand Down Expand Up @@ -122,6 +131,7 @@ def test_tokenize_ja_mecab():
assert sacrebleu(preds, targets) == _reference_sacre_bleu(preds, targets, tokenize="ja-mecab", lowercase=False)


@pytest.mark.skipif(not RequirementCache("mecab-ko"), reason="this test requires `mecab-ko` package to be installed")
def test_tokenize_ko_mecab():
"""Test that `ko-mecab` tokenizer works on a Korean text in alignment with the SacreBleu implementation."""
sacrebleu = SacreBLEUScore(tokenize="ko-mecab")
Expand All @@ -134,3 +144,7 @@ def test_tokenize_ko_mecab():
def test_equivalence_of_available_tokenizers_and_annotation():
    """Check that the runtime tokenizer list and the tokenizer type annotation name the same tokenizers."""
    # Compare as sets: ordering and duplicates are irrelevant, only membership matters.
    runtime_tokenizers = set(AVAILABLE_TOKENIZERS)
    annotated_tokenizers = set(_TokenizersLiteral.__args__)
    assert runtime_tokenizers == annotated_tokenizers


def _should_skip_tokenizer(tokenizer: _TokenizersLiteral) -> bool:
    """Tell whether tests for ``tokenizer`` have to be skipped in the current environment.

    Args:
        tokenizer: Name of the SacreBLEU tokenizer the test is parametrized with.

    Returns:
        ``True`` only for the ``"ko-mecab"`` tokenizer when the ``mecab-ko`` requirement
        is not satisfied; ``False`` for every other tokenizer.

    """
    # Only the Korean tokenizer depends on the optional `mecab-ko` package.
    if tokenizer != "ko-mecab":
        return False
    return not RequirementCache("mecab-ko")

0 comments on commit 9d04667

Please sign in to comment.