add intl to korean and arabic (#698)

* add intl * fix * add tests to ar and ko * ko tokenizer * ko-mecab addition * add more examples + error * Moved sacrebleu dependencies to tests.rqr * Added general mechanism for detailed installation messages for required packages Signed-off-by: Yoav Katz <katz@il.ibm.com> * Simplified error mechanism for missing requirements Signed-off-by: Yoav Katz <katz@il.ibm.com> * Add documentation for package requirements mixin Signed-off-by: Yoav Katz <katz@il.ibm.com> * Added unitest for sacrebleu Signed-off-by: Yoav Katz <katz@il.ibm.com> * Fixed typo in doc. Signed-off-by: Yoav Katz <katz@il.ibm.com> --------- Signed-off-by: Yoav Katz <katz@il.ibm.com> Co-authored-by: Przemysław Klocek <przemyslaw.klocek@ibm.com> Co-authored-by: Yoav Katz <katz@il.ibm.com>
IBM · Mar 27, 2024 · 44a410c · 44a410c
1 parent c1fd5e4
commit 44a410c
Show file tree

Hide file tree

Showing 7 changed files with 238 additions and 52 deletions.
diff --git a/prepare/metrics/normalized_sacrebleu.py b/prepare/metrics/normalized_sacrebleu.py
@@ -1,5 +1,5 @@
 from src.unitxt import add_to_catalog
-from src.unitxt.metrics import HuggingfaceMetric, MetricPipeline
+from src.unitxt.metrics import MetricPipeline, NormalizedSacrebleu
 from src.unitxt.operators import CopyFields, MapInstanceValues
 from src.unitxt.test_utils.metrics import test_metric
 
@@ -12,15 +12,15 @@
     "French": None,
     "Spanish": None,
     "Portuguese": None,
-    "Arabic": None,
-    "Korean": None,
+    "Arabic": "intl",
+    "Korean": "ko-mecab",
     "fr": None,
     "de": None,
     "es": None,
     "pt": None,
     "en": None,
-    "ar": None,
-    "ko": None,
+    "ar": "intl",
+    "ko": "ko-mecab",
     "japanese": "ja-mecab",
     "Japanese": "ja-mecab",
     "ja": "ja-mecab",
@@ -41,17 +41,11 @@
             use_query=True,
         ),
     ],
-    metric=HuggingfaceMetric(
-        hf_metric_name="sacrebleu",
-        hf_main_score="score",
-        prediction_type="str",
-        main_score="sacrebleu",
-        scale=100.0,
-        scaled_fields=["sacrebleu", "precisions"],
-        hf_additional_input_fields_pass_one_value=["tokenize"],
-    ),
+    metric=NormalizedSacrebleu(),
 )
 
+### ENGLISH
+
 predictions = ["hello there general kenobi", "on our way to ankh morpork"]
 references = [
     ["hello there general kenobi", "hello there !"],
@@ -106,39 +100,117 @@
     global_target=global_target,
 )
 
+
+### JAPANESE
+
 predictions = [
-    "他の専門家たちと同様に、彼は糖尿病を完治できるかどうかについては懐疑的であり、これらの調査結果はすでにI型糖尿病を患っている人々には何の関連性もないことを指摘しています。"
+    "他の専門家たちと同様に、彼は糖尿病を完治できるかどうかについては懐疑的であり、これらの調査結果はすでにI型糖尿病を患っている人々には何の関連性もないことを指摘しています。",
+    "他方、成績評価の甘い授業がく評価されたり、人気取に走教師が出たりし、成績のりや大学教師のレベルダウという弊害をもたら恐れがある、などの反省見もある.",
 ]
 references = [
     [
         "他の専門家たちと同様に、彼は糖尿病を完治できるかどうかについては懐疑的であり、これらの調査結果はすでにI型糖尿病を患っている人々には何の関連性もないことを指摘しています。"
-    ]
+    ],
+    [
+        "他方、成績評価の甘い授業が高く評価されたり、人気取りに走る教師が出たりし、成績の安売りや大学教師のレベルダウンという弊害をもたらす恐れがある、などの反省意見もある."
+    ],
 ]
-task_data = [{"target_language": "ja"}]
+task_data = len(predictions) * [{"target_language": "ja", "tokenize": "ja-mecab"}]
+
 instance_targets = [
     {
-        "bp": 1.0,
         "counts": [57, 56, 55, 54],
+        "totals": [57, 56, 55, 54],
         "precisions": [1.0, 1.0, 1.0, 1.0],
+        "bp": 1.0,
+        "sys_len": 57,
         "ref_len": 57,
         "sacrebleu": 1.0,
         "score": 1.0,
         "score_name": "sacrebleu",
-        "sys_len": 57,
-        "totals": [57, 56, 55, 54],
+    },
+    {
+        "counts": [39, 31, 24, 17],
+        "totals": [47, 46, 45, 44],
+        "precisions": [0.83, 0.67, 0.53, 0.39],
+        "bp": 0.98,
+        "sys_len": 47,
+        "ref_len": 48,
+        "sacrebleu": 0.57,
+        "score": 0.57,
+        "score_name": "sacrebleu",
+    },
+]
+
+
+global_target = {
+    "counts": [96, 87, 79, 71],
+    "totals": [104, 102, 100, 98],
+    "precisions": [0.92, 0.85, 0.79, 0.72],
+    "bp": 0.99,
+    "sys_len": 104,
+    "ref_len": 105,
+    "sacrebleu": 0.81,
+    "score": 0.81,
+    "score_name": "sacrebleu",
+    "score_ci_low": 0.57,
+    "score_ci_high": 1.0,
+    "sacrebleu_ci_low": 0.57,
+    "sacrebleu_ci_high": 1.0,
+}
+outputs = test_metric(
+    metric=metric,
+    predictions=predictions,
+    references=references,
+    instance_targets=instance_targets,
+    global_target=global_target,
+    task_data=task_data,
+)
+
+### ARABIC
+
+predictions = ["لى يسارك ، بر ماركت.", "ﻣَﺮَّﺕ ﻋِﺪَّﺓُ ﺳَﻨَﻮَﺍﺕٍ ﻗَﺒﻞ ﺃَﻥ ﺃَﺭَﺍﻫَﺎ ﻣِﻦ ﺟَﺪِﻳﺪٍ"]
+references = [["على ، ستمر سوبر ماركت."], ["ﻣَﺮَّﺕ ﻋِﺪَّﺓُ ﺳَﻨَﻮَﺍﺕٍ ﻗَﺒﻞ ﺃَﻥ ﺃَﺭَﺍﻫَﺎ ﻣِﻦ ﺟَﺪِﻳﺪٍ"]]
+task_data = len(predictions) * [{"target_language": "ar", "tokenize": "intl"}]
+instance_targets = [
+    {
+        "counts": [3, 1, 0, 0],
+        "totals": [6, 5, 4, 3],
+        "precisions": [0.5, 0.2, 0.12, 0.08],
+        "bp": 1.0,
+        "sys_len": 6,
+        "ref_len": 6,
+        "sacrebleu": 0.18,
+        "score": 0.18,
+        "score_name": "sacrebleu",
+    },
+    {
+        "counts": [8, 7, 6, 5],
+        "totals": [8, 7, 6, 5],
+        "precisions": [1.0, 1.0, 1.0, 1.0],
+        "bp": 1.0,
+        "sys_len": 8,
+        "ref_len": 8,
+        "sacrebleu": 1.0,
+        "score": 1.0,
+        "score_name": "sacrebleu",
     },
 ]
 
 global_target = {
+    "counts": [11, 8, 6, 5],
+    "totals": [14, 12, 10, 8],
+    "precisions": [0.79, 0.67, 0.6, 0.62],
     "bp": 1.0,
-    "counts": [57, 56, 55, 54],
-    "precisions": [1.0, 1.0, 1.0, 1.0],
-    "ref_len": 57,
-    "sacrebleu": 1.0,
-    "score": 1.0,
+    "sys_len": 14,
+    "ref_len": 14,
+    "sacrebleu": 0.67,
+    "score": 0.67,
     "score_name": "sacrebleu",
-    "sys_len": 57,
-    "totals": [57, 56, 55, 54],
+    "score_ci_low": 0.13,
+    "score_ci_high": 1.0,
+    "sacrebleu_ci_low": 0.13,
+    "sacrebleu_ci_high": 1.0,
 }
 
 outputs = test_metric(
@@ -150,5 +222,63 @@
     task_data=task_data,
 )
 
+### KOREAN
+
+predictions = ["이게에 신을 살 거예요", "저는 한국 친구를 사귀고 싶습니다"]
+references = [
+    ["이 가게에서 신발을 살 거예요", "이 가에서 신발살 거예요"],
+    ["저는 한국 친구를 사귀고 싶습니다", "저는 한구를 사귀 싶습니다"],
+]
+task_data = len(predictions) * [{"target_language": "ko", "tokenize": "ko-mecab"}]
+
+instance_targets = [
+    {
+        "counts": [4, 3, 2, 1],
+        "totals": [7, 6, 5, 4],
+        "precisions": [0.57, 0.5, 0.4, 0.25],
+        "bp": 1.0,
+        "sys_len": 7,
+        "ref_len": 7,
+        "sacrebleu": 0.41,
+        "score": 0.41,
+        "score_name": "sacrebleu",
+    },
+    {
+        "counts": [9, 8, 7, 6],
+        "totals": [9, 8, 7, 6],
+        "precisions": [1.0, 1.0, 1.0, 1.0],
+        "bp": 1.0,
+        "sys_len": 9,
+        "ref_len": 9,
+        "sacrebleu": 1.0,
+        "score": 1.0,
+        "score_name": "sacrebleu",
+    },
+]
+
+global_target = {
+    "counts": [13, 11, 9, 7],
+    "totals": [16, 14, 12, 10],
+    "precisions": [0.81, 0.79, 0.75, 0.7],
+    "bp": 1.0,
+    "sys_len": 16,
+    "ref_len": 16,
+    "sacrebleu": 0.76,
+    "score": 0.76,
+    "score_name": "sacrebleu",
+    "score_ci_low": 0.41,
+    "score_ci_high": 1.0,
+    "sacrebleu_ci_low": 0.41,
+    "sacrebleu_ci_high": 1.0,
+}
+
+outputs = test_metric(
+    metric=metric,
+    predictions=predictions,
+    references=references,
+    instance_targets=instance_targets,
+    global_target=global_target,
+    task_data=task_data,
+)
 
 add_to_catalog(metric, "metrics.normalized_sacrebleu", overwrite=True)
diff --git a/requirements/base.rqr b/requirements/base.rqr
@@ -1,6 +1,5 @@
 datasets>=2.16.0
 evaluate
-mecab-python3
 absl-py
 ipadic
 scipy
diff --git a/requirements/tests.rqr b/requirements/tests.rqr
@@ -7,7 +7,8 @@ httpretty~=1.1.4
 editdistance
 rouge-score
 nltk
-sacrebleu
+mecab-python3
+sacrebleu[ko]
 scikit-learn
 jiwer
 conllu

diff --git a/src/unitxt/catalog/metrics/normalized_sacrebleu.json b/src/unitxt/catalog/metrics/normalized_sacrebleu.json
@@ -26,15 +26,15 @@
                     "French": null,
                     "Spanish": null,
                     "Portuguese": null,
-                    "Arabic": null,
-                    "Korean": null,
+                    "Arabic": "intl",
+                    "Korean": "ko-mecab",
                     "fr": null,
                     "de": null,
                     "es": null,
                     "pt": null,
                     "en": null,
-                    "ar": null,
-                    "ko": null,
+                    "ar": "intl",
+                    "ko": "ko-mecab",
                     "japanese": "ja-mecab",
                     "Japanese": "ja-mecab",
                     "ja": "ja-mecab"
@@ -45,18 +45,6 @@
         }
     ],
     "metric": {
-        "type": "huggingface_metric",
-        "hf_metric_name": "sacrebleu",
-        "hf_main_score": "score",
-        "prediction_type": "str",
-        "main_score": "sacrebleu",
-        "scale": 100.0,
-        "scaled_fields": [
-            "sacrebleu",
-            "precisions"
-        ],
-        "hf_additional_input_fields_pass_one_value": [
-            "tokenize"
-        ]
+        "type": "normalized_sacrebleu"
     }
 }
diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py
@@ -3302,3 +3302,30 @@ def compute(
                 best_thr = thr
 
         return {self.main_score: best_acc, "best_thr_max_acc": best_thr}
+
+
+KO_ERROR_MESSAGE = """
+
+Additional dependencies required. To install them, run:
+`pip install "sacrebleu[ko]"`.
+
+For macOS: if an error about 'mecab-config' shows up during installation, run:
+
+`brew install mecab`
+`pip install "sacrebleu[ko]"`
+
+"""
+
+
+class NormalizedSacrebleu(HuggingfaceMetric):
+    hf_metric_name = "sacrebleu"
+    hf_main_score = "score"
+    prediction_type = "str"
+    main_score = "sacrebleu"
+    scale = 100.0
+    scaled_fields = ["sacrebleu", "precisions"]
+    hf_additional_input_fields_pass_one_value = ["tokenize"]
+    _requirements_list = {
+        "mecab_ko": KO_ERROR_MESSAGE,
+        "mecab_ko_dic": KO_ERROR_MESSAGE,
+    }
diff --git a/src/unitxt/operator.py b/src/unitxt/operator.py
@@ -1,7 +1,7 @@
 import re
 from abc import abstractmethod
 from dataclasses import field
-from typing import Any, Dict, Generator, List, Optional
+from typing import Any, Dict, Generator, List, Optional, Union
 
 from .artifact import Artifact
 from .dataclass import InternalField, NonPositionalField
@@ -14,7 +14,18 @@ class Operator(Artifact):
 
 
 class PackageRequirementsMixin(Artifact):
-    _requirements_list: List[str] = InternalField(default_factory=list)
+    """Base class used to automatically check for the existence of required python dependencies for an artifact (e.g. Operator or Metric).
+
+    The _requirements_list is either a list of required packages
+    (e.g. ["torch","sentence_transformers"]) or a dictionary mapping required packages
+    to detailed installation instructions on how to install each package
+    (e.g. {"torch" : "Install Torch using `pip install torch`", "sentence_transformers" : "Install Sentence Transformers using `pip install sentence-transformers`"}).
+    Note that the package names should be specified as they are used in the python import statement for the package.
+    """
+
+    _requirements_list: Union[List[str], Dict[str, str]] = InternalField(
+        default_factory=list
+    )
 
     def verify(self):
         super().verify()
@@ -23,19 +34,30 @@ def verify(self):
     def check_missing_requirements(self, requirements=None):
         if requirements is None:
             requirements = self._requirements_list
+        if isinstance(requirements, List):
+            requirements = {package: "" for package in requirements}
+
         missing_packages = []
-        for package in requirements:
+        installation_instructions = []
+        for package, installation_instruction in requirements.items():
             if not is_module_available(package):
                 missing_packages.append(package)
+                installation_instructions.append(installation_instruction)
         if missing_packages:
-            raise MissingRequirementsError(self.__class__.__name__, missing_packages)
+            raise MissingRequirementsError(
+                self.__class__.__name__, missing_packages, installation_instructions
+            )
 
 
 class MissingRequirementsError(Exception):
-    def __init__(self, class_name, missing_packages):
+    def __init__(self, class_name, missing_packages, installation_instructions):
         self.class_name = class_name
         self.missing_packages = missing_packages
-        self.message = f"{self.class_name} requires the following missing package(s): {', '.join(self.missing_packages)}"
+        self.installation_instruction = installation_instructions
+        self.message = (
+            f"{self.class_name} requires the following missing package(s): {', '.join(self.missing_packages)}. "
+            + "\n".join(self.installation_instruction)
+        )
         super().__init__(self.message)