From 74899e451114e4126367b28bc372d647b1c20002 Mon Sep 17 00:00:00 2001
From: Yoav Katz <68273864+yoavkatz@users.noreply.github.com>
Date: Thu, 28 Mar 2024 10:43:05 +0200
Subject: [PATCH] Allow disabling per group f1 scores in customF1 (#719)

Signed-off-by: Yoav Katz
Co-authored-by: Elron Bandel
---
 prepare/metrics/custom_f1.py  | 67 +++++++++++++++++++++++++++++++++++
 src/unitxt/metrics.py         | 46 ++++++++++++++++--------
 tests/library/test_metrics.py | 36 +++++++++++++++++++
 3 files changed, 134 insertions(+), 15 deletions(-)

diff --git a/prepare/metrics/custom_f1.py b/prepare/metrics/custom_f1.py
index a5ae1af60..42df32fce 100644
--- a/prepare/metrics/custom_f1.py
+++ b/prepare/metrics/custom_f1.py
@@ -358,4 +358,71 @@
     global_target=global_target,
 )
 
+
+class NERWithoutClassReporting(NER):
+    report_per_group_scores = False
+
+
+metric_without_class_reporting = NERWithoutClassReporting()
+# 1.4 multi classes multi examples
+predictions = [
+    [
+        ("Dalia", "Person"),
+        ("Amir", "Person"),
+        ("Yaron", "Person"),
+        ("Ramat-Gan", "Location"),
+        ("Ramat-Gan", "Location"),
+        ("IBM", "Org"),
+        ("CIA", "Org"),
+        ("FBI", "Org"),
+    ]
+]
+references = [
+    [
+        [
+            ("Amir", "Person"),
+            ("Yaron", "Person"),
+            ("Dalia", "Person"),
+            ("Naftali", "Person"),
+            ("Ramat-Gan", "Location"),
+            ("Givataaim", "Location"),
+        ]
+    ]
+]
+# Person: Precision = 3/3, Recall = 3/4, F1 = 2 * 1 * 0.75 / (1 + 0.75) = 0.8571
+# Location: Precision = 1/2, Recall = 1/2, F1 = 0.5
+# Org (OOD): Precision = 0/3, Recall = 0/0 = 1(!), F1 = 0
+instance_targets = [
+    {
+        "recall_micro": 0.67,
+        "recall_macro": 0.62,
+        "precision_micro": 0.5,
+        "precision_macro": 0.75,  # Only on indomain classes
+        "f1_macro": 0.68,
+        "in_classes_support": 0.62,
+        "f1_micro": 0.57,
+        "score": 0.57,
+        "score_name": "f1_micro",
+    },
+]
+global_target = {
+    "recall_micro": 0.67,
+    "recall_macro": 0.62,
+    "precision_micro": 0.5,
+    "precision_macro": 0.75,
+    "f1_macro": 0.68,
+    "in_classes_support": 0.62,
+    "f1_micro": 0.57,
+    "score": 0.57,
+    "score_name": "f1_micro",
+}
+
+outputs = test_metric(
+    metric=metric_without_class_reporting,
+    predictions=predictions,
+    references=references,
+    instance_targets=instance_targets,
+    global_target=global_target,
+)
+
 add_to_catalog(metric, "metrics.ner", overwrite=True)
diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py
index a336b3df5..f4a5e60d3 100644
--- a/src/unitxt/metrics.py
+++ b/src/unitxt/metrics.py
@@ -1606,7 +1606,8 @@ class CustomF1(GlobalMetric):
     prediction_type = "Any"
     single_reference_per_prediction = True
     groups = None
-    zero_division = 0.0
+    zero_division: float = 0.0
+    report_per_group_scores: bool = True
 
     @abstractmethod
     def get_element_group(self, element, additional_input):
@@ -1737,6 +1738,35 @@ def compute(
             num_of_unknown_class_predictions += pd
 
         result = f1_result
+        self.add_macro_scores(f1_result, recall_result, precision_result, result)
+        self.add_in_class_support_scores(
+            num_of_unknown_class_predictions, pd_total, result
+        )
+        self.add_micro_scores(rd_total, rn_total, pd_total, pn_total, result)
+        if not self.report_per_group_scores:
+            for group in groups:
+                del result[f"f1_{group}"]
+        return result
+
+    def add_micro_scores(self, rd_total, rn_total, pd_total, pn_total, result):
+        result["f1_micro"] = self.f1(pn_total, pd_total, rn_total, rd_total)
+        result["recall_micro"] = self.recall(pn_total, pd_total, rn_total, rd_total)
+        result["precision_micro"] = self.precision(
+            pn_total, pd_total, rn_total, rd_total
+        )
+
+    def add_in_class_support_scores(
+        self, num_of_unknown_class_predictions, pd_total, result
+    ):
+        amount_of_predictions = pd_total
+        if amount_of_predictions == 0:
+            result["in_classes_support"] = 1.0
+        else:
+            result["in_classes_support"] = (
+                1.0 - num_of_unknown_class_predictions / amount_of_predictions
+            )
+
+    def add_macro_scores(self, f1_result, recall_result, precision_result, result):
         try:
             result["f1_macro"] = sum(f1_result.values()) / len(result.keys())
             result["recall_macro"] = sum(recall_result.values()) / len(
@@ -1750,20 +1780,6 @@ def compute(
             result["recall_macro"] = self.zero_division
             result["precision_macro"] = self.zero_division
 
-        amount_of_predictions = pd_total
-        if amount_of_predictions == 0:
-            result["in_classes_support"] = 1.0
-        else:
-            result["in_classes_support"] = (
-                1.0 - num_of_unknown_class_predictions / amount_of_predictions
-            )
-        result["f1_micro"] = self.f1(pn_total, pd_total, rn_total, rd_total)
-        result["recall_micro"] = self.recall(pn_total, pd_total, rn_total, rd_total)
-        result["precision_micro"] = self.precision(
-            pn_total, pd_total, rn_total, rd_total
-        )
-
-        return result
-
 
 class NER(CustomF1):
     prediction_type = "List[Tuple[str,str]]"
diff --git a/tests/library/test_metrics.py b/tests/library/test_metrics.py
index 21a1b2f92..d273f2350 100644
--- a/tests/library/test_metrics.py
+++ b/tests/library/test_metrics.py
@@ -2,6 +2,7 @@
 
 from src.unitxt.logging_utils import get_logger
 from src.unitxt.metrics import (
+    NER,
     Accuracy,
     BinaryAccuracy,
     BinaryMaxAccuracy,
@@ -718,6 +719,41 @@ def test_normalized_sacrebleu(self):
         global_target = 1.0
         self.assertAlmostEqual(global_target, outputs[0]["score"]["global"]["score"])
 
+    def test_ner(self):
+        metric = NER()
+        predictions = [
+            [
+                ("Dalia", "Person"),
+                ("Ramat-Gan", "Location"),
+                ("IBM", "Org"),
+            ]
+        ]
+        references = [
+            [
+                [
+                    ("Dalia", "Person"),
+                    ("Givataaim", "Location"),
+                ]
+            ]
+        ]
+        outputs = apply_metric(
+            metric=metric, predictions=predictions, references=references
+        )
+        global_target = 1.0
+        self.assertAlmostEqual(
+            global_target, outputs[0]["score"]["global"]["f1_Person"]
+        )
+        global_target = 0.0
+        self.assertAlmostEqual(
+            global_target, outputs[0]["score"]["global"]["f1_Location"]
+        )
+        metric.report_per_group_scores = False
+        outputs = apply_metric(
+            metric=metric, predictions=predictions, references=references
+        )
+        self.assertTrue("f1_Person" not in outputs[0]["score"]["global"])
+        self.assertTrue("f1_Location" not in outputs[0]["score"]["global"])
+
     def test_llama_index_correctness(self):
         metric = LlamaIndexCorrectness(model_name="mock")
         predictions = ["1976"]
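Usage sketch (illustrative, not part of the patch): per-group score reporting can be turned off either by subclassing, as prepare/metrics/custom_f1.py does above, or by setting the new report_per_group_scores flag to False on a metric instance, as the new test_ner test does. The import path below is an assumption for illustration; the test file in this patch imports from src.unitxt.metrics instead.

    from unitxt.metrics import NER  # assumed install path; the test above uses src.unitxt.metrics

    # Option 1: subclass and override the class attribute, as in the prepare script
    class NERWithoutClassReporting(NER):
        report_per_group_scores = False

    # Option 2: toggle the flag on an existing instance, as in test_ner
    metric = NER()
    metric.report_per_group_scores = False

    # Either way, the global result keeps f1_micro, f1_macro, precision/recall and
    # in_classes_support, but omits per-group keys such as "f1_Person" or "f1_Location".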