From 1ee2ef43c19aacd08a3945e98071ebca0be30841 Mon Sep 17 00:00:00 2001
From: JulesBelveze <jules.belveze@hotmail.fr>
Date: Thu, 20 Jul 2023 15:14:52 +0200
Subject: [PATCH 1/4] chore(NEROutput): getitem by word

---
 .pre-commit-config.yaml               |  2 +-
 langtest/utils/custom_types/output.py | 36 ++++++++++++---------------
 2 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index bb964cfbe..4a1bd4842 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -27,7 +27,7 @@ repos:
     hooks:
       - id: pydocstyle
         args: [
-            "--add-ignore=D100,D104,D105,D400,D415",
+            "--add-ignore=D100,D104,D105,D400,D415,D419",
             "--add-select=D417",
             "--convention=google"
         ]
diff --git a/langtest/utils/custom_types/output.py b/langtest/utils/custom_types/output.py
index be1affc8f..26db98f25 100644
--- a/langtest/utils/custom_types/output.py
+++ b/langtest/utils/custom_types/output.py
@@ -5,9 +5,7 @@
 
 
 class SequenceClassificationOutput(BaseModel):
-    """
-    Output model for text classification tasks.
-    """
+    """Output model for text classification tasks."""
 
     predictions: List[SequenceLabel]
 
@@ -68,9 +66,7 @@ def __str__(self) -> str:
 
 
 class NEROutput(BaseModel):
-    """
-    Output model for NER tasks.
-    """
+    """Output model for NER tasks."""
 
     predictions: List[NERPrediction]
 
@@ -84,11 +80,16 @@ def __len__(self):
         return len(self.predictions)
 
     def __getitem__(
-        self, item: Union[Span, int]
+        self, item: Union[Span, int, str]
     ) -> Optional[Union[List[NERPrediction], NERPrediction]]:
         """"""
         if isinstance(item, int):
             return self.predictions[item]
+        elif isinstance(item, str):
+            for pred in self.predictions:
+                if pred.span.word == item:
+                    return pred
+            return None
         elif isinstance(item, Span):
             for prediction in self.predictions:
                 if prediction.span == item:
@@ -98,8 +99,7 @@ def __getitem__(
             return [self.predictions[i] for i in range(item.start, item.stop)]
 
     def to_str_list(self) -> str:
-        """
-        Converts predictions into a list of strings.
+        """Converts predictions into a list of strings.
 
         Returns:
             List[str]: predictions in form of a list of strings.
@@ -122,28 +122,24 @@ def __eq__(self, other: "NEROutput"):
 
 
 class TranslationOutput(BaseModel):
-    """
-    Output model for translation tasks.
-    """
+    """Output model for translation tasks."""
 
     translation_text: str  # Changed from List[str] to str
 
     def to_str_list(self) -> List[str]:
-        """
-        Returns the translation_text as a list of strings.
+        """Return the translation wrapped in a single-element list.
+
+        Returns:
+            List[str]: the translation_text as a list of strings.
         """
         return [self.translation_text]  # Wrap self.translation_text in a list
 
     def __str__(self):
-        """
-        String representation of TranslationOutput.
-        """
+        """String representation of TranslationOutput."""
         return self.translation_text  # Return translation_text directly
 
     def __eq__(self, other):
-        """
-        Equality comparison method.
-        """
+        """Equality comparison method."""
         if isinstance(other, TranslationOutput):
             return self.translation_text == other.translation_text
         if isinstance(other, list):

From f835095661174940834fd68e4188973473abf4e8 Mon Sep 17 00:00:00 2001
From: JulesBelveze <jules.belveze@hotmail.fr>
Date: Thu, 20 Jul 2023 15:16:08 +0200
Subject: [PATCH 2/4] fix(formatter): exportation method

---
 langtest/datahandler/datasource.py |  33 ++++--
 langtest/datahandler/format.py     | 163 +++++++++++++----------------
 2 files changed, 96 insertions(+), 100 deletions(-)

diff --git a/langtest/datahandler/datasource.py b/langtest/datahandler/datasource.py
index c0326df97..f090fb400 100644
--- a/langtest/datahandler/datasource.py
+++ b/langtest/datahandler/datasource.py
@@ -1,5 +1,6 @@
 import csv
 import importlib
+from collections import defaultdict
 import os
 import re
 from abc import ABC, abstractmethod
@@ -373,11 +374,10 @@ def export_data(self, data: List[Sample], output_path: str):
             output_path (str):
                 path to save the data to
         """
-        temp_id = None
         otext = ""
         for i in data:
-            text, temp_id = Formatter.process(i, output_format="conll", temp_id=temp_id)
-            otext += text
+            text = Formatter.process(i, output_format="conll")
+            otext += text + "\n"
 
         with open(output_path, "wb") as fwriter:
             fwriter.write(bytes(otext, encoding="utf-8"))
@@ -534,15 +534,26 @@ def export_data(self, data: List[Sample], output_path: str):
             output_path (str):
                 path to save the data to
         """
-        temp_id = None
-        otext = ""
         if self.task == "ner":
-            for i in data:
-                text, temp_id = Formatter.process(i, output_format="csv", temp_id=temp_id)
-                otext += text
-
-            with open(output_path, "wb") as fwriter:
-                fwriter.write(bytes(otext, encoding="utf-8"))
+            final_data = defaultdict(list)
+            for elt in data:
+                tokens, labels, testcase_tokens, testcase_labels = Formatter.process(
+                    elt, output_format="csv"
+                )
+                final_data["text"].append(tokens)
+                final_data["ner"].append(labels)
+                final_data["testcase_text"].append(testcase_tokens)
+                final_data["testcase_labels"].append(testcase_labels)
+
+            if (
+                sum([len(labels) for labels in final_data["testcase_labels"]])
+                * sum([len(tokens) for tokens in final_data["testcase_text"]])
+                == 0
+            ):
+                final_data.pop("testcase_text")
+                final_data.pop("testcase_labels")
+
+            pd.DataFrame(data=final_data).to_csv(output_path, index=False)
 
         elif self.task == "text-classification":
             rows = []
diff --git a/langtest/datahandler/format.py b/langtest/datahandler/format.py
index 22a2b0fb2..aae8be1d7 100644
--- a/langtest/datahandler/format.py
+++ b/langtest/datahandler/format.py
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
-from typing import Tuple
-
+from typing import Tuple, List, Union
+import re
 from ..utils.custom_types import Sample
 
 
@@ -91,11 +91,10 @@ class SequenceClassificationOutputFormatter(BaseFormatter, ABC):
 
     @staticmethod
     def to_csv(sample: Sample) -> str:
-        """
-        Convert a Sample object into a row for exporting.
+        """Convert a Sample object into a row for exporting.
 
         Args:
-            Sample :
+            Sample:
                 Sample object to convert.
 
         Returns:
@@ -118,8 +117,8 @@ class NEROutputFormatter(BaseFormatter):
 
     @staticmethod
     def to_csv(
-        sample: Sample, delimiter: str = ",", temp_id: int = None
-    ) -> Tuple[str, int]:
+        sample: Sample, delimiter: str = ","
+    ) -> Tuple[List[str], List[str], List[str], List[str]]:
         """Converts a custom type to a CSV string.
 
         Args:
@@ -127,106 +126,92 @@ def to_csv(
                 The input sample containing the `NEROutput` object to convert.
             delimiter (str):
                 The delimiter character to use in the CSV string.
-            temp_id (int):
-                A temporary ID to use for grouping entities by document.
 
         Returns:
-            Tuple[str, int]:
-                The CSV or CoNLL string representation of the `NEROutput` object along with the document id
+            Tuple[List[str], List[str], List[str], List[str]]:
+                tuple containing the list of tokens of the original sentence, the list of
+                labels of the original sentence, the list of tokens for the perturbed sentence
+                and the labels of the perturbed sentence.
         """
-        text = ""
         test_case = sample.test_case
         original = sample.original
+
+        words = re.finditer(r"([^\s]+)", original)
+        tokens, labels = [], []
+
+        for word in words:
+            tokens.append(word.group())
+            match = sample.expected_results[word.group()]
+            labels.append(match.entity if match is not None else "O")
+
+        assert len([label for label in labels if label != "O"]) == len(
+            sample.expected_results
+        )
+
         if test_case:
-            test_case_items = test_case.split()
-            norm_test_case_items = test_case.lower().split()
-            norm_original_items = original.lower().split()
-            temp_len = 0
-            for jdx, item in enumerate(norm_test_case_items):
-                if item in norm_original_items and jdx >= norm_original_items.index(item):
-                    oitem_index = norm_original_items.index(item)
-                    j = sample.expected_results.predictions[oitem_index + temp_len]
-                    if temp_id != j.doc_id and jdx == 0:
-                        text += f"{j.doc_name}\n\n"
-                        temp_id = j.doc_id
-                    text += f"{test_case_items[jdx]}{delimiter}{j.pos_tag}{delimiter}{j.chunk_tag}{delimiter}{j.entity}\n"
-                    norm_original_items.pop(oitem_index)
-                    temp_len += 1
-                else:
-                    o_item = norm_original_items[jdx - temp_len]
-                    letters_count = len(set(o_item) - set(item))
-                    if len(norm_test_case_items) == len(
-                        norm_original_items
-                    ) or letters_count < len(o_item):
-                        tl = sample.expected_results.predictions[jdx]
-                        text += f"{test_case_items[jdx]}{delimiter}{tl.pos_tag}{delimiter}{tl.chunk_tag}{delimiter}{tl.entity}\n"
-                    else:
-                        text += f"{test_case_items[jdx]}{delimiter}O{delimiter}O{delimiter}O\n"
-            text += "\n"
+            test_case_words = re.finditer(r"([^\s]+)", test_case)
+            test_case_tokens, test_case_labels = [], []
 
-        else:
-            for j in sample.expected_results.predictions:
-                if temp_id != j.doc_id:
-                    text += f"{j.doc_name}\n\n"
-                    temp_id = j.doc_id
-                text += f"{j.span.word}{delimiter}{j.pos_tag}{delimiter}{j.chunk_tag}{delimiter}{j.entity}\n"
-            text += "\n"
-        return text, temp_id
+            for word in test_case_words:
+                test_case_tokens.append(word.group())
+                match = sample.actual_results[word.group()]
+                test_case_labels.append(match.entity if match is not None else "O")
+
+            # sanity check: as many non-"O" labels as predictions on the test case
+            n_entities = sum(label != "O" for label in test_case_labels)
+            assert n_entities == len(sample.actual_results)
+            return tokens, labels, test_case_tokens, test_case_labels
+        return tokens, labels, [], []
 
     @staticmethod
-    def to_conll(sample: Sample, temp_id: int = None) -> Tuple[str, int]:
+    def to_conll(
+        sample: Sample, writing_mode: str = "ignore"
+    ) -> Union[str, Tuple[str, str]]:
         """Converts a custom type to a CoNLL string.
 
         Args:
             sample (Sample):
                 The input sample containing the `NEROutput` object to convert.
-            temp_id (int):
-                A temporary ID to use for grouping entities by document.
+            writing_mode (str):
+                what to do with the test case and its actual_results if present:
+                - ignore: only formats the original sentence
+                - append: appends the formatted test case rows to the original ones
+                - separate: returns a formatted string for the original sentence and one for
+                            the perturbed sentence
 
         Returns:
             The CoNLL string representation of the custom type.
         """
-        text = ""
+        assert writing_mode in [
+            "ignore",
+            "append",
+            "separate",
+        ], f"writing_mode: {writing_mode} not supported."
+
+        text, text_perturbed = "", ""
         test_case = sample.test_case
         original = sample.original
-        if test_case:
-            test_case_items = test_case.split()
-            norm_test_case_items = test_case.lower().split()
-            norm_original_items = original.lower().split()
-            temp_len = 0
-            for jdx, item in enumerate(norm_test_case_items):
-                try:
-                    if item in norm_original_items and jdx >= norm_original_items.index(
-                        item
-                    ):
-                        oitem_index = norm_original_items.index(item)
-                        j = sample.expected_results.predictions[oitem_index + temp_len]
-                        if temp_id != j.doc_id and jdx == 0:
-                            text += f"{j.doc_name}\n\n"
-                            temp_id = j.doc_id
-                        text += f"{test_case_items[jdx]} {j.pos_tag} {j.chunk_tag} {j.entity}\n"
-                        norm_original_items.pop(oitem_index)
-                        temp_len += 1
-                    else:
-                        o_item = sample.expected_results.predictions[jdx].span.word
-                        letters_count = len(set(item) - set(o_item))
-                        if (
-                            len(norm_test_case_items) == len(original.lower().split())
-                            or letters_count < 2
-                        ):
-                            tl = sample.expected_results.predictions[jdx]
-                            text += f"{test_case_items[jdx]} {tl.pos_tag} {tl.chunk_tag} {tl.entity}\n"
-                        else:
-                            text += f"{test_case_items[jdx]} O O O\n"
-                except IndexError:
-                    text += f"{test_case_items[jdx]} O O O\n"
-            text += "\n"
 
-        else:
-            for j in sample.expected_results.predictions:
-                if temp_id != j.doc_id:
-                    text += f"{j.doc_name}\n\n"
-                    temp_id = j.doc_id
-                text += f"{j.span.word} {j.pos_tag} {j.chunk_tag} {j.entity}\n"
-            text += "\n"
-        return text, temp_id
+        words = re.finditer(r"([^\s]+)", original)
+
+        for word in words:
+            token = word.group()
+            match = sample.expected_results[word.group()]
+            label = match.entity if match is not None else "O"
+            text += f"{token} X X {label}\n"
+
+        if test_case and writing_mode != "ignore":
+            words = re.finditer(r"([^\s]+)", test_case)
+
+            for word in words:
+                token = word.group()
+                match = sample.actual_results[word.group()]
+                label = match.entity if match is not None else "O"
+                if writing_mode == "append":
+                    text += f"{token} X X {label}\n"
+                elif writing_mode == "separate":
+                    text_perturbed += f"{token} X X {label}\n"
+
+        if writing_mode == "separate":
+            return text, text_perturbed
+        return text

From 7fd6c8901d0e099591916989b574478fab8d78bc Mon Sep 17 00:00:00 2001
From: JulesBelveze <jules.belveze@hotmail.fr>
Date: Thu, 20 Jul 2023 15:16:32 +0200
Subject: [PATCH 3/4] tests(datasource): add tests for NER exportation

---
 tests/test_datasource.py | 104 ++++++++++++++++++++++++++++++++-------
 1 file changed, 87 insertions(+), 17 deletions(-)

diff --git a/tests/test_datasource.py b/tests/test_datasource.py
index fbadc2090..3083a8db3 100644
--- a/tests/test_datasource.py
+++ b/tests/test_datasource.py
@@ -1,4 +1,5 @@
 import pytest
+import pandas as pd
 
 from langtest.datahandler.datasource import (
     CSVDataset,
@@ -6,7 +7,12 @@
     HuggingFaceDataset,
     JSONLDataset,
 )
-from langtest.utils.custom_types.output import NEROutput, SequenceClassificationOutput
+from langtest.utils.custom_types.output import (
+    NEROutput,
+    SequenceClassificationOutput,
+    NERPrediction,
+    Span,
+)
 from langtest.utils.custom_types.sample import (
     NERSample,
     QASample,
@@ -17,24 +23,34 @@
 )
 
 
-@pytest.mark.parametrize(
-    "dataset,feature_col,target_col",
-    [
-        (
-            CSVDataset(file_path="tests/fixtures/tner.csv", task="ner"),
-            "tokens",
-            "ner_tags",
-        ),
-        (
-            ConllDataset(file_path="tests/fixtures/test.conll", task="ner"),
-            "text",
-            "labels",
-        ),
-    ],
-)
 class TestNERDataset:
     """Test cases for ner datasets"""
 
+    sample = NERSample(
+        original="I do love KFC",
+        test_type="add_context",
+        expected_results=NEROutput(
+            predictions=[
+                NERPrediction(entity="PROD", span=Span(start=10, end=13, word="KFC"))
+            ]
+        ),
+    )
+
+    @pytest.mark.parametrize(
+        "dataset,feature_col,target_col",
+        [
+            (
+                CSVDataset(file_path="tests/fixtures/tner.csv", task="ner"),
+                "tokens",
+                "ner_tags",
+            ),
+            (
+                ConllDataset(file_path="tests/fixtures/test.conll", task="ner"),
+                "text",
+                "labels",
+            ),
+        ],
+    )
     def test_load_raw_data(self, dataset, feature_col, target_col):
         """"""
         raw_data = dataset.load_raw_data()
@@ -52,7 +68,14 @@ def test_load_raw_data(self, dataset, feature_col, target_col):
             for label in sample[target_col]:
                 assert isinstance(label, str)
 
-    def test_load_data(self, dataset, feature_col, target_col):
+    @pytest.mark.parametrize(
+        "dataset",
+        [
+            CSVDataset(file_path="tests/fixtures/tner.csv", task="ner"),
+            ConllDataset(file_path="tests/fixtures/test.conll", task="ner"),
+        ],
+    )
+    def test_load_data(self, dataset):
         """"""
         samples = dataset.load_data()
 
@@ -62,6 +85,53 @@ def test_load_data(self, dataset, feature_col, target_col):
             assert isinstance(sample, NERSample)
             assert isinstance(sample.expected_results, NEROutput)
 
+    def test_export_data_csv(self):
+        """"""
+        dataset = CSVDataset(file_path="tests/fixtures/tner.csv", task="ner")
+        dataset.export_data(
+            data=[self.sample, self.sample], output_path="/tmp/exported_sample.csv"
+        )
+
+        df = pd.read_csv("/tmp/exported_sample.csv")
+        saved_sample = df.text[0]
+
+        assert isinstance(saved_sample, str)
+        assert " ".join(eval(saved_sample)) == self.sample.original
+
+    def test_export_data_conll(self):
+        """"""
+        dataset = ConllDataset(file_path="tests/fixtures/test.conll", task="ner")
+        dataset.export_data(
+            data=[self.sample, self.sample], output_path="/tmp/exported_sample.conll"
+        )
+
+        all_tokens, all_labels = [], []
+        tokens, labels = [], []
+        with open("/tmp/exported_sample.conll", "r") as reader:
+            content = reader.read()
+
+            for line in content.strip().split("\n"):
+                row = line.strip().split()
+                if len(row) == 0:
+                    if len(tokens) > 0:
+                        all_tokens.append(tokens)
+                        all_labels.append(labels)
+                        tokens = []
+                        labels = []
+                    continue
+                tokens.append(row[0])
+                labels.append(row[-1])
+
+            if len(tokens) != 0:
+                all_tokens.append(tokens)
+                all_labels.append(labels)
+
+        assert len(all_tokens) == len(all_labels) == 2
+        assert " ".join(all_tokens[0]) == self.sample.original
+
+        assert " ".join(all_tokens[1]) == self.sample.original
+        assert all_labels[0] == all_labels[1] == ["O", "O", "O", "PROD"]
+
 
 @pytest.mark.parametrize(
     "dataset,feature_col,target_col",

From 34200c0af5f4b7653418d3b58078751f9866ed1e Mon Sep 17 00:00:00 2001
From: JulesBelveze <jules.belveze@hotmail.fr>
Date: Thu, 20 Jul 2023 17:04:38 +0200
Subject: [PATCH 4/4] fix(tests): templatic augmentation

---
 langtest/datahandler/format.py |  6 +++---
 tests/test_augmentation.py     | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/langtest/datahandler/format.py b/langtest/datahandler/format.py
index aae8be1d7..cafeef602 100644
--- a/langtest/datahandler/format.py
+++ b/langtest/datahandler/format.py
@@ -198,7 +198,7 @@ def to_conll(
             token = word.group()
             match = sample.expected_results[word.group()]
             label = match.entity if match is not None else "O"
-            text += f"{token} X X {label}\n"
+            text += f"{token} -X- -X- {label}\n"
 
         if test_case and writing_mode != "ignore":
             words = re.finditer(r"([^\s]+)", test_case)
@@ -208,9 +208,9 @@ def to_conll(
                 match = sample.actual_results[word.group()]
                 label = match.entity if match is not None else "O"
                 if writing_mode == "append":
-                    text += f"{token} X X {label}\n"
+                    text += f"{token} -X- -X- {label}\n"
                 elif writing_mode == "separate":
-                    text_perturbed += f"{token} X X {label}\n"
+                    text_perturbed += f"{token} -X- -X- {label}\n"
 
         if writing_mode == "separate":
             return text, text_perturbed
diff --git a/tests/test_augmentation.py b/tests/test_augmentation.py
index 3727c9bd4..436a8c6eb 100644
--- a/tests/test_augmentation.py
+++ b/tests/test_augmentation.py
@@ -221,16 +221,16 @@ def test_fix(self):
             "My -X- -X- O",
             "name -X- -X- O",
             "is -X- -X- O",
-            "Jean NN NN B-PER",
-            "- NN NN I-PER",
-            "Pierre NN NN I-PER",
+            "Jean -X- -X- B-PER",
+            "- -X- -X- I-PER",
+            "Pierre -X- -X- I-PER",
             "and -X- -X- O",
             "I -X- -X- O",
             "am -X- -X- O",
             "from -X- -X- O",
-            "New NN NN B-LOC",
-            "York NN NN I-LOC",
-            "City NN NN I-LOC",
+            "New -X- -X- B-LOC",
+            "York -X- -X- I-LOC",
+            "City -X- -X- I-LOC",
         ]
         generator = TemplaticAugment(
             templates=["My name is {PER} and I am from {LOC}"], task="ner"