From 1ee2ef43c19aacd08a3945e98071ebca0be30841 Mon Sep 17 00:00:00 2001 From: JulesBelveze Date: Thu, 20 Jul 2023 15:14:52 +0200 Subject: [PATCH 1/4] chore(NEROutput): getitem by word --- .pre-commit-config.yaml | 2 +- langtest/utils/custom_types/output.py | 36 ++++++++++++--------------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bb964cfbe..4a1bd4842 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: hooks: - id: pydocstyle args: [ - "--add-ignore=D100,D104,D105,D400,D415", + "--add-ignore=D100,D104,D105,D400,D415,D419", "--add-select=D417", "--convention=google" ] diff --git a/langtest/utils/custom_types/output.py b/langtest/utils/custom_types/output.py index be1affc8f..26db98f25 100644 --- a/langtest/utils/custom_types/output.py +++ b/langtest/utils/custom_types/output.py @@ -5,9 +5,7 @@ class SequenceClassificationOutput(BaseModel): - """ - Output model for text classification tasks. - """ + """Output model for text classification tasks.""" predictions: List[SequenceLabel] @@ -68,9 +66,7 @@ def __str__(self) -> str: class NEROutput(BaseModel): - """ - Output model for NER tasks. - """ + """Output model for NER tasks.""" predictions: List[NERPrediction] @@ -84,11 +80,16 @@ def __len__(self): return len(self.predictions) def __getitem__( - self, item: Union[Span, int] + self, item: Union[Span, int, str] ) -> Optional[Union[List[NERPrediction], NERPrediction]]: """""" if isinstance(item, int): return self.predictions[item] + elif isinstance(item, str): + for pred in self.predictions: + if pred.span.word == item: + return pred + return None elif isinstance(item, Span): for prediction in self.predictions: if prediction.span == item: @@ -98,8 +99,7 @@ def __getitem__( return [self.predictions[i] for i in range(item.start, item.stop)] def to_str_list(self) -> str: - """ - Converts predictions into a list of strings. + """Converts predictions into a list of strings. Returns: List[str]: predictions in form of a list of strings. @@ -122,28 +122,24 @@ def __eq__(self, other: "NEROutput"): class TranslationOutput(BaseModel): - """ - Output model for translation tasks. - """ + """Output model for translation tasks.""" translation_text: str # Changed from List[str] to str def to_str_list(self) -> List[str]: - """ - Returns the translation_text as a list of strings. + """Formatting helper + + Returns: + List[str]: the translation_text as a list of strings. """ return [self.translation_text] # Wrap self.translation_text in a list def __str__(self): - """ - String representation of TranslationOutput. - """ + """String representation of TranslationOutput.""" return self.translation_text # Return translation_text directly def __eq__(self, other): - """ - Equality comparison method. - """ + """Equality comparison method.""" if isinstance(other, TranslationOutput): return self.translation_text == other.translation_text if isinstance(other, list): From f835095661174940834fd68e4188973473abf4e8 Mon Sep 17 00:00:00 2001 From: JulesBelveze Date: Thu, 20 Jul 2023 15:16:08 +0200 Subject: [PATCH 2/4] fix(formatter): exportation method --- langtest/datahandler/datasource.py | 33 ++++-- langtest/datahandler/format.py | 163 +++++++++++++---------------- 2 files changed, 96 insertions(+), 100 deletions(-) diff --git a/langtest/datahandler/datasource.py b/langtest/datahandler/datasource.py index c0326df97..f090fb400 100644 --- a/langtest/datahandler/datasource.py +++ b/langtest/datahandler/datasource.py @@ -1,5 +1,6 @@ import csv import importlib +from collections import defaultdict import os import re from abc import ABC, abstractmethod @@ -373,11 +374,10 @@ def export_data(self, data: List[Sample], output_path: str): output_path (str): path to save the data to """ - temp_id = None otext = "" for i in data: - text, temp_id = Formatter.process(i, output_format="conll", temp_id=temp_id) - otext += text + text = Formatter.process(i, output_format="conll") + otext += text + "\n" with open(output_path, "wb") as fwriter: fwriter.write(bytes(otext, encoding="utf-8")) @@ -534,15 +534,26 @@ def export_data(self, data: List[Sample], output_path: str): output_path (str): path to save the data to """ - temp_id = None - otext = "" if self.task == "ner": - for i in data: - text, temp_id = Formatter.process(i, output_format="csv", temp_id=temp_id) - otext += text - - with open(output_path, "wb") as fwriter: - fwriter.write(bytes(otext, encoding="utf-8")) + final_data = defaultdict(list) + for elt in data: + tokens, labels, testcase_tokens, testcase_labels = Formatter.process( + elt, output_format="csv" + ) + final_data["text"].append(tokens) + final_data["ner"].append(labels) + final_data["testcase_text"].append(testcase_tokens) + final_data["testcase_labels"].append(testcase_labels) + + if ( + sum([len(labels) for labels in final_data["testcase_labels"]]) + * sum([len(tokens) for tokens in final_data["testcase_text"]]) + == 0 + ): + final_data.pop("testcase_text") + final_data.pop("testcase_labels") + + pd.DataFrame(data=final_data).to_csv(output_path, index=False) elif self.task == "text-classification": rows = [] diff --git a/langtest/datahandler/format.py b/langtest/datahandler/format.py index 22a2b0fb2..aae8be1d7 100644 --- a/langtest/datahandler/format.py +++ b/langtest/datahandler/format.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod -from typing import Tuple - +from typing import Tuple, List, Union +import re from ..utils.custom_types import Sample @@ -91,11 +91,10 @@ class SequenceClassificationOutputFormatter(BaseFormatter, ABC): @staticmethod def to_csv(sample: Sample) -> str: - """ - Convert a Sample object into a row for exporting. + """Convert a Sample object into a row for exporting. Args: - Sample : + Sample: Sample object to convert. Returns: @@ -118,8 +117,8 @@ class NEROutputFormatter(BaseFormatter): @staticmethod def to_csv( - sample: Sample, delimiter: str = ",", temp_id: int = None - ) -> Tuple[str, int]: + sample: Sample, delimiter: str = "," + ) -> Tuple[List[str], List[str], List[str], List[str]]: """Converts a custom type to a CSV string. Args: @@ -127,106 +126,92 @@ def to_csv( The input sample containing the `NEROutput` object to convert. delimiter (str): The delimiter character to use in the CSV string. - temp_id (int): - A temporary ID to use for grouping entities by document. Returns: - Tuple[str, int]: - The CSV or CoNLL string representation of the `NEROutput` object along with the document id + Tuple[List[str], List[str], List[str], List[str]]: + tuple containing the list of tokens of the original sentence, the list of + labels of the original sentence, the list of tokens for the perturbed sentence + and the labels of the perturbed sentence. """ - text = "" test_case = sample.test_case original = sample.original + + words = re.finditer(r"([^\s]+)", original) + tokens, labels = [], [] + + for word in words: + tokens.append(word.group()) + match = sample.expected_results[word.group()] + labels.append(match.entity if match is not None else "O") + + assert len([label for label in labels if label != "O"]) == len( + sample.expected_results + ) + if test_case: - test_case_items = test_case.split() - norm_test_case_items = test_case.lower().split() - norm_original_items = original.lower().split() - temp_len = 0 - for jdx, item in enumerate(norm_test_case_items): - if item in norm_original_items and jdx >= norm_original_items.index(item): - oitem_index = norm_original_items.index(item) - j = sample.expected_results.predictions[oitem_index + temp_len] - if temp_id != j.doc_id and jdx == 0: - text += f"{j.doc_name}\n\n" - temp_id = j.doc_id - text += f"{test_case_items[jdx]}{delimiter}{j.pos_tag}{delimiter}{j.chunk_tag}{delimiter}{j.entity}\n" - norm_original_items.pop(oitem_index) - temp_len += 1 - else: - o_item = norm_original_items[jdx - temp_len] - letters_count = len(set(o_item) - set(item)) - if len(norm_test_case_items) == len( - norm_original_items - ) or letters_count < len(o_item): - tl = sample.expected_results.predictions[jdx] - text += f"{test_case_items[jdx]}{delimiter}{tl.pos_tag}{delimiter}{tl.chunk_tag}{delimiter}{tl.entity}\n" - else: - text += f"{test_case_items[jdx]}{delimiter}O{delimiter}O{delimiter}O\n" - text += "\n" + test_case_words = re.finditer(r"([^\s]+)", test_case) + test_case_tokens, test_case_labels = [], [] - else: - for j in sample.expected_results.predictions: - if temp_id != j.doc_id: - text += f"{j.doc_name}\n\n" - temp_id = j.doc_id - text += f"{j.span.word}{delimiter}{j.pos_tag}{delimiter}{j.chunk_tag}{delimiter}{j.entity}\n" - text += "\n" - return text, temp_id + for word in test_case_words: + test_case_tokens.append(word.group()) + match = sample.actual_results[word.group()] + test_case_labels.append(match.entity if match is not None else "O") + + assert len([token for token in test_case_tokens if token != "O"]) == len( + sample.actual_results + ) + return tokens, labels, test_case_tokens, test_case_labels + return tokens, labels, [], [] @staticmethod - def to_conll(sample: Sample, temp_id: int = None) -> Tuple[str, int]: + def to_conll( + sample: Sample, writing_mode: str = "ignore" + ) -> Union[str, Tuple[str, str]]: """Converts a custom type to a CoNLL string. Args: sample (Sample): The input sample containing the `NEROutput` object to convert. - temp_id (int): - A temporary ID to use for grouping entities by document. + writing_mode (str): + what to do with the expected results if present: + - ignore: simply ignores the expected_results + - append: the formatted expected_results to the original ones + - separate: returns a formatted string for the original sentence and one for + the perturbed sentence Returns: The CoNLL string representation of the custom type. """ - text = "" + assert writing_mode in [ + "ignore", + "append", + "separate", + ], f"writing_mode: {writing_mode} not supported." + + text, text_perturbed = "", "" test_case = sample.test_case original = sample.original - if test_case: - test_case_items = test_case.split() - norm_test_case_items = test_case.lower().split() - norm_original_items = original.lower().split() - temp_len = 0 - for jdx, item in enumerate(norm_test_case_items): - try: - if item in norm_original_items and jdx >= norm_original_items.index( - item - ): - oitem_index = norm_original_items.index(item) - j = sample.expected_results.predictions[oitem_index + temp_len] - if temp_id != j.doc_id and jdx == 0: - text += f"{j.doc_name}\n\n" - temp_id = j.doc_id - text += f"{test_case_items[jdx]} {j.pos_tag} {j.chunk_tag} {j.entity}\n" - norm_original_items.pop(oitem_index) - temp_len += 1 - else: - o_item = sample.expected_results.predictions[jdx].span.word - letters_count = len(set(item) - set(o_item)) - if ( - len(norm_test_case_items) == len(original.lower().split()) - or letters_count < 2 - ): - tl = sample.expected_results.predictions[jdx] - text += f"{test_case_items[jdx]} {tl.pos_tag} {tl.chunk_tag} {tl.entity}\n" - else: - text += f"{test_case_items[jdx]} O O O\n" - except IndexError: - text += f"{test_case_items[jdx]} O O O\n" - text += "\n" - else: - for j in sample.expected_results.predictions: - if temp_id != j.doc_id: - text += f"{j.doc_name}\n\n" - temp_id = j.doc_id - text += f"{j.span.word} {j.pos_tag} {j.chunk_tag} {j.entity}\n" - text += "\n" - return text, temp_id + words = re.finditer(r"([^\s]+)", original) + + for word in words: + token = word.group() + match = sample.expected_results[word.group()] + label = match.entity if match is not None else "O" + text += f"{token} X X {label}\n" + + if test_case and writing_mode != "ignore": + words = re.finditer(r"([^\s]+)", test_case) + + for word in words: + token = word.group() + match = sample.actual_results[word.group()] + label = match.entity if match is not None else "O" + if writing_mode == "append": + text += f"{token} X X {label}\n" + elif writing_mode == "separate": + text_perturbed += f"{token} X X {label}\n" + + if writing_mode == "separate": + return text, text_perturbed + return text From 7fd6c8901d0e099591916989b574478fab8d78bc Mon Sep 17 00:00:00 2001 From: JulesBelveze Date: Thu, 20 Jul 2023 15:16:32 +0200 Subject: [PATCH 3/4] tests(datasource): add tests for NER exportation --- tests/test_datasource.py | 104 ++++++++++++++++++++++++++++++++------- 1 file changed, 87 insertions(+), 17 deletions(-) diff --git a/tests/test_datasource.py b/tests/test_datasource.py index fbadc2090..3083a8db3 100644 --- a/tests/test_datasource.py +++ b/tests/test_datasource.py @@ -1,4 +1,5 @@ import pytest +import pandas as pd from langtest.datahandler.datasource import ( CSVDataset, @@ -6,7 +7,12 @@ HuggingFaceDataset, JSONLDataset, ) -from langtest.utils.custom_types.output import NEROutput, SequenceClassificationOutput +from langtest.utils.custom_types.output import ( + NEROutput, + SequenceClassificationOutput, + NERPrediction, + Span, +) from langtest.utils.custom_types.sample import ( NERSample, QASample, @@ -17,24 +23,34 @@ ) -@pytest.mark.parametrize( - "dataset,feature_col,target_col", - [ - ( - CSVDataset(file_path="tests/fixtures/tner.csv", task="ner"), - "tokens", - "ner_tags", - ), - ( - ConllDataset(file_path="tests/fixtures/test.conll", task="ner"), - "text", - "labels", - ), - ], -) class TestNERDataset: """Test cases for ner datasets""" + sample = NERSample( + original="I do love KFC", + test_type="add_context", + expected_results=NEROutput( + predictions=[ + NERPrediction(entity="PROD", span=Span(start=10, end=13, word="KFC")) + ] + ), + ) + + @pytest.mark.parametrize( + "dataset,feature_col,target_col", + [ + ( + CSVDataset(file_path="tests/fixtures/tner.csv", task="ner"), + "tokens", + "ner_tags", + ), + ( + ConllDataset(file_path="tests/fixtures/test.conll", task="ner"), + "text", + "labels", + ), + ], + ) def test_load_raw_data(self, dataset, feature_col, target_col): """""" raw_data = dataset.load_raw_data() @@ -52,7 +68,14 @@ def test_load_raw_data(self, dataset, feature_col, target_col): for label in sample[target_col]: assert isinstance(label, str) - def test_load_data(self, dataset, feature_col, target_col): + @pytest.mark.parametrize( + "dataset", + [ + CSVDataset(file_path="tests/fixtures/tner.csv", task="ner"), + ConllDataset(file_path="tests/fixtures/test.conll", task="ner"), + ], + ) + def test_load_data(self, dataset): """""" samples = dataset.load_data() @@ -62,6 +85,53 @@ def test_load_data(self, dataset, feature_col, target_col): assert isinstance(sample, NERSample) assert isinstance(sample.expected_results, NEROutput) + def test_export_data_csv(self): + """""" + dataset = CSVDataset(file_path="tests/fixtures/tner.csv", task="ner") + dataset.export_data( + data=[self.sample, self.sample], output_path="/tmp/exported_sample.csv" + ) + + df = pd.read_csv("/tmp/exported_sample.csv") + saved_sample = df.text[0] + + assert isinstance(saved_sample, str) + assert " ".join(eval(saved_sample)) == self.sample.original + + def test_export_data_conll(self): + """""" + dataset = ConllDataset(file_path="tests/fixtures/test.conll", task="ner") + dataset.export_data( + data=[self.sample, self.sample], output_path="/tmp/exported_sample.conll" + ) + + all_tokens, all_labels = [], [] + tokens, labels = [], [] + with open("/tmp/exported_sample.conll", "r") as reader: + content = reader.read() + + for line in content.strip().split("\n"): + row = line.strip().split() + if len(row) == 0: + if len(tokens) > 0: + all_tokens.append(tokens) + all_labels.append(labels) + tokens = [] + labels = [] + continue + tokens.append(row[0]) + labels.append(row[-1]) + + if len(tokens) != 0: + all_tokens.append(tokens) + all_labels.append(labels) + + assert len(all_tokens) == len(all_labels) == 2 + assert " ".join(all_tokens[0]) == self.sample.original + + # assert isinstance(saved_sample, str) + # assert " ".join(eval(saved_sample)) == self.sample.original + @pytest.mark.parametrize( "dataset,feature_col,target_col", From 34200c0af5f4b7653418d3b58078751f9866ed1e Mon Sep 17 00:00:00 2001 From: JulesBelveze Date: Thu, 20 Jul 2023 17:04:38 +0200 Subject: [PATCH 4/4] fix(tests): templatic augmentation --- langtest/datahandler/format.py | 6 +++--- tests/test_augmentation.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/langtest/datahandler/format.py b/langtest/datahandler/format.py index aae8be1d7..cafeef602 100644 --- a/langtest/datahandler/format.py +++ b/langtest/datahandler/format.py @@ -198,7 +198,7 @@ def to_conll( token = word.group() match = sample.expected_results[word.group()] label = match.entity if match is not None else "O" - text += f"{token} X X {label}\n" + text += f"{token} -X- -X- {label}\n" if test_case and writing_mode != "ignore": words = re.finditer(r"([^\s]+)", test_case) @@ -208,9 +208,9 @@ def to_conll( match = sample.actual_results[word.group()] label = match.entity if match is not None else "O" if writing_mode == "append": - text += f"{token} X X {label}\n" + text += f"{token} -X- -X- {label}\n" elif writing_mode == "separate": - text_perturbed += f"{token} X X {label}\n" + text_perturbed += f"{token} -X- -X- {label}\n" if writing_mode == "separate": return text, text_perturbed diff --git a/tests/test_augmentation.py b/tests/test_augmentation.py index 3727c9bd4..436a8c6eb 100644 --- a/tests/test_augmentation.py +++ b/tests/test_augmentation.py @@ -221,16 +221,16 @@ def test_fix(self): "My -X- -X- O", "name -X- -X- O", "is -X- -X- O", - "Jean NN NN B-PER", - "- NN NN I-PER", - "Pierre NN NN I-PER", + "Jean -X- -X- B-PER", + "- -X- -X- I-PER", + "Pierre -X- -X- I-PER", "and -X- -X- O", "I -X- -X- O", "am -X- -X- O", "from -X- -X- O", - "New NN NN B-LOC", - "York NN NN I-LOC", - "City NN NN I-LOC", + "New -X- -X- B-LOC", + "York -X- -X- I-LOC", + "City -X- -X- I-LOC", ] generator = TemplaticAugment( templates=["My name is {PER} and I am from {LOC}"], task="ner"