From ccec57cf67f575f97751b7d87d36a273825b9606 Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen
Date: Thu, 25 Jan 2024 16:11:36 +0100
Subject: [PATCH 1/7] feat: Added danfever

---
 src/seb/registered_tasks/danish.py         | 10 +++
 src/seb/registered_tasks/mteb_retrieval.py | 80 ++++++++++++++++++++++
 tests/test_tasks.py                        |  9 +--
 3 files changed, 95 insertions(+), 4 deletions(-)
 create mode 100644 src/seb/registered_tasks/mteb_retrieval.py

diff --git a/src/seb/registered_tasks/danish.py b/src/seb/registered_tasks/danish.py
index a816957..3485816 100644
--- a/src/seb/registered_tasks/danish.py
+++ b/src/seb/registered_tasks/danish.py
@@ -63,3 +63,13 @@ def create_da_political_comments() -> Task:
     task.domain = ["social"]
     task.reference = "https://huggingface.co/datasets/danish_political_comments"  # TODO: Make a PR for MTEB to add this reference
     return task
+
+
+@tasks.register("DanFEVER")
+def create_dan_fever() -> Task:
+    from .mteb_retrieval import DanFever
+
+    task = MTEBTask(DanFever())
+    task.name = "DanFEVER"
+    task.domain = ["wiki", "non-fiction"]
+    return task
diff --git a/src/seb/registered_tasks/mteb_retrieval.py b/src/seb/registered_tasks/mteb_retrieval.py
new file mode 100644
index 0000000..3fb1b81
--- /dev/null
+++ b/src/seb/registered_tasks/mteb_retrieval.py
@@ -0,0 +1,80 @@
+from typing import Any
+
+import datasets
+from mteb.abstasks import AbsTaskRetrieval
+
+
+class DanFever(AbsTaskRetrieval):
+    @property
+    def description(self) -> dict[str, Any]:
+        return {
+            "name": "DanFEVER",
+            "hf_hub_name": "strombergnlp/danfever",
+            "description": "A Danish dataset intended for misinformation research. It follows the same format as the English FEVER dataset.",
+            "reference": "https://aclanthology.org/2021.nodalida-main.47/",
+            "type": "Retrieval",
+            "category": "p2p",
+            "eval_splits": ["train"],
+            "eval_langs": ["da"],
+            "main_score": "ndcg_at_10",
+            "revision": "5d01e3f6a661d48e127ab5d7e3aaa0dc8331438a",
+        }
+
+    def load_data(self, **kwargs: dict):  # noqa: ARG002
+        """
+        Load dataset from HuggingFace hub
+        """
+        if self.data_loaded:
+            return
+
+        self.dataset: datasets.DatasetDict = datasets.load_dataset(
+            self.description["hf_hub_name"],
+            revision=self.description.get("revision"),
+        )  # type: ignore
+
+        self.dataset_transform()
+        self.data_loaded = True
+
+    def dataset_transform(self) -> None:
+        """
+        and transform it into a retrieval dataset, which has the following attributes:
+
+        self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document datas like title and text
+        self.queries = Dict[query_id, str] #id => query
+        self.relevant_docs = Dict[query_id, Dict[doc_id, score]]
+        """
+        self.corpus = {}
+        self.relevant_docs = {}
+        self.queries = {}
+        text2id = {}
+
+        for split in self.dataset:
+            self.corpus[split] = {}
+            self.relevant_docs[split] = {}
+            self.queries[split] = {}
+
+            ds = self.dataset[split]
+            claims = ds["claim"]
+            evidences = ds["evidence_extract"]
+            labels = ds["label"]
+
+            for claim, evidence, label in zip(claims, evidences, labels):
+                claim_is_supported = label == "Supported"
+
+                sim = 1 if claim_is_supported else 0  # negative for refuted claims - is that what we want?
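+                # NOTE: this yields binary qrels - evidence paired with a
+                # supported claim is marked relevant (1); evidence paired with
+                # a refuted or unverifiable claim is marked non-relevant (0).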
+
+            if claim not in text2id:
+                text2id[claim] = str(len(text2id))
+            if evidence not in text2id:
+                text2id[evidence] = len(text2id)
+
+            claim_id = text2id[claim]
+            evidence_id = text2id[evidence]
+
+            self.queries[split][claim_id] = claim
+            self.corpus[split][evidence_id] = {"title": "", "text": evidence}
+
+            if claim_id not in self.relevant_docs[split]:
+                self.relevant_docs[split][claim_id] = {}
+
+            self.relevant_docs[split][claim_id][evidence_id] = sim
diff --git a/tests/test_tasks.py b/tests/test_tasks.py
index c2a5f0c..187f614 100644
--- a/tests/test_tasks.py
+++ b/tests/test_tasks.py
@@ -52,10 +52,11 @@ def test_task_result_main_score(task_result: seb.TaskResult):
     assert task_result.get_main_score(["da", "nb"]) - 0.3 < 0.0001
 
 
-@pytest.mark.skip(
-    reason="This test downloads all datasets. It takes a long time to test and often fails due to errors on HF's side.",
-)
-@pytest.mark.parametrize("task_name", all_tasks_names)
+# @pytest.mark.skip(
+#     reason="This test downloads all datasets. It takes a long time to test and often fails due to errors on HF's side.",
+# )
+# @pytest.mark.parametrize("task_name", all_tasks_names)
+@pytest.mark.parametrize("task_name", ["DanFEVER"])
 @pytest.mark.parametrize("model_name", ["sentence-transformers/all-MiniLM-L6-v2"])
 def test_all_tasks(task_name: str, model_name: str):
     task: seb.Task = seb.get_task(task_name)
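For intuition, dataset_transform above produces per-split dictionaries keyed by ids. A miniature of the intended result, with hypothetical ids and texts (patch 3 below makes the ids consistently strings):

    corpus = {"train": {"1": {"title": "", "text": "Uddrag fra Wikipedia ..."}}}
    queries = {"train": {"0": "Anders And er en dansk tegneserie."}}
    relevant_docs = {"train": {"0": {"1": 1}}}  # claim 0 -> evidence 1 with relevance 1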
From bcc123179a2ac0764f00d223b900b4840fa90da5 Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen
Date: Thu, 25 Jan 2024 16:15:25 +0100
Subject: [PATCH 2/7] style: ran linter

---
 src/seb/interfaces/mteb_task.py                  |  4 +---
 src/seb/interfaces/task.py                       |  4 +---
 src/seb/registered_models/e5_mistral.py          |  8 ++------
 src/seb/registered_models/translate_e5_models.py |  8 ++------
 src/seb/registered_tasks/speed.py                |  4 +---
 src/seb/result_dataclasses.py                    | 12 +++---------
 6 files changed, 10 insertions(+), 30 deletions(-)

diff --git a/src/seb/interfaces/mteb_task.py b/src/seb/interfaces/mteb_task.py
index 9d306bb..5a9b581 100644
--- a/src/seb/interfaces/mteb_task.py
+++ b/src/seb/interfaces/mteb_task.py
@@ -88,9 +88,7 @@ def evaluate(self, model: Encoder) -> TaskResult:
         scores = scores.get(split, scores)
         score_is_nested = isinstance(scores[next(iter(scores.keys()))], dict)
         if not score_is_nested:
-            _scores: dict[str, dict[str, Union[float, str]]] = {
-                lang: scores for lang in self.languages
-            }
+            _scores: dict[str, dict[str, Union[float, str]]] = {lang: scores for lang in self.languages}
             scores = _scores
 
         task_result = TaskResult(
diff --git a/src/seb/interfaces/task.py b/src/seb/interfaces/task.py
index eadddb6..a5de49e 100644
--- a/src/seb/interfaces/task.py
+++ b/src/seb/interfaces/task.py
@@ -25,9 +25,7 @@
     "bible",
 ]
 
-TaskType = Literal[
-    "Classification", "Retrieval", "STS", "BitextMining", "Clustering", "Speed"
-]
+TaskType = Literal["Classification", "Retrieval", "STS", "BitextMining", "Clustering", "Speed"]
 
 
 class DescriptiveDatasetStats(TypedDict):
diff --git a/src/seb/registered_models/e5_mistral.py b/src/seb/registered_models/e5_mistral.py
index 4dafe5b..b995d20 100644
--- a/src/seb/registered_models/e5_mistral.py
+++ b/src/seb/registered_models/e5_mistral.py
@@ -30,9 +30,7 @@ def __init__(self):
         self.load_model()
 
     def load_model(self):
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            "intfloat/e5-mistral-7b-instruct"
-        )
+        self.tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-mistral-7b-instruct")
         self.model = AutoModel.from_pretrained("intfloat/e5-mistral-7b-instruct")
 
     def preprocess(self, sentences: Sequence[str]) -> BatchEncoding:
@@ -53,9 +51,7 @@ def preprocess(self, sentences: Sequence[str]) -> BatchEncoding:
             [*input_ids, self.tokenizer.eos_token_id]
             for input_ids in batch_dict["input_ids"]  # type: ignore
         ]
-        batch_dict = self.tokenizer.pad(
-            batch_dict, padding=True, return_attention_mask=True, return_tensors="pt"
-        )
+        batch_dict = self.tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors="pt")
 
         return batch_dict
diff --git a/src/seb/registered_models/translate_e5_models.py b/src/seb/registered_models/translate_e5_models.py
index ea68758..bf300e1 100644
--- a/src/seb/registered_models/translate_e5_models.py
+++ b/src/seb/registered_models/translate_e5_models.py
@@ -23,12 +23,8 @@ def __init__(self, model_name: str) -> None:
     def translate(self, sentence: str, src_lang: str) -> str:
         self.trans_tokenizer.src_lang = src_lang
         encoded_sent = self.trans_tokenizer(sentence, return_tensors="pt")
-        gen_tokens = self.trans_model.generate(
-            **encoded_sent, forced_bos_token_id=self.trans_tokenizer.get_lang_id("en")
-        )
-        return self.trans_tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[
-            0
-        ]
+        gen_tokens = self.trans_model.generate(**encoded_sent, forced_bos_token_id=self.trans_tokenizer.get_lang_id("en"))
+        return self.trans_tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0]
 
     def encode(
         self,
diff --git a/src/seb/registered_tasks/speed.py b/src/seb/registered_tasks/speed.py
index 395da06..ecd929a 100644
--- a/src/seb/registered_tasks/speed.py
+++ b/src/seb/registered_tasks/speed.py
@@ -69,9 +69,7 @@ def evaluate(self, model: EmbeddingModel) -> TaskResult:
         if run_inference:
             time_taken = self.get_time_taken(model)
         else:
-            logger.warn(
-                f"Could not run inference on {model.meta.name} on {self.device} as it does not have a 'to' method. Skipping"
-            )
+            logger.warn(f"Could not run inference on {model.meta.name} on {self.device} as it does not have a 'to' method. Skipping")
             time_taken = np.nan
 
         scores: dict[str, Union[str, float]] = {
diff --git a/src/seb/result_dataclasses.py b/src/seb/result_dataclasses.py
index 38fa520..53c6672 100644
--- a/src/seb/result_dataclasses.py
+++ b/src/seb/result_dataclasses.py
@@ -28,9 +28,7 @@ class TaskResult(BaseModel):
     task_description: str
     task_version: str
     time_of_run: datetime
-    scores: dict[
-        Language, dict[str, Union[float, str]]
-    ]  # {language: {"metric": value}}.
+    scores: dict[Language, dict[str, Union[float, str]]]  # {language: {"metric": value}}.
     main_score: str
 
     def get_main_score(self, lang: Optional[Iterable[str]] = None) -> float:
@@ -155,9 +153,7 @@ def to_disk(self, path: Path) -> None:
         Write task results to a path.
         """
         if path.is_file():
-            raise ValueError(
-                "Can't save BenchmarkResults to a file. Path must be a directory."
-            )
+            raise ValueError("Can't save BenchmarkResults to a file. Path must be a directory.")
         path.mkdir(parents=True, exist_ok=True)
         for task_result in self.task_results:
             if isinstance(task_result, TaskResult):
@@ -174,9 +170,7 @@ def from_disk(cls, path: Path) -> "BenchmarkResults":
         Load task results from a path.
         """
         if not path.is_dir():
-            raise ValueError(
-                "Can't load BenchmarkResults from path: {path}. Path must be a directory."
-            )
+            raise ValueError("Can't load BenchmarkResults from path: {path}. Path must be a directory.")
         task_results = []
         for file in path.glob("*.json"):
             if file.stem == "meta":
From 37d165f162a324c33c180d767bcdbdc9765e972a Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen
Date: Thu, 25 Jan 2024 16:27:01 +0100
Subject: [PATCH 3/7] fix: Update indexes to strings

---
 src/seb/registered_tasks/mteb_retrieval.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/seb/registered_tasks/mteb_retrieval.py b/src/seb/registered_tasks/mteb_retrieval.py
index 3fb1b81..9d25f16 100644
--- a/src/seb/registered_tasks/mteb_retrieval.py
+++ b/src/seb/registered_tasks/mteb_retrieval.py
@@ -68,8 +68,8 @@ def dataset_transform(self) -> None:
                 if evidence not in text2id:
                     text2id[evidence] = len(text2id)
 
-                claim_id = text2id[claim]
-                evidence_id = text2id[evidence]
+                claim_id = str(text2id[claim])
+                evidence_id = str(text2id[evidence])
 
                 self.queries[split][claim_id] = claim
                 self.corpus[split][evidence_id] = {"title": "", "text": evidence}
From 22eb72bc1b530d3425d2b4fd1fb64d3bf3b2c971 Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen
Date: Thu, 25 Jan 2024 16:53:14 +0100
Subject: [PATCH 4/7] feat: Added performance metrics for danfever

---
 .../cache/intfloat__multilingual-e5-small/DanFEVER.json   | 1 +
 .../sentence-transformers__all-MiniLM-L6-v2/DanFEVER.json | 1 +
 src/seb/registered_tasks/mteb_retrieval.py                | 7 ++++---
 3 files changed, 6 insertions(+), 3 deletions(-)
 create mode 100644 src/seb/cache/intfloat__multilingual-e5-small/DanFEVER.json
 create mode 100644 src/seb/cache/sentence-transformers__all-MiniLM-L6-v2/DanFEVER.json

diff --git a/src/seb/cache/intfloat__multilingual-e5-small/DanFEVER.json b/src/seb/cache/intfloat__multilingual-e5-small/DanFEVER.json
new file mode 100644
index 0000000..4dce99a
--- /dev/null
+++ b/src/seb/cache/intfloat__multilingual-e5-small/DanFEVER.json
@@ -0,0 +1 @@
+{"task_name":"DanFEVER","task_description":"A Danish dataset intended for misinformation research. It follows the same format as the English FEVER dataset.","task_version":"1.1.1","time_of_run":"2024-01-25T16:46:10.510144","scores":{"da":{"ndcg_at_1":0.28982,"ndcg_at_3":0.36722,"ndcg_at_5":0.37753,"ndcg_at_10":0.38335,"ndcg_at_100":0.38781,"ndcg_at_1000":0.388,"map_at_1":0.28974,"map_at_3":0.34882,"map_at_5":0.35458,"map_at_10":0.35702,"map_at_100":0.35811,"map_at_1000":0.35812,"recall_at_1":0.28974,"recall_at_3":0.42013,"recall_at_5":0.445,"recall_at_10":0.46273,"recall_at_100":0.48188,"recall_at_1000":0.48329,"precision_at_1":0.28982,"precision_at_3":0.14007,"precision_at_5":0.08903,"precision_at_10":0.0463,"precision_at_100":0.00482,"precision_at_1000":0.00048,"mrr_at_1":0.28982,"mrr_at_3":0.34889,"mrr_at_5":0.35463,"mrr_at_10":0.35709,"mrr_at_100":0.35815,"mrr_at_1000":0.35816}},"main_score":"ndcg_at_10"}
\ No newline at end of file
diff --git a/src/seb/cache/sentence-transformers__all-MiniLM-L6-v2/DanFEVER.json b/src/seb/cache/sentence-transformers__all-MiniLM-L6-v2/DanFEVER.json
new file mode 100644
index 0000000..95dc1dd
--- /dev/null
+++ b/src/seb/cache/sentence-transformers__all-MiniLM-L6-v2/DanFEVER.json
@@ -0,0 +1 @@
+{"task_name":"DanFEVER","task_description":"A Danish dataset intended for misinformation research. It follows the same format as the English FEVER dataset.","task_version":"1.1.1","time_of_run":"2024-01-25T16:46:56.185726","scores":{"da":{"ndcg_at_1":0.21732,"ndcg_at_3":0.26397,"ndcg_at_5":0.27362,"ndcg_at_10":0.28184,"ndcg_at_100":0.29779,"ndcg_at_1000":0.30425,"map_at_1":0.21732,"map_at_3":0.25292,"map_at_5":0.25828,"map_at_10":0.26166,"map_at_100":0.26466,"map_at_1000":0.26488,"recall_at_1":0.21732,"recall_at_3":0.29578,"recall_at_5":0.31916,"recall_at_10":0.34466,"recall_at_100":0.42225,"recall_at_1000":0.47434,"precision_at_1":0.21732,"precision_at_3":0.09859,"precision_at_5":0.06383,"precision_at_10":0.03447,"precision_at_100":0.00423,"precision_at_1000":0.00047,"mrr_at_1":0.21732,"mrr_at_3":0.25297,"mrr_at_5":0.2583,"mrr_at_10":0.26168,"mrr_at_100":0.26468,"mrr_at_1000":0.2649}},"main_score":"ndcg_at_10"}
\ No newline at end of file
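Each cache file above is a serialized TaskResult (the pydantic model defined in result_dataclasses.py). A minimal sketch of inspecting one, assuming the installed pydantic is v2 so model_validate_json is available:

    from pathlib import Path

    import seb

    raw = Path("src/seb/cache/intfloat__multilingual-e5-small/DanFEVER.json").read_text()
    result = seb.TaskResult.model_validate_json(raw)
    print(result.get_main_score(["da"]))  # 0.38335, the cached ndcg_at_10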
diff --git a/src/seb/registered_tasks/mteb_retrieval.py b/src/seb/registered_tasks/mteb_retrieval.py
index 9d25f16..0a8d737 100644
--- a/src/seb/registered_tasks/mteb_retrieval.py
+++ b/src/seb/registered_tasks/mteb_retrieval.py
@@ -39,7 +39,7 @@ def dataset_transform(self) -> None:
         """
         and transform it into a retrieval dataset, which has the following attributes:
 
-        self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document datas like title and text
+        self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
         self.queries = Dict[query_id, str] #id => query
         self.relevant_docs = Dict[query_id, Dict[doc_id, score]]
@@ -57,9 +57,10 @@ def dataset_transform(self) -> None:
             claims = ds["claim"]
             evidences = ds["evidence_extract"]
             labels = ds["label"]
+            class_labels = ds.features["label"].names
 
-            for claim, evidence, label in zip(claims, evidences, labels):
-                claim_is_supported = label == "Supported"
+            for claim, evidence, label_id in zip(claims, evidences, labels):
+                claim_is_supported = class_labels[label_id] == "Supported"
 
                 sim = 1 if claim_is_supported else 0  # negative for refuted claims - is that what we want?
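The substantive fix in the hunk above: the dataset's label column is a ClassLabel, so each label arrives as an integer index, and the old comparison label == "Supported" could never be true. A short sketch of the decoding, where the exact label order is an assumption about the strombergnlp/danfever schema:

    import datasets

    ds = datasets.load_dataset("strombergnlp/danfever", split="train")
    names = ds.features["label"].names  # e.g. ["Refuted", "Supported", "NotEnoughInfo"]
    label_str = names[ds[0]["label"]]   # integer index -> string label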
From be2c071f85fb2454bb4b90e1dbad8602eef6d4bf Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen
Date: Thu, 25 Jan 2024 16:59:12 +0100
Subject: [PATCH 5/7] tests: convert test_task back to normal

---
 tests/test_tasks.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/test_tasks.py b/tests/test_tasks.py
index 187f614..c2a5f0c 100644
--- a/tests/test_tasks.py
+++ b/tests/test_tasks.py
@@ -52,11 +52,10 @@ def test_task_result_main_score(task_result: seb.TaskResult):
     assert task_result.get_main_score(["da", "nb"]) - 0.3 < 0.0001
 
 
-# @pytest.mark.skip(
-#     reason="This test downloads all datasets. It takes a long time to test and often fails due to errors on HF's side.",
-# )
-# @pytest.mark.parametrize("task_name", all_tasks_names)
-@pytest.mark.parametrize("task_name", ["DanFEVER"])
+@pytest.mark.skip(
+    reason="This test downloads all datasets. It takes a long time to test and often fails due to errors on HF's side.",
+)
+@pytest.mark.parametrize("task_name", all_tasks_names)
 @pytest.mark.parametrize("model_name", ["sentence-transformers/all-MiniLM-L6-v2"])
 def test_all_tasks(task_name: str, model_name: str):
     task: seb.Task = seb.get_task(task_name)
From 04aa44e5c62e26919eddd16487f92c37b85f6fb4 Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen
Date: Thu, 25 Jan 2024 17:04:15 +0100
Subject: [PATCH 6/7] tests: remove tests which have to be changed when adding
 new datasets

---
 tests/cli/test_cli.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py
index bfb3adf..2572343 100644
--- a/tests/cli/test_cli.py
+++ b/tests/cli/test_cli.py
@@ -33,7 +33,6 @@ def to_command(self, output_path: Path) -> list[str]:
     BenchmarkCliTestInput("sentence-transformers/all-MiniLM-L6-v2", 0.550, tasks=["DKHate"]),
     BenchmarkCliTestInput("sentence-transformers/all-MiniLM-L6-v2", 0.525, tasks=["DKHate", "ScaLA"]),
     BenchmarkCliTestInput("sentence-transformers/all-MiniLM-L6-v2", 0.50, tasks=["DKHate", "ScaLA"], languages=["sv", "nn", "nb"]),
-    BenchmarkCliTestInput("sentence-transformers/all-MiniLM-L6-v2", 0.423, languages=["da"]),
     BenchmarkCliTestInput(
         "test_model", np.nan, code_path=(test_dir / "benchmark_cli_code_inject.py"), tasks=["test-encode-task"], ignore_cache=True
     ),
From a572962c5f6fc6826182c86aa5b6ad5ec1af076e Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen
Date: Thu, 25 Jan 2024 17:04:53 +0100
Subject: [PATCH 7/7] appease pyright

---
 src/seb/interfaces/mteb_task.py       |  2 +-
 src/seb/registered_models/fasttext.py | 11 ++++++-----
 src/seb/result_dataclasses.py         |  2 +-
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/seb/interfaces/mteb_task.py b/src/seb/interfaces/mteb_task.py
index a978dff..184160b 100644
--- a/src/seb/interfaces/mteb_task.py
+++ b/src/seb/interfaces/mteb_task.py
@@ -96,7 +96,7 @@ def evaluate(self, model: Encoder) -> TaskResult:
             task_description=self.description,
             task_version=self.version,
             time_of_run=time_of_run,
-            scores=scores,
+            scores=scores,  # type: ignore
             main_score=self.main_score,
         )
 
diff --git a/src/seb/registered_models/fasttext.py b/src/seb/registered_models/fasttext.py
index ac6bedc..2465b93 100644
--- a/src/seb/registered_models/fasttext.py
+++ b/src/seb/registered_models/fasttext.py
@@ -1,5 +1,6 @@
 from collections.abc import Sequence
 from functools import partial
+from typing import Any
 
 import numpy as np
 import torch
@@ -10,8 +11,8 @@
 
 class FastTextModel(seb.Encoder):
     def __init__(self, model_name: str, lang: str) -> None:
-        import fasttext
-        import fasttext.util
+        import fasttext  # type: ignore
+        import fasttext.util  # type: ignore
 
         fasttext.util.download_model(self.lang, if_exists="ignore")
         self.model = fasttext.load_model(self.model_name)
@@ -22,16 +23,16 @@ def get_embedding_dim(self) -> int:
         v = self.encode(["get emb dim"])
         return v.shape[1]
 
-    def encode(
+    def encode(  # type: ignore
         self,
         sentences: Sequence[str],
-        **kwargs: dict,  # noqa: ARG002
+        **kwargs: Any,  # noqa: ARG002
     ) -> torch.Tensor:
         embeddings = []
         for sentence in sentences:
             # This is to appease FastText as they made the function err
             # if there's a \n in the sentence.
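             # (get_sentence_vector processes one line at a time and raises a
             # ValueError if the input still contains a newline.)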
-            sentence = " ".join(sentence.split())
+            sentence = " ".join(sentence.split())  # noqa
             sentence_embedding = self.model.get_sentence_vector(sentence)
             embeddings.append(sentence_embedding)
         return torch.tensor(np.stack(embeddings))
diff --git a/src/seb/result_dataclasses.py b/src/seb/result_dataclasses.py
index 2e3021c..91f384e 100644
--- a/src/seb/result_dataclasses.py
+++ b/src/seb/result_dataclasses.py
@@ -46,7 +46,7 @@ def get_main_score(self, lang: Optional[Iterable[str]] = None) -> float:
             lang = self.scores.keys()
 
         for l in lang:
-            main_scores.append(self.scores[l][self.main_score])
+            main_scores.append(self.scores[l][self.main_score])  # type: ignore
 
         return sum(main_scores) / len(main_scores)
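Taken together, the series registers DanFEVER as a retrieval task and checks in cached scores for two models. A minimal sketch of running the task end to end, assuming seb exposes a get_model counterpart to the get_task helper used in the tests:

    import seb

    task = seb.get_task("DanFEVER")
    model = seb.get_model("sentence-transformers/all-MiniLM-L6-v2")  # assumed helper
    result = task.evaluate(model)  # returns a seb.TaskResult
    print(result.get_main_score(["da"]))  # ndcg_at_10; 0.28184 per the cache above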