Add DanFEVER #97

Merged
9 commits merged on Jan 26, 2024
@@ -0,0 +1 @@
{"task_name":"DanFEVER","task_description":"A Danish dataset intended for misinformation research. It follows the same format as the English FEVER dataset.","task_version":"1.1.1","time_of_run":"2024-01-25T16:46:10.510144","scores":{"da":{"ndcg_at_1":0.28982,"ndcg_at_3":0.36722,"ndcg_at_5":0.37753,"ndcg_at_10":0.38335,"ndcg_at_100":0.38781,"ndcg_at_1000":0.388,"map_at_1":0.28974,"map_at_3":0.34882,"map_at_5":0.35458,"map_at_10":0.35702,"map_at_100":0.35811,"map_at_1000":0.35812,"recall_at_1":0.28974,"recall_at_3":0.42013,"recall_at_5":0.445,"recall_at_10":0.46273,"recall_at_100":0.48188,"recall_at_1000":0.48329,"precision_at_1":0.28982,"precision_at_3":0.14007,"precision_at_5":0.08903,"precision_at_10":0.0463,"precision_at_100":0.00482,"precision_at_1000":0.00048,"mrr_at_1":0.28982,"mrr_at_3":0.34889,"mrr_at_5":0.35463,"mrr_at_10":0.35709,"mrr_at_100":0.35815,"mrr_at_1000":0.35816}},"main_score":"ndcg_at_10"}
@@ -0,0 +1 @@
{"task_name":"DanFEVER","task_description":"A Danish dataset intended for misinformation research. It follows the same format as the English FEVER dataset.","task_version":"1.1.1","time_of_run":"2024-01-25T16:46:56.185726","scores":{"da":{"ndcg_at_1":0.21732,"ndcg_at_3":0.26397,"ndcg_at_5":0.27362,"ndcg_at_10":0.28184,"ndcg_at_100":0.29779,"ndcg_at_1000":0.30425,"map_at_1":0.21732,"map_at_3":0.25292,"map_at_5":0.25828,"map_at_10":0.26166,"map_at_100":0.26466,"map_at_1000":0.26488,"recall_at_1":0.21732,"recall_at_3":0.29578,"recall_at_5":0.31916,"recall_at_10":0.34466,"recall_at_100":0.42225,"recall_at_1000":0.47434,"precision_at_1":0.21732,"precision_at_3":0.09859,"precision_at_5":0.06383,"precision_at_10":0.03447,"precision_at_100":0.00423,"precision_at_1000":0.00047,"mrr_at_1":0.21732,"mrr_at_3":0.25297,"mrr_at_5":0.2583,"mrr_at_10":0.26168,"mrr_at_100":0.26468,"mrr_at_1000":0.2649}},"main_score":"ndcg_at_10"}
10 changes: 10 additions & 0 deletions src/seb/registered_tasks/danish.py
@@ -63,3 +63,13 @@ def create_da_political_comments() -> Task:
task.domain = ["social"]
task.reference = "https://huggingface.co/datasets/danish_political_comments" # TODO: Make a PR for MTEB to add this reference
return task


@tasks.register("DanFEVER")
def create_dan_fever() -> Task:
from .mteb_retrieval import DanFever

task = MTEBTask(DanFever())
task.name = "DanFEVER"
task.domain = ["wiki", "non-fiction"]
return task
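
Once registered, the task can be looked up by name, which is also what the test at the bottom of this PR does. A minimal sketch, assuming get_task simply returns the registered task object:

import seb

task = seb.get_task("DanFEVER")
print(task.name)    # "DanFEVER"
print(task.domain)  # ["wiki", "non-fiction"]
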
81 changes: 81 additions & 0 deletions src/seb/registered_tasks/mteb_retrieval.py
@@ -0,0 +1,81 @@
from typing import Any

import datasets
from mteb.abstasks import AbsTaskRetrieval


class DanFever(AbsTaskRetrieval):
@property
def description(self) -> dict[str, Any]:
return {
"name": "DanFEVER",
"hf_hub_name": "strombergnlp/danfever",
"description": "A Danish dataset intended for misinformation research. It follows the same format as the English FEVER dataset.",
"reference": "https://aclanthology.org/2021.nodalida-main.47/",
"type": "Retrieval",
"category": "p2p",
"eval_splits": ["train"],
"eval_langs": ["da"],
"main_score": "ndcg_at_10",
"revision": "5d01e3f6a661d48e127ab5d7e3aaa0dc8331438a",
}

def load_data(self, **kwargs: dict): # noqa: ARG002
"""
Load dataset from HuggingFace hub
"""
if self.data_loaded:
return

self.dataset: datasets.DatasetDict = datasets.load_dataset(
self.description["hf_hub_name"],
revision=self.description.get("revision"),
) # type: ignore

self.dataset_transform()
self.data_loaded = True

def dataset_transform(self) -> None:
"""
Transform the dataset into a retrieval dataset with the following attributes:

self.corpus = Dict[doc_id, Dict[str, str]]  # id => dict with document data like title and text
self.queries = Dict[query_id, str]  # id => query
self.relevant_docs = Dict[query_id, Dict[doc_id, score]]
"""
self.corpus = {}
self.relevant_docs = {}
self.queries = {}
text2id = {}

for split in self.dataset:
self.corpus[split] = {}
self.relevant_docs[split] = {}
self.queries[split] = {}

ds = self.dataset[split]
claims = ds["claim"]
evidences = ds["evidence_extract"]
labels = ds["label"]
class_labels = ds.features["label"].names

for claim, evidence, label_id in zip(claims, evidences, labels):
claim_is_supported = class_labels[label_id] == "Supported"

sim = 1 if claim_is_supported else 0 # 0 for refuted / not-enough-evidence claims - is that what we want?
Owner Author

@Muennighoff - The DanFEVER dataset is similar to the FEVER dataset. Just wanted to make sure that this dataset is constructed fairly similarly to FEVER.

I use the claim as the query to all the evidence segments as the corpus. The relevance score is then determined by whether the claim is supported.

However, I am unsure if assigning 0 to "not supported" and "not enough evidence" is meaningful.

What are your thoughts?
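
For concreteness, a toy sketch of the construction described above; the Danish strings are made up and the exact non-supported label names are assumptions, not values checked against the dataset:

# One DanFEVER row becomes one query (the claim), one corpus document
# (the evidence extract), and one qrels entry whose score is 1 only when
# the claim is supported.
row = {
    "claim": "H.C. Andersen blev født i Odense.",  # made-up example
    "evidence_extract": "Hans Christian Andersen blev født den 2. april 1805 i Odense.",
    "label": "Supported",
}

queries = {"0": row["claim"]}
corpus = {"1": {"title": "", "text": row["evidence_extract"]}}
qrels = {"0": {"1": 1 if row["label"] == "Supported" else 0}}  # 0 for "not supported" / "not enough evidence"
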

Collaborator

However, I am unsure if assigning 0 to "not supported" and "not enough evidence" is meaningful.

If that's the same way it is done for FEVER, then I think it's okay!

Owner Author
@KennethEnevoldsen Jan 25, 2024

I am unsure how it is done for FEVER (can I find the processing script somewhere?)

Collaborator

This is what it says in BEIR, so it does seem like everything that's not the evidence is a 0:
FEVER [60] The Fact Extraction and VERification dataset is collected to facilitate the automatic fact checking. We utilize the original paper splits as queries Q and retrieve evidences from the pre-processed Wikipedia Abstracts (June 2017 dump) as our corpus T


if claim not in text2id:
text2id[claim] = len(text2id)
if evidence not in text2id:
text2id[evidence] = len(text2id)

claim_id = str(text2id[claim])
evidence_id = str(text2id[evidence])

self.queries[split][claim_id] = claim
self.corpus[split][evidence_id] = {"title": "", "text": evidence}

if claim_id not in self.relevant_docs[split]:
self.relevant_docs[split][claim_id] = {}

self.relevant_docs[split][claim_id][evidence_id] = sim
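
A rough usage sketch of the new task class; it downloads the dataset from the Hugging Face hub and assumes the AbsTaskRetrieval subclass can be instantiated without arguments:

task = DanFever()
task.load_data()  # fetches strombergnlp/danfever and runs dataset_transform()

# Only the "train" split is declared in eval_splits above.
print(len(task.queries["train"]), "queries")
print(len(task.corpus["train"]), "documents")
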
9 changes: 5 additions & 4 deletions tests/test_tasks.py
@@ -52,10 +52,11 @@ def test_task_result_main_score(task_result: seb.TaskResult):
assert task_result.get_main_score(["da", "nb"]) - 0.3 < 0.0001


-@pytest.mark.skip(
-reason="This test downloads all datasets. It takes a long time to test and often fails due to errors on HF's side.",
-)
-@pytest.mark.parametrize("task_name", all_tasks_names)
+# @pytest.mark.skip(
+# reason="This test downloads all datasets. It takes a long time to test and often fails due to errors on HF's side.",
+# )
+# @pytest.mark.parametrize("task_name", all_tasks_names)
+@pytest.mark.parametrize("task_name", ["DanFEVER"])
@pytest.mark.parametrize("model_name", ["sentence-transformers/all-MiniLM-L6-v2"])
def test_all_tasks(task_name: str, model_name: str):
task: seb.Task = seb.get_task(task_name)