-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #97 from KennethEnevoldsen/add-danfever
Add danFEVER
- Loading branch information
Showing
8 changed files
with
101 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"task_name":"DanFEVER","task_description":"A Danish dataset intended for misinformation research. It follows the same format as the English FEVER dataset.","task_version":"1.1.1","time_of_run":"2024-01-25T16:46:10.510144","scores":{"da":{"ndcg_at_1":0.28982,"ndcg_at_3":0.36722,"ndcg_at_5":0.37753,"ndcg_at_10":0.38335,"ndcg_at_100":0.38781,"ndcg_at_1000":0.388,"map_at_1":0.28974,"map_at_3":0.34882,"map_at_5":0.35458,"map_at_10":0.35702,"map_at_100":0.35811,"map_at_1000":0.35812,"recall_at_1":0.28974,"recall_at_3":0.42013,"recall_at_5":0.445,"recall_at_10":0.46273,"recall_at_100":0.48188,"recall_at_1000":0.48329,"precision_at_1":0.28982,"precision_at_3":0.14007,"precision_at_5":0.08903,"precision_at_10":0.0463,"precision_at_100":0.00482,"precision_at_1000":0.00048,"mrr_at_1":0.28982,"mrr_at_3":0.34889,"mrr_at_5":0.35463,"mrr_at_10":0.35709,"mrr_at_100":0.35815,"mrr_at_1000":0.35816}},"main_score":"ndcg_at_10"} |
1 change: 1 addition & 0 deletions
1
src/seb/cache/sentence-transformers__all-MiniLM-L6-v2/DanFEVER.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"task_name":"DanFEVER","task_description":"A Danish dataset intended for misinformation research. It follows the same format as the English FEVER dataset.","task_version":"1.1.1","time_of_run":"2024-01-25T16:46:56.185726","scores":{"da":{"ndcg_at_1":0.21732,"ndcg_at_3":0.26397,"ndcg_at_5":0.27362,"ndcg_at_10":0.28184,"ndcg_at_100":0.29779,"ndcg_at_1000":0.30425,"map_at_1":0.21732,"map_at_3":0.25292,"map_at_5":0.25828,"map_at_10":0.26166,"map_at_100":0.26466,"map_at_1000":0.26488,"recall_at_1":0.21732,"recall_at_3":0.29578,"recall_at_5":0.31916,"recall_at_10":0.34466,"recall_at_100":0.42225,"recall_at_1000":0.47434,"precision_at_1":0.21732,"precision_at_3":0.09859,"precision_at_5":0.06383,"precision_at_10":0.03447,"precision_at_100":0.00423,"precision_at_1000":0.00047,"mrr_at_1":0.21732,"mrr_at_3":0.25297,"mrr_at_5":0.2583,"mrr_at_10":0.26168,"mrr_at_100":0.26468,"mrr_at_1000":0.2649}},"main_score":"ndcg_at_10"} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
from typing import Any | ||
|
||
import datasets | ||
from mteb.abstasks import AbsTaskRetrieval | ||
|
||
|
||
class DanFever(AbsTaskRetrieval):
    """Retrieval task built on the DanFEVER claim/evidence dataset.

    Every claim is used as a query and every evidence extract as a corpus
    document; the claim's label decides the relevance score of the pair.
    """

    @property
    def description(self) -> dict[str, Any]:
        """Return the task metadata: dataset location, splits, language and main score."""
        return {
            "name": "DanFEVER",
            "hf_hub_name": "strombergnlp/danfever",
            "description": "A Danish dataset intended for misinformation research. It follows the same format as the English FEVER dataset.",
            "reference": "https://aclanthology.org/2021.nodalida-main.47/",
            "type": "Retrieval",
            "category": "p2p",
            "eval_splits": ["train"],
            "eval_langs": ["da"],
            "main_score": "ndcg_at_10",
            "revision": "5d01e3f6a661d48e127ab5d7e3aaa0dc8331438a",
        }

    def load_data(self, **kwargs: Any) -> None:  # noqa: ARG002
        """
        Load the dataset from the HuggingFace hub and convert it to retrieval format.

        Returns immediately when ``self.data_loaded`` is already truthy, so the
        download and transformation run at most once per instance.
        """
        if self.data_loaded:
            return

        self.dataset: datasets.DatasetDict = datasets.load_dataset(
            self.description["hf_hub_name"],
            revision=self.description.get("revision"),
        )  # type: ignore

        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self) -> None:
        """
        Transform ``self.dataset`` into a retrieval dataset.

        Sets the following attributes, each keyed by split name first:
        self.corpus = Dict[split, Dict[doc_id, Dict[str, str]]]  # id => {"title": ..., "text": ...}
        self.queries = Dict[split, Dict[query_id, str]]  # id => claim text
        self.relevant_docs = Dict[split, Dict[query_id, Dict[doc_id, score]]]
        """
        self.corpus = {}
        self.relevant_docs = {}
        self.queries = {}
        # Single id space for claims and evidence, shared across all splits:
        # identical texts always map to the same id. Ids are stored as strings
        # so no later conversion is needed.
        text2id: dict[str, str] = {}

        for split in self.dataset:
            self.corpus[split] = {}
            self.relevant_docs[split] = {}
            self.queries[split] = {}

            ds = self.dataset[split]
            claims = ds["claim"]
            evidences = ds["evidence_extract"]
            labels = ds["label"]
            # Label column holds integer class ids; map them back to names.
            class_labels = ds.features["label"].names

            for claim, evidence, label_id in zip(claims, evidences, labels):
                claim_is_supported = class_labels[label_id] == "Supported"

                # NOTE(review): every label other than "Supported" gets
                # relevance 0 for its evidence - confirm this is the intended
                # treatment of refuted claims.
                sim = 1 if claim_is_supported else 0

                if claim not in text2id:
                    text2id[claim] = str(len(text2id))
                if evidence not in text2id:
                    text2id[evidence] = str(len(text2id))

                claim_id = text2id[claim]
                evidence_id = text2id[evidence]

                self.queries[split][claim_id] = claim
                self.corpus[split][evidence_id] = {"title": "", "text": evidence}

                # A claim may occur in several rows; collect all its evidence.
                self.relevant_docs[split].setdefault(claim_id, {})[evidence_id] = sim
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters