From c76535c76565e2cbec8695b9024742a055705919 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Tue, 21 May 2024 09:43:09 +0200 Subject: [PATCH 1/3] docs: Added licenses --- docs/datasets.md | 80 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 26 deletions(-) diff --git a/docs/datasets.md b/docs/datasets.md index ffe92e00..fd989ada 100644 --- a/docs/datasets.md +++ b/docs/datasets.md @@ -7,35 +7,63 @@ The following tables contains description of all the dataset in the benchmark al -| Dataset | Description | Main Score | Languages | Type | Domains | Number of Documents | Mean Length of Documents (characters) | -|:---------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-------------|:-----------------------|:---------------|:---------------------------------------------------------------------|----------------------:|:----------------------------------------| -| [Angry Tweets](https://aclanthology.org/2021.nodalida-main.53/) | A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets | Accuracy | da | Classification | social | 1047 | 156.15 (std: 82.02) | -| [Bornholm Parallel](https://aclanthology.org/W19-6138/) | Danish Bornholmsk Parallel Corpus. Bornholmsk is a Danish dialect spoken on the island of Bornholm, Denmark. Historically it is a part of east Danish which was also spoken in Scania and Halland, Sweden. 
| F1 | da, da-bornholm | BitextMining | poetry, wiki, fiction, web, social | 1000 | 44.36 (std: 41.22) | -| [DKHate](https://aclanthology.org/2020.lrec-1.430/) | Danish Tweets annotated for Hate Speech either being Offensive or not | Accuracy | da | Classification | social | 329 | 88.18 (std: 168.30) | -| [Da Political Comments](https://huggingface.co/datasets/danish_political_comments) | A dataset of Danish political comments rated for sentiment | Accuracy | da | Classification | social | 7206 | 69.60 (std: 62.85) | -| [DaLAJ](https://spraakbanken.gu.se/en/resources/superlim) | A Swedish dataset for linguistic acceptability. Available as a part of Superlim. | Accuracy | sv | Classification | fiction, non-fiction | 888 | 120.77 (std: 67.95) | -| [DanFEVER](https://aclanthology.org/2021.nodalida-main.47/) | A Danish dataset intended for misinformation research. It follows the same format as the English FEVER dataset. | Ndcg_at_10 | da | Retrieval | wiki, non-fiction | 8897 | 124.84 (std: 168.53) | -| [LCC](https://github.com/fnielsen/lcc-sentiment) | The leipzig corpora collection, annotated for sentiment | Accuracy | da | Classification | legal, web, news, social, fiction, non-fiction, academic, government | 150 | 118.73 (std: 57.82) | -| [Language Identification](https://aclanthology.org/2021.vardial-1.8/) | A dataset for Nordic language identification. | Accuracy | da, sv, nb, nn, is, fo | Classification | wiki | 3000 | 78.23 (std: 48.54) | -| [Massive Intent](https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.) | MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages | Accuracy | da, nb, sv | Classification | spoken | 15021 | 34.65 (std: 16.99) | -| [Massive Scenario](https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.) 
| MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages | Accuracy | da, nb, sv | Classification | spoken | 15021 | 34.65 (std: 16.99) | -| [NoReC](https://aclanthology.org/L18-1661/) | A Norwegian dataset for sentiment classification on review | Accuracy | nb | Classification | reviews | 2048 | 89.62 (std: 61.21) | -| [NorQuad](https://aclanthology.org/2023.nodalida-1.17/) | Human-created question for Norwegian wikipedia passages. | Ndcg_at_10 | nb | Retrieval | non-fiction, wiki | 2602 | 502.19 (std: 875.23) | -| [Norwegian courts](https://opus.nlpl.eu/ELRC-Courts_Norway-v1.php) | Nynorsk and Bokmål parallel corpus from Norwegian courts. Norway has two standardised written languages. Bokmål is a variant closer to Danish, while Nynorsk was created to resemble regional dialects of Norwegian. | F1 | nb, nn | BitextMining | legal, non-fiction | 456 | 82.11 (std: 49.48) | -| [Norwegian parliament](https://huggingface.co/datasets/NbAiLab/norwegian_parliament) | Norwegian parliament speeches annotated with the party of the speaker (`Sosialistisk Venstreparti` vs `Fremskrittspartiet`) | Accuracy | nb | Classification | spoken | 2400 | 1897.51 (std: 1988.62) | -| [SNL Clustering](https://huggingface.co/datasets/navjordj/SNL_summarization) | Webscrabed articles from the Norwegian lexicon 'Det Store Norske Leksikon'. Uses articles categories as clusters. | V_measure | nb | Clustering | non-fiction, wiki | 2048 | 1101.30 (std: 2168.35) | -| [SNL Retrieval](https://huggingface.co/datasets/navjordj/SNL_summarization) | Webscrabed articles and ingresses from the Norwegian lexicon 'Det Store Norske Leksikon'. | Ndcg_at_10 | nb | Retrieval | non-fiction, wiki | 2600 | 1001.43 (std: 2537.83) | -| [ScaLA](https://aclanthology.org/2023.nodalida-1.20/) | A linguistic acceptability task for Danish, Norwegian Bokmål Norwegian Nynorsk and Swedish. 
| Accuracy | da, nb, sv, nn | Classification | fiction, news, non-fiction, spoken, blog | 8192 | 102.45 (std: 55.49) | -| [SweFAQ](https://spraakbanken.gu.se/en/resources/superlim) | A Swedish QA dataset derived from FAQ | Ndcg_at_10 | sv | Retrieval | non-fiction, web | 1024 | 195.44 (std: 209.33) | -| [SweReC](https://aclanthology.org/2023.nodalida-1.20/) | A Swedish dataset for sentiment classification on review | Accuracy | sv | Classification | reviews | 2048 | 318.83 (std: 499.57) | -| [SwednClustering](https://spraakbanken.gu.se/en/resources/swedn) | The SWE-DN corpus is based on 1,963,576 news articles from the Swedish newspaper Dagens Nyheter (DN) during the years 2000--2020. The articles are filtered to resemble the CNN/DailyMail dataset both regarding textual structure. This dataset uses the category labels as clusters. | V_measure | sv | Clustering | non-fiction, news | 2048 | 1619.71 (std: 2220.36) | -| [SwednRetrieval](https://spraakbanken.gu.se/en/resources/swedn) | News Article Summary Semantic Similarity Estimation. | Ndcg_at_10 | sv | Retrieval | non-fiction, news | 3070 | 1946.35 (std: 3071.98) | -| [TV2Nord Retrieval](https://huggingface.co/datasets/alexandrainst/nordjylland-news-summarization) | News Article and corresponding summaries extracted from the Danish newspaper TV2 Nord. | Ndcg_at_10 | da | Retrieval | news, non-fiction | 4096 | 784.11 (std: 982.97) | -| [Twitterhjerne](https://huggingface.co/datasets/sorenmulli/da-hashtag-twitterhjerne) | Danish question asked on Twitter with the Hashtag #Twitterhjerne ('Twitter brain') and their corresponding answer. | Ndcg_at_10 | da | Retrieval | social | 340 | 138.23 (std: 82.41) | -| [VG Clustering](https://huggingface.co/datasets/navjordj/VG_summarization) | Articles and their classes (e.g. sports) from VG news articles extracted from Norsk Aviskorpus. 
| V_measure | nb | Clustering | non-fiction, news | 2048 | 1009.65 (std: 1597.60) | +| Dataset | Description | Main Score | Languages | Type | Domains | Number of Documents | Mean Length of Documents (characters) | +| :------------------------------------------------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :--------- | :--------------------- | :------------- | :------------------------------------------------------------------- | ------------------: | :------------------------------------ | +| [Angry Tweets](https://aclanthology.org/2021.nodalida-main.53/) | A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets | Accuracy | da | Classification | social | 1047 | 156.15 (std: 82.02) | +| [Bornholm Parallel](https://aclanthology.org/W19-6138/) | Danish Bornholmsk Parallel Corpus. Bornholmsk is a Danish dialect spoken on the island of Bornholm, Denmark. Historically it is a part of east Danish which was also spoken in Scania and Halland, Sweden. | F1 | da, da-bornholm | BitextMining | poetry, wiki, fiction, web, social | 1000 | 44.36 (std: 41.22) | +| [DKHate](https://aclanthology.org/2020.lrec-1.430/) | Danish Tweets annotated for Hate Speech either being Offensive or not | Accuracy | da | Classification | social | 329 | 88.18 (std: 168.30) | +| [Da Political Comments](https://huggingface.co/datasets/danish_political_comments) | A dataset of Danish political comments rated for sentiment | Accuracy | da | Classification | social | 7206 | 69.60 (std: 62.85) | +| [DaLAJ](https://spraakbanken.gu.se/en/resources/superlim) | A Swedish dataset for linguistic acceptability. Available as a part of Superlim. 
| Accuracy | sv | Classification | fiction, non-fiction | 888 | 120.77 (std: 67.95) | +| [DanFEVER](https://aclanthology.org/2021.nodalida-main.47/) | A Danish dataset intended for misinformation research. It follows the same format as the English FEVER dataset. | Ndcg_at_10 | da | Retrieval | wiki, non-fiction | 8897 | 124.84 (std: 168.53) | +| [LCC](https://github.com/fnielsen/lcc-sentiment) | The leipzig corpora collection, annotated for sentiment | Accuracy | da | Classification | legal, web, news, social, fiction, non-fiction, academic, government | 150 | 118.73 (std: 57.82) | +| [Language Identification](https://aclanthology.org/2021.vardial-1.8/) | A dataset for Nordic language identification. | Accuracy | da, sv, nb, nn, is, fo | Classification | wiki | 3000 | 78.23 (std: 48.54) | +| [Massive Intent](https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.) | MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages | Accuracy | da, nb, sv | Classification | spoken | 15021 | 34.65 (std: 16.99) | +| [Massive Scenario](https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.) | MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages | Accuracy | da, nb, sv | Classification | spoken | 15021 | 34.65 (std: 16.99) | +| [NoReC](https://aclanthology.org/L18-1661/) | A Norwegian dataset for sentiment classification on review | Accuracy | nb | Classification | reviews | 2048 | 89.62 (std: 61.21) | +| [NorQuad](https://aclanthology.org/2023.nodalida-1.17/) | Human-created question for Norwegian wikipedia passages. 
| Ndcg_at_10 | nb | Retrieval | non-fiction, wiki | 2602 | 502.19 (std: 875.23) | +| [Norwegian courts](https://opus.nlpl.eu/ELRC-Courts_Norway-v1.php) | Nynorsk and Bokmål parallel corpus from Norwegian courts. Norway has two standardised written languages. Bokmål is a variant closer to Danish, while Nynorsk was created to resemble regional dialects of Norwegian. | F1 | nb, nn | BitextMining | legal, non-fiction | 456 | 82.11 (std: 49.48) | +| [Norwegian parliament](https://huggingface.co/datasets/NbAiLab/norwegian_parliament) | Norwegian parliament speeches annotated with the party of the speaker (`Sosialistisk Venstreparti` vs `Fremskrittspartiet`) | Accuracy | nb | Classification | spoken | 2400 | 1897.51 (std: 1988.62) | +| [SNL Clustering](https://huggingface.co/datasets/navjordj/SNL_summarization) | Web-scraped articles from the Norwegian lexicon 'Det Store Norske Leksikon'. Uses article categories as clusters. | V_measure | nb | Clustering | non-fiction, wiki | 2048 | 1101.30 (std: 2168.35) | +| [SNL Retrieval](https://huggingface.co/datasets/navjordj/SNL_summarization) | Web-scraped articles and ingresses from the Norwegian lexicon 'Det Store Norske Leksikon'. | Ndcg_at_10 | nb | Retrieval | non-fiction, wiki | 2600 | 1001.43 (std: 2537.83) | +| [ScaLA](https://aclanthology.org/2023.nodalida-1.20/) | A linguistic acceptability task for Danish, Norwegian Bokmål, Norwegian Nynorsk and Swedish. 
| Accuracy | da, nb, sv, nn | Classification | fiction, news, non-fiction, spoken, blog | 8192 | 102.45 (std: 55.49) | +| [SweFAQ](https://spraakbanken.gu.se/en/resources/superlim) | A Swedish QA dataset derived from FAQ | Ndcg_at_10 | sv | Retrieval | non-fiction, web | 1024 | 195.44 (std: 209.33) | +| [SweReC](https://aclanthology.org/2023.nodalida-1.20/) | A Swedish dataset for sentiment classification on review | Accuracy | sv | Classification | reviews | 2048 | 318.83 (std: 499.57) | +| [SwednClustering](https://spraakbanken.gu.se/en/resources/swedn) | The SWE-DN corpus is based on 1,963,576 news articles from the Swedish newspaper Dagens Nyheter (DN) during the years 2000--2020. The articles are filtered to resemble the CNN/DailyMail dataset both regarding textual structure. This dataset uses the category labels as clusters. | V_measure | sv | Clustering | non-fiction, news | 2048 | 1619.71 (std: 2220.36) | +| [SwednRetrieval](https://spraakbanken.gu.se/en/resources/swedn) | News Article Summary Semantic Similarity Estimation. | Ndcg_at_10 | sv | Retrieval | non-fiction, news | 3070 | 1946.35 (std: 3071.98) | +| [TV2Nord Retrieval](https://huggingface.co/datasets/alexandrainst/nordjylland-news-summarization) | News Article and corresponding summaries extracted from the Danish newspaper TV2 Nord. | Ndcg_at_10 | da | Retrieval | news, non-fiction | 4096 | 784.11 (std: 982.97) | +| [Twitterhjerne](https://huggingface.co/datasets/sorenmulli/da-hashtag-twitterhjerne) | Danish question asked on Twitter with the Hashtag #Twitterhjerne ('Twitter brain') and their corresponding answer. | Ndcg_at_10 | da | Retrieval | social | 340 | 138.23 (std: 82.41) | +| [VG Clustering](https://huggingface.co/datasets/navjordj/VG_summarization) | Articles and their classes (e.g. sports) from VG news articles extracted from Norsk Aviskorpus. 
| V_measure | nb | Clustering | non-fiction, news | 2048 | 1009.65 (std: 1597.60) | +## Dataset Licenses + + +| Dataset | License | +| :------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------- | +| [Angry Tweets](https://aclanthology.org/2021.nodalida-main.53/) | CC-BY-4.0 | +| [Bornholm Parallel](https://aclanthology.org/W19-6138/) | CC-BY-4.0 | +| [DKHate](https://aclanthology.org/2020.lrec-1.430/) | CC-BY-4.0 | +| [Da Political Comments](https://huggingface.co/datasets/danish_political_comments) | | +| [DaLAJ](https://spraakbanken.gu.se/en/resources/superlim) | CC-BY-4.0 | +| [DanFEVER](https://aclanthology.org/2021.nodalida-main.47/) | CC-BY-4.0 | +| [LCC](https://github.com/fnielsen/lcc-sentiment) | CC-BY-4.0 | +| [Massive Scenario](https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.) | CC-BY-4.0 | +| [NoReC](https://aclanthology.org/L18-1661/) | CC-BY-NC-4.0 | +| [NorQuad](https://aclanthology.org/2023.nodalida-1.17/) | CC0-1.0 | +| [Norwegian courts](https://opus.nlpl.eu/ELRC-Courts_Norway-v1.php) | MIT | +| [Norwegian parliament](https://huggingface.co/datasets/NbAiLab/norwegian_parliament) | CC-BY-4.0 | +| [SNL Clustering](https://huggingface.co/datasets/navjordj/SNL_summarization) | CC-BY-NC | +| [SNL Retrieval](https://huggingface.co/datasets/navjordj/SNL_summarization) | CC-BY-NC | +| [ScaLA](https://aclanthology.org/2023.nodalida-1.20/) | CC-BY-SA-4.0 | +| [SweFAQ](https://spraakbanken.gu.se/en/resources/superlim) | CC-BY-4.0 | +| [SweReC](https://aclanthology.org/2023.nodalida-1.20/) | CC-BY-4.0 | +| [SwednClustering](https://spraakbanken.gu.se/en/resources/swedn) | CC-BY-4.0 | +| [SwednRetrieval](https://spraakbanken.gu.se/en/resources/swedn) | CC-BY-4.0 | +| [TV2Nord Retrieval](https://huggingface.co/datasets/alexandrainst/nordjylland-news-summarization) | 
Apache 2.0 | +| [Twitterhjerne](https://huggingface.co/datasets/sorenmulli/da-hashtag-twitterhjerne) | Upcoming | +| [VG Clustering](https://huggingface.co/datasets/navjordj/VG_summarization) | CC-BY-NC | + ## Dataset Disclaimer - We do not own or host any of the datasets which we use for this benchmark. From 452bfe25d2b7e3531c6fddb80dea0314b79895d3 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Tue, 21 May 2024 10:48:12 +0200 Subject: [PATCH 2/3] ci: remove macos due to it being slow --- .github/workflows/tests.yml | 2 +- src/seb/cache/all-MiniLM-L6-v2/LCC.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ea939d8c..dd5c434f 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -20,7 +20,7 @@ jobs: pull-requests: write strategy: matrix: - os: [ubuntu-latest, macos-latest] + os: [ubuntu-latest] python-version: ["3.9"] # , "3.10"] # This allows a subsequently queued workflow run to interrupt previous runs diff --git a/src/seb/cache/all-MiniLM-L6-v2/LCC.json b/src/seb/cache/all-MiniLM-L6-v2/LCC.json index 044e52d7..6611ebfa 100644 --- a/src/seb/cache/all-MiniLM-L6-v2/LCC.json +++ b/src/seb/cache/all-MiniLM-L6-v2/LCC.json @@ -1 +1 @@ -{"task_name":"LCC","task_description":"The leipzig corpora collection, annotated for sentiment","task_version":"1.1.1","time_of_run":"2024-04-08T19:18:32.717984","scores":{"da":{"accuracy":0.3846666666666666,"f1":0.3650136884557438,"accuracy_stderr":0.03664241622309678,"f1_stderr":0.03540233062350939,"main_score":0.3846666666666666}},"main_score":"accuracy"} \ No newline at end of file +{"task_name":"LCC","task_description":"The leipzig corpora collection, annotated for 
sentiment","task_version":"1.1.1","time_of_run":"2024-05-21T09:44:03.564974","scores":{"da":{"accuracy":0.3846666666666666,"f1":0.3650136884557438,"accuracy_stderr":0.03664241622309678,"f1_stderr":0.03540233062350939,"main_score":0.3846666666666666}},"main_score":"accuracy"} \ No newline at end of file From c3519357dd4eb13121e9b102c6d0e45bc2563e94 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Tue, 21 May 2024 10:52:52 +0200 Subject: [PATCH 3/3] fix docs dependencies --- pyproject.toml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 639ba5bd..a032fa7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ name = "MIT" dev = ["cruft>=2.0.0", "pyright==1.1.348", "ruff>=0.3.0"] tests = ["pytest>=7.1.3", "pytest-cov>=3.0.0"] docs = [ + "mkdocs==1.5.1", "mkdocs-jupyter==0.24.6", "mkdocs-material==9.1.21", "mkdocstrings[python]==0.22.0", @@ -50,7 +51,7 @@ mistral = [ openai = ["openai>=0.27.4"] cohere = ["cohere>=4.34"] sonar = [ - "fairseq2>=0.1.0", # requires sudo apt-get update -y; sudo apt install libsndfile1 + "fairseq2>=0.1.0", # requires sudo apt-get update -y; sudo apt install libsndfile1 "sonar-space>=0.2.1", ] # fairseq2 only works for linux at the moment @@ -167,5 +168,3 @@ build_command = "python -m pip install build; python -m build" [tool.setuptools] include-package-data = true - -