diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
index bbed4a471..292e8d698 100644
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -2,3 +2,5 @@
3bc18907354a40f1d89dca1833a2719ba7fb0933
# Reorder import statements with isort
68a72c5a603283f70abce2651dcde9c6f0177c41
+# Migrate code style to Black 24
+d4dbd73fe6a91964af82fbf6e6cb8d70b77569a3
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 000000000..2390d8c80
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,10 @@
+version: 2
+updates:
+ - package-ecosystem: "github-actions"
+ directory: "/"
+ schedule:
+ interval: "monthly"
+ groups:
+ github-actions:
+ patterns:
+ - "*"
diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml
index 77a16e4a1..f7c8a9966 100644
--- a/.github/workflows/cicd.yml
+++ b/.github/workflows/cicd.yml
@@ -10,18 +10,20 @@ on:
env:
PIPX_HOME: "/home/runner/.cache/pipx"
PIPX_BIN_DIR: "/home/runner/.local/bin"
- POETRY_VERSION: "1.5.1"
+ POETRY_VERSION: "1.8.2"
+permissions:
+ contents: read
jobs:
lint:
runs-on: ubuntu-22.04
name: lint with isort, Black & flake8
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
- name: "Prepare: restore caches, install Poetry, set up Python"
uses: ./.github/actions/prepare
with:
- python-version: "3.10"
+ python-version: "3.11"
poetry-version: ${{ env.POETRY_VERSION }}
- name: Install Python dev dependencies
run: |
@@ -40,12 +42,12 @@ jobs:
runs-on: ubuntu-22.04
name: check CLI startup time
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
- name: "Prepare: restore caches, install Poetry, set up Python"
id: prepare
uses: ./.github/actions/prepare
with:
- python-version: "3.9"
+ python-version: "3.10"
poetry-version: ${{ env.POETRY_VERSION }}
- name: Install Python dependencies
run: |
@@ -59,10 +61,10 @@ jobs:
timeout-minutes: 15
strategy:
matrix:
- python-version: ["3.8", "3.9", "3.10"]
+ python-version: ["3.9", "3.10", "3.11", "3.12"]
name: test on Python ${{ matrix.python-version }}
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
- name: Install system packages
run: |
sudo apt-get install \
@@ -77,32 +79,42 @@ jobs:
- name: Install Python dependencies
run: |
# Selectively install the optional dependencies for some Python versions
- # For Python 3.8:
- if [[ ${{ matrix.python-version }} == '3.8' ]]; then
- poetry install -E "nn omikuji yake voikko stwfsa";
- fi
# For Python 3.9:
if [[ ${{ matrix.python-version }} == '3.9' ]]; then
+ poetry install -E "nn omikuji yake voikko stwfsa";
+ fi
+ # For Python 3.10:
+ if [[ ${{ matrix.python-version }} == '3.10' ]]; then
poetry install -E "fasttext spacy";
# download the small English pretrained spaCy model needed by spacy analyzer
poetry run python -m spacy download en_core_web_sm --upgrade-strategy only-if-needed
fi
- # For Python 3.10:
- if [[ ${{ matrix.python-version }} == '3.10' ]]; then
- poetry install -E "nn omikuji yake stwfsa";
+ # For Python 3.11:
+ if [[ ${{ matrix.python-version }} == '3.11' ]]; then
+ poetry install -E "nn fasttext yake stwfsa voikko spacy";
+ # download the small English pretrained spaCy model needed by spacy analyzer
+ poetry run python -m spacy download en_core_web_sm --upgrade-strategy only-if-needed
+ fi
+ # For Python 3.12:
+ if [[ ${{ matrix.python-version }} == '3.12' ]]; then
+ poetry install -E "fasttext yake voikko spacy";
+ # download the small English pretrained spaCy model needed by spacy analyzer
+ poetry run python -m spacy download en_core_web_sm --upgrade-strategy only-if-needed
fi
poetry run python -m nltk.downloader punkt
- name: Test with pytest
run: |
poetry run pytest --cov=./ --cov-report xml
- if [[ ${{ matrix.python-version }} == '3.9' ]]; then
+ if [[ ${{ matrix.python-version }} == '3.10' ]]; then
poetry run pytest --cov=./ --cov-report xml --cov-append -m slow
fi
- name: Upload coverage to Codecov
- uses: codecov/codecov-action@81cd2dc8148241f03f5839d295e000b8f761e378 # v3.1.0
+ uses: codecov/codecov-action@c16abc29c95fcf9174b58eb7e1abf4c866893bc8 # v4.1.1
+ env:
+ CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
- name: Save cache
if: steps.prepare.outputs.cache-matched-key != format('poetry-installation-and-cache-{0}-{1}-{2}', matrix.python-version, env.POETRY_VERSION, hashFiles('**/poetry.lock'))
- uses: actions/cache/save@v3
+ uses: actions/cache/save@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2
with:
path: |
~/.cache/pipx/venvs
@@ -117,7 +129,7 @@ jobs:
timeout-minutes: 15
steps:
- name: "Build image for testing"
- uses: docker/build-push-action@c56af957549030174b10d6867f20e78cfd7debc5 # v3.2.0
+ uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # v5.3.0
with:
push: false
tags: test-image
@@ -133,20 +145,20 @@ jobs:
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
steps:
- name: Login to Quay.io
- uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a # v2.1.0
+ uses: docker/login-action@e92390c5fb421da1463c202d546fed0ec5c39f20 # v3.1.0
with:
registry: quay.io
username: ${{ secrets.YHTEENTOIMIVUUSPALVELUT_QUAY_IO_USERNAME }}
password: ${{ secrets.YHTEENTOIMIVUUSPALVELUT_QUAY_IO_PASSWORD }}
- name: Docker meta
id: meta
- uses: docker/metadata-action@57396166ad8aefe6098280995947635806a0e6ea # v4.1.1
+ uses: docker/metadata-action@8e5442c4ef9f78752691e2d8f8d19755c6f78e81 # v5.5.1
with:
images: quay.io/natlibfi/annif
tags: |
latest
- name: Build and push to Quay.io
- uses: docker/build-push-action@c56af957549030174b10d6867f20e78cfd7debc5 # v3.2.0
+ uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # v5.3.0
with:
push: true
tags: ${{ steps.meta.outputs.tags }}
@@ -158,7 +170,7 @@ jobs:
runs-on: ubuntu-22.04
if: github.event_name == 'push' && contains(github.ref, 'refs/tags/')
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
- name: "Prepare: restore caches, install Poetry, set up Python"
uses: ./.github/actions/prepare
with:
@@ -172,14 +184,14 @@ jobs:
poetry publish --build
- name: Login to Quay.io
- uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a # v2.1.0
+ uses: docker/login-action@e92390c5fb421da1463c202d546fed0ec5c39f20 # v3.1.0
with:
registry: quay.io
username: ${{ secrets.YHTEENTOIMIVUUSPALVELUT_QUAY_IO_USERNAME }}
password: ${{ secrets.YHTEENTOIMIVUUSPALVELUT_QUAY_IO_PASSWORD }}
- name: Docker meta
id: meta
- uses: docker/metadata-action@57396166ad8aefe6098280995947635806a0e6ea # v4.1.1
+ uses: docker/metadata-action@8e5442c4ef9f78752691e2d8f8d19755c6f78e81 # v5.5.1
with:
images: quay.io/natlibfi/annif
tags: |
@@ -187,7 +199,7 @@ jobs:
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
- name: Build and push to Quay.io
- uses: docker/build-push-action@c56af957549030174b10d6867f20e78cfd7debc5 # v3.2.0
+ uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # v5.3.0
with:
push: true
tags: ${{ steps.meta.outputs.tags }}
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 05cc39b7e..6eacc6fdd 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -8,6 +8,9 @@ on:
schedule:
- cron: "22 11 * * 6"
+permissions:
+ contents: read
+
jobs:
analyze:
name: Analyze
@@ -24,18 +27,18 @@ jobs:
steps:
- name: Checkout
- uses: actions/checkout@v3
+ uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
- name: Initialize CodeQL
- uses: github/codeql-action/init@v2
+ uses: github/codeql-action/init@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9
with:
languages: ${{ matrix.language }}
queries: +security-and-quality
- name: Autobuild
- uses: github/codeql-action/autobuild@v2
+ uses: github/codeql-action/autobuild@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9
- name: Perform CodeQL Analysis
- uses: github/codeql-action/analyze@v2
+ uses: github/codeql-action/analyze@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9
with:
category: "/language:${{ matrix.language }}"
diff --git a/.github/workflows/docker-rebuild.yml b/.github/workflows/docker-rebuild.yml
index 2a3f53b79..2b4301bb0 100644
--- a/.github/workflows/docker-rebuild.yml
+++ b/.github/workflows/docker-rebuild.yml
@@ -1,5 +1,8 @@
name: "Docker rebuild"
on: workflow_dispatch
+permissions:
+ contents: read
+
jobs:
rebuild-docker-images:
name: "Docker rebuild"
@@ -7,7 +10,7 @@ jobs:
timeout-minutes: 15
steps:
- name: "Build for testing"
- uses: docker/build-push-action@c56af957549030174b10d6867f20e78cfd7debc5 # v3.2.0
+ uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # v5.3.0
with:
push: false
tags: test-image
@@ -15,14 +18,14 @@ jobs:
run: |
docker run --rm --workdir /Annif test-image pytest -p no:cacheprovider
- name: Login to Quay.io
- uses: docker/login-action@465a07811f14bebb1938fbed4728c6a1ff8901fc # v2.2.0
+ uses: docker/login-action@e92390c5fb421da1463c202d546fed0ec5c39f20 # v3.1.0
with:
registry: quay.io
username: ${{ secrets.YHTEENTOIMIVUUSPALVELUT_QUAY_IO_USERNAME }}
password: ${{ secrets.YHTEENTOIMIVUUSPALVELUT_QUAY_IO_PASSWORD }}
- name: Docker meta
id: meta
- uses: docker/metadata-action@2c0bd771b40637d97bf205cbccdd294a32112176 # v4.5.0
+ uses: docker/metadata-action@8e5442c4ef9f78752691e2d8f8d19755c6f78e81 # v5.5.1
with:
images: quay.io/natlibfi/annif
flavor: |
@@ -32,7 +35,7 @@ jobs:
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
- name: Build and push to Quay.io
- uses: docker/build-push-action@44ea916f6c540f9302d50c2b1e5a8dc071f15cdf # v4.1.0
+ uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # v5.3.0
with:
push: true
tags: ${{ steps.meta.outputs.tags }}
diff --git a/.scrutinizer.yml b/.scrutinizer.yml
index ebf755aa4..095b59f43 100644
--- a/.scrutinizer.yml
+++ b/.scrutinizer.yml
@@ -4,7 +4,7 @@ checks:
duplicate_code: true
build:
environment:
- python: 3.8.12
+ python: 3.9.17
dependencies:
override:
- pip install .[dev]
diff --git a/CITATION.cff b/CITATION.cff
index 71a2bd3fd..69c88f335 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -33,11 +33,11 @@ authors:
affiliation: "National Library of Finland"
title: "Annif"
abstract: "Annif is an automatic indexing software."
-version: 1.0.0-dev
+version: 1.2.0-dev
license:
- Apache-2.0
- GPL-3.0
-date-released: 2023-04-18
+date-released: 2024-04-25
doi: 10.5281/zenodo.2578948
repository-code: "https://github.com/NatLibFi/Annif"
contact:
diff --git a/Dockerfile b/Dockerfile
index dbd98cb07..5ea240ed9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.10-slim-bullseye
+FROM python:3.10-slim-bookworm
LABEL org.opencontainers.image.authors="grp-natlibfi-annif@helsinki.fi"
SHELL ["/bin/bash", "-c"]
@@ -50,8 +50,11 @@ RUN annif completion --bash >> /etc/bash.bashrc # Enable tab completion
RUN groupadd -g 998 annif_user && \
useradd -r -u 998 -g annif_user annif_user && \
chmod -R a+rX /Annif && \
- mkdir -p /Annif/tests/data && \
+ mkdir -p /Annif/tests/data /Annif/projects.d && \
chown -R annif_user:annif_user /annif-projects /Annif/tests/data
USER annif_user
+ENV HF_HOME="/tmp"
+
+ENV GUNICORN_CMD_ARGS="--worker-class uvicorn.workers.UvicornWorker"
CMD annif
diff --git a/LICENSE.txt b/LICENSE.txt
index 861a227e5..6ac606f41 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,4 +1,4 @@
-Copyright (c) 2017-2022 University Of Helsinki (The National Library Of Finland)
+Copyright (c) 2017-2024 University Of Helsinki (The National Library Of Finland)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
diff --git a/README.md b/README.md
index 6d0eb5f2e..70b35a3f8 100644
--- a/README.md
+++ b/README.md
@@ -2,10 +2,12 @@
[![DOI](https://zenodo.org/badge/100936800.svg)](https://zenodo.org/badge/latestdoi/100936800)
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+[![Container image](https://img.shields.io/badge/container_image-quay.io-blue.svg)](https://quay.io/repository/natlibfi/annif)
[![CI/CD](https://github.com/NatLibFi/Annif/actions/workflows/cicd.yml/badge.svg)](https://github.com/NatLibFi/Annif/actions/workflows/cicd.yml)
[![codecov](https://codecov.io/gh/NatLibFi/Annif/branch/main/graph/badge.svg)](https://codecov.io/gh/NatLibFi/Annif)
-[![Code Climate](https://codeclimate.com/github/NatLibFi/Annif/badges/gpa.svg)](https://codeclimate.com/github/NatLibFi/Annif)
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/NatLibFi/Annif/badges/quality-score.png?b=main)](https://scrutinizer-ci.com/g/NatLibFi/Annif/?branch=main)
+[![Code Climate](https://codeclimate.com/github/NatLibFi/Annif/badges/gpa.svg)](https://codeclimate.com/github/NatLibFi/Annif)
+[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/NatLibFi/Annif/badge)](https://securityscorecards.dev/viewer/?uri=github.com/NatLibFi/Annif)
[![codebeat badge](https://codebeat.co/badges/7a8ef539-0094-48b8-84c2-c413b4a50d57)](https://codebeat.co/projects/github-com-natlibfi-annif-main)
[![CodeQL](https://github.com/NatLibFi/Annif/actions/workflows/codeql.yml/badge.svg)](https://github.com/NatLibFi/Annif/actions/workflows/codeql.yml)
[![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=NatLibFi_Annif&metric=alert_status)](https://sonarcloud.io/dashboard?id=NatLibFi_Annif)
@@ -17,16 +19,16 @@ a statistical automated indexing tool that used metadata from the
[Finna.fi](https://finna.fi) discovery interface as a training corpus.
This repo contains a rewritten production version of Annif based on the
-[prototype](https://github.com/osma/annif). It is a work in progress, but
-already functional for many common tasks.
+[prototype](https://github.com/osma/annif).
-[Finto AI](https://ai.finto.fi/) is a service based on Annif; see the [source code for Finto AI](https://github.com/NatLibFi/FintoAI).
+[Finto AI](https://ai.finto.fi/) is a service based on Annif; see the [source code of Finto AI](https://github.com/NatLibFi/FintoAI)
+and the [🤗 Hugging Face Hub collection](https://huggingface.co/collections/NatLibFi/annif-models-65b35fb98b7c508c8e8a1570) containing the models Finto AI uses.
# Basic install
Annif is developed and tested on Linux. If you want to run Annif on Windows or Mac OS, the recommended way is to use Docker (see below) or a Linux virtual machine.
-You will need Python 3.8+ to install Annif.
+You will need Python 3.9-3.12 to install Annif.
The recommended way is to install Annif from
[PyPI](https://pypi.org/project/annif/) into a virtual environment.
@@ -71,7 +73,7 @@ For details and usage for other shells see
[Click documentation](https://click.palletsprojects.com/en/8.1.x/shell-completion/).
# Docker install
-You can use Annif as a pre-built Docker container. Please see the
+You can use Annif as a pre-built Docker container image from the [quay.io/natlibfi/annif](https://quay.io/repository/natlibfi/annif) repository. Please see the
[wiki documentation](https://github.com/NatLibFi/Annif/wiki/Usage-with-Docker)
for details.
@@ -130,7 +132,7 @@ Many resources are available:
# Publications / How to cite
-Two articles about Annif have been published in peer-reviewed Open Access
+See below for some articles about Annif in peer-reviewed Open Access
journals. The software itself is also archived on Zenodo and
has a [citable DOI](https://doi.org/10.5281/zenodo.5654173).
@@ -141,6 +143,24 @@ See "Cite this repository" in the details of the repository.
## Annif articles
-
+Golub, K.; Suominen, O.; Mohammed, A.; Aagaard, H.; Osterman, O., 2024.
+Automated Dewey Decimal Classification of Swedish library metadata using Annif software.
+Journal of Documentation, in press.
+https://doi.org/10.1108/JD-01-2022-0026
+
+See BibTex
+
+ @article{golub2024annif,
+ title={Automated Dewey Decimal Classification of Swedish library metadata using Annif software},
+ author={Golub, Koraljka and Suominen, Osma and Mohammed, Ahmed Taiye and Aagaard, Harriet and Osterman, Olof},
+ journal={J. Doc.},
+ year={in press},
+ doi = {10.1108/JD-01-2022-0026},
+ url={https://www.emerald.com/insight/content/doi/10.1108/JD-01-2022-0026},
+ }
+
+
+-
Suominen, O.; Inkinen, J.; Lehtinen, M., 2022.
Annif and Finto AI: Developing and Implementing Automated Subject Indexing.
JLIS.It, 13(1), pp. 265–282. URL:
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 000000000..3a723f89d
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,56 @@
+# Security Policy
+
+## Supported Versions
+
+The [most recent Annif major/minor release](https://github.com/NatLibFi/Annif/releases)
+is considered supported,
+in the sense that if a serious bug or vulnerability is encountered in it,
+we release a patch to fix the issue.
+
+Generally, we aim to update all dependencies to their latest versions on each Annif major/minor release.
+However, note that most of the [dependencies of a given Annif release](https://github.com/NatLibFi/Annif/blob/main/pyproject.toml)
+are pinned only on minor version level, so patch level fixes of (most) dependencies can be applied to an Annif installation,
+by either manually updating the outdated packages or recreating the virtual environment from scratch and reinstalling Annif.
+
+### Docker image
+We rebuild and publish a new Docker image of the latest Annif release in the
+[quay.io repository](https://quay.io/repository/natlibfi/annif?tab=info)
+when it is considered necessary in order to update both system packages and Annif dependencies of the image.
+A new image is published about once every month.
+
+The security scanner that is used on quay.io is
+[Clair](https://access.redhat.com/documentation/en-us/red_hat_quay/3/html/about_quay_io/clair-vulnerability-scanner).
+You can see the vulnerabilities detected in an image by navigating via the link in the Security Scan column of the [tags view](https://quay.io/repository/natlibfi/annif?tab=tags),
+see the screenshot below.
+
+The scanner typically detects many vulnerabilities, that is several tens, in the packages of the images, even when they have been rebuilt recently.
+However, there exist patches for only some of the vulnerabilities,
+and due to the way that Annif uses the dependencies, most of the detected vulnerabilities
+do not apply to Annif use.
+
+
+
+## Reporting a Vulnerability
+
+We value your findings, and we would be grateful if you report
+any concerns or vulnerabilities by email to **`finto-posti@helsinki.fi`**.
+_Do not create a GitHub issue for security vulnerabilities_.
+Note that the Annif team is a part of the larger Finto team,
+which has resources for the contact service throughout the year.
+
+If the security vulnerability is in a third-party software library,
+please report it also to the team maintaining it.
+
+Each security concern will be assigned to a handler from our team,
+who will contact you if there is a need for additional information.
+We confirm the problem and keep you informed of the fix.
+
+To facilitate a quick and accurate response make sure to include the following details when submitting your report:
+
+- A clear and descriptive title that outlines the report's subject and the software it pertains to (Annif).
+- The version(s) of Annif, its dependencies and the (possible) other related software that contribute to the vulnerability.
+- Break down the technical aspects of the vulnerability in your description.
+- A minimal example showcasing the vulnerability.
+- An explanation of who has the potential to exploit this vulnerability and the benefits they would derive from doing so.
+- Whether the vulnerability is public knowledge or known to third parties, and if so, share relevant details.
+- (A remediation suggestion if you have one.)
diff --git a/annif/__init__.py b/annif/__init__.py
index a71b9f379..120c7d4cf 100644
--- a/annif/__init__.py
+++ b/annif/__init__.py
@@ -7,6 +7,8 @@
import os.path
from typing import TYPE_CHECKING
+from flask import Flask
+
logging.basicConfig()
logger = logging.getLogger("annif")
logger.setLevel(level=logging.INFO)
@@ -14,12 +16,11 @@
import annif.backend # noqa
if TYPE_CHECKING:
- from flask.app import Flask
+ from connexion.apps.flask import FlaskApp
def create_flask_app(config_name: str | None = None) -> Flask:
"""Create a Flask app to be used by the CLI."""
- from flask import Flask
_set_tensorflow_loglevel()
@@ -31,29 +32,41 @@ def create_flask_app(config_name: str | None = None) -> Flask:
return app
-def create_app(config_name: str | None = None) -> Flask:
+def create_cx_app(config_name: str | None = None) -> FlaskApp:
"""Create a Connexion app to be used for the API."""
- # 'cxapp' here is the Connexion application that has a normal Flask app
- # as a property (cxapp.app)
import connexion
- from flask_cors import CORS
+ from connexion.datastructures import MediaTypeDict
+ from connexion.middleware import MiddlewarePosition
+ from connexion.validators import FormDataValidator, MultiPartFormDataValidator
+ from starlette.middleware.cors import CORSMiddleware
+ import annif.registry
from annif.openapi.validation import CustomRequestBodyValidator
specdir = os.path.join(os.path.dirname(__file__), "openapi")
- cxapp = connexion.App(__name__, specification_dir=specdir)
+ cxapp = connexion.FlaskApp(__name__, specification_dir=specdir)
config_name = _get_config_name(config_name)
logger.debug(f"creating connexion app with configuration {config_name}")
cxapp.app.config.from_object(config_name)
cxapp.app.config.from_envvar("ANNIF_SETTINGS", silent=True)
validator_map = {
- "body": CustomRequestBodyValidator,
+ "body": MediaTypeDict(
+ {
+ "*/*json": CustomRequestBodyValidator,
+ "application/x-www-form-urlencoded": FormDataValidator,
+ "multipart/form-data": MultiPartFormDataValidator,
+ }
+ ),
}
cxapp.add_api("annif.yaml", validator_map=validator_map)
# add CORS support
- CORS(cxapp.app)
+ cxapp.add_middleware(
+ CORSMiddleware,
+ position=MiddlewarePosition.BEFORE_EXCEPTION,
+ allow_origins=["*"],
+ )
if cxapp.app.config["INITIALIZE_PROJECTS"]:
annif.registry.initialize_projects(cxapp.app)
@@ -64,8 +77,11 @@ def create_app(config_name: str | None = None) -> Flask:
cxapp.app.register_blueprint(bp)
- # return the Flask app
- return cxapp.app
+ # return the Connexion app
+ return cxapp
+
+
+create_app = create_cx_app # Alias to allow starting directly with uvicorn run
def _get_config_name(config_name: str | None) -> str:
diff --git a/annif/analyzer/__init__.py b/annif/analyzer/__init__.py
index a0f93ced3..27a2cd792 100644
--- a/annif/analyzer/__init__.py
+++ b/annif/analyzer/__init__.py
@@ -1,4 +1,5 @@
"""Collection of language-specific analyzers and analyzer registry for Annif"""
+
from __future__ import annotations
import re
diff --git a/annif/analyzer/analyzer.py b/annif/analyzer/analyzer.py
index 5ba876f9d..25bdb6b57 100644
--- a/annif/analyzer/analyzer.py
+++ b/annif/analyzer/analyzer.py
@@ -1,4 +1,5 @@
"""Common functionality for analyzers."""
+
from __future__ import annotations
import abc
diff --git a/annif/analyzer/simple.py b/annif/analyzer/simple.py
index 4cc35e6f1..c3ff7240a 100644
--- a/annif/analyzer/simple.py
+++ b/annif/analyzer/simple.py
@@ -1,4 +1,5 @@
"""Simple analyzer for Annif. Only folds words to lower case."""
+
from __future__ import annotations
from . import analyzer
diff --git a/annif/analyzer/simplemma.py b/annif/analyzer/simplemma.py
index fff0a2638..3e1536882 100644
--- a/annif/analyzer/simplemma.py
+++ b/annif/analyzer/simplemma.py
@@ -1,4 +1,5 @@
"""Simplemma analyzer for Annif, based on simplemma lemmatizer."""
+
from __future__ import annotations
import annif.simplemma_util
diff --git a/annif/analyzer/snowball.py b/annif/analyzer/snowball.py
index 57990c2a1..1a17702eb 100644
--- a/annif/analyzer/snowball.py
+++ b/annif/analyzer/snowball.py
@@ -1,4 +1,5 @@
"""Snowball analyzer for Annif, based on nltk Snowball stemmer."""
+
from __future__ import annotations
import functools
diff --git a/annif/analyzer/spacy.py b/annif/analyzer/spacy.py
index b5e9cbc55..184f03ffc 100644
--- a/annif/analyzer/spacy.py
+++ b/annif/analyzer/spacy.py
@@ -1,4 +1,5 @@
"""spaCy analyzer for Annif which uses spaCy for lemmatization"""
+
from __future__ import annotations
import annif.util
diff --git a/annif/analyzer/voikko.py b/annif/analyzer/voikko.py
index e6e693d65..b3e7d5007 100644
--- a/annif/analyzer/voikko.py
+++ b/annif/analyzer/voikko.py
@@ -1,4 +1,5 @@
"""Voikko analyzer for Annif, based on libvoikko library."""
+
from __future__ import annotations
import functools
diff --git a/annif/backend/__init__.py b/annif/backend/__init__.py
index cbeeb648e..7be1264b4 100644
--- a/annif/backend/__init__.py
+++ b/annif/backend/__init__.py
@@ -1,4 +1,5 @@
"""Registry of backend types for Annif"""
+
from __future__ import annotations
from typing import TYPE_CHECKING, Type
diff --git a/annif/backend/backend.py b/annif/backend/backend.py
index 6a63c86b2..69f730d5d 100644
--- a/annif/backend/backend.py
+++ b/annif/backend/backend.py
@@ -1,4 +1,5 @@
"""Common functionality for backends."""
+
from __future__ import annotations
import abc
@@ -53,14 +54,15 @@ def params(self) -> dict[str, Any]:
@property
def _model_file_paths(self) -> list:
- all_paths = glob(os.path.join(self.datadir, "*"))
+ all_paths = glob(os.path.join(self.datadir, "**"), recursive=True)
+ file_paths = [p for p in all_paths if os.path.isfile(p)]
ignore_patterns = ("*-train*", "tmp-*", "vectorizer")
ignore_paths = [
path
for igp in ignore_patterns
for path in glob(os.path.join(self.datadir, igp))
]
- return list(set(all_paths) - set(ignore_paths))
+ return list(set(file_paths) - set(ignore_paths))
@property
def is_trained(self) -> bool:
diff --git a/annif/backend/dummy.py b/annif/backend/dummy.py
index bb52e22e6..d10ce8d16 100644
--- a/annif/backend/dummy.py
+++ b/annif/backend/dummy.py
@@ -1,4 +1,5 @@
"""Dummy backend for testing basic interaction of projects and backends"""
+
from __future__ import annotations
from typing import TYPE_CHECKING, Any
diff --git a/annif/backend/ensemble.py b/annif/backend/ensemble.py
index 6f7f2eb04..a25e8a03f 100644
--- a/annif/backend/ensemble.py
+++ b/annif/backend/ensemble.py
@@ -1,4 +1,5 @@
"""Ensemble backend that combines results from multiple projects"""
+
from __future__ import annotations
from typing import TYPE_CHECKING, Any
@@ -123,7 +124,7 @@ def _format_cfg_line(self, hps: dict[str, float]) -> str:
def _objective(self, trial: Trial) -> float:
eval_batch = annif.eval.EvaluationBatch(self._backend.project.subjects)
proj_weights = {
- project_id: trial.suggest_uniform(project_id, 0.0, 1.0)
+ project_id: trial.suggest_float(project_id, 0.0, 1.0)
for project_id in self._sources
}
for gold_batch, src_batches in zip(self._gold_batches, self._source_batches):
diff --git a/annif/backend/fasttext.py b/annif/backend/fasttext.py
index 23c33539a..e102b02ba 100644
--- a/annif/backend/fasttext.py
+++ b/annif/backend/fasttext.py
@@ -1,4 +1,5 @@
"""Annif backend using the fastText classifier"""
+
from __future__ import annotations
import collections
diff --git a/annif/backend/http.py b/annif/backend/http.py
index 0fce7f8e4..9036ec152 100644
--- a/annif/backend/http.py
+++ b/annif/backend/http.py
@@ -1,5 +1,6 @@
"""HTTP/REST client backend that makes calls to a web service
and returns the results"""
+
from __future__ import annotations
import importlib
diff --git a/annif/backend/hyperopt.py b/annif/backend/hyperopt.py
index 2c2e7422c..efbc10513 100644
--- a/annif/backend/hyperopt.py
+++ b/annif/backend/hyperopt.py
@@ -1,4 +1,5 @@
"""Hyperparameter optimization functionality for backends"""
+
from __future__ import annotations
import abc
diff --git a/annif/backend/mixins.py b/annif/backend/mixins.py
index 066d5d862..e4af03d26 100644
--- a/annif/backend/mixins.py
+++ b/annif/backend/mixins.py
@@ -1,4 +1,5 @@
"""Annif backend mixins that can be used to implement features"""
+
from __future__ import annotations
import abc
@@ -72,9 +73,14 @@ def initialize_vectorizer(self) -> None:
)
def create_vectorizer(
- self, input: Iterable[str], params: dict[str, Any] = {}
+ self, input: Iterable[str], params: dict[str, Any] = None
) -> csr_matrix:
self.info("creating vectorizer")
+ if params is None:
+ params = {}
+ # avoid UserWarning when overriding tokenizer
+ if "tokenizer" in params:
+ params["token_pattern"] = None
self.vectorizer = TfidfVectorizer(**params)
veccorpus = self.vectorizer.fit_transform(input)
annif.util.atomic_save(
diff --git a/annif/backend/mllm.py b/annif/backend/mllm.py
index cbcef11b1..cd274cf65 100644
--- a/annif/backend/mllm.py
+++ b/annif/backend/mllm.py
@@ -1,4 +1,5 @@
"""Maui-like Lexical Matching backend"""
+
from __future__ import annotations
import os.path
diff --git a/annif/backend/nn_ensemble.py b/annif/backend/nn_ensemble.py
index 169eb8234..1fea38c5b 100644
--- a/annif/backend/nn_ensemble.py
+++ b/annif/backend/nn_ensemble.py
@@ -1,25 +1,34 @@
"""Neural network based ensemble backend that combines results from multiple
projects."""
+
from __future__ import annotations
+import importlib.metadata
+import json
import os.path
import shutil
+import zipfile
from io import BytesIO
from typing import TYPE_CHECKING, Any
import joblib
+import keras.backend as K
import lmdb
import numpy as np
-import tensorflow.keras.backend as K
+from keras.layers import Add, Dense, Dropout, Flatten, Input, Layer
+from keras.models import Model
+from keras.saving import load_model
+from keras.utils import Sequence
from scipy.sparse import csc_matrix, csr_matrix
-from tensorflow.keras.layers import Add, Dense, Dropout, Flatten, Input, Layer
-from tensorflow.keras.models import Model, load_model
-from tensorflow.keras.utils import Sequence
import annif.corpus
import annif.parallel
import annif.util
-from annif.exception import NotInitializedException, NotSupportedException
+from annif.exception import (
+ NotInitializedException,
+ NotSupportedException,
+ OperationFailedException,
+)
from annif.suggestion import SuggestionBatch, vector_to_suggestions
from . import backend, ensemble
@@ -29,6 +38,8 @@
from annif.corpus.document import DocumentCorpus
+logger = annif.logger
+
def idx_to_key(idx: int) -> bytes:
"""convert an integer index to a binary key for use in LMDB"""
@@ -97,7 +108,7 @@ class NNEnsembleBackend(backend.AnnifLearningBackend, ensemble.BaseEnsembleBacke
name = "nn_ensemble"
- MODEL_FILE = "nn-model.h5"
+ MODEL_FILE = "nn-model.keras"
LMDB_FILE = "nn-train.mdb"
DEFAULT_PARAMETERS = {
@@ -127,9 +138,20 @@ def initialize(self, parallel: bool = False) -> None:
backend_id=self.backend_id,
)
self.debug("loading Keras model from {}".format(model_filename))
- self._model = load_model(
- model_filename, custom_objects={"MeanLayer": MeanLayer}
- )
+        try:
+            self._model = load_model(
+                model_filename, custom_objects={"MeanLayer": MeanLayer}
+            )
+        except Exception as err:  # re-raise with version diagnostics attached
+            metadata = self.get_model_metadata(model_filename)
+            keras_version = importlib.metadata.version("keras")
+            message = (
+                f"loading Keras model from {model_filename}; "
+                f"model metadata: {metadata}; "
+                f"you have Keras version {keras_version}. "
+                f'Original error message: "{err}"'
+            )
+            raise OperationFailedException(message, backend_id=self.backend_id) from err
def _merge_source_batches(
self,
@@ -287,3 +309,16 @@ def _learn(
self._fit_model(
corpus, int(params["learn-epochs"]), int(params["lmdb_map_size"])
)
+
+    def get_model_metadata(self, model_filename: str) -> dict | None:
+        """Read the metadata.json member of a Keras model archive.
+
+        Returns the parsed metadata, or None (after a warning) on any failure."""
+        try:
+            with zipfile.ZipFile(model_filename, "r") as archive:
+                with archive.open("metadata.json") as metadata_file:
+                    metadata_str = metadata_file.read().decode("utf-8")
+            return json.loads(metadata_str)
+        except Exception:
+            self.warning(f"Failed to read metadata from {model_filename}")
+            return None
diff --git a/annif/backend/omikuji.py b/annif/backend/omikuji.py
index 7a2e6a1bb..89d8f0ea9 100644
--- a/annif/backend/omikuji.py
+++ b/annif/backend/omikuji.py
@@ -1,4 +1,5 @@
"""Annif backend using the Omikuji classifier"""
+
from __future__ import annotations
import os.path
diff --git a/annif/backend/pav.py b/annif/backend/pav.py
index 61f4362d1..2ee30337a 100644
--- a/annif/backend/pav.py
+++ b/annif/backend/pav.py
@@ -2,6 +2,7 @@
learns which concept suggestions from each backend are trustworthy using the
PAV algorithm, a.k.a. isotonic regression, to turn raw scores returned by
individual backends into probabilities."""
+
from __future__ import annotations
import os.path
@@ -69,13 +70,15 @@ def _merge_source_batches(
reg_models = self._get_model(project_id)
pav_batch = [
[
- SubjectSuggestion(
- subject_id=sugg.subject_id,
- score=reg_models[sugg.subject_id].predict([sugg.score])[0],
- )
- if sugg.subject_id in reg_models
- else SubjectSuggestion(
- subject_id=sugg.subject_id, score=sugg.score
+ (
+ SubjectSuggestion(
+ subject_id=sugg.subject_id,
+ score=reg_models[sugg.subject_id].predict([sugg.score])[0],
+ )
+ if sugg.subject_id in reg_models
+ else SubjectSuggestion(
+ subject_id=sugg.subject_id, score=sugg.score
+ )
) # default to raw score
for sugg in result
]
diff --git a/annif/backend/svc.py b/annif/backend/svc.py
index e2f6c33a8..c18f1410b 100644
--- a/annif/backend/svc.py
+++ b/annif/backend/svc.py
@@ -1,4 +1,5 @@
"""Annif backend using a SVM classifier"""
+
from __future__ import annotations
import os.path
@@ -67,7 +68,7 @@ def _corpus_to_texts_and_classes(
def _train_classifier(self, veccorpus: csr_matrix, classes: list[int]) -> None:
self.info("creating classifier")
- self._model = LinearSVC()
+ self._model = LinearSVC(dual="auto")
self._model.fit(veccorpus, classes)
annif.util.atomic_save(
self._model, self.datadir, self.MODEL_FILE, method=joblib.dump
diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py
index 1cca639ca..a77704446 100644
--- a/annif/backend/tfidf.py
+++ b/annif/backend/tfidf.py
@@ -1,5 +1,6 @@
"""Backend that returns most similar subjects based on similarity in sparse
TF-IDF normalized bag-of-words vector space"""
+
from __future__ import annotations
import os.path
diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 8f7d38c50..c8b933c9b 100644
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -1,4 +1,5 @@
"""Annif backend using Yake keyword extraction"""
+
# For license remarks of this backend see README.md:
# https://github.com/NatLibFi/Annif#license.
from __future__ import annotations
@@ -51,7 +52,7 @@ def is_trained(self):
@property
def label_types(self) -> list[URIRef]:
- if type(self.params["label_types"]) == str: # Label types set by user
+ if isinstance(self.params["label_types"], str): # Label types set by user
label_types = [lt.strip() for lt in self.params["label_types"].split(",")]
self._validate_label_types(label_types)
else:
diff --git a/annif/cli.py b/annif/cli.py
index ebed088ba..673dc42fb 100644
--- a/annif/cli.py
+++ b/annif/cli.py
@@ -1,7 +1,6 @@
"""Definitions for command-line (Click) commands for invoking Annif
operations and printing the results to console."""
-
import collections
import importlib
import json
@@ -18,23 +17,24 @@
import annif.parallel
import annif.project
import annif.registry
-from annif import cli_util
-from annif.exception import NotInitializedException, NotSupportedException
+from annif import cli_util, hfh_util
+from annif.exception import (
+ NotInitializedException,
+ NotSupportedException,
+ OperationFailedException,
+)
from annif.project import Access
from annif.util import metric_code
logger = annif.logger
click_log.basic_config(logger)
-
-if len(sys.argv) > 1 and sys.argv[1] in ("run", "routes"):
- create_app = annif.create_app # Use Flask with Connexion
-else:
- # Connexion is not needed for most CLI commands, use plain Flask
- create_app = annif.create_flask_app
-
-cli = FlaskGroup(create_app=create_app, add_version_option=False)
+create_app = annif.create_flask_app
+cli = FlaskGroup(
+ create_app=create_app, add_default_commands=False, add_version_option=False
+)
cli = click.version_option(message="%(version)s")(cli)
+cli.params = [opt for opt in cli.params if opt.name not in ("env_file", "app")]
@cli.command("list-projects")
@@ -443,6 +443,22 @@ def run_eval(
)
+@cli.command("run")
+@click.option("--host", type=str, default="127.0.0.1")
+@click.option("--port", type=int, default=5000)
+@click.option("--log-level")
+@click_log.simple_verbosity_option(logger, default="ERROR")
+def run_app(**kwargs):
+    """
+    Run Annif in server mode for development.
+    \f
+    The server is for development purposes only.
+    """
+    # Drop options whose value is None (e.g. --log-level when it was not given)
+    # so that they are not forwarded to run() as explicit None arguments
+    kwargs = {k: v for k, v in kwargs.items() if v is not None}
+    cxapp = annif.create_cx_app()
+    cxapp.run(**kwargs)
+
+
FILTER_BATCH_MAX_LIMIT = 15
OPTIMIZE_METRICS = ["Precision (doc avg)", "Recall (doc avg)", "F1 score (doc avg)"]
@@ -583,11 +599,129 @@ def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_fi
click.echo("---")
+@cli.command("upload")
+@click.argument("project_ids_pattern", shell_complete=cli_util.complete_param)
+@click.argument("repo_id")
+@click.option(
+    "--token",
+    help="""Authentication token, obtained from the Hugging Face Hub.
+    Will default to the stored token.""",
+)
+@click.option(
+    "--revision",
+    help="""An optional git revision to commit from. Defaults to the head of the "main"
+    branch.""",
+)
+@click.option(
+    "--commit-message",
+    help="""The summary / title / first line of the generated commit.""",
+)
+@cli_util.common_options
+def run_upload(project_ids_pattern, repo_id, token, revision, commit_message):
+    """
+    Upload selected projects and their vocabularies to a Hugging Face Hub repository.
+    \f
+    This command zips the project directories and vocabularies of the projects
+    that match the given `project_ids_pattern` to archive files, and uploads the
+    archives along with the project configurations to the specified Hugging Face
+    Hub repository. An authentication token and commit message can be given with
+    options.
+    """
+    # NOTE(review): imported inside the command, presumably so that
+    # huggingface_hub is only loaded when this command is run - confirm
+    from huggingface_hub import HfApi
+    from huggingface_hub.utils import HfHubHTTPError, HFValidationError
+
+    projects = hfh_util.get_matching_projects(project_ids_pattern)
+    click.echo(f"Uploading project(s): {', '.join([p.project_id for p in projects])}")
+
+    # Fall back to a generated commit message when none was given
+    commit_message = (
+        commit_message
+        if commit_message is not None
+        else f"Upload project(s) {project_ids_pattern} with Annif"
+    )
+
+    # Bound before the try block so that the finally clause can iterate over
+    # fobjs even when prepare_commits() raises before returning
+    fobjs, operations = [], []
+    try:
+        fobjs, operations = hfh_util.prepare_commits(projects, repo_id)
+        api = HfApi()
+        api.create_commit(
+            repo_id=repo_id,
+            operations=operations,
+            commit_message=commit_message,
+            revision=revision,
+            token=token,
+        )
+    except (HfHubHTTPError, HFValidationError) as err:
+        # Re-raise Hub errors as an Annif exception carrying the same message
+        raise OperationFailedException(str(err))
+    finally:
+        # Close the file objects returned by prepare_commits() in every case
+        for fobj in fobjs:
+            fobj.close()
+
+
+@cli.command("download")
+@click.argument("project_ids_pattern")
+@click.argument("repo_id")
+@click.option(
+    "--token",
+    help="""Authentication token, obtained from the Hugging Face Hub.
+    Will default to the stored token.""",
+)
+@click.option(
+    "--revision",
+    help="""
+    An optional Git revision id which can be a branch name, a tag, or a commit
+    hash.
+    """,
+)
+@click.option(
+    "--force",
+    "-f",
+    default=False,
+    is_flag=True,
+    help="Replace an existing project/vocabulary/config with the downloaded one",
+)
+@cli_util.common_options
+def run_download(project_ids_pattern, repo_id, token, revision, force):
+    """
+    Download selected projects and their vocabularies from a Hugging Face Hub
+    repository.
+    \f
+    This command downloads the project and vocabulary archives and the
+    configuration files of the projects that match the given
+    `project_ids_pattern` from the specified Hugging Face Hub repository and
+    unzips the archives to `data/` directory and places the configuration files
+    to `projects.d/` directory. An authentication token and revision can
+    be given with options.
+    """
+
+    # The pattern is resolved against the files in the repository, not against
+    # the locally configured projects
+    project_ids = hfh_util.get_matching_project_ids_from_hf_hub(
+        project_ids_pattern, repo_id, token, revision
+    )
+    click.echo(f"Downloading project(s): {', '.join(project_ids)}")
+
+    # A set, so that a vocabulary used by several projects is fetched only once
+    vocab_ids = set()
+    for project_id in project_ids:
+        # Per project: the data archive and the configuration file
+        project_zip_cache_path = hfh_util.download_from_hf_hub(
+            f"projects/{project_id}.zip", repo_id, token, revision
+        )
+        hfh_util.unzip_archive(project_zip_cache_path, force)
+        config_file_cache_path = hfh_util.download_from_hf_hub(
+            f"{project_id}.cfg", repo_id, token, revision
+        )
+        vocab_ids.add(hfh_util.get_vocab_id_from_config(config_file_cache_path))
+        hfh_util.copy_project_config(config_file_cache_path, force)
+
+    # Vocabularies are handled after all projects, once the full set is known
+    for vocab_id in vocab_ids:
+        vocab_zip_cache_path = hfh_util.download_from_hf_hub(
+            f"vocabs/{vocab_id}.zip", repo_id, token, revision
+        )
+        hfh_util.unzip_archive(vocab_zip_cache_path, force)
+
+
@cli.command("completion")
@click.option("--bash", "shell", flag_value="bash")
@click.option("--zsh", "shell", flag_value="zsh")
@click.option("--fish", "shell", flag_value="fish")
-def completion(shell):
+def run_completion(shell):
"""Generate the script for tab-key autocompletion for the given shell. To enable the
completion support in your current bash terminal session run\n
source <(annif completion --bash)
diff --git a/annif/cli_util.py b/annif/cli_util.py
index bbfa96df4..2a64582f2 100644
--- a/annif/cli_util.py
+++ b/annif/cli_util.py
@@ -1,4 +1,5 @@
"""Utility functions for Annif CLI commands"""
+
from __future__ import annotations
import collections
@@ -16,8 +17,8 @@
from annif.project import Access
if TYPE_CHECKING:
+ import io
from datetime import datetime
- from io import TextIOWrapper
from click.core import Argument, Context, Option
@@ -184,7 +185,7 @@ def show_hits(
hits: SuggestionResult,
project: AnnifProject,
lang: str,
- file: TextIOWrapper | None = None,
+ file: io.TextIOWrapper | None = None,
) -> None:
"""
Print subject suggestions to the console or a file. The suggestions are displayed as
@@ -233,7 +234,7 @@ def generate_filter_params(filter_batch_max_limit: int) -> list[tuple[int, float
def _get_completion_choices(
param: Argument,
) -> dict[str, AnnifVocabulary] | dict[str, AnnifProject] | list:
- if param.name == "project_id":
+ if param.name in ("project_id", "project_ids_pattern"):
return annif.registry.get_projects()
elif param.name == "vocab_id":
return annif.registry.get_vocabs()
diff --git a/annif/config.py b/annif/config.py
index ab8f0d568..8cdc7d04f 100644
--- a/annif/config.py
+++ b/annif/config.py
@@ -1,11 +1,15 @@
"""Configuration file handling"""
+
from __future__ import annotations
import configparser
import os.path
from glob import glob
-import tomli
+try:
+ import tomllib
+except ImportError:
+ import tomli as tomllib
import annif
import annif.util
@@ -45,8 +49,8 @@ def __init__(self, filename: str) -> None:
with open(filename, "rb") as projf:
try:
logger.debug(f"Reading configuration file {filename} in TOML format")
- self._config = tomli.load(projf)
- except tomli.TOMLDecodeError as err:
+ self._config = tomllib.load(projf)
+ except tomllib.TOMLDecodeError as err:
raise ConfigurationException(
f"Parsing TOML file '{filename}' failed: {err}"
)
diff --git a/annif/corpus/__init__.py b/annif/corpus/__init__.py
index 4200d2f87..fb0ceef86 100644
--- a/annif/corpus/__init__.py
+++ b/annif/corpus/__init__.py
@@ -1,6 +1,5 @@
"""Annif corpus operations"""
-
from .combine import CombinedCorpus
from .document import (
DocumentDirectory,
diff --git a/annif/corpus/combine.py b/annif/corpus/combine.py
index 75fcc7f55..a0ad02147 100644
--- a/annif/corpus/combine.py
+++ b/annif/corpus/combine.py
@@ -1,4 +1,5 @@
"""Class for combining multiple corpora so they behave like a single corpus"""
+
from __future__ import annotations
import itertools
diff --git a/annif/corpus/document.py b/annif/corpus/document.py
index 09a80a309..8d4ef5319 100644
--- a/annif/corpus/document.py
+++ b/annif/corpus/document.py
@@ -1,4 +1,5 @@
"""Clases for supporting document corpora"""
+
from __future__ import annotations
import glob
diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py
index 462a35241..82a8c5c13 100644
--- a/annif/corpus/skos.py
+++ b/annif/corpus/skos.py
@@ -1,4 +1,5 @@
"""Support for subjects loaded from a SKOS/RDF file"""
+
from __future__ import annotations
import collections
@@ -83,11 +84,15 @@ def languages(self) -> set[str]:
def _concept_labels(self, concept: URIRef) -> dict[str, str]:
by_lang = self.get_concept_labels(concept, self.PREF_LABEL_PROPERTIES)
return {
- lang: by_lang[lang][0]
- if by_lang[lang] # correct lang
- else by_lang[None][0]
- if by_lang[None] # no language
- else self.graph.namespace_manager.qname(concept)
+ lang: (
+ by_lang[lang][0]
+ if by_lang[lang] # correct lang
+ else (
+ by_lang[None][0]
+ if by_lang[None] # no language
+ else self.graph.namespace_manager.qname(concept)
+ )
+ )
for lang in self.languages
}
diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py
index a9ee06397..2e3ffe5de 100644
--- a/annif/corpus/subject.py
+++ b/annif/corpus/subject.py
@@ -1,4 +1,5 @@
"""Classes for supporting subject corpora expressed as directories or files"""
+
from __future__ import annotations
import csv
diff --git a/annif/corpus/types.py b/annif/corpus/types.py
index e6cd4b252..e94a8c1d9 100644
--- a/annif/corpus/types.py
+++ b/annif/corpus/types.py
@@ -1,4 +1,5 @@
"""Basic types for document and subject corpora"""
+
from __future__ import annotations
import abc
diff --git a/annif/datadir.py b/annif/datadir.py
index 752da32dd..02987490d 100644
--- a/annif/datadir.py
+++ b/annif/datadir.py
@@ -1,4 +1,5 @@
"""Mixin class for types that need a data directory"""
+
from __future__ import annotations
import os
diff --git a/annif/eval.py b/annif/eval.py
index 5ec5bd17a..95cdf449d 100644
--- a/annif/eval.py
+++ b/annif/eval.py
@@ -1,4 +1,5 @@
"""Evaluation metrics for Annif"""
+
from __future__ import annotations
import warnings
@@ -63,10 +64,10 @@ def ndcg_score(y_true: csr_array, y_pred: csr_array, limit: int | None = None) -
scores = np.ones(y_true.shape[0], dtype=np.float32)
for i in range(y_true.shape[0]):
- true = y_true.getrow(i)
+ true = y_true[[i]]
idcg = dcg_score(true, true, limit)
if idcg > 0:
- pred = y_pred.getrow(i)
+ pred = y_pred[[i]]
dcg = dcg_score(true, pred, limit)
scores[i] = dcg / idcg
@@ -86,9 +87,9 @@ def __init__(self, subject_index: SubjectIndex) -> None:
def evaluate_many(
self,
- suggestion_batch: list[list[SubjectSuggestion]]
- | SuggestionBatch
- | list[Iterator],
+ suggestion_batch: (
+ list[list[SubjectSuggestion]] | SuggestionBatch | list[Iterator]
+ ),
gold_subject_batch: Sequence[SubjectSet],
) -> None:
if not isinstance(suggestion_batch, SuggestionBatch):
diff --git a/annif/exception.py b/annif/exception.py
index b4b9c6552..9132d0134 100644
--- a/annif/exception.py
+++ b/annif/exception.py
@@ -1,4 +1,5 @@
"""Custom exceptions used by Annif"""
+
from __future__ import annotations
from click import ClickException
diff --git a/annif/hfh_util.py b/annif/hfh_util.py
new file mode 100644
index 000000000..045e4710f
--- /dev/null
+++ b/annif/hfh_util.py
@@ -0,0 +1,240 @@
+"""Utility functions for interactions with Hugging Face Hub."""
+
+import binascii
+import configparser
+import importlib
+import io
+import os
+import pathlib
+import shutil
+import tempfile
+import time
+import zipfile
+from fnmatch import fnmatch
+from typing import Any
+
+import click
+from flask import current_app
+
+import annif
+from annif.exception import OperationFailedException
+from annif.project import Access, AnnifProject
+
+logger = annif.logger
+
+
+def get_matching_projects(pattern: str) -> list[AnnifProject]:
+    """
+    Get projects that match the given pattern.
+
+    The pattern is matched against project IDs with fnmatch, i.e. with
+    shell-style wildcards. Projects are taken from the registry with
+    min_access=Access.private.
+    """
+    return [
+        proj
+        for proj in annif.registry.get_projects(min_access=Access.private).values()
+        if fnmatch(proj.project_id, pattern)
+    ]
+
+
+def prepare_commits(projects: list[AnnifProject], repo_id: str) -> tuple[list, list]:
+    """Prepare and pre-upload data and config commit operations for projects to a
+    Hugging Face Hub repository.
+
+    Returns a tuple (fobjs, operations): the open file objects that back the
+    commit operations, and the commit operations themselves. The caller is
+    responsible for closing the file objects once the commit has been made.
+
+    If preparation fails part-way, the file objects opened so far are closed
+    here before the exception propagates, because the caller never receives
+    them in that case."""
+    from huggingface_hub import preupload_lfs_files
+
+    fobjs, operations = [], []
+    data_dirs = {p.datadir for p in projects}
+    vocab_dirs = {p.vocab.datadir for p in projects}
+    # Set union: a directory shared by several projects is archived only once
+    all_dirs = data_dirs.union(vocab_dirs)
+
+    try:
+        for data_dir in all_dirs:
+            fobj, operation = _prepare_datadir_commit(data_dir)
+            # Track the file object before uploading so that it gets closed
+            # below if the pre-upload fails
+            fobjs.append(fobj)
+            preupload_lfs_files(repo_id, additions=[operation])
+            operations.append(operation)
+
+        for project in projects:
+            fobj, operation = _prepare_config_commit(project)
+            fobjs.append(fobj)
+            operations.append(operation)
+    except BaseException:
+        for fobj in fobjs:
+            fobj.close()
+        raise
+
+    return fobjs, operations
+
+
+def _prepare_datadir_commit(data_dir: str) -> tuple[io.BufferedRandom, Any]:
+    """Archive a data directory and create the commit operation that adds the
+    archive to the repository. Returns the open archive file object together
+    with the operation."""
+    from huggingface_hub import CommitOperationAdd
+
+    # Path in the repository: data_dir without its first path component, with
+    # ".zip" appended.
+    # NOTE(review): assumes data_dir contains at least one path separator;
+    # otherwise the [1] index raises IndexError - confirm against callers
+    zip_repo_path = data_dir.split(os.path.sep, 1)[1] + ".zip"
+    fobj = _archive_dir(data_dir)
+    operation = CommitOperationAdd(path_in_repo=zip_repo_path, path_or_fileobj=fobj)
+    return fobj, operation
+
+
+def _prepare_config_commit(project: AnnifProject) -> tuple[io.BytesIO, Any]:
+    """Create the commit operation that adds the configuration of a project to
+    the repository as "<project_id>.cfg". Returns the in-memory file object
+    holding the configuration together with the operation."""
+    from huggingface_hub import CommitOperationAdd
+
+    config_repo_path = project.project_id + ".cfg"
+    fobj = _get_project_config(project)
+    operation = CommitOperationAdd(path_in_repo=config_repo_path, path_or_fileobj=fobj)
+    return fobj, operation
+
+
+def _is_train_file(fname: str) -> bool:
+    """Tell whether a file name contains one of the substrings ("-train",
+    "tmp-") that mark files to be left out of archives."""
+    markers = ("-train", "tmp-")
+    return any(marker in fname for marker in markers)
+
+
+def _archive_dir(data_dir: str) -> io.BufferedRandom:
+    """Zip the contents of a directory into a temporary file.
+
+    Everything found recursively under data_dir is added, except entries whose
+    name is recognized by _is_train_file(). The Annif version is recorded in
+    the archive comment. Returns the open temporary file, rewound to the start;
+    the caller is responsible for closing it."""
+    fp = tempfile.TemporaryFile()
+    path = pathlib.Path(data_dir)
+    fpaths = [fpath for fpath in path.glob("**/*") if not _is_train_file(fpath.name)]
+    with zipfile.ZipFile(fp, mode="w") as zfile:
+        zfile.comment = bytes(
+            f"Archived by Annif {importlib.metadata.version('annif')}",
+            encoding="utf-8",
+        )
+        for fpath in fpaths:
+            logger.debug(f"Adding {fpath}")
+            # Store the entry under its path without the first path component
+            arcname = os.path.join(*fpath.parts[1:])
+            zfile.write(fpath, arcname=arcname)
+    # Rewind so that the archive can be read from the beginning
+    fp.seek(0)
+    return fp
+
+
+def _get_project_config(project: AnnifProject) -> io.BytesIO:
+    """Serialize the configuration of a project into an in-memory binary file.
+
+    The configuration is written in INI format with the project ID as the only
+    section name, and returned as a BytesIO holding the UTF-8 encoded text."""
+    # ConfigParser.write() needs a text-mode target. An in-memory StringIO is
+    # used instead of a temporary file so that no file handle is left unclosed
+    # and the text does not pass through the locale's default encoding.
+    buffer = io.StringIO()
+    config = configparser.ConfigParser()
+    config[project.project_id] = project.config
+    config.write(buffer)
+    # For upload the file object needs to be in binary mode
+    return io.BytesIO(buffer.getvalue().encode("utf8"))
+
+
+def get_matching_project_ids_from_hf_hub(
+    project_ids_pattern: str, repo_id: str, token: str, revision: str
+) -> list[str]:
+    """Get project IDs of the projects in a Hugging Face Model Hub repository that match
+    the given pattern.
+
+    A project is recognized by its configuration file: every repository file
+    whose path matches "<pattern>.cfg" (fnmatch wildcards) yields one project
+    ID, which is that path without the trailing ".cfg"."""
+    cfg_suffix = ".cfg"
+    all_repo_file_paths = _list_files_in_hf_hub(repo_id, token, revision)
+    return [
+        # Remove only the trailing suffix; splitting the path on ".cfg" would
+        # also cut the ID at any earlier ".cfg" occurring inside it
+        path[: -len(cfg_suffix)]
+        for path in all_repo_file_paths
+        if fnmatch(path, f"{project_ids_pattern}{cfg_suffix}")
+    ]
+
+
+def _list_files_in_hf_hub(repo_id: str, token: str, revision: str) -> list[str]:
+    """List the paths of the files in a Hugging Face Hub repository at the
+    given revision. Hub HTTP and validation errors are re-raised as
+    OperationFailedException."""
+    from huggingface_hub import list_repo_files
+    from huggingface_hub.utils import HfHubHTTPError, HFValidationError
+
+    try:
+        repo_files = list_repo_files(repo_id=repo_id, token=token, revision=revision)
+        return list(repo_files)
+    except (HfHubHTTPError, HFValidationError) as err:
+        raise OperationFailedException(str(err))
+
+
+def download_from_hf_hub(
+    filename: str, repo_id: str, token: str, revision: str
+) -> str:
+    """Download a single file from a Hugging Face Hub repository.
+
+    Returns the value of hf_hub_download(), which is the local path of the
+    downloaded file. Hub HTTP and validation errors are re-raised as
+    OperationFailedException."""
+    from huggingface_hub import hf_hub_download
+    from huggingface_hub.utils import HfHubHTTPError, HFValidationError
+
+    try:
+        return hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            token=token,
+            revision=revision,
+        )
+    except (HfHubHTTPError, HFValidationError) as err:
+        raise OperationFailedException(str(err))
+
+
+def unzip_archive(src_path: str, force: bool) -> None:
+    """Unzip a zip archive of projects and vocabularies to a directory, by
+    default data/ under current directory."""
+    # The destination is the DATADIR setting of the current Flask app
+    datadir = current_app.config["DATADIR"]
+    with zipfile.ZipFile(src_path, "r") as zfile:
+        # The archive comment is only logged here, for debugging
+        archive_comment = str(zfile.comment, encoding="utf-8")
+        logger.debug(
+            f'Extracting archive {src_path}; archive comment: "{archive_comment}"'
+        )
+        # Members are handled one by one so that existing files can be kept
+        # unless force is set
+        for member in zfile.infolist():
+            _unzip_member(zfile, member, datadir, force)
+
+
+def _unzip_member(
+    zfile: zipfile.ZipFile, member: zipfile.ZipInfo, datadir: str, force: bool
+) -> None:
+    """Extract one archive member under datadir and restore its timestamp.
+
+    If the destination already exists and force is not set, nothing is
+    extracted; the situation is only reported via _handle_existing_file()."""
+    dest_path = os.path.join(datadir, member.filename)
+    if os.path.exists(dest_path) and not force:
+        _handle_existing_file(member, dest_path)
+        return
+    logger.debug(f"Unzipping to {dest_path}")
+    # extract() sanitizes the member name (absolute paths and ".." components
+    # are dropped) and returns the path it actually created, so the timestamps
+    # are set on that path rather than on the unsanitized join above.
+    extracted_path = zfile.extract(member, path=datadir)
+    _restore_timestamps(member, extracted_path)
+
+
+def _handle_existing_file(member: zipfile.ZipInfo, dest_path: str) -> None:
+    """Report that an archive member was not extracted because its destination
+    already exists: a debug log entry when the contents are identical,
+    otherwise a console message pointing to --force."""
+    if _are_identical_member_and_file(member, dest_path):
+        logger.debug(f"Skipping unzip to {dest_path}; already in place")
+    else:
+        click.echo(f"Not overwriting {dest_path} (use --force to override)")
+
+
+def _are_identical_member_and_file(member: zipfile.ZipInfo, dest_path: str) -> bool:
+    """Compare the CRC-32 recorded for an archive member with the CRC-32
+    computed from the file at dest_path."""
+    return _compute_crc32(dest_path) == member.CRC
+
+
+def _restore_timestamps(member: zipfile.ZipInfo, dest_path: str) -> None:
+    """Set the access and modification times of dest_path to the timestamp
+    recorded for the archive member."""
+    # member.date_time is a 6-tuple (year, month, day, hour, minute, second);
+    # it is padded to the 9 fields mktime() expects. The final -1 (tm_isdst)
+    # lets mktime() work out whether daylight saving time applies.
+    date_time = time.mktime(member.date_time + (0, 0, -1))
+    os.utime(dest_path, (date_time, date_time))
+
+
+def copy_project_config(src_path: str, force: bool) -> None:
+    """Copy a given project configuration file to projects.d/ directory."""
+    # Relative path, i.e. resolved against the current working directory
+    project_configs_dest_dir = "projects.d"
+    os.makedirs(project_configs_dest_dir, exist_ok=True)
+
+    dest_path = os.path.join(project_configs_dest_dir, os.path.basename(src_path))
+    # An existing file is kept unless force is set; the two branches below only
+    # differ in how this is reported
+    if os.path.exists(dest_path) and not force:
+        if _are_identical_files(src_path, dest_path):
+            logger.debug(f"Skipping copy to {dest_path}; already in place")
+        else:
+            click.echo(f"Not overwriting {dest_path} (use --force to override)")
+    else:
+        logger.debug(f"Copying to {dest_path}")
+        shutil.copy(src_path, dest_path)
+
+
+def _are_identical_files(src_path: str, dest_path: str) -> bool:
+    """Tell whether two files have the same CRC-32 checksum."""
+    return _compute_crc32(src_path) == _compute_crc32(dest_path)
+
+
+def _compute_crc32(path: str) -> int:
+    """Compute the CRC-32 checksum of a file, reading it in chunks.
+
+    Directories yield 0, as does an empty file."""
+    if os.path.isdir(path):
+        return 0
+
+    chunk_size = 1024 * 1024 * 10  # 10 MiB chunks
+    checksum = 0
+    with open(path, "rb") as fp:
+        # iter() with a sentinel stops at the first empty read, i.e. at EOF
+        for chunk in iter(lambda: fp.read(chunk_size), b""):
+            checksum = binascii.crc32(chunk, checksum)
+    return checksum
+
+
+def get_vocab_id_from_config(config_path: str) -> str:
+    """Get the vocabulary ID from a configuration file.
+
+    The "vocab" setting is read from the first section of the file. Anything
+    from the first opening parenthesis onwards is dropped, so that a setting
+    such as "yso(en)" yields the bare vocabulary ID "yso"."""
+    config = configparser.ConfigParser()
+    config.read(config_path)
+    section = config.sections()[0]
+    vocab_spec = config[section]["vocab"]
+    # NOTE(review): assumes the setting has the form "id" or "id(args)", as
+    # parsed by the registry's get_vocab() - confirm against its pattern
+    return vocab_spec.partition("(")[0].strip()
diff --git a/annif/lexical/mllm.py b/annif/lexical/mllm.py
index 37564a76d..b475d2975 100644
--- a/annif/lexical/mllm.py
+++ b/annif/lexical/mllm.py
@@ -1,4 +1,5 @@
"""MLLM (Maui-like Lexical Matchin) model for Annif"""
+
from __future__ import annotations
import collections
@@ -223,7 +224,7 @@ def _prepare_train_index(
self._prepare_relations(graph, vocab)
self._vectorizer = CountVectorizer(
- binary=True, tokenizer=analyzer.tokenize_words
+ binary=True, tokenizer=analyzer.tokenize_words, token_pattern=None
)
label_corpus = self._vectorizer.fit_transform((t.label for t in terms))
diff --git a/annif/lexical/tokenset.py b/annif/lexical/tokenset.py
index 07c15705d..d577527ad 100644
--- a/annif/lexical/tokenset.py
+++ b/annif/lexical/tokenset.py
@@ -1,4 +1,5 @@
"""Index for fast matching of token sets."""
+
from __future__ import annotations
import collections
diff --git a/annif/lexical/util.py b/annif/lexical/util.py
index 28d21a141..ca3cb8fb3 100644
--- a/annif/lexical/util.py
+++ b/annif/lexical/util.py
@@ -1,4 +1,5 @@
"""Utility methods for lexical algorithms"""
+
from __future__ import annotations
import collections
diff --git a/annif/openapi/annif.yaml b/annif/openapi/annif.yaml
index c5143313d..74e8a4661 100644
--- a/annif/openapi/annif.yaml
+++ b/annif/openapi/annif.yaml
@@ -174,7 +174,9 @@ paths:
responses:
"204":
description: successful operation
- content: {}
+ content:
+ application/json:
+ {}
"404":
$ref: '#/components/responses/NotFound'
"503":
diff --git a/annif/openapi/validation.py b/annif/openapi/validation.py
index 7f920b35d..2fce37732 100644
--- a/annif/openapi/validation.py
+++ b/annif/openapi/validation.py
@@ -1,49 +1,39 @@
"""Custom validator for the Annif API."""
+
from __future__ import annotations
import logging
+from typing import Any
-import jsonschema
-from connexion import decorators
from connexion.exceptions import BadRequestProblem
-from connexion.utils import is_null
+from connexion.json_schema import format_error_with_path
+from connexion.validators import JSONRequestBodyValidator
+from jsonschema.exceptions import ValidationError
logger = logging.getLogger("openapi.validation")
-class CustomRequestBodyValidator(decorators.validation.RequestBodyValidator):
+class CustomRequestBodyValidator(JSONRequestBodyValidator):
"""Custom request body validator that overrides the default error message for the
- 'maxItems' validator for the 'documents' property."""
+ 'maxItems' validator for the 'documents' property to prevent logging request body
+ with the contents of all documents."""
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
- def validate_schema(
- self,
- data: list | dict,
- url: str,
- ) -> None:
- """Validate the request body against the schema."""
-
- if self.is_null_value_valid and is_null(data):
- return None # pragma: no cover
-
+ def _validate(self, body: Any) -> dict | None:
try:
- self.validator.validate(data)
- except jsonschema.ValidationError as exception:
+ return self._validator.validate(body)
+ except ValidationError as exception:
if exception.validator == "maxItems" and list(exception.schema_path) == [
"properties",
"documents",
"maxItems",
]:
exception.message = "too many items"
-
- error_path_msg = self._error_path_message(exception=exception)
+ error_path_msg = format_error_with_path(exception=exception)
logger.error(
- "{url} validation error: {error}{error_path_msg}".format(
- url=url, error=exception.message, error_path_msg=error_path_msg
- ),
+ f"Validation error: {exception.message}{error_path_msg}",
extra={"validator": "body"},
)
raise BadRequestProblem(detail=f"{exception.message}{error_path_msg}")
- return None
diff --git a/annif/parallel.py b/annif/parallel.py
index c6b293f8e..2fead01b9 100644
--- a/annif/parallel.py
+++ b/annif/parallel.py
@@ -1,4 +1,5 @@
"""Parallel processing functionality for Annif"""
+
from __future__ import annotations
import multiprocessing
diff --git a/annif/project.py b/annif/project.py
index 83f7eda7c..3a4aa5566 100644
--- a/annif/project.py
+++ b/annif/project.py
@@ -1,4 +1,5 @@
"""Project management functionality for Annif"""
+
from __future__ import annotations
import enum
diff --git a/annif/registry.py b/annif/registry.py
index 81bd541ef..01a5b9c82 100644
--- a/annif/registry.py
+++ b/annif/registry.py
@@ -1,6 +1,8 @@
"""Registry that keeps track of Annif projects"""
+
from __future__ import annotations
+import os
import re
from flask import Flask, current_app
@@ -35,9 +37,11 @@ def __init__(
self._projects_config_path = projects_config_path
self._datadir = datadir
self._init_vars()
+ projects_pattern = os.getenv("ANNIF_PROJECTS_INIT", ".*")
if init_projects:
for project in self._projects[self._rid].values():
- project.initialize()
+ if re.search(projects_pattern, project.project_id) is not None:
+ project.initialize()
def _init_vars(self) -> None:
# initialize the static variables, if necessary
@@ -99,14 +103,11 @@ def get_vocab(
vocab_id = match.group(1)
posargs, kwargs = parse_args(match.group(3))
language = posargs[0] if posargs else default_language
- vocab_key = (vocab_id, language)
self._init_vars()
- if vocab_key not in self._vocabs[self._rid]:
- self._vocabs[self._rid][vocab_key] = AnnifVocabulary(
- vocab_id, self._datadir
- )
- return self._vocabs[self._rid][vocab_key], language
+ if vocab_id not in self._vocabs[self._rid]:
+ self._vocabs[self._rid][vocab_id] = AnnifVocabulary(vocab_id, self._datadir)
+ return self._vocabs[self._rid][vocab_id], language
def initialize_projects(app: Flask) -> None:
diff --git a/annif/rest.py b/annif/rest.py
index f848117c8..c7f457687 100644
--- a/annif/rest.py
+++ b/annif/rest.py
@@ -1,5 +1,6 @@
"""Definitions for REST API operations. These are wired via Connexion to
methods defined in the OpenAPI specification."""
+
from __future__ import annotations
import importlib
@@ -13,8 +14,6 @@
from annif.project import Access
if TYPE_CHECKING:
- from datetime import datetime
-
from connexion.lifecycle import ConnexionResponse
from annif.corpus.subject import SubjectIndex
@@ -42,10 +41,11 @@ def server_error(
)
-def show_info() -> dict[str, str]:
+def show_info() -> tuple:
"""return version of annif and a title for the api according to OpenAPI spec"""
- return {"title": "Annif REST API", "version": importlib.metadata.version("annif")}
+ result = {"title": "Annif REST API", "version": importlib.metadata.version("annif")}
+ return result, 200, {"Content-Type": "application/json"}
def language_not_supported_error(lang: str) -> ConnexionResponse:
@@ -58,15 +58,16 @@ def language_not_supported_error(lang: str) -> ConnexionResponse:
)
-def list_projects() -> dict[str, list[dict[str, str | dict | bool | datetime | None]]]:
+def list_projects() -> tuple:
"""return a dict with projects formatted according to OpenAPI spec"""
- return {
+ result = {
"projects": [
proj.dump()
for proj in annif.registry.get_projects(min_access=Access.public).values()
]
}
+ return result, 200, {"Content-Type": "application/json"}
def show_project(
@@ -78,7 +79,7 @@ def show_project(
project = annif.registry.get_project(project_id, min_access=Access.hidden)
except ValueError:
return project_not_found_error(project_id)
- return project.dump()
+ return project.dump(), 200, {"Content-Type": "application/json"}
def _suggestion_to_dict(
@@ -123,7 +124,7 @@ def suggest(
if _is_error(result):
return result
- return result[0]
+ return result[0], 200, {"Content-Type": "application/json"}
def suggest_batch(
@@ -141,7 +142,7 @@ def suggest_batch(
return result
for document_results, document in zip(result, documents):
document_results["document_id"] = document.get("document_id")
- return result
+ return result, 200, {"Content-Type": "application/json"}
def _suggest(
@@ -213,4 +214,4 @@ def learn(
except AnnifException as err:
return server_error(err)
- return None, 204
+ return None, 204, {"Content-Type": "application/json"}
diff --git a/annif/static/css/style.css b/annif/static/css/style.css
index a2b2df37f..0a97471d6 100644
--- a/annif/static/css/style.css
+++ b/annif/static/css/style.css
@@ -31,6 +31,11 @@ h1 { font-size: 2rem; }
border-radius: 0px;
padding: 2px 7px;
}
+#annif-version {
+ float: right;
+ margin-top: 0.5rem;
+ font-size: 1rem;
+}
label, #suggestions {
border-top: 1px solid #343260;
margin-bottom: 0.5rem;
diff --git a/annif/suggestion.py b/annif/suggestion.py
index ddf3ec2e5..fd9eb48cc 100644
--- a/annif/suggestion.py
+++ b/annif/suggestion.py
@@ -1,4 +1,5 @@
"""Representing suggested subjects."""
+
from __future__ import annotations
import collections
@@ -38,7 +39,7 @@ def filter_suggestion(
data, rows, cols = [], [], []
for row in range(preds.shape[0]):
- arow = preds.getrow(row)
+ arow = preds[[row]]
if limit is not None and limit < len(arow.data):
topk_idx = arow.data.argpartition(-limit)[-limit:]
else:
diff --git a/annif/templates/home.html b/annif/templates/home.html
index 1e732366d..a7e6f2c58 100644
--- a/annif/templates/home.html
+++ b/annif/templates/home.html
@@ -51,6 +51,7 @@
<% problem.title %>
+
Annif v<% annif_version %>
@@ -233,6 +234,7 @@
Suggested subjects
\
el: '#app',
data: {
text: '',
+ annif_version: '',
project: '',
limit: 10,
projects: [],
@@ -243,6 +245,7 @@
Suggested subjects
\
mounted: function() {
// TBD: we can add a button to reload the list of projects later
this.loadProjects();
+ this.loadVersion();
},
methods: {
clearResults: function() {
@@ -282,6 +285,13 @@
Suggested subjects
\
}
});
},
+ loadVersion: function() {
+ var this_ = this;
+ axios.get('/v1/')
+ .then(res => {
+ this_.annif_version = res.data.version;
+ })
+ },
suggest: function(event) {
this.problems = [];
if (this.text.trim() === '') {
@@ -296,7 +306,7 @@
Suggested subjects
\
return;
}
var this_ = this;
- var formData = new FormData();
+ var formData = new URLSearchParams();
formData.append('text', this_.text);
formData.append('limit', this_.limit);
this_.loading = true;
diff --git a/annif/transform/__init__.py b/annif/transform/__init__.py
index 716d874a2..680b02cc0 100644
--- a/annif/transform/__init__.py
+++ b/annif/transform/__init__.py
@@ -1,4 +1,5 @@
"""Functionality for obtaining text transformation from string specification"""
+
from __future__ import annotations
import re
diff --git a/annif/transform/inputlimiter.py b/annif/transform/inputlimiter.py
index 229766864..d57c6a56c 100644
--- a/annif/transform/inputlimiter.py
+++ b/annif/transform/inputlimiter.py
@@ -1,5 +1,6 @@
"""A simple transformation that truncates the text of input documents to a
given character length."""
+
from __future__ import annotations
from typing import TYPE_CHECKING
diff --git a/annif/transform/langfilter.py b/annif/transform/langfilter.py
index cb6f05b81..e5cf8fdfe 100644
--- a/annif/transform/langfilter.py
+++ b/annif/transform/langfilter.py
@@ -1,5 +1,6 @@
"""Transformation filtering out parts of a text that are in a language
different from the language of the project."""
+
from __future__ import annotations
from typing import TYPE_CHECKING
diff --git a/annif/transform/transform.py b/annif/transform/transform.py
index db71fef37..9a96f2877 100644
--- a/annif/transform/transform.py
+++ b/annif/transform/transform.py
@@ -1,4 +1,5 @@
"""Common functionality for transforming text of input documents."""
+
from __future__ import annotations
import abc
diff --git a/annif/util.py b/annif/util.py
index 1702c2e4b..b03c63ec2 100644
--- a/annif/util.py
+++ b/annif/util.py
@@ -1,4 +1,5 @@
"""Utility functions for Annif"""
+
from __future__ import annotations
import glob
diff --git a/annif/vocab.py b/annif/vocab.py
index 333fa0d69..7f7e125c4 100644
--- a/annif/vocab.py
+++ b/annif/vocab.py
@@ -1,4 +1,5 @@
"""Vocabulary management functionality for Annif"""
+
from __future__ import annotations
import os.path
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 149b96271..760d396c0 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,5 +1,5 @@
-sphinx==4.5.*
+sphinx==7.2.*
sphinx-rtd-theme
-sphinxcontrib-apidoc==0.3.0
+sphinxcontrib-apidoc==0.5.*
sphinx-click
-docutils==0.16
+docutils==0.20.*
diff --git a/docs/source/commands.rst b/docs/source/commands.rst
index 65cca5e9d..c6a9fa59d 100644
--- a/docs/source/commands.rst
+++ b/docs/source/commands.rst
@@ -66,6 +66,20 @@ Project administration
N/A
+.. click:: annif.cli:run_upload
+ :prog: annif upload
+
+**REST equivalent**
+
+ N/A
+
+.. click:: annif.cli:run_download
+ :prog: annif download
+
+**REST equivalent**
+
+ N/A
+
****************************
Subject index administration
****************************
@@ -121,9 +135,20 @@ Subject index administration
N/A
-.. click:: flask.cli:run_command
+.. click:: annif.cli:run_app
:prog: annif run
**REST equivalent**
N/A
+
+*****
+Other
+*****
+
+.. click:: annif.cli:run_completion
+ :prog: annif completion
+
+**REST equivalent**
+
+ N/A
diff --git a/pyproject.toml b/pyproject.toml
index aa01281b1..62d6bf51d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "annif"
-version = "1.0.0-dev"
+version = "1.2.0-dev"
description = "Automated subject indexing and classification tool"
authors = ["National Library of Finland
"]
maintainers = [
@@ -18,58 +18,57 @@ keywords = [
"text-classification",
"rest-api",
"code4lib",
- "subject-indexing"
+ "subject-indexing",
]
-classifiers=[
+classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
- "Operating System :: OS Independent"
+ "Operating System :: OS Independent",
]
[tool.poetry.dependencies]
-python = ">=3.8,<3.11"
+python = ">=3.9,<3.13"
-connexion = {version = "2.14.*", extras = ["swagger-ui"]}
-flask = ">=1.0.4,<3"
-flask-cors = "3.0.*"
+connexion = { version = "~3.0.5", extras = ["flask", "uvicorn", "swagger-ui"] }
click = "8.1.*"
click-log = "0.4.*"
-joblib = "1.2.*"
+joblib = "1.4.*"
nltk = "3.8.*"
gensim = "4.3.*"
-scikit-learn = "1.2.2"
-scipy = "1.10.*"
-rdflib = ">=4.2,<7.0"
-gunicorn = "20.1.*"
-numpy = "1.24.*"
-optuna = "2.10.*"
-python-dateutil = "2.8.*"
-tomli = "2.0.*"
+scikit-learn = "1.4.*"
+scipy = "1.12.*"
+rdflib = "7.0.*"
+requests = "2.31.*"
+gunicorn = "22.0.*"
+numpy = "1.26.*"
+optuna = "3.6.*"
+python-dateutil = "2.9.*"
+tomli = { version = "2.0.*", python = "<3.11" }
simplemma = { git = "https://github.com/adbar/simplemma", branch = "main" }
-jsonschema = "4.17.*"
+jsonschema = "4.21.*"
+huggingface-hub = "0.22.*"
-fasttext-wheel = {version = "0.9.2", optional = true}
-voikko = {version = "0.5.*", optional = true}
-tensorflow-cpu = {version = "2.11.*", optional = true}
-lmdb = {version = "1.4.1", optional = true}
-omikuji = {version = "0.5.*", optional = true}
-yake = {version = "0.4.5", optional = true}
-spacy = {version = "3.5.*", optional = true}
-stwfsapy = {version="0.3.*", optional = true}
+fasttext-wheel = { version = "0.9.2", optional = true }
+voikko = { version = "0.5.*", optional = true }
+tensorflow-cpu = { version = "2.15.*", optional = true, python = "<3.12" }
+lmdb = { version = "1.4.1", optional = true }
+omikuji = { version = "0.5.*", optional = true }
+yake = { version = "0.4.8", optional = true }
+spacy = { version = "3.7.*", optional = true }
+stwfsapy = { version = "0.4.*", optional = true, python = "<3.12" }
[tool.poetry.dev-dependencies]
py = "*"
-pytest = "*"
-requests = "*"
+pytest = "8.*"
pytest-cov = "*"
pytest-watch = "*"
pytest-flask = "*"
flake8 = "*"
bumpversion = "*"
-black = "23.*"
+black = "24.*"
isort = "*"
-schemathesis = "3.19.*"
+schemathesis = "3.*.*"
[tool.poetry.extras]
fasttext = ["fasttext-wheel"]
@@ -93,4 +92,5 @@ line_length = "88"
skip_gitignore = true
[tool.pytest.ini_options]
+markers = ["slow: marks tests as slow (deselect with '-m \"not slow\"')"]
addopts = "-m 'not slow'"
diff --git a/setup.cfg b/setup.cfg
index bf3f116d6..3055b4c72 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,9 +1,9 @@
[bumpversion]
-current_version = 1.0.0-dev
+current_version = 1.2.0-dev
commit = True
tag = True
parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+))?
-serialize =
+serialize =
{major}.{minor}.{patch}-{release}
{major}.{minor}.{patch}
@@ -13,7 +13,7 @@ serialize =
[bumpversion:part:release]
optional_value = prod
-values =
+values =
dev
prod
@@ -25,5 +25,5 @@ max-line-length = 88
ignore = E203 W503
[coverage:report]
-exclude_also =
- if TYPE_CHECKING:
+exclude_also =
+ if TYPE_CHECKING:
diff --git a/tests/conftest.py b/tests/conftest.py
index 76378a98d..7d7a851ee 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,6 +1,6 @@
"""common fixtures for use by all test classes"""
-import os.path
+import os
import shutil
import unittest.mock
@@ -15,27 +15,39 @@
@pytest.fixture(scope="module")
-def app():
+def cxapp():
# make sure the dummy vocab is in place because many tests depend on it
subjfile = os.path.join(os.path.dirname(__file__), "corpora", "dummy-subjects.csv")
- app = annif.create_app(config_name="annif.default_config.TestingConfig")
- with app.app_context():
+ cxapp = annif.create_app(config_name="annif.default_config.TestingConfig")
+ with cxapp.app.app_context():
project = annif.registry.get_project("dummy-en")
# the vocab is needed for both English and Finnish language projects
vocab = annif.corpus.SubjectFileCSV(subjfile)
project.vocab.load_vocabulary(vocab)
- return app
+ return cxapp
+
+
+@pytest.fixture(scope="module")
+def app(cxapp):
+ return cxapp.app
@pytest.fixture(scope="module")
def app_with_initialize():
- app = annif.create_app(config_name="annif.default_config.TestingInitializeConfig")
- return app
+ cxapp = annif.create_app(config_name="annif.default_config.TestingInitializeConfig")
+ return cxapp.app
+
+
+@pytest.fixture(scope="module")
+@unittest.mock.patch.dict(os.environ, {"ANNIF_PROJECTS_INIT": ".*-fi"})
+def app_with_initialize_fi_projects():
+ cxapp = annif.create_app(config_name="annif.default_config.TestingInitializeConfig")
+ return cxapp.app
@pytest.fixture
-def app_client(app):
- with app.test_client() as app_client:
+def app_client(cxapp):
+ with cxapp.test_client() as app_client:
yield app_client
diff --git a/tests/huggingface-cache/dummy-en.cfg b/tests/huggingface-cache/dummy-en.cfg
new file mode 100644
index 000000000..58398e8d0
--- /dev/null
+++ b/tests/huggingface-cache/dummy-en.cfg
@@ -0,0 +1,7 @@
+[dummy-en]
+name=Dummy English
+language=en
+backend=dummy
+analyzer=snowball(english)
+vocab=dummy
+access=hidden
diff --git a/tests/huggingface-cache/dummy-fi.cfg b/tests/huggingface-cache/dummy-fi.cfg
new file mode 100644
index 000000000..4d996f9b6
--- /dev/null
+++ b/tests/huggingface-cache/dummy-fi.cfg
@@ -0,0 +1,8 @@
+[dummy-fi]
+name=Dummy Finnish
+language=fi
+backend=dummy
+analyzer=snowball(finnish)
+key=value
+vocab=dummy
+access=public
diff --git a/tests/huggingface-cache/projects/dummy-en.zip b/tests/huggingface-cache/projects/dummy-en.zip
new file mode 100644
index 000000000..5325bf527
Binary files /dev/null and b/tests/huggingface-cache/projects/dummy-en.zip differ
diff --git a/tests/huggingface-cache/projects/dummy-fi.zip b/tests/huggingface-cache/projects/dummy-fi.zip
new file mode 100644
index 000000000..3c6f29f4a
Binary files /dev/null and b/tests/huggingface-cache/projects/dummy-fi.zip differ
diff --git a/tests/huggingface-cache/vocabs/dummy.zip b/tests/huggingface-cache/vocabs/dummy.zip
new file mode 100644
index 000000000..b43a5f3eb
Binary files /dev/null and b/tests/huggingface-cache/vocabs/dummy.zip differ
diff --git a/tests/test_backend_nn_ensemble.py b/tests/test_backend_nn_ensemble.py
index 1941e8665..1177f074f 100644
--- a/tests/test_backend_nn_ensemble.py
+++ b/tests/test_backend_nn_ensemble.py
@@ -1,14 +1,21 @@
"""Unit tests for the nn_ensemble backend in Annif"""
+import importlib
+import os.path
import time
from datetime import datetime, timedelta, timezone
+from unittest import mock
import py.path
import pytest
import annif.backend
import annif.corpus
-from annif.exception import NotInitializedException, NotSupportedException
+from annif.exception import (
+ NotInitializedException,
+ NotSupportedException,
+ OperationFailedException,
+)
pytest.importorskip("annif.backend.nn_ensemble")
lmdb = pytest.importorskip("lmdb")
@@ -105,11 +112,11 @@ def test_nn_ensemble_train_and_learn(registry, tmpdir):
assert nn_ensemble._model.optimizer.learning_rate.value() == 0.001
datadir = py.path.local(project.datadir)
- assert datadir.join("nn-model.h5").exists()
- assert datadir.join("nn-model.h5").size() > 0
+ assert datadir.join("nn-model.keras").exists()
+ assert datadir.join("nn-model.keras").size() > 0
# test online learning
- modelfile = datadir.join("nn-model.h5")
+ modelfile = datadir.join("nn-model.keras")
old_size = modelfile.size()
old_mtime = modelfile.mtime()
@@ -129,7 +136,7 @@ def test_nn_ensemble_train_cached(registry):
datadir = py.path.local(project.datadir)
assert datadir.join("nn-train.mdb").exists()
- datadir.join("nn-model.h5").remove()
+ datadir.join("nn-model.keras").remove()
nn_ensemble_type = annif.backend.get_backend("nn_ensemble")
nn_ensemble = nn_ensemble_type(
@@ -140,8 +147,8 @@ def test_nn_ensemble_train_cached(registry):
nn_ensemble.train("cached")
- assert datadir.join("nn-model.h5").exists()
- assert datadir.join("nn-model.h5").size() > 0
+ assert datadir.join("nn-model.keras").exists()
+ assert datadir.join("nn-model.keras").size() > 0
def test_nn_ensemble_train_and_learn_params(registry, tmpdir, capfd):
@@ -192,6 +199,58 @@ def test_nn_ensemble_modification_time(app_project):
assert datetime.now(timezone.utc) - nn_ensemble.modification_time < timedelta(1)
+def test_nn_ensemble_get_model_metadata(app_project):
+ nn_ensemble_type = annif.backend.get_backend("nn_ensemble")
+ nn_ensemble = nn_ensemble_type(
+ backend_id="nn_ensemble",
+ config_params={"sources": "dummy-en"},
+ project=app_project,
+ )
+ model_filename = os.path.join(nn_ensemble.datadir, nn_ensemble.MODEL_FILE)
+
+ expected_version = importlib.metadata.version("keras")
+ expected_date_saved = datetime.now(timezone.utc)
+ actual_metadata = nn_ensemble.get_model_metadata(model_filename)
+
+ assert actual_metadata["keras_version"] == expected_version
+ datetime_format = "%Y-%m-%d@%H:%M:%S"
+ actual_datetime = datetime.strptime(actual_metadata["date_saved"], datetime_format)
+ assert expected_date_saved - actual_datetime.astimezone(
+ tz=timezone.utc
+ ) < timedelta(1)
+
+
+def test_nn_ensemble_get_model_metadata_nonexistent_file(app_project):
+ nn_ensemble_type = annif.backend.get_backend("nn_ensemble")
+ nn_ensemble = nn_ensemble_type(
+ backend_id="nn_ensemble",
+ config_params={"sources": "dummy-en"},
+ project=app_project,
+ )
+ nonexistent_model_file = "nonexistent.zip"
+ model_filename = os.path.join(nn_ensemble.datadir, nonexistent_model_file)
+
+ actual_metadata = nn_ensemble.get_model_metadata(model_filename)
+ assert actual_metadata is None
+
+
+@mock.patch("annif.backend.nn_ensemble.load_model", side_effect=Exception)
+def test_nn_ensemble_initialize_error(load_model, app_project):
+ nn_ensemble_type = annif.backend.get_backend("nn_ensemble")
+ nn_ensemble = nn_ensemble_type(
+ backend_id="nn_ensemble",
+ config_params={"sources": "dummy-en"},
+ project=app_project,
+ )
+ assert nn_ensemble._model is None
+ with pytest.raises(
+ OperationFailedException,
+ match=r"loading Keras model from .*; model metadata: .*",
+ ):
+ nn_ensemble.initialize()
+ assert load_model.called
+
+
def test_nn_ensemble_initialize(app_project):
nn_ensemble_type = annif.backend.get_backend("nn_ensemble")
nn_ensemble = nn_ensemble_type(
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 77adeab0f..d4c7f17d7 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -12,8 +12,11 @@
from click.shell_completion import ShellComplete
from click.testing import CliRunner
+from huggingface_hub.utils import HFValidationError
import annif.cli
+import annif.cli_util
+import annif.hfh_util
import annif.parallel
runner = CliRunner(env={"ANNIF_CONFIG": "annif.default_config.TestingConfig"})
@@ -1051,25 +1054,270 @@ def test_version_option():
assert result.output.strip() == version.strip()
-def test_run():
+@mock.patch("connexion.FlaskApp.run")
+def test_run(run):
+ result = runner.invoke(annif.cli.cli, ["run"])
+ assert not result.exception
+ assert result.exit_code == 0
+ assert run.called
+
+
+def test_run_help():
result = runner.invoke(annif.cli.cli, ["run", "--help"])
assert not result.exception
assert result.exit_code == 0
- assert "Run a local development server." in result.output
+ assert "Run Annif in server mode for development." in result.output
-def test_routes_with_flask_app():
- # When using plain Flask only the static endpoint exists
- result = runner.invoke(annif.cli.cli, ["routes"])
- assert re.search(r"static\s+GET\s+\/static\/\", result.output)
- assert not re.search(r"app.home\s+GET\s+\/", result.output)
+@mock.patch("huggingface_hub.HfApi.preupload_lfs_files")
+@mock.patch("huggingface_hub.CommitOperationAdd")
+@mock.patch("huggingface_hub.HfApi.create_commit")
+def test_upload(create_commit, CommitOperationAdd, preupload_lfs_files):
+ result = runner.invoke(annif.cli.cli, ["upload", "dummy-fi", "dummy-repo"])
+ assert not result.exception
+ assert create_commit.call_count == 1
+ assert CommitOperationAdd.call_count == 3 # projects, vocab, config
+ assert (
+ mock.call(
+ path_or_fileobj=mock.ANY, # io.BufferedRandom object
+ path_in_repo="data/vocabs/dummy.zip",
+ )
+ in CommitOperationAdd.call_args_list
+ )
+ assert (
+ mock.call(
+ path_or_fileobj=mock.ANY, # io.BufferedRandom object
+ path_in_repo="data/projects/dummy-fi.zip",
+ )
+ in CommitOperationAdd.call_args_list
+ )
+ assert (
+ mock.call(
+ path_or_fileobj=mock.ANY, # io.BytesIO object
+ path_in_repo="dummy-fi.cfg",
+ )
+ in CommitOperationAdd.call_args_list
+ )
+ assert (
+ mock.call(
+ repo_id="dummy-repo",
+ operations=mock.ANY,
+ commit_message="Upload project(s) dummy-fi with Annif",
+ token=None,
+ revision=None,
+ )
+ in create_commit.call_args_list
+ )
+
+
+@mock.patch("huggingface_hub.HfApi.preupload_lfs_files")
+@mock.patch("huggingface_hub.CommitOperationAdd")
+@mock.patch("huggingface_hub.HfApi.create_commit")
+def test_upload_many(create_commit, CommitOperationAdd, preupload_lfs_files):
+ result = runner.invoke(annif.cli.cli, ["upload", "dummy-*", "dummy-repo"])
+ assert not result.exception
+ assert create_commit.call_count == 1
+ assert CommitOperationAdd.call_count == 11
-def test_routes_with_connexion_app():
- # When using Connexion all endpoints exist
- result = os.popen("python annif/cli.py routes").read()
- assert re.search(r"static\s+GET\s+\/static\/", result)
- assert re.search(r"app.home\s+GET\s+\/", result)
+def test_upload_nonexistent_repo():
+ failed_result = runner.invoke(annif.cli.cli, ["upload", "dummy-fi", "nonexistent"])
+ assert failed_result.exception
+ assert failed_result.exit_code != 0
+ assert "Repository Not Found for url:" in failed_result.output
+
+
+def hf_hub_download_mock_side_effect(filename, repo_id, token, revision):
+ return "tests/huggingface-cache/" + filename # Mocks the downloaded file paths
+
+
+@mock.patch(
+ "huggingface_hub.list_repo_files",
+ return_value=[ # Mocks the filenames in repo
+ "projects/dummy-fi.zip",
+ "vocabs/dummy.zip",
+ "dummy-fi.cfg",
+ "projects/dummy-en.zip",
+ "vocabs/dummy.zip",
+ "dummy-en.cfg",
+ ],
+)
+@mock.patch(
+ "huggingface_hub.hf_hub_download",
+ side_effect=hf_hub_download_mock_side_effect,
+)
+@mock.patch("annif.hfh_util.copy_project_config")
+def test_download_dummy_fi(
+ copy_project_config, hf_hub_download, list_repo_files, testdatadir
+):
+ result = runner.invoke(
+ annif.cli.cli,
+ [
+ "download",
+ "dummy-fi",
+ "mock-repo",
+ ],
+ )
+ assert not result.exception
+ assert list_repo_files.called
+ assert hf_hub_download.called
+ assert hf_hub_download.call_args_list == [
+ mock.call(
+ repo_id="mock-repo",
+ filename="projects/dummy-fi.zip",
+ token=None,
+ revision=None,
+ ),
+ mock.call(
+ repo_id="mock-repo",
+ filename="dummy-fi.cfg",
+ token=None,
+ revision=None,
+ ),
+ mock.call(
+ repo_id="mock-repo",
+ filename="vocabs/dummy.zip",
+ token=None,
+ revision=None,
+ ),
+ ]
+ dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi")
+ fpath = os.path.join(str(dirpath), "file.txt")
+ assert os.path.exists(fpath)
+ assert copy_project_config.call_args_list == [
+ mock.call("tests/huggingface-cache/dummy-fi.cfg", False)
+ ]
+
+
+@mock.patch(
+ "huggingface_hub.list_repo_files",
+ return_value=[ # Mock filenames in repo
+ "projects/dummy-fi.zip",
+ "vocabs/dummy.zip",
+ "dummy-fi.cfg",
+ "projects/dummy-en.zip",
+ "vocabs/dummy.zip",
+ "dummy-en.cfg",
+ ],
+)
+@mock.patch(
+ "huggingface_hub.hf_hub_download",
+ side_effect=hf_hub_download_mock_side_effect,
+)
+@mock.patch("annif.hfh_util.copy_project_config")
+def test_download_dummy_fi_and_en(
+ copy_project_config, hf_hub_download, list_repo_files, testdatadir
+):
+ result = runner.invoke(
+ annif.cli.cli,
+ [
+ "download",
+ "dummy-??",
+ "mock-repo",
+ ],
+ )
+ assert not result.exception
+ assert list_repo_files.called
+ assert hf_hub_download.called
+ assert hf_hub_download.call_args_list == [
+ mock.call(
+ repo_id="mock-repo",
+ filename="projects/dummy-fi.zip",
+ token=None,
+ revision=None,
+ ),
+ mock.call(
+ repo_id="mock-repo",
+ filename="dummy-fi.cfg",
+ token=None,
+ revision=None,
+ ),
+ mock.call(
+ repo_id="mock-repo",
+ filename="projects/dummy-en.zip",
+ token=None,
+ revision=None,
+ ),
+ mock.call(
+ repo_id="mock-repo",
+ filename="dummy-en.cfg",
+ token=None,
+ revision=None,
+ ),
+ mock.call(
+ repo_id="mock-repo",
+ filename="vocabs/dummy.zip",
+ token=None,
+ revision=None,
+ ),
+ ]
+ dirpath_fi = os.path.join(str(testdatadir), "projects", "dummy-fi")
+ fpath_fi = os.path.join(str(dirpath_fi), "file.txt")
+ assert os.path.exists(fpath_fi)
+ dirpath_en = os.path.join(str(testdatadir), "projects", "dummy-en")
+ fpath_en = os.path.join(str(dirpath_en), "file.txt")
+ assert os.path.exists(fpath_en)
+ assert copy_project_config.call_args_list == [
+ mock.call("tests/huggingface-cache/dummy-fi.cfg", False),
+ mock.call("tests/huggingface-cache/dummy-en.cfg", False),
+ ]
+
+
+@mock.patch(
+ "huggingface_hub.list_repo_files",
+ side_effect=HFValidationError,
+)
+@mock.patch(
+ "huggingface_hub.hf_hub_download",
+)
+def test_download_list_repo_files_failed(
+ hf_hub_download,
+ list_repo_files,
+):
+ failed_result = runner.invoke(
+ annif.cli.cli,
+ [
+ "download",
+ "dummy-fi",
+ "mock-repo",
+ ],
+ )
+ assert failed_result.exception
+ assert failed_result.exit_code != 0
+ assert "Error: Operation failed:" in failed_result.output
+ assert list_repo_files.called
+ assert not hf_hub_download.called
+
+
+@mock.patch(
+ "huggingface_hub.list_repo_files",
+ return_value=[ # Mock filenames in repo
+ "projects/dummy-fi.zip",
+ "vocabs/dummy.zip",
+ "dummy-fi.cfg",
+ ],
+)
+@mock.patch(
+ "huggingface_hub.hf_hub_download",
+ side_effect=HFValidationError,
+)
+def test_download_hf_hub_download_failed(
+ hf_hub_download,
+ list_repo_files,
+):
+ failed_result = runner.invoke(
+ annif.cli.cli,
+ [
+ "download",
+ "dummy-fi",
+ "mock-repo",
+ ],
+ )
+ assert failed_result.exception
+ assert failed_result.exit_code != 0
+ assert "Error: Operation failed:" in failed_result.output
+ assert list_repo_files.called
+ assert hf_hub_download.called
def test_completion_script_generation():
diff --git a/tests/test_hfh_util.py b/tests/test_hfh_util.py
new file mode 100644
index 000000000..ce3d6aac9
--- /dev/null
+++ b/tests/test_hfh_util.py
@@ -0,0 +1,103 @@
+"""Unit test module for Hugging Face Hub utilities."""
+
+import io
+import os.path
+import zipfile
+from datetime import datetime, timezone
+from unittest import mock
+
+import annif.hfh_util
+
+
+def test_archive_dir(testdatadir):
+ dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi")
+ os.makedirs(dirpath, exist_ok=True)
+ open(os.path.join(str(dirpath), "foo.txt"), "a").close()
+ open(os.path.join(str(dirpath), "-train.txt"), "a").close()
+
+ fobj = annif.hfh_util._archive_dir(dirpath)
+ assert isinstance(fobj, io.BufferedRandom)
+
+ with zipfile.ZipFile(fobj, mode="r") as zfile:
+ archived_files = zfile.namelist()
+ assert len(archived_files) == 1
+ assert os.path.split(archived_files[0])[1] == "foo.txt"
+
+
+def test_get_project_config(app_project):
+ result = annif.hfh_util._get_project_config(app_project)
+ assert isinstance(result, io.BytesIO)
+ string_result = result.read().decode("UTF-8")
+ assert "[dummy-en]" in string_result
+
+
+def test_unzip_archive_initial(testdatadir):
+ dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi")
+ fpath = os.path.join(str(dirpath), "file.txt")
+ annif.hfh_util.unzip_archive(
+ os.path.join("tests", "huggingface-cache", "projects", "dummy-fi.zip"),
+ force=False,
+ )
+ assert os.path.exists(fpath)
+ assert os.path.getsize(fpath) == 0 # Zero content from zip
+ ts = os.path.getmtime(fpath)
+ assert datetime.fromtimestamp(ts).astimezone(tz=timezone.utc) == datetime(
+ 1980, 1, 1, 0, 0
+ ).astimezone(tz=timezone.utc)
+
+
+def test_unzip_archive_no_overwrite(testdatadir):
+ dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi")
+ fpath = os.path.join(str(dirpath), "file.txt")
+ os.makedirs(dirpath, exist_ok=True)
+ with open(fpath, "wt") as pf:
+ print("Existing content", file=pf)
+
+ annif.hfh_util.unzip_archive(
+ os.path.join("tests", "huggingface-cache", "projects", "dummy-fi.zip"),
+ force=False,
+ )
+ assert os.path.exists(fpath)
+ assert os.path.getsize(fpath) == 17 # Existing content
+ assert datetime.now().timestamp() - os.path.getmtime(fpath) < 1
+
+
+def test_unzip_archive_overwrite(testdatadir):
+ dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi")
+ fpath = os.path.join(str(dirpath), "file.txt")
+ os.makedirs(dirpath, exist_ok=True)
+ with open(fpath, "wt") as pf:
+ print("Existing content", file=pf)
+
+ annif.hfh_util.unzip_archive(
+ os.path.join("tests", "huggingface-cache", "projects", "dummy-fi.zip"),
+ force=True,
+ )
+ assert os.path.exists(fpath)
+ assert os.path.getsize(fpath) == 0 # Zero content from zip
+ ts = os.path.getmtime(fpath)
+ assert datetime.fromtimestamp(ts).astimezone(tz=timezone.utc) == datetime(
+ 1980, 1, 1, 0, 0
+ ).astimezone(tz=timezone.utc)
+
+
+@mock.patch("os.path.exists", return_value=True)
+@mock.patch("annif.hfh_util._compute_crc32", return_value=0)
+@mock.patch("shutil.copy")
+def test_copy_project_config_no_overwrite(copy, _compute_crc32, exists):
+ annif.hfh_util.copy_project_config(
+ os.path.join("tests", "huggingface-cache", "dummy-fi.cfg"), force=False
+ )
+ assert not copy.called
+
+
+@mock.patch("os.path.exists", return_value=True)
+@mock.patch("shutil.copy")
+def test_copy_project_config_overwrite(copy, exists):
+ annif.hfh_util.copy_project_config(
+ os.path.join("tests", "huggingface-cache", "dummy-fi.cfg"), force=True
+ )
+ assert copy.called
+ assert copy.call_args == mock.call(
+ "tests/huggingface-cache/dummy-fi.cfg", "projects.d/dummy-fi.cfg"
+ )
diff --git a/tests/test_openapi.py b/tests/test_openapi.py
index 26e33e4ea..76f33695f 100644
--- a/tests/test_openapi.py
+++ b/tests/test_openapi.py
@@ -7,37 +7,50 @@
schema = schemathesis.from_path("annif/openapi/annif.yaml")
-@schemathesis.check
-def check_cors(response, case):
- assert response.headers["access-control-allow-origin"] == "*"
+@schemathesis.hook("filter_path_parameters")
+def filter_path_parameters(context, path_parameters):
+ # Exclude path parameters containing newline which crashes application
+ # https://github.com/spec-first/connexion/issues/1908
+ if path_parameters is not None and "project_id" in path_parameters:
+ return "%0A" not in path_parameters["project_id"]
+ return True
@schema.parametrize()
@settings(max_examples=10)
-def test_openapi_fuzzy(case, app):
- response = case.call_wsgi(app)
- case.validate_response(response, additional_checks=(check_cors,))
+def test_openapi_fuzzy(case, cxapp):
+ response = case.call_asgi(cxapp)
+ case.validate_response(response)
@pytest.mark.slow
@schema.parametrize(endpoint="/v1/projects/{project_id}")
@settings(max_examples=50)
-def test_openapi_fuzzy_target_dummy_fi(case, app):
+def test_openapi_fuzzy_target_dummy_fi(case, cxapp):
case.path_parameters = {"project_id": "dummy-fi"}
- response = case.call_wsgi(app)
+ response = case.call_asgi(cxapp)
case.validate_response(response)
+def test_openapi_cors(app_client):
+ # test that the service supports CORS by simulating a cross-origin request
+ app_client.headers = {"Origin": "http://somedomain.com"}
+ req = app_client.get(
+ "http://localhost:8000/v1/projects",
+ )
+ assert req.headers["access-control-allow-origin"] == "*"
+
+
def test_openapi_list_projects(app_client):
req = app_client.get("http://localhost:8000/v1/projects")
assert req.status_code == 200
- assert "projects" in req.get_json()
+ assert "projects" in req.json()
def test_openapi_show_project(app_client):
req = app_client.get("http://localhost:8000/v1/projects/dummy-fi")
assert req.status_code == 200
- assert req.get_json()["project_id"] == "dummy-fi"
+ assert req.json()["project_id"] == "dummy-fi"
def test_openapi_show_project_nonexistent(app_client):
@@ -51,7 +64,7 @@ def test_openapi_suggest(app_client):
"http://localhost:8000/v1/projects/dummy-fi/suggest", data=data
)
assert req.status_code == 200
- assert "results" in req.get_json()
+ assert "results" in req.json()
def test_openapi_suggest_nonexistent(app_client):
@@ -76,7 +89,7 @@ def test_openapi_suggest_batch(app_client):
"http://localhost:8000/v1/projects/dummy-fi/suggest-batch", json=data
)
assert req.status_code == 200
- body = req.get_json()
+ body = req.json()
assert len(body) == 32
assert body[0]["results"][0]["label"] == "dummy-fi"
@@ -87,7 +100,7 @@ def test_openapi_suggest_batch_too_many_documents(app_client):
"http://localhost:8000/v1/projects/dummy-fi/suggest-batch", json=data
)
assert req.status_code == 400
- assert req.get_json()["detail"] == "too many items - 'documents'"
+ assert req.json()["detail"] == "too many items - 'documents'"
def test_openapi_learn(app_client):
diff --git a/tests/test_project.py b/tests/test_project.py
index e626e180c..6f1d55d62 100644
--- a/tests/test_project.py
+++ b/tests/test_project.py
@@ -132,10 +132,10 @@ def test_get_project_default_params_fasttext(registry):
def test_get_project_invalid_config_file():
- app = annif.create_app(
+ cxapp = annif.create_app(
config_name="annif.default_config.TestingInvalidProjectsConfig"
)
- with app.app_context():
+ with cxapp.app.app_context():
with pytest.raises(ConfigurationException):
annif.registry.get_project("duplicatedvocab")
@@ -300,24 +300,38 @@ def test_project_initialized(app_with_initialize):
assert project.backend.initialized
+def test_project_not_initialized_with_selection(app_with_initialize_fi_projects):
+ with app_with_initialize_fi_projects.app_context():
+ project = annif.registry.get_project("dummy-en")
+ assert not project.initialized
+ assert not project.backend.initialized
+
+
+def test_project_initialized_with_selection(app_with_initialize_fi_projects):
+ with app_with_initialize_fi_projects.app_context():
+ project = annif.registry.get_project("dummy-fi")
+ assert project.initialized
+ assert project.backend.initialized
+
+
def test_project_file_not_found():
- app = annif.create_app(config_name="annif.default_config.TestingNoProjectsConfig")
- with app.app_context():
+ cxapp = annif.create_app(config_name="annif.default_config.TestingNoProjectsConfig")
+ with cxapp.app.app_context():
with pytest.raises(ValueError):
annif.registry.get_project("dummy-en")
def test_project_file_toml():
- app = annif.create_app(config_name="annif.default_config.TestingTOMLConfig")
- with app.app_context():
+ cxapp = annif.create_app(config_name="annif.default_config.TestingTOMLConfig")
+ with cxapp.app.app_context():
assert len(annif.registry.get_projects()) == 2
assert annif.registry.get_project("dummy-fi-toml").project_id == "dummy-fi-toml"
assert annif.registry.get_project("dummy-en-toml").project_id == "dummy-en-toml"
def test_project_directory():
- app = annif.create_app(config_name="annif.default_config.TestingDirectoryConfig")
- with app.app_context():
+ cxapp = annif.create_app(config_name="annif.default_config.TestingDirectoryConfig")
+ with cxapp.app.app_context():
assert len(annif.registry.get_projects()) == 18 + 2
assert annif.registry.get_project("dummy-fi").project_id == "dummy-fi"
assert annif.registry.get_project("dummy-fi-toml").project_id == "dummy-fi-toml"
diff --git a/tests/test_rest.py b/tests/test_rest.py
index e56a24b21..c905fc1de 100644
--- a/tests/test_rest.py
+++ b/tests/test_rest.py
@@ -7,7 +7,7 @@
def test_rest_list_projects(app):
with app.app_context():
- result = annif.rest.list_projects()
+ result = annif.rest.list_projects()[0]
project_ids = [proj["project_id"] for proj in result["projects"]]
# public project should be returned
assert "dummy-fi" in project_ids
@@ -21,7 +21,7 @@ def test_rest_list_projects(app):
def test_rest_show_info(app):
with app.app_context():
- result = annif.rest.show_info()
+ result = annif.rest.show_info()[0]
version = importlib.metadata.version("annif")
assert result == {"title": "Annif REST API", "version": version}
@@ -29,14 +29,14 @@ def test_rest_show_info(app):
def test_rest_show_project_public(app):
# public projects should be accessible via REST
with app.app_context():
- result = annif.rest.show_project("dummy-fi")
+ result = annif.rest.show_project("dummy-fi")[0]
assert result["project_id"] == "dummy-fi"
def test_rest_show_project_hidden(app):
# hidden projects should be accessible if you know the project id
with app.app_context():
- result = annif.rest.show_project("dummy-en")
+ result = annif.rest.show_project("dummy-en")[0]
assert result["project_id"] == "dummy-en"
@@ -58,7 +58,7 @@ def test_rest_suggest_public(app):
with app.app_context():
result = annif.rest.suggest(
"dummy-fi", {"text": "example text", "limit": 10, "threshold": 0.0}
- )
+ )[0]
assert "results" in result
@@ -67,7 +67,7 @@ def test_rest_suggest_hidden(app):
with app.app_context():
result = annif.rest.suggest(
"dummy-en", {"text": "example text", "limit": 10, "threshold": 0.0}
- )
+ )[0]
assert "results" in result
@@ -101,7 +101,7 @@ def test_rest_suggest_with_language_override(app):
result = annif.rest.suggest(
"dummy-vocablang",
{"text": "example text", "limit": 10, "threshold": 0.0, "language": "en"},
- )
+ )[0]
assert result["results"][0]["label"] == "dummy"
@@ -120,7 +120,7 @@ def test_rest_suggest_with_different_vocab_language(app):
with app.app_context():
result = annif.rest.suggest(
"dummy-vocablang", {"text": "example text", "limit": 10, "threshold": 0.0}
- )
+ )[0]
assert result["results"][0]["label"] == "dummy-fi"
@@ -128,7 +128,7 @@ def test_rest_suggest_with_notations(app):
with app.app_context():
result = annif.rest.suggest(
"dummy-fi", {"text": "example text", "limit": 10, "threshold": 0.0}
- )
+ )[0]
assert result["results"][0]["notation"] is None
@@ -136,7 +136,7 @@ def test_rest_suggest_batch_one_doc(app):
with app.app_context():
result = annif.rest.suggest_batch(
"dummy-fi", {"documents": [{"text": "example text"}]}
- )
+ )[0]
assert len(result) == 1
assert result[0]["results"][0]["label"] == "dummy-fi"
assert result[0]["document_id"] is None
@@ -147,7 +147,7 @@ def test_rest_suggest_batch_one_doc_with_id(app):
result = annif.rest.suggest_batch(
"dummy-fi",
{"documents": [{"text": "example text", "document_id": "doc-0"}]},
- )
+ )[0]
assert len(result) == 1
assert result[0]["results"][0]["label"] == "dummy-fi"
assert result[0]["document_id"] == "doc-0"
@@ -163,7 +163,7 @@ def test_rest_suggest_batch_two_docs(app):
{"text": "another example text"},
]
},
- )
+ )[0]
assert len(result) == 2
assert result[1]["results"][0]["label"] == "dummy-fi"
@@ -176,7 +176,7 @@ def test_rest_suggest_batch_with_language_override(app):
"documents": [{"text": "example text"}],
},
language="en",
- )
+ )[0]
assert result[0]["results"][0]["label"] == "dummy"
@@ -188,14 +188,18 @@ def test_rest_suggest_batch_with_limit_override(app):
"documents": [{"text": "example text"}],
},
limit=0,
- )
+ )[0]
assert len(result[0]["results"]) == 0
def test_rest_learn_empty(app):
with app.app_context():
response = annif.rest.learn("dummy-en", [])
- assert response == (None, 204) # success, no output
+ assert response == (
+ None,
+ 204,
+ {"Content-Type": "application/json"},
+ ) # success, no output
def test_rest_learn(app):
@@ -207,11 +211,15 @@ def test_rest_learn(app):
]
with app.app_context():
response = annif.rest.learn("dummy-en", documents)
- assert response == (None, 204) # success, no output
+ assert response == (
+ None,
+ 204,
+ {"Content-Type": "application/json"},
+ ) # success, no output
result = annif.rest.suggest(
"dummy-en", {"text": "example text", "limit": 10, "threshold": 0.0}
- )
+ )[0]
assert "results" in result
assert result["results"][0]["uri"] == "http://example.org/none"
assert result["results"][0]["label"] == "none"
diff --git a/tests/test_vocab_csv.py b/tests/test_vocab_csv.py
index 0bc3820dd..bdb66e25c 100644
--- a/tests/test_vocab_csv.py
+++ b/tests/test_vocab_csv.py
@@ -1,6 +1,5 @@
"""Unit tests for CSV vocabulary functionality in Annif"""
-
from annif.corpus import SubjectFileCSV, SubjectIndex
diff --git a/tests/test_vocab_skos.py b/tests/test_vocab_skos.py
index a3c1e508c..fd93ca6d1 100644
--- a/tests/test_vocab_skos.py
+++ b/tests/test_vocab_skos.py
@@ -1,6 +1,5 @@
"""Unit tests for SKOS vocabulary functionality in Annif"""
-
import os.path
from annif.corpus.skos import SubjectFileSKOS
diff --git a/tests/test_vocab_tsv.py b/tests/test_vocab_tsv.py
index 1fc318e0f..4d0956d5a 100644
--- a/tests/test_vocab_tsv.py
+++ b/tests/test_vocab_tsv.py
@@ -1,6 +1,5 @@
"""Unit tests for TSV vocabulary functionality in Annif"""
-
from annif.corpus import SubjectFileTSV, SubjectIndex
diff --git a/tests/time-startup.sh b/tests/time-startup.sh
index dabb56134..9204c4cf2 100755
--- a/tests/time-startup.sh
+++ b/tests/time-startup.sh
@@ -18,7 +18,7 @@ average_startup_time=$(echo "scale=3; ($startup_time1 + $startup_time2 + $startu
echo "Average Startup time: $average_startup_time seconds"
# Set the threshold for acceptable startup time in seconds
-threshold=0.300
+threshold=0.400
# Compare the average startup time with the threshold
if (( $(echo "$average_startup_time > $threshold" | bc -l) )); then