Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement annif upload and annif download commands for Hugging Face Hub integration #762

Merged
merged 39 commits into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from 38 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
f6d2b7d
Initial functionality for HF Hub upload
juhoinkinen Feb 1, 2024
ab5e4bf
Use tempfile module and file-like objects for uploads
juhoinkinen Feb 5, 2024
d3dd888
Separate files for each project, vocab and config
juhoinkinen Feb 6, 2024
9d030c6
Catch also HFValidationError in HFH uploads
juhoinkinen Feb 6, 2024
3135114
Initial functionality for HF Hub download
juhoinkinen Feb 7, 2024
038d86d
Upgrade to huggingface-hub 0.21.*
juhoinkinen Feb 29, 2024
5afb251
Drop -projects part from upload/download CLI commands
juhoinkinen Feb 29, 2024
13191fc
Speed up CLI startup by moving imports in functions
juhoinkinen Feb 29, 2024
7666de8
Add --force option to allow overwrite local contents on download
juhoinkinen Mar 1, 2024
301d787
Resolve CodeQL complaint about imports
juhoinkinen Mar 1, 2024
d5b4abe
Restore datafile timestamps after unzipping
juhoinkinen Mar 4, 2024
a1e7605
Add comment to zip file with used Annif version
juhoinkinen Mar 4, 2024
25a46dc
Catch HFH Errors in listing files in repo
juhoinkinen Mar 4, 2024
86714d8
Unzip archive contents to used DATADIR
juhoinkinen Mar 6, 2024
6ba1e08
Add tests
juhoinkinen Mar 7, 2024
4d06be6
Create /.cache/huggingface/ with full access rights in Dockerimage
juhoinkinen Mar 7, 2024
a4f0f6f
Merge branch 'update-dependencies-v1.1' into issue760-hugging-face-hu…
juhoinkinen Mar 8, 2024
7575fff
Fix and improve tests and increase coverage
juhoinkinen Mar 8, 2024
16bacfb
Remove todos
juhoinkinen Mar 8, 2024
2952f64
Create /Annif/projects.d/ for tests in Dockerfile
juhoinkinen Mar 8, 2024
ed3cf2c
Refactor to address quality complains; improve names
juhoinkinen Mar 8, 2024
5b16952
Add docstrings
juhoinkinen Mar 12, 2024
c87675c
Add type hints
juhoinkinen Mar 12, 2024
2fe5b73
Update RTD CLI commands page
juhoinkinen Mar 12, 2024
d7be137
Remove --revision option of download command
juhoinkinen Mar 13, 2024
47f7ee4
Upgrade to huggingface-hub 0.22.*
juhoinkinen Mar 25, 2024
a488d07
Revert "Remove --revision option of download command"
juhoinkinen Mar 26, 2024
0c57bf2
Preupload lfs files
juhoinkinen Mar 26, 2024
df105a3
Fix HF Hub caching in Dockerfile
juhoinkinen Mar 27, 2024
d14ff30
Refactor to address quality complains
juhoinkinen Apr 12, 2024
cc0c989
Again: Refactor & simplify to address quality complains
juhoinkinen Apr 12, 2024
9443c8f
Fix typo in mocked filenames in repo
juhoinkinen Apr 19, 2024
156bbf5
Detect projects present in repo by .cfg files, not .zip files
juhoinkinen Apr 19, 2024
3f60456
Add --revision option to upload command
juhoinkinen Apr 19, 2024
2dd359d
Enable completion of project_id argument in upload command
juhoinkinen Apr 19, 2024
63076cd
Adapt test for adding revision option to upload command
juhoinkinen Apr 19, 2024
a0a3850
Move functions for HuggingFaceHub interactions to own file
juhoinkinen Apr 23, 2024
638aa07
Move unit tests for HuggingFaceHub util fns to own file
juhoinkinen Apr 23, 2024
6f35fff
Make io import conditional to TYPE_CHECKING
juhoinkinen Apr 23, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,9 @@ RUN annif completion --bash >> /etc/bash.bashrc # Enable tab completion
RUN groupadd -g 998 annif_user && \
useradd -r -u 998 -g annif_user annif_user && \
chmod -R a+rX /Annif && \
mkdir -p /Annif/tests/data && \
mkdir -p /Annif/tests/data /Annif/projects.d && \
chown -R annif_user:annif_user /annif-projects /Annif/tests/data
USER annif_user
ENV HF_HOME="/tmp"

CMD annif
126 changes: 124 additions & 2 deletions annif/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,12 @@
import annif.parallel
import annif.project
import annif.registry
from annif import cli_util
from annif.exception import NotInitializedException, NotSupportedException
from annif import cli_util, hfh_util
from annif.exception import (
NotInitializedException,
NotSupportedException,
OperationFailedException,
)
from annif.project import Access
from annif.util import metric_code

Expand Down Expand Up @@ -582,6 +586,124 @@ def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_fi
click.echo("---")


@cli.command("upload")
@click.argument("project_ids_pattern", shell_complete=cli_util.complete_param)
@click.argument("repo_id")
@click.option(
    "--token",
    help="""Authentication token, obtained from the Hugging Face Hub.
Will default to the stored token.""",
)
@click.option(
    "--revision",
    help="""An optional git revision to commit from. Defaults to the head of the "main"
branch.""",
)
@click.option(
    "--commit-message",
    help="""The summary / title / first line of the generated commit.""",
)
@cli_util.common_options
def run_upload(project_ids_pattern, repo_id, token, revision, commit_message):
    """
    Upload selected projects and their vocabularies to a Hugging Face Hub repository.
    \f
    This command zips the project directories and vocabularies of the projects
    that match the given `project_ids_pattern` to archive files, and uploads the
    archives along with the project configurations to the specified Hugging Face
    Hub repository. An authentication token and commit message can be given with
    options.

    Arguments:

    - `project_ids_pattern`: pattern selecting the local projects to upload.
    - `repo_id`: identifier of the target Hugging Face Hub repository.
    - `token`: authentication token passed on to the Hub client; may be None.
    - `revision`: git revision passed on to the Hub client; may be None.
    - `commit_message`: commit title; when None a default message naming
      `project_ids_pattern` is generated.

    Raises `OperationFailedException` (chained to the original error) when the
    Hub client reports an HTTP or validation error.
    """
    # Imported here rather than at module level to keep CLI startup fast for
    # commands that never touch the Hugging Face Hub.
    from huggingface_hub import HfApi
    from huggingface_hub.utils import HfHubHTTPError, HFValidationError

    projects = hfh_util.get_matching_projects(project_ids_pattern)
    click.echo(f"Uploading project(s): {', '.join(p.project_id for p in projects)}")

    if commit_message is None:
        commit_message = f"Upload project(s) {project_ids_pattern} with Annif"

    # Pre-bind both names so the `finally` clause is safe even when
    # prepare_commits() itself raises before returning anything.
    fobjs, operations = [], []
    try:
        fobjs, operations = hfh_util.prepare_commits(projects, repo_id)
        api = HfApi()
        api.create_commit(
            repo_id=repo_id,
            operations=operations,
            commit_message=commit_message,
            revision=revision,
            token=token,
        )
    except (HfHubHTTPError, HFValidationError) as err:
        # Chain explicitly so the underlying Hub error stays attached as the
        # cause of the user-facing exception.
        raise OperationFailedException(str(err)) from err
    finally:
        # The file objects backing the commit operations must be closed
        # whether or not the commit succeeded.
        for fobj in fobjs:
            fobj.close()


@cli.command("download")
@click.argument("project_ids_pattern")
@click.argument("repo_id")
@click.option(
    "--token",
    help="""Authentication token, obtained from the Hugging Face Hub.
Will default to the stored token.""",
)
@click.option(
    "--revision",
    help="""
An optional Git revision id which can be a branch name, a tag, or a commit
hash.
""",
)
@click.option(
    "--force",
    "-f",
    default=False,
    is_flag=True,
    help="Replace an existing project/vocabulary/config with the downloaded one",
)
@cli_util.common_options
def run_download(project_ids_pattern, repo_id, token, revision, force):
    """
    Download selected projects and their vocabularies from a Hugging Face Hub
    repository.
    \f
    For every project in the given Hugging Face Hub repository that matches
    `project_ids_pattern`, this command fetches the project archive and the
    project configuration file, and afterwards the archive of each vocabulary
    referenced by those configurations. Archives are unzipped to `data/`
    directory and configuration files are placed to `projects.d/` directory.
    An authentication token and revision can be given with options; `force`
    is handed to the unzip/copy helpers to allow replacing existing content.
    """

    def _fetch(filename):
        # Every download shares the same repository, token and revision.
        # NOTE(review): the returned value looks like a local cache path;
        # confirm against hfh_util.download_from_hf_hub.
        return hfh_util.download_from_hf_hub(filename, repo_id, token, revision)

    matched_ids = hfh_util.get_matching_project_ids_from_hf_hub(
        project_ids_pattern, repo_id, token, revision
    )
    click.echo(f"Downloading project(s): {', '.join(matched_ids)}")

    # A set, so a vocabulary shared by several projects is fetched only once.
    needed_vocabs = set()
    for pid in matched_ids:
        hfh_util.unzip_archive(_fetch(f"projects/{pid}.zip"), force)

        cfg_path = _fetch(f"{pid}.cfg")
        needed_vocabs.add(hfh_util.get_vocab_id_from_config(cfg_path))
        hfh_util.copy_project_config(cfg_path, force)

    for vocab in needed_vocabs:
        hfh_util.unzip_archive(_fetch(f"vocabs/{vocab}.zip"), force)


@cli.command("completion")
@click.option("--bash", "shell", flag_value="bash")
@click.option("--zsh", "shell", flag_value="zsh")
Expand Down
6 changes: 3 additions & 3 deletions annif/cli_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import collections
import io
Fixed Show fixed Hide fixed
Fixed Show fixed Hide fixed
Fixed Show fixed Hide fixed
import itertools
import os
import sys
Expand All @@ -18,7 +19,6 @@

if TYPE_CHECKING:
from datetime import datetime
from io import TextIOWrapper

from click.core import Argument, Context, Option

Expand Down Expand Up @@ -185,7 +185,7 @@ def show_hits(
hits: SuggestionResult,
project: AnnifProject,
lang: str,
file: TextIOWrapper | None = None,
file: io.TextIOWrapper | None = None,
) -> None:
"""
Print subject suggestions to the console or a file. The suggestions are displayed as
Expand Down Expand Up @@ -234,7 +234,7 @@ def generate_filter_params(filter_batch_max_limit: int) -> list[tuple[int, float
def _get_completion_choices(
param: Argument,
) -> dict[str, AnnifVocabulary] | dict[str, AnnifProject] | list:
if param.name == "project_id":
if param.name in ("project_id", "project_ids_pattern"):
return annif.registry.get_projects()
elif param.name == "vocab_id":
return annif.registry.get_vocabs()
Expand Down
Loading
Loading