From f6d2b7dc38280d0a87b055dd8188be56641cb105 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Thu, 1 Feb 2024 10:39:11 +0200 Subject: [PATCH 01/38] Initial functionality for HF Hub upload --- annif/cli.py | 48 +++++++++++++++++++++++++++++++ annif/cli_util.py | 61 +++++++++++++++++++++++++++++++++++++++- docs/source/commands.rst | 7 +++++ pyproject.toml | 1 + 4 files changed, 116 insertions(+), 1 deletion(-) diff --git a/annif/cli.py b/annif/cli.py index 73f18f02e..45a170565 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -8,6 +8,7 @@ import os.path import re import sys +from fnmatch import fnmatch import click import click_log @@ -583,6 +584,53 @@ def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_fi click.echo("---") +@cli.command("upload-projects") +@click.argument("project_ids_pattern") +@click.argument("repo_id") +@click.option( + "--token", + help="""Authentication token, obtained from the Hugging Face Hub. + Will default to the stored token.""", +) +@click.option( + "--commit-message", + help="""The summary / title / first line of the generated commit.""", +) +@cli_util.common_options +def run_upload_projects(project_ids_pattern, repo_id, token, commit_message): + """ + Upload selected projects to a Hugging Face Hub repository + \f + This command zips the project directories and vocabularies of the projects + that match the given `project_ids_pattern`, and uploads the archives along + with the projects configuration to the specified Hugging Face Hub repository. + An authentication token and commit message can be given with options. + """ + projects = [ + proj + for proj in annif.registry.get_projects(min_access=Access.private).values() + if fnmatch(proj.project_id, project_ids_pattern) + ] + click.echo(f"Uploading project(s): {', '.join([p.project_id for p in projects])}") + project_dirs = {p.datadir for p in projects} + vocab_dirs = {p.vocab.datadir for p in projects} + + projects_zip_fname = "projects.zip" + vocabs_zip_fname = "vocabs.zip" + projects_conf_fname = "projects.cfg" + + cli_util.archive_dirs(project_dirs, projects_zip_fname) + cli_util.archive_dirs(vocab_dirs, vocabs_zip_fname) + cli_util.write_tmp_project_configs_file(projects, projects_conf_fname) + + try: + cli_util.upload_to_hf_hub(projects_zip_fname, repo_id, token, commit_message) + cli_util.upload_to_hf_hub(vocabs_zip_fname, repo_id, token, commit_message) + cli_util.upload_to_hf_hub(projects_conf_fname, repo_id, token, commit_message) + finally: + cli_util.remove_tmp_files() + + @cli.command("completion") @click.option("--bash", "shell", flag_value="bash") @click.option("--zsh", "shell", flag_value="zsh") diff --git a/annif/cli_util.py b/annif/cli_util.py index bbfa96df4..51b2019fe 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -2,17 +2,22 @@ from __future__ import annotations import collections +import configparser import itertools import os +import pathlib import sys +import zipfile from typing import TYPE_CHECKING import click import click_log from flask import current_app +from huggingface_hub import HfApi +from huggingface_hub.utils import HfHubHTTPError import annif -from annif.exception import ConfigurationException +from annif.exception import ConfigurationException, OperationFailedException from annif.project import Access if TYPE_CHECKING: @@ -230,6 +235,60 @@ def generate_filter_params(filter_batch_max_limit: int) -> list[tuple[int, float return list(itertools.product(limits, thresholds)) +def is_train_file(fname): + train_file_patterns = ("-train", "tmp-") + for pat in train_file_patterns: + if pat in fname: + return True + return False + + +TMPF_PREFIX = "tmp-upload-" + + +def archive_dirs(dirs, zip_fname): + logger.debug(f"Creating archive {zip_fname}") + with zipfile.ZipFile(TMPF_PREFIX + zip_fname, mode="w") as zfile: + for pdir in dirs: + directory = pathlib.Path(pdir) + fpaths = [ + fpath for fpath in directory.iterdir() if not is_train_file(fpath.name) + ] + for fpath in fpaths: + logger.debug(f"Adding {fpath}") + zfile.write(fpath, arcname=fpath) + + +def upload_to_hf_hub(fname, repo_id, token, commit_message): + commit_message = ( + commit_message if commit_message is not None else f"Upload {fname} with Annif" + ) + api = HfApi() + try: + api.upload_file( + path_or_fileobj=TMPF_PREFIX + fname, + path_in_repo=fname, + repo_id=repo_id, + token=token, + commit_message=commit_message, + ) + except HfHubHTTPError as err: + raise OperationFailedException(str(err)) + + +def write_tmp_project_configs_file(projects, projects_conf_fname): + config = configparser.ConfigParser() + for proj in projects: + config[proj.project_id] = proj.config + with open(TMPF_PREFIX + projects_conf_fname, "w") as tmp_projects_file: + config.write(tmp_projects_file) + + +def remove_tmp_files(): + for tmp_file_path in pathlib.Path(".").glob(TMPF_PREFIX + "*"): + tmp_file_path.unlink() + + def _get_completion_choices( param: Argument, ) -> dict[str, AnnifVocabulary] | dict[str, AnnifProject] | list: diff --git a/docs/source/commands.rst b/docs/source/commands.rst index 849f6aadf..6a762828e 100644 --- a/docs/source/commands.rst +++ b/docs/source/commands.rst @@ -66,6 +66,13 @@ Project administration N/A +.. click:: annif.cli:run_upload_projects + :prog: annif upload-projects + +**REST equivalent** + + N/A + **************************** Subject index administration **************************** diff --git a/pyproject.toml b/pyproject.toml index 951fa3a0a..e01b4afce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ python-dateutil = "2.8.*" tomli = { version = "2.0.*", python = "<3.11" } simplemma = "0.9.*" jsonschema = "4.17.*" +huggingface-hub = "0.20.*" fasttext-wheel = {version = "0.9.2", optional = true} voikko = {version = "0.5.*", optional = true} From ab5e4bf5346f88dbf497302790a6de853083bc10 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Mon, 5 Feb 2024 16:35:17 +0200 Subject: [PATCH 02/38] Use tempfile module and file-like objects for uploads --- annif/cli.py | 34 ++++++++++++++++++++------------ annif/cli_util.py | 50 ++++++++++++++++++++++------------------------- 2 files changed, 45 insertions(+), 39 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index 45a170565..b4bb860fe 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -612,23 +612,33 @@ def run_upload_projects(project_ids_pattern, repo_id, token, commit_message): if fnmatch(proj.project_id, project_ids_pattern) ] click.echo(f"Uploading project(s): {', '.join([p.project_id for p in projects])}") + + commit_message = ( + commit_message + if commit_message is not None + else f"Upload project(s) {project_ids_pattern} with Annif" + ) + project_dirs = {p.datadir for p in projects} vocab_dirs = {p.vocab.datadir for p in projects} projects_zip_fname = "projects.zip" vocabs_zip_fname = "vocabs.zip" - projects_conf_fname = "projects.cfg" - - cli_util.archive_dirs(project_dirs, projects_zip_fname) - cli_util.archive_dirs(vocab_dirs, vocabs_zip_fname) - cli_util.write_tmp_project_configs_file(projects, projects_conf_fname) - - try: - cli_util.upload_to_hf_hub(projects_zip_fname, repo_id, token, commit_message) - cli_util.upload_to_hf_hub(vocabs_zip_fname, repo_id, token, commit_message) - cli_util.upload_to_hf_hub(projects_conf_fname, repo_id, token, commit_message) - finally: - cli_util.remove_tmp_files() + configs_fname = "projects.cfg" + + projects_fobj = cli_util.archive_dirs(project_dirs) + vocabs_fobj = cli_util.archive_dirs(vocab_dirs) + configs_fobj = cli_util.write_configs(projects) + + cli_util.upload_to_hf_hub( + projects_fobj, projects_zip_fname, repo_id, token, commit_message + ) + cli_util.upload_to_hf_hub( + vocabs_fobj, vocabs_zip_fname, repo_id, token, commit_message + ) + cli_util.upload_to_hf_hub( + configs_fobj, configs_fname, repo_id, token, commit_message + ) @cli.command("completion") diff --git a/annif/cli_util.py b/annif/cli_util.py index 51b2019fe..49584f31a 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -3,10 +3,12 @@ import collections import configparser +import io import itertools import os import pathlib import sys +import tempfile import zipfile from typing import TYPE_CHECKING @@ -235,7 +237,7 @@ def generate_filter_params(filter_batch_max_limit: int) -> list[tuple[int, float return list(itertools.product(limits, thresholds)) -def is_train_file(fname): +def _is_train_file(fname): train_file_patterns = ("-train", "tmp-") for pat in train_file_patterns: if pat in fname: @@ -243,31 +245,38 @@ def is_train_file(fname): return False -TMPF_PREFIX = "tmp-upload-" - - -def archive_dirs(dirs, zip_fname): - logger.debug(f"Creating archive {zip_fname}") - with zipfile.ZipFile(TMPF_PREFIX + zip_fname, mode="w") as zfile: +def archive_dirs(dirs): + fp = tempfile.TemporaryFile() + with zipfile.ZipFile(fp, mode="w") as zfile: for pdir in dirs: directory = pathlib.Path(pdir) fpaths = [ - fpath for fpath in directory.iterdir() if not is_train_file(fpath.name) + fpath for fpath in directory.iterdir() if not _is_train_file(fpath.name) ] for fpath in fpaths: logger.debug(f"Adding {fpath}") zfile.write(fpath, arcname=fpath) + fp.seek(0) + return fp -def upload_to_hf_hub(fname, repo_id, token, commit_message): - commit_message = ( - commit_message if commit_message is not None else f"Upload {fname} with Annif" - ) +def write_configs(projects): + fp = tempfile.TemporaryFile(mode="w+t") + config = configparser.ConfigParser() + for proj in projects: + config[proj.project_id] = proj.config + config.write(fp) # This needs tempfile in text mode + fp.seek(0) + # But for upload fobj needs to be in binary mode + return io.BytesIO(fp.read().encode("utf8")) + + +def upload_to_hf_hub(fileobj, filename, repo_id, token, commit_message): api = HfApi() try: api.upload_file( - path_or_fileobj=TMPF_PREFIX + fname, - path_in_repo=fname, + path_or_fileobj=fileobj, + path_in_repo=filename, repo_id=repo_id, token=token, commit_message=commit_message, @@ -276,19 +285,6 @@ def upload_to_hf_hub(fname, repo_id, token, commit_message): raise OperationFailedException(str(err)) -def write_tmp_project_configs_file(projects, projects_conf_fname): - config = configparser.ConfigParser() - for proj in projects: - config[proj.project_id] = proj.config - with open(TMPF_PREFIX + projects_conf_fname, "w") as tmp_projects_file: - config.write(tmp_projects_file) - - -def remove_tmp_files(): - for tmp_file_path in pathlib.Path(".").glob(TMPF_PREFIX + "*"): - tmp_file_path.unlink() - - def _get_completion_choices( param: Argument, ) -> dict[str, AnnifVocabulary] | dict[str, AnnifProject] | list: From d3dd8889ef3052478403d1774a47f27add03b0b5 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 6 Feb 2024 13:50:30 +0200 Subject: [PATCH 03/38] Separate files for each project, vocab and config --- annif/cli.py | 31 +++++++++++++------------------ annif/cli_util.py | 20 ++++++++------------ 2 files changed, 21 insertions(+), 30 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index b4bb860fe..d471a09a8 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -621,24 +621,19 @@ def run_upload_projects(project_ids_pattern, repo_id, token, commit_message): project_dirs = {p.datadir for p in projects} vocab_dirs = {p.vocab.datadir for p in projects} - - projects_zip_fname = "projects.zip" - vocabs_zip_fname = "vocabs.zip" - configs_fname = "projects.cfg" - - projects_fobj = cli_util.archive_dirs(project_dirs) - vocabs_fobj = cli_util.archive_dirs(vocab_dirs) - configs_fobj = cli_util.write_configs(projects) - - cli_util.upload_to_hf_hub( - projects_fobj, projects_zip_fname, repo_id, token, commit_message - ) - cli_util.upload_to_hf_hub( - vocabs_fobj, vocabs_zip_fname, repo_id, token, commit_message - ) - cli_util.upload_to_hf_hub( - configs_fobj, configs_fname, repo_id, token, commit_message - ) + data_dirs = project_dirs.union(vocab_dirs) + + for data_dir in data_dirs: + zip_path = data_dir.split(os.path.sep, 1)[1] + ".zip" # TODO Check this + fobj = cli_util.archive_dir(data_dir) + cli_util.upload_to_hf_hub(fobj, zip_path, repo_id, token, commit_message) + fobj.close() + + for project in projects: + config_path = project.project_id + ".cfg" + fobj = cli_util.write_config(project) + cli_util.upload_to_hf_hub(fobj, config_path, repo_id, token, commit_message) + fobj.close() @cli.command("completion") diff --git a/annif/cli_util.py b/annif/cli_util.py index 49584f31a..11267fc5f 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -245,26 +245,22 @@ def _is_train_file(fname): return False -def archive_dirs(dirs): +def archive_dir(data_dir): fp = tempfile.TemporaryFile() + path = pathlib.Path(data_dir) + fpaths = [fpath for fpath in path.glob("**/*") if not _is_train_file(fpath.name)] with zipfile.ZipFile(fp, mode="w") as zfile: - for pdir in dirs: - directory = pathlib.Path(pdir) - fpaths = [ - fpath for fpath in directory.iterdir() if not _is_train_file(fpath.name) - ] - for fpath in fpaths: - logger.debug(f"Adding {fpath}") - zfile.write(fpath, arcname=fpath) + for fpath in fpaths: + logger.debug(f"Adding {fpath}") + zfile.write(fpath) fp.seek(0) return fp -def write_configs(projects): +def write_config(project): fp = tempfile.TemporaryFile(mode="w+t") config = configparser.ConfigParser() - for proj in projects: - config[proj.project_id] = proj.config + config[project.project_id] = project.config config.write(fp) # This needs tempfile in text mode fp.seek(0) # But for upload fobj needs to be in binary mode From 9d030c632267e26b6851c0e0e54290aec7cce927 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 6 Feb 2024 14:00:28 +0200 Subject: [PATCH 04/38] Catch also HFValidationError in HFH uploads --- annif/cli_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/annif/cli_util.py b/annif/cli_util.py index 11267fc5f..9701aab18 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -16,7 +16,7 @@ import click_log from flask import current_app from huggingface_hub import HfApi -from huggingface_hub.utils import HfHubHTTPError +from huggingface_hub.utils import HfHubHTTPError, HFValidationError import annif from annif.exception import ConfigurationException, OperationFailedException @@ -277,7 +277,7 @@ def upload_to_hf_hub(fileobj, filename, repo_id, token, commit_message): token=token, commit_message=commit_message, ) - except HfHubHTTPError as err: + except (HfHubHTTPError, HFValidationError) as err: raise OperationFailedException(str(err)) From 3135114ce0fb9fd69cf17ce47525f4e03de5b722 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Wed, 7 Feb 2024 14:22:17 +0200 Subject: [PATCH 05/38] Initial functionality for HF Hub download --- annif/cli.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++- annif/cli_util.py | 44 ++++++++++++++++++++++++++++++++++++- 2 files changed, 98 insertions(+), 2 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index d471a09a8..dc25a1477 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -1,12 +1,12 @@ """Definitions for command-line (Click) commands for invoking Annif operations and printing the results to console.""" - import collections import importlib import json import os.path import re +import shutil import sys from fnmatch import fnmatch @@ -636,6 +636,60 @@ def run_upload_projects(project_ids_pattern, repo_id, token, commit_message): fobj.close() +@cli.command("download-projects") +@click.argument("project_ids_pattern") +@click.argument("repo_id") +@click.option( + "--token", + help="""Authentication token, obtained from the Hugging Face Hub. + Will default to the stored token.""", +) +@click.option( + "--revision", + help=""" + An optional Git revision id which can be a branch name, a tag, or a commit + hash. + """, +) +@cli_util.common_options +def run_download_projects(project_ids_pattern, repo_id, token, revision): + """ + Download selected projects from a Hugging Face Hub repository + \f + This command downloads the project and vocabulary archives and the + configuration files of the projects that match the given + `project_ids_pattern` from the specified Hugging Face Hub repository and + unzips the archives to `data/` directory and places the configuration files + to `projects.d/` directory. An authentication token and revision can + be given with options. + """ + + project_ids = cli_util.get_selected_project_ids_from_hf_hub( + project_ids_pattern, repo_id, token, revision + ) + click.echo(f"Downding project(s): {', '.join(project_ids)}") + + if not os.path.isdir("projects.d"): + os.mkdir("projects.d") + vocab_ids = set() + for project_id in project_ids: + project_zip_local_cache_path = cli_util.download_from_hf_hub( + f"projects/{project_id}.zip", repo_id, token, revision + ) + cli_util.unzip(project_zip_local_cache_path) + local_config_cache_path = cli_util.download_from_hf_hub( + f"{project_id}.cfg", repo_id, token, revision + ) + vocab_ids.add(cli_util.get_vocab_id(local_config_cache_path)) + shutil.copy(local_config_cache_path, "projects.d") # TODO Disallow overwrite + + for vocab_id in vocab_ids: + vocab_zip_local_cache_path = cli_util.download_from_hf_hub( + f"vocabs/{vocab_id}.zip", repo_id, token, revision + ) + cli_util.unzip(vocab_zip_local_cache_path) + + @cli.command("completion") @click.option("--bash", "shell", flag_value="bash") @click.option("--zsh", "shell", flag_value="zsh") diff --git a/annif/cli_util.py b/annif/cli_util.py index 9701aab18..797d62213 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -1,4 +1,5 @@ """Utility functions for Annif CLI commands""" + from __future__ import annotations import collections @@ -10,12 +11,13 @@ import sys import tempfile import zipfile +from fnmatch import fnmatch from typing import TYPE_CHECKING import click import click_log from flask import current_app -from huggingface_hub import HfApi +from huggingface_hub import HfApi, hf_hub_download, list_repo_files from huggingface_hub.utils import HfHubHTTPError, HFValidationError import annif @@ -281,6 +283,46 @@ def upload_to_hf_hub(fileobj, filename, repo_id, token, commit_message): raise OperationFailedException(str(err)) +def get_selected_project_ids_from_hf_hub(project_ids_pattern, repo_id, token, revision): + all_repo_file_paths = _list_files_in_hf_hub(repo_id, token, revision) + return [ + path.rsplit(".zip")[0].split("projects/")[1] # TODO Try-catch this + for path in all_repo_file_paths + if fnmatch(path, f"projects/{project_ids_pattern}.zip") + ] + + +def _list_files_in_hf_hub(repo_id, token, revision): + return [ + repofile + for repofile in list_repo_files(repo_id=repo_id, token=token, revision=revision) + ] + + +def download_from_hf_hub(filename, repo_id, token, revision): + try: + return hf_hub_download( + repo_id=repo_id, + filename=filename, + token=token, + revision=revision, + ) + except (HfHubHTTPError, HFValidationError) as err: + raise OperationFailedException(str(err)) + + +def unzip(source_path): + with zipfile.ZipFile(source_path, "r") as zfile: + zfile.extractall() # TODO Disallow overwrite + + +def get_vocab_id(config_path): + config = configparser.ConfigParser() + config.read(config_path) + section = config.sections()[0] + return config[section]["vocab"] + + def _get_completion_choices( param: Argument, ) -> dict[str, AnnifVocabulary] | dict[str, AnnifProject] | list: From 038d86dcd954a23ff8a29902c70a4434df7e2d62 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Thu, 29 Feb 2024 15:07:50 +0200 Subject: [PATCH 06/38] Upgrade to huggingface-hub 0.21.* --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e01b4afce..6487ea274 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ python-dateutil = "2.8.*" tomli = { version = "2.0.*", python = "<3.11" } simplemma = "0.9.*" jsonschema = "4.17.*" -huggingface-hub = "0.20.*" +huggingface-hub = "0.21.*" fasttext-wheel = {version = "0.9.2", optional = true} voikko = {version = "0.5.*", optional = true} From 5afb251f825c34ef3434318e0bdded999a99a889 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Thu, 29 Feb 2024 15:10:18 +0200 Subject: [PATCH 07/38] Drop -projects part from upload/download CLI commands --- annif/cli.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index dc25a1477..196579551 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -584,7 +584,7 @@ def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_fi click.echo("---") -@cli.command("upload-projects") +@cli.command("upload") @click.argument("project_ids_pattern") @click.argument("repo_id") @click.option( @@ -597,9 +597,9 @@ def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_fi help="""The summary / title / first line of the generated commit.""", ) @cli_util.common_options -def run_upload_projects(project_ids_pattern, repo_id, token, commit_message): +def run_upload(project_ids_pattern, repo_id, token, commit_message): """ - Upload selected projects to a Hugging Face Hub repository + Upload selected projects and their vocabularies to a Hugging Face Hub repository \f This command zips the project directories and vocabularies of the projects that match the given `project_ids_pattern`, and uploads the archives along @@ -636,7 +636,7 @@ def run_upload_projects(project_ids_pattern, repo_id, token, commit_message): fobj.close() -@cli.command("download-projects") +@cli.command("download") @click.argument("project_ids_pattern") @click.argument("repo_id") @click.option( @@ -652,9 +652,9 @@ def run_upload_projects(project_ids_pattern, repo_id, token, commit_message): """, ) @cli_util.common_options -def run_download_projects(project_ids_pattern, repo_id, token, revision): +def run_download(project_ids_pattern, repo_id, token, revision): """ - Download selected projects from a Hugging Face Hub repository + Download selected projects and their vocabularies from a Hugging Face Hub repository \f This command downloads the project and vocabulary archives and the configuration files of the projects that match the given @@ -667,7 +667,7 @@ def run_download_projects(project_ids_pattern, repo_id, token, revision): project_ids = cli_util.get_selected_project_ids_from_hf_hub( project_ids_pattern, repo_id, token, revision ) - click.echo(f"Downding project(s): {', '.join(project_ids)}") + click.echo(f"Downloading project(s): {', '.join(project_ids)}") if not os.path.isdir("projects.d"): os.mkdir("projects.d") From 13191fc3104830f3b9048a7bd474cbc60617e858 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Thu, 29 Feb 2024 15:40:05 +0200 Subject: [PATCH 08/38] Speed up CLI startup by moving imports in functions --- annif/cli_util.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/annif/cli_util.py b/annif/cli_util.py index 797d62213..d26ff1ba7 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -17,8 +17,6 @@ import click import click_log from flask import current_app -from huggingface_hub import HfApi, hf_hub_download, list_repo_files -from huggingface_hub.utils import HfHubHTTPError, HFValidationError import annif from annif.exception import ConfigurationException, OperationFailedException @@ -270,6 +268,9 @@ def write_config(project): def upload_to_hf_hub(fileobj, filename, repo_id, token, commit_message): + from huggingface_hub import HfApi + from huggingface_hub.utils import HfHubHTTPError, HFValidationError + api = HfApi() try: api.upload_file( @@ -293,6 +294,8 @@ def get_selected_project_ids_from_hf_hub(project_ids_pattern, repo_id, token, re def _list_files_in_hf_hub(repo_id, token, revision): + from huggingface_hub import list_repo_files + return [ repofile for repofile in list_repo_files(repo_id=repo_id, token=token, revision=revision) @@ -300,6 +303,9 @@ def _list_files_in_hf_hub(repo_id, token, revision): def download_from_hf_hub(filename, repo_id, token, revision): + from huggingface_hub import hf_hub_download + from huggingface_hub.utils import HfHubHTTPError, HFValidationError + try: return hf_hub_download( repo_id=repo_id, From 7666de8a9a0dcb7d575c0ae7ae285f0d94244648 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Fri, 1 Mar 2024 15:46:29 +0200 Subject: [PATCH 09/38] Add --force option to allow overwrite local contents on download --- annif/cli.py | 16 ++++++++++----- annif/cli_util.py | 50 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 58 insertions(+), 8 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index 196579551..673cd0c75 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -6,7 +6,6 @@ import json import os.path import re -import shutil import sys from fnmatch import fnmatch @@ -651,8 +650,15 @@ def run_upload(project_ids_pattern, repo_id, token, commit_message): hash. """, ) +@click.option( + "--force", + "-f", + default=False, + is_flag=True, + help="Replace an existing project/vocabulary/config with the downloaded one", +) @cli_util.common_options -def run_download(project_ids_pattern, repo_id, token, revision): +def run_download(project_ids_pattern, repo_id, token, revision, force): """ Download selected projects and their vocabularies from a Hugging Face Hub repository \f @@ -676,18 +682,18 @@ def run_download(project_ids_pattern, repo_id, token, revision): project_zip_local_cache_path = cli_util.download_from_hf_hub( f"projects/{project_id}.zip", repo_id, token, revision ) - cli_util.unzip(project_zip_local_cache_path) + cli_util.unzip(project_zip_local_cache_path, force) local_config_cache_path = cli_util.download_from_hf_hub( f"{project_id}.cfg", repo_id, token, revision ) vocab_ids.add(cli_util.get_vocab_id(local_config_cache_path)) - shutil.copy(local_config_cache_path, "projects.d") # TODO Disallow overwrite + cli_util.move_project_config(local_config_cache_path, force) for vocab_id in vocab_ids: vocab_zip_local_cache_path = cli_util.download_from_hf_hub( f"vocabs/{vocab_id}.zip", repo_id, token, revision ) - cli_util.unzip(vocab_zip_local_cache_path) + cli_util.unzip(vocab_zip_local_cache_path, force) @cli.command("completion") diff --git a/annif/cli_util.py b/annif/cli_util.py index d26ff1ba7..d8a83906f 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -2,12 +2,14 @@ from __future__ import annotations +import binascii import collections import configparser import io import itertools import os import pathlib +import shutil import sys import tempfile import zipfile @@ -317,9 +319,51 @@ def download_from_hf_hub(filename, repo_id, token, revision): raise OperationFailedException(str(err)) -def unzip(source_path): - with zipfile.ZipFile(source_path, "r") as zfile: - zfile.extractall() # TODO Disallow overwrite +def unzip(src_path, force): + with zipfile.ZipFile(src_path, "r") as zfile: + for member in zfile.infolist(): + if os.path.exists(member.filename) and not force: + if _is_existing_identical(member): + logger.debug( + f"Skipping unzip of {member.filename}; already in place" + ) + else: + click.echo( + f"Not overwriting {member.filename} (use --force to override)" + ) + else: + logger.debug(f"Unzipping {member.filename}") + zfile.extract(member) + + +def move_project_config(src_path, force): + dst_path = os.path.join("projects.d", os.path.basename(src_path)) + if os.path.exists(dst_path) and not force: + if _compute_crc32(dst_path) == _compute_crc32(src_path): + logger.debug( + f"Skipping move of {os.path.basename(src_path)}; already in place" + ) + else: + click.echo(f"Not overwriting {dst_path} (use --force to override)") + else: + shutil.copy(src_path, dst_path) + + +def _is_existing_identical(member): + file_crc = _compute_crc32(member.filename) + return file_crc == member.CRC + + +def _compute_crc32(path): + if os.path.isdir(path): + return 0 + + size = 1024 * 1024 * 10 # 10 MiB chunks + with open(path, "rb") as fp: + crcval = 0 + while chunk := fp.read(size): + crcval = binascii.crc32(chunk, crcval) + return crcval def get_vocab_id(config_path): From 301d787b9ea0d7ba9d16b30c7871f333a11544ef Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Fri, 1 Mar 2024 16:08:07 +0200 Subject: [PATCH 10/38] Resolve CodeQL complaint about imports --- annif/cli_util.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/annif/cli_util.py b/annif/cli_util.py index d8a83906f..fc3027413 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -26,7 +26,6 @@ if TYPE_CHECKING: from datetime import datetime - from io import TextIOWrapper from click.core import Argument, Context, Option @@ -193,7 +192,7 @@ def show_hits( hits: SuggestionResult, project: AnnifProject, lang: str, - file: TextIOWrapper | None = None, + file: io.TextIOWrapper | None = None, ) -> None: """ Print subject suggestions to the console or a file. The suggestions are displayed as From d5b4abea1586a3b0520cce061d52588788f92db6 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Mon, 4 Mar 2024 09:22:03 +0200 Subject: [PATCH 11/38] Restore datafile timestamps after unzipping --- annif/cli_util.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/annif/cli_util.py b/annif/cli_util.py index fc3027413..07d4d051c 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -12,6 +12,7 @@ import shutil import sys import tempfile +import time import zipfile from fnmatch import fnmatch from typing import TYPE_CHECKING @@ -333,6 +334,8 @@ def unzip(src_path, force): else: logger.debug(f"Unzipping {member.filename}") zfile.extract(member) + date_time = time.mktime(member.date_time + (0, 0, -1)) + os.utime(member.filename, (date_time, date_time)) def move_project_config(src_path, force): From a1e7605676942caed4add986b6e2aed41e0a2994 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Mon, 4 Mar 2024 12:37:33 +0200 Subject: [PATCH 12/38] Add comment to zip file with used Annif version --- annif/cli_util.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/annif/cli_util.py b/annif/cli_util.py index 07d4d051c..92d2265dd 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -5,6 +5,7 @@ import binascii import collections import configparser +import importlib import io import itertools import os @@ -252,6 +253,10 @@ def archive_dir(data_dir): path = pathlib.Path(data_dir) fpaths = [fpath for fpath in path.glob("**/*") if not _is_train_file(fpath.name)] with zipfile.ZipFile(fp, mode="w") as zfile: + zfile.comment = bytes( + f"Archived by Annif {importlib.metadata.version('annif')}", + encoding="utf-8", + ) for fpath in fpaths: logger.debug(f"Adding {fpath}") zfile.write(fpath) @@ -321,6 +326,10 @@ def download_from_hf_hub(filename, repo_id, token, revision): def unzip(src_path, force): with zipfile.ZipFile(src_path, "r") as zfile: + logger.debug( + f"Extracting archive {src_path}; archive comment: " + f"\"{str(zfile.comment, encoding='utf-8')}\"" + ) for member in zfile.infolist(): if os.path.exists(member.filename) and not force: if _is_existing_identical(member): From 25a46dc0bc7896b2c14266b5ce56ae14c0b07694 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Mon, 4 Mar 2024 15:33:48 +0200 Subject: [PATCH 13/38] Catch HFH Errors in listing files in repo --- annif/cli_util.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/annif/cli_util.py b/annif/cli_util.py index 92d2265dd..5e4a9b62c 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -302,11 +302,17 @@ def get_selected_project_ids_from_hf_hub(project_ids_pattern, repo_id, token, re def _list_files_in_hf_hub(repo_id, token, revision): from huggingface_hub import list_repo_files + from huggingface_hub.utils import HfHubHTTPError, HFValidationError - return [ - repofile - for repofile in list_repo_files(repo_id=repo_id, token=token, revision=revision) - ] + try: + return [ + repofile + for repofile in list_repo_files( + repo_id=repo_id, token=token, revision=revision + ) + ] + except (HfHubHTTPError, HFValidationError) as err: + raise OperationFailedException(str(err)) def download_from_hf_hub(filename, repo_id, token, revision): From 86714d8b6304fc701110b8d593f3c46122491c94 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Wed, 6 Mar 2024 14:34:40 +0200 Subject: [PATCH 14/38] Unzip archive contents to used DATADIR --- annif/cli_util.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/annif/cli_util.py b/annif/cli_util.py index 5e4a9b62c..394475535 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -259,7 +259,8 @@ def archive_dir(data_dir): ) for fpath in fpaths: logger.debug(f"Adding {fpath}") - zfile.write(fpath) + arcname = os.path.join(*fpath.parts[1:]) + zfile.write(fpath, arcname=arcname) fp.seek(0) return fp @@ -331,44 +332,43 @@ def download_from_hf_hub(filename, repo_id, token, revision): def unzip(src_path, force): + datadir = current_app.config["DATADIR"] with zipfile.ZipFile(src_path, "r") as zfile: logger.debug( f"Extracting archive {src_path}; archive comment: " f"\"{str(zfile.comment, encoding='utf-8')}\"" ) for member in zfile.infolist(): - if os.path.exists(member.filename) and not force: - if _is_existing_identical(member): - logger.debug( - f"Skipping unzip of {member.filename}; already in place" - ) + dest_path = os.path.join(datadir, member.filename) + if os.path.exists(dest_path) and not force: + if _are_identical(member, dest_path): + logger.debug(f"Skipping unzip to {dest_path}; already in place") else: - click.echo( - f"Not overwriting {member.filename} (use --force to override)" - ) + click.echo(f"Not overwriting {dest_path} (use --force to override)") else: - logger.debug(f"Unzipping {member.filename}") - zfile.extract(member) + logger.debug(f"Unzipping to {dest_path}") + zfile.extract(member, path=datadir) date_time = time.mktime(member.date_time + (0, 0, -1)) - os.utime(member.filename, (date_time, date_time)) + os.utime(dest_path, (date_time, date_time)) def move_project_config(src_path, force): - dst_path = os.path.join("projects.d", os.path.basename(src_path)) - if os.path.exists(dst_path) and not force: - if _compute_crc32(dst_path) == _compute_crc32(src_path): + dest_path = os.path.join("projects.d", os.path.basename(src_path)) + if os.path.exists(dest_path) and not force: + if _compute_crc32(dest_path) == _compute_crc32(src_path): logger.debug( f"Skipping move of {os.path.basename(src_path)}; already in place" ) else: - click.echo(f"Not overwriting {dst_path} (use --force to override)") + click.echo(f"Not overwriting {dest_path} (use --force to override)") else: - shutil.copy(src_path, dst_path) + logger.debug(f"Moving to {dest_path}") + shutil.copy(src_path, dest_path) -def _is_existing_identical(member): - file_crc = _compute_crc32(member.filename) - return file_crc == member.CRC +def _are_identical(member, dest_path): + path_crc = _compute_crc32(dest_path) + return path_crc == member.CRC def _compute_crc32(path): From 6ba1e081a73e86f5812a88093ffb685ecaa2f98b Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Thu, 7 Mar 2024 11:50:18 +0200 Subject: [PATCH 15/38] Add tests --- tests/huggingface-cache/dummy-en.cfg | 7 + tests/huggingface-cache/dummy-fi.cfg | 8 + tests/huggingface-cache/projects/dummy-en.zip | Bin 0 -> 366 bytes tests/huggingface-cache/projects/dummy-fi.zip | Bin 0 -> 366 bytes tests/huggingface-cache/vocabs/dummy.zip | Bin 0 -> 2378 bytes tests/test_cli.py | 303 +++++++++++++++++- 6 files changed, 317 insertions(+), 1 deletion(-) create mode 100644 tests/huggingface-cache/dummy-en.cfg create mode 100644 tests/huggingface-cache/dummy-fi.cfg create mode 100644 tests/huggingface-cache/projects/dummy-en.zip create mode 100644 tests/huggingface-cache/projects/dummy-fi.zip create mode 100644 tests/huggingface-cache/vocabs/dummy.zip diff --git a/tests/huggingface-cache/dummy-en.cfg b/tests/huggingface-cache/dummy-en.cfg new file mode 100644 index 000000000..58398e8d0 --- /dev/null +++ b/tests/huggingface-cache/dummy-en.cfg @@ -0,0 +1,7 @@ +[dummy-en] +name=Dummy English +language=en +backend=dummy +analyzer=snowball(english) +vocab=dummy +access=hidden diff --git a/tests/huggingface-cache/dummy-fi.cfg b/tests/huggingface-cache/dummy-fi.cfg new file mode 100644 index 000000000..4d996f9b6 --- /dev/null +++ b/tests/huggingface-cache/dummy-fi.cfg @@ -0,0 +1,8 @@ +[dummy-fi] +name=Dummy Finnish +language=fi +backend=dummy +analyzer=snowball(finnish) +key=value +vocab=dummy +access=public diff --git a/tests/huggingface-cache/projects/dummy-en.zip b/tests/huggingface-cache/projects/dummy-en.zip new file mode 100644 index 0000000000000000000000000000000000000000..5325bf5277f92a5334b990df16177134f230407b GIT binary patch literal 366 zcmWIWW@h1H0D+CM=@DQCln`Q&VJIld&q_@$Db`OZ&CRXUP0iB}4dG;9-rMss^)3*X zR&X;gvaIA0U|>0LOqU@5Zq^z!v!t+^m6n;4s#j7`0yc9C&^!>vFq4rQSHo literal 0 HcmV?d00001 diff --git a/tests/huggingface-cache/projects/dummy-fi.zip b/tests/huggingface-cache/projects/dummy-fi.zip new file mode 100644 index 0000000000000000000000000000000000000000..3c6f29f4a8102fed294ea0c7b0e77fd25be7d73b GIT binary patch literal 366 zcmWIWW@h1H0D)rf^awBmN(eE?FccK!XQd{W6zivy=H^!Fre*4fhHx@4XXU(1tpVcF z3T_5QmX$mL3@itZ=`sYs&0=6sL^g^+3aeRZnK`L?B^4!LGao$u|Gy2JnT$+w%(z@A z0kuVd;eaEEiOs#N5cguZ6=5!>YhmUxFf=s!U^N%vcA)tnw_}*k$_8=~6Ad2X0Kfq(76bu+ zv^G@$@-xJ<*K-E7x!CCJRz3z1gLA#%;fBRP*aQI(uh8QdWbgBS(#T2q*2!DS?o_uw zx-VwH+rdqUWMo+}LZyxr{xesk>`Y?eZbFIab<#*Wl~9>S4_eK7mqq9f>e1Q01Y<4tRm*JMuio`!R^NysJtlxmB7%z7 z45U`ffH~5dDp$_9pCdv9go7d*MNJ&6s;)y?2(px(7Ig2{sF{Ix2H3JZB(=gDqgQU+ zT&Sq4dja6pt3kGD=wx1Q>t$u9MKMt7zUj(Omr_v^<$ z<#JZK6J!Vqj+U5FWv=-;gTf=yb7x9htXl-~`M*ACaiPnz@^zL?Lf`nf6gc&2ErOEz z%WRC~LYdDMD{cd*8*R@W%?*}Q~K+H0j%P$5@!x|<(2bp(nS5vPP?lk zPebA#kweT<@wmwWMO*X5g>;EF<$>Gs!LYOPK0MYu$%COLUoE)`rUd4qx>Y6TG?`+k zm8@$dXT0lpRBQ2a+1;@V?V3z}NtO&pO9l`yk_uP2R=x zp`!y2_mxAmGV3ERZ`!Wu#r-s`R7wcXEUW(IKuO?dnC_ra!zL z<|vQUfo2h7=3W@fW(g``;{&vmxD7jg%e_@=+yoLfl(z)tC%=hu?ui}@^j+=&D)50l z6*je=w`&5oBSv#r&&Qs)H_dX0`O!c>5Eh^##WayHJ5jysUcuR^HZ8vs>(%XDK4=lFXQW0Bt){9!-qyHJ zuuIVk?!Th|nk2f({m46 zH+)hP5zHtLrk!#B1*!|{R?q#IF3I{wJ(oWn&eE*sc}M4AT_OTy z=}Ig$BKJ>pah48|>pESFr9tF*OBZ43?g91aP}apg?nb&Siy*PmjZk|c$@=yodDN$? z$q)OtI6wHcR_b`}joHhTv?Yw^h{_sc-17G7)gf?bnL3;*i>?!?rp0+9rxqEo zaGbR3w939m6?j9ArCMJ%r4fV3{aH`RVvlN~Kk)2fsLa*OC>sff)_#GusS9!Nq>$6| zXl<>#O=?maQeFXwsJHQx&+pDOya^V9{Fu@>>HnT*UzgcG){m)6lL^~gc=__01T#UW zXskL_Tiy*-UzPgv>tAuYRS{VS=zulHEk}vLN$c2WylMOg9946$wLa1lPjM&G}0= zx6oiwj6fi2YyE!T7TO$x0pqZ=WC&92!S{9^&f(4_apc4ta-#4(K9pcyNbUd0E&b3< z$Ye`Z=lDXoSgh}Haw&28{PYW;Z?<(?K{Xl4!*(gC0Puv|U@>>5rT5pxH8o}@V~{N9 z0dXR8bZdC&BuPm~QA&1#in<3nAwsbF;M9(Q8bQ91xQ(EjUida2}P)66}xgu|NQt{>SIh8O`-6qyLo8pQC@C(EmXP0PcQ| zItKmEd3_w)QB(Jcjr~X2Y|HEu!cp(?U$(;;{4%D`2j+j7GP%Ei>2Q3GV&Y;W002Pt M^^h$-ndjr~U&{#HLjV8( literal 0 HcmV?d00001 diff --git a/tests/test_cli.py b/tests/test_cli.py index 77adeab0f..5efd80918 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2,18 +2,22 @@ import contextlib import importlib +import io import json import os.path import random import re import shutil -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from unittest import mock +import huggingface_hub from click.shell_completion import ShellComplete from click.testing import CliRunner +from huggingface_hub.utils import HFValidationError import annif.cli +import annif.cli_util import annif.parallel runner = CliRunner(env={"ANNIF_CONFIG": "annif.default_config.TestingConfig"}) @@ -1072,6 +1076,303 @@ def test_routes_with_connexion_app(): assert re.search(r"app.home\s+GET\s+\/", result) +@mock.patch("huggingface_hub.HfApi.upload_file") +def test_upload(upload_file): + result = runner.invoke(annif.cli.cli, ["upload", "dummy-fi", "dummy-repo"]) + assert not result.exception + assert huggingface_hub.HfApi.upload_file.call_count == 3 + assert ( + mock.call( + path_or_fileobj=mock.ANY, # io.BufferedRandom object + path_in_repo="data/vocabs/dummy.zip", + repo_id="dummy-repo", + token=None, + commit_message="Upload project(s) dummy-fi with Annif", + ) + in huggingface_hub.HfApi.upload_file.call_args_list + ) + assert ( + mock.call( + path_or_fileobj=mock.ANY, # io.BufferedRandom object + path_in_repo="data/projects/dummy-fi.zip", + repo_id="dummy-repo", + token=None, + commit_message="Upload project(s) dummy-fi with Annif", + ) + in huggingface_hub.HfApi.upload_file.call_args_list + ) + assert ( + mock.call( + path_or_fileobj=mock.ANY, # io.BytesIO object + path_in_repo="dummy-fi.cfg", + repo_id="dummy-repo", + token=None, + commit_message="Upload project(s) dummy-fi with Annif", + ) + in huggingface_hub.HfApi.upload_file.call_args_list + ) + + +@mock.patch("huggingface_hub.HfApi.upload_file") +def test_upload_many(upload_file): + result = runner.invoke(annif.cli.cli, ["upload", "dummy-*", "dummy-repo"]) + assert not result.exception + assert huggingface_hub.HfApi.upload_file.call_count == 11 + + +def test_upload_nonexistent_repo(): + failed_result = runner.invoke(annif.cli.cli, ["upload", "dummy-fi", "nonexistent"]) + assert failed_result.exception + assert failed_result.exit_code != 0 + assert "Repository Not Found for url:" in failed_result.output + + +def test_write_config(app_project): + result = annif.cli_util.write_config(app_project) + assert isinstance(result, io.BytesIO) + string_result = result.read().decode("UTF-8") + assert "[dummy-en]" in string_result + + +def hf_hub_download_mock_side_effect(filename, repo_id, token, revision): + return "tests/huggingface-cache/" + filename # Mocks the downloaded file paths + + +@mock.patch( + "huggingface_hub.list_repo_files", + return_value=[ # Mocks the filenames in repo + "projects/dummy-fi.zip", + "vocabs/dummy.zip", + "dummy-fi.cfg", + "projects/dummy-en.zip", + "vocabs/dummy.zip", + "dummy-.cfg", + ], +) +@mock.patch( + "huggingface_hub.hf_hub_download", + side_effect=hf_hub_download_mock_side_effect, +) +@mock.patch("shutil.copy") # Avoid overwrite files in projects.d/ +def test_download_dummy_fi(copy, hf_hub_download, list_repo_files, testdatadir): + result = runner.invoke( + annif.cli.cli, + [ + "download", + "dummy-fi", + "mock-repo", + ], + ) + assert not result.exception + assert list_repo_files.called + assert hf_hub_download.called + assert hf_hub_download.call_args_list == [ + mock.call( + repo_id="mock-repo", + filename="projects/dummy-fi.zip", + token=None, + revision=None, + ), + mock.call( + repo_id="mock-repo", + filename="dummy-fi.cfg", + token=None, + revision=None, + ), + mock.call( + repo_id="mock-repo", + filename="vocabs/dummy.zip", + token=None, + revision=None, + ), + ] + assert shutil.copy.call_args_list == [ + mock.call("tests/huggingface-cache/dummy-fi.cfg", "projects.d/dummy-fi.cfg") + ] + dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") + fpath = os.path.join(str(dirpath), "file.txt") + assert os.path.exists(fpath) + + +@mock.patch( + "huggingface_hub.list_repo_files", + return_value=[ # Mock filenames in repo + "projects/dummy-fi.zip", + "vocabs/dummy.zip", + "dummy-fi.cfg", + "projects/dummy-en.zip", + "vocabs/dummy.zip", + "dummy-.cfg", + ], +) +@mock.patch( + "huggingface_hub.hf_hub_download", + side_effect=hf_hub_download_mock_side_effect, +) +@mock.patch("shutil.copy") # Avoid overwrite files in projects.d/ +def test_download_dummy_fi_and_en(copy, hf_hub_download, list_repo_files, testdatadir): + result = runner.invoke( + annif.cli.cli, + [ + "download", + "dummy-??", + "mock-repo", + ], + ) + assert not result.exception + assert list_repo_files.called + assert hf_hub_download.called + assert hf_hub_download.call_args_list == [ + mock.call( + repo_id="mock-repo", + filename="projects/dummy-fi.zip", + token=None, + revision=None, + ), + mock.call( + repo_id="mock-repo", + filename="dummy-fi.cfg", + token=None, + revision=None, + ), + mock.call( + repo_id="mock-repo", + filename="projects/dummy-en.zip", + token=None, + revision=None, + ), + mock.call( + repo_id="mock-repo", + filename="dummy-en.cfg", + token=None, + revision=None, + ), + mock.call( + repo_id="mock-repo", + filename="vocabs/dummy.zip", + token=None, + revision=None, + ), + ] + assert shutil.copy.call_args_list == [ + mock.call("tests/huggingface-cache/dummy-fi.cfg", "projects.d/dummy-fi.cfg"), + mock.call("tests/huggingface-cache/dummy-en.cfg", "projects.d/dummy-en.cfg"), + ] + dirpath_fi = os.path.join(str(testdatadir), "projects", "dummy-fi") + fpath_fi = os.path.join(str(dirpath_fi), "file.txt") + assert os.path.exists(fpath_fi) + dirpath_en = os.path.join(str(testdatadir), "projects", "dummy-en") + fpath_en = os.path.join(str(dirpath_en), "file.txt") + assert os.path.exists(fpath_en) + + +@mock.patch( + "huggingface_hub.list_repo_files", + side_effect=HFValidationError, +) +@mock.patch( + "huggingface_hub.hf_hub_download", +) +def test_download_list_repo_files_failed( + hf_hub_download, + list_repo_files, +): + failed_result = runner.invoke( + annif.cli.cli, + [ + "download", + "dummy-fi", + "mock-repo", + ], + ) + assert failed_result.exception + assert failed_result.exit_code != 0 + assert "Error: Operation failed:" in failed_result.output + assert list_repo_files.called + assert not hf_hub_download.called + + +@mock.patch( + "huggingface_hub.list_repo_files", + return_value=[ # Mock filenames in repo + "projects/dummy-fi.zip", + "vocabs/dummy.zip", + "dummy-fi.cfg", + ], +) +@mock.patch( + "huggingface_hub.hf_hub_download", + side_effect=HFValidationError, +) +def test_download_hf_hub_download_failed( + hf_hub_download, + list_repo_files, +): + failed_result = runner.invoke( + annif.cli.cli, + [ + "download", + "dummy-fi", + "mock-repo", + ], + ) + assert failed_result.exception + assert failed_result.exit_code != 0 + assert "Error: Operation failed:" in failed_result.output + assert list_repo_files.called + assert hf_hub_download.called + + +def test_unzip_initial(testdatadir): + dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") + fpath = os.path.join(str(dirpath), "file.txt") + annif.cli_util.unzip( + os.path.join("tests", "huggingface-cache", "projects", "dummy-fi.zip"), + force=False, + ) + assert os.path.exists(fpath) + assert os.path.getsize(fpath) == 0 # Zero content from zip + ts = os.path.getmtime(fpath) + assert datetime.fromtimestamp(ts).astimezone(tz=timezone.utc) == datetime( + 1980, 1, 1, 0, 0 + ).astimezone(tz=timezone.utc) + + +def test_unzip_no_overwrite(testdatadir): + dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") + fpath = os.path.join(str(dirpath), "file.txt") + os.makedirs(dirpath, exist_ok=True) + with open(fpath, "wt") as pf: + print("Existing content", file=pf) + + annif.cli_util.unzip( + os.path.join("tests", "huggingface-cache", "projects", "dummy-fi.zip"), + force=False, + ) + assert os.path.exists(fpath) + assert os.path.getsize(fpath) == 17 # Existing content + assert abs(os.path.getmtime(fpath) - datetime.now().timestamp()) < 1 + + +def test_unzip_overwrite(testdatadir): + dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") + fpath = os.path.join(str(dirpath), "file.txt") + os.makedirs(dirpath, exist_ok=True) + with open(fpath, "wt") as pf: + print("Existing content", file=pf) + + annif.cli_util.unzip( + os.path.join("tests", "huggingface-cache", "projects", "dummy-fi.zip"), + force=True, + ) + assert os.path.exists(fpath) + assert os.path.getsize(fpath) == 0 # Zero content from zip + ts = os.path.getmtime(fpath) + assert datetime.fromtimestamp(ts).astimezone(tz=timezone.utc) == datetime( + 1980, 1, 1, 0, 0 + ).astimezone(tz=timezone.utc) + + def test_completion_script_generation(): result = runner.invoke(annif.cli.cli, ["completion", "--bash"]) assert not result.exception From 4d06be679bd7c7b5152715943de65a2ab219bcb6 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Thu, 7 Mar 2024 17:02:34 +0200 Subject: [PATCH 16/38] Create /.cache/huggingface/ with full access rights in Dockerimage --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index 52198a69f..896bacefd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -51,6 +51,8 @@ RUN groupadd -g 998 annif_user && \ useradd -r -u 998 -g annif_user annif_user && \ chmod -R a+rX /Annif && \ mkdir -p /Annif/tests/data && \ + mkdir -p /.cache/huggingface && \ + chmod -R a+rwX /.cache/huggingface && \ chown -R annif_user:annif_user /annif-projects /Annif/tests/data USER annif_user From 7575ffff72e2f7be4281943d062183328d6d8a02 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Fri, 8 Mar 2024 11:39:45 +0200 Subject: [PATCH 17/38] Fix and improve tests and increase coverage --- tests/test_cli.py | 66 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 5efd80918..e834fe998 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -8,6 +8,7 @@ import random import re import shutil +import zipfile from datetime import datetime, timedelta, timezone from unittest import mock @@ -1127,6 +1128,21 @@ def test_upload_nonexistent_repo(): assert "Repository Not Found for url:" in failed_result.output +def test_archive_dir(testdatadir): + dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") + os.makedirs(dirpath, exist_ok=True) + open(os.path.join(str(dirpath), "foo.txt"), "a").close() + open(os.path.join(str(dirpath), "-train.txt"), "a").close() + + fobj = annif.cli_util.archive_dir(dirpath) + assert isinstance(fobj, io.BufferedRandom) + + with zipfile.ZipFile(fobj, mode="r") as zfile: + archived_files = zfile.namelist() + assert len(archived_files) == 1 + assert os.path.split(archived_files[0])[1] == "foo.txt" + + def test_write_config(app_project): result = annif.cli_util.write_config(app_project) assert isinstance(result, io.BytesIO) @@ -1153,8 +1169,10 @@ def hf_hub_download_mock_side_effect(filename, repo_id, token, revision): "huggingface_hub.hf_hub_download", side_effect=hf_hub_download_mock_side_effect, ) -@mock.patch("shutil.copy") # Avoid overwrite files in projects.d/ -def test_download_dummy_fi(copy, hf_hub_download, list_repo_files, testdatadir): +@mock.patch("annif.cli_util.move_project_config") +def test_download_dummy_fi( + move_project_config, hf_hub_download, list_repo_files, testdatadir +): result = runner.invoke( annif.cli.cli, [ @@ -1186,12 +1204,12 @@ def test_download_dummy_fi(copy, hf_hub_download, list_repo_files, testdatadir): revision=None, ), ] - assert shutil.copy.call_args_list == [ - mock.call("tests/huggingface-cache/dummy-fi.cfg", "projects.d/dummy-fi.cfg") - ] dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") fpath = os.path.join(str(dirpath), "file.txt") assert os.path.exists(fpath) + assert move_project_config.call_args_list == [ + mock.call("tests/huggingface-cache/dummy-fi.cfg", False) + ] @mock.patch( @@ -1209,8 +1227,10 @@ def test_download_dummy_fi(copy, hf_hub_download, list_repo_files, testdatadir): "huggingface_hub.hf_hub_download", side_effect=hf_hub_download_mock_side_effect, ) -@mock.patch("shutil.copy") # Avoid overwrite files in projects.d/ -def test_download_dummy_fi_and_en(copy, hf_hub_download, list_repo_files, testdatadir): +@mock.patch("annif.cli_util.move_project_config") +def test_download_dummy_fi_and_en( + move_project_config, hf_hub_download, list_repo_files, testdatadir +): result = runner.invoke( annif.cli.cli, [ @@ -1254,16 +1274,16 @@ def test_download_dummy_fi_and_en(copy, hf_hub_download, list_repo_files, testda revision=None, ), ] - assert shutil.copy.call_args_list == [ - mock.call("tests/huggingface-cache/dummy-fi.cfg", "projects.d/dummy-fi.cfg"), - mock.call("tests/huggingface-cache/dummy-en.cfg", "projects.d/dummy-en.cfg"), - ] dirpath_fi = os.path.join(str(testdatadir), "projects", "dummy-fi") fpath_fi = os.path.join(str(dirpath_fi), "file.txt") assert os.path.exists(fpath_fi) dirpath_en = os.path.join(str(testdatadir), "projects", "dummy-en") fpath_en = os.path.join(str(dirpath_en), "file.txt") assert os.path.exists(fpath_en) + assert move_project_config.call_args_list == [ + mock.call("tests/huggingface-cache/dummy-fi.cfg", False), + mock.call("tests/huggingface-cache/dummy-en.cfg", False), + ] @mock.patch( @@ -1351,7 +1371,7 @@ def test_unzip_no_overwrite(testdatadir): ) assert os.path.exists(fpath) assert os.path.getsize(fpath) == 17 # Existing content - assert abs(os.path.getmtime(fpath) - datetime.now().timestamp()) < 1 + assert datetime.now().timestamp() - os.path.getmtime(fpath) < 1 def test_unzip_overwrite(testdatadir): @@ -1373,6 +1393,28 @@ def test_unzip_overwrite(testdatadir): ).astimezone(tz=timezone.utc) +@mock.patch("os.path.exists", return_value=True) +@mock.patch("annif.cli_util._compute_crc32", return_value=0) +@mock.patch("shutil.copy") +def test_move_project_config_no_overwrite(copy, _compute_crc32, exists): + annif.cli_util.move_project_config( + os.path.join("tests", "huggingface-cache", "dummy-fi.cfg"), force=False + ) + assert not copy.called + + +@mock.patch("os.path.exists", return_value=True) +@mock.patch("shutil.copy") +def test_move_project_config_overwrite(copy, exists): + annif.cli_util.move_project_config( + os.path.join("tests", "huggingface-cache", "dummy-fi.cfg"), force=True + ) + assert copy.called + assert copy.call_args == mock.call( + "tests/huggingface-cache/dummy-fi.cfg", "projects.d/dummy-fi.cfg" + ) + + def test_completion_script_generation(): result = runner.invoke(annif.cli.cli, ["completion", "--bash"]) assert not result.exception From 16bacfbaf44de6fce482f6854d78b979f13d1cdc Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Fri, 8 Mar 2024 14:18:26 +0200 Subject: [PATCH 18/38] Remove todos --- annif/cli.py | 2 +- annif/cli_util.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index 673cd0c75..7057cac5e 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -623,7 +623,7 @@ def run_upload(project_ids_pattern, repo_id, token, commit_message): data_dirs = project_dirs.union(vocab_dirs) for data_dir in data_dirs: - zip_path = data_dir.split(os.path.sep, 1)[1] + ".zip" # TODO Check this + zip_path = data_dir.split(os.path.sep, 1)[1] + ".zip" fobj = cli_util.archive_dir(data_dir) cli_util.upload_to_hf_hub(fobj, zip_path, repo_id, token, commit_message) fobj.close() diff --git a/annif/cli_util.py b/annif/cli_util.py index 394475535..2635f0c64 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -295,7 +295,7 @@ def upload_to_hf_hub(fileobj, filename, repo_id, token, commit_message): def get_selected_project_ids_from_hf_hub(project_ids_pattern, repo_id, token, revision): all_repo_file_paths = _list_files_in_hf_hub(repo_id, token, revision) return [ - path.rsplit(".zip")[0].split("projects/")[1] # TODO Try-catch this + path.rsplit(".zip")[0].split("projects/")[1] for path in all_repo_file_paths if fnmatch(path, f"projects/{project_ids_pattern}.zip") ] From 2952f640e664608119c95df125f1e254cebdac11 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Fri, 8 Mar 2024 16:52:20 +0200 Subject: [PATCH 19/38] Create /Annif/projects.d/ for tests in Dockerfile --- Dockerfile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 896bacefd..444d18df9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,9 +50,7 @@ RUN annif completion --bash >> /etc/bash.bashrc # Enable tab completion RUN groupadd -g 998 annif_user && \ useradd -r -u 998 -g annif_user annif_user && \ chmod -R a+rX /Annif && \ - mkdir -p /Annif/tests/data && \ - mkdir -p /.cache/huggingface && \ - chmod -R a+rwX /.cache/huggingface && \ + mkdir -p /Annif/tests/data /.cache/huggingface /Annif/projects.d && \ chown -R annif_user:annif_user /annif-projects /Annif/tests/data USER annif_user From ed3cf2ce0773d37ed4cb089f2303afea83bb6452 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Fri, 8 Mar 2024 17:04:59 +0200 Subject: [PATCH 20/38] Refactor to address quality complains; improve names --- annif/cli.py | 35 +++++----------- annif/cli_util.py | 103 +++++++++++++++++++++++++++++++++------------- tests/test_cli.py | 38 ++++++++--------- 3 files changed, 104 insertions(+), 72 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index 7057cac5e..8e9472e3d 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -7,7 +7,6 @@ import os.path import re import sys -from fnmatch import fnmatch import click import click_log @@ -605,11 +604,7 @@ def run_upload(project_ids_pattern, repo_id, token, commit_message): with the projects configuration to the specified Hugging Face Hub repository. An authentication token and commit message can be given with options. """ - projects = [ - proj - for proj in annif.registry.get_projects(min_access=Access.private).values() - if fnmatch(proj.project_id, project_ids_pattern) - ] + projects = cli_util.get_matching_projects(project_ids_pattern) click.echo(f"Uploading project(s): {', '.join([p.project_id for p in projects])}") commit_message = ( @@ -623,16 +618,10 @@ def run_upload(project_ids_pattern, repo_id, token, commit_message): data_dirs = project_dirs.union(vocab_dirs) for data_dir in data_dirs: - zip_path = data_dir.split(os.path.sep, 1)[1] + ".zip" - fobj = cli_util.archive_dir(data_dir) - cli_util.upload_to_hf_hub(fobj, zip_path, repo_id, token, commit_message) - fobj.close() + cli_util.upload_datadir(data_dir, repo_id, token, commit_message) for project in projects: - config_path = project.project_id + ".cfg" - fobj = cli_util.write_config(project) - cli_util.upload_to_hf_hub(fobj, config_path, repo_id, token, commit_message) - fobj.close() + cli_util.upload_config(project, repo_id, token, commit_message) @cli.command("download") @@ -670,30 +659,28 @@ def run_download(project_ids_pattern, repo_id, token, revision, force): be given with options. """ - project_ids = cli_util.get_selected_project_ids_from_hf_hub( + project_ids = cli_util.get_matching_project_ids_from_hf_hub( project_ids_pattern, repo_id, token, revision ) click.echo(f"Downloading project(s): {', '.join(project_ids)}") - if not os.path.isdir("projects.d"): - os.mkdir("projects.d") vocab_ids = set() for project_id in project_ids: - project_zip_local_cache_path = cli_util.download_from_hf_hub( + project_zip_cache_path = cli_util.download_from_hf_hub( f"projects/{project_id}.zip", repo_id, token, revision ) - cli_util.unzip(project_zip_local_cache_path, force) - local_config_cache_path = cli_util.download_from_hf_hub( + cli_util.unzip_archive(project_zip_cache_path, force) + config_file_cache_path = cli_util.download_from_hf_hub( f"{project_id}.cfg", repo_id, token, revision ) - vocab_ids.add(cli_util.get_vocab_id(local_config_cache_path)) - cli_util.move_project_config(local_config_cache_path, force) + vocab_ids.add(cli_util.get_vocab_id_from_config(config_file_cache_path)) + cli_util.copy_project_config(config_file_cache_path, force) for vocab_id in vocab_ids: - vocab_zip_local_cache_path = cli_util.download_from_hf_hub( + vocab_zip_cache_path = cli_util.download_from_hf_hub( f"vocabs/{vocab_id}.zip", repo_id, token, revision ) - cli_util.unzip(vocab_zip_local_cache_path, force) + cli_util.unzip_archive(vocab_zip_cache_path, force) @cli.command("completion") diff --git a/annif/cli_util.py b/annif/cli_util.py index 2635f0c64..f50b83804 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -240,6 +240,35 @@ def generate_filter_params(filter_batch_max_limit: int) -> list[tuple[int, float return list(itertools.product(limits, thresholds)) +def get_matching_projects(pattern): + """ + Get projects that match the given pattern. + """ + return [ + proj + for proj in annif.registry.get_projects(min_access=Access.private).values() + if fnmatch(proj.project_id, pattern) + ] + + +def upload_datadir(data_dir, repo_id, token, commit_message): + """ + Upload a data directory to HuggingFace Hub. + """ + zip_repo_path = data_dir.split(os.path.sep, 1)[1] + ".zip" + with _archive_dir(data_dir) as fobj: + _upload_to_hf_hub(fobj, zip_repo_path, repo_id, token, commit_message) + + +def upload_config(project, repo_id, token, commit_message): + """ + Upload a project configuration to HuggingFace Hub. + """ + config_repo_path = project.project_id + ".cfg" + with _get_project_config(project) as fobj: + _upload_to_hf_hub(fobj, config_repo_path, repo_id, token, commit_message) + + def _is_train_file(fname): train_file_patterns = ("-train", "tmp-") for pat in train_file_patterns: @@ -248,7 +277,7 @@ def _is_train_file(fname): return False -def archive_dir(data_dir): +def _archive_dir(data_dir): fp = tempfile.TemporaryFile() path = pathlib.Path(data_dir) fpaths = [fpath for fpath in path.glob("**/*") if not _is_train_file(fpath.name)] @@ -265,7 +294,7 @@ def archive_dir(data_dir): return fp -def write_config(project): +def _get_project_config(project): fp = tempfile.TemporaryFile(mode="w+t") config = configparser.ConfigParser() config[project.project_id] = project.config @@ -275,7 +304,7 @@ def write_config(project): return io.BytesIO(fp.read().encode("utf8")) -def upload_to_hf_hub(fileobj, filename, repo_id, token, commit_message): +def _upload_to_hf_hub(fileobj, path_in_repo, repo_id, token, commit_message): from huggingface_hub import HfApi from huggingface_hub.utils import HfHubHTTPError, HFValidationError @@ -283,7 +312,7 @@ def upload_to_hf_hub(fileobj, filename, repo_id, token, commit_message): try: api.upload_file( path_or_fileobj=fileobj, - path_in_repo=filename, + path_in_repo=path_in_repo, repo_id=repo_id, token=token, commit_message=commit_message, @@ -292,7 +321,7 @@ def upload_to_hf_hub(fileobj, filename, repo_id, token, commit_message): raise OperationFailedException(str(err)) -def get_selected_project_ids_from_hf_hub(project_ids_pattern, repo_id, token, revision): +def get_matching_project_ids_from_hf_hub(project_ids_pattern, repo_id, token, revision): all_repo_file_paths = _list_files_in_hf_hub(repo_id, token, revision) return [ path.rsplit(".zip")[0].split("projects/")[1] @@ -331,46 +360,62 @@ def download_from_hf_hub(filename, repo_id, token, revision): raise OperationFailedException(str(err)) -def unzip(src_path, force): +def unzip_archive(src_path, force): datadir = current_app.config["DATADIR"] with zipfile.ZipFile(src_path, "r") as zfile: + archive_comment = str(zfile.comment, encoding="utf-8") logger.debug( - f"Extracting archive {src_path}; archive comment: " - f"\"{str(zfile.comment, encoding='utf-8')}\"" + f'Extracting archive {src_path}; archive comment: "{archive_comment}"' ) for member in zfile.infolist(): - dest_path = os.path.join(datadir, member.filename) - if os.path.exists(dest_path) and not force: - if _are_identical(member, dest_path): - logger.debug(f"Skipping unzip to {dest_path}; already in place") - else: - click.echo(f"Not overwriting {dest_path} (use --force to override)") - else: - logger.debug(f"Unzipping to {dest_path}") - zfile.extract(member, path=datadir) - date_time = time.mktime(member.date_time + (0, 0, -1)) - os.utime(dest_path, (date_time, date_time)) + _unzip_member(zfile, member, datadir, force) -def move_project_config(src_path, force): - dest_path = os.path.join("projects.d", os.path.basename(src_path)) +def _unzip_member(zfile, member, datadir, force): + dest_path = os.path.join(datadir, member.filename) if os.path.exists(dest_path) and not force: - if _compute_crc32(dest_path) == _compute_crc32(src_path): - logger.debug( - f"Skipping move of {os.path.basename(src_path)}; already in place" - ) + if _are_identical_member_and_file(member, dest_path): + logger.debug(f"Skipping unzip to {dest_path}; already in place") else: click.echo(f"Not overwriting {dest_path} (use --force to override)") else: - logger.debug(f"Moving to {dest_path}") - shutil.copy(src_path, dest_path) + logger.debug(f"Unzipping to {dest_path}") + zfile.extract(member, path=datadir) + _restore_timestamps(member, dest_path) -def _are_identical(member, dest_path): +def _are_identical_member_and_file(member, dest_path): path_crc = _compute_crc32(dest_path) return path_crc == member.CRC +def _restore_timestamps(member, dest_path): + date_time = time.mktime(member.date_time + (0, 0, -1)) + os.utime(dest_path, (date_time, date_time)) + + +def copy_project_config(src_path, force): + project_configs_dest_dir = "projects.d" + if not os.path.isdir(project_configs_dest_dir): + os.mkdir(project_configs_dest_dir) + + dest_path = os.path.join(project_configs_dest_dir, os.path.basename(src_path)) + if os.path.exists(dest_path) and not force: + if _are_identical_files(src_path, dest_path): + logger.debug(f"Skipping copy to {dest_path}; already in place") + else: + click.echo(f"Not overwriting {dest_path} (use --force to override)") + else: + logger.debug(f"Copying to {dest_path}") + shutil.copy(src_path, dest_path) + + +def _are_identical_files(src_path, dest_path): + src_crc32 = _compute_crc32(src_path) + dest_crc32 = _compute_crc32(dest_path) + return src_crc32 == dest_crc32 + + def _compute_crc32(path): if os.path.isdir(path): return 0 @@ -383,7 +428,7 @@ def _compute_crc32(path): return crcval -def get_vocab_id(config_path): +def get_vocab_id_from_config(config_path): config = configparser.ConfigParser() config.read(config_path) section = config.sections()[0] diff --git a/tests/test_cli.py b/tests/test_cli.py index e834fe998..6fc790b88 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1134,7 +1134,7 @@ def test_archive_dir(testdatadir): open(os.path.join(str(dirpath), "foo.txt"), "a").close() open(os.path.join(str(dirpath), "-train.txt"), "a").close() - fobj = annif.cli_util.archive_dir(dirpath) + fobj = annif.cli_util._archive_dir(dirpath) assert isinstance(fobj, io.BufferedRandom) with zipfile.ZipFile(fobj, mode="r") as zfile: @@ -1143,8 +1143,8 @@ def test_archive_dir(testdatadir): assert os.path.split(archived_files[0])[1] == "foo.txt" -def test_write_config(app_project): - result = annif.cli_util.write_config(app_project) +def test_get_project_config(app_project): + result = annif.cli_util._get_project_config(app_project) assert isinstance(result, io.BytesIO) string_result = result.read().decode("UTF-8") assert "[dummy-en]" in string_result @@ -1169,9 +1169,9 @@ def hf_hub_download_mock_side_effect(filename, repo_id, token, revision): "huggingface_hub.hf_hub_download", side_effect=hf_hub_download_mock_side_effect, ) -@mock.patch("annif.cli_util.move_project_config") +@mock.patch("annif.cli_util.copy_project_config") def test_download_dummy_fi( - move_project_config, hf_hub_download, list_repo_files, testdatadir + copy_project_config, hf_hub_download, list_repo_files, testdatadir ): result = runner.invoke( annif.cli.cli, @@ -1207,7 +1207,7 @@ def test_download_dummy_fi( dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") fpath = os.path.join(str(dirpath), "file.txt") assert os.path.exists(fpath) - assert move_project_config.call_args_list == [ + assert copy_project_config.call_args_list == [ mock.call("tests/huggingface-cache/dummy-fi.cfg", False) ] @@ -1227,9 +1227,9 @@ def test_download_dummy_fi( "huggingface_hub.hf_hub_download", side_effect=hf_hub_download_mock_side_effect, ) -@mock.patch("annif.cli_util.move_project_config") +@mock.patch("annif.cli_util.copy_project_config") def test_download_dummy_fi_and_en( - move_project_config, hf_hub_download, list_repo_files, testdatadir + copy_project_config, hf_hub_download, list_repo_files, testdatadir ): result = runner.invoke( annif.cli.cli, @@ -1280,7 +1280,7 @@ def test_download_dummy_fi_and_en( dirpath_en = os.path.join(str(testdatadir), "projects", "dummy-en") fpath_en = os.path.join(str(dirpath_en), "file.txt") assert os.path.exists(fpath_en) - assert move_project_config.call_args_list == [ + assert copy_project_config.call_args_list == [ mock.call("tests/huggingface-cache/dummy-fi.cfg", False), mock.call("tests/huggingface-cache/dummy-en.cfg", False), ] @@ -1343,10 +1343,10 @@ def test_download_hf_hub_download_failed( assert hf_hub_download.called -def test_unzip_initial(testdatadir): +def test_unzip_archive_initial(testdatadir): dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") fpath = os.path.join(str(dirpath), "file.txt") - annif.cli_util.unzip( + annif.cli_util.unzip_archive( os.path.join("tests", "huggingface-cache", "projects", "dummy-fi.zip"), force=False, ) @@ -1358,14 +1358,14 @@ def test_unzip_initial(testdatadir): ).astimezone(tz=timezone.utc) -def test_unzip_no_overwrite(testdatadir): +def test_unzip_archive_no_overwrite(testdatadir): dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") fpath = os.path.join(str(dirpath), "file.txt") os.makedirs(dirpath, exist_ok=True) with open(fpath, "wt") as pf: print("Existing content", file=pf) - annif.cli_util.unzip( + annif.cli_util.unzip_archive( os.path.join("tests", "huggingface-cache", "projects", "dummy-fi.zip"), force=False, ) @@ -1374,14 +1374,14 @@ def test_unzip_no_overwrite(testdatadir): assert datetime.now().timestamp() - os.path.getmtime(fpath) < 1 -def test_unzip_overwrite(testdatadir): +def test_unzip_archive_overwrite(testdatadir): dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") fpath = os.path.join(str(dirpath), "file.txt") os.makedirs(dirpath, exist_ok=True) with open(fpath, "wt") as pf: print("Existing content", file=pf) - annif.cli_util.unzip( + annif.cli_util.unzip_archive( os.path.join("tests", "huggingface-cache", "projects", "dummy-fi.zip"), force=True, ) @@ -1396,8 +1396,8 @@ def test_unzip_overwrite(testdatadir): @mock.patch("os.path.exists", return_value=True) @mock.patch("annif.cli_util._compute_crc32", return_value=0) @mock.patch("shutil.copy") -def test_move_project_config_no_overwrite(copy, _compute_crc32, exists): - annif.cli_util.move_project_config( +def test_copy_project_config_no_overwrite(copy, _compute_crc32, exists): + annif.cli_util.copy_project_config( os.path.join("tests", "huggingface-cache", "dummy-fi.cfg"), force=False ) assert not copy.called @@ -1405,8 +1405,8 @@ def test_move_project_config_no_overwrite(copy, _compute_crc32, exists): @mock.patch("os.path.exists", return_value=True) @mock.patch("shutil.copy") -def test_move_project_config_overwrite(copy, exists): - annif.cli_util.move_project_config( +def test_copy_project_config_overwrite(copy, exists): + annif.cli_util.copy_project_config( os.path.join("tests", "huggingface-cache", "dummy-fi.cfg"), force=True ) assert copy.called From 5b169524907a6b1516693b6d53f172edcc2e7251 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 12 Mar 2024 10:32:45 +0200 Subject: [PATCH 21/38] Add docstrings --- annif/cli_util.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/annif/cli_util.py b/annif/cli_util.py index f50b83804..be2d563ba 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -253,7 +253,7 @@ def get_matching_projects(pattern): def upload_datadir(data_dir, repo_id, token, commit_message): """ - Upload a data directory to HuggingFace Hub. + Upload a data directory to a Hugging Face Hub repository. """ zip_repo_path = data_dir.split(os.path.sep, 1)[1] + ".zip" with _archive_dir(data_dir) as fobj: @@ -262,7 +262,7 @@ def upload_datadir(data_dir, repo_id, token, commit_message): def upload_config(project, repo_id, token, commit_message): """ - Upload a project configuration to HuggingFace Hub. + Upload a project configuration to a Hugging Face Hub repository. """ config_repo_path = project.project_id + ".cfg" with _get_project_config(project) as fobj: @@ -322,6 +322,8 @@ def _upload_to_hf_hub(fileobj, path_in_repo, repo_id, token, commit_message): def get_matching_project_ids_from_hf_hub(project_ids_pattern, repo_id, token, revision): + """Get project IDs of the projects in a Hugging Face Model Hub repository that match + the given pattern.""" all_repo_file_paths = _list_files_in_hf_hub(repo_id, token, revision) return [ path.rsplit(".zip")[0].split("projects/")[1] @@ -361,6 +363,8 @@ def download_from_hf_hub(filename, repo_id, token, revision): def unzip_archive(src_path, force): + """Unzip a zip archive of projects and vocabularies to a directory, by + default data/ under current directory.""" datadir = current_app.config["DATADIR"] with zipfile.ZipFile(src_path, "r") as zfile: archive_comment = str(zfile.comment, encoding="utf-8") @@ -395,6 +399,7 @@ def _restore_timestamps(member, dest_path): def copy_project_config(src_path, force): + """Copy a given project configuration file to projects.d/ directory.""" project_configs_dest_dir = "projects.d" if not os.path.isdir(project_configs_dest_dir): os.mkdir(project_configs_dest_dir) @@ -429,6 +434,7 @@ def _compute_crc32(path): def get_vocab_id_from_config(config_path): + """Get the vocabulary ID from a configuration file.""" config = configparser.ConfigParser() config.read(config_path) section = config.sections()[0] From c87675c44f83f718d3265885d0e0b382e73108bd Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 12 Mar 2024 10:45:08 +0200 Subject: [PATCH 22/38] Add type hints --- annif/cli_util.py | 52 +++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/annif/cli_util.py b/annif/cli_util.py index be2d563ba..2a59d2f85 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -240,7 +240,7 @@ def generate_filter_params(filter_batch_max_limit: int) -> list[tuple[int, float return list(itertools.product(limits, thresholds)) -def get_matching_projects(pattern): +def get_matching_projects(pattern: str) -> list[AnnifProject]: """ Get projects that match the given pattern. """ @@ -251,7 +251,9 @@ def get_matching_projects(pattern): ] -def upload_datadir(data_dir, repo_id, token, commit_message): +def upload_datadir( + data_dir: str, repo_id: str, token: str, commit_message: str +) -> None: """ Upload a data directory to a Hugging Face Hub repository. """ @@ -260,7 +262,9 @@ def upload_datadir(data_dir, repo_id, token, commit_message): _upload_to_hf_hub(fobj, zip_repo_path, repo_id, token, commit_message) -def upload_config(project, repo_id, token, commit_message): +def upload_config( + project: AnnifProject, repo_id: str, token: str, commit_message: str +) -> None: """ Upload a project configuration to a Hugging Face Hub repository. """ @@ -269,7 +273,7 @@ def upload_config(project, repo_id, token, commit_message): _upload_to_hf_hub(fobj, config_repo_path, repo_id, token, commit_message) -def _is_train_file(fname): +def _is_train_file(fname: str) -> bool: train_file_patterns = ("-train", "tmp-") for pat in train_file_patterns: if pat in fname: @@ -277,7 +281,7 @@ def _is_train_file(fname): return False -def _archive_dir(data_dir): +def _archive_dir(data_dir: str) -> io.BufferedRandom: fp = tempfile.TemporaryFile() path = pathlib.Path(data_dir) fpaths = [fpath for fpath in path.glob("**/*") if not _is_train_file(fpath.name)] @@ -294,7 +298,7 @@ def _archive_dir(data_dir): return fp -def _get_project_config(project): +def _get_project_config(project: AnnifProject) -> io.BytesIO: fp = tempfile.TemporaryFile(mode="w+t") config = configparser.ConfigParser() config[project.project_id] = project.config @@ -304,7 +308,13 @@ def _get_project_config(project): return io.BytesIO(fp.read().encode("utf8")) -def _upload_to_hf_hub(fileobj, path_in_repo, repo_id, token, commit_message): +def _upload_to_hf_hub( + fileobj: io.BytesIO | io.BufferedRandom, + path_in_repo: str, + repo_id: str, + token: str, + commit_message: str, +) -> None: from huggingface_hub import HfApi from huggingface_hub.utils import HfHubHTTPError, HFValidationError @@ -321,7 +331,9 @@ def _upload_to_hf_hub(fileobj, path_in_repo, repo_id, token, commit_message): raise OperationFailedException(str(err)) -def get_matching_project_ids_from_hf_hub(project_ids_pattern, repo_id, token, revision): +def get_matching_project_ids_from_hf_hub( + project_ids_pattern: str, repo_id: str, token, revision: str +) -> list[str]: """Get project IDs of the projects in a Hugging Face Model Hub repository that match the given pattern.""" all_repo_file_paths = _list_files_in_hf_hub(repo_id, token, revision) @@ -332,7 +344,7 @@ def get_matching_project_ids_from_hf_hub(project_ids_pattern, repo_id, token, re ] -def _list_files_in_hf_hub(repo_id, token, revision): +def _list_files_in_hf_hub(repo_id: str, token: str, revision: str) -> list[str]: from huggingface_hub import list_repo_files from huggingface_hub.utils import HfHubHTTPError, HFValidationError @@ -347,7 +359,9 @@ def _list_files_in_hf_hub(repo_id, token, revision): raise OperationFailedException(str(err)) -def download_from_hf_hub(filename, repo_id, token, revision): +def download_from_hf_hub( + filename: str, repo_id: str, token: str, revision: str +) -> list[str]: from huggingface_hub import hf_hub_download from huggingface_hub.utils import HfHubHTTPError, HFValidationError @@ -362,7 +376,7 @@ def download_from_hf_hub(filename, repo_id, token, revision): raise OperationFailedException(str(err)) -def unzip_archive(src_path, force): +def unzip_archive(src_path: str, force: bool) -> None: """Unzip a zip archive of projects and vocabularies to a directory, by default data/ under current directory.""" datadir = current_app.config["DATADIR"] @@ -375,7 +389,9 @@ def unzip_archive(src_path, force): _unzip_member(zfile, member, datadir, force) -def _unzip_member(zfile, member, datadir, force): +def _unzip_member( + zfile: zipfile.ZipFile, member: zipfile.ZipInfo, datadir: str, force: bool +) -> None: dest_path = os.path.join(datadir, member.filename) if os.path.exists(dest_path) and not force: if _are_identical_member_and_file(member, dest_path): @@ -388,17 +404,17 @@ def _unzip_member(zfile, member, datadir, force): _restore_timestamps(member, dest_path) -def _are_identical_member_and_file(member, dest_path): +def _are_identical_member_and_file(member: zipfile.ZipInfo, dest_path: str) -> bool: path_crc = _compute_crc32(dest_path) return path_crc == member.CRC -def _restore_timestamps(member, dest_path): +def _restore_timestamps(member: zipfile.ZipInfo, dest_path: str) -> None: date_time = time.mktime(member.date_time + (0, 0, -1)) os.utime(dest_path, (date_time, date_time)) -def copy_project_config(src_path, force): +def copy_project_config(src_path: str, force: bool) -> None: """Copy a given project configuration file to projects.d/ directory.""" project_configs_dest_dir = "projects.d" if not os.path.isdir(project_configs_dest_dir): @@ -415,13 +431,13 @@ def copy_project_config(src_path, force): shutil.copy(src_path, dest_path) -def _are_identical_files(src_path, dest_path): +def _are_identical_files(src_path: str, dest_path: str) -> bool: src_crc32 = _compute_crc32(src_path) dest_crc32 = _compute_crc32(dest_path) return src_crc32 == dest_crc32 -def _compute_crc32(path): +def _compute_crc32(path: str) -> int: if os.path.isdir(path): return 0 @@ -433,7 +449,7 @@ def _compute_crc32(path): return crcval -def get_vocab_id_from_config(config_path): +def get_vocab_id_from_config(config_path: str) -> str: """Get the vocabulary ID from a configuration file.""" config = configparser.ConfigParser() config.read(config_path) From 2fe5b739085e159c2fc3320db85f1ac6f9832605 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 12 Mar 2024 11:10:08 +0200 Subject: [PATCH 23/38] Update RTD CLI commands page --- annif/cli.py | 12 +++++++----- docs/source/commands.rst | 11 +++++++++-- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index 8e9472e3d..b12dfe48c 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -597,12 +597,13 @@ def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_fi @cli_util.common_options def run_upload(project_ids_pattern, repo_id, token, commit_message): """ - Upload selected projects and their vocabularies to a Hugging Face Hub repository + Upload selected projects and their vocabularies to a Hugging Face Hub repository. \f This command zips the project directories and vocabularies of the projects - that match the given `project_ids_pattern`, and uploads the archives along - with the projects configuration to the specified Hugging Face Hub repository. - An authentication token and commit message can be given with options. + that match the given `project_ids_pattern` to archive files, and uploads the + archives along with the project configurations to the specified Hugging Face + Hub repository. An authentication token and commit message can be given with + options. """ projects = cli_util.get_matching_projects(project_ids_pattern) click.echo(f"Uploading project(s): {', '.join([p.project_id for p in projects])}") @@ -649,7 +650,8 @@ def run_upload(project_ids_pattern, repo_id, token, commit_message): @cli_util.common_options def run_download(project_ids_pattern, repo_id, token, revision, force): """ - Download selected projects and their vocabularies from a Hugging Face Hub repository + Download selected projects and their vocabularies from a Hugging Face Hub + repository. \f This command downloads the project and vocabulary archives and the configuration files of the projects that match the given diff --git a/docs/source/commands.rst b/docs/source/commands.rst index 6a762828e..a5ae2f46f 100644 --- a/docs/source/commands.rst +++ b/docs/source/commands.rst @@ -66,8 +66,15 @@ Project administration N/A -.. click:: annif.cli:run_upload_projects - :prog: annif upload-projects +.. click:: annif.cli:run_upload + :prog: annif upload + +**REST equivalent** + + N/A + +.. click:: annif.cli:run_download + :prog: annif download **REST equivalent** From d7be137c7da1d907d59ca4a9992754c0dd6a0653 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Wed, 13 Mar 2024 15:20:58 +0200 Subject: [PATCH 24/38] Remove --revision option of download command --- annif/cli.py | 23 +++++++++-------------- annif/cli_util.py | 18 +++++------------- tests/test_cli.py | 10 +--------- 3 files changed, 15 insertions(+), 36 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index b12dfe48c..925f74a87 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -633,13 +633,6 @@ def run_upload(project_ids_pattern, repo_id, token, commit_message): help="""Authentication token, obtained from the Hugging Face Hub. Will default to the stored token.""", ) -@click.option( - "--revision", - help=""" - An optional Git revision id which can be a branch name, a tag, or a commit - hash. - """, -) @click.option( "--force", "-f", @@ -648,7 +641,7 @@ def run_upload(project_ids_pattern, repo_id, token, commit_message): help="Replace an existing project/vocabulary/config with the downloaded one", ) @cli_util.common_options -def run_download(project_ids_pattern, repo_id, token, revision, force): +def run_download(project_ids_pattern, repo_id, token, force): """ Download selected projects and their vocabularies from a Hugging Face Hub repository. @@ -657,30 +650,32 @@ def run_download(project_ids_pattern, repo_id, token, revision, force): configuration files of the projects that match the given `project_ids_pattern` from the specified Hugging Face Hub repository and unzips the archives to `data/` directory and places the configuration files - to `projects.d/` directory. An authentication token and revision can - be given with options. + to `projects.d/` directory. An authentication token can be given with + `--token` option. """ project_ids = cli_util.get_matching_project_ids_from_hf_hub( - project_ids_pattern, repo_id, token, revision + project_ids_pattern, + repo_id, + token, ) click.echo(f"Downloading project(s): {', '.join(project_ids)}") vocab_ids = set() for project_id in project_ids: project_zip_cache_path = cli_util.download_from_hf_hub( - f"projects/{project_id}.zip", repo_id, token, revision + f"projects/{project_id}.zip", repo_id, token ) cli_util.unzip_archive(project_zip_cache_path, force) config_file_cache_path = cli_util.download_from_hf_hub( - f"{project_id}.cfg", repo_id, token, revision + f"{project_id}.cfg", repo_id, token ) vocab_ids.add(cli_util.get_vocab_id_from_config(config_file_cache_path)) cli_util.copy_project_config(config_file_cache_path, force) for vocab_id in vocab_ids: vocab_zip_cache_path = cli_util.download_from_hf_hub( - f"vocabs/{vocab_id}.zip", repo_id, token, revision + f"vocabs/{vocab_id}.zip", repo_id, token ) cli_util.unzip_archive(vocab_zip_cache_path, force) diff --git a/annif/cli_util.py b/annif/cli_util.py index 2a59d2f85..3e43a34ba 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -332,11 +332,11 @@ def _upload_to_hf_hub( def get_matching_project_ids_from_hf_hub( - project_ids_pattern: str, repo_id: str, token, revision: str + project_ids_pattern: str, repo_id: str, token ) -> list[str]: """Get project IDs of the projects in a Hugging Face Model Hub repository that match the given pattern.""" - all_repo_file_paths = _list_files_in_hf_hub(repo_id, token, revision) + all_repo_file_paths = _list_files_in_hf_hub(repo_id, token) return [ path.rsplit(".zip")[0].split("projects/")[1] for path in all_repo_file_paths @@ -344,24 +344,17 @@ def get_matching_project_ids_from_hf_hub( ] -def _list_files_in_hf_hub(repo_id: str, token: str, revision: str) -> list[str]: +def _list_files_in_hf_hub(repo_id: str, token: str) -> list[str]: from huggingface_hub import list_repo_files from huggingface_hub.utils import HfHubHTTPError, HFValidationError try: - return [ - repofile - for repofile in list_repo_files( - repo_id=repo_id, token=token, revision=revision - ) - ] + return [repofile for repofile in list_repo_files(repo_id=repo_id, token=token)] except (HfHubHTTPError, HFValidationError) as err: raise OperationFailedException(str(err)) -def download_from_hf_hub( - filename: str, repo_id: str, token: str, revision: str -) -> list[str]: +def download_from_hf_hub(filename: str, repo_id: str, token: str) -> list[str]: from huggingface_hub import hf_hub_download from huggingface_hub.utils import HfHubHTTPError, HFValidationError @@ -370,7 +363,6 @@ def download_from_hf_hub( repo_id=repo_id, filename=filename, token=token, - revision=revision, ) except (HfHubHTTPError, HFValidationError) as err: raise OperationFailedException(str(err)) diff --git a/tests/test_cli.py b/tests/test_cli.py index 6fc790b88..77a30683e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1150,7 +1150,7 @@ def test_get_project_config(app_project): assert "[dummy-en]" in string_result -def hf_hub_download_mock_side_effect(filename, repo_id, token, revision): +def hf_hub_download_mock_side_effect(filename, repo_id, token): return "tests/huggingface-cache/" + filename # Mocks the downloaded file paths @@ -1189,19 +1189,16 @@ def test_download_dummy_fi( repo_id="mock-repo", filename="projects/dummy-fi.zip", token=None, - revision=None, ), mock.call( repo_id="mock-repo", filename="dummy-fi.cfg", token=None, - revision=None, ), mock.call( repo_id="mock-repo", filename="vocabs/dummy.zip", token=None, - revision=None, ), ] dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") @@ -1247,31 +1244,26 @@ def test_download_dummy_fi_and_en( repo_id="mock-repo", filename="projects/dummy-fi.zip", token=None, - revision=None, ), mock.call( repo_id="mock-repo", filename="dummy-fi.cfg", token=None, - revision=None, ), mock.call( repo_id="mock-repo", filename="projects/dummy-en.zip", token=None, - revision=None, ), mock.call( repo_id="mock-repo", filename="dummy-en.cfg", token=None, - revision=None, ), mock.call( repo_id="mock-repo", filename="vocabs/dummy.zip", token=None, - revision=None, ), ] dirpath_fi = os.path.join(str(testdatadir), "projects", "dummy-fi") From 47f7ee479e519f0f8c0df543828fbd6afb28166f Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Mon, 25 Mar 2024 16:39:11 +0200 Subject: [PATCH 25/38] Upgrade to huggingface-hub 0.22.* --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 23c8151e8..be416961b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ python-dateutil = "2.8.*" tomli = { version = "2.0.*", python = "<3.11" } simplemma = "0.9.*" jsonschema = "4.21.*" -huggingface-hub = "0.21.*" +huggingface-hub = "0.22.*" fasttext-wheel = { version = "0.9.2", optional = true } voikko = { version = "0.5.*", optional = true } From a488d070d1317374ae8dd6db7aeddd03ba33e092 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 26 Mar 2024 18:06:01 +0200 Subject: [PATCH 26/38] Revert "Remove --revision option of download command" This reverts commit d7be137c7da1d907d59ca4a9992754c0dd6a0653. --- annif/cli.py | 23 ++++++++++++++--------- annif/cli_util.py | 18 +++++++++++++----- tests/test_cli.py | 10 +++++++++- 3 files changed, 36 insertions(+), 15 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index 925f74a87..b12dfe48c 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -633,6 +633,13 @@ def run_upload(project_ids_pattern, repo_id, token, commit_message): help="""Authentication token, obtained from the Hugging Face Hub. Will default to the stored token.""", ) +@click.option( + "--revision", + help=""" + An optional Git revision id which can be a branch name, a tag, or a commit + hash. + """, +) @click.option( "--force", "-f", @@ -641,7 +648,7 @@ def run_upload(project_ids_pattern, repo_id, token, commit_message): help="Replace an existing project/vocabulary/config with the downloaded one", ) @cli_util.common_options -def run_download(project_ids_pattern, repo_id, token, force): +def run_download(project_ids_pattern, repo_id, token, revision, force): """ Download selected projects and their vocabularies from a Hugging Face Hub repository. @@ -650,32 +657,30 @@ def run_download(project_ids_pattern, repo_id, token, force): configuration files of the projects that match the given `project_ids_pattern` from the specified Hugging Face Hub repository and unzips the archives to `data/` directory and places the configuration files - to `projects.d/` directory. An authentication token can be given with - `--token` option. + to `projects.d/` directory. An authentication token and revision can + be given with options. """ project_ids = cli_util.get_matching_project_ids_from_hf_hub( - project_ids_pattern, - repo_id, - token, + project_ids_pattern, repo_id, token, revision ) click.echo(f"Downloading project(s): {', '.join(project_ids)}") vocab_ids = set() for project_id in project_ids: project_zip_cache_path = cli_util.download_from_hf_hub( - f"projects/{project_id}.zip", repo_id, token + f"projects/{project_id}.zip", repo_id, token, revision ) cli_util.unzip_archive(project_zip_cache_path, force) config_file_cache_path = cli_util.download_from_hf_hub( - f"{project_id}.cfg", repo_id, token + f"{project_id}.cfg", repo_id, token, revision ) vocab_ids.add(cli_util.get_vocab_id_from_config(config_file_cache_path)) cli_util.copy_project_config(config_file_cache_path, force) for vocab_id in vocab_ids: vocab_zip_cache_path = cli_util.download_from_hf_hub( - f"vocabs/{vocab_id}.zip", repo_id, token + f"vocabs/{vocab_id}.zip", repo_id, token, revision ) cli_util.unzip_archive(vocab_zip_cache_path, force) diff --git a/annif/cli_util.py b/annif/cli_util.py index 3e43a34ba..2a59d2f85 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -332,11 +332,11 @@ def _upload_to_hf_hub( def get_matching_project_ids_from_hf_hub( - project_ids_pattern: str, repo_id: str, token + project_ids_pattern: str, repo_id: str, token, revision: str ) -> list[str]: """Get project IDs of the projects in a Hugging Face Model Hub repository that match the given pattern.""" - all_repo_file_paths = _list_files_in_hf_hub(repo_id, token) + all_repo_file_paths = _list_files_in_hf_hub(repo_id, token, revision) return [ path.rsplit(".zip")[0].split("projects/")[1] for path in all_repo_file_paths @@ -344,17 +344,24 @@ def get_matching_project_ids_from_hf_hub( ] -def _list_files_in_hf_hub(repo_id: str, token: str) -> list[str]: +def _list_files_in_hf_hub(repo_id: str, token: str, revision: str) -> list[str]: from huggingface_hub import list_repo_files from huggingface_hub.utils import HfHubHTTPError, HFValidationError try: - return [repofile for repofile in list_repo_files(repo_id=repo_id, token=token)] + return [ + repofile + for repofile in list_repo_files( + repo_id=repo_id, token=token, revision=revision + ) + ] except (HfHubHTTPError, HFValidationError) as err: raise OperationFailedException(str(err)) -def download_from_hf_hub(filename: str, repo_id: str, token: str) -> list[str]: +def download_from_hf_hub( + filename: str, repo_id: str, token: str, revision: str +) -> list[str]: from huggingface_hub import hf_hub_download from huggingface_hub.utils import HfHubHTTPError, HFValidationError @@ -363,6 +370,7 @@ def download_from_hf_hub(filename: str, repo_id: str, token: str) -> list[str]: repo_id=repo_id, filename=filename, token=token, + revision=revision, ) except (HfHubHTTPError, HFValidationError) as err: raise OperationFailedException(str(err)) diff --git a/tests/test_cli.py b/tests/test_cli.py index 77a30683e..6fc790b88 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1150,7 +1150,7 @@ def test_get_project_config(app_project): assert "[dummy-en]" in string_result -def hf_hub_download_mock_side_effect(filename, repo_id, token): +def hf_hub_download_mock_side_effect(filename, repo_id, token, revision): return "tests/huggingface-cache/" + filename # Mocks the downloaded file paths @@ -1189,16 +1189,19 @@ def test_download_dummy_fi( repo_id="mock-repo", filename="projects/dummy-fi.zip", token=None, + revision=None, ), mock.call( repo_id="mock-repo", filename="dummy-fi.cfg", token=None, + revision=None, ), mock.call( repo_id="mock-repo", filename="vocabs/dummy.zip", token=None, + revision=None, ), ] dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") @@ -1244,26 +1247,31 @@ def test_download_dummy_fi_and_en( repo_id="mock-repo", filename="projects/dummy-fi.zip", token=None, + revision=None, ), mock.call( repo_id="mock-repo", filename="dummy-fi.cfg", token=None, + revision=None, ), mock.call( repo_id="mock-repo", filename="projects/dummy-en.zip", token=None, + revision=None, ), mock.call( repo_id="mock-repo", filename="dummy-en.cfg", token=None, + revision=None, ), mock.call( repo_id="mock-repo", filename="vocabs/dummy.zip", token=None, + revision=None, ), ] dirpath_fi = os.path.join(str(testdatadir), "projects", "dummy-fi") From 0c57bf2492b49fb6b1330cc5418afa17521b514a Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 26 Mar 2024 17:46:20 +0200 Subject: [PATCH 27/38] Preupload lfs files --- annif/cli.py | 40 ++++++++++++++++++++++++++++++------ annif/cli_util.py | 52 ++++++++++++----------------------------------- tests/test_cli.py | 39 +++++++++++++++++++---------------- 3 files changed, 69 insertions(+), 62 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index b12dfe48c..95553f858 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -18,7 +18,11 @@ import annif.project import annif.registry from annif import cli_util -from annif.exception import NotInitializedException, NotSupportedException +from annif.exception import ( + NotInitializedException, + NotSupportedException, + OperationFailedException, +) from annif.project import Access from annif.util import metric_code @@ -605,6 +609,9 @@ def run_upload(project_ids_pattern, repo_id, token, commit_message): Hub repository. An authentication token and commit message can be given with options. """ + from huggingface_hub import HfApi, preupload_lfs_files + from huggingface_hub.utils import HfHubHTTPError, HFValidationError + projects = cli_util.get_matching_projects(project_ids_pattern) click.echo(f"Uploading project(s): {', '.join([p.project_id for p in projects])}") @@ -618,11 +625,32 @@ def run_upload(project_ids_pattern, repo_id, token, commit_message): vocab_dirs = {p.vocab.datadir for p in projects} data_dirs = project_dirs.union(vocab_dirs) - for data_dir in data_dirs: - cli_util.upload_datadir(data_dir, repo_id, token, commit_message) - - for project in projects: - cli_util.upload_config(project, repo_id, token, commit_message) + fobjs, operations = [], [] + try: + for data_dir in data_dirs: + logger.debug(f"Archiving directory {data_dir}") + fobj, operation = cli_util.prepare_datadir_commit(data_dir) + logger.debug(f"Preuploading to {operation.path_in_repo}") + preupload_lfs_files(repo_id, additions=[operation]) + fobjs.append(fobj) + operations.append(operation) + for project in projects: + fobj, operation = cli_util.prepare_config_commit(project) + fobjs.append(fobj) + operations.append(operation) + + api = HfApi() + api.create_commit( + repo_id=repo_id, + operations=operations, + commit_message=commit_message, + token=token, + ) + except (HfHubHTTPError, HFValidationError) as err: + raise OperationFailedException(str(err)) + finally: + for fobj in fobjs: + fobj.close() @cli.command("download") diff --git a/annif/cli_util.py b/annif/cli_util.py index 2a59d2f85..978214483 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -28,6 +28,7 @@ if TYPE_CHECKING: from datetime import datetime + from typing import Any from click.core import Argument, Context, Option @@ -251,26 +252,22 @@ def get_matching_projects(pattern: str) -> list[AnnifProject]: ] -def upload_datadir( - data_dir: str, repo_id: str, token: str, commit_message: str -) -> None: - """ - Upload a data directory to a Hugging Face Hub repository. - """ +def prepare_datadir_commit(data_dir: str) -> tuple[io.BufferedRandom, Any]: + from huggingface_hub import CommitOperationAdd + zip_repo_path = data_dir.split(os.path.sep, 1)[1] + ".zip" - with _archive_dir(data_dir) as fobj: - _upload_to_hf_hub(fobj, zip_repo_path, repo_id, token, commit_message) + fobj = _archive_dir(data_dir) + operation = CommitOperationAdd(path_in_repo=zip_repo_path, path_or_fileobj=fobj) + return fobj, operation -def upload_config( - project: AnnifProject, repo_id: str, token: str, commit_message: str -) -> None: - """ - Upload a project configuration to a Hugging Face Hub repository. - """ +def prepare_config_commit(project: AnnifProject) -> tuple[io.BytesIO, Any]: + from huggingface_hub import CommitOperationAdd + config_repo_path = project.project_id + ".cfg" - with _get_project_config(project) as fobj: - _upload_to_hf_hub(fobj, config_repo_path, repo_id, token, commit_message) + fobj = _get_project_config(project) + operation = CommitOperationAdd(path_in_repo=config_repo_path, path_or_fileobj=fobj) + return fobj, operation def _is_train_file(fname: str) -> bool: @@ -308,29 +305,6 @@ def _get_project_config(project: AnnifProject) -> io.BytesIO: return io.BytesIO(fp.read().encode("utf8")) -def _upload_to_hf_hub( - fileobj: io.BytesIO | io.BufferedRandom, - path_in_repo: str, - repo_id: str, - token: str, - commit_message: str, -) -> None: - from huggingface_hub import HfApi - from huggingface_hub.utils import HfHubHTTPError, HFValidationError - - api = HfApi() - try: - api.upload_file( - path_or_fileobj=fileobj, - path_in_repo=path_in_repo, - repo_id=repo_id, - token=token, - commit_message=commit_message, - ) - except (HfHubHTTPError, HFValidationError) as err: - raise OperationFailedException(str(err)) - - def get_matching_project_ids_from_hf_hub( project_ids_pattern: str, repo_id: str, token, revision: str ) -> list[str]: diff --git a/tests/test_cli.py b/tests/test_cli.py index 6fc790b88..6a56f3b52 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -12,7 +12,6 @@ from datetime import datetime, timedelta, timezone from unittest import mock -import huggingface_hub from click.shell_completion import ShellComplete from click.testing import CliRunner from huggingface_hub.utils import HFValidationError @@ -1077,48 +1076,54 @@ def test_routes_with_connexion_app(): assert re.search(r"app.home\s+GET\s+\/", result) -@mock.patch("huggingface_hub.HfApi.upload_file") -def test_upload(upload_file): +@mock.patch("huggingface_hub.HfApi.preupload_lfs_files") +@mock.patch("huggingface_hub.CommitOperationAdd") +@mock.patch("huggingface_hub.HfApi.create_commit") +def test_upload(create_commit, CommitOperationAdd, preupload_lfs_files): result = runner.invoke(annif.cli.cli, ["upload", "dummy-fi", "dummy-repo"]) assert not result.exception - assert huggingface_hub.HfApi.upload_file.call_count == 3 + assert create_commit.call_count == 1 + assert CommitOperationAdd.call_count == 3 # projects, vocab, config assert ( mock.call( path_or_fileobj=mock.ANY, # io.BufferedRandom object path_in_repo="data/vocabs/dummy.zip", - repo_id="dummy-repo", - token=None, - commit_message="Upload project(s) dummy-fi with Annif", ) - in huggingface_hub.HfApi.upload_file.call_args_list + in CommitOperationAdd.call_args_list ) assert ( mock.call( path_or_fileobj=mock.ANY, # io.BufferedRandom object path_in_repo="data/projects/dummy-fi.zip", - repo_id="dummy-repo", - token=None, - commit_message="Upload project(s) dummy-fi with Annif", ) - in huggingface_hub.HfApi.upload_file.call_args_list + in CommitOperationAdd.call_args_list ) assert ( mock.call( path_or_fileobj=mock.ANY, # io.BytesIO object path_in_repo="dummy-fi.cfg", + ) + in CommitOperationAdd.call_args_list + ) + assert ( + mock.call( repo_id="dummy-repo", - token=None, + operations=mock.ANY, commit_message="Upload project(s) dummy-fi with Annif", + token=None, ) - in huggingface_hub.HfApi.upload_file.call_args_list + in create_commit.call_args_list ) -@mock.patch("huggingface_hub.HfApi.upload_file") -def test_upload_many(upload_file): +@mock.patch("huggingface_hub.HfApi.preupload_lfs_files") +@mock.patch("huggingface_hub.CommitOperationAdd") +@mock.patch("huggingface_hub.HfApi.create_commit") +def test_upload_many(create_commit, CommitOperationAdd, preupload_lfs_files): result = runner.invoke(annif.cli.cli, ["upload", "dummy-*", "dummy-repo"]) assert not result.exception - assert huggingface_hub.HfApi.upload_file.call_count == 11 + assert create_commit.call_count == 1 + assert CommitOperationAdd.call_count == 11 def test_upload_nonexistent_repo(): From df105a3aee62ecdf96f42560defefb9facbeab76 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Wed, 27 Mar 2024 11:08:43 +0200 Subject: [PATCH 28/38] Fix HF Hub caching in Dockerfile --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 444d18df9..e033f7a81 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,8 +50,9 @@ RUN annif completion --bash >> /etc/bash.bashrc # Enable tab completion RUN groupadd -g 998 annif_user && \ useradd -r -u 998 -g annif_user annif_user && \ chmod -R a+rX /Annif && \ - mkdir -p /Annif/tests/data /.cache/huggingface /Annif/projects.d && \ + mkdir -p /Annif/tests/data /Annif/projects.d && \ chown -R annif_user:annif_user /annif-projects /Annif/tests/data USER annif_user +ENV HF_HOME="/tmp" CMD annif From d14ff302dbd206da3f4163bc38148efef64f9f19 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Fri, 12 Apr 2024 15:15:11 +0300 Subject: [PATCH 29/38] Refactor to address quality complains --- annif/cli.py | 19 ++----------------- annif/cli_util.py | 28 ++++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index 95553f858..a0178bfe3 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -609,7 +609,7 @@ def run_upload(project_ids_pattern, repo_id, token, commit_message): Hub repository. An authentication token and commit message can be given with options. """ - from huggingface_hub import HfApi, preupload_lfs_files + from huggingface_hub import HfApi from huggingface_hub.utils import HfHubHTTPError, HFValidationError projects = cli_util.get_matching_projects(project_ids_pattern) @@ -621,24 +621,9 @@ def run_upload(project_ids_pattern, repo_id, token, commit_message): else f"Upload project(s) {project_ids_pattern} with Annif" ) - project_dirs = {p.datadir for p in projects} - vocab_dirs = {p.vocab.datadir for p in projects} - data_dirs = project_dirs.union(vocab_dirs) - fobjs, operations = [], [] try: - for data_dir in data_dirs: - logger.debug(f"Archiving directory {data_dir}") - fobj, operation = cli_util.prepare_datadir_commit(data_dir) - logger.debug(f"Preuploading to {operation.path_in_repo}") - preupload_lfs_files(repo_id, additions=[operation]) - fobjs.append(fobj) - operations.append(operation) - for project in projects: - fobj, operation = cli_util.prepare_config_commit(project) - fobjs.append(fobj) - operations.append(operation) - + fobjs, operations = cli_util.prepare_commits(projects, repo_id) api = HfApi() api.create_commit( repo_id=repo_id, diff --git a/annif/cli_util.py b/annif/cli_util.py index 978214483..7a67d1f8a 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -252,7 +252,31 @@ def get_matching_projects(pattern: str) -> list[AnnifProject]: ] -def prepare_datadir_commit(data_dir: str) -> tuple[io.BufferedRandom, Any]: +def prepare_commits(projects: list[AnnifProject], repo_id: str) -> tuple[list, list]: + """Prepare and pre-upload data and config commit operations for projects to a + Hugging Face Hub repository.""" + from huggingface_hub import preupload_lfs_files + + fobjs, operations = [], [] + data_dirs = {p.datadir for p in projects} + vocab_dirs = {p.vocab.datadir for p in projects} + all_dirs = data_dirs.union(vocab_dirs) + + for data_dir in all_dirs: + fobj, operation = _prepare_datadir_commit(data_dir) + preupload_lfs_files(repo_id, additions=[operation]) + fobjs.append(fobj) + operations.append(operation) + + for project in projects: + fobj, operation = _prepare_config_commit(project) + fobjs.append(fobj) + operations.append(operation) + + return fobjs, operations + + +def _prepare_datadir_commit(data_dir: str) -> tuple[io.BufferedRandom, Any]: from huggingface_hub import CommitOperationAdd zip_repo_path = data_dir.split(os.path.sep, 1)[1] + ".zip" @@ -261,7 +285,7 @@ def prepare_datadir_commit(data_dir: str) -> tuple[io.BufferedRandom, Any]: return fobj, operation -def prepare_config_commit(project: AnnifProject) -> tuple[io.BytesIO, Any]: +def _prepare_config_commit(project: AnnifProject) -> tuple[io.BytesIO, Any]: from huggingface_hub import CommitOperationAdd config_repo_path = project.project_id + ".cfg" From cc0c9893dcb39246c0ab90aa20ced2e942fc295f Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Fri, 12 Apr 2024 15:46:02 +0300 Subject: [PATCH 30/38] Again: Refactor & simplify to address quality complains --- annif/cli_util.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/annif/cli_util.py b/annif/cli_util.py index 7a67d1f8a..af037208c 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -392,14 +392,18 @@ def _unzip_member( ) -> None: dest_path = os.path.join(datadir, member.filename) if os.path.exists(dest_path) and not force: - if _are_identical_member_and_file(member, dest_path): - logger.debug(f"Skipping unzip to {dest_path}; already in place") - else: - click.echo(f"Not overwriting {dest_path} (use --force to override)") + _handle_existing_file(member, dest_path) + return + logger.debug(f"Unzipping to {dest_path}") + zfile.extract(member, path=datadir) + _restore_timestamps(member, dest_path) + + +def _handle_existing_file(member: zipfile.ZipInfo, dest_path: str) -> None: + if _are_identical_member_and_file(member, dest_path): + logger.debug(f"Skipping unzip to {dest_path}; already in place") else: - logger.debug(f"Unzipping to {dest_path}") - zfile.extract(member, path=datadir) - _restore_timestamps(member, dest_path) + click.echo(f"Not overwriting {dest_path} (use --force to override)") def _are_identical_member_and_file(member: zipfile.ZipInfo, dest_path: str) -> bool: @@ -415,8 +419,7 @@ def _restore_timestamps(member: zipfile.ZipInfo, dest_path: str) -> None: def copy_project_config(src_path: str, force: bool) -> None: """Copy a given project configuration file to projects.d/ directory.""" project_configs_dest_dir = "projects.d" - if not os.path.isdir(project_configs_dest_dir): - os.mkdir(project_configs_dest_dir) + os.makedirs(project_configs_dest_dir, exist_ok=True) dest_path = os.path.join(project_configs_dest_dir, os.path.basename(src_path)) if os.path.exists(dest_path) and not force: From 9443c8f2f376a350dd69b5bdf21b2b0c264305fb Mon Sep 17 00:00:00 2001 From: juhoinkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Fri, 19 Apr 2024 09:02:07 +0300 Subject: [PATCH 31/38] Fix typo in mocked filenames in repo --- tests/test_cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 6a56f3b52..e38fd46a2 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1167,7 +1167,7 @@ def hf_hub_download_mock_side_effect(filename, repo_id, token, revision): "dummy-fi.cfg", "projects/dummy-en.zip", "vocabs/dummy.zip", - "dummy-.cfg", + "dummy-en.cfg", ], ) @mock.patch( @@ -1225,7 +1225,7 @@ def test_download_dummy_fi( "dummy-fi.cfg", "projects/dummy-en.zip", "vocabs/dummy.zip", - "dummy-.cfg", + "dummy-en.cfg", ], ) @mock.patch( From 156bbf54e328b7f3ae18f52f816eb73aa977a72b Mon Sep 17 00:00:00 2001 From: juhoinkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Fri, 19 Apr 2024 09:04:55 +0300 Subject: [PATCH 32/38] Detect projects present in repo by .cfg files, not .zip files --- annif/cli_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/annif/cli_util.py b/annif/cli_util.py index af037208c..a01f24c13 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -336,9 +336,9 @@ def get_matching_project_ids_from_hf_hub( the given pattern.""" all_repo_file_paths = _list_files_in_hf_hub(repo_id, token, revision) return [ - path.rsplit(".zip")[0].split("projects/")[1] + path.rsplit(".cfg")[0] for path in all_repo_file_paths - if fnmatch(path, f"projects/{project_ids_pattern}.zip") + if fnmatch(path, f"{project_ids_pattern}.cfg") ] From 3f60456da75d84483d8724b65b98ea1ba83c7677 Mon Sep 17 00:00:00 2001 From: juhoinkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Fri, 19 Apr 2024 09:06:32 +0300 Subject: [PATCH 33/38] Add --revision option to upload command --- annif/cli.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/annif/cli.py b/annif/cli.py index a0178bfe3..544db8548 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -594,12 +594,17 @@ def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_fi help="""Authentication token, obtained from the Hugging Face Hub. Will default to the stored token.""", ) +@click.option( + "--revision", + help="""An optional git revision to commit from. Defaults to the head of the "main" + branch.""", +) @click.option( "--commit-message", help="""The summary / title / first line of the generated commit.""", ) @cli_util.common_options -def run_upload(project_ids_pattern, repo_id, token, commit_message): +def run_upload(project_ids_pattern, repo_id, token, revision, commit_message): """ Upload selected projects and their vocabularies to a Hugging Face Hub repository. \f @@ -629,6 +634,7 @@ def run_upload(project_ids_pattern, repo_id, token, commit_message): repo_id=repo_id, operations=operations, commit_message=commit_message, + revision=revision, token=token, ) except (HfHubHTTPError, HFValidationError) as err: From 2dd359d6f1caab93c47b2418d5690cb4c90dd5dc Mon Sep 17 00:00:00 2001 From: juhoinkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Fri, 19 Apr 2024 09:09:02 +0300 Subject: [PATCH 34/38] Enable completion of project_id argument in upload command --- annif/cli.py | 2 +- annif/cli_util.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index 544db8548..3577db215 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -587,7 +587,7 @@ def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_fi @cli.command("upload") -@click.argument("project_ids_pattern") +@click.argument("project_ids_pattern", shell_complete=cli_util.complete_param) @click.argument("repo_id") @click.option( "--token", diff --git a/annif/cli_util.py b/annif/cli_util.py index a01f24c13..c0812bd6a 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -461,7 +461,7 @@ def get_vocab_id_from_config(config_path: str) -> str: def _get_completion_choices( param: Argument, ) -> dict[str, AnnifVocabulary] | dict[str, AnnifProject] | list: - if param.name == "project_id": + if param.name in ("project_id", "project_ids_pattern"): return annif.registry.get_projects() elif param.name == "vocab_id": return annif.registry.get_vocabs() From 63076cdc9f76f42a995bd62a3108a6b6726a5ab1 Mon Sep 17 00:00:00 2001 From: juhoinkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Fri, 19 Apr 2024 09:35:12 +0300 Subject: [PATCH 35/38] Adapt test for adding revision option to upload command --- tests/test_cli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index e38fd46a2..bb2a8b53d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1111,6 +1111,7 @@ def test_upload(create_commit, CommitOperationAdd, preupload_lfs_files): operations=mock.ANY, commit_message="Upload project(s) dummy-fi with Annif", token=None, + revision=None, ) in create_commit.call_args_list ) From a0a3850e1ac2722314f10e2658ee35e875692918 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 23 Apr 2024 09:09:26 +0300 Subject: [PATCH 36/38] Move functions for HuggingFaceHub interactions to own file --- annif/cli.py | 22 ++--- annif/cli_util.py | 229 +------------------------------------------ annif/hfh_util.py | 240 ++++++++++++++++++++++++++++++++++++++++++++++ tests/test_cli.py | 21 ++-- 4 files changed, 263 insertions(+), 249 deletions(-) create mode 100644 annif/hfh_util.py diff --git a/annif/cli.py b/annif/cli.py index 3577db215..cc62a0f96 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -17,7 +17,7 @@ import annif.parallel import annif.project import annif.registry -from annif import cli_util +from annif import cli_util, hfh_util from annif.exception import ( NotInitializedException, NotSupportedException, @@ -617,7 +617,7 @@ def run_upload(project_ids_pattern, repo_id, token, revision, commit_message): from huggingface_hub import HfApi from huggingface_hub.utils import HfHubHTTPError, HFValidationError - projects = cli_util.get_matching_projects(project_ids_pattern) + projects = hfh_util.get_matching_projects(project_ids_pattern) click.echo(f"Uploading project(s): {', '.join([p.project_id for p in projects])}") commit_message = ( @@ -628,7 +628,7 @@ def run_upload(project_ids_pattern, repo_id, token, revision, commit_message): fobjs, operations = [], [] try: - fobjs, operations = cli_util.prepare_commits(projects, repo_id) + fobjs, operations = hfh_util.prepare_commits(projects, repo_id) api = HfApi() api.create_commit( repo_id=repo_id, @@ -680,28 +680,28 @@ def run_download(project_ids_pattern, repo_id, token, revision, force): be given with options. """ - project_ids = cli_util.get_matching_project_ids_from_hf_hub( + project_ids = hfh_util.get_matching_project_ids_from_hf_hub( project_ids_pattern, repo_id, token, revision ) click.echo(f"Downloading project(s): {', '.join(project_ids)}") vocab_ids = set() for project_id in project_ids: - project_zip_cache_path = cli_util.download_from_hf_hub( + project_zip_cache_path = hfh_util.download_from_hf_hub( f"projects/{project_id}.zip", repo_id, token, revision ) - cli_util.unzip_archive(project_zip_cache_path, force) - config_file_cache_path = cli_util.download_from_hf_hub( + hfh_util.unzip_archive(project_zip_cache_path, force) + config_file_cache_path = hfh_util.download_from_hf_hub( f"{project_id}.cfg", repo_id, token, revision ) - vocab_ids.add(cli_util.get_vocab_id_from_config(config_file_cache_path)) - cli_util.copy_project_config(config_file_cache_path, force) + vocab_ids.add(hfh_util.get_vocab_id_from_config(config_file_cache_path)) + hfh_util.copy_project_config(config_file_cache_path, force) for vocab_id in vocab_ids: - vocab_zip_cache_path = cli_util.download_from_hf_hub( + vocab_zip_cache_path = hfh_util.download_from_hf_hub( f"vocabs/{vocab_id}.zip", repo_id, token, revision ) - cli_util.unzip_archive(vocab_zip_cache_path, force) + hfh_util.unzip_archive(vocab_zip_cache_path, force) @cli.command("completion") diff --git a/annif/cli_util.py b/annif/cli_util.py index c0812bd6a..c47196169 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -2,20 +2,11 @@ from __future__ import annotations -import binascii import collections -import configparser -import importlib import io import itertools import os -import pathlib -import shutil import sys -import tempfile -import time -import zipfile -from fnmatch import fnmatch from typing import TYPE_CHECKING import click @@ -23,12 +14,11 @@ from flask import current_app import annif -from annif.exception import ConfigurationException, OperationFailedException +from annif.exception import ConfigurationException from annif.project import Access if TYPE_CHECKING: from datetime import datetime - from typing import Any from click.core import Argument, Context, Option @@ -241,223 +231,6 @@ def generate_filter_params(filter_batch_max_limit: int) -> list[tuple[int, float return list(itertools.product(limits, thresholds)) -def get_matching_projects(pattern: str) -> list[AnnifProject]: - """ - Get projects that match the given pattern. - """ - return [ - proj - for proj in annif.registry.get_projects(min_access=Access.private).values() - if fnmatch(proj.project_id, pattern) - ] - - -def prepare_commits(projects: list[AnnifProject], repo_id: str) -> tuple[list, list]: - """Prepare and pre-upload data and config commit operations for projects to a - Hugging Face Hub repository.""" - from huggingface_hub import preupload_lfs_files - - fobjs, operations = [], [] - data_dirs = {p.datadir for p in projects} - vocab_dirs = {p.vocab.datadir for p in projects} - all_dirs = data_dirs.union(vocab_dirs) - - for data_dir in all_dirs: - fobj, operation = _prepare_datadir_commit(data_dir) - preupload_lfs_files(repo_id, additions=[operation]) - fobjs.append(fobj) - operations.append(operation) - - for project in projects: - fobj, operation = _prepare_config_commit(project) - fobjs.append(fobj) - operations.append(operation) - - return fobjs, operations - - -def _prepare_datadir_commit(data_dir: str) -> tuple[io.BufferedRandom, Any]: - from huggingface_hub import CommitOperationAdd - - zip_repo_path = data_dir.split(os.path.sep, 1)[1] + ".zip" - fobj = _archive_dir(data_dir) - operation = CommitOperationAdd(path_in_repo=zip_repo_path, path_or_fileobj=fobj) - return fobj, operation - - -def _prepare_config_commit(project: AnnifProject) -> tuple[io.BytesIO, Any]: - from huggingface_hub import CommitOperationAdd - - config_repo_path = project.project_id + ".cfg" - fobj = _get_project_config(project) - operation = CommitOperationAdd(path_in_repo=config_repo_path, path_or_fileobj=fobj) - return fobj, operation - - -def _is_train_file(fname: str) -> bool: - train_file_patterns = ("-train", "tmp-") - for pat in train_file_patterns: - if pat in fname: - return True - return False - - -def _archive_dir(data_dir: str) -> io.BufferedRandom: - fp = tempfile.TemporaryFile() - path = pathlib.Path(data_dir) - fpaths = [fpath for fpath in path.glob("**/*") if not _is_train_file(fpath.name)] - with zipfile.ZipFile(fp, mode="w") as zfile: - zfile.comment = bytes( - f"Archived by Annif {importlib.metadata.version('annif')}", - encoding="utf-8", - ) - for fpath in fpaths: - logger.debug(f"Adding {fpath}") - arcname = os.path.join(*fpath.parts[1:]) - zfile.write(fpath, arcname=arcname) - fp.seek(0) - return fp - - -def _get_project_config(project: AnnifProject) -> io.BytesIO: - fp = tempfile.TemporaryFile(mode="w+t") - config = configparser.ConfigParser() - config[project.project_id] = project.config - config.write(fp) # This needs tempfile in text mode - fp.seek(0) - # But for upload fobj needs to be in binary mode - return io.BytesIO(fp.read().encode("utf8")) - - -def get_matching_project_ids_from_hf_hub( - project_ids_pattern: str, repo_id: str, token, revision: str -) -> list[str]: - """Get project IDs of the projects in a Hugging Face Model Hub repository that match - the given pattern.""" - all_repo_file_paths = _list_files_in_hf_hub(repo_id, token, revision) - return [ - path.rsplit(".cfg")[0] - for path in all_repo_file_paths - if fnmatch(path, f"{project_ids_pattern}.cfg") - ] - - -def _list_files_in_hf_hub(repo_id: str, token: str, revision: str) -> list[str]: - from huggingface_hub import list_repo_files - from huggingface_hub.utils import HfHubHTTPError, HFValidationError - - try: - return [ - repofile - for repofile in list_repo_files( - repo_id=repo_id, token=token, revision=revision - ) - ] - except (HfHubHTTPError, HFValidationError) as err: - raise OperationFailedException(str(err)) - - -def download_from_hf_hub( - filename: str, repo_id: str, token: str, revision: str -) -> list[str]: - from huggingface_hub import hf_hub_download - from huggingface_hub.utils import HfHubHTTPError, HFValidationError - - try: - return hf_hub_download( - repo_id=repo_id, - filename=filename, - token=token, - revision=revision, - ) - except (HfHubHTTPError, HFValidationError) as err: - raise OperationFailedException(str(err)) - - -def unzip_archive(src_path: str, force: bool) -> None: - """Unzip a zip archive of projects and vocabularies to a directory, by - default data/ under current directory.""" - datadir = current_app.config["DATADIR"] - with zipfile.ZipFile(src_path, "r") as zfile: - archive_comment = str(zfile.comment, encoding="utf-8") - logger.debug( - f'Extracting archive {src_path}; archive comment: "{archive_comment}"' - ) - for member in zfile.infolist(): - _unzip_member(zfile, member, datadir, force) - - -def _unzip_member( - zfile: zipfile.ZipFile, member: zipfile.ZipInfo, datadir: str, force: bool -) -> None: - dest_path = os.path.join(datadir, member.filename) - if os.path.exists(dest_path) and not force: - _handle_existing_file(member, dest_path) - return - logger.debug(f"Unzipping to {dest_path}") - zfile.extract(member, path=datadir) - _restore_timestamps(member, dest_path) - - -def _handle_existing_file(member: zipfile.ZipInfo, dest_path: str) -> None: - if _are_identical_member_and_file(member, dest_path): - logger.debug(f"Skipping unzip to {dest_path}; already in place") - else: - click.echo(f"Not overwriting {dest_path} (use --force to override)") - - -def _are_identical_member_and_file(member: zipfile.ZipInfo, dest_path: str) -> bool: - path_crc = _compute_crc32(dest_path) - return path_crc == member.CRC - - -def _restore_timestamps(member: zipfile.ZipInfo, dest_path: str) -> None: - date_time = time.mktime(member.date_time + (0, 0, -1)) - os.utime(dest_path, (date_time, date_time)) - - -def copy_project_config(src_path: str, force: bool) -> None: - """Copy a given project configuration file to projects.d/ directory.""" - project_configs_dest_dir = "projects.d" - os.makedirs(project_configs_dest_dir, exist_ok=True) - - dest_path = os.path.join(project_configs_dest_dir, os.path.basename(src_path)) - if os.path.exists(dest_path) and not force: - if _are_identical_files(src_path, dest_path): - logger.debug(f"Skipping copy to {dest_path}; already in place") - else: - click.echo(f"Not overwriting {dest_path} (use --force to override)") - else: - logger.debug(f"Copying to {dest_path}") - shutil.copy(src_path, dest_path) - - -def _are_identical_files(src_path: str, dest_path: str) -> bool: - src_crc32 = _compute_crc32(src_path) - dest_crc32 = _compute_crc32(dest_path) - return src_crc32 == dest_crc32 - - -def _compute_crc32(path: str) -> int: - if os.path.isdir(path): - return 0 - - size = 1024 * 1024 * 10 # 10 MiB chunks - with open(path, "rb") as fp: - crcval = 0 - while chunk := fp.read(size): - crcval = binascii.crc32(chunk, crcval) - return crcval - - -def get_vocab_id_from_config(config_path: str) -> str: - """Get the vocabulary ID from a configuration file.""" - config = configparser.ConfigParser() - config.read(config_path) - section = config.sections()[0] - return config[section]["vocab"] - - def _get_completion_choices( param: Argument, ) -> dict[str, AnnifVocabulary] | dict[str, AnnifProject] | list: diff --git a/annif/hfh_util.py b/annif/hfh_util.py new file mode 100644 index 000000000..045e4710f --- /dev/null +++ b/annif/hfh_util.py @@ -0,0 +1,240 @@ +"""Utility functions for interactions with Hugging Face Hub.""" + +import binascii +import configparser +import importlib +import io +import os +import pathlib +import shutil +import tempfile +import time +import zipfile +from fnmatch import fnmatch +from typing import Any + +import click +from flask import current_app + +import annif +from annif.exception import OperationFailedException +from annif.project import Access, AnnifProject + +logger = annif.logger + + +def get_matching_projects(pattern: str) -> list[AnnifProject]: + """ + Get projects that match the given pattern. + """ + return [ + proj + for proj in annif.registry.get_projects(min_access=Access.private).values() + if fnmatch(proj.project_id, pattern) + ] + + +def prepare_commits(projects: list[AnnifProject], repo_id: str) -> tuple[list, list]: + """Prepare and pre-upload data and config commit operations for projects to a + Hugging Face Hub repository.""" + from huggingface_hub import preupload_lfs_files + + fobjs, operations = [], [] + data_dirs = {p.datadir for p in projects} + vocab_dirs = {p.vocab.datadir for p in projects} + all_dirs = data_dirs.union(vocab_dirs) + + for data_dir in all_dirs: + fobj, operation = _prepare_datadir_commit(data_dir) + preupload_lfs_files(repo_id, additions=[operation]) + fobjs.append(fobj) + operations.append(operation) + + for project in projects: + fobj, operation = _prepare_config_commit(project) + fobjs.append(fobj) + operations.append(operation) + + return fobjs, operations + + +def _prepare_datadir_commit(data_dir: str) -> tuple[io.BufferedRandom, Any]: + from huggingface_hub import CommitOperationAdd + + zip_repo_path = data_dir.split(os.path.sep, 1)[1] + ".zip" + fobj = _archive_dir(data_dir) + operation = CommitOperationAdd(path_in_repo=zip_repo_path, path_or_fileobj=fobj) + return fobj, operation + + +def _prepare_config_commit(project: AnnifProject) -> tuple[io.BytesIO, Any]: + from huggingface_hub import CommitOperationAdd + + config_repo_path = project.project_id + ".cfg" + fobj = _get_project_config(project) + operation = CommitOperationAdd(path_in_repo=config_repo_path, path_or_fileobj=fobj) + return fobj, operation + + +def _is_train_file(fname: str) -> bool: + train_file_patterns = ("-train", "tmp-") + for pat in train_file_patterns: + if pat in fname: + return True + return False + + +def _archive_dir(data_dir: str) -> io.BufferedRandom: + fp = tempfile.TemporaryFile() + path = pathlib.Path(data_dir) + fpaths = [fpath for fpath in path.glob("**/*") if not _is_train_file(fpath.name)] + with zipfile.ZipFile(fp, mode="w") as zfile: + zfile.comment = bytes( + f"Archived by Annif {importlib.metadata.version('annif')}", + encoding="utf-8", + ) + for fpath in fpaths: + logger.debug(f"Adding {fpath}") + arcname = os.path.join(*fpath.parts[1:]) + zfile.write(fpath, arcname=arcname) + fp.seek(0) + return fp + + +def _get_project_config(project: AnnifProject) -> io.BytesIO: + fp = tempfile.TemporaryFile(mode="w+t") + config = configparser.ConfigParser() + config[project.project_id] = project.config + config.write(fp) # This needs tempfile in text mode + fp.seek(0) + # But for upload fobj needs to be in binary mode + return io.BytesIO(fp.read().encode("utf8")) + + +def get_matching_project_ids_from_hf_hub( + project_ids_pattern: str, repo_id: str, token, revision: str +) -> list[str]: + """Get project IDs of the projects in a Hugging Face Model Hub repository that match + the given pattern.""" + all_repo_file_paths = _list_files_in_hf_hub(repo_id, token, revision) + return [ + path.rsplit(".cfg")[0] + for path in all_repo_file_paths + if fnmatch(path, f"{project_ids_pattern}.cfg") + ] + + +def _list_files_in_hf_hub(repo_id: str, token: str, revision: str) -> list[str]: + from huggingface_hub import list_repo_files + from huggingface_hub.utils import HfHubHTTPError, HFValidationError + + try: + return [ + repofile + for repofile in list_repo_files( + repo_id=repo_id, token=token, revision=revision + ) + ] + except (HfHubHTTPError, HFValidationError) as err: + raise OperationFailedException(str(err)) + + +def download_from_hf_hub( + filename: str, repo_id: str, token: str, revision: str +) -> list[str]: + from huggingface_hub import hf_hub_download + from huggingface_hub.utils import HfHubHTTPError, HFValidationError + + try: + return hf_hub_download( + repo_id=repo_id, + filename=filename, + token=token, + revision=revision, + ) + except (HfHubHTTPError, HFValidationError) as err: + raise OperationFailedException(str(err)) + + +def unzip_archive(src_path: str, force: bool) -> None: + """Unzip a zip archive of projects and vocabularies to a directory, by + default data/ under current directory.""" + datadir = current_app.config["DATADIR"] + with zipfile.ZipFile(src_path, "r") as zfile: + archive_comment = str(zfile.comment, encoding="utf-8") + logger.debug( + f'Extracting archive {src_path}; archive comment: "{archive_comment}"' + ) + for member in zfile.infolist(): + _unzip_member(zfile, member, datadir, force) + + +def _unzip_member( + zfile: zipfile.ZipFile, member: zipfile.ZipInfo, datadir: str, force: bool +) -> None: + dest_path = os.path.join(datadir, member.filename) + if os.path.exists(dest_path) and not force: + _handle_existing_file(member, dest_path) + return + logger.debug(f"Unzipping to {dest_path}") + zfile.extract(member, path=datadir) + _restore_timestamps(member, dest_path) + + +def _handle_existing_file(member: zipfile.ZipInfo, dest_path: str) -> None: + if _are_identical_member_and_file(member, dest_path): + logger.debug(f"Skipping unzip to {dest_path}; already in place") + else: + click.echo(f"Not overwriting {dest_path} (use --force to override)") + + +def _are_identical_member_and_file(member: zipfile.ZipInfo, dest_path: str) -> bool: + path_crc = _compute_crc32(dest_path) + return path_crc == member.CRC + + +def _restore_timestamps(member: zipfile.ZipInfo, dest_path: str) -> None: + date_time = time.mktime(member.date_time + (0, 0, -1)) + os.utime(dest_path, (date_time, date_time)) + + +def copy_project_config(src_path: str, force: bool) -> None: + """Copy a given project configuration file to projects.d/ directory.""" + project_configs_dest_dir = "projects.d" + os.makedirs(project_configs_dest_dir, exist_ok=True) + + dest_path = os.path.join(project_configs_dest_dir, os.path.basename(src_path)) + if os.path.exists(dest_path) and not force: + if _are_identical_files(src_path, dest_path): + logger.debug(f"Skipping copy to {dest_path}; already in place") + else: + click.echo(f"Not overwriting {dest_path} (use --force to override)") + else: + logger.debug(f"Copying to {dest_path}") + shutil.copy(src_path, dest_path) + + +def _are_identical_files(src_path: str, dest_path: str) -> bool: + src_crc32 = _compute_crc32(src_path) + dest_crc32 = _compute_crc32(dest_path) + return src_crc32 == dest_crc32 + + +def _compute_crc32(path: str) -> int: + if os.path.isdir(path): + return 0 + + size = 1024 * 1024 * 10 # 10 MiB chunks + with open(path, "rb") as fp: + crcval = 0 + while chunk := fp.read(size): + crcval = binascii.crc32(chunk, crcval) + return crcval + + +def get_vocab_id_from_config(config_path: str) -> str: + """Get the vocabulary ID from a configuration file.""" + config = configparser.ConfigParser() + config.read(config_path) + section = config.sections()[0] + return config[section]["vocab"] diff --git a/tests/test_cli.py b/tests/test_cli.py index bb2a8b53d..c9d904ff2 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -18,6 +18,7 @@ import annif.cli import annif.cli_util +import annif.hfh_util import annif.parallel runner = CliRunner(env={"ANNIF_CONFIG": "annif.default_config.TestingConfig"}) @@ -1140,7 +1141,7 @@ def test_archive_dir(testdatadir): open(os.path.join(str(dirpath), "foo.txt"), "a").close() open(os.path.join(str(dirpath), "-train.txt"), "a").close() - fobj = annif.cli_util._archive_dir(dirpath) + fobj = annif.hfh_util._archive_dir(dirpath) assert isinstance(fobj, io.BufferedRandom) with zipfile.ZipFile(fobj, mode="r") as zfile: @@ -1150,7 +1151,7 @@ def test_archive_dir(testdatadir): def test_get_project_config(app_project): - result = annif.cli_util._get_project_config(app_project) + result = annif.hfh_util._get_project_config(app_project) assert isinstance(result, io.BytesIO) string_result = result.read().decode("UTF-8") assert "[dummy-en]" in string_result @@ -1175,7 +1176,7 @@ def hf_hub_download_mock_side_effect(filename, repo_id, token, revision): "huggingface_hub.hf_hub_download", side_effect=hf_hub_download_mock_side_effect, ) -@mock.patch("annif.cli_util.copy_project_config") +@mock.patch("annif.hfh_util.copy_project_config") def test_download_dummy_fi( copy_project_config, hf_hub_download, list_repo_files, testdatadir ): @@ -1233,7 +1234,7 @@ def test_download_dummy_fi( "huggingface_hub.hf_hub_download", side_effect=hf_hub_download_mock_side_effect, ) -@mock.patch("annif.cli_util.copy_project_config") +@mock.patch("annif.hfh_util.copy_project_config") def test_download_dummy_fi_and_en( copy_project_config, hf_hub_download, list_repo_files, testdatadir ): @@ -1352,7 +1353,7 @@ def test_download_hf_hub_download_failed( def test_unzip_archive_initial(testdatadir): dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") fpath = os.path.join(str(dirpath), "file.txt") - annif.cli_util.unzip_archive( + annif.hfh_util.unzip_archive( os.path.join("tests", "huggingface-cache", "projects", "dummy-fi.zip"), force=False, ) @@ -1371,7 +1372,7 @@ def test_unzip_archive_no_overwrite(testdatadir): with open(fpath, "wt") as pf: print("Existing content", file=pf) - annif.cli_util.unzip_archive( + annif.hfh_util.unzip_archive( os.path.join("tests", "huggingface-cache", "projects", "dummy-fi.zip"), force=False, ) @@ -1387,7 +1388,7 @@ def test_unzip_archive_overwrite(testdatadir): with open(fpath, "wt") as pf: print("Existing content", file=pf) - annif.cli_util.unzip_archive( + annif.hfh_util.unzip_archive( os.path.join("tests", "huggingface-cache", "projects", "dummy-fi.zip"), force=True, ) @@ -1400,10 +1401,10 @@ def test_unzip_archive_overwrite(testdatadir): @mock.patch("os.path.exists", return_value=True) -@mock.patch("annif.cli_util._compute_crc32", return_value=0) +@mock.patch("annif.hfh_util._compute_crc32", return_value=0) @mock.patch("shutil.copy") def test_copy_project_config_no_overwrite(copy, _compute_crc32, exists): - annif.cli_util.copy_project_config( + annif.hfh_util.copy_project_config( os.path.join("tests", "huggingface-cache", "dummy-fi.cfg"), force=False ) assert not copy.called @@ -1412,7 +1413,7 @@ def test_copy_project_config_no_overwrite(copy, _compute_crc32, exists): @mock.patch("os.path.exists", return_value=True) @mock.patch("shutil.copy") def test_copy_project_config_overwrite(copy, exists): - annif.cli_util.copy_project_config( + annif.hfh_util.copy_project_config( os.path.join("tests", "huggingface-cache", "dummy-fi.cfg"), force=True ) assert copy.called From 638aa07d997e6f6ed42682ad86b7096105695349 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 23 Apr 2024 09:41:30 +0300 Subject: [PATCH 37/38] Move unit tests for HuggingFaceHub util fns to own file --- tests/test_cli.py | 98 +-------------------------------------- tests/test_hfh_util.py | 103 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 97 deletions(-) create mode 100644 tests/test_hfh_util.py diff --git a/tests/test_cli.py b/tests/test_cli.py index c9d904ff2..46a9fa0ad 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2,14 +2,12 @@ import contextlib import importlib -import io import json import os.path import random import re import shutil -import zipfile -from datetime import datetime, timedelta, timezone +from datetime import datetime, timedelta from unittest import mock from click.shell_completion import ShellComplete @@ -1135,28 +1133,6 @@ def test_upload_nonexistent_repo(): assert "Repository Not Found for url:" in failed_result.output -def test_archive_dir(testdatadir): - dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") - os.makedirs(dirpath, exist_ok=True) - open(os.path.join(str(dirpath), "foo.txt"), "a").close() - open(os.path.join(str(dirpath), "-train.txt"), "a").close() - - fobj = annif.hfh_util._archive_dir(dirpath) - assert isinstance(fobj, io.BufferedRandom) - - with zipfile.ZipFile(fobj, mode="r") as zfile: - archived_files = zfile.namelist() - assert len(archived_files) == 1 - assert os.path.split(archived_files[0])[1] == "foo.txt" - - -def test_get_project_config(app_project): - result = annif.hfh_util._get_project_config(app_project) - assert isinstance(result, io.BytesIO) - string_result = result.read().decode("UTF-8") - assert "[dummy-en]" in string_result - - def hf_hub_download_mock_side_effect(filename, repo_id, token, revision): return "tests/huggingface-cache/" + filename # Mocks the downloaded file paths @@ -1350,78 +1326,6 @@ def test_download_hf_hub_download_failed( assert hf_hub_download.called -def test_unzip_archive_initial(testdatadir): - dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") - fpath = os.path.join(str(dirpath), "file.txt") - annif.hfh_util.unzip_archive( - os.path.join("tests", "huggingface-cache", "projects", "dummy-fi.zip"), - force=False, - ) - assert os.path.exists(fpath) - assert os.path.getsize(fpath) == 0 # Zero content from zip - ts = os.path.getmtime(fpath) - assert datetime.fromtimestamp(ts).astimezone(tz=timezone.utc) == datetime( - 1980, 1, 1, 0, 0 - ).astimezone(tz=timezone.utc) - - -def test_unzip_archive_no_overwrite(testdatadir): - dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") - fpath = os.path.join(str(dirpath), "file.txt") - os.makedirs(dirpath, exist_ok=True) - with open(fpath, "wt") as pf: - print("Existing content", file=pf) - - annif.hfh_util.unzip_archive( - os.path.join("tests", "huggingface-cache", "projects", "dummy-fi.zip"), - force=False, - ) - assert os.path.exists(fpath) - assert os.path.getsize(fpath) == 17 # Existing content - assert datetime.now().timestamp() - os.path.getmtime(fpath) < 1 - - -def test_unzip_archive_overwrite(testdatadir): - dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") - fpath = os.path.join(str(dirpath), "file.txt") - os.makedirs(dirpath, exist_ok=True) - with open(fpath, "wt") as pf: - print("Existing content", file=pf) - - annif.hfh_util.unzip_archive( - os.path.join("tests", "huggingface-cache", "projects", "dummy-fi.zip"), - force=True, - ) - assert os.path.exists(fpath) - assert os.path.getsize(fpath) == 0 # Zero content from zip - ts = os.path.getmtime(fpath) - assert datetime.fromtimestamp(ts).astimezone(tz=timezone.utc) == datetime( - 1980, 1, 1, 0, 0 - ).astimezone(tz=timezone.utc) - - -@mock.patch("os.path.exists", return_value=True) -@mock.patch("annif.hfh_util._compute_crc32", return_value=0) -@mock.patch("shutil.copy") -def test_copy_project_config_no_overwrite(copy, _compute_crc32, exists): - annif.hfh_util.copy_project_config( - os.path.join("tests", "huggingface-cache", "dummy-fi.cfg"), force=False - ) - assert not copy.called - - -@mock.patch("os.path.exists", return_value=True) -@mock.patch("shutil.copy") -def test_copy_project_config_overwrite(copy, exists): - annif.hfh_util.copy_project_config( - os.path.join("tests", "huggingface-cache", "dummy-fi.cfg"), force=True - ) - assert copy.called - assert copy.call_args == mock.call( - "tests/huggingface-cache/dummy-fi.cfg", "projects.d/dummy-fi.cfg" - ) - - def test_completion_script_generation(): result = runner.invoke(annif.cli.cli, ["completion", "--bash"]) assert not result.exception diff --git a/tests/test_hfh_util.py b/tests/test_hfh_util.py new file mode 100644 index 000000000..ce3d6aac9 --- /dev/null +++ b/tests/test_hfh_util.py @@ -0,0 +1,103 @@ +"""Unit test module for Hugging Face Hub utilities.""" + +import io +import os.path +import zipfile +from datetime import datetime, timezone +from unittest import mock + +import annif.hfh_util + + +def test_archive_dir(testdatadir): + dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") + os.makedirs(dirpath, exist_ok=True) + open(os.path.join(str(dirpath), "foo.txt"), "a").close() + open(os.path.join(str(dirpath), "-train.txt"), "a").close() + + fobj = annif.hfh_util._archive_dir(dirpath) + assert isinstance(fobj, io.BufferedRandom) + + with zipfile.ZipFile(fobj, mode="r") as zfile: + archived_files = zfile.namelist() + assert len(archived_files) == 1 + assert os.path.split(archived_files[0])[1] == "foo.txt" + + +def test_get_project_config(app_project): + result = annif.hfh_util._get_project_config(app_project) + assert isinstance(result, io.BytesIO) + string_result = result.read().decode("UTF-8") + assert "[dummy-en]" in string_result + + +def test_unzip_archive_initial(testdatadir): + dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") + fpath = os.path.join(str(dirpath), "file.txt") + annif.hfh_util.unzip_archive( + os.path.join("tests", "huggingface-cache", "projects", "dummy-fi.zip"), + force=False, + ) + assert os.path.exists(fpath) + assert os.path.getsize(fpath) == 0 # Zero content from zip + ts = os.path.getmtime(fpath) + assert datetime.fromtimestamp(ts).astimezone(tz=timezone.utc) == datetime( + 1980, 1, 1, 0, 0 + ).astimezone(tz=timezone.utc) + + +def test_unzip_archive_no_overwrite(testdatadir): + dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") + fpath = os.path.join(str(dirpath), "file.txt") + os.makedirs(dirpath, exist_ok=True) + with open(fpath, "wt") as pf: + print("Existing content", file=pf) + + annif.hfh_util.unzip_archive( + os.path.join("tests", "huggingface-cache", "projects", "dummy-fi.zip"), + force=False, + ) + assert os.path.exists(fpath) + assert os.path.getsize(fpath) == 17 # Existing content + assert datetime.now().timestamp() - os.path.getmtime(fpath) < 1 + + +def test_unzip_archive_overwrite(testdatadir): + dirpath = os.path.join(str(testdatadir), "projects", "dummy-fi") + fpath = os.path.join(str(dirpath), "file.txt") + os.makedirs(dirpath, exist_ok=True) + with open(fpath, "wt") as pf: + print("Existing content", file=pf) + + annif.hfh_util.unzip_archive( + os.path.join("tests", "huggingface-cache", "projects", "dummy-fi.zip"), + force=True, + ) + assert os.path.exists(fpath) + assert os.path.getsize(fpath) == 0 # Zero content from zip + ts = os.path.getmtime(fpath) + assert datetime.fromtimestamp(ts).astimezone(tz=timezone.utc) == datetime( + 1980, 1, 1, 0, 0 + ).astimezone(tz=timezone.utc) + + +@mock.patch("os.path.exists", return_value=True) +@mock.patch("annif.hfh_util._compute_crc32", return_value=0) +@mock.patch("shutil.copy") +def test_copy_project_config_no_overwrite(copy, _compute_crc32, exists): + annif.hfh_util.copy_project_config( + os.path.join("tests", "huggingface-cache", "dummy-fi.cfg"), force=False + ) + assert not copy.called + + +@mock.patch("os.path.exists", return_value=True) +@mock.patch("shutil.copy") +def test_copy_project_config_overwrite(copy, exists): + annif.hfh_util.copy_project_config( + os.path.join("tests", "huggingface-cache", "dummy-fi.cfg"), force=True + ) + assert copy.called + assert copy.call_args == mock.call( + "tests/huggingface-cache/dummy-fi.cfg", "projects.d/dummy-fi.cfg" + ) From 6f35fff9a4b1d55b521c27e50ceb541549bf2ef5 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 23 Apr 2024 09:59:15 +0300 Subject: [PATCH 38/38] Make io import conditional to TYPE_CHECKING --- annif/cli_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/cli_util.py b/annif/cli_util.py index c47196169..2a64582f2 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -3,7 +3,6 @@ from __future__ import annotations import collections -import io import itertools import os import sys @@ -18,6 +17,7 @@ from annif.project import Access if TYPE_CHECKING: + import io from datetime import datetime from click.core import Argument, Context, Option