Added loader from HF spaces #860

Merged · 4 commits · May 24, 2024
src/unitxt/loaders.py (107 additions, 0 deletions)
@@ -501,3 +501,110 @@ class LoadFromDictionary(Loader):

def process(self) -> MultiStream:
return MultiStream.from_iterables(self.data)


class LoadFromHFSpace(LoadHF):
"""Used to load data from Huggingface spaces.

    The loader first tries to download all files specified in the 'data_files' parameter
    from the given space and then reads them as a Huggingface dataset.

Attributes:
        space_name (str): Name of the Huggingface space to be accessed.
        data_files (str | Sequence[str] | Mapping[str, str | Sequence[str]]): Relative
            paths to files within the given repository. If given as a mapping, the values
            should be the paths, while the keys should indicate the type of the respective
            files (training, testing etc.).
        path (str, optional): Absolute path to a directory where the data should be downloaded.
        revision (str, optional): ID of the Git branch or commit to use. Defaults to None,
            in which case data is downloaded from the main branch of the accessed
            repository.
        use_token (bool, optional): Whether the token used for authentication when accessing
            the Huggingface space (if necessary) should be read from the Huggingface
            config folder.
        token_env (str, optional): Name of an environment variable whose value will be used
            for authentication when accessing the Huggingface space, if necessary.

Examples:
loader = LoadFromHFSpace(
space_name="lmsys/mt-bench",
data_files={
"train": [
"data/mt_bench/model_answer/gpt-3.5-turbo.jsonl",
"data/mt_bench/model_answer/gpt-4.jsonl",
],
"test": "data/mt_bench/model_answer/tulu-30b.jsonl",
},
)
multi_stream = loader.process()
"""

space_name: str
data_files: Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
path: Optional[str] = None
revision: Optional[str] = None
use_token: Optional[bool] = None
token_env: Optional[str] = None
requirements_list: List[str] = ["huggingface_hub"]

def _get_token(self) -> Optional[Union[bool, str]]:
if self.token_env:
token = os.getenv(self.token_env)
if not token:
get_logger().warning(
f"The 'token_env' parameter was specified as '{self.token_env}', "
f"however, no environment variable under such a name was found. "
f"Therefore, the loader will not use any tokens for authentication."
)
return token
return self.use_token

def _download_file_from_space(self, filename: str) -> str:
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError

token = self._get_token()

try:
file_path = hf_hub_download(
repo_id=self.space_name,
filename=filename,
repo_type="space",
token=token,
revision=self.revision,
local_dir=self.path,
)
except EntryNotFoundError as e:
raise ValueError(
f"The file '{filename}' was not found in the space '{self.space_name}'. "
f"Please check if the filename is correct, or if it exists in that "
f"Huggingface space."
) from e
except RepositoryNotFoundError as e:
raise ValueError(
f"The Huggingface space '{self.space_name}' was not found. "
f"Please check if the name is correct and you have access to the space."
) from e

return file_path

def _download_data(self) -> str:
if isinstance(self.data_files, str):
data_files = [self.data_files]
elif isinstance(self.data_files, Mapping):
data_files = list(self.data_files.values())
else:
data_files = self.data_files

for files in data_files:
if isinstance(files, str):
files = [files]
# All files - within the same space - are downloaded into the same base directory:
paths = [self._download_file_from_space(file) for file in files]
dir_path = paths[0].replace(files[0], "")

return dir_path

def process(self):
self.path = self._download_data()
return super().process()
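
A minimal usage sketch (not part of the diff), assuming a space that requires authentication: the token is read from an environment variable via the new 'token_env' attribute. The variable name MY_HF_TOKEN and the placeholder token value are illustrative; space_name, data_files, and token_env are the attributes defined above.

import os

from unitxt.loaders import LoadFromHFSpace

# Hypothetical environment variable holding a Huggingface access token.
os.environ["MY_HF_TOKEN"] = "hf_..."  # placeholder value

loader = LoadFromHFSpace(
    space_name="lmsys/mt-bench",
    data_files={"test": "data/mt_bench/model_answer/tulu-30b.jsonl"},
    token_env="MY_HF_TOKEN",  # _get_token() reads this variable before downloading
)
multi_stream = loader.process()
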
tests/library/test_loaders.py (30 additions, 0 deletions)
@@ -8,6 +8,7 @@
from unitxt.loaders import (
LoadCSV,
LoadFromDictionary,
LoadFromHFSpace,
LoadFromIBMCloud,
LoadHF,
MultipleSourceLoader,
@@ -248,3 +249,32 @@ def test_load_from_dictionary(self):
for split, instances in data.items():
for original_instance, stream_instance in zip(instances, streams[split]):
self.assertEqual(original_instance, stream_instance)

def test_load_from_hf_space(self):
params = {
"space_name": "lmsys/mt-bench",
"data_files": {
"train": [
"data/mt_bench/model_answer/koala-13b.jsonl",
"data/mt_bench/model_answer/llama-13b.jsonl",
],
"test": "data/mt_bench/model_answer/wizardlm-13b.jsonl",
},
}

expected_sample = {
"question_id": 81,
"model_id": "wizardlm-13b",
"answer_id": "DKHvKJgtzsvHN2ZJ8a3o5C",
"tstamp": 1686788249.913451,
}
loader = LoadFromHFSpace(**params)
ms = loader.process().to_dataset()
actual_sample = ms["test"][0]
actual_sample.pop("choices")
self.assertEqual(expected_sample, actual_sample)

params["loader_limit"] = 10
loader = LoadFromHFSpace(**params)
ms = loader.process().to_dataset()
assert ms.shape["train"] == (10, 5) and ms.shape["test"] == (10, 5)
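
The base-directory derivation in _download_data assumes that hf_hub_download returns a local path ending with the requested relative filename; stripping that suffix leaves the download root shared by all files, which process() then passes to LoadHF via self.path. A standalone sketch of that string manipulation, using a hypothetical local path:

# Hypothetical local path returned by hf_hub_download for one requested file.
local_path = "/tmp/hf_space/data/mt_bench/model_answer/gpt-4.jsonl"
relative_file = "data/mt_bench/model_answer/gpt-4.jsonl"

# Same operation as in _download_data: removing the relative part yields the
# base directory shared by all files downloaded from the space.
base_dir = local_path.replace(relative_file, "")
print(base_dir)  # /tmp/hf_space/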