Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Swap name and relative_path as the mandatory entry #123

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 28 additions & 17 deletions src/dataregistry/registrar/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@
_bump_version,
_copy_data,
_form_dataset_path,
_name_from_relpath,
_relpath_from_name,
_parse_version_string,
_read_configuration_file,
get_directory_info,
_name_from_relpath
)
from .dataset_util import set_dataset_status, get_dataset_status

Expand All @@ -29,10 +30,9 @@ def __init__(self, db_connection, root_dir, owner, owner_type, execution_table):

def register(
self,
relative_path,
name,
version,
version_suffix=None,
name=None,
creation_date=None,
description=None,
execution_id=None,
Expand All @@ -55,6 +55,7 @@ def register(
location_type="dataregistry",
url=None,
contact_email=None,
relative_path=None,
):
"""
Create a new dataset entry in the DESC data registry.
Expand All @@ -70,10 +71,9 @@ def register(

Parameters
----------
relative_path** : str
name** : str
version** : str
version_suffix** : str, optional
name** : str, optional
creation_date** : datetime, optional
description** : str, optional
execution_id** : int, optional
Expand Down Expand Up @@ -109,6 +109,7 @@ def register(
url**: str, optional
For `location_type="external"` only
contact_email**: str, optional
relative_path** : str, optional

Returns
-------
Expand All @@ -118,6 +119,11 @@ def register(
The execution ID associated with the dataset
"""

# If no name, we need a `relative_path`
if name is None:
if relative_path is None:
raise ValueError("`relative_path` is required when `name` is None")

# If external dataset, check for either a `url` or `contact_email`
if location_type == "external":
if url is None and contact_email is None:
Expand Down Expand Up @@ -161,19 +167,8 @@ def register(
"Only owner_type='production' can go in the production schema"
)

# If `name` not passed, automatically generate a name from the relative path
if name is None:
name = _name_from_relpath(relative_path)

# Look for previous entries. Fail if not overwritable
dataset_table = self._get_table_metadata("dataset")
previous = self._find_previous(relative_path, owner, owner_type)

if previous is None:
print(f"Dataset {relative_path} exists, and is not overwritable")
return None, None

# Deal with version string (non-special case)
dataset_table = self._get_table_metadata("dataset")
if version not in ["major", "minor", "patch"]:
v_fields = _parse_version_string(version)
version_string = version
Expand All @@ -185,6 +180,22 @@ def register(
f"{v_fields['major']}.{v_fields['minor']}.{v_fields['patch']}"
)

# If `relative_path` not passed, automatically generate one from the
# name, version and version_suffix
if relative_path is None:
JoanneBogart marked this conversation as resolved.
Show resolved Hide resolved
relative_path = _relpath_from_name(name, version_string, version_suffix)

# If `name` is None, generate it from the `relative_path`
if name is None:
name = _name_from_relpath(relative_path)

# Look for previous entries. Fail if not overwritable
previous = self._find_previous(relative_path, owner, owner_type)

if previous is None:
print(f"Dataset {relative_path} exists, and is not overwritable")
return None, None

# If no execution_id is supplied, create a minimal entry
if execution_id is None:
if execution_name is None:
Expand Down
30 changes: 30 additions & 0 deletions src/dataregistry/registrar/registrar_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"_form_dataset_path",
"get_directory_info",
"_name_from_relpath",
"_relpath_from_name",
"_copy_data",
]
VERSION_SEPARATOR = "."
Expand Down Expand Up @@ -340,3 +341,32 @@ def _compute_checksum(file_path):
)

raise Exception(e)

def _relpath_from_name(name, version, version_suffix):
"""
Construct a relative path from the name and version of a dataset.

We use this when the `relative_path` is not explicitly defined. To be safe
JoanneBogart marked this conversation as resolved.
Show resolved Hide resolved
from conflicts we also append the current timestamp to the relative_path.

Parameters
----------
name : str
Dataset name
version : str
Dataset version
version_suffix : str
Dataset version suffix

Returns
-------
relative_path : str
Automatically generated `relative_path`
"""

if version_suffix is not None:
relative_path = f"{name}_{version}_{version_suffix}"
else:
relative_path = f"{name}_{version}"

return relative_path
6 changes: 3 additions & 3 deletions src/dataregistry/schema/schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -177,13 +177,13 @@ dataset:
description: "Unique identifier for this dataset"
name:
type: "String"
description: "Any convenient, evocative name for the human. Note the combination of name, version and version_suffix must be unique. If None name is generated from the relative path."
description: "Any convenient, evocative name for the human. Note the combination of name, version and version_suffix must be unique."
nullable: False
cli_optional: True
relative_path:
type: "String"
description: "Relative path storing the data, relative to `<root_dir>`"
description: "Relative path storing the data, relative to `<root_dir>`. If None name is generated automatically from the provided name and version."
nullable: False
cli_optional: True
version_major:
type: "Integer"
description: "Major version in semantic string (i.e., X.x.x)"
Expand Down
6 changes: 3 additions & 3 deletions src/dataregistry_cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,10 +123,10 @@ def get_parser():

# Entries unique to registering the dataset when using the CLI
arg_register_dataset.add_argument(
"relative_path",
"name",
help=(
"Destination for the dataset within the data registry. Path is"
"relative to <registry root>/<owner_type>/<owner>."
"Any convenient, evocative name for the human."
"Note the combination of name, version and version_suffix must be unique."
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Concerning the question you raised about uniqueness: I don't think that name, version and version_suffix need to be unique across the entire database. It's enough if (name, version, version_suffix, owner, owner_type) is unique

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do this in another PR

),
type=str,
)
Expand Down
4 changes: 2 additions & 2 deletions src/dataregistry_cli/register.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,8 @@ def register_dataset(args):

# Register new dataset.
new_id = datareg.Registrar.dataset.register(
args.relative_path,
args.name,
args.version,
name=args.name,
version_suffix=args.version_suffix,
creation_date=args.creation_date,
access_API=args.access_API,
Expand All @@ -60,6 +59,7 @@ def register_dataset(args):
location_type=args.location_type,
url=args.url,
contact_email=args.contact_email,
relative_path=args.relative_path,
)

print(f"Created dataset entry with id {new_id}")
22 changes: 11 additions & 11 deletions tests/end_to_end_tests/database_test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,12 +144,11 @@ def _insert_execution_entry(

def _insert_dataset_entry(
datareg,
relpath,
name,
version,
owner_type=None,
owner=None,
description=None,
name=None,
execution_id=None,
version_suffix=None,
old_location=None,
Expand All @@ -164,15 +163,15 @@ def _insert_dataset_entry(
location_type="dummy",
contact_email=None,
url=None,
relative_path=None,
):
"""
Wrapper to create dataset entry

Parameters
----------
relpath : str
Relative path within the data registry to store the data
Relative to <ROOT>/<owner_type>/<owner>/...
name : str
A user selected name for the dataset
version : str
Semantic version string (i.e., M.N.P) or
"major", "minor", "patch" to automatically bump the version previous
Expand All @@ -182,8 +181,6 @@ def _insert_dataset_entry(
Dataset owner
description : str
Description of dataset
name : str
A manually selected name for the dataset
execution_id : int
Execution entry related to this dataset
version_suffix : str
Expand All @@ -205,10 +202,13 @@ def _insert_dataset_entry(
List of dataset ids that were the input to this execution
location_type : str, optional
"dataregistry", "external" or "dummy"
contact_email : str
contact_email : str, optional
Contact email for external datasets
url : str
url : str, optional
Url for external datasets
relative_path : str, optional
Relative path within the data registry to store the data
Relative to <ROOT>/<owner_type>/<owner>/...

Returns
-------
Expand All @@ -221,10 +221,9 @@ def _insert_dataset_entry(

# Add new entry.
dataset_id, execution_id = datareg.Registrar.dataset.register(
relpath,
name,
version,
version_suffix=version_suffix,
name=name,
creation_date=None,
description=description,
old_location=old_location,
Expand All @@ -243,6 +242,7 @@ def _insert_dataset_entry(
location_type=location_type,
contact_email=contact_email,
url=url,
relative_path=relative_path,
)

assert dataset_id is not None, "Trying to create a dataset that already exists"
Expand Down
11 changes: 4 additions & 7 deletions tests/end_to_end_tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def test_simple_query(dummy_file):
cli.main(shlex.split(cmd))

# Update the registered dataset
cmd = "register dataset my_cli_dataset2 patch --location_type dummy --name my_cli_dataset"
cmd = "register dataset my_cli_dataset patch --location_type dummy"
cmd += f" --schema {SCHEMA_VERSION} --root_dir {str(tmp_root_dir)}"
cli.main(shlex.split(cmd))

Expand All @@ -40,7 +40,7 @@ def test_dataset_entry_with_execution(dummy_file):
tmp_src_dir, tmp_root_dir = dummy_file

# Register a dataset with many options
cmd = "register dataset my_cli_dataset3 1.2.3 --location_type dummy"
cmd = "register dataset my_cli_dataset_with_ex 1.2.3 --location_type dummy"
cmd += " --description 'This is my dataset description'"
cmd += " --access_API 'Awesome API' --owner DESC --owner_type group"
cmd += " --version_suffix test --creation_date '2020-01-01'"
Expand All @@ -51,14 +51,11 @@ def test_dataset_entry_with_execution(dummy_file):

# Check
datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION)
f = datareg.Query.gen_filter("dataset.name", "==", "my_cli_dataset3")
f = datareg.Query.gen_filter("dataset.name", "==", "my_cli_dataset_with_ex")
results = datareg.Query.find_datasets(
[
"dataset.name",
"dataset.version_string",
"dataset.relative_path",
"execution.name",
"execution.execution_id",
"dataset.is_overwritable",
],
[f],
Expand Down Expand Up @@ -86,7 +83,7 @@ def test_production_entry(dummy_file):
# Check
f = datareg.Query.gen_filter("dataset.name", "==", "my_production_cli_dataset")
results = datareg.Query.find_datasets(
["dataset.name", "dataset.version_string", "dataset.relative_path"], [f]
["dataset.name", "dataset.version_string"], [f]
)
assert len(results["dataset.name"]) == 1, "Bad result from query dcli3"
assert results["dataset.version_string"][0] == "0.1.2"
Expand Down
10 changes: 8 additions & 2 deletions tests/end_to_end_tests/test_database_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,11 @@ def test_get_dataset_absolute_path(dummy_file):
# Make a basic entry
d_id_1 = _insert_dataset_entry(
datareg,
dset_relpath,
"dataset_test_get_dataset_absolute_path",
"0.0.1",
owner_type=dset_ownertype,
owner=dset_owner,
relative_path=dset_relpath,
)

v = datareg.Query.get_dataset_absolute_path(d_id_1)
Expand Down Expand Up @@ -72,7 +73,12 @@ def test_find_entry(dummy_file):
datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION)

# Make a dataset
d_id = _insert_dataset_entry(datareg, "test_find_entry/dataset", "0.0.1")
d_id = _insert_dataset_entry(
datareg,
"dataset_test_find_entry",
"0.0.1",
relative_path="test_find_entry/dataset",
)

# Find it
r = datareg.Registrar.dataset.find_entry(d_id)
Expand Down
Loading
Loading