Skip to content

Commit

Permalink
Add option to disallow automatic remote schema loading
Browse files Browse the repository at this point in the history
Intended for use by the main HEPData app, so that it can preload the allowed remote schemas itself.

Also made custom_data_schemas in DataFileValidator an instance variable
so it's clear which schemas have been loaded for which validator.
  • Loading branch information
alisonrclarke committed Aug 17, 2021
1 parent e1f7b24 commit 0688a68
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 8 deletions.
2 changes: 1 addition & 1 deletion hepdata_validator/data_file_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,11 @@ class DataFileValidator(Validator):
"""
base_path = os.path.dirname(__file__)
schema_name = 'data_schema.json'
custom_data_schemas = {}

def __init__(self, *args, **kwargs):
    """Initialise the validator with its default schema and no custom schemas."""
    super().__init__(*args, **kwargs)
    self.default_schema_file = self._get_schema_filepath(self.schema_name)
    # Held per instance so it is clear which schemas were loaded for which validator.
    self.custom_data_schemas = {}

def load_custom_schema(self, type, schema_file_path=None):
"""
Expand Down
35 changes: 28 additions & 7 deletions hepdata_validator/full_submission_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ def __init__(self, *args, **kwargs):
self._data_file_validator = DataFileValidator(*args, **kwargs)
self.valid_files = {}
self.submission_docs = None
if 'autoload_remote_schemas' in kwargs:
self.autoload_remote_schemas = kwargs['autoload_remote_schemas']
else:
self.autoload_remote_schemas = True

def print_valid_files(self):
for type in SchemaType:
Expand Down Expand Up @@ -265,7 +269,14 @@ def _check_doc(self, doc):
if 'data_schema' in doc:
try:
file_type = doc['data_schema']
self._load_remote_schema(file_type)
if self.autoload_remote_schemas:
self.load_remote_schema(file_type)
elif doc['data_schema'] not in self._data_file_validator.custom_data_schemas:
self.add_validation_message(ValidationMessage(
file=self.submission_file_path,
message=f"Autoloading of remote schema {doc['data_schema']} is not allowed."
))
return False
except FileNotFoundError:
self.add_validation_message(ValidationMessage(
file=self.submission_file_path, message=f"Remote schema {doc['data_schema']} not found."
Expand Down Expand Up @@ -318,15 +329,25 @@ def _check_doc(self, doc):

return is_valid_submission_doc

def _load_remote_schema(self, schema_url):
# Load the schema with the given URL into self._data_file_validator
url = urlparse(schema_url)
schema_path, schema_name = os.path.split(url.path)

base_url = urlunsplit((url.scheme, url.netloc, schema_path, '', ''))
def load_remote_schema(self, schema_url=None, base_url=None, schema_name=None):
"""
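Loads the given remote schema into this validator's DataFileValidator.

Provide EITHER schema_url OR both base_url and schema_name; otherwise a
ValueError is raised. A schema that has already been loaded is not
downloaded again.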
"""
if schema_url:
url = urlparse(schema_url)
schema_path, schema_name = os.path.split(url.path)
base_url = urlunsplit((url.scheme, url.netloc, schema_path, '', ''))
elif not base_url or not schema_name:
raise ValueError("Must provide EITHER schema_url OR both base_url and schema_name")

resolver = JsonSchemaResolver(base_url)
downloader = HTTPSchemaDownloader(resolver, base_url)
if not schema_url:
schema_url = downloader.get_schema_type(schema_name)

# Don't download again if already loaded
if schema_url in self._data_file_validator.custom_data_schemas:
return

# Retrieve and save the remote schema in the local path
schema_spec = downloader.get_schema_spec(schema_name)
Expand Down
45 changes: 45 additions & 0 deletions testsuite/test_full_submission_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,39 @@ def test_valid_submission_dir_remote_schema(validator_v1, data_path, capsys):
""".format(submission_dir)


def test_valid_submission_dir_remote_schema_no_autoloading(data_path):
    """With autoloading disabled, a remote schema must be preloaded before validation passes."""
    submission_dir = os.path.join(data_path, 'TestRemoteSubmission')
    submission_file = os.path.join(submission_dir, 'submission.yaml')
    validator = FullSubmissionValidator(schema_version='1.1.0', autoload_remote_schemas=False)

    # No schema has been preloaded, so validation must fail with exactly one message
    assert not validator.validate(directory=submission_dir)
    reported = validator.get_messages(submission_file)
    assert len(reported) == 1
    assert reported[0].message == \
        "Autoloading of remote schema https://scikit-hep.org/pyhf/schemas/1.0.0/workspace.json is not allowed."

    # Preload the remote schema explicitly; the same submission should now validate
    validator.clear_all()
    validator.load_remote_schema(schema_url='https://scikit-hep.org/pyhf/schemas/1.0.0/workspace.json')
    assert validator.validate(directory=submission_dir)


def test_valid_submission_dir_remote_schema_multiple_loads():
    """Loading the same remote schema twice must leave a single entry in custom_data_schemas."""
    validator = FullSubmissionValidator(schema_version='1.1.0', autoload_remote_schemas=False)
    expected_key = "https://scikit-hep.org/pyhf/schemas/1.0.0/workspace.json"

    # First load goes via base_url + schema_name, the second via the full URL;
    # the second must not add anything to custom_data_schemas.
    load_attempts = (
        {"base_url": "https://scikit-hep.org/pyhf/schemas/1.0.0", "schema_name": "workspace.json"},
        {"schema_url": 'https://scikit-hep.org/pyhf/schemas/1.0.0/workspace.json'},
    )
    for load_kwargs in load_attempts:
        validator.load_remote_schema(**load_kwargs)
        # Re-read the attribute after every load rather than holding an alias
        loaded = validator._data_file_validator.custom_data_schemas
        assert len(loaded) == 1
        assert expected_key in loaded


def test_invalid_input(validator_v1, data_path, capsys):
# Invalid file
is_valid = validator_v1.validate(file='notafile')
Expand Down Expand Up @@ -203,6 +236,18 @@ def test_invalid_syntax_submission(validator_v1, data_path, capsys):
in "{file}", line 10, column 1"""


def test_invalid_remote_schema_load(validator_v1):
    """load_remote_schema must reject calls lacking a schema_url or a base_url/schema_name pair."""
    expected_error = "Must provide EITHER schema_url OR both base_url and schema_name"

    # No arguments at all, then schema_name without its base_url
    for incomplete_kwargs in ({}, {"schema_name": "my_schema_name"}):
        with pytest.raises(ValueError) as excinfo:
            validator_v1.load_remote_schema(**incomplete_kwargs)

        assert str(excinfo.value) == expected_error


def test_invalid_remote_schema(validator_v1, data_path, capsys):
submission_dir = os.path.join(data_path, 'TestRemoteSubmission_invalid')
file = os.path.join(submission_dir, 'submission.yaml')
Expand Down

0 comments on commit 0688a68

Please sign in to comment.