Full submission validator now works for remote schemas
alisonrclarke committed Jul 27, 2021
1 parent ff8758e commit 395d2ed
Showing 14 changed files with 58,945 additions and 44 deletions.
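For context, a minimal usage sketch of the new behaviour (the directory name is hypothetical, and error reporting assumes the base Validator's messages dict and print_errors helper):

from hepdata_validator.full_submission_validator import FullSubmissionValidator

validator = FullSubmissionValidator()
# Validates submission.yaml plus every referenced data file, including files
# that declare a remote 'data_schema' such as a pyhf workspace schema.
if validator.validate(directory='TestRemoteSubmission'):
    validator.print_valid_files()
else:
    for filename in validator.messages:
        validator.print_errors(filename)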
75 changes: 67 additions & 8 deletions hepdata_validator/full_submission_validator.py
@@ -1,11 +1,14 @@
from enum import Enum
import os.path
import shutil
import sys
import tempfile
from urllib.parse import urlparse, urlunsplit

import yaml

from hepdata_validator import Validator, ValidationMessage
from .schema_resolver import JsonSchemaResolver
from .schema_downloader import HTTPSchemaDownloader
from .submission_file_validator import SubmissionFileValidator
from .data_file_validator import DataFileValidator

@@ -18,17 +21,31 @@
from yaml import SafeDumper as Dumper


class SchemaType(Enum):
SUBMISSION = 'submission'
SINGLE_YAML = 'single file'
DATA = 'data'
REMOTE = 'remote'


class FullSubmissionValidator(Validator):

def __init__(self, *args, **kwargs):
super(FullSubmissionValidator, self).__init__(*args, **kwargs)
self.submission_file_validator = SubmissionFileValidator(args, kwargs)
self.data_file_validator = DataFileValidator(args, kwargs)
self.valid_files = []
self.valid_files = {}

def print_valid_files(self):
for file in self.valid_files:
print(f'\t {file} is valid HEPData YAML.')
for type in SchemaType:
if type in self.valid_files:
if type == SchemaType.REMOTE:
for schema, file in self.valid_files[type]:
print(f'\t {file} is valid against schema {schema}.')
else:
for file in self.valid_files[type]:
print(f'\t {file} is valid HEPData {type.value} YAML.')
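# Illustrative sketch (file names hypothetical, not part of the commit): valid_files is
# now keyed by SchemaType, with remote-schema entries stored as (schema URL, file path)
# tuples so print_valid_files can report which schema each file was validated against:
#   {SchemaType.SUBMISSION: ['submission.yaml'],
#    SchemaType.DATA: ['data1.yaml'],
#    SchemaType.REMOTE: [('https://scikit-hep.org/pyhf/schemas/1.0.0/workspace.json',
#                         'valid_file_custom_remote.json')]}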


def validate(self, directory=None, file=None, zipfile=None):
"""
@@ -136,7 +153,8 @@ def validate(self, directory=None, file=None, zipfile=None):
is_valid_submission_file = False

if is_valid_submission_file:
self.valid_files.insert(0, self.submission_file_path)
type = SchemaType.SINGLE_YAML if self.single_yaml_file else SchemaType.SUBMISSION
self.valid_files[type] = [self.submission_file_path]

return len(self.messages) == 0
finally:
@@ -196,6 +214,18 @@ def _check_doc(self, doc):

user_data_file_path = self.submission_file_path if self.single_yaml_file else data_file_path

# Check the remote schema (if defined)
file_type = None
if 'data_schema' in doc:
try:
file_type = doc['data_schema']
self._load_remote_schema(file_type)
except FileNotFoundError:
self.add_validation_message(ValidationMessage(
file=self.submission_file_path, message=f"Remote schema {doc['data_schema']} not found."
))
return False

# Just try to load YAML data file without validating schema.
try:
contents = yaml.load(open(data_file_path, 'r'), Loader=Loader)
@@ -206,11 +236,14 @@ def _check_doc(self, doc):
return is_valid_submission_doc

# Validate the YAML data file
is_valid_data_file = self.data_file_validator.validate(file_path=data_file_path, data=contents)
is_valid_data_file = self.data_file_validator.validate(
file_path=data_file_path, file_type=file_type, data=contents
)
if not is_valid_data_file:
table_msg = f" ({doc['name']})" if self.single_yaml_file else ''
invalid_msg = f"against schema {doc['data_schema']}" if 'data_schema' in doc else "HEPData YAML"
self.add_validation_message(ValidationMessage(
file=user_data_file_path, message=f'{user_data_file_path}{table_msg} is invalid HEPData YAML.'
file=user_data_file_path, message=f'{user_data_file_path}{table_msg} is invalid {invalid_msg}.'
))
if self.single_yaml_file:
is_valid_submission_doc = False
@@ -221,10 +254,36 @@ def _check_doc(self, doc):
file=user_data_file_path, message=message.message
))
elif not self.single_yaml_file:
self.valid_files.append(user_data_file_path)
type = SchemaType.REMOTE if 'data_schema' in doc else SchemaType.DATA

if type not in self.valid_files:
self.valid_files[type] = []

if 'data_schema' in doc:
self.valid_files[type].append((doc['data_schema'], user_data_file_path))
else:
self.valid_files[type].append(user_data_file_path)

# For single YAML file, clean up by removing temporary data_file created above.
if self.single_yaml_file:
os.remove(doc['data_file'])

return is_valid_submission_doc

def _load_remote_schema(self, schema_url):
# Load the schema with the given URL into self.data_file_validator
url = urlparse(schema_url)
schema_path, schema_name = os.path.split(url.path)

base_url = urlunsplit((url.scheme, url.netloc, schema_path, '', ''))

resolver = JsonSchemaResolver(base_url)
downloader = HTTPSchemaDownloader(resolver, base_url)

# Retrieve and save the remote schema in the local path
schema_spec = downloader.get_schema_spec(schema_name)
downloader.save_locally(schema_name, schema_spec)

# Load the custom schema as a custom type
local_path = os.path.join(downloader.schemas_path, schema_name)
self.data_file_validator.load_custom_schema(schema_url, local_path)
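
A worked sketch of the URL handling in _load_remote_schema, using the pyhf workspace schema from the new test data:

from urllib.parse import urlparse, urlunsplit
import os.path

schema_url = 'https://scikit-hep.org/pyhf/schemas/1.0.0/workspace.json'
url = urlparse(schema_url)
schema_path, schema_name = os.path.split(url.path)
# schema_path == '/pyhf/schemas/1.0.0', schema_name == 'workspace.json'
base_url = urlunsplit((url.scheme, url.netloc, schema_path, '', ''))
# base_url == 'https://scikit-hep.org/pyhf/schemas/1.0.0', which seeds JsonSchemaResolver
# and HTTPSchemaDownloader; the downloaded schema is saved locally and then registered
# via data_file_validator.load_custom_schema(schema_url, local_path).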
24 changes: 12 additions & 12 deletions testsuite/test_cli.py
@@ -21,15 +21,15 @@ def test_valid_submission_dir(data_path, cli_runner):
result = cli_runner.invoke(validate, ['-d', submission_dir])
assert result.exit_code == 0
assert result.output == """{0} is valid.
{0}/submission.yaml is valid HEPData YAML.
{0}/data1.yaml is valid HEPData YAML.
{0}/data2.yaml is valid HEPData YAML.
{0}/data3.yaml is valid HEPData YAML.
{0}/data4.yaml is valid HEPData YAML.
{0}/data5.yaml is valid HEPData YAML.
{0}/data6.yaml is valid HEPData YAML.
{0}/data7.yaml is valid HEPData YAML.
{0}/data8.yaml is valid HEPData YAML.
{0}/submission.yaml is valid HEPData submission YAML.
{0}/data1.yaml is valid HEPData data YAML.
{0}/data2.yaml is valid HEPData data YAML.
{0}/data3.yaml is valid HEPData data YAML.
{0}/data4.yaml is valid HEPData data YAML.
{0}/data5.yaml is valid HEPData data YAML.
{0}/data6.yaml is valid HEPData data YAML.
{0}/data7.yaml is valid HEPData data YAML.
{0}/data8.yaml is valid HEPData data YAML.
""".format(submission_dir)


@@ -39,17 +39,17 @@ def test_valid_submission_zip(data_path, cli_runner):
assert result.exit_code == 0
lines = result.output.splitlines()
assert lines[0] == f"{submission_zip} is valid."
assert lines[1].endswith("/submission.yaml is valid HEPData YAML.")
assert lines[1].endswith("/submission.yaml is valid HEPData submission YAML.")
for i in list(range(1, 8)):
assert lines[i+1].endswith(f'data{i}.yaml is valid HEPData YAML.')
assert lines[i+1].endswith(f'data{i}.yaml is valid HEPData data YAML.')


def test_valid_single_yaml(data_path, cli_runner):
submission_file = os.path.join(data_path, '1512299.yaml')
result = cli_runner.invoke(validate, ['-f', submission_file])
assert result.exit_code == 0
assert result.output == f"""{submission_file} is valid.
{submission_file} is valid HEPData YAML.
{submission_file} is valid HEPData single file YAML.
"""


Empty file.
Empty file.
Empty file.
40 changes: 40 additions & 0 deletions testsuite/test_data/TestRemoteSubmission/submission.yaml
@@ -0,0 +1,40 @@
---
additional_resources:
- description: Web page with auxiliary material
location: https://atlas.web.cern.ch/Atlas/GROUPS/PHYSICS/PAPERS/SUSY-2018-31/
- description: Truth code to compute acceptance for all signal regions using the SimpleAnalysis
framework
location: Sbottom_MB2018.cxx
- description: Archive of full likelihoods in the HistFactory JSON format described
in ATL-PHYS-PUB-2019-029 Provided are 3 statistical models labeled RegionA RegionB
and RegionC respectively each in their own sub-directory. For each model the background-only
model is found in the file named 'BkgOnly.json' For each model a set of patches
for various signal points is provided
location: HEPData_workspaces.tar.gz
- description: slha files for the 3 baseline signal points used in the analysis for
regions A,B,C
location: SbMB_SLHAs.tar.gz
comment: ''
data_license:
description: The content can be shared and adapted but you must give
appropriate credit and cannot restrict access to others.
name: cc-by-4.0
url: https://creativecommons.org/licenses/by/4.0/
---
data_file: valid_file_custom_remote.json
data_schema: https://scikit-hep.org/pyhf/schemas/1.0.0/workspace.json
description: Background-only workspace for RegionA in pyhf JSON format.
keywords:
- name: reactions
values:
- P P --> SBOTTOM SBOTTOM X
- name: observables
values:
- N
- name: phrases
values:
- SUSY
- Supersymmetry
- 3rd Generation
- Likelihood
name: RegionA/BkgOnly.json
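
A hedged sketch of what validating this fixture should report, assuming the referenced data file (not shown in this view) passes the declared pyhf workspace schema:

from hepdata_validator.full_submission_validator import FullSubmissionValidator

validator = FullSubmissionValidator()
assert validator.validate(directory='testsuite/test_data/TestRemoteSubmission')
validator.print_valid_files()
# Expected lines, per the new message formats (paths abbreviated):
#   .../submission.yaml is valid HEPData submission YAML.
#   .../valid_file_custom_remote.json is valid against schema
#   https://scikit-hep.org/pyhf/schemas/1.0.0/workspace.json.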
