Merge bec110e into 94fecde

HEPData · Jul 27, 2021 · 0b046e5 · 0b046e5
2 parents 94fecde + bec110e
commit 0b046e5
Show file tree

Hide file tree

Showing 81 changed files with 71,207 additions and 43 deletions.
diff --git a/README.rst b/README.rst
@@ -61,18 +61,126 @@ Via GitHub (for developers):
 Usage
 -----
 
+``hepdata-validator`` allows you to validate (via the command line or python):
+
+   * A full directory of submission and data files
+   * A zipped archive file containing all of the files (`full details <https://hepdata-submission.readthedocs.io/en/latest/introduction.html>`_)
+   * A `single submission file <https://hepdata-submission.readthedocs.io/en/latest/single_yaml.html>`_
+   * Individual data and submission files (via python only)
+
+Command line
+============
+
+Installing ``hepdata-validator`` adds the command ``hepdata-validate`` to your path, which allows you to validate a
+`HEPData submission <https://hepdata-submission.readthedocs.io/en/latest/introduction.html>`_ offline.
+
+Examples
+^^^^^^^^
+
+To validate a submission in the current directory:
+
+.. code:: bash
+
+    $ hepdata-validate
+
+To validate a submission in another directory:
+
+.. code:: bash
+
+    $ hepdata-validate -d ../TestHEPSubmission
+
+To validate a zip file in the current directory:
+
+.. code:: bash
+
+    $ hepdata-validate -z TestHEPSubmission.zip
+
+To validate a single yaml file in the current directory:
+
+.. code:: bash
+
+    $ hepdata-validate -f hep_submission.yaml
+
+Usage options
+^^^^^^^^^^^^^
+
+.. code:: bash
+
+    $ hepdata-validate --help
+    Usage: hepdata-validate [OPTIONS]
+
+      Offline validation of submission.yaml and YAML data files. Can check either
+      a single file or a directory
+
+    Options:
+      -d, --directory TEXT  Directory to check (defaults to current working
+                            directory)
+      -f, --file TEXT       Single submission yaml file to check - see
+                            https://hepdata-
+                            submission.readthedocs.io/en/latest/single_yaml.html.
+                            (Overrides directory)
+      -z, --zipfile TEXT    Zipped file (e.g. .zip, .tar.gz, .gzip) to check.
+                            (Overrides directory and file)
+      --help                Show this message and exit.
+
+
+Python
+======
+
+Validating a full submission
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To validate a full submission, instantiate a ``FullSubmissionValidator`` object:
+
+.. code:: python
+
+    from hepdata_validator.full_submission_validator import FullSubmissionValidator, SchemaType
+    full_submission_validator = FullSubmissionValidator()
+
+    # validate a directory
+    is_dir_valid = full_submission_validator.validate(directory='TestHEPSubmission')
+
+    # or uncomment to validate a zipped archive
+    # is_zip_valid = full_submission_validator.validate(zipfile='TestHEPSubmission.zip')
+
+    # or uncomment to validate a single file
+    # is_file_valid = full_submission_validator.validate(file='hep_submission.yaml')
+
+    # if there are any error messages, they are retrievable through this call
+    full_submission_validator.get_messages()
+
+    # the error messages can be printed for each file
+    full_submission_validator.print_errors('submission.yaml')
+
+    # the list of valid files can be retrieved via the valid_files property, which is a
+    # dict mapping SchemaType (e.g. submission, data, single file, remote) to lists of
+    # valid files
+    full_submission_validator.valid_files[SchemaType.SINGLE_YAML]
+    full_submission_validator.valid_files[SchemaType.SUBMISSION]
+    full_submission_validator.valid_files[SchemaType.DATA]
+
+    # for remote schemas, valid_files[SchemaType.REMOTE] is a list of (schema, file) tuples
+    full_submission_validator.valid_files[SchemaType.REMOTE]
+
+    # the list of valid files can be printed
+    full_submission_validator.print_valid_files()
+
+
+Validating individual files
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
 To validate submission files, instantiate a ``SubmissionFileValidator`` object:
 
 .. code:: python
 
     from hepdata_validator.submission_file_validator import SubmissionFileValidator
-    
+
     submission_file_validator = SubmissionFileValidator()
     submission_file_path = 'submission.yaml'
-    
+
     # the validate method takes a string representing the file path
     is_valid_submission_file = submission_file_validator.validate(file_path=submission_file_path)
-    
+
     # if there are any error messages, they are retrievable through this call
     submission_file_validator.get_messages()
 
@@ -83,14 +191,14 @@ To validate submission files, instantiate a ``SubmissionFileValidator`` object:
 To validate data files, instantiate a ``DataFileValidator`` object:
 
 .. code:: python
-    
+
     from hepdata_validator.data_file_validator import DataFileValidator
-    
+
     data_file_validator = DataFileValidator()
-    
+
     # the validate method takes a string representing the file path
     data_file_validator.validate(file_path='data.yaml')
-    
+
     # if there are any error messages, they are retrievable through this call
     data_file_validator.get_messages()
 
@@ -106,12 +214,12 @@ for the error message lookup map.
 
     from hepdata_validator.data_file_validator import DataFileValidator
     import yaml
-    
+
     file_contents = yaml.safe_load(open('data.yaml', 'r'))
     data_file_validator = DataFileValidator()
-    
+
     data_file_validator.validate(file_path='data.yaml', data=file_contents)
-    
+
     data_file_validator.get_messages('data.yaml')
 
     data_file_validator.print_errors('data.yaml')
@@ -131,10 +239,6 @@ For the analogous case of the ``SubmissionFileValidator``:
     is_valid_submission_file = submission_file_validator.validate(file_path=submission_file_path, data=docs)
     submission_file_validator.print_errors(submission_file_path)
 
-An example `offline validation script <https://github.com/HEPData/hepdata-submission/blob/master/scripts/check.py>`_
-uses the ``hepdata_validator`` package to validate the ``submission.yaml`` file and all YAML data files of a
-HEPData submission.
-
 
 Schema Versions
 ---------------
@@ -196,7 +300,7 @@ download them. However, in principle, for testing purposes, note that the same m
 
 .. code:: python
 
-    schema_path = 'https://hepdata.net/submission/schemas/1.0.1/'
+    schema_path = 'https://hepdata.net/submission/schemas/1.1.0/'
     schema_name = 'data_schema.json'
 
-and passing a HEPData YAML data file as the ``file_path`` argument of the ``validate`` method.
+and passing a HEPData YAML data file as the ``file_path`` argument of the ``validate`` method.
diff --git a/hepdata_validator/__init__.py b/hepdata_validator/__init__.py
@@ -24,12 +24,13 @@
 
 import abc
 import os
+from packaging import version as packaging_version
 
 from .version import __version__
 
 __all__ = ('__version__', )
 
-VALID_SCHEMA_VERSIONS = ['1.0.1', '1.0.0', '0.1.0']
+VALID_SCHEMA_VERSIONS = ['1.1.0', '1.0.1', '1.0.0', '0.1.0']
 LATEST_SCHEMA_VERSION = VALID_SCHEMA_VERSIONS[0]
 
 RAW_SCHEMAS_URL = 'https://raw.githubusercontent.com/HEPData/hepdata-validator/' \
@@ -48,22 +49,16 @@ def __init__(self, *args, **kwargs):
         self.default_schema_file = ''
         self.schemas = kwargs.get('schemas', {})
         self.schema_folder = kwargs.get('schema_folder', 'schemas')
-        self.schema_version = kwargs.get('schema_version', LATEST_SCHEMA_VERSION)
-        if self.schema_version not in VALID_SCHEMA_VERSIONS:
-            raise ValueError('Invalid schema version ' + self.schema_version)
+        self.schema_version_string = kwargs.get('schema_version', LATEST_SCHEMA_VERSION)
+        if self.schema_version_string not in VALID_SCHEMA_VERSIONS:
+            raise ValueError('Invalid schema version ' + self.schema_version_string)
+        self.schema_version = packaging_version.parse(self.schema_version_string)
 
-    def _get_major_version(self):
-        """
-        Parses the major version of the validator.
-
-        :return: integer corresponding to the validator major version
-        """
-        return int(self.schema_version.split('.')[0])
 
     def _get_schema_filepath(self, schema_filename):
         full_filepath = os.path.join(self.base_path,
                                      self.schema_folder,
-                                     self.schema_version,
+                                     self.schema_version_string,
                                      schema_filename)
 
         if not os.path.isfile(full_filepath):

diff --git a/hepdata_validator/cli.py b/hepdata_validator/cli.py
@@ -0,0 +1,30 @@
+import sys
+
+import click
+
+from .full_submission_validator import FullSubmissionValidator
+
+
+@click.command()
+@click.option('--directory', '-d', default='.', help='Directory to check (defaults to current working directory)')
+@click.option('--file', '-f', default=None, help='Single submission yaml file to check - see https://hepdata-submission.readthedocs.io/en/latest/single_yaml.html. (Overrides directory)')
+@click.option('--zipfile', '-z', default=None, help='Zipped file (e.g. .zip, .tar.gz, .gzip) to check. (Overrides directory and file)')
+def validate(directory, file, zipfile):  # pragma: no cover
+    """
+    Offline validation of submission.yaml and YAML data files.
+    Can check either a single file or a directory
+    """
+    file_or_dir_checked = zipfile if zipfile else (file if file else directory)
+    validator = FullSubmissionValidator()
+    is_valid = validator.validate(directory, file, zipfile)
+    if is_valid:
+        click.echo(f"{file_or_dir_checked} is valid.")
+    else:
+        click.echo(f"ERROR: {file_or_dir_checked} is invalid.")
+
+    validator.print_valid_files()
+    for f in validator.messages.keys():
+        validator.print_errors(f)
+
+    if not is_valid:
+        sys.exit(1)
diff --git a/hepdata_validator/data_file_validator.py b/hepdata_validator/data_file_validator.py
@@ -34,6 +34,8 @@
 
 from hepdata_validator import Validator, ValidationMessage
 from jsonschema import validate as json_validate, ValidationError
+from jsonschema.validators import validator_for
+from jsonschema.exceptions import by_relevance
 
 __author__ = 'eamonnmaguire'
 
@@ -66,7 +68,7 @@ def load_custom_schema(self, type, schema_file_path=None):
             else:
                 _schema_file = os.path.join(self.base_path,
                                             self.schema_folder,
-                                            self.schema_version,
+                                            self.schema_version_string,
                                             "{0}_schema.json".format(type))
 
             with open(_schema_file, 'r') as f:
@@ -117,9 +119,17 @@ def validate(self, **kwargs):
                 json_validate(data, custom_schema)
             else:
                 with open(self.default_schema_file, 'r') as f:
-                    default_data_schema = json.load(f)
-                    json_validate(data, default_data_schema)
-                if self._get_major_version() > 0:
+                    data_schema = json.load(f)
+                    # Create validator ourselves so we can tweak the errors
+                    cls = validator_for(data_schema)
+                    v = cls(data_schema)
+                    # Make 'oneOf' errors more relevant to give better error
+                    # messages about 'low' without 'high' etc
+                    sort_fn = by_relevance(strong='oneOf')
+                    for error in sorted(v.iter_errors(data), key=sort_fn):
+                        raise error
+
+                if self.schema_version.major > 0:
                     check_for_zero_uncertainty(data)
                     check_length_values(data)