Merge bb2acc7 into 5db8fb7

HEPData · Mar 17, 2020 · 75e789e · 75e789e
2 parents 5db8fb7 + bb2acc7
commit 75e789e
Show file tree

Hide file tree

Showing 7 changed files with 307 additions and 14 deletions.
diff --git a/.gitignore b/.gitignore
@@ -56,4 +56,7 @@ docs/_build/
 target/
 
 # PyCharm
-.idea/
+.idea/
+
+# Downloaded schemas
+hepdata_validator/schemas_remote/
diff --git a/README.rst b/README.rst
@@ -61,7 +61,29 @@ Via GitHub (for developers):
 Usage
 -----
 
-To validate files, you need to instantiate a validator (I love OO).
+To validate against remote schemas, instantiate a ``HTTPSchemaDownloader`` object.
+
+This object retrieves schemas from a remote location, and can optionally save them in the local file system,
+following the structure: ``schemas_remote/<company>/<version>/<schema_name>``
+
+.. code:: python
+
+    from hepdata_validator.schema_downloader import HTTPSchemaDownloader
+
+    downloader = HTTPSchemaDownloader(
+        endpoint="https://scikit-hep.org/pyhf/schemas/1.0.0",
+        company="scikit-hep.org",
+        version="1.0.0",
+    )
+
+    schema_name = "defs.json"
+    schema_spec = downloader.get_schema(schema_name)
+
+    # The downloader stores the remote schema in the local path
+    downloader.save_locally(schema_name, schema_spec)
+
+
+To validate submissions, instantiate a ``SubmissionFileValidator`` object:
 
 .. code:: python
 
@@ -80,7 +102,11 @@ To validate files, you need to instantiate a validator (I love OO).
     submission_file_validator.print_errors(submission_file_path)
 
 
-Data file validation is exactly the same.
+To validate data files, you need to instantiate a ``DataFileValidator`` object.
+
+In this case, the ``DataFileValidator`` can take a ``schema_folder`` argument to specify
+the folder containing the schemas it validates against (by default ``schemas``).
+This is useful when validating against schemas stored inside ``schemas_remote/<company>``.
 
 .. code:: python
     
@@ -125,13 +151,22 @@ HEPData submission.
 Schemas
 -------
 
-There are currently 2 versions of the JSON schemas, `0.1.0
+When considering **native HEP JSON schemas**, there are currently 2 versions: `0.1.0
 <https://github.com/HEPData/hepdata-validator/tree/master/hepdata_validator/schemas/0.1.0>`_ and `1.0.0
-<https://github.com/HEPData/hepdata-validator/tree/master/hepdata_validator/schemas/1.0.0>`_. In most cases you should use
-**1.0.0** (the default). If you need to use a different version, you can pass a keyword argument ``schema_version``
-when initialising the validator:
+<https://github.com/HEPData/hepdata-validator/tree/master/hepdata_validator/schemas/1.0.0>`_.
+In most cases you should use **1.0.0** (the default). If you need to use a different version,
+you can pass a keyword argument ``schema_version`` when initialising the validator:
+
+.. code:: python
+
+    sub_validator = SubmissionFileValidator(schema_version='0.1.0')
+    data_validator = DataFileValidator(schema_version='0.1.0')
+
+When using **remotely defined schemas**, versions depend on the organization providing those schemas,
+and it is their responsibility to offer a way of keeping track of different schema versions.
+An example may be:
 
 .. code:: python
 
-    submission_file_validator = SubmissionFileValidator(schema_version='0.1.0')
-    data_file_validator = DataFileValidator(schema_version='0.1.0')
+    sub_validator = SubmissionFileValidator(schema_folder='schemas_remote/scikit-hep.org', schema_version='1.0.0')
+    data_validator = DataFileValidator(schema_folder='schemas_remote/scikit-hep.org', schema_version='1.0.0')
diff --git a/hepdata_validator/__init__.py b/hepdata_validator/__init__.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # This file is part of HEPData.
-# Copyright (C) 2016 CERN.
+# Copyright (C) 2020 CERN.
 #
 # HEPData is free software; you can redistribute it
 # and/or modify it under the terms of the GNU General Public License as
@@ -47,13 +47,14 @@ def __init__(self, *args, **kwargs):
         self.messages = {}
         self.default_schema_file = ''
         self.schemas = kwargs.get('schemas', {})
+        self.schema_folder = kwargs.get('schema_folder', 'schemas')
         self.schema_version = kwargs.get('schema_version', LATEST_SCHEMA_VERSION)
         if self.schema_version not in VALID_SCHEMA_VERSIONS:
             raise ValueError('Invalid schema version ' + self.schema_version)
 
     def _get_schema_filepath(self, schema_filename):
         full_filepath = os.path.join(self.base_path,
-                                     'schemas',
+                                     self.schema_folder,
                                      self.schema_version,
                                      schema_filename)
 

diff --git a/hepdata_validator/data_file_validator.py b/hepdata_validator/data_file_validator.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # This file is part of HEPData.
-# Copyright (C) 2016 CERN.
+# Copyright (C) 2020 CERN.
 #
 # HEPData is free software; you can redistribute it
 # and/or modify it under the terms of the GNU General Public License as
@@ -66,7 +66,7 @@ def load_custom_schema(self, type, schema_file_path=None):
                 _schema_file = schema_file_path
             else:
                 _schema_file = os.path.join(self.base_path,
-                                            'schemas',
+                                            self.schema_folder,
                                             self.schema_version,
                                             "{0}_schema.json".format(type))
 

diff --git a/hepdata_validator/schema_downloader.py b/hepdata_validator/schema_downloader.py
@@ -0,0 +1,131 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of HEPData.
+# Copyright (C) 2020 CERN.
+#
+# HEPData is free software; you can redistribute it
+# and/or modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# HEPData is distributed in the hope that it will be
+# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with HEPData; if not, write to the
+# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+# MA 02111-1307, USA.
+#
+# In applying this license, CERN does not
+# waive the privileges and immunities granted to it by virtue of its status
+# as an Intergovernmental Organization or submit itself to any jurisdiction.
+
+import os
+import requests
+from abc import ABCMeta
+from abc import abstractmethod
+
+
+class SchemaDownloaderInterface(object):
+    """
+    Interface for the schema downloader objects.
+    Used to retrieve schemas available across the internet.
+    """
+
+    __metaclass__ = ABCMeta
+
+    @abstractmethod
+    def get_schema(self, schema_name):
+        """
+        Retrieves the specified schema from a remote URL.
+        :param schema_name: str.
+        :return: str.
+        """
+
+        raise NotImplementedError()
+
+    @abstractmethod
+    def save_locally(self, schema_name, schema_spec, overwrite):
+        """
+        Saves the remote schema in the local file system
+        :param schema_name: str.
+        :param schema_spec: str.
+        :param overwrite: bool.
+        :return: None.
+        """
+
+        raise NotImplementedError()
+
+
+class HTTPSchemaDownloader(SchemaDownloaderInterface):
+    """
+    Object to download schemas using HTTP / HTTPS
+    Used to retrieve schemas available across the internet.
+    """
+
+    def __init__(self, endpoint, company, version):
+        """
+        Stores the connection details and computes the local folder path where schemas will be saved.
+        :param endpoint: str.
+        :param company: str.
+        :param version: str.
+        """
+
+        self.endpoint = endpoint
+        self.company = company
+        self.version = version
+
+        self.saved_schema_folder = "schemas_remote"
+        self.saved_schema_path = self._build_local_path(company, version)
+
+    def _build_local_path(self, company, version):
+        """
+        Builds the local folder path where downloaded schemas are saved, up to (excluding) the schema names
+        :param company: str
+        :param version: str.
+        :return: str.
+        """
+
+        base_path = os.path.dirname(__file__)
+        return os.path.join(base_path, self.saved_schema_folder, company, version)
+
+    def get_schema(self, schema_name):
+        """
+        Downloads the specified schema from a remote URL.
+        :param schema_name: str.
+        :return: str.
+        """
+
+        schema_url = self.endpoint + "/" + schema_name
+        schema_resp = requests.get(schema_url)
+        schema_resp.raise_for_status()
+
+        return schema_resp.text
+
+    def save_locally(self, schema_name, schema_spec, overwrite=False):
+        """
+        Saves the remote schema in the local file system
+        :param schema_name: str.
+        :param schema_spec: str.
+        :param overwrite: bool.
+        :return: None.
+        """
+
+        file_path = os.path.join(self.saved_schema_path, schema_name)
+        file_folder = os.path.dirname(file_path)
+
+        # Skip writing if the file already exists and overwriting was not requested
+        if os.path.isfile(file_path) and not overwrite:
+            return
+
+        # This is compatible both with Python2 and Python3
+        try:
+            os.makedirs(file_folder)
+        except OSError:
+            if not os.path.isdir(file_folder):
+                raise
+
+        with open(file_path, 'w') as f:
+            f.write(schema_spec)
diff --git a/setup.py b/setup.py
@@ -13,6 +13,7 @@
     'pytest-cov>=1.8.0',
     'pytest-pep8>=1.0.6',
     'coverage>=3.7.1',
+    'mock>=2.0.0',
 ]
 
 extras_require = {
@@ -83,7 +84,8 @@ def run_tests(self):
     extras_require=extras_require,
     install_requires=[
         "pyyaml",
-        "jsonschema"
+        "jsonschema",
+        "requests",
     ],
     test_suite='hepdata_validator.testsuite',
     tests_require=test_requirements,

diff --git a/testsuite/test_schema_downloader.py b/testsuite/test_schema_downloader.py
@@ -0,0 +1,121 @@
+import os
+import pytest
+from hepdata_validator.schema_downloader import HTTPSchemaDownloader
+from requests.exceptions import HTTPError
+from mock import patch
+
+
+####################################################
+#                 Tests fixtures                   #
+####################################################
+
+
+@pytest.fixture(scope="module")
+def http_downloader():
+    """
+    Generates a valid HTTPSchemaDownloader using example names
+    """
+
+    return HTTPSchemaDownloader(
+        endpoint="https://testing.com/schemas/1.0.0",
+        company="testing.com",
+        version="1.0.0",
+    )
+
+
+####################################################
+#                   Tests mocks                    #
+####################################################
+
+
+class MockedResponse(object):
+
+    def __init__(self, content, http_code):
+        self.content = content
+        self.http_code = http_code
+
+    def raise_for_status(self):
+        if self.http_code != 200:
+            raise HTTPError
+
+    @property
+    def text(self):
+        return self.content
+
+
+def get_patched_valid_response(url):
+    return MockedResponse('{"field_1": "value_1", "field_2": "value_2"}', 200)
+
+
+def get_patched_invalid_response(url):
+    return MockedResponse("Not found", 404)
+
+
+####################################################
+#            HTTPSchemaDownloader tests            #
+####################################################
+
+
+@patch('requests.get', new=get_patched_valid_response)
+def test_http_downloader_get_schema(http_downloader):
+    """
+    Tests the HTTPSchemaDownloader with a real schema name
+    :param http_downloader: HTTPSchemaDownloader
+    """
+
+    file_name = "real_schema.json"
+
+    schema_spec = http_downloader.get_schema(file_name)
+    assert len(schema_spec) > 0
+
+
+@patch('requests.get', new=get_patched_invalid_response)
+def test_http_downloader_get_missing_schema(http_downloader):
+    """
+    Tests the HTTPSchemaDownloader with a missing schema name
+    :param http_downloader: HTTPSchemaDownloader
+    """
+
+    file_name = "missing_schema.json"
+
+    with pytest.raises(HTTPError):
+        http_downloader.get_schema(file_name)
+
+
+def test_http_downloader_save_schema(http_downloader):
+    """
+    Tests that the HTTPSchemaDownloader saves a schema in the local file system
+    :param http_downloader: HTTPSchemaDownloader
+    """
+
+    schema_name = "dummy.json"
+    schema_spec = '{"key_1": "value_1", "key_2": "value_2"}'
+
+    http_downloader.save_locally(schema_name, schema_spec, overwrite=True)
+
+    expected_folder = http_downloader.saved_schema_path
+    expected_path = os.path.join(expected_folder, schema_name)
+
+    assert os.path.isfile(expected_path)
+
+
+def test_http_downloader_save_existing_schema(http_downloader):
+    """
+    Tests that the HTTPSchemaDownloader keeps an existing schema when overwrite is False
+    :param http_downloader: HTTPSchemaDownloader
+    """
+
+    schema_name = "dummy.json"
+    schema_spec_1 = '{"key_1": "value_1", "key_2": "value_2"}'
+    schema_spec_2 = '{"key_1": "new_value_1", "key_2": "new_value_2"}'
+
+    http_downloader.save_locally(schema_name, schema_spec_1, overwrite=True)
+    http_downloader.save_locally(schema_name, schema_spec_2, overwrite=False)
+
+    expected_folder = http_downloader.saved_schema_path
+    expected_path = os.path.join(expected_folder, schema_name)
+
+    with open(expected_path, 'r') as f:
+        file_content = f.read()
+
+    assert file_content == schema_spec_1