Add checks for zero uncertainty and length of values list in data schema

* Check that uncertainties are not all zero in data schema (closes #7).
* Check that length of 'values' list is consistent in data schema.
* Add tests for all zero uncertainties and inconsistent 'values' list.
* Catch LookupError in 'validate' method of SubmissionFileValidator.
* Rename 'valid_data_with_error.yaml' to 'valid_data_with_percent.yaml' and remove invalid fields.
* Change 'error' to 'uncertainty' in tests since 'error' is ambiguous.
* Add 2nd value for independent variable in 'valid_file.yaml' (& json).
* Correct mistake in README.rst, add print_errors in last usage example.

Signed-off-by: Graeme Watt <graeme.watt@durham.ac.uk>
HEPData · Feb 25, 2020 · 293eec0 · 293eec0
1 parent 7148b62
commit 293eec0
Show file tree

Hide file tree

Showing 10 changed files with 243 additions and 20 deletions.
diff --git a/README.rst b/README.rst
@@ -103,13 +103,15 @@ for the error message lookup map.
     from hepdata_validator.data_file_validator import DataFileValidator
     import yaml
     
-    file = yaml.load(open('data.yaml', 'r'))
+    file_contents = yaml.load(open('data.yaml', 'r'))
     data_file_validator = DataFileValidator()
     
     data_file_validator.validate(file_path='data.yaml', data=file_contents)
     
     data_file_validator.get_messages('data.yaml')
 
+    data_file_validator.print_errors('data.yaml')
+
 
 An example `offline validation script <https://github.com/HEPData/hepdata-submission/blob/master/scripts/check.py>`_
 uses the ``hepdata_validator`` package to validate the ``submission.yaml`` file and all YAML data files of a

diff --git a/hepdata_validator/data_file_validator.py b/hepdata_validator/data_file_validator.py
@@ -121,6 +121,9 @@ def validate(self, **kwargs):
                 json_validate(data, custom_schema)
             else:
                 json_validate(data, default_data_schema)
+                if self.schema_version != '0.1.0':
+                    check_for_zero_uncertainty(data)
+                    check_length_values(data)
 
         except ValidationError as ve:
 
@@ -143,3 +146,64 @@ def __init__(self, message=''):
 
     def __unicode__(self):
         return self.message
+
+
+def check_for_zero_uncertainty(data):
+    """
+    Check that uncertainties are not all zero.
+    
+    :param data: data table in YAML format
+    :return: raise ValidationError if uncertainties are all zero
+    """
+    for dependent_variable in data['dependent_variables']:
+
+        if 'values' in dependent_variable:
+            for value in dependent_variable['values']:
+
+                if 'errors' in value:
+                    zero_uncertainties = []
+                    for error in value['errors']:
+
+                        if 'symerror' in error:
+                            error_plus = error_minus = error['symerror']
+                        elif 'asymerror' in error:
+                            error_plus = error['asymerror']['plus']
+                            error_minus = error['asymerror']['minus']
+
+                        if isinstance(error_plus, str):
+                            error_plus = error_plus.replace('%', '')
+                        try:
+                            error_plus = float(error_plus)
+                        except ValueError:
+                            pass
+
+                        if isinstance(error_minus, str):
+                            error_minus = error_minus.replace('%', '')
+                        try:
+                            error_minus = float(error_minus)
+                        except ValueError:
+                            pass
+
+                        if error_plus == 0 and error_minus == 0:
+                            zero_uncertainties.append(True)
+                        else:
+                            zero_uncertainties.append(False)
+
+                    if all(zero_uncertainties):
+                        raise ValidationError('Uncertainties should not all be zero', instance=value)
+
+
+def check_length_values(data):
+    """
+    Check that the length of the 'values' list is consistent for
+    each of the independent_variables and dependent_variables.
+    
+    :param data: data table in YAML format
+    :return: raise ValidationError if inconsistent
+    """
+    indep_count = [len(indep['values']) for indep in data['independent_variables']]
+    dep_count = [len(dep['values']) for dep in data['dependent_variables']]
+    if len(set(indep_count + dep_count)) > 1:  # if more than one unique count
+        raise ValidationError("Inconsistent length of 'values' list: " + 
+                              "independent_variables%s, dependent_variables%s" % (str(indep_count), str(dep_count)),
+                              instance=data)
diff --git a/hepdata_validator/submission_file_validator.py b/hepdata_validator/submission_file_validator.py
@@ -84,6 +84,9 @@ def validate(self, **kwargs):
             if not self.has_errors(file_path):
                 return_value = True
 
+        except LookupError as le:
+            raise le
+
         except ScannerError as se:  # pragma: no cover
             self.add_validation_message(  # pragma: no cover
                 ValidationMessage(file=file_path, message=

diff --git a/testsuite/test_data/file_with_inconsistent_values.yaml b/testsuite/test_data/file_with_inconsistent_values.yaml
@@ -0,0 +1,20 @@
+---
+independent_variables:
+  - header: {name: SQRT(S), units: GEV}
+    values:
+      - value: 7000
+dependent_variables:
+  - header: {name: SIG(total), units: FB}
+    qualifiers:
+      - {name: RE, value: P P --> Z0 Z0 X}
+    values:
+      - value: 6.7
+        errors:
+          - {symerror: 0.45, label: stat}
+          - {asymerror: {plus: 0.4, minus: -0.3}, label: sys}
+          - {symerror: 0.34, label: "sys,lumi"}
+      - value: 5.7
+        errors:
+          - {symerror: 0.4, label: stat}
+          - {asymerror: {plus: 0.42, minus: 0.31}, label: sys}
+          - {symerror: 0.4, label: "sys,lumi"}
diff --git a/testsuite/test_data/file_with_zero_uncertainty.yaml b/testsuite/test_data/file_with_zero_uncertainty.yaml
@@ -0,0 +1,21 @@
+---
+independent_variables:
+  - header: {name: SQRT(S), units: GEV}
+    values:
+      - value: 7000
+      - value: 8000
+dependent_variables:
+  - header: {name: SIG(total), units: FB}
+    qualifiers:
+      - {name: RE, value: P P --> Z0 Z0 X}
+    values:
+      - value: 6.7
+        errors:
+          - {symerror: 0.45, label: stat}
+          - {asymerror: {plus: 0.4, minus: -0.3}, label: sys}
+          - {symerror: 0.34, label: "sys,lumi"}
+      - value: 5.7
+        errors:
+          - {symerror: 0.0, label: stat}
+          - {asymerror: {plus: 0.0, minus: 0.0}, label: sys}
+          - {symerror: 0.0, label: "sys,lumi"}
diff --git a/testsuite/test_data/valid_data_with_error.yaml b/testsuite/test_data/valid_data_with_percent.yaml
@@ -1,14 +1,4 @@
-
 ---
-name: 'Table 1'
-label: 'Data from Table 4'
-description: |
-    Measured double-differential dijet cross sections for the range 0.0 &lt;= y* &lt; 0.5 and jet radius parameter R = 0.4.  The statistical uncertainties from data and MC simulation have been combined.  The three columns correspond to nominal, stronger or weaker correlations between jet energy scale uncertainty components
-keywords:
-  - {name: reactions, values: ['P P --> JET JET X']}
-  - {name: observables, values: ['D2SIG/DM/DYRAP']}
-  - {name: cmenergies, values: [7000.0]}
-additional_resources:
 independent_variables:
   - header: {name: 'M(2JET)', units: 'TEV'}
     values:
@@ -49,7 +39,7 @@ dependent_variables:
       - value: 777000.0
         errors:
           - {symerror: 0.73%, label: 'stat'}
-          - {asymerror: {plus: 3.1%, minus: -3.0%}, label: 'sys'}
+          - {asymerror: {plus: '', minus: -3.0%}, label: 'sys'}  # empty string for 'plus' component
           - {symerror: 0.4%, label: 'sys'}
           - {symerror: 0.1%, label: 'sys'}
           - {symerror: 0.2%, label: 'sys'}
@@ -61,7 +51,7 @@ dependent_variables:
           - {symerror: 1.0%, label: 'sys'}
           - {symerror: 0.3%, label: 'sys'}
           - {symerror: 0.8%, label: 'sys'}
-          - {asymerror: {plus: 0.5%, minus: -0.6%}, label: 'sys'}
+          - {asymerror: {plus: 0.5%, minus: ''}, label: 'sys'}  # empty string for 'minus' component
           - {symerror: 0.0%, label: 'sys'}
           - {symerror: 0.0%, label: 'sys'}
           - {symerror: 0.0%, label: 'sys'}

diff --git a/testsuite/test_data/valid_data_with_zero_percent.yaml b/testsuite/test_data/valid_data_with_zero_percent.yaml
@@ -0,0 +1,89 @@
+---
+independent_variables:
+  - header: {name: 'M(2JET)', units: 'TEV'}
+    values:
+      - {low: 0.26, high: 0.31}
+dependent_variables:
+  - header: {name: 'D2(SIG)/DM(2JET)/DYRAP*', units: 'PB*TEV*-1'}
+    qualifiers:
+      - {name: SQRT(S), value: '7000.0', units: 'GeV'}
+      - {name: YRAP*, value: '0.0-0.5'}
+      - {name: R, value: '0.4'}
+      - {name: RE, value: 'P P --> JET JET X'}
+      - {name: PT(JET1), value: '> 100', units: 'GEV'}
+      - {name: PT(JET2), value: '> 50', units: 'GEV'}
+      - {name: ABS(YRAP(JET1)), value: '< 3.0'}
+      - {name: ABS(YRAP(JET2)), value: '< 3.0'}
+      - {name: Correlation assumptions, value: 'Nominal'}
+    values:
+      - value: 777000.0
+        errors:
+          - {symerror: 0.0%, label: 'stat'}
+          - {asymerror: {plus: 0.0%, minus: -0.0%}, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {asymerror: {plus: 0.0%, minus: -0.0%}, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {asymerror: {plus: 0.0%, minus: -0.0%}, label: 'sys'}
+          - {asymerror: {plus: 0.0%, minus: -0.0%}, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {asymerror: {plus: 0.0%, minus: -0.0%}, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {asymerror: {plus: 0.0%, minus: -0.0%}, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {asymerror: {plus: 0.0%, minus: -0.0%}, label: 'sys'}
+          - {asymerror: {plus: 0.0%, minus: -0.0%}, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {asymerror: {plus: 0.0%, minus: -0.0%}, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {asymerror: {plus: 0.0%, minus: -0.0%}, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {asymerror: {plus: -0.0%, minus: 0.0%}, label: 'sys'}
+          - {asymerror: {plus: -0.0%, minus: 0.0%}, label: 'sys'}
+          - {asymerror: {plus: 0.0%, minus: -0.0%}, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {asymerror: {plus: 0.0%, minus: -0.0%}, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
+          - {symerror: 0.0%, label: 'sys'}
diff --git a/testsuite/test_data/valid_file.json b/testsuite/test_data/valid_file.json
@@ -9,6 +9,9 @@
         {
           "high": 7000,
           "low": 6500
+        },
+        {
+          "value": 8000
         }
       ]
     }

diff --git a/testsuite/test_data/valid_file.yaml b/testsuite/test_data/valid_file.yaml
@@ -3,6 +3,7 @@ independent_variables:
   - header: {name: SQRT(S), units: GEV}
     values:
       - value: 7000
+      - value: 8000
 dependent_variables:
   - header: {name: SIG(total), units: FB}
     qualifiers:

diff --git a/testsuite/validation_test.py b/testsuite/validation_test.py
@@ -193,9 +193,9 @@ def setUp(self):
             self.base_dir,
             'test_data/invalid_file.json')
 
-        self.valid_file_error_percent_yaml = os.path.join(
+        self.valid_file_uncertainty_percent_yaml = os.path.join(
             self.base_dir,
-            'test_data/valid_data_with_error.yaml'
+            'test_data/valid_data_with_percent.yaml'
         )
 
         self.invalid_syntax_data_file = os.path.join(
@@ -212,6 +212,18 @@ def setUp(self):
             self.base_dir,
             'test_data/valid_file_custom.yaml')
 
+        self.file_with_zero_uncertainty = os.path.join(
+            self.base_dir,
+            'test_data/file_with_zero_uncertainty.yaml')
+
+        self.file_with_zero_percent = os.path.join(
+            self.base_dir,
+            'test_data/valid_data_with_zero_percent.yaml')
+
+        self.file_with_inconsistent_values = os.path.join(
+            self.base_dir,
+            'test_data/file_with_inconsistent_values.yaml')
+
     def test_no_file_path_supplied(self):
         try:
             self.validator.validate(file_path=None)
@@ -231,11 +243,11 @@ def test_invalid_yaml_file(self):
 
         self.validator.print_errors(self.invalid_file_yaml)
 
-    def test_valid_file_with_percent_errors(self):
-        print('___DATA_VALIDATION: Testing valid yaml percent error ___')
-        self.assertEqual(self.validator.validate(file_path=self.valid_file_error_percent_yaml),
-                         False)
-        self.validator.print_errors(self.valid_file_error_percent_yaml)
+    def test_valid_file_with_percent_uncertainty(self):
+        print('___DATA_VALIDATION: Testing valid yaml percent uncertainty ___')
+        is_valid = self.validator.validate(file_path=self.valid_file_uncertainty_percent_yaml)
+        self.validator.print_errors(self.valid_file_uncertainty_percent_yaml)
+        self.assertEqual(is_valid, True)
 
     def test_valid_json_file(self):
         print('___DATA_VALIDATION: Testing valid json submission___')
@@ -319,6 +331,24 @@ def test_ioerror_yaml_file(self):
 
         self.validator.print_errors(self.valid_file_yaml[:-1])
 
+    def test_file_with_zero_uncertainty(self):
+        print('___DATA_VALIDATION: Testing file with zero uncertainty___')
+        self.assertEqual(self.validator.validate(file_path=self.file_with_zero_uncertainty), False)
+
+        self.validator.print_errors(self.file_with_zero_uncertainty)
+
+    def test_file_with_zero_percent(self):
+        print('___DATA_VALIDATION: Testing file with zero percent uncertainty___')
+        self.assertEqual(self.validator.validate(file_path=self.file_with_zero_percent), False)
+
+        self.validator.print_errors(self.file_with_zero_percent)
+
+    def test_file_with_inconsistent_values(self):
+        print('___DATA_VALIDATION: Testing file with inconsistent values list___')
+        self.assertEqual(self.validator.validate(file_path=self.file_with_inconsistent_values), False)
+
+        self.validator.print_errors(self.file_with_inconsistent_values)
+
 
 if __name__ == '__main__':
     unittest.main()