Skip to content

Commit

Permalink
Move MITAardvark format validation method into validator module
Browse files Browse the repository at this point in the history
  • Loading branch information
jonavellecuerdo committed Feb 6, 2024
1 parent f63608d commit 4041f31
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 83 deletions.
85 changes: 3 additions & 82 deletions harvester/records/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,17 @@
import datetime
import json
import logging
import os
from abc import abstractmethod
from typing import Any, Literal

from attrs import asdict, define, field, fields
from attrs.validators import in_, instance_of
from jsonschema import FormatChecker
from jsonschema.validators import Draft202012Validator
from lxml import etree # type: ignore[import-untyped]
from referencing import Registry, Resource

from harvester.aws.sqs import ZipFileEventMessage
from harvester.config import Config
from harvester.records.exceptions import FieldMethodError, JSONSchemaValidationError
from harvester.records.exceptions import FieldMethodError
from harvester.records.validators import MITAardvarkFormatValidator
from harvester.utils import dedupe_list_of_values

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -112,83 +109,7 @@ def __attrs_post_init__(self) -> None:
The JSON schema validation is performed in addition to the validation of
arguments through attrs attribute validators.
"""
self.validate()

@property
def json_schemas(self) -> dict:
"""Load JSON schemas for validating MITAardvark records.
To validate MITAardvark records, the validator relies on two schemas:
* MITAardvark schema;
* OpenGeoMetadata's (OGM) Geoblacklight Aardvark schema.
The MITAardvark schema will comprise of majority of OGM's Aardvark schema,
with several updates for MIT's purposes. The schemas are read from
harvester/records/schemas directory and later added to a referencing.Registry.
Once in the registry, the validator can use the schemas to validate data.
Returns:
dict: JSON schemas for validating MITAardvark records.
"""
schemas = {}
schema_dir = os.path.join(os.path.dirname(__file__), "schemas")
with open(schema_dir + "/mit-schema-aardvark.json") as f:
schemas["mit-schema-aardvark"] = json.loads(f.read())
with open(schema_dir + "/geoblacklight-schema-aardvark.json") as f:
schemas["geoblacklight-schema-aardvark"] = json.loads(f.read())
return schemas

@property
def validator(self) -> Draft202012Validator:
"""Create a validator with JSON schemas for evaluating MITAardvark records.
An instance referencing.Registry is created with the required schema added as
resources. When the validator is created, the registry is included as an argument.
This enables the validator to use the schemas for validation.
Note: For more information on
* registries: https://python-jsonschema.readthedocs.io/en/stable/referencing
* validators: https://python-jsonschema.readthedocs.io/en/stable/validate/#the-validator-protocol
Returns:
Draft202012Validator: JSON schema validator with MITAardvark and OGM Aardvark
schemas.
"""
registry: Registry = Registry().with_resources(
[
(
"mit-schema-aardvark",
Resource.from_contents(self.json_schemas["mit-schema-aardvark"]),
),
(
"geoblacklight-schema-aardvark",
Resource.from_contents(
self.json_schemas["geoblacklight-schema-aardvark"]
),
),
]
)
return Draft202012Validator(
schema=self.json_schemas["mit-schema-aardvark"],
registry=registry,
format_checker=FormatChecker(),
)

def validate(self) -> None:
"""Validate that Aardvark is compliant for MIT purposes.
The validator is retrieved in order to use .iter_errors() to iterate through
each of the validation errors in the normalized record. If there are any errors,
they are compiled into a single error message that appears in a
JSONSchemaValidationError exception.
"""
validation_errors = sorted(self.validator.iter_errors(self.to_dict()), key=str)

if validation_errors:
exc = JSONSchemaValidationError(validation_errors)
logger.debug(exc.message)
raise exc
logger.debug("The normalized MITAardvark record is valid")
MITAardvarkFormatValidator(self.to_dict()).validate()

def to_dict(self) -> dict:
"""Dump MITAardvark record to dictionary."""
Expand Down
98 changes: 97 additions & 1 deletion harvester/records/validators.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
import json
import logging
import os
import re
from ast import literal_eval
from collections.abc import Callable
from functools import partial, update_wrapper

import shapely
from attrs import define
from jsonschema import FormatChecker
from jsonschema.validators import Draft202012Validator
from referencing import Registry, Resource

from harvester.records.exceptions import FieldValueInvalidWarning
from harvester.records.exceptions import (
FieldValueInvalidWarning,
JSONSchemaValidationError,
)

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -62,3 +71,90 @@ def create_geoshape(wkt: str) -> shapely.Geometry:
xmin, xmax, ymax, ymin = literal_eval(geoshape_string.group(1))
return shapely.box(xmin, ymin, xmax, ymax)
return shapely.from_wkt(wkt)


##########################
# Format Validators
##########################
@define
class MITAardvarkFormatValidator:
"""MITAardvark format validator class."""

normalized_record: dict

@property
def jsonschemas(self) -> dict:
"""Load JSON schemas for validating MITAardvark records.
To validate MITAardvark records, the validator relies on two schemas:
* MITAardvark schema;
* OpenGeoMetadata's (OGM) Geoblacklight Aardvark schema.
The MITAardvark schema will comprise of majority of OGM's Aardvark schema,
with several updates for MIT's purposes. The schemas are read from
harvester/records/schemas directory and later added to a referencing.Registry.
Once in the registry, the validator can use the schemas to validate data.
Returns:
dict: JSON schemas for validating MITAardvark records.
"""
schemas = {}
schema_dir = os.path.join(os.path.dirname(__file__), "schemas")
with open(schema_dir + "/mit-schema-aardvark.json") as f:
schemas["mit-schema-aardvark"] = json.loads(f.read())
with open(schema_dir + "/geoblacklight-schema-aardvark.json") as f:
schemas["geoblacklight-schema-aardvark"] = json.loads(f.read())
return schemas

@property
def jsonschema_validator(self) -> Draft202012Validator:
"""Create a validator with JSON schemas for evaluating MITAardvark records.
An instance referencing.Registry is created with the required schema added as
resources. When the validator is created, the registry is included as an argument.
This enables the validator to use the schemas for validation.
Note: For more information on
* registries: https://python-jsonschema.readthedocs.io/en/stable/referencing
* validators: https://python-jsonschema.readthedocs.io/en/stable/validate/#the-validator-protocol
Returns:
Draft202012Validator: JSON schema validator with MITAardvark and OGM Aardvark
schemas.
"""
registry: Registry = Registry().with_resources(
[
(
"mit-schema-aardvark",
Resource.from_contents(self.jsonschemas["mit-schema-aardvark"]),
),
(
"geoblacklight-schema-aardvark",
Resource.from_contents(
self.jsonschemas["geoblacklight-schema-aardvark"]
),
),
]
)
return Draft202012Validator(
schema=self.jsonschemas["mit-schema-aardvark"],
registry=registry,
format_checker=FormatChecker(),
)

def validate(self) -> None:
"""Validate format of MITAardvark record using JSON schema.
The validator is retrieved in order to use .iter_errors() to iterate through
each of the validation errors in the normalized record. If there are any errors,
they are compiled into a single error message that appears in a
JSONSchemaValidationError exception.
"""
if schema_validation_errors := sorted(
self.jsonschema_validator.iter_errors(self.normalized_record),
key=str,
):
validation_error = JSONSchemaValidationError(schema_validation_errors)
logger.error(validation_error)
raise validation_error
logger.debug("The normalized MITAardvark record is valid")

0 comments on commit 4041f31

Please sign in to comment.