Skip to content

Commit

Permalink
Normalize to controlled terms for dct_format_s and gbl_resourceType_s…
Browse files Browse the repository at this point in the history
…m fields

Why these changes are being introduced:

Two fields written to the output MITAardvark records, 'dct_format_s' and 'gbl_resourceType_sm',
were not restricted to the values suggested by the Aardvark schema. This was revealed
when attempting to map facet filters from the geo TIMDEX UI that rely on these fields.

While updates are still required in Transmogrifier and the TIMDEX data model for where these values
end up, normalizing them to controlled terms will benefit the quality of the data for facet aggregations.

How this addresses that need:
* sets of controlled terms have been added to records.controlled_terms
* method SourceRecord.get_controlled_dct_format_s_term() created to normalize values from source metadata
* method SourceRecord.get_controlled_gbl_resourceType_sm_terms() created to normalize values from source metadata
* these two new methods applied to FGDC, ISO19139, GBL1, and Aardvark source classes

Side effects of this change:
* Normalization of data for dct_format_s and gbl_resourceType_sm fields

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/GDT-195
  • Loading branch information
ghukill committed Feb 20, 2024
1 parent ff2592d commit 3a7dded
Show file tree
Hide file tree
Showing 14 changed files with 330 additions and 16 deletions.
6 changes: 4 additions & 2 deletions harvester/records/aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def _dct_creator_sm(self) -> list[str]:
return self.parsed_data.get("dct_creator_sm", [])

def _dct_format_s(self) -> str | None:
return self.parsed_data.get("dct_format_s")
return self.get_controlled_dct_format_s_term(self.parsed_data.get("dct_format_s"))

def _dct_issued_s(self) -> str | None:
return self.parsed_data.get("dct_issued_s")
Expand Down Expand Up @@ -126,7 +126,9 @@ def _gbl_dateRange_drsim(self) -> list[str]:
return value

def _gbl_resourceType_sm(self) -> list[str]:
return self.parsed_data.get("gbl_resourceType_sm", [])
return self.get_controlled_gbl_resourceType_sm_terms(
self.parsed_data.get("gbl_resourceType_sm", [])
)

def _gbl_indexYear_im(self) -> list[int]:
date_values = self.parsed_data.get("gbl_indexYear_im", [])
Expand Down
134 changes: 134 additions & 0 deletions harvester/records/controlled_terms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
"""harvester.records.controlled_terms"""

# Controlled terms for Aardvark field: "dct_format_s"
# https://opengeometadata.org/ogm-aardvark/#format
# note: several OGM format values are multi-word or hyphenated ("Feature Class",
# "Raster Dataset", "SQLite Database", "Tabular Data", "CD-ROM", "DVD-ROM"); each
# must be a single set member, not split into separate words.
DCT_FORMAT_S_OGM_TERMS = {
    "ArcGRID",
    "CD-ROM",
    "DEM",
    "DVD-ROM",
    "Feature Class",
    "Geodatabase",
    "GeoJPEG",
    "GeoJSON",
    "GeoPackage",
    "GeoPDF",
    "GeoTIFF",
    "JPEG",
    "JPEG2000",
    "KML",
    "KMZ",
    "LAS",
    "LAZ",
    "Mixed",
    "MrSID",
    "PDF",
    "PNG",
    "Pulsewaves",
    "Raster Dataset",
    "Shapefile",
    "SQLite Database",
    "Tabular Data",
    "TIFF",
    # Compatibility alias, not an OGM term: the format normalizer in
    # harvester.records.record collapses any value containing "tabular" to the
    # lookup key "tabular", so this entry keeps that lookup resolving.
    "Tabular",
}

# Controlled terms (LOC vocabulary) accepted for Aardvark field: "gbl_resourceType_sm"
# https://opengeometadata.org/ogm-aardvark/#resource-type-values-loc
# note: suggested most applicable to scanned maps
# note: this set is unioned with GBL_RESOURCETYPE_SM_OGM_TERMS further down in this
# module to form GBL_RESOURCETYPE_SM_TERMS
GBL_RESOURCETYPE_SM_LOC_TERMS = {
"Aerial photographs",
"Aerial views",
"Aeronautical charts",
"Armillary spheres",
"Astronautical charts",
"Astronomical models",
"Atlases",
"Bathymetric maps",
"Block diagrams",
"Bottle-charts",
"Cadastral maps",
"Cartographic materials",
"Cartographic materials for people with visual disabilities",
"Celestial charts",
"Celestial globes",
"Census data",
"Children's atlases",
"Children's maps",
"Comparative maps",
"Composite atlases",
"Digital elevation models",
"Digital maps",
"Early maps",
"Ephemerides",
"Ethnographic maps",
"Fire insurance maps",
"Flow maps",
"Gazetteers",
"Geological cross-sections",
"Geological maps",
"Globes",
"Gores (Maps)",
"Gravity anomaly maps",
"Index maps",
"Linguistic atlases",
"Loran charts",
"Manuscript maps",
"Mappae mundi",
"Mental maps",
"Meteorological charts",
"Military maps",
"Mine maps",
"Miniature maps",
"Nautical charts",
"Outline maps",
"Photogrammetric maps",
"Photomaps",
"Physical maps",
"Pictorial maps",
"Plotting charts",
"Portolan charts",
"Quadrangle maps",
"Relief models",
"Remote-sensing maps",
"Road maps",
"Statistical maps",
"Stick charts",
"Strip maps",
"Thematic maps",
"Topographic maps",
"Tourist maps",
"Upside-down maps",
"Wall maps",
"World atlases",
"World maps",
"Worm's-eye views",
"Zoning maps",
}

# Controlled terms (OGM vocabulary) accepted for Aardvark field: "gbl_resourceType_sm"
# https://opengeometadata.org/ogm-aardvark/#resource-type-values-ogm
# note: suggested most applicable to geospatial data
# note: terms must carry no leading/trailing whitespace; they are emitted verbatim
# into output records and matched via their lowercased form
GBL_RESOURCETYPE_SM_OGM_TERMS = {
    "Annotations",
    "Basemaps",
    "LiDAR",
    "Line data",
    "Mesh data",
    "Multi-spectral data",
    "Oblique photographs",
    "Point cloud data",
    "Point data",
    "Polygon data",
    "Raster data",
    "Satellite imagery",
    "Streetview photographs",
    "Table data",
}

# Full controlled vocabulary for Aardvark field: "gbl_resourceType_sm"
# note: a value is acceptable when it appears in either the LOC or the OGM term set,
# so the two sets are merged into one new set here
GBL_RESOURCETYPE_SM_TERMS = (
    GBL_RESOURCETYPE_SM_LOC_TERMS | GBL_RESOURCETYPE_SM_OGM_TERMS
)
15 changes: 11 additions & 4 deletions harvester/records/fgdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,10 +290,15 @@ def _dct_creator_sm(self) -> list[str]:
def _dct_format_s(self) -> str | None:
xpath_expr = """
//metadata
/spdoinfo
/direct
/distinfo
/stdorder
/digform
/digtinfo
/formname
"""
return self.single_string_from_xpath(xpath_expr)
return self.get_controlled_dct_format_s_term(
self.single_string_from_xpath(xpath_expr)
)

def _dct_issued_s(self) -> str | None:
xpath_expr = """
Expand Down Expand Up @@ -378,7 +383,9 @@ def _gbl_resourceType_sm(self) -> list[str]:
/sdtsterm
/sdtstype
"""
return self.string_list_from_xpath(xpath_expr)
return self.get_controlled_gbl_resourceType_sm_terms(
self.string_list_from_xpath(xpath_expr)
)

def _locn_geometry(self) -> str | None:
"""Field method: locn_geometry
Expand Down
6 changes: 4 additions & 2 deletions harvester/records/gbl1.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def _dct_creator_sm(self) -> list[str] | None:
return self.parsed_data.get("dc_creator_sm")

def _dct_format_s(self) -> str | None:
return self.parsed_data.get("dc_format_s")
return self.get_controlled_dct_format_s_term(self.parsed_data.get("dc_format_s"))

def _dct_issued_s(self) -> str | None:
return self.parsed_data.get("dct_issued_s")
Expand Down Expand Up @@ -172,7 +172,9 @@ def _gbl_dateRange_drsim(self) -> list[str]:
return []

def _gbl_resourceType_sm(self) -> list[str]:
return self._convert_scalar_to_array("layer_geom_type_s")
return self.get_controlled_gbl_resourceType_sm_terms(
self._convert_scalar_to_array("layer_geom_type_s")
)

def _gbl_indexYear_im(self) -> list[int]:
if value := self.parsed_data.get("solr_year_i"):
Expand Down
8 changes: 6 additions & 2 deletions harvester/records/iso19139.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,9 @@ def _dct_format_s(self) -> str | None:
/gmd:name
/gco:CharacterString
"""
return self.single_string_from_xpath(xpath_expr)
return self.get_controlled_dct_format_s_term(
self.single_string_from_xpath(xpath_expr)
)

def _dct_issued_s(self) -> str | None:
xpath_expr = """
Expand Down Expand Up @@ -430,7 +432,9 @@ def _gbl_resourceType_sm(self) -> list[str]:
/gmd:keyword
/gco:CharacterString
"""
return self.string_list_from_xpath(xpath_expr)
return self.get_controlled_gbl_resourceType_sm_terms(
self.string_list_from_xpath(xpath_expr)
)

def _gbl_indexYear_im(self) -> list[int]:
"""Field method: gbl_indexYear_im
Expand Down
76 changes: 76 additions & 0 deletions harvester/records/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@

from harvester.aws.sqs import ZipFileEventMessage
from harvester.config import Config
from harvester.records.controlled_terms import (
DCT_FORMAT_S_OGM_TERMS,
GBL_RESOURCETYPE_SM_TERMS,
)
from harvester.records.exceptions import FieldMethodError
from harvester.records.validators import MITAardvarkFormatValidator
from harvester.utils import dedupe_list_of_values
Expand Down Expand Up @@ -228,6 +232,78 @@ def is_deleted(self) -> bool:
return True
return False

def get_controlled_dct_format_s_term(self, value: str | None) -> str | None:
    """Return the controlled dct_format_s term matching a raw source value.

    The value is lowercased and stripped, then collapsed onto a lookup key by a
    series of ordered variant rules (first match wins). The key is finally
    resolved, case-insensitively, against DCT_FORMAT_S_OGM_TERMS.

    Args:
        value: Format string taken from the source metadata, or None.

    Returns:
        The controlled term as spelled in DCT_FORMAT_S_OGM_TERMS, or None when the
        value is empty or does not resolve to a controlled term.
    """
    if not value:
        return None

    normalized = value.lower().strip()

    # Shapefile variants are checked before everything else; "shp" and "avshp"
    # must match exactly, the remaining needles match as substrings.
    shapefile_exact = ("shp", "avshp")
    shapefile_needles = ("shapefile", "shp,", "esri", "geodatabase")

    # Ordered substring rules: (needles, lookup key). More specific needles come
    # first, e.g. "geotiff" and "tiff/jpeg" ahead of plain "tiff".
    substring_rules = (
        (("geotiff",), "geotiff"),
        (("jpeg2000",), "jpeg2000"),
        (("tiff/jpeg", "multiple"), "mixed"),
        (("tiff",), "tiff"),
        (("jpeg", "jpg"), "jpeg"),
        (("tabular",), "tabular"),
    )

    if normalized in shapefile_exact or any(
        needle in normalized for needle in shapefile_needles
    ):
        lookup_key = "shapefile"
    else:
        # fall back to the normalized value itself when no variant rule applies
        lookup_key = next(
            (
                key
                for needles, key in substring_rules
                if any(needle in normalized for needle in needles)
            ),
            normalized,
        )

    terms_by_lowercase = {term.lower(): term for term in DCT_FORMAT_S_OGM_TERMS}
    return terms_by_lowercase.get(lookup_key)

def get_controlled_gbl_resourceType_sm_terms(
    self, values: list[str] | None
) -> list[str]:
    """Get list of controlled terms for gbl_resourceType_sm from original values.

    Each value is stripped and lowercased, collapsed onto a lookup key by ordered
    variant rules (first match wins), and resolved case-insensitively against the
    allowed terms. Values that do not resolve are dropped.

    Args:
        values: Resource type strings taken from the source metadata, or None.

    Returns:
        Controlled terms passed through dedupe_list_of_values(); an empty list
        when no values are given.
    """
    if not values:
        return []

    # Add allowed controlled terms not defined by the Aardvark spec.
    # note: union() builds a NEW set; the shared module-level
    # GBL_RESOURCETYPE_SM_TERMS must not be mutated as a side effect of a call.
    allowed_terms = GBL_RESOURCETYPE_SM_TERMS.union(
        ["Image data", "Vector data", "Mixed"]
    )

    # case-insensitive lookup, built once rather than once per value
    terms_by_lowercase = {term.lower(): term for term in allowed_terms}

    controlled_values = []
    for value in values:
        processed_value = value.strip().lower()

        # allow for some variants and similar matches
        # note: order is important; more specific should be first
        if "polygon" in processed_value:
            processed_value = "polygon data"
        elif "raster" in processed_value:
            processed_value = "raster data"
        elif "point" in processed_value:
            processed_value = "point data"
        elif "line" in processed_value or "string" in processed_value:
            processed_value = "line data"
        elif "image" in processed_value:
            processed_value = "image data"
        elif "vector" in processed_value:
            processed_value = "vector data"
        elif "mixed" in processed_value or "composite" in processed_value:
            processed_value = "mixed"

        if controlled_value := terms_by_lowercase.get(processed_value):
            controlled_values.append(controlled_value)

    return dedupe_list_of_values(controlled_values)

def normalize(self) -> MITAardvark:
"""Method to normalize a SourceRecord to an MIT Aardvark MITAardvark instance.
Expand Down
12 changes: 12 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
Aardvark,
MITAardvark,
Record,
SourceRecord,
XMLSourceRecord,
)
from harvester.records.validators import ValidateGeoshapeWKT
Expand Down Expand Up @@ -398,6 +399,17 @@ def iso19139_source_record_all_fields():
)


@pytest.fixture
def generic_source_record():
    """Provide a bare SourceRecord with placeholder data for tests."""
    record_kwargs = {
        "origin": "mit",
        "identifier": "abc123",
        "metadata_format": "fgdc",
        "data": b"Nothing to see here.",
        "event": "created",
    }
    return SourceRecord(**record_kwargs)


@pytest.fixture
def xpath_returns_nothing():
with patch.object(XMLSourceRecord, "xpath_query") as mocked_xpath:
Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/records/aardvark/aardvark_all_fields.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"dct_description_sm": [
"This layer is a georeferenced raster image of a paper map entitled Cairo (H-36-63, 75). The original scanned image with the legend and other information regarding the source is available in Dome. See the online linkage."
],
"dct_format_s": "Raster",
"dct_format_s": "Shapefile",
"dct_identifier_sm": [
"EG_CAIRO_A25TOPO_1972",
"http://hdl.handle.net/1721.3/172443",
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures/records/fgdc/fgdc_all_fields.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2863,6 +2863,7 @@ http://www.nyc.gov/html/doitt/html/open/local_law_11_2012.shtml</accconst>
<stdorder>
<digform>
<digtinfo>
<formname>Shapefile</formname>
<transize Sync="TRUE">40.844</transize>
<dssize Sync="TRUE">40.844</dssize>
</digtinfo>
Expand Down
2 changes: 1 addition & 1 deletion tests/test_records/test_aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def test_aardvark_dct_creator_sm(aardvark_all_fields):


def test_aardvark_dct_format_s(aardvark_all_fields):
assert aardvark_all_fields._dct_format_s() == "Raster"
assert aardvark_all_fields._dct_format_s() == "Shapefile"


def test_aardvark_dct_issued_s(aardvark_all_fields):
Expand Down
4 changes: 2 additions & 2 deletions tests/test_records/test_fgdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def test_fgdc_optional_dct_creator_sm(fgdc_source_record_all_fields):


def test_fgdc_optional_dct_format_s(fgdc_source_record_all_fields):
assert fgdc_source_record_all_fields._dct_format_s() == "Vector"
assert fgdc_source_record_all_fields._dct_format_s() == "Shapefile"


def test_fgdc_dct_format_s_missing_element_default_restricted(
Expand Down Expand Up @@ -241,7 +241,7 @@ def test_fgdc_optional_gbl_indexYear_im_date_parse_log_continue(


def test_fgdc_optional_gbl_resourceType_sm(fgdc_source_record_all_fields):
assert fgdc_source_record_all_fields._gbl_resourceType_sm() == ["G-polygon"]
assert fgdc_source_record_all_fields._gbl_resourceType_sm() == ["Polygon data"]


def test_fgdc_record_required_locn_geometry(fgdc_source_record_all_fields):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_records/test_gbl1.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def test_gbl1_required_gbl_dateRange_drsim(gbl1_all_fields):


def test_gbl1_required_gbl_resourceType_sm(gbl1_all_fields):
assert gbl1_all_fields._gbl_resourceType_sm() == ["Polygon"]
assert gbl1_all_fields._gbl_resourceType_sm() == ["Polygon data"]


def test_gbl1_required_gbl_indexYear_im(gbl1_all_fields):
Expand Down
Loading

0 comments on commit 3a7dded

Please sign in to comment.