Skip to content

Commit

Permalink
Merge pull request #171 from MITLibraries/GDT-195-normalize-format-re…
Browse files Browse the repository at this point in the history
…source-type

Normalize to controlled terms for `dct_format_s` and `gbl_resourceType_sm`
  • Loading branch information
ghukill authored Feb 21, 2024
2 parents ff2592d + 59b0b14 commit 0168a10
Show file tree
Hide file tree
Showing 14 changed files with 395 additions and 17 deletions.
6 changes: 4 additions & 2 deletions harvester/records/aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def _dct_creator_sm(self) -> list[str]:
return self.parsed_data.get("dct_creator_sm", [])

def _dct_format_s(self) -> str | None:
return self.parsed_data.get("dct_format_s")
return self.get_controlled_dct_format_s_term(self.parsed_data.get("dct_format_s"))

def _dct_issued_s(self) -> str | None:
return self.parsed_data.get("dct_issued_s")
Expand Down Expand Up @@ -126,7 +126,9 @@ def _gbl_dateRange_drsim(self) -> list[str]:
return value

def _gbl_resourceType_sm(self) -> list[str]:
return self.parsed_data.get("gbl_resourceType_sm", [])
return self.get_controlled_gbl_resourceType_sm_terms(
self.parsed_data.get("gbl_resourceType_sm", [])
)

def _gbl_indexYear_im(self) -> list[int]:
date_values = self.parsed_data.get("gbl_indexYear_im", [])
Expand Down
134 changes: 134 additions & 0 deletions harvester/records/controlled_terms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
"""harvester.records.controlled_terms"""

# Controlled terms for Aardvark field: "dct_format_s"
# https://opengeometadata.org/ogm-aardvark/#format
# note: multi-word terms are single entries ("Feature Class", "Raster Dataset",
# "SQLite Database", "Tabular Data") and the disc formats are hyphenated without
# surrounding spaces ("CD-ROM", "DVD-ROM"), matching the published OGM list.
DCT_FORMAT_S_OGM_TERMS = {
    "ArcGRID",
    "CD-ROM",
    "DEM",
    "DVD-ROM",
    "Feature Class",
    "Geodatabase",
    "GeoJPEG",
    "GeoJSON",
    "GeoPackage",
    "GeoPDF",
    "GeoTIFF",
    "JPEG",
    "JPEG2000",
    "KML",
    "KMZ",
    "LAS",
    "LAZ",
    "Mixed",
    "MrSID",
    "PDF",
    "PNG",
    "Pulsewaves",
    "Raster Dataset",
    "Shapefile",
    "SQLite Database",
    "Tabular Data",
    # NOTE(review): "Tabular" is NOT an OGM term; it is retained as an alias because
    # format normalization collapses tabular variants to the bare key "tabular"
    # before looking it up in this set. Drop it once that lookup targets
    # "Tabular Data".
    "Tabular",
    "TIFF",
}

# Library of Congress (LOC) resource type values accepted for the Aardvark field
# "gbl_resourceType_sm".
# https://opengeometadata.org/ogm-aardvark/#resource-type-values-loc
# note: suggested most applicable to scanned maps
# note: terms are stored with their canonical casing and punctuation (e.g.
# "Gores (Maps)", "Worm's-eye views"); this set is combined with the OGM resource
# type terms into GBL_RESOURCETYPE_SM_TERMS further down in this module.
GBL_RESOURCETYPE_SM_LOC_TERMS = {
"Aerial photographs",
"Aerial views",
"Aeronautical charts",
"Armillary spheres",
"Astronautical charts",
"Astronomical models",
"Atlases",
"Bathymetric maps",
"Block diagrams",
"Bottle-charts",
"Cadastral maps",
"Cartographic materials",
"Cartographic materials for people with visual disabilities",
"Celestial charts",
"Celestial globes",
"Census data",
"Children's atlases",
"Children's maps",
"Comparative maps",
"Composite atlases",
"Digital elevation models",
"Digital maps",
"Early maps",
"Ephemerides",
"Ethnographic maps",
"Fire insurance maps",
"Flow maps",
"Gazetteers",
"Geological cross-sections",
"Geological maps",
"Globes",
"Gores (Maps)",
"Gravity anomaly maps",
"Index maps",
"Linguistic atlases",
"Loran charts",
"Manuscript maps",
"Mappae mundi",
"Mental maps",
"Meteorological charts",
"Military maps",
"Mine maps",
"Miniature maps",
"Nautical charts",
"Outline maps",
"Photogrammetric maps",
"Photomaps",
"Physical maps",
"Pictorial maps",
"Plotting charts",
"Portolan charts",
"Quadrangle maps",
"Relief models",
"Remote-sensing maps",
"Road maps",
"Statistical maps",
"Stick charts",
"Strip maps",
"Thematic maps",
"Topographic maps",
"Tourist maps",
"Upside-down maps",
"Wall maps",
"World atlases",
"World maps",
"Worm's-eye views",
"Zoning maps",
}

# OpenGeoMetadata (OGM) resource type values accepted for the Aardvark field
# "gbl_resourceType_sm".
# https://opengeometadata.org/ogm-aardvark/#resource-type-values-ogm
# note: suggested most applicable to geospatial data
# note: terms must not carry leading/trailing whitespace; they are emitted verbatim
# as the normalized value.
GBL_RESOURCETYPE_SM_OGM_TERMS = {
    "Annotations",
    "Basemaps",
    "LiDAR",
    "Line data",
    "Mesh data",
    "Multi-spectral data",
    "Oblique photographs",
    "Point cloud data",
    "Point data",
    "Polygon data",
    "Raster data",
    "Satellite imagery",
    "Streetview photographs",
    "Table data",
}

# Controlled terms for Aardvark field: "gbl_resourceType_sm"
# note: a value is considered controlled when it appears in either the LOC list or
# the OGM list, so the accepted vocabulary is the union of both sets
GBL_RESOURCETYPE_SM_TERMS = (
    GBL_RESOURCETYPE_SM_LOC_TERMS | GBL_RESOURCETYPE_SM_OGM_TERMS
)
22 changes: 18 additions & 4 deletions harvester/records/fgdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,12 +288,24 @@ def _dct_creator_sm(self) -> list[str]:
return self.string_list_from_xpath(xpath_expr)

def _dct_format_s(self) -> str | None:
"""Field method: dct_format_s.
This method prefers an explicit value from //digtinfo/formname, but when that is
not present, or does not map to a controlled term, then the shared method
get_controlled_dct_format_s_term() checks values from field method
gbl_resourceType_sm() for help on determining file type.
"""
xpath_expr = """
//metadata
/spdoinfo
/direct
/distinfo
/stdorder
/digform
/digtinfo
/formname
"""
return self.single_string_from_xpath(xpath_expr)
return self.get_controlled_dct_format_s_term(
self.single_string_from_xpath(xpath_expr)
)

def _dct_issued_s(self) -> str | None:
xpath_expr = """
Expand Down Expand Up @@ -378,7 +390,9 @@ def _gbl_resourceType_sm(self) -> list[str]:
/sdtsterm
/sdtstype
"""
return self.string_list_from_xpath(xpath_expr)
return self.get_controlled_gbl_resourceType_sm_terms(
self.string_list_from_xpath(xpath_expr)
)

def _locn_geometry(self) -> str | None:
"""Field method: locn_geometry
Expand Down
6 changes: 4 additions & 2 deletions harvester/records/gbl1.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def _dct_creator_sm(self) -> list[str] | None:
return self.parsed_data.get("dc_creator_sm")

def _dct_format_s(self) -> str | None:
return self.parsed_data.get("dc_format_s")
return self.get_controlled_dct_format_s_term(self.parsed_data.get("dc_format_s"))

def _dct_issued_s(self) -> str | None:
return self.parsed_data.get("dct_issued_s")
Expand Down Expand Up @@ -172,7 +172,9 @@ def _gbl_dateRange_drsim(self) -> list[str]:
return []

def _gbl_resourceType_sm(self) -> list[str]:
return self._convert_scalar_to_array("layer_geom_type_s")
return self.get_controlled_gbl_resourceType_sm_terms(
self._convert_scalar_to_array("layer_geom_type_s")
)

def _gbl_indexYear_im(self) -> list[int]:
if value := self.parsed_data.get("solr_year_i"):
Expand Down
8 changes: 6 additions & 2 deletions harvester/records/iso19139.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,9 @@ def _dct_format_s(self) -> str | None:
/gmd:name
/gco:CharacterString
"""
return self.single_string_from_xpath(xpath_expr)
return self.get_controlled_dct_format_s_term(
self.single_string_from_xpath(xpath_expr)
)

def _dct_issued_s(self) -> str | None:
xpath_expr = """
Expand Down Expand Up @@ -430,7 +432,9 @@ def _gbl_resourceType_sm(self) -> list[str]:
/gmd:keyword
/gco:CharacterString
"""
return self.string_list_from_xpath(xpath_expr)
return self.get_controlled_gbl_resourceType_sm_terms(
self.string_list_from_xpath(xpath_expr)
)

def _gbl_indexYear_im(self) -> list[int]:
"""Field method: gbl_indexYear_im
Expand Down
102 changes: 102 additions & 0 deletions harvester/records/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@

from harvester.aws.sqs import ZipFileEventMessage
from harvester.config import Config
from harvester.records.controlled_terms import (
DCT_FORMAT_S_OGM_TERMS,
GBL_RESOURCETYPE_SM_TERMS,
)
from harvester.records.exceptions import FieldMethodError
from harvester.records.validators import MITAardvarkFormatValidator
from harvester.utils import dedupe_list_of_values
Expand Down Expand Up @@ -228,6 +232,104 @@ def is_deleted(self) -> bool:
return True
return False

def get_controlled_dct_format_s_term(self, value: str | None) -> str | None:
    """Get a single controlled term for dct_format_s from original value.

    The value is lowercased and stripped, then resolved in this order:
      1. Shapefile-family variants ("shapefile", "shp", "avshp", "shp,", "esri",
         "geodatabase") are collapsed to "Shapefile".
      2. A value that already equals a controlled term (case-insensitively) is kept
         as-is, so a more general substring rule cannot rewrite it (e.g. "GeoJPEG"
         must not be collapsed to "JPEG").
      3. Remaining values are matched against known substrings; order matters, more
         specific rules come first.

    If a value is not provided, or does not match a controlled term, this method
    falls back on the controlled values from the gbl_resourceType_sm field, which
    may indicate the file type (e.g. Vector or Polygon data indicates it is likely
    a Shapefile).

    Args:
        value: Raw format string from the source record, or None.

    Returns:
        The controlled term in its canonical casing, or None when neither the value
        nor the record's resource types indicate a format.
    """
    # case-insensitive lookup: lowercased controlled term -> canonical casing
    terms_by_key = {term.lower(): term for term in DCT_FORMAT_S_OGM_TERMS}
    controlled_value = None

    if value:
        value = value.lower().strip()

        # allow for some variants and similar matches
        # note: order is important; more specific should be first
        if (
            "shapefile" in value
            or value == "shp"
            or value == "avshp"
            or "shp," in value
            or "esri" in value
            or "geodatabase" in value
        ):
            value = "shapefile"
        elif value in terms_by_key:
            # already an exact controlled term; skip the substring rules below so
            # e.g. "geojpeg" is not downgraded to "jpeg"
            pass
        elif "geotiff" in value:
            value = "geotiff"
        elif "jpeg2000" in value:
            value = "jpeg2000"
        elif "tiff/jpeg" in value or "multiple" in value:
            value = "mixed"
        elif "tiff" in value:
            value = "tiff"
        elif "jpeg" in value or "jpg" in value:
            value = "jpeg"
        elif "tabular" in value:
            value = "tabular"

        # prefer the full term "Tabular Data" when the controlled set provides it;
        # otherwise the bare "tabular" key is looked up unchanged
        if value == "tabular" and "tabular data" in terms_by_key:
            value = "tabular data"

        controlled_value = terms_by_key.get(value)

    # if still no controlled format value determined, fallback on looking at
    # controlled resource types that may indicate file format type
    if not controlled_value:
        resource_type_to_format_map = {
            "Polygon data": "Shapefile",
            "Point data": "Shapefile",
            "Line data": "Shapefile",
            "Vector data": "Shapefile",
        }
        for (
            resource_type
        ) in self._gbl_resourceType_sm():  # type: ignore[attr-defined]
            if mapped_value := resource_type_to_format_map.get(resource_type):
                controlled_value = mapped_value
                # every mapped resource type yields the same format, so the first
                # hit is sufficient
                break

    return controlled_value

def get_controlled_gbl_resourceType_sm_terms(
    self, values: list[str] | None
) -> list[str]:
    """Get list of controlled terms for gbl_resourceType_sm from original values.

    Each value is stripped and lowercased. A value that already equals a controlled
    term (case-insensitively) is kept; only values that are not exact terms go
    through the substring rules, so valid terms such as "Outline maps",
    "Satellite imagery", "Point cloud data" or "Composite atlases" are not
    rewritten by the "line", "image", "point" or "composite" rules. Values that
    resolve to no controlled term are dropped.

    Args:
        values: Raw resource type strings from the source record, or None.

    Returns:
        Controlled terms in canonical casing, passed through
        dedupe_list_of_values(); an empty list when no values are provided.
    """
    if not values:
        return []

    # add allowed controlled terms not defined by Aardvark spec
    # note: build a new set; updating GBL_RESOURCETYPE_SM_TERMS in place would
    # permanently alter the shared module-level constant
    controlled_terms = set(GBL_RESOURCETYPE_SM_TERMS).union(
        ["Image data", "Vector data", "Mixed"]
    )

    # case-insensitive lookup built once for all values; terms are stripped so stray
    # whitespace in a term definition can neither block a match nor leak into output
    terms_by_key = {term.strip().lower(): term.strip() for term in controlled_terms}

    controlled_values = []
    for value in values:
        processed_value = value.strip().lower()

        if processed_value not in terms_by_key:
            # allow for some variants and similar matches
            # note: order is important; more specific should be first
            if "polygon" in processed_value:
                processed_value = "polygon data"
            elif "raster" in processed_value:
                processed_value = "raster data"
            elif "point" in processed_value:
                processed_value = "point data"
            elif "line" in processed_value or "string" in processed_value:
                processed_value = "line data"
            elif "image" in processed_value:
                processed_value = "image data"
            elif "vector" in processed_value:
                processed_value = "vector data"
            elif "mixed" in processed_value or "composite" in processed_value:
                processed_value = "mixed"

        if controlled_value := terms_by_key.get(processed_value):
            controlled_values.append(controlled_value)

    return dedupe_list_of_values(controlled_values)

def normalize(self) -> MITAardvark:
"""Method to normalize a SourceRecord to an MIT Aardvark MITAardvark instance.
Expand Down
17 changes: 17 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
Aardvark,
MITAardvark,
Record,
SourceRecord,
XMLSourceRecord,
)
from harvester.records.validators import ValidateGeoshapeWKT
Expand Down Expand Up @@ -398,6 +399,22 @@ def iso19139_source_record_all_fields():
)


@pytest.fixture
def generic_source_record():
    """Return a bare-bones SourceRecord subclass instance with placeholder data.

    The subclass only supplies _gbl_resourceType_sm(), returning an empty list, so
    code that consults the record's resource types has something to call.
    """

    class GenericSourceRecord(SourceRecord):

        def _gbl_resourceType_sm(self):
            # no resource types for this record
            return []

    init_kwargs = {
        "origin": "mit",
        "identifier": "abc123",
        "metadata_format": "fgdc",
        "data": b"Nothing to see here.",
        "event": "created",
    }
    return GenericSourceRecord(**init_kwargs)


@pytest.fixture
def xpath_returns_nothing():
with patch.object(XMLSourceRecord, "xpath_query") as mocked_xpath:
Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/records/aardvark/aardvark_all_fields.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"dct_description_sm": [
"This layer is a georeferenced raster image of a paper map entitled Cairo (H-36-63, 75). The original scanned image with the legend and other information regarding the source is available in Dome. See the online linkage."
],
"dct_format_s": "Raster",
"dct_format_s": "Shapefile",
"dct_identifier_sm": [
"EG_CAIRO_A25TOPO_1972",
"http://hdl.handle.net/1721.3/172443",
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures/records/fgdc/fgdc_all_fields.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2863,6 +2863,7 @@ http://www.nyc.gov/html/doitt/html/open/local_law_11_2012.shtml</accconst>
<stdorder>
<digform>
<digtinfo>
<formname>Shapefile</formname>
<transize Sync="TRUE">40.844</transize>
<dssize Sync="TRUE">40.844</dssize>
</digtinfo>
Expand Down
Loading

0 comments on commit 0168a10

Please sign in to comment.