Skip to content

Commit

Permalink
Merge pull request #112 from MITLibraries/s3-marc
Browse files Browse the repository at this point in the history
Switch MARC loading to use MARC21 from S3
  • Loading branch information
Mike Graves committed Dec 10, 2019
2 parents f3241f0 + 55c7e1a commit 80ef3e3
Show file tree
Hide file tree
Showing 8 changed files with 398 additions and 418 deletions.
489 changes: 225 additions & 264 deletions Pipfile.lock

Large diffs are not rendered by default.

30 changes: 16 additions & 14 deletions slingshot/cli.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
from concurrent.futures import as_completed, ThreadPoolExecutor
from datetime import datetime
import io
import os.path
import traceback
from urllib.parse import urlparse

import click
from sqlalchemy.engine.url import URL

from slingshot import state, PUBLIC_WORKSPACE, RESTRICTED_WORKSPACE, DATASTORE
from slingshot.app import (GeoServer, HttpSession, make_slug, publish_layer,
from slingshot import (state, PUBLIC_WORKSPACE, RESTRICTED_WORKSPACE,
DATASTORE, S3_BUFFER_SIZE)
from slingshot.app import (GeoServer, HttpSession, publish_layer,
publishable_layers, Solr)
from slingshot.db import engine
from slingshot.marc import MarcParser
from slingshot.record import Record
from slingshot.s3 import session
from slingshot.marc import filter_record, MarcParser
from slingshot.s3 import session, S3IO


@click.group()
Expand Down Expand Up @@ -209,20 +211,20 @@ def marc(marc_file, solr, solr_user, solr_password):
"Paper Map" or "Cartographic Material", and then index all appropriate
records from the provided MARC XML file.
"""
fparts = urlparse(marc_file)
s3 = session().resource('s3', region_name='us-east-1')
marc = io.BufferedReader(
S3IO(s3.Object(fparts.netloc, fparts.path.lstrip('/'))),
buffer_size=S3_BUFFER_SIZE)
solr_auth = (solr_user, solr_password) if solr_user and solr_password \
else None
s = Solr(solr, HttpSession(), solr_auth)
s.delete('dct_provenance_s:MIT AND dc_format_s:"Paper Map"')
s.delete('dct_provenance_s:MIT AND dc_format_s:"Cartographic Material"')
for record in MarcParser(marc_file):
for record in MarcParser(marc, filter_record):
try:
if record.get('dc_format_s') and \
record.get('_location') in ('Map Room', 'GIS Collection'):
del(record['_location'])
record['layer_slug_s'] = make_slug(record['dc_identifier_s'])
gbl_record = Record(**record)
s.add(gbl_record.as_dict(), soft_commit=False)
s.add(record.as_dict(), soft_commit=False)
except Exception as e:
click.echo('Failed indexing {}: {}'.format(
record['dc_identifier_s'], e))
click.echo(
'Failed indexing {}: {}'.format(record.dc_identifier_s, e))
s.commit()
238 changes: 101 additions & 137 deletions slingshot/marc.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
from collections.abc import Iterator
from decimal import Decimal, getcontext
import re
try:
from lxml.etree import iterparse
except ImportError:
from xml.etree.ElementTree import iterparse

import pymarc
from pymarc import MARCReader

from slingshot.app import make_slug
from slingshot.record import Record


COORD_REGEX = re.compile(
r"""^(?P<hemisphere>[NSEW+-])?
(?P<degrees>\d{3}(\.\d*)?)
(?P<minutes>\d{2}(\.\d*)?)?
(?P<seconds>\d{2}(\.\d*)?)?""", re.IGNORECASE | re.VERBOSE)


DC_FORMAT_S = {
Expand All @@ -15,136 +22,93 @@
}


class XMLParser(object):
"""Base streaming XML parser for iterating over records.
A subclass must implement ``start_handler`` and ``end_handler`` methods
that accept an Element. The subclass must also provide an attribute called
``record_elem`` that specifies the element name for a record. This is a
namespace aware parser, so make sure to include the namespace if one
exists.
:param fstream: file name or file object
:returns: record generator
"""

    def __init__(self, fstream=None):
        # fstream: file name or open file object; it is not consumed here —
        # __iter__ lazily feeds it to iterparse via _context.
        self.fstream = fstream

def __iter__(self):
for event, elem in self._context(self.fstream):
if event == 'start':
self.start_handler(elem)
def filter_record(record):
    """Return True for MARC records describing maps held in the Map Room
    or GIS collection.

    A record qualifies when it has a 655 genre of 'Maps.', an 852 holdings
    field whose location ($c) is MAPRM or GIS, and at least one 852 $k
    format code present in ``DC_FORMAT_S``.

    :param record: a ``pymarc.Record``
    :returns: bool
    """
    # bool() so callers always get True/False; the previous expression could
    # return an empty set (the intersection) when no recognized format code
    # was present, leaking an implementation detail as the result.
    return bool('655' in record and record['655']['a'] == 'Maps.' and
                '852' in record and
                formats(record).intersection(DC_FORMAT_S.keys()) and
                record['852']['c'] in ('MAPRM', 'GIS'))


def formats(record):
    """Collect every 852 $k (format) code present on the record.

    :param record: a ``pymarc.Record``
    :returns: set of subfield 'k' values drawn from all 852 fields
    """
    codes = set()
    for holding in record.get_fields('852'):
        for code in holding.get_subfields('k'):
            codes.add(code)
    return codes


class MarcParser(Iterator):
def __init__(self, stream, f=None):
self.reader = MARCReader(stream)
if f:
self.reader = filter(f, self.reader)

def __next__(self):
while True:
try:
record = next(self.reader)
except StopIteration:
raise
except Exception:
# pymarc doesn't handle broken MARC very well. A single bad
# record will stop the whole train.
continue
if not record:
continue
ident = f'http://library.mit.edu/item/{record["001"].value()}'
subjects = {sf for f in record.get_fields('650')
for sf in f.get_subfields('a')}
spatial_subjects = {sf for f in record.get_fields('650')
for sf in f.get_subfields('z')}
fmts = [DC_FORMAT_S[f] for f in formats(record)
if f in DC_FORMAT_S]
if fmts:
fmt = fmts[0]
else:
fmt = None
if '034' in record and all([record['034'][s] for s in 'defg']):
w = convert_coord(pad_034(record['034']['d']))
e = convert_coord(pad_034(record['034']['e']))
n = convert_coord(pad_034(record['034']['f']))
s = convert_coord(pad_034(record['034']['g']))
geom = f'ENVELOPE({w}, {e}, {n}, {s})'
else:
if elem.tag == self.record_elem:
self.end_handler(elem)
yield self.record
else:
self.end_handler(elem)
elem.clear()

def _context(self, fstream):
return iterparse(fstream, events=('start', 'end'))


class MarcParser(XMLParser):
MARC_NS = 'http://www.loc.gov/MARC21/slim'
record_elem = '{http://www.loc.gov/MARC21/slim}record'

###################
# MARC lists the following possible formats for a coordinate in the 034
# field:
# hdddmmss
# hddd.dddddd
# [+/-]ddd.dddddd (plus sign optional)
# hdddmm.mmmm
# hdddmmss.sss
###################
COORD_REGEX = re.compile(
r"""^(?P<hemisphere>[NSEW+-])?
(?P<degrees>\d{3}(\.\d*)?)
(?P<minutes>\d{2}(\.\d*)?)?
(?P<seconds>\d{2}(\.\d*)?)?""", re.IGNORECASE | re.VERBOSE)

    def start_handler(self, elem):
        """Open a new pymarc Record or Field when its start tag appears.

        Called by ``XMLParser.__iter__`` on each 'start' event; builds the
        parser state (``self._record``, ``self._field``) incrementally as
        the MARCXML elements stream by.
        """
        if elem.tag == self.record_elem:
            # A new <record>: start accumulating fields into a fresh Record.
            self._record = pymarc.Record()
        elif elem.tag == '{%s}datafield' % self.MARC_NS:
            tag = elem.get('tag')
            ind1 = elem.get('ind1')
            ind2 = elem.get('ind2')
            self._field = pymarc.Field(tag, indicators=[ind1, ind2])
        elif elem.tag == '{%s}controlfield' % self.MARC_NS:
            # skip controlfields that are letters
            if elem.get('tag').isdigit():
                self._field = pymarc.Field(elem.get('tag'))

    def end_handler(self, elem):
        """Attach the completed field/subfield to the record on end tags.

        Mirrors ``start_handler``: a closing datafield or controlfield is
        added to ``self._record``; a closing subfield is appended to the
        field currently being built.
        """
        if elem.tag == '{%s}datafield' % self.MARC_NS:
            self._record.add_field(self._field)
            self._field = None
        elif elem.tag == '{%s}controlfield' % self.MARC_NS:
            # Only digit-tagged controlfields were opened in start_handler,
            # so apply the same guard before reading self._field.
            if elem.get('tag').isdigit():
                self._field.data = elem.text
                self._record.add_field(self._field)
                self._field = None
        elif elem.tag == '{%s}subfield' % self.MARC_NS:
            self._field.add_subfield(elem.get('code'), elem.text)

    @property
    def record(self):
        """Map the most recently parsed MARC record to a GeoBlacklight-style
        dict of Solr field names to values.

        Reads ``self._record`` (assembled by the start/end handlers); yielded
        to the caller by ``XMLParser.__iter__`` after each record closes.
        """
        record = {}
        marc_id = 'http://library.mit.edu/item/{}'.format(
            self._record['001'].value())

        record['dc_identifier_s'] = marc_id
        record['dc_rights_s'] = 'Public'
        record['dc_title_s'] = self._record.title()
        record['dc_publisher_s'] = self._record.publisher()
        record['dc_creator_sm'] = [self._record.author()]
        record['dc_type_s'] = 'Physical Object'
        record['dct_references_s'] = {'http://schema.org/url': marc_id}
        record['layer_geom_type_s'] = 'Mixed'
        if '520' in self._record:
            record['dc_description_s'] = self._record['520'].format_field()
        # 650 fields: $a values become subjects, $z values spatial terms.
        for field in self._record.get_fields('650'):
            for subfield in field.get_subfields('a'):
                record.setdefault('dc_subject_sm', set()).add(subfield)
            for subfield in field.get_subfields('z'):
                record.setdefault('dct_spatial_sm', set()).add(subfield)
        record['dct_temporal_sm'] = [self._record.pubyear()]
        if '876' in self._record:
            record['dc_format_s'] = DC_FORMAT_S.get(self._record['876']['k'])
            # NOTE(review): subfield code 'B' is uppercase and pymarc subfield
            # codes are case-sensitive — confirm this matches the source data.
            record['_location'] = self._record['876']['B']
        # 034 coded coordinates: $d/$e/$f/$g feed the ENVELOPE(w, e, n, s)
        # bounding box; only emitted when all four subfields are present.
        if self._record['034'] is not None and \
                all([self._record['034'][s] for s in 'defg']):
            w = self.convert_coord(self.pad_034(self._record['034']['d']))
            e = self.convert_coord(self.pad_034(self._record['034']['e']))
            n = self.convert_coord(self.pad_034(self._record['034']['f']))
            s = self.convert_coord(self.pad_034(self._record['034']['g']))
            record['solr_geom'] = 'ENVELOPE({}, {}, {}, {})'.format(w, e, n, s)
        return record

@classmethod
def convert_coord(cls, coordinate, precision=10):
o_precision = getcontext().prec
getcontext().prec = precision
matches = cls.COORD_REGEX.search(coordinate)
if not matches:
return None
parts = matches.groupdict()
dec = Decimal(parts.get('degrees')) + \
Decimal(parts.get('minutes') or 0) / 60 + \
Decimal(parts.get('seconds') or 0) / 3600
if parts.get('hemisphere') and \
parts.get('hemisphere').lower() in 'ws-':
dec = dec * -1
getcontext().prec = o_precision
return dec

@classmethod
def pad_034(self, coordinate):
h, c = coordinate[0], coordinate[1:]
if h in 'NSEW':
c = "{:>07}".format(c)
return h + c
geom = None
r = Record(
dc_identifier_s=ident,
dc_rights_s='Public',
dc_title_s=record.title(),
dc_publisher_s=record.publisher(),
dc_creator_sm=[record.author()],
dc_type_s='Physical Object',
dct_references_s={'http://schema.org/url': ident},
layer_geom_type_s='Mixed',
dc_subject_sm=subjects,
dct_spatial_sm=spatial_subjects,
dct_temporal_sm=record.pubyear(),
solr_geom=geom,
dc_format_s=fmt,
layer_slug_s=make_slug(ident),
)
return r


def convert_coord(coordinate, precision=10):
    """Convert an 034 coordinate string to signed decimal degrees.

    Parses the hemisphere/degrees/minutes/seconds forms matched by
    ``COORD_REGEX``; 'W', 'S' and '-' prefixes produce negative values.

    :param coordinate: 034 subfield value (pad with ``pad_034`` first)
    :param precision: decimal context precision for the arithmetic
    :returns: a ``Decimal``, or ``None`` when the string doesn't match
    """
    matches = COORD_REGEX.search(coordinate)
    if not matches:
        return None
    parts = matches.groupdict()
    # Adjust the shared decimal context only around the arithmetic and
    # restore it in a finally block. (Previously a failed match returned
    # early *after* changing the precision, leaking the modified setting
    # to every subsequent Decimal user in the process.)
    o_precision = getcontext().prec
    getcontext().prec = precision
    try:
        dec = Decimal(parts.get('degrees')) + \
            Decimal(parts.get('minutes') or 0) / 60 + \
            Decimal(parts.get('seconds') or 0) / 3600
    finally:
        getcontext().prec = o_precision
    if parts.get('hemisphere') and \
            parts.get('hemisphere').lower() in 'ws-':
        dec = dec * -1
    return dec


def pad_034(coordinate):
    """Left-pad the numeric part of a hemisphere-prefixed 034 coordinate
    with zeros to seven characters.

    Values without an N/S/E/W prefix (e.g. signed decimal forms) are
    returned unchanged.
    """
    hemisphere = coordinate[0]
    digits = coordinate[1:]
    if hemisphere not in 'NSEW':
        return hemisphere + digits
    return hemisphere + digits.rjust(7, '0')
8 changes: 8 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def s3():
conn = boto3.resource("s3")
conn.create_bucket(Bucket="upload")
conn.create_bucket(Bucket="store")
conn.create_bucket(Bucket='marc')
yield conn


Expand Down Expand Up @@ -45,6 +46,13 @@ def db():
conn.execute("DROP TABLE {}.bermuda".format(schema))


@pytest.fixture
def marc_records(s3):
    """Upload the MARC fixture to the mock 'marc' bucket; return its S3 URL."""
    s3.Bucket('marc').upload_file(
        _data_file('fixtures/map_01.mrc'), 'records.mrc')
    return 's3://marc/records.mrc'


@pytest.fixture
def shapefile():
    # Path (via the _data_file helper) to the zipped Bermuda shapefile
    # fixture used by the upload/publish tests.
    return _data_file('fixtures/bermuda.zip')
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures/map_01.mrc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
02680cem a2200553 i 450000100100000000500170001000700090002700800410003601000170007703400550009403400460014903500210019504000700021604200080028604300120029405000210030605200090032707400140033608600240035011000490037424500720042324600400049525501340053525500700066926000650073930000920080433600360089633700250093233800220095750000460097950000360102550003840106150000590144565000340150465000370153865000350157565100430161065100630165365100230171665500170173965500250175685200450178191000960182691000150192204900090193791000960194694900720204299400120211400210728620130214072514.0aj canzn121025s2012 dcubg az a f 0 eng c a 20125943111 aab2000000dE0600000eE0743000fN0383000gN02900000 aadE0600000eE0743000fN0383000gN0290000 a(OCoLC)814380561 aUPMbengerdacUPMerdadUBYdOCLCOdUPMdXFFdGPOdDLCdOVVdMYG apcc aa-af--- 4aG7631.A1 2012.U5 a7631 a0856-A-010 aPREX 3.10/4:AF 3/241 aUnited States.bCentral Intelligence Agency.10aAfghanistan country profile :bAfghanistan provinces and districts.30aAfghanistan provinces and districts aScale 1:2,000,000 ;bAzimuthal equal area, center point 34°00ʹN and 67°33ʹEc(E 60°00ʹ--E 74°30ʹ/N 38°30ʹ--N 29°00ʹ). aScale not givenc(E 60°00ʹ--E 74°30ʹ/N 38°30ʹ--N 29°00ʹ). a[Washington, D.C.] :b[Central Intelligence Agency],c[2012] a2 maps on 1 sheet :bboth sides, 1 color ;c52 x 66 cm and 63 x 81 cm, sheet 65 x 99 cm acartographic images2rdacontent aunmediated2rdamedia asheet2rdacarrier aRelief shown by shading and spot heights. aShipping list no.: 2013-0001-S. aCountry profile map includes tables of "Geographic information" in top border, world location map, comparative area map with the eastern half of the United States, 4 ancillary maps on "Ethnolinguistic Groups," "Population density 2009," "Rainfall 2010" and "Poppy Cultivation 2010," source of information note, and historical timeline along the left border with explanatory text. a"798377AI (G03373) 8-12" and "900163AI (G03374) 8-12." 0aGeographyzAfghanistanvMaps. 0aArea studieszAfghanistanvMaps. 
0aDemographyzAfghanistanvMaps. 0aAfghanistanxStudy and teachingvMaps. 0aAfghanistanxAdministrative and political divisionsvMaps. 0aAfghanistanvMaps. 7aMaps.2lcgft 7aOutline maps.2lcgft0 bRTCcMAPRMkMAPhG7631.A1 2012.U5pAvail amjy130213bmodc~ewalljekmlblankmblankordappccqr-mapras1w~x0y~z~imjyd130213 aMARCIVEAUT aMYGG amjy130213bmodc~ewalljekmlblankmblankordappccqr-mapras1w~x0y~z~imjyd130213 04IPar-mapbRTCcMAPRMkMAPo0p39080036215264x04hG7631.A1 2012.U5 a02bMYG
4 changes: 2 additions & 2 deletions tests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def test_publish_layer_makes_fgdc_public(s3, shapefile, db):
"store", "mock://example.com/ogc",
"mock://example.com/download")
obj = s3.Bucket("store").Object("bermuda/bermuda.xml")
grants = [g for g in obj.Acl().grants if g['Grantee'].get("URI") == \
grants = [g for g in obj.Acl().grants if g['Grantee'].get("URI") ==
'http://acs.amazonaws.com/groups/global/AllUsers']
assert grants.pop()['Permission'] == 'READ'

Expand All @@ -121,7 +121,7 @@ def test_publish_layer_uses_ogc_proxy_url(s3, shapefile, db):
"mock://example.com/download")
obj = s3.Bucket("store").Object("bermuda/geoblacklight.json")
assert "mock://example.com/ogc/wms" in obj.get()['Body'].read()\
.decode('utf8')
.decode('utf8')


def test_publishable_layers_includes_new_layer(s3, dynamo_table):
Expand Down
13 changes: 12 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from slingshot import state
from slingshot.cli import main
from slingshot.db import engine, metadata
from slingshot.db import metadata


@pytest.fixture
Expand Down Expand Up @@ -91,3 +91,14 @@ def test_initializes_geoserver(runner):
'mock://example.com/geoserver'])
assert res.exit_code == 0
assert m.call_count == 6


def test_publishes_marc_records(runner, marc_records):
    """The marc command indexes records via Solr's JSON docs endpoint."""
    with requests_mock.Mocker() as m:
        m.post('mock://example.com/solr/update')
        m.post('mock://example.com/solr/update/json/docs')
        result = runner.invoke(
            main, ['marc', marc_records, '--solr', 'mock://example.com/solr'])
        requested = [call.url for call in m.request_history]
        assert 'mock://example.com/solr/update/json/docs' in requested
        assert result.exit_code == 0

0 comments on commit 80ef3e3

Please sign in to comment.