## Preliminaries

*We'll set up a working directory and download a small sample
of images to ingest.*

In [1]:
import os

# We create a working directory for this example.
os.makedirs("00_files", exist_ok=True)

# We create a "data" subdirectory for the images and the metadata tag files.
os.makedirs("00_files", exist_ok=True)
data = "00_files/data"

# We create an "out" subdirectory for the processed images and the metadata catalog.
os.makedirs("00_files/out", exist_ok=True)
out  = "00_files/out"  

Let's download a sample collection of 20 images from the logbooks of the USCG
Storis.

In [2]:
import requests
import logging
import http.client

# To set up logging.
# https://stackoverflow.com/questions/16337511/
http.client.HTTPConnection.debuglevel = 1
logging.basicConfig()
logging.getLogger().setLevel(logging.DEBUG)
requests_log = logging.getLogger("requests.packages.urllib3")
requests_log.setLevel(logging.DEBUG)
requests_log.propagate = True

# To access the NARA API for images of the USCG Storis' 1957 logbook.
nara_id = "38547962"
api_base = 'https://catalog.archives.gov/api/v1/'
api_url = '{0}?naIds={1}'.format(api_base, nara_id)
res = requests.get(api_url)

# To parse the NARA API output for metadata.
entry_img_array = res.json().get('opaResponse').get('results').get('result')[0].get('objects').get('object')
digital_directory = entry_img_array[0].get('file').get('@path').split("/")[-2]

# To write the NARA API output to file for reference.
api_output = "{0}/nara_id_{1}.json".format(data, digital_directory, nara_id)
if res.status_code == 200:
    with open(api_output, 'wb') as f:
        f.write(res.content)

# To download images of 40 pages of the Storis' logbooks.
for img_info in entry_img_array: 

    # We test for mimetype "image/jpeg"---we don't want to download any files
    # with mimetype "application/pdf".
    if img_info.get('file').get('@mime') == "image/jpeg":

        img_name = img_info.get('file').get('@name')
        img_url = img_info.get('file').get('@url')
        img_res = requests.get(img_url)

        # To write a single image to file.
        local_img_name = "{0}/{1}".format(data, img_name)
        if img_res.status_code == 200:
            with open(local_img_name, 'wb') as img_f:
                img_f.write(img_res.content)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443
DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /api/v1/?naIds=38547962 HTTP/1.1" 200 None


send: b'GET /api/v1/?naIds=38547962 HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa01.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa01
header: Cache-Control: no-store, no-cache
header: Content-Type: application/json;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:52:56 GMT
header: hnweb: pw03
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: trans

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0126.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0126.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa01.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa01
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:52:58 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0127.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0127.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa03.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa03
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:52:59 GMT
header: hnweb: pw02
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0128.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0128.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:53:01 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0129.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0129.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa01.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa01
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:53:02 GMT
header: hnweb: pw02
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0130.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0130.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:53:04 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0131.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0131.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa03.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa03
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:53:06 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0132.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0132.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa01.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa01
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:53:08 GMT
header: hnweb: pw02
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0133.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0133.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:53:10 GMT
header: hnweb: pw02
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0134.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0134.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:53:12 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0135.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0135.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa03.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa03
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:53:13 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0136.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0136.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:53:15 GMT
header: hnweb: pw03
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443
DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0137.JPG HTTP/1.1" 200 None


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0137.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:53:17 GMT
header: hnweb: pw03
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0138.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0138.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:53:19 GMT
header: hnweb: pw02
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0139.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0139.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:53:20 GMT
header: hnweb: pw03
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0140.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0140.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:53:22 GMT
header: hnweb: pw02
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0141.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0141.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:53:24 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0142.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0142.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:53:26 GMT
header: hnweb: pw01
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0143.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0143.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:53:27 GMT
header: hnweb: pw02
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0144.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0144.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa01.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa01
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:53:35 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0145.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.21.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0145.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Tue, 14 Jan 2020 00:53:38 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


Let's write a metadata tagfile in the "data" subdirectory with the minimal
required metadata for the sample of images.

In [3]:
import csv

with open(os.path.join(data, 'metadata.csv'), mode='w') as metadata_file:
    metadata_writer = csv.writer(metadata_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    
    metadata_writer.writerow(['archive.host_country', 'USA'])
    metadata_writer.writerow(['document.contact_person', 'Kevin Wood'])
    metadata_writer.writerow(['archive.notes', 'Images available via API at https://catalog.archives.gov/api/v1/38547962'])
    metadata_writer.writerow(['platform.name', 'USCG Storis'])
    metadata_writer.writerow(['document.id_within_archive', '38547962'])
    metadata_writer.writerow(['document.id_within_archive_type', 'NARA ID'])
    metadata_writer.writerow(['document.record_type', "ships' logs"])
    metadata_writer.writerow(['document.accession_to_archive_date', '2016-08-19'])
    metadata_writer.writerow(['document.standardized_region_list', 'north_atlantic'])
    metadata_writer.writerow(['document.start_date', '1957-06-09'])
    metadata_writer.writerow(['document.start_date', '1957-09-30'])
    metadata_writer.writerow(['document.rights_statement', 'CC0 Public Domain'])
    metadata_writer.writerow(['document.notes', ''])

During ingest we'll associate the above metadata to the 20 samples images. In
practice, any `.csv` file in the `data` subdirectory will be parsed as a
metadata tagefile. For example, the tagfile `metadata.csv` provides metadata for
images in the same directory `uscg-storis/data` as itself and in all
subdirectory below itself. 

To enable users to provide "hierarchical" metadata,
the information in a tagfile from a subdirectory has precendence over any
tagfiles from parent directories. (The idea is to provide the *most specific
metadata* for images in the same directory as the images themselves, while
parent directories might provide *general metadata* for a whole collection of
images.)

Here's what the tagfile we created looks like.

In [11]:
import pandas as pd
df = pd.read_csv(os.path.join(data, "metadata.csv"), header=None, names=["field", "value"])
df

Unnamed: 0,field,value
0,archive.host_country,USA
1,document.contact_person,Kevin Wood
2,archive.notes,Images available via API at https://catalog.ar...
3,platform.name,USCG Storis
4,document.id_within_archive,38547962
5,document.id_within_archive_type,NARA ID
6,document.record_type,ships' logs
7,document.accession_to_archive_date,2016-08-19
8,document.standardized_region_list,north_atlantic
9,document.start_date,1957-06-09


## Ingesting images

First, we'll interactively load "helper" functions as defined in the `rdai`
module.

In [12]:
%run -i scripts/utils.py

Now, we'll define the global variable `fixed_seq` in order to call `mint_uuid`
for each image file.

In [13]:
# We generate a fixed sequence for uuids.
get_fixed_seq()

If we're on casper, then we'll need to load python-magic from `rdadata`. Else,
we assume the python-magic package has already been installed, e.g., with `pip3
install python-magic --user`.

In [8]:
import sys
sys.path.append('/glade/u/home/rdadata/lib/python/site-packages')

In [9]:
# get_exiftool()
import subprocess
repo_dir = subprocess.Popen(['git', 'rev-parse', '--show-toplevel'], stdout=subprocess.PIPE).communicate()[0].rstrip().decode('utf-8')
import sys
sys.path.append(os.path.join(repo_dir, "dependencies/pyexiftool"))
import exiftool

In [17]:
normalized_catalog = get_normalized_catalog(data)
# We generate a metadata catalog (unnormalized) from the data directory.

RDAI: Tag EXIF:ImageUniqueID=4ed9a76e366811ea8e2808119645fa1c already exists in file 00_files/data/storis-wmec-38-1957-logbooks_0126.JPG.
RDAI: Tag EXIF:ImageUniqueID=4f111cf4366811ea8e2808119645fa1c already exists in file 00_files/data/storis-wmec-38-1957-logbooks_0127.JPG.
RDAI: Tag EXIF:ImageUniqueID=4f47a8e6366811ea8e2808119645fa1c already exists in file 00_files/data/storis-wmec-38-1957-logbooks_0128.JPG.
RDAI: Tag EXIF:ImageUniqueID=4f8020c2366811ea8e2808119645fa1c already exists in file 00_files/data/storis-wmec-38-1957-logbooks_0129.JPG.
RDAI: Tag EXIF:ImageUniqueID=4fb66cd2366811ea8e2808119645fa1c already exists in file 00_files/data/storis-wmec-38-1957-logbooks_0130.JPG.
RDAI: Tag EXIF:ImageUniqueID=4fefcfb6366811ea8e2808119645fa1c already exists in file 00_files/data/storis-wmec-38-1957-logbooks_0131.JPG.
RDAI: Tag EXIF:ImageUniqueID=5026e75e366811ea8e2808119645fa1c already exists in file 00_files/data/storis-wmec-38-1957-logbooks_0132.JPG.
RDAI: Tag EXIF:ImageUniqueID=505fe

In [16]:
!tree

[36m.[00m
├── [36m00_files[00m
│   ├── [36mdata[00m
│   │   ├── [32mmetadata.csv[00m
│   │   ├── nara_id_storis-wmec-38-1957-logbooks.json
│   │   ├── [33mstoris-wmec-38-1957-logbooks_0126.JPG[00m
│   │   ├── [33mstoris-wmec-38-1957-logbooks_0127.JPG[00m
│   │   ├── [33mstoris-wmec-38-1957-logbooks_0128.JPG[00m
│   │   ├── [33mstoris-wmec-38-1957-logbooks_0129.JPG[00m
│   │   ├── [33mstoris-wmec-38-1957-logbooks_0130.JPG[00m
│   │   ├── [33mstoris-wmec-38-1957-logbooks_0131.JPG[00m
│   │   ├── [33mstoris-wmec-38-1957-logbooks_0132.JPG[00m
│   │   ├── [33mstoris-wmec-38-1957-logbooks_0133.JPG[00m
│   │   ├── [33mstoris-wmec-38-1957-logbooks_0134.JPG[00m
│   │   ├── [33mstoris-wmec-38-1957-logbooks_0135.JPG[00m
│   │   ├── [33mstoris-wmec-38-1957-logbooks_0136.JPG[00m
│   │   ├── [33mstoris-wmec-38-1957-logbooks_0137.JPG[00m
│   │   ├── [33mstoris-wmec-38-1957-logbooks_0138.JPG[00m
│   │   ├── [33mstoris-wmec-38-1957-logbooks_0139.JPG

In [18]:
normalized_catalog

{'contents': [{'file_path': '00_files/data/metadata.csv',
   'media_type': 'text/plain'},
  {'file_path': '00_files/data/nara_id_storis-wmec-38-1957-logbooks.json',
   'media_type': 'text/plain'},
  {'file_path': '00_files/data/storis-wmec-38-1957-logbooks_0126.JPG',
   'media_type': 'image/jpeg',
   'uuid': '4ed9a76e366811ea8e2808119645fa1c'},
  {'file_path': '00_files/data/storis-wmec-38-1957-logbooks_0127.JPG',
   'media_type': 'image/jpeg',
   'uuid': '4f111cf4366811ea8e2808119645fa1c'},
  {'file_path': '00_files/data/storis-wmec-38-1957-logbooks_0128.JPG',
   'media_type': 'image/jpeg',
   'uuid': '4f47a8e6366811ea8e2808119645fa1c'},
  {'file_path': '00_files/data/storis-wmec-38-1957-logbooks_0129.JPG',
   'media_type': 'image/jpeg',
   'uuid': '4f8020c2366811ea8e2808119645fa1c'},
  {'file_path': '00_files/data/storis-wmec-38-1957-logbooks_0130.JPG',
   'media_type': 'image/jpeg',
   'uuid': '4fb66cd2366811ea8e2808119645fa1c'},
  {'file_path': '00_files/data/storis-wmec-38-1957-lo

In [19]:
catalog = unnormalize_catalog(normalized_catalog)
# We flatten the normalized catalog. 
# Each file in the data directory "has its own entry" in this catalog.
# We'll eventually ignore non-image files.

In [21]:
catalog

[{'archive.host_country': 'USA',
  'document.contact_person': 'Kevin Wood',
  'archive.notes': 'Images available via API at https://catalog.archives.gov/api/v1/38547962',
  'platform.name': 'USCG Storis',
  'document.id_within_archive': '38547962',
  'document.id_within_archive_type': 'NARA ID',
  'document.record_type': "ships' logs",
  'document.accession_to_archive_date': '2016-08-19',
  'document.standardized_region_list': 'north_atlantic',
  'document.start_date': '1957-09-30',
  'document.rights_statement': 'CC0 Public Domain',
  'document.notes': '',
  'file_path': '00_files/data/metadata.csv',
  'media_type': 'text/plain'},
 {'archive.host_country': 'USA',
  'document.contact_person': 'Kevin Wood',
  'archive.notes': 'Images available via API at https://catalog.archives.gov/api/v1/38547962',
  'platform.name': 'USCG Storis',
  'document.id_within_archive': '38547962',
  'document.id_within_archive_type': 'NARA ID',
  'document.record_type': "ships' logs",
  'document.accession_

In [22]:
write_timestamped_catalog(catalog, out)
# We write this version of the metadata catalog to the output directory.

In [25]:
df = pd.read_json('00_files/out/2020-01-13-170200-catalog.json')
df.columns

Index(['archive.host_country', 'archive.notes',
       'document.accession_to_archive_date', 'document.contact_person',
       'document.id_within_archive', 'document.id_within_archive_type',
       'document.notes', 'document.record_type', 'document.rights_statement',
       'document.standardized_region_list', 'document.start_date', 'file_path',
       'media_type', 'platform.name', 'uuid'],
      dtype='object')

In [30]:
df.shape

(24, 15)

In [32]:
df[df['platform.name'] == 'USCG Autumn']

Unnamed: 0,archive.host_country,archive.notes,document.accession_to_archive_date,document.contact_person,document.id_within_archive,document.id_within_archive_type,document.notes,document.record_type,document.rights_statement,document.standardized_region_list,document.start_date,file_path,media_type,platform.name,uuid
22,USA,Images available via API at https://catalog.ar...,2016-08-19,Kevin Wood,38547962,NARA ID,,ships' logs,CC0 Public Domain,north_atlantic,1957-09-30,00_files/data/tmp/metadata.csv,text/plain,USCG Autumn,
23,USA,Images available via API at https://catalog.ar...,2016-08-19,Kevin Wood,38547962,NARA ID,,ships' logs,CC0 Public Domain,north_atlantic,1957-09-30,00_files/data/tmp/storis-wmec-38-1957-logbooks...,image/jpeg,USCG Autumn,531294b6366811ea8e2808119645fa1c


In [None]:
catalog = read_timestamped_catalog(out)
# We read in the most recent version of the metadata catalog from the out directory.

elementary_family = [c for c in catalog if c['media_type'].startswith("image")]
# We create a list of all the entries in the catalog that are image files.

In [None]:
import os
# We'll perform some file renames between the data directory and the out directory.

# We move all the images in the catalog to the output directory.
for member in elementary_family:
    os.rename(member['file_path'], os.path.join(out, member['uuid']))

In [None]:
# Conversely, we move all the images in the catalog back to the data directory.
for member in elementary_family:
    os.rename(os.path.join(out, member['uuid']), member['file_path'])

## Clean up the working directory

In [1]:
!rm -r 00_files/

rm: cannot remove '00_files/': No such file or directory
