## Preliminaries

*We'll set up a working directory and download a small sample
of images to ingest.*

In [None]:
import os

# We create a working directory for this example.
os.makedirs("00_files", exist_ok=True)

# We create a "data" subdirectory for the images and the metadata tag files.
os.makedirs("00_files", exist_ok=True)
data = "00_files/data"

# We create an "out" subdirectory for the processed images and the metadata catalog.
os.makedirs("00_files/out", exist_ok=True)
out  = "00_files/out"  

Let's download a sample collection of 20 images from the logbooks of the USCG
Storis.

In [None]:
import requests
import logging
import http.client

# To set up logging.
# https://stackoverflow.com/questions/16337511/
http.client.HTTPConnection.debuglevel = 1
logging.basicConfig()
logging.getLogger().setLevel(logging.DEBUG)
requests_log = logging.getLogger("requests.packages.urllib3")
requests_log.setLevel(logging.DEBUG)
requests_log.propagate = True

# To access the NARA API for images of the USCG Storis' 1957 logbook.
nara_id = "38547962"
api_base = 'https://catalog.archives.gov/api/v1/'
api_url = '{0}?naIds={1}'.format(api_base, nara_id)
res = requests.get(api_url)

# To parse the NARA API output for metadata.
entry_img_array = res.json().get('opaResponse').get('results').get('result')[0].get('objects').get('object')
digital_directory = entry_img_array[0].get('file').get('@path').split("/")[-2]

# To write the NARA API output to file for reference.
api_output = "{0}/nara_id_{1}.json".format(data, digital_directory, nara_id)
if res.status_code == 200:
    with open(api_output, 'wb') as f:
        f.write(res.content)

# To download images of 40 pages of the Storis' logbooks.
for img_info in entry_img_array: 

    # We test for mimetype "image/jpeg"---we don't want to download any files
    # with mimetype "application/pdf".
    if img_info.get('file').get('@mime') == "image/jpeg":

        img_name = img_info.get('file').get('@name')
        img_url = img_info.get('file').get('@url')
        img_res = requests.get(img_url)

        # To write a single image to file.
        local_img_name = "{0}/{1}".format(data, img_name)
        if img_res.status_code == 200:
            with open(local_img_name, 'wb') as img_f:
                img_f.write(img_res.content)

Let's write a metadata tagfile in the "data" subdirectory with the minimal
required metadata for the sample of images.

In [None]:
import csv

with open(os.path.join(data, 'metadata.csv'), mode='w') as metadata_file:
    metadata_writer = csv.writer(metadata_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    
    metadata_writer.writerow(['archive.host_country', 'USA'])
    metadata_writer.writerow(['document.contact_person', 'Kevin Wood'])
    metadata_writer.writerow(['archive.notes', 'Images available via API at https://catalog.archives.gov/api/v1/38547962'])
    metadata_writer.writerow(['platform.name', 'USCG Storis'])
    metadata_writer.writerow(['document.id_within_archive', '38547962'])
    metadata_writer.writerow(['document.id_within_archive_type', 'NARA ID'])
    metadata_writer.writerow(['document.record_type', "ships' logs"])
    metadata_writer.writerow(['document.accession_to_archive_date', '2016-08-19'])
    metadata_writer.writerow(['document.standardized_region_list', 'north_atlantic'])
    metadata_writer.writerow(['document.start_date', '1957-06-09'])
    metadata_writer.writerow(['document.start_date', '1957-09-30'])
    metadata_writer.writerow(['document.rights_statement', 'CC0 Public Domain'])
    metadata_writer.writerow(['document.notes', ''])

During ingest we'll associate the above metadata to the 20 samples images. In
practice, any `.csv` file in the `data` subdirectory will be parsed as a
metadata tagefile. For example, the tagfile `metadata.csv` provides metadata for
images in the same directory `uscg-storis/data` as itself and in all
subdirectory below itself. 

To enable users to provide "hierarchical" metadata,
the information in a tagfile from a subdirectory has precendence over any
tagfiles from parent directories. (The idea is to provide the *most specific
metadata* for images in the same directory as the images themselves, while
parent directories might provide *general metadata* for a whole collection of
images.)

Here's what the tagfile we created looks like.

In [None]:
import pandas as pd
df = pd.read_csv(os.path.join(data, "metadata.csv"), header=None, names=["field", "value"])
df

## Ingesting images

First, we'll interactively load "helper" functions as defined in the `rdai`
module.

In [None]:
%run -i rdai

Now, we'll define the global variable `fixed_seq` in order to call `mint_uuid`
for each image file.

In [None]:
# We generate a fixed sequence for uuids.
get_fixed_seq()

If we're on casper, then we'll need to load python-magic from `rdadata`. Else,
we assume the python-magic package has already been installed, e.g., with `pip3
install python-magic --user`.

In [None]:
import sys
sys.path.append('/glade/u/home/rdadata/lib/python/site-packages')

In [None]:
# get_exiftool()
import subprocess
repo_dir = subprocess.Popen(['git', 'rev-parse', '--show-toplevel'], stdout=subprocess.PIPE).communicate()[0].rstrip().decode('utf-8')
import sys
sys.path.append(os.path.join(repo_dir, "dependencies/pyexiftool"))
import exiftool

In [None]:
normalized_catalog = get_normalized_catalog(data)
# We generate a metadata catalog (unnormalized) from the data directory.

catalog = unnormalize_catalog(normalized_catalog)
# We flatten the normalized catalog. 
# Each file in the data directory "has its own entry" in this catalog.
# We'll eventually ignore non-image files.

write_timestamped_catalog(catalog, out)
# We write this version of the metadata catalog to the output directory.

In [None]:
catalog = read_timestamped_catalog(out)
# We read in the most recent version of the metadata catalog from the out directory.

elementary_family = [c for c in catalog if c['media_type'].startswith("image")]
# We create a list of all the entries in the catalog that are image files.

In [None]:
import os
# We'll perform some file renames between the data directory and the out directory.

# We move all the images in the catalog to the output directory.
for member in elementary_family:
    os.rename(member['file_path'], os.path.join(out, member['uuid']))

In [None]:
# Conversely, we move all the images in the catalog back to the data directory.
for member in elementary_family:
    os.rename(os.path.join(out, member['uuid']), member['file_path'])

## Clean up the working directory

In [1]:
!rm -r 00_files/

rm: cannot remove '00_files/': No such file or directory
