## setting up an "acceptance test" by cleaning out the `01_files` workspace

In [None]:
# Start from a clean slate: recursively delete any existing `01_files` workspace.
# `-f` keeps this from failing when the directory does not exist yet.
!rm -rf 01_files

## create `01_files` workspace and populate it with `tar` archive

In [None]:
import os

# Root of the workspace for this example.
os.makedirs("01_files", exist_ok=True)

# Location of the images and the metadata tag files. Nothing is created here;
# presumably the archive extracted further down supplies this directory.
data = "01_files/data"

# Destination for the processed images and the metadata catalog.
out = "01_files/out"
os.makedirs(out, exist_ok=True)

In [None]:
import requests
# import logging
# import http.client

# # To set up logging.
# # https://stackoverflow.com/questions/16337511/
# http.client.HTTPConnection.debuglevel = 1
# logging.basicConfig()
# logging.getLogger().setLevel(logging.DEBUG)
# requests_log = logging.getLogger("requests.packages.urllib3")
# requests_log.setLevel(logging.DEBUG)
# requests_log.propagate = True

# Download the minimal-working-example archive into the workspace.
mwe_url = "https://github.com/coltongrainger/2020-02-05-mwe/raw/master/2020-02-05-mwe.tar.gz"
mwe_tar_archive = "01_files/2020-02-05-mwe.tar.gz"

# A timeout keeps the cell from hanging forever on a stalled connection.
res = requests.get(mwe_url, timeout=60)
# Fail loudly on an HTTP error instead of silently skipping the write and
# leaving the extraction step to trip over a missing archive.
res.raise_for_status()
with open(mwe_tar_archive, 'wb') as f:
    f.write(res.content)

In [None]:
import subprocess
# Unpack the downloaded archive into the workspace with the system `tar`.
tar_cmd = ['tar', '-xvf', mwe_tar_archive, '-C', "01_files"]
retcode = subprocess.call(tar_cmd)

# Anything other than exit status 0 means the extraction did not succeed.
if retcode != 0:
    raise IOError('tar exited with code %d' % retcode)
print("Extracted successfully")

In [None]:
# The archive was unpacked in the previous cell, so delete the downloaded tarball.
os.remove(mwe_tar_archive)

In [None]:
# Show what was extracted: print the contents of every entry under `data`.
for entry in os.listdir(data):
    entry_path = os.path.join(data, entry)
    print(os.listdir(entry_path))

## catalog images under `data` directory

In [None]:
# Run the helper script with `-i`, i.e. inside this notebook's namespace, so the
# names it defines become available to the cells below.
# NOTE(review): presumably this is where `get_fixed_seq`, `get_normalized_catalog`,
# etc. come from -- confirm against scripts/utils.py.
%run -i "scripts/utils.py"

In [None]:
# We generate a fixed sequence for uuids.
# NOTE(review): presumably this makes the uuids assigned below reproducible
# between runs -- confirm against the helper's definition.
get_fixed_seq()

In [None]:
import os
import subprocess
import sys

# Make the shared site-packages directory importable.
sys.path.append('/glade/u/home/rdadata/lib/python/site-packages')

# get_exiftool()
# Ask git for the top-level directory of the enclosing repository, then make
# the pyexiftool copy kept under "dependencies/" importable from there.
git_toplevel = subprocess.run(
    ['git', 'rev-parse', '--show-toplevel'],
    stdout=subprocess.PIPE,
)
repo_dir = git_toplevel.stdout.rstrip().decode('utf-8')
sys.path.append(os.path.join(repo_dir, "dependencies/pyexiftool"))
import exiftool

In [None]:
normalized_catalog = get_normalized_catalog(data)
# We generate a normalized metadata catalog from the data directory.

In [None]:
catalog = unnormalize_catalog(normalized_catalog)
# We flatten the normalized catalog into an unnormalized one.
# Each file in the data directory "has its own entry" in this catalog.
# Non-image files are filtered out further down.

In [None]:
write_timestamped_catalog(catalog, out)
# We write this version of the metadata catalog to the output directory.
# NOTE(review): the name suggests the file is tagged with a timestamp -- confirm
# against the helper's definition.

In [None]:
catalog = read_timestamped_catalog(out)
# We read the most recent version of the metadata catalog back in from the out
# directory, replacing the in-memory `catalog` built above.

## rename images by `uuid`

In [None]:
# Keep only the catalog entries whose media type marks them as image files.
elementary_family = [
    entry
    for entry in catalog
    if entry['media_type'].startswith("image")
]

In [None]:
import os
# Every image listed in the catalog is moved out of the data directory and
# into the out directory, where it is stored under its uuid.
for member in elementary_family:
    source_path = member['file_path']
    target_path = os.path.join(out, member['uuid'])
    os.rename(source_path, target_path)

## pick up catalog with `pandas`

In [None]:
import pandas as pd

# Load the flat catalog into a DataFrame and keep only the image entries.
# `startswith` mirrors the test used to pick the files that get renamed, so the
# rows kept here are exactly the files moved into the out directory;
# `str.contains` would also match "image" in the middle of a media type.
df = pd.DataFrame(catalog)
df = df[df['media_type'].str.startswith("image")]

## filter `DataFrame` for archives, platforms, documents, and images to insert into DB `images`

In [None]:
import re

In [None]:
# One row per distinct archive: take the `archive*` columns, drop repeated rows,
# and strip the "archive." prefix from the column names.
# The pattern is anchored and the dot escaped so that only the leading
# "archive." prefix is removed, never a later "archive" + any character.
arc_df = (
    df.filter(regex=r"^archive")
    .drop_duplicates()
    .rename(columns=lambda col: re.sub(r"^archive\.", "", col))
)
arc_df.to_dict('records')

In [None]:
# One row per distinct platform: take the `platform*` columns, drop repeated
# rows, and strip the "platform." prefix from the column names.
# The pattern is anchored and the dot escaped so that only the leading
# "platform." prefix is removed, never a later "platform" + any character.
plt_df = (
    df.filter(regex=r"^platform")
    .drop_duplicates()
    .rename(columns=lambda col: re.sub(r"^platform\.", "", col))
)
plt_df.to_dict('records')

In [None]:
# One row per distinct document. Next to the `document*` columns we keep the
# archive and platform name/country columns, which are needed later to look up
# the parent rows of each document.
doc_df = pd.concat(
    [
        df.filter(regex=r"^document"),
        df.filter(items=["archive.host_country", "archive.name"]),
        df.filter(items=["platform.host_country", "platform.name"]),
    ],
    axis=1,
).drop_duplicates()
# Strip only the leading "document." prefix (anchored, dot escaped); the
# archive.* and platform.* columns keep their full names.
doc_df = doc_df.rename(columns=lambda col: re.sub(r"^document\.", "", col))
doc_df = doc_df.fillna("")  # avoid NaNs

## declare and persist tables in `images`

In [None]:
# Run the table declarations with `-i`, i.e. inside this notebook's namespace.
# NOTE(review): presumably this defines `metadata`, `engine` and the `archive`,
# `platform`, `document` and `image` tables used below -- confirm against
# scripts/tables.py.
%run -i scripts/tables.py

In [None]:
# from sqlalchemy import create_engine
# engine = create_engine('mysql+mysqlconnector://user:pass@rda-db.ucar.edu/images')
# engine = create_engine('mysql+pymysql://user:pass@localhost/images')
# TODO read defaults extra file

In [None]:
# WARNING: destructive -- every table registered on `metadata` is dropped, along
# with its rows, before the schema is created again from scratch.
# NOTE(review): `metadata` and `engine` are not defined in this notebook;
# presumably they come from scripts/tables.py -- confirm.
metadata.drop_all(engine) # clean out the DB
metadata.create_all(engine) # reinitialize the canonical schema.

In [None]:
# Open the connection that the cells below use for their inserts and queries.
connection = engine.connect() # let's start working with these tables

## metadata insertion for archives

In [None]:
from sqlalchemy import insert, select

In [None]:
# Bulk-insert one row per distinct archive found in the catalog.
archive_rows = arc_df.to_dict('records')
ins = insert(archive)
rp = connection.execute(ins, archive_rows)
# throws integrity error if run twice

In [None]:
# Read every archive row back and print it, to check the insert above.
s = select([archive])
rp = connection.execute(s)
for archive_row in rp:
    print(archive_row)

## metadata insertion for platforms

In [None]:
# Bulk-insert one row per distinct platform found in the catalog.
platform_rows = plt_df.to_dict('records')
ins = insert(platform)
rp = connection.execute(ins, platform_rows)

In [None]:
# Read every platform row back and print it, to check the insert above.
s = select([platform])
rp = connection.execute(s)
for platform_row in rp:
    print(platform_row)

## metadata insertion for documents

In [None]:
from sqlalchemy import and_

def get_archive_fk(doc_dict): # add error handling
    """Look up the primary key of the archive a document belongs to.

    The archive is matched on the ``archive.name`` and
    ``archive.host_country`` entries of ``doc_dict``. The query runs on the
    notebook-level ``connection``.

    Returns the ``archive_id`` of the first matching row, or ``None`` when no
    row matches (no error is raised in that case).
    """
    matches_archive = and_(
        archive.c.name == doc_dict['archive.name'],
        archive.c.host_country == doc_dict['archive.host_country'],
    )
    # The (name, host_country) pair should be unique anyway; LIMIT 1 only
    # makes that explicit.
    query = select([archive.c.archive_id]).where(matches_archive).limit(1)
    return connection.execute(query).scalar()  # the parent id

def get_platform_fk(doc_dict): # add error handling
    """Look up the primary key of the platform a document belongs to.

    The platform is matched on the ``platform.name`` and
    ``platform.host_country`` entries of ``doc_dict``. The query runs on the
    notebook-level ``connection``.

    Returns the ``platform_id`` of the first matching row, or ``None`` when
    no row matches (no error is raised in that case).
    """
    matches_platform = and_(
        platform.c.name == doc_dict['platform.name'],
        platform.c.host_country == doc_dict['platform.host_country'],
    )
    # The (name, host_country) pair should be unique anyway; LIMIT 1 only
    # makes that explicit.
    query = select([platform.c.platform_id]).where(matches_platform).limit(1)
    return connection.execute(query).scalar()  # the parent id

In [None]:
# Insert one row per document, linked to its parent archive and platform.
# ('records' is the full orient name; the short form 'record' is rejected by
# current pandas.)
for doc_dict in doc_df.to_dict('records'):
    # Resolve the parents first: the lookups need the archive.* / platform.*
    # entries that are stripped from the row just below.
    arc_id = get_archive_fk(doc_dict)
    plt_id = get_platform_fk(doc_dict)
    for key in [
        'archive.host_country',
        'archive.name',
        'platform.host_country',
        'platform.name'
    ]:
        doc_dict.pop(key)
    # The foreign keys go into the parameter dict itself. `execute` takes bound
    # values either as a dict or as keyword arguments; keywords passed next to
    # a positional dict are not merged into it (and newer SQLAlchemy rejects
    # them), which would leave both foreign keys unset.
    doc_dict['archive_id'] = arc_id
    doc_dict['platform_id'] = plt_id
    ins = insert(document)
    rp = connection.execute(ins, doc_dict)
# throws an integrity error if run twice

In [None]:
# List the documents whose start date falls in the twentieth century.
# BETWEEN is inclusive at both ends, so the range stops at 1999-12-31 (the same
# bounds the image subsetting queries use); an upper bound of 2000-01-01 would
# let a document from the next century through.
s = select([document.c.id_within_archive_type, document.c.id_within_archive, document.c.contact_person])
s = s.where(document.c.start_date.between("1900-01-01", "1999-12-31"))
rp = connection.execute(s)
for res in rp:
    print(res)

In [None]:
# List the documents whose start date falls in the nineteenth century.
# BETWEEN is inclusive at both ends, so the range stops at 1899-12-31 (the same
# bounds the image subsetting queries use); an upper bound of 1900-01-01 would
# also list a document dated on the first day of the twentieth century.
s = select([document.c.id_within_archive_type, document.c.id_within_archive, document.c.contact_person])
s = s.where(document.c.start_date.between("1800-01-01", "1899-12-31"))
rp = connection.execute(s)
for res in rp:
    print(res)

## metadata insertion for images

In [None]:
def get_document_fk(img_dict):
    """Look up the primary key of the document an image belongs to.

    The document is matched on the ``document.id_within_archive`` and
    ``document.id_within_archive_type`` entries of ``img_dict``. The query
    runs on the notebook-level ``connection``.

    Returns the ``document_id`` of the first matching row, or ``None`` when
    no row matches (no error is raised in that case).
    """
    matches_document = and_(
        document.c.id_within_archive == img_dict['document.id_within_archive'],
        document.c.id_within_archive_type == img_dict['document.id_within_archive_type'],
    )
    # The pair of identifiers should be unique anyway; LIMIT 1 only makes
    # that explicit.
    query = select([document.c.document_id]).where(matches_document).limit(1)
    return connection.execute(query).scalar()  # the parent id

In [None]:
# Columns needed to insert an image: its own fields plus the two document
# identifiers used to look up the parent document.
image_columns = [
    "uuid",
    "media_type",
    "document.id_within_archive",
    "document.id_within_archive_type",
]
img_df = df.filter(items=image_columns)

In [None]:
# Insert one row per image, linked to its parent document.
# ('records' is the full orient name; the short form 'record' is rejected by
# current pandas.)
for img_dict in img_df.to_dict('records'):
    # Resolve the parent first: the lookup needs the document.* entries that
    # are stripped from the row just below.
    doc_id = get_document_fk(img_dict)
    for key in ["document.id_within_archive","document.id_within_archive_type"]:
        img_dict.pop(key)
    img_dict['document_id'] = doc_id
    # Bind the values once, on the statement itself, rather than through the
    # legacy `insert(table, values)` form and again as execute() parameters.
    ins = insert(image).values(img_dict)
    print(ins.compile().params)
    rp = connection.execute(ins)
# throws an integrity error if run twice

## retrieval of images by uuid

In [None]:
from IPython.display import Image, display

In [None]:
# List every stored image, ordered by uuid, as its path in the out directory
# followed by its media type.
s = select([image.c.uuid, image.c.media_type]).order_by(image.c.uuid)
rp = connection.execute(s)

for img in rp:
    stored_path = os.path.join(out, img.uuid)
    print(stored_path, img.media_type)

## subsetting by date range

In [None]:
# Images joined to their documents, restricted to documents that start in the
# twentieth century.
columns = [
    image.c.uuid,
    image.c.media_type,
    document.c.start_date,
    document.c.standardized_region_list,
]
in_twentieth_century = document.c.start_date.between("1900-01-01", "1999-12-31")

twentieth_century = (
    select(columns)
    .select_from(image.join(document))
    .where(in_twentieth_century)
)

rp = connection.execute(twentieth_century).fetchall()

for img in rp:
    # Files in the out directory are named by uuid alone, so the image format
    # is taken from the media type (e.g. "image/png" -> "png").
    image_path = os.path.join(out, img.uuid)
    image_format = img.media_type.replace("image/","")
    display(Image(filename=image_path, format=image_format))
    # Print each selected column under the image, right-aligned by name.
    for key in img.keys():
        print('{:>20}: {}'.format(key, img[key]))


In [None]:
# Images joined to their documents, restricted to documents that start in the
# nineteenth century and ordered by that start date.
columns = [
    image.c.uuid,
    image.c.media_type,
    document.c.start_date,
    document.c.standardized_region_list,
]
in_nineteenth_century = document.c.start_date.between("1800-01-01", "1899-12-31")

nineteenth_century = (
    select(columns)
    .select_from(image.join(document))
    .where(in_nineteenth_century)
    .order_by(document.c.start_date)
)

rp = connection.execute(nineteenth_century).fetchall()

for img in rp:
    # Files in the out directory are named by uuid alone, so the image format
    # is taken from the media type (e.g. "image/png" -> "png").
    image_path = os.path.join(out, img.uuid)
    image_format = img.media_type.replace("image/","")
    display(Image(filename=image_path, format=image_format))
    # Print each selected column under the image, right-aligned by name.
    for key in img.keys():
        print('{:>20}: {}'.format(key, img[key]))