# This notebook runs the finetuned image recognition model over Chroncling America pages

Using the manifests folder generated using the repo chronam-get-images (https://github.com/bcglee/chronam-get-images), this notebook:

1. Systematically downloads the JPG files from the 'ndnp-jpeg-surrogates' S3 bucket and the XML files from the 'ndnp-batches' S3 bucket. *Note: if you're experimenting with this pipeline and can't configure access to the s3 bucket, you can use the repo chronam-get-images (https://github.com/bcglee/chronam-get-images) to pull down the JPG and XML files of desired pages in the correct file heirarchy; then, move those files to '../chronam_files').
2. Generates and saves predictions for each JPG image. This step utilizes model weights generated using the notebook 'train_model.ipynb'.
3. Adds captions from the METS/ALTO OCR in the XML for each image as metadata. This step is performed by identifying text within each predicted bounding box.
4. Crops all of the predicted visual content and saves the cropped images.
5. Generates embeddings for the predicted visual content for each image and adds the embeddings to the metadata. Currently, img2vec is being utilized for this (https://github.com/christiansafka/img2vec).
6. Zips the files, sends to an S3 bucket (currently set to my private bucket) and deletes the downlaoded JPG and XML files to free disk space.

# The next two cells include the code for the systematic download of Chroncling America images from the S3 buckets.

This first cell handles imports and initial settings for pulling down the files:

In [1]:
import boto3
import botocore
import s3fs
import glob
import sys
import os
import time
from PIL import Image
import io
import math
import datetime 


This second cell handles the file retrieval for a specified manifest and destination directory:

In [2]:
# function that retrieves .jpg and .xml files for each filepath in manifest
def retrieve_files(packet):
        
    # creates dict for storing widths/heights of images
    im_size_dict = {}
    
    ct = 0
    
    # grab directory to CD into first (it is the firs entry in the array)
    dir_path = packet[0]
    os.chdir(dir_path)
    
    # grabs page_filepaths from the data packet
    page_filepaths = packet[1]
        
    # iterate through each filepath and download
    for page_filepath in page_filepaths:
                
        # sets filepath for download destination (note: file is .jp2, so we need to replace suffixes below)
        local_filepath = page_filepath.replace('/', '_').replace('.jp2', '.jpg')

        # see: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/migrations3.html
        # see also:  https://www.edureka.co/community/17558/python-aws-boto3-how-do-i-read-files-from-s3-bucket
        try:
            obj = s3.Object('ndnp-jpeg-surrogates', page_filepath.replace(".jp2", ".jpg"))
            body = obj.get()['Body'].read()
            im = Image.open(io.BytesIO(body))
            im.resize((math.floor(im.width/6), math.floor(im.height/6)), resample=0).save(local_filepath)
            im_size_dict[page_filepath.replace("/", "_").replace(".jp2", ".jpg")] = (im.width, im.height)

        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                print("The object does not exist.")
            else:
                raise
                
        try:
            s3.Bucket('ndnp-batches').download_file(page_filepath.replace(".jp2", ".xml"), local_filepath.replace(".jpg", ".xml"))
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                print("The object does not exist.")
            else:
                raise
                
        if ct % 10 == 0:
            print(str(ct))
            
        ct += 1
        
    return im_size_dict


# The next two cells load the finetuned model and define the function for performing predictions on the images saved above.

In [3]:
# import some common libraries
import cv2
import random
import json
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# import detectron2, etc.
import detectron2
from detectron2.config import get_cfg
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import register_coco_instances
from detectron2.engine import DefaultTrainer
from detectron2.engine import DefaultPredictor
from detectron2.evaluation import COCOEvaluator
from detectron2.utils.visualizer import Visualizer
from detectron2.utils.visualizer import ColorMode
from detectron2.utils.logger import setup_logger

# added to enable increased batch size at inference time to increase GPU utilization
from detectron2.modeling import build_model
from detectron2.checkpoint import DetectionCheckpointer
import torch


In [4]:
def generate_predictions(zipped):
            
    # unzips packed information for process to perform predictions
    
    S3_SAVE_DIR = zipped[0]
    OUTPUT_SAVE_DIR = zipped[1]
    dir_name = zipped[2]
    INFERENCE_BATCH_SIZE = zipped[3]
    filepaths = zipped[4]
    ID = zipped[5]

    with torch.cuda.device(ID):

        # navigates to correct directory (process is spawned in /notebooks)

        os.chdir(S3_SAVE_DIR + dir_name)

        # sets up model for process

        setup_logger()
        cfg = get_cfg()
        cfg.merge_from_file("../../..//detectron2/configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml")

        # sets prediction score threshold - note that 0.5 is fairly conservative
        cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5

        # sets number of object classes
        # (5:  "Illustration/Photograph", "Photograph", "Comics/Cartoon", "Editorial Cartoon", "Map", "Headline", "Ad")
        cfg.MODEL.ROI_HEADS.NUM_CLASSES = 7

        # build model
        model = build_model(cfg)

        # see:  https://github.com/facebookresearch/detectron2/issues/282 - must load weights this way if using model
        DetectionCheckpointer(model).load("../../model_weights/model_final.pth")
        model.train(False) 

        ct = 0

        # construct batches
        batches = chunk(filepaths, math.ceil(len(filepaths)/INFERENCE_BATCH_SIZE))

        # iterate through images
        for batch in batches:

            # sets up inputs by loading in all files in batch
            inputs = []

            # stores image dimensions
            dimensions = []

            # iterate through files in batch
            for file in batch:

                # read in image
                image = cv2.imread(file)

                # store image dimensions
                height, width, _ = image.shape
                dimensions.append([width, height])

                # perform inference on batch
                image = np.transpose(image,(2,0,1))
                # see https://github.com/facebookresearch/detectron2/issues/282 for in-depth description of why 
                # image is loaded in this way
                image_tensor = torch.from_numpy(image)
                inputs.append({"image": image_tensor})

            # performs inference
            outputs = model(inputs)

            # saves predictions
            predictions = {}

            # iterate over images in batch and save predictions to JSON
            for i in range(0, len(batch)):

                # saves file name in format of ChronAm file structure
                predictions["filepath"] = dir_name + "data/" + batch[i].split("data_")[1].replace(dir_name, '').replace('_', '/')

                # saves date of publication
                date_str = predictions["filepath"].split('/')[-2]
                predictions["pub_date"] = str(datetime.date(int(date_str[:4]), int(date_str[4:6]), int(date_str[6:8])))

                # saves predictions
                # we first normalize the bounding box coordinates
                boxes = outputs[i]["instances"].get_fields()["pred_boxes"].to("cpu").tensor.tolist()
                normalized_boxes = []
                width = dimensions[i][0]
                height = dimensions[i][1]

                for box in boxes:
                    normalized_box = (box[0]/float(width), box[1]/float(height), box[2]/float(width), box[3]/float(height))
                    normalized_boxes.append(normalized_box)

                # saves additional outputs of predictions
                predictions["boxes"] = normalized_boxes
                predictions["scores"] = outputs[i]["instances"].get_fields()["scores"].to("cpu").tolist()
                predictions["pred_classes"] = outputs[i]["instances"].get_fields()["pred_classes"].to("cpu").tolist()

                with open(OUTPUT_SAVE_DIR + dir_name + batch[i].replace('.jpg','.json'), "w") as fp:
                    json.dump(predictions, fp)

            if ct % 10 == 0:
                print(ct)

            ct += INFERENCE_BATCH_SIZE


# The next two cells define functions for extracting the OCR within each predicted box

1. The first cell defines the function for returning the proper OCR for a specific page.
2. The second cell defines the function for iterating over the JSON files containing the predictions.

In [5]:
# import xml.etree.cElementTree as ET
# etree
from lxml import etree as ET
from xml.etree.ElementTree import ElementTree

# tolerance around box for testing whether OCR falls within bounds
WIDTH_TOLERANCE = 0.000
HEIGHT_TOLERANCE = 0.000

# given a file path and a list of bounding boxes, this function traverses the associated XML
# and returns the OCR within each bounding box
def retrieve_ocr_for_file(xml_filepath, true_img_filepath, page_width_pix, page_height_pix, bounding_boxes, predicted_classes):

    # creates empty nested list fo storing OCR in each box
    ocr = [ [] for i in range(len(bounding_boxes)) ]

    # sets tree and root based on filepath
    parser = ET.XMLParser()
    tree = ET.parse(xml_filepath, parser)
    root = tree.getroot()
    
    # sets tag prefix (everywhere)
    prefix = root.tag.split('}')[0] + '}'

    # traverses to layout and then the page and then the print space
    layout = root.find(prefix + 'Layout')
    page = layout.find(prefix + 'Page')
    print_space = page.find(prefix + 'PrintSpace')
    
    text_boxes =  [textblock for textblock in print_space.iterchildren(prefix + "TextBlock")]
    
    # gets page height and page width in inch1200 units
    page_width_inch = int(page.attrib['WIDTH'])
    page_height_inch = int(page.attrib['HEIGHT'])

    # sets conversion to normalized coordinates for comparison between METS/ALTO and predicted boxes
    W_CONVERSION = 1./float(page_width_inch)
    H_CONVERSION = 1./float(page_height_inch)

    # we now iterate over each bounding box
    for i in range(0, len(bounding_boxes)):

        bounding_box = bounding_boxes[i]
        predicted_class = predicted_classes[i]

        # we then iterate over each text box
        for text_box in text_boxes:
                        
            box_w1 = int(float(text_box.attrib["HPOS"]))
            box_h1 = int(float(text_box.attrib["VPOS"]))
            box_w2 = box_w1 + int(float(text_box.attrib["WIDTH"]))
            box_h2 = box_h1 + int(float(text_box.attrib["HEIGHT"]))
            
            # if the text box and bounding box do not intersect, we skip (as no text will overlap in smaller units)
            if box_w2*W_CONVERSION < bounding_box[0] and box_h2*H_CONVERSION < bounding_box[1]:
                continue
            if box_w1*W_CONVERSION > bounding_box[0] + bounding_box[2] and box_h2*H_CONVERSION < bounding_box[1]:
                continue
            if box_w2*W_CONVERSION < bounding_box[0] and box_h1*H_CONVERSION > bounding_box[1] + bounding_box[3]:
                continue
            if box_w1*W_CONVERSION > bounding_box[0] + bounding_box[2] and box_h1*H_CONVERSION > bounding_box[1] + bounding_box[3]:
                continue
                
            # we then iterate over the text lines in each box
            for text_line in text_box.iterchildren(prefix + 'TextLine'):
                
                line_w1 = int(float(text_box.attrib["HPOS"]))
                line_h1 = int(float(text_box.attrib["VPOS"]))
                line_w2 = line_w1 + int(float(text_box.attrib["WIDTH"]))
                line_h2 = line_h1 + int(float(text_box.attrib["HEIGHT"]))

                # if the text box and bounding box do not intersect, we skip (as no text will overlap in smaller units)
                if line_w2*W_CONVERSION < bounding_box[0] and line_h2*H_CONVERSION < bounding_box[1]:
                    continue
                if line_w1*W_CONVERSION > bounding_box[0] + bounding_box[2] and line_h2*H_CONVERSION < bounding_box[1]:
                    continue
                if line_w2*W_CONVERSION < bounding_box[0] and line_h1*H_CONVERSION > bounding_box[1] + bounding_box[3]:
                    continue
                if line_w1*W_CONVERSION > bounding_box[0] + bounding_box[2] and line_h1*H_CONVERSION > bounding_box[1] + bounding_box[3]:
                    continue
                
                # we now iterate over every string in each line (each string is separated by whitespace)
                for string in text_line.iterchildren(prefix + 'String'):
            
                    w1 = int(float(string.attrib["HPOS"]))
                    h1 = int(float(string.attrib["VPOS"]))
                    w2 = w1 + int(float(string.attrib["WIDTH"]))
                    h2 = h1 + int(float(string.attrib["HEIGHT"]))

                    # checks if the text appears within the bounding box & extra tolerance for words that are clipped
                    if w1*W_CONVERSION > bounding_box[0] - WIDTH_TOLERANCE:
                        if w2*W_CONVERSION < bounding_box[2] + WIDTH_TOLERANCE:
                            if h1*H_CONVERSION > bounding_box[1] - HEIGHT_TOLERANCE:
                                if h2*H_CONVERSION < bounding_box[3] + HEIGHT_TOLERANCE:

                                    # appends text content to list
                                    ocr[i].append(string.attrib["CONTENT"])

    return ocr

In [6]:
def retrieve_ocr(packet):

    # grab contents of packet, CD into correct directory
    dir_name = packet[1]
    os.chdir(packet[0] + dir_name)
    json_info = packet[2]

    ct = 0
    
    # we now iterate through all of the predictions JSON files
    for json_entry in json_info:
        
        # unpacks the input from Pool
        json_filepath = json_entry[0]
        im_width = json_entry[1]
        im_height = json_entry[2]
        
        # loads the JSON
        with open(json_filepath) as f:
            predictions = json.load(f)
        
        # pulls off relevant data fields from the JSON
        original_img_filepath = predictions['filepath']
        boxes = predictions['boxes']
        scores = predictions['scores']
        classes = predictions['pred_classes']

        # sets the number of predicted bounding boxes
        n_pred = len(scores)

        # we now find the XML and JPG files corresponding to this predictions JSON
        xml_filepath = S3_SAVE_DIR + dir_name + json_filepath.replace('.json', '.xml')
        jpg_filepath = S3_SAVE_DIR + dir_name + json_filepath.replace('.json', '.jpg')

        # stores list of OCR
        ocr = []

        # we only try to retrieve the OCR if there is one or more predicted box
        if n_pred > 0:
            ocr = retrieve_ocr_for_file(xml_filepath, jpg_filepath, im_width, im_height, boxes, classes)

        # adds the ocr field to the JSON metadata for the page
        predictions['ocr'] = ocr

        # we save the updated JSON
        with open(json_filepath, 'w') as f:
            json.dump(predictions, f)

        if ct % 10 == 0:
            print(ct)

        ct += 1


# This cell defines a function for cropping all of the predicted visual content:

In [7]:
def crop(packet):
            
    OUTPUT_SAVE_DIR = packet[0]
    S3_SAVE_DIR = packet[1]
    dir_name = packet[2]
    json_filepaths = packet[3]
    
    os.chdir(OUTPUT_SAVE_DIR+dir_name)
    
    ct = 0
    
    for json_filepath in json_filepaths:
        
        # we load the JSON
        with open(json_filepath) as f:
            predictions = json.load(f)
          
        # load in boxes
        boxes = predictions['boxes']
        scores = predictions['scores']
        classes = predictions['pred_classes']
        
        # grab filepath of image
        jpg_filepath = S3_SAVE_DIR + dir_name + json_filepath.replace('.json', '.jpg')

        # open image
        im = Image.open(jpg_filepath)
        
        # empty list for storing embeddings
        img_embeddings = []
        
        # empty list or storing filepaths of extracted visual content
        content_filepaths = []

        # iterate through boxes, crop, and send to embedding
        for i in range(0, len(boxes)):
            box = boxes[i]
            pred_class = classes[i]
            
            # if it's a headline, we skip the embedding generation
            if pred_class == 5:
                img_embeddings.append([])
                content_filepaths.append([])
                continue
                
            # crop image according to box (converted from normalized coordinates to image coordinates)
            cropped = im.crop((box[0]*im.width, box[1]*im.height, box[2]*im.width, box[3]*im.height)).convert('RGB')
            # save cropped image to output directory
            cropped_filepath = json_filepath.replace(".json", "_" + str(i) + ".jpg")
            cropped.save(cropped_filepath)
            content_filepaths.append(cropped_filepath.split('/')[-1])
            
        # add filepaths of extracted visual content to output
        predictions['visual_content_filepaths'] = content_filepaths
        
        # we save the updated JSON
        with open(json_filepath, 'w') as f:
            json.dump(predictions, f)
    
        if ct % 10 == 0:
            print(ct)

        ct += 1

# This cell defines a function for generating embeddings of each predicted box:

In [8]:
from img2vec_pytorch import Img2Vec

def generate_embeddings(zipped):
    
    # unzips packed information for process to perform predictions
    
    OUTPUT_SAVE_DIR = zipped[0]
    S3_SAVE_DIR = zipped[1]
    dir_name = zipped[2]
    json_filepaths = zipped[3]
    ID = zipped[4]

    with torch.cuda.device(ID):

        # load in img2vec
        # we choose resnet-18 embeddings
        img2vec = Img2Vec(cuda=True, model='resnet-18') 
        
        ct = 0

        # iterate through the JSON files
        for json_filepath in json_filepaths:
            
            # we load the JSON
            with open(json_filepath) as f:
                predictions = json.load(f)

            # load in boxes
            boxes = predictions['boxes']
            scores = predictions['scores']
            classes = predictions['pred_classes']
            cropped_filepaths = predictions['visual_content_filepaths']

            # grab filepath of image
            jpg_filepath = S3_SAVE_DIR + dir_name + json_filepath.replace('.json', '.jpg')

            # empty list for storing embeddings
            img_embeddings = []

            # iterate through boxes, crop, and send to embedding
            for i in range(0, len(boxes)):
                box = boxes[i]
                pred_class = classes[i]
                cropped_filepath = cropped_filepaths[i]

                # if it's a headline, we skip the embedding generation
                if pred_class == 5:
                    img_embeddings.append([])
                    continue

                # open cropped image
                im = Image.open(cropped_filepath).convert('RGB')
                # generate embedding using img2vec
                embedding = img2vec.get_vec(im, tensor=False)
                # add to list (render embedding numpy array as list to enable JSON serialization)
                img_embeddings.append(embedding.tolist())

            # add embeddings to output
            predictions['embeddings'] = img_embeddings

            # we save the updated JSON
            with open(json_filepath, 'w') as f:
                json.dump(predictions, f)

            if ct % 10 == 0:
                print(ct)

            ct += 1
        

  "please use transforms.Resize instead.")


# The cells below run the multiprocessing, according to the following procedure:

The outermost loop iterates over manifests for each newspaper.  Each manifest (a text file) contains a list of images (corresponding to newspaper pages).  Iterating over these, the code is optimized as follows:

1. Download files from manifest (this is done on the CPU using multiprocessing by partitioning the processing into equal chunks of files).

2. Predict on the downloaded image using finetuned Detectron2 model (this is spread across the GPUs available using multiprocessing).

3. Extract captions from the METS/ALTO OCR (this is done on the CPU using multiprocessing in the same fashion as 1.)

4. Generate embeddings for identified visual content (this is spread across the GPUs available using multiprocessing).

In [9]:
# function that splits a list into n chunks for multiprocessing
def chunk(file_list, n_chunks):
    
    # make chunks of files to be distributed across processes
    chunks = []
    chunk_size = math.ceil(float(len(file_list))/n_chunks)
    for i in range(0, n_chunks-1):
        chunks.append(file_list[i*chunk_size:(i+1)*chunk_size])
    chunks.append(file_list[(n_chunks-1)*chunk_size:])
    
    return chunks

In [10]:
# function that determines whether the JPG and XML files exist for specified files
def files_exist(filepaths):
    
    ct = 0
    
    no_jpg = []
    no_xml = []
    good_filepaths = []
    s3 = s3fs.S3FileSystem()
    for filepath in filepaths:
        if not s3.exists('ndnp-jpeg-surrogates/' + filepath.replace(".jp2", ".jpg")):
            no_jpg.append(filepath)
            continue
        if not s3.exists('ndnp-batches/' + filepath.replace(".jp2", ".xml")):
            no_xml.append(filepath.replace(".jp2", ".xml"))
            continue
        good_filepaths.append(filepath)
        
        if ct % 10 == 0:
            print(ct)
        ct += 1
        
    return [good_filepaths, no_jpg, no_xml]

In [11]:
from multiprocessing import Pool, get_context, Process, set_start_method
from collections import ChainMap
import shutil

# need main for setting multiprocessing start method to spawn
if __name__ == '__main__':
    
    # sets directory location where the notebook is
    NOTEBOOK_DIR = os.getcwd()
    os.chdir('../')
    # sets destination for saving downloaded S3 files
    S3_SAVE_DIR = os.getcwd() + '/chronam_files/'
    # sets destination for output files, containing new metadata
    OUTPUT_SAVE_DIR = os.getcwd() + '/chronam_output/'
    os.chdir('notebooks/')

    # construct the directories
    if not os.path.isdir(S3_SAVE_DIR):
        os.mkdir(S3_SAVE_DIR)
    if not os.path.isdir(OUTPUT_SAVE_DIR):
        os.mkdir(OUTPUT_SAVE_DIR)

    # sets boto3 to run with s3
    s3 = boto3.resource('s3')

    # sets batch size for GPU inference
    INFERENCE_BATCH_SIZE = 4

    # sets number of processes (be careful based on number of available cores)
    N_CPU_PROCESSES = 48

    # sets number of GPUs available
    N_GPUS = torch.cuda.device_count()

    # sets multiprocessing pool
    pool = Pool(N_CPU_PROCESSES)  

    # sets start method to spawn for GPU multiprocessing
    ctx = get_context('forkserver')

    # grabs all of the manifests
    ## CURRENTLY IN CHRONAM-GET-IMAGES; EVENTUALLY USE IN NEWSPAPER NAVIGATOR
    manifests = glob.glob("../../chronam-get-images/manifests/*.txt")

    # now we iterate over all of the manifests
    for manifest in manifests:

        # sets directory name
        dir_name = manifest.split('/')[-1][:-4] + "/"

        # first, we make the subdirectories for this manifest
        if not os.path.isdir(S3_SAVE_DIR + dir_name):
            os.mkdir(S3_SAVE_DIR + dir_name)
        if not os.path.isdir(OUTPUT_SAVE_DIR + dir_name):
            os.mkdir(OUTPUT_SAVE_DIR + dir_name)

        # read manifest
        page_filepaths = open(manifest, "r").read().split('\n')[:100]
        
        # if there are no files in the manifest, we skip over this newspaper manifest
        if len(page_filepaths) == 0:
            continue

        print("processing manifest: " + str(dir_name) + " (" + str(len(page_filepaths)) + " files)")

        print("validating filepaths...")

        # we check to ensure that all of these files exist; if some don't, we save the filepaths separately from the
        # main execution path
        packed_list = pool.imap(files_exist, chunk(page_filepaths, N_CPU_PROCESSES))

        good_filepaths = []
        no_jpg = []
        no_xml = []
        # we now unroll the lists from the different processes
        for contents in packed_list:
            good_filepaths.extend(contents[0])
            no_jpg.extend(contents[1])
            no_xml.extend(contents[2])

        # make sure all of the files have been tested
        assert len(page_filepaths) == len(good_filepaths) + len(no_jpg) + len(no_xml)

        # we now write this info to an out file
        with open(OUTPUT_SAVE_DIR + dir_name + 'good_filepaths.txt', 'w') as f:
            for filepath in good_filepaths:
                f.write("%s\n" % filepath)

        with open(OUTPUT_SAVE_DIR + dir_name + 'no_jpg.txt', 'w') as f:
            for filepath in no_jpg:
                f.write("%s\n" % filepath)

        with open(OUTPUT_SAVE_DIR + dir_name + 'no_xml.txt', 'w') as f:
            for filepath in no_xml:
                f.write("%s\n" % filepath)

        # now we cd into the directory for the computations
        os.chdir(S3_SAVE_DIR + dir_name)

        # runs multiprocess for downloading of files in manifest
        print("retrieving files for manifest...")
        # chunks good filepaths for multiprocessing
        good_filepath_chunks = chunk(good_filepaths, N_CPU_PROCESSES)
        # adds directory to cd into (each process starts in local path of notebook)
        for i in range(0, len(good_filepath_chunks)):
            good_filepath_chunks[i] = [S3_SAVE_DIR + dir_name, good_filepath_chunks[i]]
        # calls the multiprocessing
        image_size_dicts = pool.imap(retrieve_files, good_filepath_chunks)

        # we now combine the dictionaries into one
        image_size_dict = dict(ChainMap(*image_size_dicts))
        
        # now we generate predictions on all of the downloaded files
        print("predicting on pages...")
        
        # FOR MULTIPROCESSING
        chunked_image_filepaths = chunk(glob.glob("*.jpg"), N_GPUS)

        # https://stackoverflow.com/questions/31386613/python-multiprocessing-what-does-process-join-do
        processes = []
        for i in range(0, N_GPUS):
            zipped = [S3_SAVE_DIR, OUTPUT_SAVE_DIR, dir_name, INFERENCE_BATCH_SIZE, chunked_image_filepaths[i], i]
            p = ctx.Process(target=generate_predictions, args=(zipped,))
            p.start()
            processes.append(p)
            
        for process in processes:
            process.join()
            

        # now, we cd into the directory containing the output files
        os.chdir(OUTPUT_SAVE_DIR + dir_name)

        # now, we grab the JSON predictions and append on image width and height so the data can be zipped 
        # for multiprocessing
        # we want to pass these to the OCR retrieval function because they are necessary to compute bounding
        # boxes relative to METS/ALTO OCR, and opening the image using PIL or the equivalent is costly due to
        # the latency in loading the image into memory
        json_filepaths = glob.glob("*.json")

        # grabs the 
        json_info = []

        for json_filepath in json_filepaths:
            im_width, im_height = image_size_dict[json_filepath.replace('.json', '.jpg')]
            json_info.append([json_filepath, im_width, im_height]) 

        chunked_json_info = chunk(json_info, N_CPU_PROCESSES)
        for i in range(0, len(chunked_json_info)):
            chunked_json_info[i] = [OUTPUT_SAVE_DIR, dir_name, chunked_json_info[i]]

        print("grabbing OCR...")
        pool.map(retrieve_ocr, chunked_json_info) 
        
        print("cropping images...")
        zipped = chunk(json_filepaths, N_CPU_PROCESSES)
        for i in range(0, len(zipped)):
            zipped[i] = [OUTPUT_SAVE_DIR, S3_SAVE_DIR, dir_name, zipped[i]]
        pool.map(crop, zipped)

        print("generating embeddings...")
        
        # FOR MULTIPROCESSING
        chunked_json_filepaths = chunk(json_filepaths, N_GPUS)

        # https://stackoverflow.com/questions/31386613/python-multiprocessing-what-does-process-join-do
        processes = []
        for i in range(0, N_GPUS):
            zipped = [OUTPUT_SAVE_DIR, S3_SAVE_DIR, dir_name, chunked_json_filepaths[i], i]
            p = ctx.Process(target=generate_embeddings, args=(zipped,))
            p.start()
            processes.append(p)
            
        for process in processes:
            process.join()

        # now, we cd back into the 'save' directory 
        os.chdir(OUTPUT_SAVE_DIR)

        # we zip the contents
        shutil.make_archive(dir_name[:-1], 'zip', dir_name)

        # we copy the zipped file over to my S3 bucket
        s3.Bucket('bcgl-bucket').upload_file(dir_name[:-1] + '.zip', "chronam_processed/" + dir_name[:-1] + '.zip')

        # we now remove the zipped file to free up disk space
        os.remove(dir_name[:-1] + '.zip')
        
        os.chdir(dir_name)
        
        # we now compute stats on the processed newspaper and save as json
        stats = {}
        for path in glob.glob("*.json"):
            # loads the JSON
            with open(path) as f:
                data = json.load(f)
                pred_classes = data["pred_classes"]
                pub_date = data["pub_date"]
                img_filepaths = data["visual_content_filepaths"]
                page_stats = {"pub_date": pub_date, "pred_classes": pred_classes, "visual_content_filepaths": img_filepaths}
                stats[data["filepath"]] = page_stats
                
        with open(dir_name[:-1] + "_stats.json", "w") as fp:
            json.dump(stats, fp)
        
        # we write the JSON stats file to S3 bucket
        s3.Bucket('bcgl-bucket').upload_file(dir_name[:-1] + "_stats.json", "chronam_processed/" + dir_name[:-1] + "_stats.json")

        # we now remove the folder & its contents to free up disk space
        for path in glob.glob("*.json"):
            os.remove(path)
        for path in glob.glob("*.txt"):
            os.remove(path)
        for path in glob.glob("*.jpg"):
            os.remove(path)
        os.chdir('../')
        os.rmdir(os.getcwd() + "/" + dir_name)

        # navigate to the ChronAm pages, remove them (as well as the empty folder) 
        # and navigate back to the notebook directory
        os.chdir(S3_SAVE_DIR + dir_name)

        for path in glob.glob("*.xml"):
            os.remove(path)
        for path in glob.glob("*.jpg"):
            os.remove(path)

        os.chdir('../')
        os.rmdir(os.getcwd() + "/" + dir_name)
        os.chdir("../notebooks/")

        sys.exit()

processing manifest: mohi_james_ver01/ (100 files)
validating filepaths...
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
retrieving files for manifest...
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
predicting on pages...
0
20
40
60
80
0
0
0
0
0
0
grabbing OCR...
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
cropping images...
generating embeddings...
0
10
20
30
40
50
60
70
80
90


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


# TO-DO Before Full Run:

- add ability to re-start (find current output files in S3 bucket, check against this list)
- add ability to filter by date range
- add updated manifests to newspaper-navigator repo
- add finetuned model weights to repo, once benchmarked

# TO-DO After:

- bounding box logic for filtering redundant boxes

