# This notebook runs the finetuned image recognition model over Chronicling America pages

Using the manifests folder generated by the chronam-get-images repo (https://github.com/bcglee/chronam-get-images), this notebook systematically downloads the JPG files from the 'ndnp-jpeg-surrogates' S3 bucket and the XML files from the 'ndnp-batches' S3 bucket. It then generates predictions for each JPG image, grabs captions from the METS/ALTO OCR in the XML for each image, saves all of this metadata in JSON format, and finally deletes the downloaded files.

In [1]:
import boto3
import botocore
import glob
import sys
import os
import time
from PIL import Image
import io
import math
import datetime

# sets destination for saving downloaded S3 files and destination for output files
# (both are relative to the directory the notebook is run from)
S3_SAVE_DIR = '../chronam_files/'
OUTPUT_SAVE_DIR = '../chronam_output/'

# creates both directories if they are missing; exist_ok avoids the
# check-then-create race and makes re-running this cell a no-op
os.makedirs(S3_SAVE_DIR, exist_ok=True)
os.makedirs(OUTPUT_SAVE_DIR, exist_ok=True)

# sets boto3 to run with s3
s3 = boto3.resource('s3')

# function that retrieves .jpg and .xml files for each filepath in manifest
def retrieve_files(manifest, dir_name, max_pages=100):
    """Download the page images and OCR XML listed in a manifest.

    For every page path in the manifest, the JPG is fetched from the
    'ndnp-jpeg-surrogates' bucket, downsized by a factor of 6 and saved
    under S3_SAVE_DIR + dir_name; the matching XML is downloaded from the
    'ndnp-batches' bucket next to it. Slashes in the S3 key are replaced by
    underscores to form the local file name.

    Args:
        manifest: path of a text file with one '.jp2' page path per line.
        dir_name: subdirectory of S3_SAVE_DIR (with trailing slash) that
            receives the downloaded files.
        max_pages: maximum number of manifest entries to process; None
            processes the whole manifest. Defaults to 100, the limit that
            was previously hard-coded.

    Returns:
        dict mapping each local JPG file name to the (width, height) of the
        image as saved on disk. The predictions are made on the saved
        (downsized) image, so these are the dimensions the predicted boxes
        are expressed in.

    Raises:
        botocore.exceptions.ClientError: for any S3 error other than a
            missing object.
    """
    # Object.get() reports a missing key as "NoSuchKey", whereas
    # download_file (which issues a HEAD request first) reports "404"
    missing_codes = ("404", "NoSuchKey")

    # first, we make the subdirectory for this manifest
    os.makedirs(S3_SAVE_DIR + dir_name, exist_ok=True)

    # creates dict for storing widths/heights of images
    im_size_dict = {}

    # read manifest, dropping blank lines (e.g. the one after a trailing newline)
    with open(manifest, "r") as manifest_file:
        page_filepaths = [line.strip() for line in manifest_file if line.strip()]

    if max_pages is not None:
        page_filepaths = page_filepaths[:max_pages]

    # iterate through each filepath and download
    for ct, page_filepath in enumerate(page_filepaths):

        # the manifest lists .jp2 files, so the suffix is swapped for each bucket
        jpg_key = page_filepath.replace(".jp2", ".jpg")
        xml_key = page_filepath.replace(".jp2", ".xml")

        # sets filepath for download destination
        flat_name = page_filepath.replace('/', '_').replace('.jp2', '.jpg')
        local_filepath = S3_SAVE_DIR + dir_name + flat_name

        # see: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/migrations3.html
        # see also:  https://www.edureka.co/community/17558/python-aws-boto3-how-do-i-read-files-from-s3-bucket
        try:
            body = s3.Object('ndnp-jpeg-surrogates', jpg_key).get()['Body'].read()
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] not in missing_codes:
                raise
            print("The object does not exist.")
        else:
            im = Image.open(io.BytesIO(body))
            resized = im.resize((math.floor(im.width/6), math.floor(im.height/6)), resample=0)
            resized.save(local_filepath)
            # record the size of the image that is actually saved and predicted on
            im_size_dict[flat_name] = resized.size

        try:
            s3.Bucket('ndnp-batches').download_file(xml_key, local_filepath.replace(".jpg", ".xml"))
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] not in missing_codes:
                raise
            print("The object does not exist.")

        # progress indicator
        if ct % 10 == 0:
            print(str(ct))

    return im_size_dict

# This next cell loads the finetuned model and defines the function for performing predictions on the images saved above

In [2]:
# import some common libraries
import cv2
import random
import json
import numpy as np
import matplotlib.pyplot as plt
# import detectron2, etc.
import detectron2
from detectron2.config import get_cfg
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import register_coco_instances
from detectron2.engine import DefaultTrainer
from detectron2.engine import DefaultPredictor
from detectron2.evaluation import COCOEvaluator
from detectron2.utils.visualizer import Visualizer
from detectron2.utils.visualizer import ColorMode
from detectron2.utils.logger import setup_logger

# turns on detectron2's logging output
setup_logger()

# starts from detectron2's default configuration
cfg = get_cfg()
# merges in the Faster R-CNN (R50-FPN, 3x schedule) architecture settings from the detectron2 config file
cfg.merge_from_file("../..//detectron2/configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml")
# points the model at the locally saved weights file
# (the notebook header describes this as the finetuned model, not a Model Zoo download)
cfg.MODEL.WEIGHTS = "../model_weights/model_final.pth"
# sets number of object classes
# (5:  "Illustration/Photograph", "Photograph", "Comics/Cartoon", "Editorial Cartoon", "Map")
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 5
# helps with re-sizing latency (see https://github.com/facebookresearch/detectron2/issues/73)
# (currently disabled; NOTE(review): the second key was written as cfg.INPUT_MAX_SIZE_TEST,
# which looks like a typo for cfg.INPUT.MAX_SIZE_TEST - confirm before enabling)
# cfg.INPUT.MIN_SIZE_TEST = 0
# cfg.INPUT.MAX_SIZE_TEST = 0

# builds the predictor from the configuration above; it is called on one image at a time below
predictor = DefaultPredictor(cfg)

- need to figure out naming scheme for saving json files
- need to add OCR caption extraction
- need to progressively delete xml and jpg files to keep room

In [3]:
def generate_predictions(manifest, dir_name, predictor):
    """Run the predictor over every downloaded JPG and save one JSON per page.

    The working directory is changed to S3_SAVE_DIR + dir_name and is left
    there on return. For each '*.jpg' found, a JSON file with the keys
    "file_name", "date", "boxes", "scores" and "pred_classes" is written to
    OUTPUT_SAVE_DIR under the same base name.

    Args:
        manifest: unused here; kept for a uniform call signature.
        dir_name: subdirectory of S3_SAVE_DIR holding the downloaded pages.
        predictor: callable taking an image array and returning a dict
            with an "instances" entry.
    """
    # move into the folder holding this manifest's pages
    os.chdir(S3_SAVE_DIR + dir_name)

    for page_index, jpg_name in enumerate(glob.glob('*.jpg')):

        # rebuild the ChronAm-style path from the flattened local file name
        # NOTE(review): every underscore becomes a slash, including any that
        # were part of the original path - confirm this is intended
        chronam_path = jpg_name.replace('_', '/')

        # the second-to-last path component starts with the date as YYYYMMDD
        stamp = chronam_path.split('/')[-2]
        published = datetime.date(int(stamp[:4]), int(stamp[4:6]), int(stamp[6:8]))

        # load the image and run the model on it
        fields = predictor(cv2.imread(jpg_name))["instances"].get_fields()

        # gather everything that gets serialized for this page
        record = {
            "file_name": chronam_path,
            "date": str(published),
            "boxes": fields["pred_boxes"].to("cpu").tensor.tolist(),
            "scores": fields["scores"].to("cpu").tolist(),
            "pred_classes": fields["pred_classes"].to("cpu").tolist(),
        }

        # progress indicator
        if page_index % 10 == 0:
            print(page_index)

        json_path = "../" + OUTPUT_SAVE_DIR + jpg_name.replace('.jpg', '.json')
        with open(json_path, "w") as out_file:
            json.dump(record, out_file)

# Next, we extract the OCR within each predicted box

- the first cell handles the function for returning the proper OCR
- the second cell iterates over the JSON files containing the predictions

In [4]:
import xml.etree.ElementTree as ET

# returns the (left, top, right, bottom) edges of an ALTO element, multiplied by scale
def _scaled_rect(element, scale):
    left = int(float(element.attrib["HPOS"]))
    top = int(float(element.attrib["VPOS"]))
    right = left + int(float(element.attrib["WIDTH"]))
    bottom = top + int(float(element.attrib["HEIGHT"]))
    return (left * scale, top * scale, right * scale, bottom * scale)


# given a file path and a list of bounding boxes, this function traverses the associated XML
# and returns the OCR within each bounding box
def retrieve_ocr_for_file(xml_filepath, true_img_filepath, page_width_pix, page_height_pix, bounding_boxes, predicted_classes):
    """Collect the OCR strings that fall strictly inside each bounding box.

    Args:
        xml_filepath: path (or file object) of the OCR XML for the page.
        true_img_filepath: unused; kept for interface compatibility.
        page_width_pix: unused; kept for interface compatibility.
        page_height_pix: page height in the same pixel units as the
            bounding boxes; used with the Page HEIGHT attribute to scale
            the XML coordinates.
        bounding_boxes: sequence of [x_min, y_min, x_max, y_max] boxes.
        predicted_classes: unused; kept for interface compatibility.

    Returns:
        A list with one list of CONTENT strings per bounding box, in the
        order the strings appear in the XML. Every list is empty when the
        XML has no Layout/Page/PrintSpace element.
    """
    # creates empty nested list for storing OCR in each box
    ocr = [[] for _ in bounding_boxes]

    root = ET.parse(xml_filepath).getroot()

    # sets tag prefix (everywhere); empty when the root carries no namespace
    prefix = root.tag.split('}')[0] + '}' if '}' in root.tag else ''

    # traverses to layout and then the page and then the print space;
    # a page without a print space has no text to return
    layout = root.find(prefix + 'Layout')
    page = layout.find(prefix + 'Page') if layout is not None else None
    print_space = page.find(prefix + 'PrintSpace') if page is not None else None
    if print_space is None:
        return ocr

    # finds all of the text boxes on the page
    text_boxes = print_space.findall(prefix + 'TextBlock')

    # sets conversion from XML units to pixels, based on the page height
    conversion = float(page_height_pix) / float(page.attrib['HEIGHT'])

    # we now iterate over each bounding box
    for found, bounding_box in zip(ocr, bounding_boxes):

        x_min, y_min, x_max, y_max = bounding_box[:4]

        # we then iterate over each text box
        for text_box in text_boxes:

            box_left, box_top, box_right, box_bottom = _scaled_rect(text_box, conversion)

            # if the text box and bounding box do not intersect, we skip; this
            # assumes every String lies within its enclosing TextBlock
            if box_right < x_min or box_left > x_max or box_bottom < y_min or box_top > y_max:
                continue

            # we then iterate over the text lines (atomic text units) in the box
            for text_line in text_box.findall(prefix + 'TextLine'):

                # each String is one whitespace-separated token of the line
                for string in text_line.findall(prefix + 'String'):

                    left, top, right, bottom = _scaled_rect(string, conversion)

                    # keeps the text only if it lies strictly inside the bounding box
                    if left > x_min and right < x_max and top > y_min and bottom < y_max:
                        found.append(string.attrib["CONTENT"])

    return ocr

In [5]:
def retrieve_ocr(manifest, dir_name, image_size_dict):
    """Add an 'ocr' field to every predictions JSON in the output directory.

    The working directory is changed to "../" + OUTPUT_SAVE_DIR (relative
    to the current one) and is left there on return. Each '*.json' file
    found is loaded, the OCR inside its predicted boxes is looked up in the
    matching XML under S3_SAVE_DIR + dir_name, and the file is rewritten
    in place. Files whose XML is missing are reported and left untouched.

    Args:
        manifest: unused here; kept for a uniform call signature.
        dir_name: subdirectory of S3_SAVE_DIR holding the downloaded pages.
        image_size_dict: maps JPG file name to its (width, height); passed
            along so the image does not have to be reopened just to read
            its dimensions.
    """
    # move into the folder holding the predictions
    os.chdir("../" + OUTPUT_SAVE_DIR)

    pages_prefix = S3_SAVE_DIR + dir_name
    done = 0

    # walk over every predictions file
    for json_name in glob.glob("*.json"):

        with open(json_name) as src:
            record = json.load(src)

        # pull the relevant fields off the record
        _, boxes, scores, classes = (
            record[key] for key in ('file_name', 'boxes', 'scores', 'pred_classes')
        )

        # locate the XML and JPG that belong to this predictions file
        jpg_name = json_name.replace('.json', '.jpg')
        xml_path = pages_prefix + json_name.replace('.json', '.xml')
        jpg_path = pages_prefix + jpg_name

        # nothing can be extracted without the XML
        if not os.path.exists(xml_path):
            print("MISSING XML")
            continue

        # the page dimensions are needed to relate the boxes to the XML coordinates
        width, height = image_size_dict[jpg_name]

        # the XML is only parsed when at least one box was predicted
        if len(scores) > 0:
            record['ocr'] = retrieve_ocr_for_file(xml_path, jpg_path, width, height, boxes, classes)
        else:
            record['ocr'] = []

        # write the enriched record back over the original file
        with open(json_name, 'w') as dst:
            json.dump(record, dst)

        # progress indicator
        if done % 10 == 0:
            print(done)

        done += 1

In [None]:
# grabs all of the manifests
## CURRENTLY IN CHRONAM-GET-IMAGES; EVENTUALLY USE IN NEWSPAPER NAVIGATOR
manifests = glob.glob("../../chronam-get-images/manifests/*.txt")

# the processing functions change the working directory, and every path in this
# notebook is relative, so we remember where we started and return after each manifest
notebook_dir = os.getcwd()

# only the first manifest is processed for now
for manifest in manifests[:1]:
    # the manifest's base name (without ".txt") names the subdirectory for its pages
    dir_name = os.path.splitext(os.path.basename(manifest))[0] + "/"

    # resolved up front so the clean-up below does not depend on the working directory
    pages_dir = os.path.abspath(S3_SAVE_DIR + dir_name)

    try:
        print("retrieving files for manifest...")
        image_size_dict = retrieve_files(manifest, dir_name)
        print("predicting on pages...")
        generate_predictions(manifest, dir_name, predictor)
        print("grabbing OCR...")
        retrieve_ocr(manifest, dir_name, image_size_dict)

        # remove the downloaded ChronAm pages now that their JSON output is saved
        # (a leftover sys.exit() previously made this clean-up unreachable)
        for pattern in ("*.xml", "*.jpg"):
            for path in glob.glob(os.path.join(pages_dir, pattern)):
                os.remove(path)
    finally:
        # navigate back to the notebook directory, even if a step above failed
        os.chdir(notebook_dir)

retrieving files for manifest...
0
10
20
30
40
50
60
70
80
90
predicting on pages...
0
10
20
30
40
50
60
70
80
