# This notebook runs the finetuned image recognition model over Chronicling America pages

Using the manifests folder generated using the repo chronam-get-images (https://github.com/bcglee/chronam-get-images), this notebook systematically downloads the JPG files from the 'ndnp-jpeg-surrogates' S3 bucket and the XML files from the 'ndnp-batches' S3 bucket. This notebook then generates predictions for each JPG image, grabs captions from the METS/ALTO OCR in the XML for each image, saves all of this metadata in JSON format, and then deletes old files.  

In [1]:
import boto3
import botocore
import glob
import sys
import os
import time
from PIL import Image
import io
import math
import datetime

# sets destination for saving downloaded S3 files and destination for output files
S3_SAVE_DIR = '../chronam_files/'
OUTPUT_SAVE_DIR = '../chronam_output/'

# create both folders if they are missing; makedirs(exist_ok=True) avoids the
# check-then-create race of isdir() + mkdir() and also works for nested paths
for save_dir in (S3_SAVE_DIR, OUTPUT_SAVE_DIR):
    os.makedirs(save_dir, exist_ok=True)

# sets boto3 to run with s3
s3 = boto3.resource('s3')

# function that retrieves .jpg and .xml files for each filepath in manifest
def retrieve_files(manifest, max_files=11):
    """Download a downsampled JPG and the OCR XML for pages listed in a manifest.

    Each non-blank line of the manifest is treated as an S3 key ending in
    '.jp2'. For every key, the matching '.jpg' object is read from the
    'ndnp-jpeg-surrogates' bucket, shrunk to 1/6 of its width and height and
    saved under S3_SAVE_DIR; the matching '.xml' object is downloaded from the
    'ndnp-batches' bucket next to it. Local file names are the S3 key with
    '/' replaced by '_'.

    Args:
        manifest: path to a text file with one page filepath per line.
        max_files: maximum number of pages to process; the default of 11
            matches the previously hard-coded cut-off. Pass None to process
            the whole manifest.

    Raises:
        botocore.exceptions.ClientError: for any S3 error other than a
            missing object (missing objects are reported and skipped).
    """
    # S3 reports a missing key as "NoSuchKey" for GetObject (Object.get) and as
    # "404" for the HeadObject request issued by download_file, so accept both
    missing_codes = ("404", "NoSuchKey")

    # read manifest; the with-block closes the file, and blank lines (such as the
    # one produced by a trailing newline) are dropped so no empty key is requested
    with open(manifest, "r") as manifest_file:
        page_filepaths = [line for line in manifest_file.read().split('\n') if line.strip()]

    # iterate through each filepath and download
    for ct, page_filepath in enumerate(page_filepaths):

        # sets filepath for download destination (note: file is .jp2, so we need to replace suffixes below)
        local_filepath = S3_SAVE_DIR + page_filepath.replace('/', '_').replace('.jp2', '.jpg')

        # see: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/migrations3.html
        # see also:  https://www.edureka.co/community/17558/python-aws-boto3-how-do-i-read-files-from-s3-bucket
        try:
            obj = s3.Object('ndnp-jpeg-surrogates', page_filepath.replace(".jp2", ".jpg"))
            body = obj.get()['Body'].read()
            im = Image.open(io.BytesIO(body))
            # downsample by a factor of 6 in each dimension (resample=0 is nearest-neighbour)
            im.resize((math.floor(im.width/6), math.floor(im.height/6)), resample=0).save(local_filepath)

        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] in missing_codes:
                print("The object does not exist.")
            else:
                raise

        try:
            s3.Bucket('ndnp-batches').download_file(page_filepath.replace(".jp2", ".xml"), local_filepath.replace(".jpg", ".xml"))
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] in missing_codes:
                print("The object does not exist.")
            else:
                raise

        # progress report every 10 pages
        if ct % 10 == 0:
            print(str(ct))

        # stop once max_files pages have been handled (ct is zero-based)
        if max_files is not None and ct + 1 >= max_files:
            break

In [2]:
# grabs all of the manifests; glob.glob returns matches in arbitrary order, so the
# list is sorted to make the manifest selected below the same on every run
## CURRENTLY IN CHRONAM-GET-IMAGES; EVENTUALLY USE IN NEWSPAPER NAVIGATOR
manifests = sorted(glob.glob("../../chronam-get-images/manifests/*.txt"))

# only the first manifest is processed for now
for manifest in manifests[:1]:
    retrieve_files(manifest)

0
10


# The next two cells load the finetuned model and perform predictions on the images saved above

- first cell handles imports and loading in finetuned model
- second cell handles predictions

In [3]:
# import some common libraries
import cv2
import random
import glob
import os
import json
import numpy as np
import matplotlib.pyplot as plt
# import detectron2, etc.
import detectron2
from detectron2.config import get_cfg
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import register_coco_instances
from detectron2.engine import DefaultTrainer
from detectron2.engine import DefaultPredictor
from detectron2.evaluation import COCOEvaluator
from detectron2.utils.visualizer import Visualizer
from detectron2.utils.visualizer import ColorMode
from detectron2.utils.logger import setup_logger

# enable detectron2's logger output
setup_logger()

# start from detectron2's default config
cfg = get_cfg()
# merges in the Faster R-CNN (R50-FPN, 3x schedule) config file from a local detectron2 checkout
cfg.merge_from_file("../..//detectron2/configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml")
# loads model weights from a local file
# NOTE(review): this is a local path rather than a Model Zoo URL -- presumably the
# finetuned weights referred to in the notebook text; confirm
cfg.MODEL.WEIGHTS = "../model_weights/model_final.pth"
# sets number of object classes
# (5:  "Illustration/Photograph", "Photograph", "Comics/Cartoon", "Editorial Cartoon", "Map")
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 5
# disabled: turning off test-time resizing is said to help with re-sizing latency
# (see https://github.com/facebookresearch/detectron2/issues/73)
# cfg.INPUT.MIN_SIZE_TEST = 0
# cfg.INPUT.MAX_SIZE_TEST = 0

# builds the predictor used by the prediction cell from the config above
predictor = DefaultPredictor(cfg)

- need to figure out naming scheme for saving json files
- need to add OCR caption extraction
- need to progressively delete xml and jpg files to keep room

In [4]:
ct = 0

# resolve the output folder to an absolute path BEFORE changing directories, so the
# JSON files land in the configured folder regardless of the working directory
output_dir = os.path.abspath(OUTPUT_SAVE_DIR)

# now, we cd into the directory containing the downloaded files
# (the cleanup cell that follows globs relative to this working directory)
os.chdir(S3_SAVE_DIR)

# iterate through images
for im_filepath in glob.glob('*.jpg'):

    # progress report every 10 images
    if ct % 10 == 0:
        print(ct)
    ct += 1

    # cv2.imread returns None (instead of raising) when a file cannot be read or
    # decoded; report and skip such files rather than crashing inside the predictor
    image = cv2.imread(im_filepath)
    if image is None:
        print("Could not read image: " + im_filepath)
        continue

    predictions = {}

    # saves file name in format of ChronAm file structure
    # NOTE(review): the download step maps '/' to '_', so any underscore that was
    # already part of the original path is also turned into '/' here -- confirm
    # whether the original paths can contain underscores
    predictions["file_name"] = im_filepath.replace('_', '/')

    # saves date of publication (first 8 characters of the second-to-last path
    # component, read as YYYYMMDD)
    date_str = predictions["file_name"].split('/')[-2]
    predictions["date"] = str(datetime.date(int(date_str[:4]), int(date_str[4:6]), int(date_str[6:8])))

    # predicts on the loaded image
    outputs = predictor(image)

    # saves predictions as plain lists so they can be serialized to JSON
    fields = outputs["instances"].get_fields()
    predictions["boxes"] = fields["pred_boxes"].to("cpu").tensor.tolist()
    predictions["scores"] = fields["scores"].to("cpu").tolist()
    predictions["pred_classes"] = fields["pred_classes"].to("cpu").tolist()

    # one JSON file per image, named after the image file
    with open(os.path.join(output_dir, im_filepath.replace('.jpg', '.json')), "w") as fp:
        json.dump(predictions, fp)

0
10


Now, we delete the JPG and XML files to save space:

In [5]:
# remove the downloaded XML files first, then the JPG files; both patterns are
# matched relative to the current working directory
for pattern in ("*.xml", "*.jpg"):
    for stale_path in glob.glob(pattern):
        os.remove(stale_path)