# Download the images to a local directory

NOTE: this only needs to be done once for a given set of images. Once they are loaded into the bucket it doesn't need to be run again.

This code uses a list of accession numbers (found in a column of a CSV file) to generate IIIF Image API (v2) URLs for JPEG images that are 1000 pixels in the shortest dimension, then downloads the images into a local directory.

After generating and downloading the images, they need to be uploaded to the Google Cloud bucket used in the Vision analysis.

In [None]:
# google_cloud_vision.ipynb, a Jupyter notebook for analyzing images using the Google Cloud Vision API
version = '0.1.0'
created = '2023-03-27'

# (c) 2023 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf
# For more information, see https://github.com/HeardLibrary/linked-data/tree/master/image_analysis

# This script carries out three major tasks:
# 1. It downloads images from the Vanderbilt University Libraries IIIF server at a standard resolution of 1000 pixels in 
# the shortest dimension. If the image is already smaller than 1000 pixels in the shortest dimension, then the image is
# downloaded at the original resolution.
# 2. It analyzes the images using the Google Cloud Vision API using the FACE_DETECTION, LABEL_DETECTION, OBJECT_LOCALIZATION, and
# TEXT_DETECTION features. The results are saved to CSV files.
# 3. Optionally, it can generate an IIIF annotation file for each image that can be used to display the results of the
# object localization analysis of the image.

# -----------------------------------------
# Version 0.1.0 change notes (2023-03-27):
# - Initial version
# -----------------------------------------

import pandas as pd
import requests
import shutil # high-level file operations
from PIL import Image

# Locations of the input CSV files and of the downloaded images. Customize for your own computer.
base_path = '/Users/baskausj/github/vandycite/gallery_buchanan/image_analysis/'
download_path = '/Users/baskausj/Downloads/'

# Load the source image data into a dataframe
source_image_dataframe = pd.read_csv(base_path + 'combined_images.csv', dtype=str)
# Set the commons_id column as the index
source_image_dataframe = source_image_dataframe.set_index('commons_id')

#source_image_dataframe.head()

# Import CSV data as a dataframe.
accession_dataframe = pd.read_csv(base_path + 'accession_numbers_to_analyze.csv', dtype=str)

# Test with a single row
#accession_dataframe = accession_dataframe.head(1)

# The accession numbers and dimensions are collected as a list of dicts and turned into a dataframe
# after the loop (DataFrame.append was removed in pandas 2.0).
# 'max_height' and 'max_width' are the dimensions recorded in the source image data;
# 'height' and 'width' are the dimensions of the downloaded file.
dimension_columns = ['accession_number', 'height', 'width', 'max_height', 'max_width']
dimension_rows = []

# Accession numbers whose image could not be found or could not be read after download.
bad_image_list = []

# Loop through the dataframe rows and download the images.
for _, row in accession_dataframe.iterrows():
    accession_number = row['accession_number']
    print(accession_number)

    # Look up the image data in the source image dataframe.
    # In cases where there are two images, we want the primary image.
    is_primary_image = (source_image_dataframe['accession_number'] == accession_number) & (source_image_dataframe['rank'] == 'primary')
    primary_images = source_image_dataframe.loc[is_primary_image]
    if primary_images.empty:
        # Record the problem and move on rather than aborting the whole run.
        print('no primary image found')
        bad_image_list.append(accession_number)
        continue
    # The dataframe is indexed by commons_id, so pick the first match by position, not by label.
    image_record = primary_images.iloc[0]
    manifest_url = image_record['iiif_manifest']

    # get the manifest from the manifest url
    manifest = requests.get(manifest_url).json()
    #print(json.dumps(manifest, indent=2))
    service_url = manifest['sequences'][0]['canvases'][0]['images'][0]['resource']['service']['@id']
    # Because of the error in original manifests, replace version 3 with version 2 in the URL.
    #service_url = service_url.replace('/3/', '/2/') # This is no longer needed because the manifests have been fixed.
    #print('service_url', service_url)

    # Determine the maximum and minimum dimensions of the image.
    height = image_record['height']
    width = image_record['width']
    shortest_dimension = min(int(height), int(width))
    longest_dimension = max(int(height), int(width))

    # We want to know what the largest dimension needs to be for the shortest dimension to be 1000 pixels.
    # The result is capped at the actual longest dimension so the image is never enlarged.
    # If the shortest dimension is already 1000 pixels or less, then we just use the longest dimension as is.
    if shortest_dimension > 1000:
        size = min(int(1000 * (longest_dimension / shortest_dimension)), longest_dimension)
    else:
        size = longest_dimension
    #print('size', size)

    # construct the image url using the "!" size option, that keeps the aspect ratio but sizes to the maximum dimension.
    image_url = service_url + '/full/!' + str(size) + ',' + str(size) + '/0/default.jpg'
    print('image_url', image_url)
    print()

    # Retrieve the image from the IIIF server and save it to a file.
    # The context managers close both the HTTP connection and the file (which also flushes it to disk).
    image_path = download_path + 'google_vision_images/' + accession_number + '.jpg'
    with requests.get(image_url, stream=True) as image_response, open(image_path, 'wb') as out_file:
        shutil.copyfileobj(image_response.raw, out_file)

    # Find the dimensions of the downloaded image by opening the file from disk.
    # A file that is not a readable image (e.g. an error page saved as .jpg) is recorded and skipped.
    try:
        with Image.open(image_path) as image:
            reduced_width, reduced_height = image.size
    except (OSError, ValueError):
        print('bad image')
        bad_image_list.append(accession_number)
        continue

    # Add the accession number and dimensions to the collected rows
    dimension_rows.append({'accession_number': accession_number, 'max_height': height, 'max_width': width, 'height': reduced_height, 'width': reduced_width})

# Save the dimensions to a CSV file
accession_dimensions_dataframe = pd.DataFrame(dimension_rows, columns=dimension_columns)
accession_dimensions_dataframe.to_csv(base_path + 'accession_dimensions.csv', index=False)

print('bad_image_list', bad_image_list)

print('done')


# Google Cloud Vision image analysis


In [None]:
# This code is part of google_cloud_vision.ipynb
# For licensing and other information, see https://github.com/HeardLibrary/linked-data/tree/master/image_analysis

# Here's the landing page for Google Cloud Vision
# https://cloud.google.com/vision/
# From it you can try the api by dragging and dropping an image into the browser. You can then 
# view the JSON response, which was helpful at first to understand the structure of the response.

# The following tutorial contains critical information about enabling the API and creating a role
# for the service account to allow it access. This is followed by creating a service account key.
# https://cloud.google.com/vision/docs/detect-labels-image-client-libraries

# I didn't actually do this tutorial, but it was useful to understand the order of operations that
# needed to be done prior to writing to the API.
# https://www.cloudskillsboost.google/focuses/2457?parent=catalog&utm_source=vision&utm_campaign=cloudapi&utm_medium=webpage
# Because I'm using the Python client library, the part about setting up the request body was irrelevant. 
# But the stuff about uploading the files to the bucket, making it publicly accessible, etc. was helpful.
import json
import pandas as pd
from typing import List, Dict, Tuple, Any, Optional

# Imports the Google Cloud client library
# Reference for Google Cloud Vision Python client https://cloud.google.com/python/docs/reference/vision/latest
from google.cloud import vision
from google.cloud import vision_v1
from google.cloud.vision_v1 import AnnotateImageResponse

# Import from Google oauth library
from google.oauth2 import service_account

def extract_object_localization_data(accession_number: str, annotation: Dict[str, Any], width: int, height: int) -> Dict[str, Any]:
    """Extract the object localization data from a hit and turn it into a dict to be added as a row in the dataframe.

    Args:
        accession_number: Identifier of the analyzed image; copied into the row unchanged.
        annotation: One object localization annotation, read as a dict with the keys 'name',
            'score', and 'boundingPoly' (whose 'normalizedVertices' hold relative 'x'/'y' values).
        width: Pixel width used to scale the relative x coordinates.
        height: Pixel height used to scale the relative y coordinates.

    Returns:
        A dict with the description, score, the relative bounding box edges (rel_*), and the
        same edges scaled by width/height and rounded to whole pixels (abs_*).
    """
    # Vertex 0 supplies the left/top edges and vertex 2 supplies the right/bottom edges.
    vertices = annotation['boundingPoly']['normalizedVertices']
    left_x, top_y = vertices[0]['x'], vertices[0]['y']
    right_x, bottom_y = vertices[2]['x'], vertices[2]['y']

    return {
        'accession_number': accession_number,
        'description': annotation['name'],
        'score': annotation['score'],
        'rel_left_x': left_x,
        'rel_right_x': right_x,
        'rel_top_y': top_y,
        'rel_bottom_y': bottom_y,
        'abs_left_x': round(left_x * width),
        'abs_right_x': round(right_x * width),
        'abs_top_y': round(top_y * height),
        'abs_bottom_y': round(bottom_y * height),
    }

def extract_face_detection_data(accession_number: str, annotation: Dict[str, Any], width: int, height: int) -> Dict[str, Any]:
    """Extract the face detection data from a hit and turn it into a dict to be added as a row in the dataframe.

    Args:
        accession_number: Identifier of the analyzed image; copied into the row unchanged.
        annotation: One face annotation, read as a dict with 'detectionConfidence', 'boundingPoly'
            (whose 'vertices' hold pixel 'x'/'y' values), the three angle keys,
            'landmarkingConfidence', and the '...Likelihood' keys.
        width: Pixel width used to convert x coordinates to relative values.
        height: Pixel height used to convert y coordinates to relative values.

    Returns:
        A dict with the score, the bounding box edges as fractions of width/height (rel_*),
        the same edges in pixels (abs_*), and the angle, confidence, and likelihood values
        copied from the annotation under snake_case keys.
    """
    # Vertex 0 supplies the left/top edges and vertex 2 supplies the right/bottom edges.
    vertices = annotation['boundingPoly']['vertices']
    left_x, top_y = vertices[0]['x'], vertices[0]['y']
    right_x, bottom_y = vertices[2]['x'], vertices[2]['y']

    row = {
        'accession_number': accession_number,
        'score': annotation['detectionConfidence'],
        'rel_left_x': left_x / width,
        'rel_right_x': right_x / width,
        'rel_top_y': top_y / height,
        'rel_bottom_y': bottom_y / height,
        'abs_left_x': left_x,
        'abs_right_x': right_x,
        'abs_top_y': top_y,
        'abs_bottom_y': bottom_y,
    }

    # Remaining values are copied straight across: output column name -> key in the annotation.
    copied_fields = (
        ('roll_angle', 'rollAngle'),
        ('pan_angle', 'panAngle'),
        ('tilt_angle', 'tiltAngle'),
        ('landmarking_confidence', 'landmarkingConfidence'),
        ('joy_likelihood', 'joyLikelihood'),
        ('sorrow_likelihood', 'sorrowLikelihood'),
        ('anger_likelihood', 'angerLikelihood'),
        ('surprise_likelihood', 'surpriseLikelihood'),
        ('under_exposed_likelihood', 'underExposedLikelihood'),
        ('blurred_likelihood', 'blurredLikelihood'),
        ('headwear_likelihood', 'headwearLikelihood'),
    )
    for column_name, annotation_key in copied_fields:
        row[column_name] = annotation[annotation_key]
    return row

def extract_label_detection_data(accession_number: str, annotation: Dict[str, Any]) -> Dict[str, Any]:
    """Extract the label detection data from a hit and turn it into a dict to be added as a row in the dataframe.

    Args:
        accession_number: Identifier of the analyzed image; copied into the row unchanged.
        annotation: One label annotation, read as a dict with the keys 'mid', 'description',
            'score', and 'topicality'.

    Returns:
        A dict holding the accession number followed by the four annotation values under
        the same key names.
    """
    row = {'accession_number': accession_number}
    for key in ('mid', 'description', 'score', 'topicality'):
        row[key] = annotation[key]
    return row

def extract_text_detection_data(accession_number: str, annotation: Dict[str, Any], width: int, height: int) -> Dict[str, Any]:
    """Extract the text detection data from a hit and turn it into a dict to be added as a row in the dataframe.

    Args:
        accession_number: Identifier of the analyzed image; copied into the row unchanged.
        annotation: One text annotation, read as a dict with the keys 'locale', 'description',
            and 'boundingPoly' (whose 'vertices' hold pixel 'x'/'y' values).
        width: Pixel width used to convert x coordinates to relative values.
        height: Pixel height used to convert y coordinates to relative values.

    Returns:
        A dict with the locale, the description, the bounding box edges as fractions of
        width/height (rel_*), and the same edges in pixels (abs_*).
    """
    # Vertex 0 supplies the left/top edges and vertex 2 supplies the right/bottom edges.
    vertices = annotation['boundingPoly']['vertices']
    left_x, top_y = vertices[0]['x'], vertices[0]['y']
    right_x, bottom_y = vertices[2]['x'], vertices[2]['y']

    return {
        'accession_number': accession_number,
        'locale': annotation['locale'],
        'description': annotation['description'],
        'rel_left_x': left_x / width,
        'rel_right_x': right_x / width,
        'rel_top_y': top_y / height,
        'rel_bottom_y': bottom_y / height,
        'abs_left_x': left_x,
        'abs_right_x': right_x,
        'abs_top_y': top_y,
        'abs_bottom_y': bottom_y,
    }

# Customize for your own computer
user_dir = 'baskausj' # Enter your user directory name here
base_path = '/Users/baskausj/github/vandycite/gallery_buchanan/image_analysis/' # Location of the accession number data file
annotations_base_url = 'https://baskaufs.github.io/iiif/baskauf/'

# Set the path to the service account key
key_path = '/Users/' + user_dir + '/image-analysis-376619-193859a33600.json'

# ---------------------------------------------
# Retrieve the service key, create a credentials object, then use it to authenticate and create a `client` object.
# ---------------------------------------------

# Create a credentials object from the service account key
credentials = service_account.Credentials.from_service_account_file(
    key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# API documentation https://cloud.google.com/python/docs/reference/vision/latest/google.cloud.vision_v1.services.image_annotator.ImageAnnotatorClient#methods
# The first two versions have no arguments and the credentials are loaded from the environment variable.
#client = vision.ImageAnnotatorClient()
# Used this specific v1 to get the JSON conversion to work
#client = vision_v1.ImageAnnotatorClient()
# Use this line instead of the one above to load the credentials directly from the file
client = vision_v1.ImageAnnotatorClient(credentials=credentials)

# Load the source data from a CSV. The critical column needed here is the `accession_number` column, since it is the one 
# that was used to construct the image file name for the uploaded test images.
accession_dataframe = pd.read_csv(base_path + 'accession_dimensions.csv', dtype=str)
#accession_dataframe.head()

# -----------------------------------
# Loop through all of the accession numbers and perform the analysis on each of the images.
# -----------------------------------

# Column order of each output CSV. The names must match the keys of the dicts returned by the
# extract_* functions; keys that are not listed here are dropped when the dataframe is built.
object_localization_columns = ['accession_number', 'description', 'score', 'rel_left_x', 'rel_right_x', 'rel_top_y', 'rel_bottom_y', 'abs_left_x', 'abs_right_x', 'abs_top_y', 'abs_bottom_y']
face_detection_columns = ['accession_number', 'score', 'rel_left_x', 'rel_right_x', 'rel_top_y', 'rel_bottom_y', 'abs_left_x', 'abs_right_x', 'abs_top_y', 'abs_bottom_y', 'roll_angle', 'pan_angle', 'tilt_angle', 'landmarking_confidence', 'joy_likelihood', 'sorrow_likelihood', 'anger_likelihood', 'surprise_likelihood', 'under_exposed_likelihood', 'blurred_likelihood', 'headwear_likelihood']
label_detection_columns = ['accession_number', 'mid', 'description', 'score', 'topicality']
text_detection_columns = ['accession_number', 'locale', 'description', 'rel_left_x', 'rel_right_x', 'rel_top_y', 'rel_bottom_y', 'abs_left_x', 'abs_right_x', 'abs_top_y', 'abs_bottom_y']

# The annotation rows are accumulated in plain lists (DataFrame.append was removed in pandas 2.0)
# and the dataframes are rebuilt from them after every image.
object_localization_rows = []
face_detection_rows = []
label_detection_rows = []
text_detection_rows = []

# Start with empty dataframes so that the names exist even when there are no images to analyze.
object_localization_dataframe = pd.DataFrame(columns=object_localization_columns)
face_detection_dataframe = pd.DataFrame(columns=face_detection_columns)
label_detection_dataframe = pd.DataFrame(columns=label_detection_columns)
text_detection_dataframe = pd.DataFrame(columns=text_detection_columns)

# Loop through the dataframe rows and analyze the images.
for _, image_row in accession_dataframe.iterrows():
    accession_number = image_row['accession_number']
    print('accession_number', accession_number)
    width = int(image_row['width'])
    height = int(image_row['height'])

    # To access the images, they should be stored in a Google Cloud Storage bucket that is set up for public access.
    # It's also possible to use a publicly accessible URL, but that seems to be unreliable.
    # The storage costs for a few images are negligible.

    # Construct the path to the image file
    image_uri = 'gs://vu-gallery/' + accession_number + '.jpg'
    #print('image_uri', image_uri)

    # Here is the API documentation for the Feature object.
    # https://cloud.google.com/vision/docs/reference/rest/v1/Feature

    # This API documentation isn't exactly the one for the .annotate_image method, but it's close enough.
    # https://cloud.google.com/vision/docs/reference/rest/v1/projects.images/annotate
    # In particular, it links to the AnnotateImageRequest object, which is what we need to pass to the annotate_image method.
    # All four analyses are requested in a single call, so one response serves every section below.
    response = client.annotate_image({
        'image': {'source': {'image_uri': image_uri}},
        'features': [
            {'type_': vision.Feature.Type.OBJECT_LOCALIZATION},
            {'type_': vision.Feature.Type.FACE_DETECTION},
            {'type_': vision.Feature.Type.LABEL_DETECTION},
            {'type_': vision.Feature.Type.TEXT_DETECTION}
        ]
    })

    # Report a per-image error from the API instead of silently recording no annotations.
    if response.error.message:
        print('API error for', accession_number, response.error.message)

    # The API response is a protobuf object, which is not JSON serializable.
    # So we need to convert it to a JSON serializable object.
    # Solution from https://stackoverflow.com/a/65728119
    response_json = AnnotateImageResponse.to_json(response)

    # The structure of the response is detailed in the API documentation here:
    # https://cloud.google.com/vision/docs/reference/rest/v1/AnnotateImageResponse
    # The various bits are detailed for each feature type.
    # Here's the documentation for entity annotations, with a link to the BoundingPoly object.
    # https://cloud.google.com/vision/docs/reference/rest/v1/AnnotateImageResponse#EntityAnnotation
    response_struct = json.loads(response_json)

    # Object localization
    # -------------------
    for annotation in response_struct.get('localizedObjectAnnotations', []):
        object_localization_rows.append(extract_object_localization_data(accession_number, annotation, width, height))

    # Write the annotations to a CSV file after every image in case the process is interrupted.
    object_localization_dataframe = pd.DataFrame(object_localization_rows, columns=object_localization_columns)
    object_localization_dataframe.to_csv(base_path + 'object_localization.csv', index=False)

    # Face detection
    # --------------
    for annotation in response_struct.get('faceAnnotations', []):
        face_detection_rows.append(extract_face_detection_data(accession_number, annotation, width, height))

    # Write the annotations to a CSV file after every image in case the process is interrupted.
    face_detection_dataframe = pd.DataFrame(face_detection_rows, columns=face_detection_columns)
    face_detection_dataframe.to_csv(base_path + 'face_detection.csv', index=False)

    # Label detection
    # ---------------
    for annotation in response_struct.get('labelAnnotations', []):
        label_detection_rows.append(extract_label_detection_data(accession_number, annotation))

    # Write the annotations to a CSV file after every image in case the process is interrupted.
    label_detection_dataframe = pd.DataFrame(label_detection_rows, columns=label_detection_columns)
    label_detection_dataframe.to_csv(base_path + 'label_detection.csv', index=False)

    # Text detection
    # --------------
    for annotation in response_struct.get('textAnnotations', []):
        text_detection_rows.append(extract_text_detection_data(accession_number, annotation, width, height))

    # Write the annotations to a CSV file after every image in case the process is interrupted.
    text_detection_dataframe = pd.DataFrame(text_detection_rows, columns=text_detection_columns)
    text_detection_dataframe.to_csv(base_path + 'text_detection.csv', index=False)

print('done')



# Create IIIF annotation file

To create the annotations, we need to convert the relative dimensions to the absolute pixel dimensions based on the canvas size.

The canvas size is given as the dimensions of the full-sized image, which is reported as `max_height` and `max_width` in the dimensions CSV.

In [None]:
# This code is part of google_cloud_vision.ipynb
# For licensing and other information, see https://github.com/HeardLibrary/linked-data/tree/master/image_analysis

# object_localization.csv contains the results of the object localization analysis.
# The accession numbers are forced to strings: without that, a numeric-looking value would be parsed as a
# number and would never compare equal to the accession numbers in accession_dimensions.csv, which are read as strings.
object_localization_dataframe = pd.read_csv(base_path + 'object_localization.csv', dtype={'accession_number': str})
# accession_dimensions.csv is a temporary file that contains the dimensions of the full-size images (max_height and max_width)
# as retrieved from the IIIF manifest
accession_dataframe = pd.read_csv(base_path + 'accession_dimensions.csv', dtype=str)

# Group the localized objects by accession number once, rather than scanning the whole table for every image.
# Each group keeps the original row labels, which are used below to number the annotations.
objects_by_accession = {
    group_accession_number: group
    for group_accession_number, group in object_localization_dataframe.groupby('accession_number')
}

# Loop through each accession number and create an annotation for each localized object.
image_count = len(accession_dataframe)
for image_number, (_, image_row) in enumerate(accession_dataframe.iterrows(), start=1):
    print('Processing image ' + str(image_number) + ' of ' + str(image_count))

    accession_number = image_row['accession_number']
    # The part of the accession number before the first period is used as the subdirectory name.
    accession_prefix = accession_number.split('.')[0]
    manifest_stem = 'https://iiif-manifest.library.vanderbilt.edu/gallery/' + accession_prefix + '/' + accession_number

    # The canvas size is the size of the full-sized image.
    canvas_width = float(image_row['max_width'])
    canvas_height = float(image_row['max_height'])

    # Build the resources list for the annotations.
    resources = []

    # Loop through each object in the image. An image with no objects still gets an (empty) annotation list.
    image_objects = objects_by_accession.get(accession_number)
    if image_objects is not None:
        for object_index, object_row in image_objects.iterrows():
            # Create a W3C fragment selector for the annotation.
            # https://www.w3.org/TR/annotation-model/#fragment-selector
            # Calculate the upper left x and y in absolute canvas coordinates.
            x = str(round(object_row['rel_left_x'] * canvas_width))
            y = str(round(object_row['rel_top_y'] * canvas_height))

            # Calculate the width and height in absolute canvas coordinates.
            width = str(round((object_row['rel_right_x'] - object_row['rel_left_x']) * canvas_width))
            height = str(round((object_row['rel_bottom_y'] - object_row['rel_top_y']) * canvas_height))

            fragment_selector = 'xywh=' + x + ',' + y + ',' + width + ',' + height

            # Build the annotation.
            on_value = {
                '@type': 'oa:SpecificResource',
                'full': manifest_stem + '.json_1',
                'selector': {
                    'type': 'oa:FragmentSelector',
                    'value': fragment_selector
                },
                'within': {
                    '@id': manifest_stem + '.json',
                    '@type': 'sc:Manifest'
                }
            }
            resource_value = {
                '@type': 'dctypes:Text',
                'format': 'text/plain',
                'chars': object_row['description']
            }

            annotation = {
                '@context': 'http://iiif.io/api/presentation/2/context.json',
                # The row label from object_localization.csv is used as the annotation number.
                '@id': manifest_stem + '/annotation/' + str(object_index),
                '@type': 'oa:Annotation',
                'motivation': [
                    'oa:commenting'
                ],
                'on': on_value,
                'resource': [
                    resource_value
                ]
            }
            resources.append(annotation)

    annotations = {
        "@context": "http://www.shared-canvas.org/ns/context.json",
        "@id": annotations_base_url + accession_prefix + "/" + accession_number + "_annotations.json",
        "@type": "sc:AnnotationList",
        "resources": resources
    }

    # Write the annotations to a JSON file.
    with open(base_path + 'annotations/' + accession_number + '_annotations.json', 'w') as outfile:
        output_text = json.dumps(annotations, indent=2)
        outfile.write(output_text)

print('done')



# Add the link from the manifest to the annotation URL

The annotations file can't be applied to the manifest unless the manifest has a link to its web address. So an `otherContent` link must be added to the canvas that's being annotated. The link URL has to be a real URL that dereferences, since the annotations have to be retrieved on the fly when the viewer applies the annotations to the canvas.

In [None]:
# Step through each image in the accession dimensions CSV file.
for _, image_row in accession_dataframe.iterrows():
    accession_number = image_row['accession_number']
    print(accession_number)

    # Look up the manifest URL for the image in the source image dataframe.
    # The images were downloaded from the primary image's manifest, so prefer the row ranked 'primary';
    # fall back to the first matching row if none is marked that way.
    matching_images = source_image_dataframe.loc[source_image_dataframe['accession_number'] == accession_number]
    primary_images = matching_images.loc[matching_images['rank'] == 'primary']
    if not primary_images.empty:
        matching_images = primary_images
    manifest_url = matching_images['iiif_manifest'].iloc[0]

    # Get the manifest JSON. Stop with the HTTP error if the request failed, rather than
    # trying to parse an error page as a manifest.
    manifest_response = requests.get(manifest_url)
    manifest_response.raise_for_status()
    manifest_json = manifest_response.json()

    # Create the otherContent list, which points to the annotation list for this image.
    other_content = [
        {
            '@id': annotations_base_url + accession_number.split('.')[0] + "/" + accession_number + "_annotations.json",
            '@type': 'sc:AnnotationList'
        }
    ]

    # Add the otherContent list to the first canvas of the manifest.
    manifest_json['sequences'][0]['canvases'][0]['otherContent'] = other_content

    # Write the manifest to a JSON file.
    with open(base_path + 'manifests/' + accession_number + '.json', 'w') as outfile:
        text = json.dumps(manifest_json, indent=4)
        outfile.write(text)

print('done')

