In [None]:
# commonsbot.ipynb, a Python script for uploading files and data to Wikimedia Commons using the API.
version = '0.4'
created = '2022-01-18'

# (c) 2022 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf

# -----------------------------------------
# Version 0.4 change notes: 
# - Removed double spaces from labels before they are used to generate image filenames.
# - Skip over images with raw filenames that contain spaces and log an error for them to be manually removed.
# -----------------------------------------

# IMPORTANT NOTE: If you hack this script to upload media to Commons, you MUST NOT decrease the delay time between
# API writes to less than 5 seconds in order to speed up writing. If you do, then your script isn't 
# BaskaufCommonsBot and you need to change the user_agent_string to use your own URL and email address.
# The same holds true if you make other substantive changes to the way that the script interacts with the API.
# This script attempts to respect the "good citizen" guidelines for using the API and you should too.

# These are recommended delay times (in seconds) to avoid hitting the APIs too frequently and getting blocked
sparql_sleep = 0.1 # delay time between calls to Wikidata SPARQL endpoint

# The upload and caption/claim functions take their delay as a sleeptime argument rather than reading commons_sleep.
# commons_sleep itself is read by attemptPost(), which uses it as the starting delay of its maxlag back-off.
commons_sleep = 5 # non-critical edits to commons no faster than this. https://commons.wikimedia.org/wiki/Commons:Bots#Bot_speed

# NOTE(review): read_api_sleep is not referenced in the part of the notebook reviewed here - confirm whether it is still needed.
read_api_sleep = 0.1

# Description of bots on Commons: https://commons.wikimedia.org/wiki/Commons:Bots
# See guidelines for operating a bot in Commons: https://commons.wikimedia.org/wiki/Commons:Bots/Requests
# Need to decide whether this applies if non autonomous. It probably does.
# Bot flag is an indication of community trust and prevents new images/recent changes lists from getting swamped.
# It also confirms that edits are not likely to need manual checking.

# Generic Commons API reference: https://commons.wikimedia.org/w/api.php

# NOTE: this script recycles code from the more full-featured and better tested VanderBot script:
# https://github.com/HeardLibrary/linked-data/tree/master/vanderbot

# ----------------
# Configuration
# ----------------

# This section contains import statements and function definitions.
# It should be run before running other sections of the code

import json
import requests
import csv
from pathlib import Path
from time import sleep
import sys
import re # regex
import datetime
import os
import pandas as pd
import urllib.parse
import webbrowser

# AWS Python SDK
import boto3
import botocore

# Hard coded values
# These are module-level settings read by the functions and script sections below.

# NOTE(review): max_items_to_upload and open_browser_tab_after_upload are not referenced in the part of the
# notebook reviewed here - confirm where (or whether) they are applied.
max_items_to_upload = 1
open_browser_tab_after_upload = True

# Change working directory to image upload directory
# All relative paths used later (log file, CSV files) are resolved against this directory.
os.chdir('/users/baskausj/github/vandycite/gallery_works/image_upload/')

# Should be saved in current working directory
log_path = 'error_log.txt'

local_image_directory_path = 'gallery_digital_image_archive/'
# This flag is assigned again (also to True) just before login() is called in the main script section.
path_is_relative_to_home_directory = True

# Root directory of the local TIFFs and the S3/IIIF locations used when the TIFFs are pushed to the IIIF server bucket
local_image_root_directory = '/users/baskausj/gallery_pyramidal_tiffs/'
s3_iiif_bucket_name = 'iiif-library-cantaloupe-storage'
s3_manifest_bucket_name = 'iiif-manifest.library.vanderbilt.edu'
s3_iiif_project_directory = 'gallery'
iiif_server_url_root = 'https://iiif.library.vanderbilt.edu/iiif/3/'
# NOTE(review): manifest_iri_stem and actual_manifest_iri_stem currently hold the same value - confirm both are needed.
manifest_iri_stem = 'https://iiif-manifest.library.vanderbilt.edu/'
actual_manifest_iri_stem = 'https://iiif-manifest.library.vanderbilt.edu/'

# Sent as the User-Agent header of the authenticated session created by login()
user_agent_string = 'BaskaufCommonsBot/' + version + ' (https://github.com/HeardLibrary/linked-data/tree/master/commonsbot; mailto:steve.baskauf@vanderbilt.edu)'
public_domain_categories = [
    'artist died before copyright cutoff', 
    'artist was born before 1800', 
    'assessed to be out of copyright', 
    'from style or period that ended prior to copyright cutoff',
    'inception prior to copyright cutoff'
]
copyright_cutoff_date = 1926

# Options for filtering by image size
size_filter = 'pixsquared' # options: filetype, filesize, pixsquared
# NOTE(review): the names "requrired_filetype" and "minimup_pixel_squared" are misspelled; they are left as they are
# because code outside this section may refer to them by these exact names.
requrired_filetype = 'tiff' # not implemented (yet)
minimum_filesize = 1000
minimup_pixel_squared = 1000000

templated_institution = 'Vanderbilt University Fine Arts Gallery' # Name used in an existing Institution template
source_name = 'Vanderbilt University Fine Arts Gallery'
category_strings = ['Vanderbilt University Fine Arts Gallery'] # Commons categories to be added to the image.
default_language = 'en'

# ------------------------
# function definitions
# ------------------------

# Utility functions

# function to get the local name from an IRI
def extract_localname(iri):
    """Return the part of an IRI that follows its last "/".

    For http://www.wikidata.org/entity/Q6386232 the result is "Q6386232".
    A string containing no slash is returned unchanged.
    """
    _, _, local_name = iri.rpartition('/')
    return local_name

# read from a CSV file into a list of dictionaries
def read_dict(filename):
    """Load a UTF-8 CSV file and return its rows as a list of dictionaries.

    The first row of the file supplies the dictionary keys (csv.DictReader).
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))

# write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write the dictionaries in table to filename as UTF-8 CSV.

    fieldnames gives the column headers and their order; a header row is
    written first, then one row per dictionary.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)
            
# Commons identifier/URL conversion functions
# There are three identifiers used in Commons:

# The most basic one is the filename, unencoded and with file extension.

# The Commons web page URL is formed from the filename by prepending a subpath and "File:", replacing spaces in the filename with _, and URL-encoding the file name string
# The reverse process may be lossy because it assumes that underscores should be turned into spaces, and the filename might actually contain underscores.

# The Wikidata IRI identifier for the image is formed from the filename by URL-encoding it and prepending a subpath and "Special:FilePath/"
# The reverse process is lossless since it simply reverses the URL-encoding of the local name part of the IRI.

commons_prefix = 'http://commons.wikimedia.org/wiki/Special:FilePath/'
commons_page_prefix = 'https://commons.wikimedia.org/wiki/File:'
def commons_url_to_filename(url):
    """Convert a Special:FilePath IRI into the plain (unencoded) Commons filename.

    Example input:
    http://commons.wikimedia.org/wiki/Special:FilePath/Castle%20De%20Haar%20%281892-1913%29%20-%20360%C2%B0%20Panorama%20of%20Castle%20%26%20Castle%20Grounds.jpg
    """
    # Everything after the prefix is the URL-encoded local name; decode it to get the filename
    encoded_local_name = url.split(commons_prefix)[1]
    return urllib.parse.unquote(encoded_local_name)

def filename_to_commons_url(filename):
    """Build the Special:FilePath IRI for a filename by URL-encoding it and prepending commons_prefix."""
    return commons_prefix + urllib.parse.quote(filename)

def commons_page_url_to_filename(url):
    """Convert a Commons file page URL into a filename.

    Example input:
    https://commons.wikimedia.org/wiki/File:Castle_De_Haar_(1892-1913)_-_360%C2%B0_Panorama_of_Castle_%26_Castle_Grounds.jpg
    Every literal underscore in the local name is turned into a space before the string is URL-decoded.
    """
    local_name = url.split(commons_page_prefix)[1]
    return urllib.parse.unquote(local_name.replace('_', ' '))

def filename_to_commons_page_url(filename):
    """Build the Commons file page URL for a filename.

    Spaces become underscores, the result is URL-encoded and appended to commons_page_prefix,
    and the encoded forms of "(", ")" and "," are then turned back into the literal characters.
    """
    url = commons_page_prefix + urllib.parse.quote(filename.replace(' ', '_'))
    for escaped, literal in (('%28', '('), ('%29', ')'), ('%2C', ',')):
        url = url.replace(escaped, literal)
    return url


# Authentication functions

def login(path, relative_to_home):
    """Log in to the Wikimedia API described by a credentials file and fetch a CSRF token.

    path: location of the credentials file (see retrieve_credentials() for the format).
    relative_to_home: if true, path is taken relative to the user's home directory;
        otherwise it is used as given.

    Returns a dictionary with keys 'session' (the requests session, which carries the
    User-Agent header), 'csrftoken', and 'endpoint' (the API URL that was used).
    If the API does not report a successful login, a warning is printed and the function
    carries on, returning whatever token the API hands out.
    """
    if relative_to_home:
        home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
        full_credentials_path = home + '/' + path
    else:
        full_credentials_path = path
    credentials = retrieve_credentials(full_credentials_path)

    resource_url = '/w/api.php' # default API resource URL for all Wikimedia APIs
    endpoint_url = credentials['url'] + resource_url

    # Instantiate session
    session = requests.Session()
    # Set default User-Agent header so you don't have to send it with every request
    session.headers.update({'User-Agent': user_agent_string})

    # Go through the sequence of steps needed to get the CSRF token
    login_token = get_login_token(endpoint_url, session)
    data = session_login(endpoint_url, login_token, credentials['username'], credentials['password'], session)

    # The login response used to be discarded, so bad credentials went unnoticed until a later write failed.
    # Report the problem here, where its cause is obvious, without changing what is returned.
    login_section = data.get('login') if isinstance(data, dict) else None
    login_result = login_section.get('result') if isinstance(login_section, dict) else None
    if login_result != 'Success':
        print('Warning: API login did not report success. Response:', data)

    csrf_token = get_csrf_token(endpoint_url, session)
    return {'session': session, 'csrftoken': csrf_token, 'endpoint': endpoint_url}

def retrieve_credentials(path):
    """Read the API URL, username and password from a plain text credentials file.

    The first three lines of the file must have the form name=value, in the order
    endpoint URL, username, password; the names themselves are not checked and any
    further lines are ignored.

    Returns a dictionary with keys 'url', 'username' and 'password'.
    Raises IndexError if one of the first three lines is missing or has no "=".
    """
    with open(path, 'rt') as file_object:
        line_list = file_object.read().split('\n')
    # Split on the first "=" only, so that a value which itself contains "="
    # (for example a generated bot password) is not cut short.
    endpoint_url = line_list[0].split('=', 1)[1]
    username = line_list[1].split('=', 1)[1]
    password = line_list[2].split('=', 1)[1]
    credentials = {'url': endpoint_url, 'username': username, 'password': password}
    return credentials

def get_login_token(apiUrl, session):
    """Ask the API at apiUrl for a login token, using session, and return the token string."""
    query = dict(action='query', meta='tokens', type='login', format='json')
    reply = session.get(url=apiUrl, params=query)
    return reply.json()['query']['tokens']['logintoken']

def session_login(apiUrl, token, username, password, session):
    """Post the login request (username, password and login token) and return the decoded JSON reply."""
    login_form = dict(
        action='login',
        lgname=username,
        lgpassword=password,
        lgtoken=token,
        format='json'
    )
    return session.post(apiUrl, data=login_form).json()

def get_csrf_token(apiUrl, session):
    """Ask the API at apiUrl for the session's CSRF token and return the token string."""
    query = dict(action='query', meta='tokens', format='json')
    reply = session.get(url=apiUrl, params=query)
    return reply.json()['query']['tokens']['csrftoken']

# Commons identifier conversion functions
# commons_prefix, commons_page_prefix, commons_url_to_filename(), filename_to_commons_url(),
# commons_page_url_to_filename() and filename_to_commons_page_url() are defined once, earlier in this
# cell (under "Commons identifier/URL conversion functions"). The second, verbatim copy of those
# definitions that used to sit here has been removed so that there is a single place to maintain them.

# Data upload functions

# Create the date_string for various uncertainty situations. Used in creating template (next function)
def create_template_date_string(inception_val, inception_prec, inception_earliest_date_val, inception_earliest_date_prec, inception_latest_date_val, inception_latest_date_prec, inception_sourcing_circumstances):
    """Format an inception date (or date range) for the date field of the Commons Artwork template.

    All arguments are strings. The *_val arguments are date strings that begin with YYYY-MM-DD;
    the *_prec arguments are precision codes ('11' = day, '10' = month, anything else is treated as year).
    A sourcing circumstances value of 'Q5727902' marks the date as circa.

    Returns '' when there is no inception date, a bare date string for a single non-circa date,
    and otherwise an {{other date|...}} template string.
    See https://commons.wikimedia.org/wiki/Template:Other_date for formatting information.
    """
    def cut_to_precision(date_value, precision):
        # No VU gallery works have precisions > 9, but day and month precision are handled just in case.
        # Any other precision (year, decade, century, etc.) gives just the year.
        keep = {'11': 10, '10': 7}.get(precision, 4)
        return date_value[:keep]

    # Return nothing if there is no inception date
    if inception_val == '':
        return ''

    is_circa = inception_sourcing_circumstances == 'Q5727902' # Q ID for circa

    # No date range: a single date, wrapped in the circa template if required
    if inception_earliest_date_val == '':
        single_date = cut_to_precision(inception_val, inception_prec)
        if is_circa:
            return '{{other date|circa|'+ single_date + '}}'
        return single_date

    # Date range: both ends are cut to their own precision
    early_date_string = cut_to_precision(inception_earliest_date_val, inception_earliest_date_prec)
    late_date_string = cut_to_precision(inception_latest_date_val, inception_latest_date_prec)
    if is_circa:
        return '{{other date|circa|'+ early_date_string + '|'+ late_date_string + '}}'
    return '{{other date|between|'+ early_date_string + '|'+ late_date_string + '}}'
            
# Insert metadata into the Commons Artwork template
def create_commons_template(work_qid, label, description, description_language, artist_qid, date_string, width, height, source_name, templated_institution, reference_url, reference_iso_date_string, notes, medium, category_strings):
    """Build the wikitext of a Commons file page: an Artwork template, a license section and category links.

    All arguments are strings except category_strings, which is an iterable of category names.
    reference_iso_date_string must start with an ISO date (YYYY-MM-DD); only those ten characters are used.
    An empty artist_qid leaves the artist field of the template empty.

    Returns the page wikitext as a single string.
    """
    # Express the reference date European style with the month as a word and no leading zero on the day
    accessed = datetime.datetime.fromisoformat(reference_iso_date_string[:10])
    reference_date = str(accessed.day) + accessed.strftime(' %B %Y')

    creator_template = '' if artist_qid == '' else '{{ Creator | Wikidata = ' + artist_qid + ' | Option = {{{1|}}} }}'

    page_wikitext = '''
=={{int:filedesc}}==
{{Artwork
 |artist             = ''' + creator_template + '''
 |title              = ''' + "{{en|'''" + label + "'''.}}" + '''
 |description        = {{''' + description_language + '''|1=''' + description + '''}}
 |depicted people    =
 |depicted place     =
 |date               = ''' + date_string + '''
 |medium             = ''' + medium + '''
 |dimensions         = {{Size|in|''' + width + '|' + height + '''}}
 |institution        = {{Institution:''' + templated_institution + '''}}
 |department         =
 |accession number   = 
 |place of creation  = 
 |place of discovery =
 |object history     =
 |exhibition history =
 |credit line        =
 |inscriptions       =
 |notes              = ''' + notes + '''
 |references         = {{cite web |title=''' + label + ' |url=' + reference_url + ' |accessdate=' + reference_date + '''}}
 |source             = ''' + source_name + '''
 |permission         =
 |other_versions     =
 |wikidata           = ''' + work_qid + '''
 |other_fields       =
}}

=={{int:license-header}}==
{{PD-Art|PD-old-100-expired}}

'''
    # One category link per line, in the order given
    category_links = ''.join('[[Category:' + category_string + ']]\n' for category_string in category_strings)
    return page_wikitext + category_links

# API file Upload example: https://www.mediawiki.org/wiki/API:Upload#POST_request
# API Sandbox can be used to generate test JSON, but DO NOT RUN since it actually uploads.
# Specifically for uploads, see https://commons.wikimedia.org/wiki/Special:ApiSandbox#action=upload&filename=Wiki.png&url=http%3A//upload.wikimedia.org/wikipedia/en/b/bc/Wiki.png&token=123ABC
def upload_file_to_commons(image_filename, commons_filename, directory_path, relative_to_home, session, csrftoken, sleeptime, wikitext):
    """Upload a local media file to Commons together with the wikitext of its file page.

    image_filename: name of the local file, appended directly to directory_path.
    commons_filename: name the file is to have on Commons.
    directory_path: directory holding the file, including its trailing separator.
    relative_to_home: if true, directory_path is taken relative to the user's home directory.
    session: logged-in requests session used for the POST.
    csrftoken: CSRF token for that session.
    sleeptime: seconds to wait after the request before returning.
    wikitext: page text sent as the 'text' parameter of the upload.

    Returns the decoded JSON reply, or an empty dictionary if the reply body was not valid JSON.
    """
    if relative_to_home:
        home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
        directory_path = home + '/' + directory_path

    parameters = {
        'action': 'upload',
        'filename': commons_filename,
        'format': 'json',
        'token': csrftoken,
        'ignorewarnings': 1,
        'text': wikitext,
        # this is what generates the text in the Description box on user Uploads page and initial edit summary for page
        # See https://commons.wikimedia.org/wiki/Commons:First_steps/Quality_and_description#Upload_summary
        'comment': 'Uploaded media file and metadata via API'
    }
    file_path = directory_path + image_filename

    # Open the file in a with block so the handle is closed even when the POST raises;
    # previously one open handle was leaked for every upload.
    with open(file_path, 'rb') as file_object:
        file_dict = {'file': (image_filename, file_object, 'multipart/form-data')}

        print('uploading', commons_filename) # This line is important for large TIFF files that will take a while to upload
        response = session.post('https://commons.wikimedia.org/w/api.php', files=file_dict, data = parameters)

    # Trap for errors. Note: as far as I can tell, no sort of error code or HTTP header gets sent identifying the
    # cause of the error. So at this point, just report an error by returning an empty dictionary.
    # Only a body that cannot be decoded as JSON is trapped (the decode errors are ValueError subclasses);
    # anything else, including a keyboard interrupt, is allowed to propagate.
    try:
        data = response.json()
    except ValueError:
        data = {}

    # for non-critical applications, do not hit the API rapidly. See notes at the top of the script.
    sleep(sleeptime)
    return(data)


# This function is used in the following function, which needs a page ID rather than a name
def get_commons_image_pageid(image_filename):
    """Look up the page ID of the Commons file page for image_filename.

    Sends an unauthenticated action=query request for the title 'File:' + image_filename and
    returns the first key of the 'pages' object in the reply, as a string.
    NOTE: appears to return '-1' when it can't find the page.
    """
    # get metadata for a photo including from file page
    params = {
        'action': 'query',
        'format': 'json',
        'titles': 'File:' + image_filename,
        'prop': 'info'
    }

    # This request does not go through the logged-in session, so the User-Agent header has to be
    # supplied explicitly; otherwise the request identifies itself with the requests library default
    # instead of as this bot.
    headers = {'User-Agent': user_agent_string}
    response = requests.get('https://commons.wikimedia.org/w/api.php', params=params, headers=headers)
    data = response.json()
    page_dict = data['query']['pages'] # this value is a dict that has the page IDs as keys
    page_id_list = list(page_dict.keys()) # the result of the .keys() method is a "dict_keys" object, so coerce to a list
    page_id = page_id_list[0] # info on only one page was requested, so get item 0

    # Don't think I need to add a sleep time for API reads, which are less resource-intensive
    # than write operations
    return page_id

# Wikibase edit entity function (upload both caption and all Structured data statements at once)
def wbeditentity_upload(commons_session, commons_csrf_token, maxlag, mid, caption, caption_language, property_p_id, value_q_id):
    """Add a caption and one structured data statement to an existing media entity in a single wbeditentity call.

    commons_session / commons_csrf_token: session and CSRF token used for the POST.
    maxlag: value of the maxlag parameter; it is only sent when greater than 0.
    mid: identifier of the existing entity to edit (sent as 'id').
    caption, caption_language: label text and its language code.
    property_p_id, value_q_id: property and item ID of the statement to add.

    Returns whatever attemptPost() returns for the request.
    Code hacked from VanderBot https://github.com/HeardLibrary/linked-data/blob/master/vanderbot/vanderbot.py
    """
    # The single claim (Structured data statement) to add. More claims could be appended to the
    # 'claims' list below to add several statements at once.
    statement = {
        'mainsnak': {
            'snaktype': 'value',
            'property': property_p_id,
            'datatype': 'wikibase-item',
            'datavalue': {
                'value': {
                    'id': value_q_id
                    },
                'type': 'wikibase-entityid'
                }
            },
        'type': 'statement',
        'rank': 'normal'
        }

    # Caption (a label in the given language) plus the list of claims
    entity_data = {
        'labels': {
            caption_language: {
                'language': caption_language,
                'value': caption
            }
        },
        'claims': [statement]
    }

    # Confusingly, the data structure has to be encoded as a JSON string before it becomes the value of
    # the "data" parameter; the parameters as a whole are then encoded by the requests module when posted.
    api_parameters = {
        'action': 'wbeditentity',
        'format':'json',
        'token': commons_csrf_token,
        'id': mid, # use id key instead of new since it already exists
        'summary': 'Add caption and structured data via API',
        'data': json.dumps(entity_data)
        }

    # Support maxlag if the API is too busy
    if maxlag > 0:
        api_parameters['maxlag'] = maxlag

    return attemptPost('https://commons.wikimedia.org/w/api.php', api_parameters, commons_session)


# This function attempts to post and handles maxlag errors
# Code reused from VanderBot https://github.com/HeardLibrary/linked-data/blob/master/vanderbot/vanderbot.py
def attemptPost(apiUrl, parameters, session):
    """POST parameters to apiUrl with session, retrying with a growing delay while the API reports maxlag.

    Returns the decoded JSON reply as soon as it is anything other than a maxlag error
    (a success, or an error with a different code).
    After the first attempt plus maxRetries retries have all been answered with maxlag,
    a message is printed and the script is aborted by raising SystemExit.
    """
    maxRetries = 10
    delayLimit = 300
    # the first attempt plus at most maxRetries retries of a lagged server
    for retry in range(maxRetries + 1):
        if retry > 0:
            print('retry:', retry)
        r = session.post(apiUrl, data = parameters)
        data = r.json()

        # check if response is a maxlag error
        # see https://www.mediawiki.org/wiki/Manual:Maxlag_parameter
        # The reply is inspected explicitly instead of inside a catch-all try/except: the old bare
        # except also swallowed a keyboard interrupt during the wait below and then returned the
        # maxlag error as though it were the final answer.
        error = data.get('error') if isinstance(data, dict) else None
        if not isinstance(error, dict) or error.get('code') != 'maxlag':
            # either there is no error at all (success), or the error is not maxlag
            return data

        print('Lag of ', error.get('lag'), ' seconds.')
        # The Retry-After header is not used; instead the delay doubles with each retry, up to delayLimit
        recommendedDelay = min(commons_sleep*2**retry, delayLimit)
        if retry != maxRetries:
            print('Waiting ', recommendedDelay , ' seconds.')
            print()
            sleep(recommendedDelay)

    # here's where execution goes after maxRetries tries
    print('Failed after ' + str(maxRetries) + ' retries.')
    # just abort the script; sys.exit() is used because the bare exit() helper is only meant for interactive use
    sys.exit()
    

# !!! DEPRECATED IN FAVOR OF THE SINGLE POST USING wbeditentity
# Adding the image caption seems to be a hack that uses the Wikibase API command wbsetlabel.
# Captions are Wikibase labels (language specific), limit 255 characters length.
# See https://commons.wikimedia.org/wiki/Commons:File_captions#Technical
def set_commons_image_caption(image_filename, caption, caption_language, session, csrftoken, sleeptime):
    """Set the caption of a Commons file with a wbsetlabel request.

    The file is addressed by the title 'File:' + image_filename on site 'commonswiki'.
    After the POST the function waits sleeptime seconds, then returns the decoded JSON reply.
    """
    label_request = dict(
        action='wbsetlabel',
        format='json',
        token=csrftoken,
        site='commonswiki',
        title='File:' + image_filename,
        value=caption,
        language=caption_language,
        summary='Add caption via API'
    )
    api_reply = session.post('https://commons.wikimedia.org/w/api.php', data = label_request).json()

    sleep(sleeptime)
    return api_reply

# !!! DEPRECATED IN FAVOR OF THE SINGLE POST USING wbeditentity
# Code comes from writeStatement() function at https://github.com/HeardLibrary/digital-scholarship/blob/master/code/wikibase/api/load_csv.py
# Described in this blog post: http://baskauf.blogspot.com/2019/06/putting-data-into-wikidata-using.html
def create_commons_claim(image_filename, property_p_id, value_q_id, session, csrftoken, sleeptime):
    """Add one item-valued statement to a Commons file with a wbcreateclaim request.

    The subject is 'M' followed by the page ID that get_commons_image_pageid() returns for image_filename.
    property_p_id is the property (e.g. 'P180', depicts) and value_q_id the item (e.g. 'Q384177').
    After the POST the function waits sleeptime seconds, then returns the decoded JSON reply.
    """
    media_id = 'M' + get_commons_image_pageid(image_filename)

    # The value is sent as a JSON string, not an actual data structure (it gets URL encoded by requests
    # before posting). The numeric-id is the Q ID with its first character (the "Q") removed.
    item_value_json = json.dumps({
        'entity-type': 'item',
        'numeric-id': value_q_id[1:]
    })

    claim_request = {
        'action':'wbcreateclaim',
        'format':'json',
        'token': csrftoken,
        'entity': media_id,
        'snaktype':'value',
        'property': property_p_id,
        'value': item_value_json,
        'summary': 'Add structured data via API'
    }

    api_reply = session.post('https://commons.wikimedia.org/w/api.php', data = claim_request).json()

    sleep(sleeptime)
    return api_reply

# ---------------------------
# Body of main script
# ---------------------------

# This section contains configuration information and performs necessary logins
# It needs to be run once before the rest of the code
# No writing is done, so it's "safe" to run any time

# Set the value of the maxlag parameter to back off when the server is lagged
# see https://www.mediawiki.org/wiki/Manual:Maxlag_parameter
# The recommended value is 5 seconds.
# To not use maxlag, set the value to 0
# To test the maxlag handler code, set maxlag to a very low number like .1
# If you don't know what you are doing, leave this value alone. In any case, it is rude to use a value greater than 5.

maxlag = 5

# This section needs to be run prior to running any code that interacts with the Commons API
# It generates the CSRF token required to post to the API on behalf of the user whose username and pwd are being used

# This is the format of the credentials file. 
# Username and password are for a bot that you've created.
# (The string below is only an illustration; it is an expression statement with no effect.)

'''
endpointUrl=https://test.wikidata.org
username=User@bot
password=465jli90dslhgoiuhsaoi9s0sj5ki3lo
'''

# ---------------------------
# Load data from CSVs into DataFrames
# ---------------------------

# Every file is read with dtype = str and na_filter=False, so all cells are strings and empty cells are empty strings.

# Note: setting the index to be the Q ID requires that qid has a unique value for each row. This should be the case.
# NOTE(review): this absolute path looks like the same commons_images.csv that is read again below as
# existing_images (relative to the working directory set near the top of the script) - confirm that this is intended.
works_metadata = pd.read_csv('/Users/baskausj/github/vandycite/gallery_works/image_upload/commons_images.csv', na_filter=False, dtype = str)
works_metadata.set_index('qid', inplace=True)

raw_metadata = pd.read_csv('../gallery_works_renamed1.csv', na_filter=False, dtype = str)
raw_metadata.set_index('accession_number', inplace=True)

image_dimensions = pd.read_csv('image_dimensions.csv', na_filter=False, dtype = str)
# Convert some columns to integers
image_dimensions[['kilobytes', 'height', 'width']] = image_dimensions[['kilobytes', 'height', 'width']].astype(int)

works_classification = pd.read_csv('../../gallery_buchanan/works_classification.csv', na_filter=False, dtype = str)
works_classification.set_index('qid', inplace=True)

works_ip_status = pd.read_csv('../items_status_abbrev.csv', na_filter=False, dtype = str)
works_ip_status.set_index('qid', inplace=True)

existing_images = pd.read_csv('commons_images.csv', na_filter=False, dtype = str) # Don't make the Q IDs the index!

# For testing purposes, just use the first few rows of the works metadata
#test_rows = 70
#works_metadata = works_metadata.head(test_rows).copy()

# ---------------------------
# Commons API Post Authentication (create session and generate CSRF token)
# ---------------------------

# If credentials file location is relative to current working directory, use subfolders through file name with no leading slash
# Example: myproj/credentials/commons_credentials.txt
# If credentials file is in current working directory, only filename is necessary
# Need to give example for absolute path on Windows - use Unix forward slashes?
path = 'commons_credentials.txt'
# This assignment overrides the value given to the same variable in the hard coded values section.
path_is_relative_to_home_directory = True # set to True if relative home directory, False if absolute path or relative to working directory
# login() returns a dictionary with the keys 'session', 'csrftoken' and 'endpoint'
result = login(path, path_is_relative_to_home_directory)
# print(result)
commons_session = result['session']
commons_csrf_token = result['csrftoken']
# Commons API endpoint URL is in result['endpoint'], but it is going to be hard coded anyway, so ignore

print('done')

In [None]:
# ---------------------------
# Screen works for appropriate images to upload
# ---------------------------

# Place limit on number of items to upload in a session by tracking number uploaded
# NOTE(review): in this cell items_uploaded is never incremented or compared with max_items_to_upload,
# so no limit is actually applied here - confirm whether one is wanted.
items_uploaded = 0

# File extensions (the text after the last period of the filename) that mark a file as a TIFF
tiff_extensions = ['tif', 'TIF', 'tiff', 'TIFF']

# The S3 client is created the first time a TIFF has to be uploaded and then reused,
# instead of building a new client for every image.
s3 = None

errors = False

# Open an error log to record errors. The with block guarantees that the log is flushed and closed
# even if an upload raises an exception part way through the loop.
with open(log_path, 'wt', encoding='utf-8') as log_object:

    # The row index is the Q ID and is a string. The work object is the data in the row and is a Pandas series
    # The items in the row series can be referred to by their labels, which are the column headers, e.g. work['label_en']
    for index, work in works_metadata.iterrows():
        upload_error = False

        # Get the remaining metadata from the VanderBot upload CSV
        work_qid = index
        label = work['label_en']

        # The local_filename is the name of the file as it exists locally.
        # NOTE: if the image filename contains a space, it will generate an error when the IIIF manifest link is uploaded
        # to the Wikidata API. It's better if the images don't have spaces, so the script will just skip over it and
        # flag the image to have its name changed manually, rather than automatically changing spaces to underscores
        # (potentially causing a naming collision).
        if ' ' in work['local_filename']:
            print('Raw filename "' + work['local_filename'] + '" for ' + work_qid + ' contains spaces that need to be removed manually.')
            print('Unallowed spaces in raw filename "' + work['local_filename'] + '" for ' + work_qid, file=log_object)
            errors = True
            continue
        local_filename = work['local_filename']

        # subdirectory is the directory that contains the local file. It's within the local_image_directory_path.
        # Don't include a trailing slash.
        # If images are directly in the directory_path, use empty string ('') as the value.
        subdirectory = work['directory']

        # ----------------
        # Upload highres images to IIIF server s3 bucket
        # ----------------

        file_extension = local_filename.split('.')[-1]
        if file_extension in tiff_extensions:

            # See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3.html#uploads
            local_file_path = local_image_root_directory + subdirectory + '/' + local_filename
            s3_iiif_key = s3_iiif_project_directory + '/' + subdirectory + '/' + local_filename

            if s3 is None:
                s3 = boto3.client('s3')
            print('Uploading to s3:', local_filename)
            s3.upload_file(local_file_path, s3_iiif_bucket_name, s3_iiif_key)

            # For the image in the "iiif-library-cantaloupe-storage" bucket with the key "gallery/1979/1979.0264P.tif"
            # the IIIF URL would be https://iiif.library.vanderbilt.edu/iiif/3/gallery%2F1979%2F1979.0264P.tif/full/max/0/default.jpg
            print(iiif_server_url_root + s3_iiif_project_directory + '%2F' + subdirectory + '%2F' + local_filename + '/full/1000,/0/default.jpg')

    if not errors:
        print('No errors occurred.', file=log_object)

print('done')