In [None]:
# commonsbot.ipynb, a Python script for uploading files and data to Wikimedia Commons using the API.
version = '0.3'
created = '2021-12-04'

# (c) 2021 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf

# IMPORTANT NOTE: If you use this script to upload media to Commons, you MUST NOT decrease the delay time between
# API writes to less than 5 seconds in order to speed up writing. If you do, then your script isn't 
# BaskaufCommonsBot and you need to change the user_agent_string to use your own URL and email address.
# The same holds true if you make other substantive changes to the way that the script interacts with the API.
# This script attempts to respect the "good citizen" guidelines for using the API and you should too.

# These are recommended delay times to avoid hitting the APIs too frequently and getting blocked
# NOTE(review): sparql_sleep and read_api_sleep are not referenced anywhere in the visible code;
# presumably they are in seconds like commons_sleep -- confirm before relying on them.
sparql_sleep = 0.25 # delay time between calls to Wikidata SPARQL endpoint, probably could be lower (like 0.1)
commons_sleep = 5 # non-critical edits to commons no faster than this (seconds). https://commons.wikimedia.org/wiki/Commons:Bots#Bot_speed
read_api_sleep = 0.1 # presumably the delay between read-only API calls -- TODO confirm

# Description of bots on Commons: https://commons.wikimedia.org/wiki/Commons:Bots
# See guidelines for operating a bot in Commons: https://commons.wikimedia.org/wiki/Commons:Bots/Requests
# Need to decide whether this applies if non autonomous. It probably does.
# Bot flag is an indication of community trust and prevents new images/recent changes lists from getting swamped.
# It also confirms that edits made by the bot are not likely to need manual checking.

# Generic Commons API reference: https://commons.wikimedia.org/w/api.php

# NOTE: this script recycles code from the more full-featured and better tested VanderBot script:
# https://github.com/HeardLibrary/linked-data/tree/master/vanderbot

# ----------------
# Configuration
# ----------------

# This section contains import statements and function definitions.
# It should be run before running other sections of the code

import csv
import datetime
import json
import os
import re # regex
import sys
import urllib.parse
from pathlib import Path
from time import sleep

import pandas as pd
import requests

# Hard coded values

# Change working directory to image upload directory
# NOTE: this is a machine-specific absolute path; the relative CSV paths used by the main script
# are resolved against it.
os.chdir('/users/baskausj/github/vandycite/gallery_works/image_upload/')

# Directory holding the local image files (with trailing slash); the upload loop appends the
# per-image subdirectory and file name to it.
local_image_directory_path = 'gallery_digital_image_archive/'
# When True, paths handed to login() and upload_file_to_commons() are prefixed with the user's home directory.
# NOTE: the main script assigns this name again before calling login().
path_is_relative_to_home_directory = True

# Sent as the User-Agent header of the requests session created by login()
user_agent_string = 'BaskaufCommonsBot/' + version + ' (https://github.com/HeardLibrary/linked-data/tree/master/commonsbot; mailto:steve.baskauf@vanderbilt.edu)'
# IP status values that count as public domain; works with any other status are skipped by the upload loop
public_domain_categories = [
    'artist died before copyright cutoff', 
    'artist was born before 1800', 
    'assessed to be out of copyright', 
    'from style or period that ended prior to copyright cutoff',
    'inception prior to copyright cutoff'
]

# Options for filtering by image size
size_filter = 'pixsquared' # options: filetype, filesize, pixsquared
# NOTE: the next name is misspelled ("requrired"); left unchanged because it is a module-level name
requrired_filetype = 'tiff' # not implemented (yet)
minimum_filesize = 1000 # compared with the 'kilobytes' column of the image dimensions table
# NOTE: misspelled ("minimup"), but the upload loop reads it under exactly this name
minimup_pixel_squared = 1000000 # compared with height * width of the image

templated_institution = 'Vanderbilt University Fine Arts Gallery' # Name used in an existing Institution template
source_name = 'Vanderbilt University Fine Arts Gallery' # inserted in the "source" field of the Artwork template
category_strings = ['Vanderbilt University Fine Arts Gallery'] # Commons categories to be added to the image.
default_language = 'en' # language code used for the template description and the caption

# ------------------------
# function definitions
# ------------------------

# Utility functions

# function to get local name from an IRI
def extract_localname(iri):
    """Return the part of an IRI that follows its last '/'.

    For http://www.wikidata.org/entity/Q6386232 the result is Q6386232.
    A string without any '/' is returned unchanged.
    """
    return iri.rsplit('/', 1)[-1]

# read from a CSV file into a list of dictionaries
def read_dict(filename):
    """Load a UTF-8 CSV file and return its rows as a list of dictionaries.

    The header row supplies the dictionary keys; every value is a string.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))

# write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write the dictionaries in table to a UTF-8 CSV file.

    fieldnames gives the column headers and their order; the header row is
    always written, followed by one row per dictionary. An existing file is
    overwritten.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)
            
# Commons identifier/URL conversion functions
# There are three identifiers used in Commons:

# The most basic one is the filename, unencoded and with file extension.

# The Commons web page URL is formed from the filename by prepending a subpath and "File:", replacing spaces in the filename with _, and URL-encoding the file name string
# The reverse process may be lossy because it assumes that underscores should be turned into spaces, but the filename might actually contain underscores.

# The Wikidata IRI identifier for the image is formed from the filename by URL-encoding it and prepending a subpath and "Special:FilePath/"
# The reverse process is lossless since it simply reverses the URL-encoding of the local name part of the IRI.

# NOTE: the functions below need urllib.parse, which must be imported at the top of the file.
commons_prefix = 'http://commons.wikimedia.org/wiki/Special:FilePath/'
commons_page_prefix = 'https://commons.wikimedia.org/wiki/File:'

def commons_url_to_filename(url):
    """Convert a Special:FilePath IRI into the unencoded Commons filename.

    Raises IndexError if url does not contain commons_prefix.
    """
    # form of URL is: http://commons.wikimedia.org/wiki/Special:FilePath/Castle%20De%20Haar%20%281892-1913%29%20-%20360%C2%B0%20Panorama%20of%20Castle%20%26%20Castle%20Grounds.jpg
    # Split only once so that everything after the first occurrence of the prefix is kept
    string = url.split(commons_prefix, 1)[1] # get local name file part of URL
    return urllib.parse.unquote(string) # reverse URL-encode the string

def filename_to_commons_url(filename):
    """Convert a Commons filename into its Special:FilePath IRI (URL-encoded local name)."""
    return commons_prefix + urllib.parse.quote(filename)

def commons_page_url_to_filename(url):
    """Convert a Commons file page URL into a filename.

    Every underscore in the URL is turned into a space, so a filename that
    really contains underscores is not recovered exactly.
    Raises IndexError if url does not contain commons_page_prefix.
    """
    # form of URL is: https://commons.wikimedia.org/wiki/File:Castle_De_Haar_(1892-1913)_-_360%C2%B0_Panorama_of_Castle_%26_Castle_Grounds.jpg
    string = url.split(commons_page_prefix, 1)[1] # get local name file part of URL
    # Underscores are replaced before decoding, so an encoded underscore (%5F) survives as an underscore
    string = string.replace('_', ' ')
    return urllib.parse.unquote(string) # reverse URL-encode the string

def filename_to_commons_page_url(filename):
    """Convert a Commons filename into the URL of its file page.

    Spaces become underscores and the name is URL-encoded, except that
    parentheses and commas are left unencoded.
    """
    filename = filename.replace(' ', '_')
    # '/' is safe by default; parentheses and commas are additionally left as-is
    encoded_filename = urllib.parse.quote(filename, safe='/(),')
    return commons_page_prefix + encoded_filename


# Authentication functions

def login(path, relative_to_home):
    """Log in to the Wikimedia API and return what is needed for later writes.

    path: location of the credentials file.
    relative_to_home: if True, path is appended to the user's home directory;
        otherwise it is used as given.

    Returns a dictionary with keys 'session' (requests session carrying the
    login cookies and the bot User-Agent), 'csrftoken' and 'endpoint' (API URL).

    Raises RuntimeError if the API does not report a successful login, since
    a CSRF token obtained without logging in cannot be used to write as the bot.
    """
    if relative_to_home:
        home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
        full_credentials_path = home + '/' + path
    else:
        full_credentials_path = path
    credentials = retrieve_credentials(full_credentials_path)
    
    resource_url = '/w/api.php' # default API resource URL for all Wikimedia APIs
    endpoint_url = credentials['url'] + resource_url

    # Instantiate session
    session = requests.Session()
    # Set default User-Agent header so you don't have to send it with every request
    session.headers.update({'User-Agent': user_agent_string})

    # Go through the sequence of steps needed to get the CSRF token
    login_token = get_login_token(endpoint_url, session)
    data = session_login(endpoint_url, login_token, credentials['username'], credentials['password'], session)
    # The login response was previously ignored; stop here if the login did not succeed
    login_result = data.get('login', {}).get('result')
    if login_result != 'Success':
        raise RuntimeError('Login at ' + endpoint_url + ' failed: ' + str(data))
    csrf_token = get_csrf_token(endpoint_url, session)
    return {'session': session, 'csrftoken': csrf_token, 'endpoint': endpoint_url}

def retrieve_credentials(path):
    """Read the endpoint URL, username and password from a credentials file.

    The first three lines of the file must each have the form key=value, in
    the order endpoint URL, username, password. Only the text after the first
    '=' on each line is used, so the values themselves may contain '='.

    Returns a dictionary with keys 'url', 'username' and 'password'.
    Raises IndexError if one of the three lines is missing or has no '='.
    """
    with open(path, 'rt') as file_object:
        line_list = file_object.read().split('\n')
    # Split on the first '=' only; a password such as "abc=def" must not be truncated
    endpoint_url = line_list[0].split('=', 1)[1]
    username = line_list[1].split('=', 1)[1]
    password = line_list[2].split('=', 1)[1]
    #user_agent = line_list[3].split('=', 1)[1]
    credentials = {'url': endpoint_url, 'username': username, 'password': password}
    return credentials

def get_login_token(apiUrl, session):
    """Request a login token from the API at apiUrl using session and return it."""
    token_query = dict(action='query', meta='tokens', type='login', format='json')
    reply = session.get(url=apiUrl, params=token_query)
    return reply.json()['query']['tokens']['logintoken']

def session_login(apiUrl, token, username, password, session):
    """Post the login request for username/password with the given login token.

    Returns the decoded JSON response of the API; cookies set by the response
    stay with the session object.
    """
    login_form = dict(
        action='login',
        lgname=username,
        lgpassword=password,
        lgtoken=token,
        format='json',
    )
    return session.post(apiUrl, data=login_form).json()

def get_csrf_token(apiUrl, session):
    """Request the tokens of the current session from apiUrl and return the CSRF token."""
    token_query = dict(action='query', meta='tokens', format='json')
    reply = session.get(url=apiUrl, params=token_query)
    return reply.json()['query']['tokens']['csrftoken']

# Data upload functions

# API file Upload example: https://www.mediawiki.org/wiki/API:Upload#POST_request
# API Sandbox can be used to generate test JSON, but DO NOT RUN since it actually uploads.
# Specifically for uploads, see https://commons.wikimedia.org/wiki/Special:ApiSandbox#action=upload&filename=Wiki.png&url=http%3A//upload.wikimedia.org/wikipedia/en/b/bc/Wiki.png&token=123ABC
def upload_file_to_commons(image_filename, commons_filename, directory_path, relative_to_home, session, csrftoken, sleeptime, wikitext):
    """Upload a local media file to Commons and return the decoded API response.

    image_filename: name of the file as it exists locally.
    commons_filename: name the file is to have in Commons.
    directory_path: directory holding the local file, with trailing slash.
    relative_to_home: if True, directory_path is appended to the home directory.
    session, csrftoken: authenticated requests session and its CSRF token.
    sleeptime: seconds to wait after the request before returning.
    wikitext: text of the file page that is created with the upload.

    Warnings from the API are ignored ('ignorewarnings': 1). The response is
    returned as-is; the caller has to inspect it for errors.
    """
    if relative_to_home:
        home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
        directory_path = home + '/' + directory_path

    parameters = {
        'action': 'upload',
        'filename': commons_filename,
        'format': 'json',
        'token': csrftoken,
        'ignorewarnings': 1,
        'text': wikitext,
        # this is what generates the text in the Description box on user Uploads page and initial edit summary for page
        # See https://commons.wikimedia.org/wiki/Commons:First_steps/Quality_and_description#Upload_summary
        'comment': 'Uploaded media file and metadata via API'
    }
    file_path = directory_path + image_filename

    # Keep the file open only for the duration of the request so the handle is always closed,
    # even if the post raises (one handle per uploaded image was leaked before)
    with open(file_path, 'rb') as file_object:
        file_dict = {'file': (image_filename, file_object, 'multipart/form-data')}

        print('uploading', commons_filename) # This line is important for large TIFF files that will take a while to upload
        response = session.post('https://commons.wikimedia.org/w/api.php', files=file_dict, data = parameters)
    data = response.json()

    # for non-critical applications, do not hit the API rapidly
    sleep(sleeptime)
    return(data)

# Adding the image caption seems to be a hack that uses the Wikibase API command wbsetlabel.
# Captions are Wikibase labels (language specific), limit 255 characters length.
# See https://commons.wikimedia.org/wiki/Commons:File_captions#Technical
def set_commons_image_caption(image_filename, caption, caption_language, session, csrftoken, sleeptime):
    """Set the caption of a Commons file in one language and return the API response.

    image_filename: Commons filename without the "File:" prefix.
    caption, caption_language: caption text and its language code.
    session, csrftoken: authenticated requests session and its CSRF token.
    sleeptime: seconds to wait after the request before returning.
    """
    label_request = dict(
        action='wbsetlabel',
        format='json',
        token=csrftoken,
        site='commonswiki',
        title='File:' + image_filename,
        value=caption,
        language=caption_language,
        summary='Add caption via API',
    )
    api_reply = session.post('https://commons.wikimedia.org/w/api.php', data=label_request).json()

    # wait before handing control back so that successive writes are spaced out
    sleep(sleeptime)
    return api_reply

# This function is used in the following function, which needs a page ID rather than a name
def get_commons_image_pageid(image_filename):
    """Look up the Commons page ID of a file page and return it as a string.

    image_filename: Commons filename without the "File:" prefix.

    The ID is the key under which the API lists the page, so it is a string.
    NOTE(review): for a file that does not exist the API presumably lists the
    page under a negative placeholder key, which would be returned unchanged
    -- confirm and handle in the caller if needed.
    """
    # get metadata for a photo including from file page
    params = {
        'action': 'query',
        'format': 'json',
        'titles': 'File:' + image_filename,
        'prop': 'info'
    }

    # This read does not go through the logged-in session, so the bot's User-Agent has to be
    # sent explicitly; otherwise the request carries only the default requests User-Agent.
    response = requests.get('https://commons.wikimedia.org/w/api.php', params=params, headers={'User-Agent': user_agent_string})
    data = response.json()
    #print(json.dumps(data, indent=2))
    page_dict = data['query']['pages'] # this value is a dict that has the page IDs as keys
    page_id = next(iter(page_dict)) # info on only one page was requested, so take the first (only) key
    #print('Page ID:',page_id)
    
    # Don't think I need to add a sleep time for API reads, which are less resource-intensive
    # than write operations
    return page_id

# Code comes from writeStatement() function at https://github.com/HeardLibrary/digital-scholarship/blob/master/code/wikibase/api/load_csv.py
# Described in this blog post: http://baskauf.blogspot.com/2019/06/putting-data-into-wikidata-using.html
def create_commons_claim(image_filename, property_p_id, value_q_id, session, csrftoken, sleeptime):
    """Add an item-valued structured data statement to a Commons file.

    image_filename: Commons filename without the "File:" prefix.
    property_p_id: property of the statement, e.g. 'P180' (depicts).
    value_q_id: Q ID of the item that is the value, e.g. 'Q384177'.
    session, csrftoken: authenticated requests session and its CSRF token.
    sleeptime: seconds to wait after the request before returning.

    Returns the decoded JSON response of the API.
    """
    # The entity ID of a file is "M" followed by the page ID of its file page
    media_entity_id = 'M' + get_commons_image_pageid(image_filename)

    # note: the value of 'value' is a JSON string, not an actual data structure.
    # It will get URL encoded by requests before posting.
    # The number sent is the Q ID without its initial "Q" (still a string).
    claim_value = json.dumps({'entity-type': 'item', 'numeric-id': value_q_id[1:]})

    claim_request = dict(
        action='wbcreateclaim',
        format='json',
        token=csrftoken,
        entity=media_entity_id,
        snaktype='value',
        property=property_p_id,
        value=claim_value,
        summary='Add structured data via API',
    )

    api_reply = session.post('https://commons.wikimedia.org/w/api.php', data=claim_request).json()

    sleep(sleeptime)
    return api_reply

# Create the date_string for various uncertainty situations
def _trim_date_to_precision(date_val, precision):
    """Cut a date string down to the part that is meaningful at the given precision."""
    if precision == '11': # precise to day
        return date_val[:10]
    if precision == '10': # precise to month
        return date_val[:7]
    # return the year, regardless of whether precision is year, decade, century, etc.
    return date_val[:4]

def create_template_date_string(inception_val, inception_prec, inception_earliest_date_val, inception_earliest_date_prec, inception_latest_date_val, inception_latest_date_prec, inception_sourcing_circumstances):
    """Build the value of the Artwork template's date field.

    All arguments are strings; an empty string means "no value". The result is
    an empty string when there is no inception date, a plain truncated date
    for a single date, or an {{other date|...}} template for circa dates and
    date ranges. See https://commons.wikimedia.org/wiki/Template:Other_date
    for formatting information.
    """
    # Return nothing if there is no inception date
    if inception_val == '':
        return ''

    is_circa = inception_sourcing_circumstances == 'Q5727902' # Q ID for circa

    # No date range: a single date, possibly flagged as circa
    if inception_earliest_date_val == '':
        # No VU gallery works have precisions > 9, but it's handled just in case
        single_date = _trim_date_to_precision(inception_val, inception_prec)
        if is_circa:
            return '{{other date|circa|' + single_date + '}}'
        return single_date

    # Date range: both ends are truncated according to their own precision
    early_date_string = _trim_date_to_precision(inception_earliest_date_val, inception_earliest_date_prec)
    late_date_string = _trim_date_to_precision(inception_latest_date_val, inception_latest_date_prec)
    range_kind = 'circa' if is_circa else 'between'
    return '{{other date|' + range_kind + '|' + early_date_string + '|' + late_date_string + '}}'
            

# Insert metadata into the Commons Artwork template
def create_commons_template(work_qid, label, description, description_language, artist_qid, date_string, width, height, source_name, templated_institution, reference_url, reference_iso_date_string, notes, medium, category_strings):
    """Return the wikitext of a Commons file page for an artwork.

    The page consists of a filled-in Artwork template, a PD-Art license
    section and one category link per entry of category_strings. All other
    arguments are strings that are inserted into the template unchanged,
    except reference_iso_date_string, whose first 10 characters must be an
    ISO date (YYYY-MM-DD); it is rendered like "5 March 2021".

    Raises ValueError if the reference date cannot be parsed.
    """
    # Convert the dateTime formatted string to European style date with month word.
    # The day is written without a leading zero.
    retrieved = datetime.datetime.fromisoformat(reference_iso_date_string[:10])
    reference_date = str(retrieved.day) + retrieved.strftime(' %B %Y')

    # One entry per line of the page; the entries are joined with newlines below.
    # The two empty entries at the end produce the blank line after the license template.
    template_lines = [
        '',
        '=={{int:filedesc}}==',
        '{{Artwork',
        ' |artist             = {{ Creator | Wikidata = ' + artist_qid + ' | Option = {{{1|}}} }}',
        " |title              = {{en|'''" + label + "'''.}}",
        ' |description        = {{' + description_language + '|1=' + description + '}}',
        ' |depicted people    =',
        ' |depicted place     =',
        ' |date               = ' + date_string,
        ' |medium             = ' + medium,
        ' |dimensions         = {{Size|in|' + width + '|' + height + '}}',
        ' |institution        = {{Institution:' + templated_institution + '}}',
        ' |department         =',
        ' |accession number   = ',
        ' |place of creation  = ',
        ' |place of discovery =',
        ' |object history     =',
        ' |exhibition history =',
        ' |credit line        =',
        ' |inscriptions       =',
        ' |notes              = ' + notes,
        ' |references         = {{cite web |title=' + label + ' |url=' + reference_url + ' |accessdate=' + reference_date + '}}',
        ' |source             = ' + source_name,
        ' |permission         =',
        ' |other_versions     =',
        ' |wikidata           = ' + work_qid,
        ' |other_fields       =',
        '}}',
        '',
        '=={{int:license-header}}==',
        '{{PD-Art|PD-old-100-expired}}',
        '',
        '',
    ]

    # Add all of the categories in the list, one per line
    category_lines = ['[[Category:' + category_string + ']]\n' for category_string in category_strings]

    return '\n'.join(template_lines) + ''.join(category_lines)

# ---------------
# Not used yet
# ---------------
# borrowed from VanderBot

# This function attempts to post and handles maxlag errors
def attemptPost(apiUrl, parameters, session=None):
    """Post to the API, backing off and retrying while the server reports maxlag.

    apiUrl: URL of the API endpoint.
    parameters: dictionary of POST parameters.
    session: requests session to post with. If omitted, the module-level
        commons_session created by the main script is used, so a login must
        have been performed first.

    Returns the decoded JSON response as soon as it is anything other than a
    maxlag error (success or a different error; the caller must check).
    Ends the script with SystemExit if the server is still lagged after the
    initial attempt plus maxRetries retries.
    """
    if session is None:
        # The function used to rely on a global named "session", which nothing in this
        # script defines; fall back to the session set up after login instead.
        session = commons_session
    maxRetries = 10
    delayLimit = 300
    retry = 0
    # maximum number of times to retry lagged server = maxRetries
    while retry <= maxRetries:
        if retry > 0:
            print('retry:', retry)
        r = session.post(apiUrl, data = parameters)
        print(r.text)
        data = r.json()

        # check if response is a maxlag error
        # see https://www.mediawiki.org/wiki/Manual:Maxlag_parameter
        error = data.get('error') if isinstance(data, dict) else None
        if not isinstance(error, dict) or error.get('code') != 'maxlag':
            # either the post was successful (no error key) or an error code
            # other than maxlag was returned; in both cases hand the response back
            return data

        print('Lag of ', error.get('lag'), ' seconds.')
        # the delay recommended in the Retry-After header is basically useless,
        # so double the delay with each retry instead, up to delayLimit
        recommendedDelay = min(commons_sleep*2**retry, delayLimit)
        if retry != maxRetries:
            print('Waiting ', recommendedDelay , ' seconds.')
            print()
            sleep(recommendedDelay)
        retry += 1
        # here's where execution goes after the delay
    # here's where execution goes after maxRetries tries
    print('Failed after ' + str(maxRetries) + ' retries.')
    sys.exit() # just abort the script

In [None]:
# ---------------------------
# Body of main script
# ---------------------------

# This section contains configuration information and performs necessary logins
# It needs to be run once before the rest of the code
# No writing is done, so it's "safe" to run any time

# Disabled maxlag setting, kept for reference:
#
# Set the value of the maxlag parameter to back off when the server is lagged
# see https://www.mediawiki.org/wiki/Manual:Maxlag_parameter
# The recommended value is 5 seconds.
# To not use maxlag, set the value to 0
# To test the maxlag handler code, set maxlag to a very low number like .1
#
# NOTE: as of 2020-04-27, the function that needs maxlag isn't being used for anything, so this value doesn't matter
# maxlag = 5

# This section needs to be run prior to running any code that interacts with the Commons API
# It generates the CSRF token required to post to the API on behalf of the user whose username and pwd are being used

# This is the format of the credentials file.
# Username and password are for a bot that you've created.
#
# endpointUrl=https://test.wikidata.org
# username=User@bot
# password=465jli90dslhgoiuhsaoi9s0sj5ki3lo

# ---------------------------
# Load data from CSVs into DataFrames
# ---------------------------

# Every table is read with all cells as strings and with empty cells as empty strings (no NaN values).

# Note: setting the index to be the Q ID requires that qid has a unique value for each row. This should be the case.
works_metadata = pd.read_csv('../works_multiprop.csv', na_filter=False, dtype=str).set_index('qid')

raw_metadata = pd.read_csv('../gallery_works_renamed1.csv', na_filter=False, dtype=str).set_index('accession_number')

# The size columns are converted to integers so that they can be compared numerically
image_dimensions = pd.read_csv('image_dimensions.csv', na_filter=False, dtype=str)
image_dimensions = image_dimensions.astype({'kilobytes': int, 'height': int, 'width': int})

works_classification = pd.read_csv('../../gallery_buchanan/works_classification.csv', na_filter=False, dtype=str).set_index('qid')

works_ip_status = pd.read_csv('../items_status_abbrev.csv', na_filter=False, dtype=str).set_index('qid')

existing_images = pd.read_csv('commons_images.csv', na_filter=False, dtype=str).set_index('qid')

# For testing purposes, just use the first few rows of the works metadata
test_rows = 60
works_metadata = works_metadata.head(test_rows).copy()

# ---------------------------
# Commons API Post Authentication (create session and generate CSRF token)
# ---------------------------

# If credentials file location is relative to current working directory, use subfolders through file name with no leading slash
# Example: myproj/credentials/commons_credentials.txt
# If credentials file is in current working directory, only filename is necessary
# Need to give example for absolute path on Windows - use Unix forward slashes?
path = 'commons_credentials.txt'
# set to True if relative to home directory, False if absolute path or relative to working directory
path_is_relative_to_home_directory = True
result = login(path, path_is_relative_to_home_directory)
# print(result)
# Commons API endpoint URL is in result['endpoint'], but it is going to be hard coded anyway, so ignore
commons_session = result['session']
commons_csrf_token = result['csrftoken']

print('done')

In [None]:
# ---------------------------
# Screen works for appropriate images to upload
# ---------------------------

# Each work that passes all of the screens below is uploaded to Commons together with its file page wikitext.
# NOTE: the names set inside the loop (commons_filename, description, work_qid, ...) keep the values of the
# last uploaded work after the loop ends.

# The row index is the Q ID and is a string. The work object is the data in the row and is a Pandas series
# The items in the row series can be referred to by their labels, which are the column headers, e.g. work['label_en']
for index, work in works_metadata.iterrows():    
    # Screen out images that are already in Commons
    if index in existing_images.index:
        continue
    
    # Screen for 2 dimensional works
    if index in works_classification.index:
        # Find the row whose index matches the Q ID of the work, then the item by name within the series (dimension)
        # Note: this method of location works because the Q ID index is unique for each row in the lookup table.
        if works_classification.loc[index, 'dimension'] != '2D': # skip this work if not 2D
            continue

    # Screen for public domain images
    # NOTE: the IP status was only done for cases where the script was able to match up image file names with accession numbers.
    # It should be done again to pick up more images based on the new image_dimensions.csv file after it's cleaned up.
    # There are at least a thousand works that will get screened out here because they aren't imaged.
    if index not in works_ip_status.index:
        continue
    ip_status = works_ip_status.loc[index, 'status']
    if ip_status not in public_domain_categories:
        continue
    
    # Screen for high resolution images
    image_dimension_frame = image_dimensions.loc[image_dimensions.accession == work['inventory_number']] # result is DataFrame
    if len(image_dimension_frame) == 0: # skip any works whose image can't be found in the dimensions data
        continue

    # Order rows by size in kB, then take the first row
    image_dimension_series = image_dimension_frame.sort_values(by=['kilobytes'], ascending=False).iloc[0]
    # Skip work if its image doesn't meet the minimum size requirement
    if size_filter == 'pixsquared':
        if image_dimension_series['height'] * image_dimension_series['width'] < minimup_pixel_squared:
            continue
    elif size_filter == 'filesize':
        if image_dimension_series['kilobytes'] < minimum_filesize:
            continue
    else: # don't apply a size filter
        pass

    # Create inception date string for template from VanderBot upload CSV
    inception_val = work['inception_val']
    inception_prec = work['inception_prec']
    inception_earliest_date_val = work['inception_earliest_date_val']
    inception_earliest_date_prec = work['inception_earliest_date_prec']
    inception_latest_date_val = work['inception_latest_date_val']
    inception_latest_date_prec = work['inception_latest_date_prec']
    inception_sourcing_circumstances = work['inception_sourcing_circumstances']
    date_string = create_template_date_string(inception_val, inception_prec, inception_earliest_date_val, inception_earliest_date_prec, inception_latest_date_val, inception_latest_date_prec, inception_sourcing_circumstances)

    # Get raw string data directly from the Artstor download
    try:
        raw_series = raw_metadata.loc[work['inventory_number']]
        gift = raw_series['gift_of']
        if gift != '':
            notes = 'Gift of ' + gift
        else:
            notes = ''
        medium = raw_series['medium']
    except (KeyError, ValueError):
        # KeyError: the accession number (or column) is not in the raw metadata.
        # ValueError: the accession number matched several rows, so the comparison above is ambiguous.
        # In both cases go on without notes and medium; any other error is no longer hidden.
        notes = ''
        medium = ''

    # Get the remaining metadata from the VanderBot upload CSV
    work_qid = index
    label = work['label_en']
    wikidata_description = work['description_en']
    artist_qid = work['creator']
    description = label + ', ' + wikidata_description
    description_language = default_language
    width = work['width_val']
    height = work['height_val']
    reference_url = work['inventory_number_ref1_referenceUrl']
    reference_date = work['inventory_number_ref1_retrieved_val']

    page_wikitext = create_commons_template(work_qid, label, description, description_language, artist_qid, date_string, width, height, source_name, templated_institution, reference_url, reference_date, notes, medium, category_strings)
    print(page_wikitext)

    # The local_filename is the name of the file as it exists locally.
    local_filename = image_dimension_series['name']

    # subdirectory is the directory that contains the local file. It's within the local_image_directory_path. 
    # Don't include a trailing slash.
    # If images are directly in the directory_path, use empty string ('') as the value.
    subdirectory = image_dimension_series['subdir']

    # filename_institution is the name of the institution to be inserted between the descriptive text and the local filename
    filename_institution = 'Vanderbilt Fine Arts Gallery'

    # file_prefix is descriptive text to be prepended to the local_filename, to be used when the file is in Commons
    # Commons filename length limit is 240 bytes. To be safe, limit to 230.
    byte_limit = 230 - len((' - ' + filename_institution + ' - ' + local_filename).encode("utf8"))
    if len(label.encode("utf8")) < byte_limit:
        file_prefix = label
    else:
        # The cut may fall in the middle of a multi-byte character; drop the incomplete
        # character instead of failing with a UnicodeDecodeError
        file_prefix = label.encode("utf8")[:byte_limit].decode('utf8', errors='ignore')

    # Set image_filename to the raw filename (can include spaces). The API will substitute underscores as it likes.
    # For file naming conventions, see: https://commons.wikimedia.org/wiki/Commons:File_naming

    commons_filename = file_prefix + ' - ' + filename_institution + ' - ' + local_filename

    # Add the subdirectory (if any) to the path
    if subdirectory != '':
        full_path = local_image_directory_path + subdirectory + '/'
    else:
        full_path = local_image_directory_path

    # Uploads are made in a loop here, so the delay between API writes must be applied
    # (zero is only appropriate when a single upload is run manually)
    sleeptime = commons_sleep
    data = upload_file_to_commons(local_filename, commons_filename, full_path, path_is_relative_to_home_directory, commons_session, commons_csrf_token, sleeptime, page_wikitext)

    #response = commons_session.post(endpointUrl, files=file_dict, data = parameters)
    #data = response.json()
    print(json.dumps(data, indent=2))


In [None]:
# ----------------
# Set the image caption
# ----------------

# This has to be done in an API call separate from the upload 
# since the caption is a Wikibase label and not part of the Wikitext

# NOTE: this cell reads commons_filename and description as they were left by the upload loop,
# i.e. it sets the caption only for the last work that reached the upload step.

# The string below is not executed; it only records the request that set_commons_image_caption() builds.
'''
parameters = {
    'action': 'wbsetlabel',
    'format': 'json',
    'token': commons_csrf_token,
    'site': 'commonswiki',
    'title': 'File:' + image_filename,
    'value': caption,
    'language': caption_language,
    'summary': 'Add caption via API'
}
'''
#print(json.dumps(parameters, indent = 2))

sleeptime = 0 # use zero if running individual cells manually, use commons_sleep if code in single script or looping

# The caption does not have to be the same as the description, but for convenience, I'm making them the same
# NOTE(review): captions are limited to 255 characters (see set_commons_image_caption); the description
# is not shortened here -- confirm that it always fits.
caption = description
caption_language = default_language

data = set_commons_image_caption(commons_filename, caption, caption_language, commons_session, commons_csrf_token, sleeptime)

#response = commons_session.post(endpointUrl, data = parameters)
#data = response.json()
# Show the API response so that errors can be spotted
print(json.dumps(data, indent=2))

#sleep(commons_sleep)



In [None]:
# ----------------
# Add structured data
# ----------------

# Intro on structured data: https://commons.wikimedia.org/wiki/Commons:Structured_data
# See also this on GLAM https://commons.wikimedia.org/wiki/Commons:Structured_data/GLAM

# NOTE: artworks will get flagged if they don't have P6243 in their structured data
# non-public domain works get flagged if they don't have a P275 license statement in structured data

# NOTE: this cell reads commons_filename and work_qid as they were left by the upload loop,
# i.e. it adds the statement only for the last work that reached the upload step.

property_p_id = 'P6243' # digital representation of
value_q_id = work_qid # the artwork in Wikidata

sleeptime = 0 # use zero if running individual cells manually, use commons_sleep if code in single script or looping
data = create_commons_claim(commons_filename, property_p_id, value_q_id, commons_session, commons_csrf_token, sleeptime)
# The string below is not executed; it only records the request that create_commons_claim() builds.
'''
wikibase_subject_id = 'M' + get_commons_image_pageid(image_filename)
property_p_id = 'P180' # depicts
value_q_id = 'Q384177' # Egyptian Revival (architecture)

stripped_q_number = value_q_id[1:len(value_q_id)] # remove initial "Q" from object string
value_dictionary = {
    'entity-type': 'item',
    'numeric-id': stripped_q_number
}
value_json_string = json.dumps(value_dictionary)

parameters = {
    'action':'wbcreateclaim',
    'format':'json',
    'token': commons_csrf_token,
    'entity': wikibase_subject_id,
    'snaktype':'value',
    'property': property_p_id,
    # note: the value is a JSON string, not an actual data structure.  I think it will get URL encoded by requests before posting
    'value': value_json_string,
    'summary': 'Add depicts value structured data via API'
}

#print(json.dumps(parameters, indent = 2))
response = commons_session.post(endpointUrl, data = parameters)
data = response.json()
'''
# Show the API response so that errors can be spotted
print(json.dumps(data, indent=2))

#sleep(commons_sleep)


# Code development cells

Don't run these cells; they are kept only for historical reference.

In [None]:
# Call get_commons_image_pageid() for one existing Commons filename.
# The result is not assigned, so in a notebook it only appears as the cell's output.
image_filename = 'Adoration of the Sheperds - Vanderbilt Fine Arts Gallery - 1979.0264P.tif'
get_commons_image_pageid(image_filename)

In [None]:
# path where image file is located with trailing forward slash
# relative to working directory or an absolute path if path_is_relative_to_home_directory = False
# relative to home directory if path_is_relative_to_home_directory = True
directory_path = 'gallery_digital_image_archive/'
path_is_relative_to_home_directory = True

# The local_filename is the name of the file as it exists locally.
local_filename = '1984.021.tif'

# subdirectory is the directory that contains the local file. It's within the directory_path.
# Don't include a trailing slash.
# If images are directly in the directory_path, use empty string ('') as the value.
subdirectory = '1984'

# file_prefix is descriptive text to be prepended to the local_filename, to be used when the file is in Commons
file_prefix = 'A Conversation with Guido di Brettinoro'

# filename_institution is the name of the institution to be inserted between the descriptive text and the local filename
filename_institution = 'Vanderbilt Fine Arts Gallery'

# commons_filename is the raw filename to use in Commons (can include spaces). The API will substitute underscores as it likes.
# It is assembled as: file_prefix - filename_institution - local_filename
# For file naming conventions, see: https://commons.wikimedia.org/wiki/Commons:File_naming

commons_filename = file_prefix + ' - ' + filename_institution + ' - ' + local_filename

# The caption has to be set in a separate operation from the file upload
# But it's set here so that its text can be used for the description
# Captions must be less than 255 characters.
# There can be multiple captions in different languages, but only one per language.
caption = "A Conversation with Guido di Brettinoro, a print by John Flaxman from Illustrations to Dante's Divine Comedy"
caption_language = 'en'

# The description doesn't have to be the same as the caption.
# It can be much longer and contain Wiki formatting, such as links.
description = caption
description_language = 'en'

# Category names (without the "Category:" prefix); one category link per entry is appended to the page wikitext
category_strings = ['Vanderbilt University Fine Arts Gallery']

# The values below are inserted into the fields of the Artwork template that is built later in this cell.
artist_qid = 'Q366066' # Wikidata Q ID placed in the Creator template of the artist field
# label is used for both the title field and the title of the cited reference
label = "A Conversation with Guido di Brettinoro, (Purgatorio, Canto 14) from Illustrations to Dante's Divine Comedy"
date_string = '1807'
# width and height are strings placed in a {{Size|in|...}} template, i.e. they are given in inches
width = '13'
height = '9'
templated_institution = 'Vanderbilt University Fine Arts Gallery' # name used in the {{Institution:...}} template
notes = 'Gift of Thomas B. Brumbaugh'
# URL and access date used in the {{cite web}} template of the references field
reference_url = 'https://library.artstor.org/#/asset/26755766'
reference_date = '2 December 2020'
source_name = 'Vanderbilt University Fine Arts Gallery'
work_qid = 'Q102961225' # Wikidata Q ID of the artwork, placed in the wikidata field

# ---------------------------
# Upload a local file to Commons and set basic metadata
# ---------------------------

# The 'text' parameter value provides the required file information that shows up in new sections.

# Guidelines for providing information using the Information template: https://commons.wikimedia.org/wiki/Template:Information
# Note special template for artwork having more extensive metadata: https://commons.wikimedia.org/wiki/Template:Artwork
# Historical photographs (e.g. museums) https://commons.wikimedia.org/wiki/Template:Photograph
# Art photo template adds to artwork template https://commons.wikimedia.org/wiki/Template:Art_Photo
# Credit line template provides attribution text required for CC BY licenses https://commons.wikimedia.org/wiki/Template:Credit_line

# Wiki text based on Artwork template:
# Note: the upload call is hardcoded to use 'page_wikitext', so the variable names need to be swapped
# depending on whether this template or the Information template below is to be used.

# See https://commons.wikimedia.org/wiki/Commons:When_to_use_the_PD-Art_tag for info about tagging artwork as Public Domain
# See also https://commons.wikimedia.org/wiki/Commons:Licensing#Material_in_the_public_domain
# See also https://commons.wikimedia.org/wiki/Commons:Copyright_tags/Country-specific_tags#United_States_of_America

# NOTE: when art template is used and a Wikidata Q ID is given,
# the page will pick up the Object type and Place of creation automatically from Wikidata, so values don't need to be provided.
# Not sure if that's a result of the wikidata field here or providing the P6243 (digital representation) value in Structured data.
# Other stuff like accession number and inscriptions also get picked up

# The page text is assembled by plain string concatenation. The doubled braces are literal wiki template
# delimiters, and the string ends with a blank line so that category links can be appended after it.
# artwork_description_wikitext = '''
page_wikitext = '''
=={{int:filedesc}}==
{{Artwork
 |artist             = {{ Creator | Wikidata = ''' + artist_qid + ''' | Option = {{{1|}}} }}
 |title              = ''' + "{{en|'''" + label + "'''.}}" + '''
 |description        = {{''' + description_language + '''|1=''' + description + '''}}
 |depicted people    =
 |depicted place     =
 |date               = ''' + date_string + '''
 |medium             =
 |dimensions         = {{Size|in|''' + width + '|' + height + '''}}
 |institution        = {{Institution:''' + templated_institution + '''}}
 |department         =
 |accession number   = 
 |place of creation  = 
 |place of discovery =
 |object history     =
 |exhibition history =
 |credit line        =
 |inscriptions       =
 |notes              = ''' + notes + '''
 |references         = {{cite web |title=''' + label + ' |url=' + reference_url + ' |accessdate=' + reference_date + '''}}
 |source             = ''' + source_name + '''
 |permission         =
 |other_versions     =
 |wikidata           = ''' + work_qid + '''
 |other_fields       =
}}

=={{int:license-header}}==
{{PD-Art|PD-old-100-expired}}

'''

# Here's what I used for a basic description template (the "Information" template).
# It is built for reference only: nothing later in this cell reads basic_description_wikitext.
basic_description_wikitext = '''
=={{int:filedesc}}==
{{Information
 |description={{''' + description_language + '''|1=''' + description + '''}}
 |date=2020-12-27 10:00:00
 |source={{own}}
 |author=[[User:Baskaufs|Steven J. Baskauf]]
 |permission=
 |other versions=
 |other_fields={{Credit line 
  |Author = © Steven J. Baskauf
  |Other = Wikimedia Commons
  |License = [https://creativecommons.org/licenses/by/4.0/ CC BY 4.0]}}}}

{{Location|41.30941681442741|-72.92922321706307}}

=={{int:license-header}}==
{{self|cc-by-4.0}}

'''
# Note: trailing blank line assumes that categories will be appended, see loop below.

# The string literal below is never executed. It records the request parameters of an inline
# form of the upload call and is kept for reference only.
'''
parameters = {
    'action': 'upload',
    'filename': image_filename,
    'format': 'json',
    'token': commons_csrf_token,
    'ignorewarnings': 1,
    'text': page_wikitext,
    # this is what generates the text in the Description box on user Uploads page and initial edit summary for page
    # See https://commons.wikimedia.org/wiki/Commons:First_steps/Quality_and_description#Upload_summary
    'comment': 'Uploaded image file and metadata via API'
}
'''

# Add all of the categories in the list, one category link per line.
# The line break is written as an explicit '\n' so that no source-code indentation ends up in the wikitext
# (in wiki markup, a line that starts with a space is treated as preformatted text).
for category_string in category_strings:
    page_wikitext += '[[Category:' + category_string + ']]\n'

# Add the subdirectory (if any) to the path
if subdirectory != '':
    full_path = directory_path + subdirectory + '/'
else:
    full_path = directory_path

sleeptime = 0 # use zero if running individual cells manually, use commons_sleep if code in single script or looping
# Upload the local file under its Commons filename, creating the file page with page_wikitext as its text.
data = upload_file_to_commons(local_filename, commons_filename, full_path, path_is_relative_to_home_directory, commons_session, commons_csrf_token, sleeptime, page_wikitext)

# Disabled inline form of the request (the helper call above is what actually runs):
#response = commons_session.post(endpointUrl, files=file_dict, data = parameters)
#data = response.json()
# Show the value returned by upload_file_to_commons() as indented JSON.
print(json.dumps(data, indent=2))

# Most page info can be set when the page is created as above.
# To edit an existing page, the edit action must be used
# API information on edit action: https://commons.wikimedia.org/w/api.php?action=help&modules=edit

In [None]:

# The local_filename is the name of the file as it exists locally.
# NOTE(review): image_dimension_series is not assigned in this cell; presumably it is a record of image
# metadata with 'name' and 'subdir' entries that was created elsewhere in the notebook -- confirm.
local_filename = image_dimension_series['name']

# subdirectory is the directory that contains the local file. It's within the local_image_directory_path.
# Don't include a trailing slash.
# If images are directly in the local_image_directory_path, use empty string ('') as the value.
subdirectory = image_dimension_series['subdir']

# file_prefix is descriptive text to be prepended to the local_filename, to be used when the file is in Commons
file_prefix = 'Chinese Winter Landscape'

# filename_institution is the name of the institution to be inserted between the descriptive text and the local filename
filename_institution = 'Vanderbilt Fine Arts Gallery'

# commons_filename is the raw filename to use in Commons (can include spaces). The API will substitute underscores as it likes.
# For file naming conventions, see: https://commons.wikimedia.org/wiki/Commons:File_naming

commons_filename = file_prefix + ' - ' + filename_institution + ' - ' + local_filename


# Add the subdirectory (if any) to the path
# NOTE(review): local_image_directory_path is not assigned in this cell; the concatenation below only
# works if it ends with a slash -- confirm where it is set.
if subdirectory != '':
    full_path = local_image_directory_path + subdirectory + '/'
else:
    full_path = local_image_directory_path

sleeptime = 0 # use zero if running individual cells manually, use commons_sleep if code in single script or looping
# NOTE: page_wikitext and path_is_relative_to_home_directory are not assigned in this cell, so the call
# uses whatever values they currently hold in the notebook session.
data = upload_file_to_commons(local_filename, commons_filename, full_path, path_is_relative_to_home_directory, commons_session, commons_csrf_token, sleeptime, page_wikitext)

# Disabled inline form of the request (the helper call above is what actually runs):
#response = commons_session.post(endpointUrl, files=file_dict, data = parameters)
#data = response.json()
# Show the value returned by upload_file_to_commons() as indented JSON.
print(json.dumps(data, indent=2))


In [None]:
# Look at https://commons.wikimedia.org/wiki/File:USS_Arizona_afloat_after_launch_NARA_19-LC-19A-24.tif
# to see how they linked to their collection in Wikidata and also how they did the Record ID
# They seem to be using their own NARA template

# Categorization: according to https://commons.wikimedia.org/wiki/Commons:Bots#Bot_accounts all uploads are expected to apply at least one category



# Linking
# Wikilinks from the Commons: https://en.wikipedia.org/wiki/Wikipedia:Wikilinks_from_the_Commons


# Specific script to upload Learn Wikidata videos

In [None]:
import pandas as pd

# Link text for each language code, used to label the links to the other-language versions of a video
language_dict = {'en': 'in English', 'es': 'en español', 'zh-Hans': '汉语'}

# Category
category_strings = ['Wikidata videos', 'Learn Wikidata videos by the Vanderbilt Libraries']

# path where image file is located with trailing forward slash
# relative to working directory or an absolute path if path_is_relative_to_home_directory = False
# relative to home directory if path_is_relative_to_home_directory = True
directory_path = 'Documents/video_conversion/webm/'
path_is_relative_to_home_directory = True
sleeptime = commons_sleep # use zero if running individual cells manually, use commons_sleep if code in single script or looping

# Index of the first row of the metadata table to process; rows before it are skipped.
# Use 0 to process every row.
first_item_index = 12


def print_api_status(label, response):
    """Print label followed by 'Success' or 'Failed' for an API response dictionary.

    The response counts as successful only when its 'success' value is 1. A response without
    that key is reported as 'Failed' instead of raising a KeyError.
    """
    if response.get('success') == 1:
        status_message = 'Success'
    else:
        status_message = 'Failed'
    print(label, status_message)


filename = 'upload_metadata.csv'
media_items_list = read_dict(filename)
for media_item in media_items_list[first_item_index:]:

    # image_filename is the raw filename (can include spaces). The API will substitute underscores as it likes.
    # For file naming conventions, see: https://commons.wikimedia.org/wiki/Commons:File_naming

    image_filename = media_item['filename']
    print(image_filename)
    print()

    # The caption has to be set in a separate operation from the file upload
    # But it's set here so that its text can be used for the description
    # Captions must be less than 255 characters.
    # There can be multiple captions in different languages, but only one per language.
    caption = media_item['caption']
    caption_language = media_item['language'].lower()

    # The description doesn't have to be the same as the caption.
    # It can be much longer and contain Wiki formatting, such as links.
    description = media_item['description']
    description_language = media_item['language'].lower()
    iso_date = str(pd.to_datetime(media_item['date']))
    #print(iso_date)

    # ---------------------------
    # Upload a local file to Commons and set basic metadata
    # ---------------------------

    # The language code is lowercased for the description template, but used as given in the source URL.
    #basic_description_wikitext = '''
    page_wikitext = '''
=={{int:filedesc}}==
{{Information
|description={{''' + description_language + '''|1=''' + description + '''}}
|date=''' + iso_date + '''
|source=[https://www.learnwikidata.net/?''' + media_item['language'] + ''' Learn Wikidata website]
|author=Vanderbilt University
|permission=
|other versions= <gallery>
File:''' + media_item['other_versions1'] + '|[[:File:' + media_item['other_versions1'] + '|' + language_dict[media_item['other_versions1_language']] + ''']]
File:''' + media_item['other_versions2'] + '|[[:File:' + media_item['other_versions2'] + '|' + language_dict[media_item['other_versions2_language']] + ''']]
</gallery>
|other_fields={{Credit line 
 |Author = © Vanderbilt University
 |Other = Wikimedia Commons
 |License = [https://creativecommons.org/licenses/by/4.0/ CC BY 4.0]}}}}

=={{int:license-header}}==
{{self|cc-by-4.0}}

'''
    # Note: trailing blank line assumes that categories will be appended, see loop below.

    # Add all of the categories in the list, one category link per line
    for category_string in category_strings:
        page_wikitext += '[[Category:' + category_string + ']]\n'
    # print(page_wikitext)

    # The other calls to upload_file_to_commons() in this notebook pass the local filename, the Commons
    # filename and the directory path as the first three arguments. The videos keep their local name in
    # Commons, so image_filename is passed for both.
    # NOTE(review): argument order is taken from those other calls -- confirm against the function definition.
    data = upload_file_to_commons(image_filename, image_filename, directory_path, path_is_relative_to_home_directory, commons_session, commons_csrf_token, sleeptime, page_wikitext)
    #print(json.dumps(data, indent=2))
    upload_result = data.get('upload', {}).get('result')
    if upload_result is None:
        # Stop the batch rather than attempt the caption and claims for a file with no upload result,
        # and show what came back so the cause can be diagnosed.
        raise RuntimeError('Upload of ' + image_filename + ' returned no result: ' + str(data))
    print('Upload:', upload_result)

    # ----------------
    # Set the image caption
    # ----------------

    # This has to be done in an API call separate from the upload
    # since the caption is a Wikibase label and not part of the Wikitext

    data = set_commons_image_caption(image_filename, caption, caption_language, commons_session, commons_csrf_token, sleeptime)
    #print(json.dumps(data, indent=2))
    print_api_status('Caption:', data)

    # ----------------
    # Add structured data
    # ----------------

    property_p_id = 'P275' # copyright license
    value_q_id = 'Q20007257' # Creative Commons Attribution 4.0 International
    data = create_commons_claim(image_filename, property_p_id, value_q_id, commons_session, commons_csrf_token, sleeptime)
    #print(json.dumps(data, indent=2))
    print_api_status('License claim:', data)

    property_p_id = 'P6216' # copyright status
    value_q_id = 'Q50423863' # copyrighted
    data = create_commons_claim(image_filename, property_p_id, value_q_id, commons_session, commons_csrf_token, sleeptime)
    #print(json.dumps(data, indent=2))
    print_api_status('Copyrighted claim:', data)
    print()
    print('--------------------------')
    print()

print('done')