In [47]:
# Description of bots on Commons: https://commons.wikimedia.org/wiki/Commons:Bots
# See guidelines for operating a bot in Commons: https://commons.wikimedia.org/wiki/Commons:Bots/Requests
# Need to decide whether this applies if non autonomous. It probably does.
# The bot flag is an indication of community trust: it confirms that the bot's edits are not likely to need manual checking.
# It also prevents the new images/recent changes lists from getting swamped.

# ----------------
# Common code
# ----------------

# This section contains import statements, configuration values, and function definitions.
# It should be run before running other cells

import json
import requests
import csv
from pathlib import Path
from time import sleep
import sys

sparqlSleep = 0.25 # delay time between calls to SPARQL endpoint
commonsSleep = 5 # non-critical edits to commons no faster than this. https://commons.wikimedia.org/wiki/Commons:Bots#Bot_speed

# -----------------------------------------------------------------
# function definitions

def retrieveCredentials(path):
    """Read API credentials from a plain-text file of ``key=value`` lines.

    The first four lines are read by position (the key names themselves are
    not checked): endpoint URL, username, password, User-Agent header.

    Each line is split on its FIRST '=' only, so a value that itself contains
    '=' (for example a generated bot password or a User-Agent string) is
    returned whole instead of being cut off at the second '='.

    Parameters
    ----------
    path : str or path-like
        Location of the credentials file.

    Returns
    -------
    list of str
        [endpointUrl, username, password, userAgent]

    Raises
    ------
    IndexError
        If the file has fewer than four lines or one of them has no '='.
    """
    with open(path, 'rt') as fileObject:
        lineList = fileObject.read().split('\n')
    # maxsplit=1 keeps everything after the first '=' as the value
    endpointUrl, username, password, userAgent = (
        lineList[index].split('=', 1)[1] for index in range(4)
    )
    credentials = [endpointUrl, username, password, userAgent]
    return credentials

def getLoginToken(apiUrl):
    """Request a login token from the API at apiUrl using the module-level session.

    Returns the value found at query -> tokens -> logintoken in the JSON response.
    """
    query = dict(action='query', meta='tokens', type='login', format='json')
    response = session.get(url=apiUrl, params=query)
    return response.json()['query']['tokens']['logintoken']

def logIn(apiUrl, token, username, password):
    """Post an action=login request through the module-level session.

    token is the login token obtained beforehand; username and password are
    sent as lgname and lgpassword. Returns the decoded JSON response unchanged.
    """
    payload = dict(
        action='login',
        lgname=username,
        lgpassword=password,
        lgtoken=token,
        format='json',
    )
    response = session.post(apiUrl, data=payload)
    return response.json()

def getCsrfToken(apiUrl):
    """Request the tokens for the module-level session and return the CSRF token.

    Returns the value found at query -> tokens -> csrftoken in the JSON response.
    """
    query = dict(action='query', meta='tokens', format='json')
    response = session.get(url=apiUrl, params=query)
    return response.json()['query']['tokens']['csrftoken']

# read a CSV into a list of dictionaries
def readDict(filename):
    """Read a UTF-8 CSV file into a list of dictionaries, one per data row.

    The first row of the file supplies the dictionary keys. The file is opened
    in a ``with`` block so the handle is closed even if parsing raises.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    list of dict
        One dictionary per row; empty when the file has no data rows.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as fileObject:
        # materialize the rows before the file is closed
        return list(csv.DictReader(fileObject))

# function to get local name from an IRI
def extractFromIri(iri, numberPieces):
    """Return one '/'-separated piece of an IRI, selected by index.

    Example: 'http://www.wikidata.org/entity/Q6386232' splits into 5 pieces,
    and index 4 is the Q ID.
    """
    return iri.split('/')[numberPieces]

# This function attempts to post and handles maxlag errors
def attemptPost(apiUrl, parameters):
    """Post to the API, backing off and retrying while the server reports maxlag.

    The request is sent through the module-level ``session``. The raw response
    text is printed on every attempt.

    Parameters
    ----------
    apiUrl : str
        URL the form data is posted to.
    parameters : dict
        Form fields sent as the body of the POST.

    Returns
    -------
    The decoded JSON response whenever it is anything other than a maxlag
    error. That covers both successful responses and responses carrying a
    different error code, so the caller must inspect the result.

    Raises
    ------
    SystemExit
        If the server still reports maxlag after maxRetries retries.
    """
    maxRetries = 10
    baseDelay = 5 # Wikidata recommends a delay of at least 5 seconds
    delayLimit = 300
    retry = 0
    # maximum number of times to retry lagged server = maxRetries
    while retry <= maxRetries:
        if retry > 0:
            print('retry:', retry)
        r = session.post(apiUrl, data = parameters)
        print(r.text)
        data = r.json()

        # Check the response explicitly rather than with a bare except: a bare
        # except wrapped around the sleep below would also swallow Ctrl-C and
        # return the maxlag error as if it were the final result.
        # see https://www.mediawiki.org/wiki/Manual:Maxlag_parameter
        error = data.get('error') if isinstance(data, dict) else None
        if not isinstance(error, dict) or error.get('code') != 'maxlag':
            # either no error at all (success) or an error that isn't maxlag
            return data

        print('Lag of ', error.get('lag'), ' seconds.')
        # The Retry-After header is basically useless as a recommendation, so
        # double the delay with each retry instead, capped at delayLimit.
        recommendedDelay = min(baseDelay*2**retry, delayLimit)
        if retry != maxRetries:
            # no point in waiting after the final attempt
            print('Waiting ', recommendedDelay , ' seconds.')
            print()
            sleep(recommendedDelay)
        retry += 1

    # here's where execution goes after maxRetries tries
    print('Failed after ' + str(maxRetries) + ' retries.')
    # sys.exit() rather than the interactive-only exit() helper: just abort the script
    sys.exit()

{
  "batchcomplete": "",
  "query": {
    "normalized": [
      {
        "from": "File:Masonry_patterns_in_doorway_Tetouan_Morocco.jpg",
        "to": "File:Masonry patterns in doorway Tetouan Morocco.jpg"
      }
    ],
    "pages": {
      "89481185": {
        "pageid": 89481185,
        "ns": 6,
        "title": "File:Masonry patterns in doorway Tetouan Morocco.jpg",
        "categories": [
          {
            "ns": 14,
            "title": "Category:Architecture"
          },
          {
            "ns": 14,
            "title": "Category:CC-BY-4.0"
          },
          {
            "ns": 14,
            "title": "Category:Morocco"
          },
          {
            "ns": 14,
            "title": "Category:Pages with local camera coordinates and missing SDC coordinates"
          },
          {
            "ns": 14,
            "title": "Category:Pages with maps"
          },
          {
            "ns": 14,
            "title": "Category:Self-published work"
         

In [48]:
# ---------------------------
# Commons API Post Authentication
# ---------------------------

# This section needs to be run prior to running any code that interacts with the Commons API
# It generates the CSRF token required to post to the API on behalf of the user whose username and pwd are being used

# This is the format of the commons_credentials.txt file (the name set in credentialsFilename below).
# Username and password are for a bot that you've created.  Save file in your home directory.
# Set your own User-Agent header. Do not use the one listed here
# See https://meta.wikimedia.org/wiki/User-Agent_policy
'''
endpointUrl=https://test.wikidata.org
username=User@bot
password=465jli90dslhgoiuhsaoi9s0sj5ki3lo
userAgentHeader=YourBot/0.1 (someuser@university.edu)
'''

# default API resource URL when a Wikibase/Wikidata instance is installed.
resourceUrl = '/w/api.php'

home = str(Path.home()) # gets path to home directory; supposed to work for Win and Mac
credentialsFilename = 'commons_credentials.txt'
credentialsPath = home + '/' + credentialsFilename
credentials = retrieveCredentials(credentialsPath)
endpointUrl = credentials[0] + resourceUrl
user = credentials[1]
pwd = credentials[2]
userAgentHeader = credentials[3]

# Instantiate session outside of any function so that it's globally accessible.
session = requests.Session()
# Set default User-Agent header so you don't have to send it with every request
session.headers.update({'User-Agent': userAgentHeader})


loginToken = getLoginToken(endpointUrl)
data = logIn(endpointUrl, loginToken, user, pwd)
# Stop here if the login was rejected. Otherwise the CSRF token fetched next
# would belong to a logged-out session and every later write would fail with
# a much less obvious error.
if data.get('login', {}).get('result') != 'Success':
    raise RuntimeError('Login failed: ' + json.dumps(data))
csrfToken = getCsrfToken(endpointUrl)

# Set the value of the maxlag parameter to back off when the server is lagged
# see https://www.mediawiki.org/wiki/Manual:Maxlag_parameter
# The recommended value is 5 seconds.
# To not use maxlag, set the value to 0
# To test the maxlag handler code, set maxlag to a very low number like .1
# NOTE(review): the post parameters built in the cells visible here do not include
# maxlag, so this value only takes effect if it is added to them.
maxlag = 5

In [50]:
# Generic Commons API reference: https://commons.wikimedia.org/w/api.php
# API Upload example: https://www.mediawiki.org/wiki/API:Upload#POST_request
# API Sandbox can be used to generate test JSON, but DO NOT RUN since it actually uploads.
# Specifically for uploads, see https://commons.wikimedia.org/wiki/Special:ApiSandbox#action=upload&filename=Wiki.png&url=http%3A//upload.wikimedia.org/wikipedia/en/b/bc/Wiki.png&token=123ABC
# General Sandbox index: https://commons.wikimedia.org/wiki/Special:ApiSandbox#action=query&format=json&prop=categories%7Cimageinfo&titles

# Need to figure out how to add "summary". That might prevent the blank "Description" box on the
# contributions page for the user, since adding description later doesn't do it.
# See https://commons.wikimedia.org/wiki/Commons:First_steps/Quality_and_description#Upload_summary
# The action=edit API help says 'summary' is used for edit summary, but when I use it, I get the warning:
# "warnings": {"main": {"*": "Unrecognized parameter: summary."}}
# Use 'comment' value in the post parameters during upload 

# The 'text' parameter value provides the required file information that shows up in new sections.
# Guidelines for providing information using the Information template: https://commons.wikimedia.org/wiki/Template:Information
# Note special template for artwork having more extensive metadata: https://commons.wikimedia.org/wiki/Template:Artwork

# Set image_filename to the name to be used for the image once it has been uploaded.
# This will also be used to generate the image page title, which will be the same, but with
# spaces replacing the underscores.
# For file naming conventions, see: https://commons.wikimedia.org/wiki/Commons:File_naming

# Name the file will have on Commons. The same name is appended to the local
# upload directory later in this cell to locate the file on disk.
image_filename = 'Company_seal_ballard_and_ballard_obelisk_flour_building_memphis.jpg'
# Caption text; also reused below as the description inside the Artwork template.
caption = 'Company seal in obelisk ornament on Balard and Ballard Company Obelisk Flour Building, Memphis, Tennessee.'
# Language code for the caption.
# NOTE(review): the description template in 'text' below hard-codes {{en|...}} rather than using this value.
language = 'en'
description = caption # The description doesn't have to be the same as the caption and is less restricted.

# Form fields for the action=upload request. 'text' is the wikitext of the new
# file page (Artwork template, camera location, license) and 'comment' is the
# upload summary.
parameters = {
    'action': 'upload',
    'filename': image_filename,
    'format': 'json',
    'token': csrfToken,
    # NOTE(review): presumably tells the API to proceed even when it would issue
    # upload warnings (e.g. an existing or duplicate file) -- confirm this is intended.
    'ignorewarnings': 1,
    'text': '''=={{int:filedesc}}==
{{Artwork
 |artist             = {{unknown|artist}}
 |title              =
 |description        = {{en|1=''' + description + '''}}
 |date               = {{other date|between|1923|1924}}
 |medium             = concrete
 |dimensions         =
 |institution        =
 |department         =
 |place of discovery =
 |object history     =
 |exhibition history =
 |credit line        =
 |inscriptions       = {{inscription |1=BALLARDS OBELISK LOUISVILLE |description=company seal}}
 |notes              = 
 |accession number   =
 |place of creation  =
 |source             = [[User:Baskaufs|Steven J. Baskauf]]
 |permission         =
 |other_versions     =
 |references         = Building description in [https://sah-archipedia.org/buildings/TN-01-157-0065 Society of Architectural Historians Archipedia]
 |depicted place     =
 |wikidata           =
}}

{{Location|35.13832222222222|-90.05915}}

=={{int:license-header}}==
{{self|cc-by-4.0}}''',
    'comment': 'Uploaded own work via API'
}

# Here's what I used for a basic Description template (not the Artwork template):

# parameters = {
#    'action': 'edit',
#    'format': 'json',
#    'token': csrfToken,
#    'title': 'File:San xavier del bac door handle.jpg',
#    'sectiontitle': 'Licensing and Summary',
#    'text': '''=={{int:filedesc}}==
#{{Information
#|description={{en|1=Door handle shaped like a snake at San Xavier del Bac mission, Arizona.}}
#|date=2016-05-25 11:31:08
#|source={{own}}
#|author=[[User:Baskaufs|Steven J. Baskauf]]
#|permission=
#|other versions=
#}}
#{{Location|32.106994444444446|-111.00788055555556}}

# Local directory (relative to the notebook's working directory) holding the image to upload
directory_path = 'Downloads/'
file_path = directory_path + image_filename
#print(parameters)

# Open the image in a context manager so the file handle is closed once the
# upload request has finished, even if the post raises.
with open(file_path, 'rb') as image_file:
    file_dict = {'file':(image_filename, image_file, 'multipart/form-data')}
    #print(file_dict)
    response = session.post(endpointUrl, files=file_dict, data = parameters)
data = response.json()
print(json.dumps(data, indent=2))

# for non-critical applications, do not hit the API rapidly
sleep(commonsSleep)

# Most page info can be set when the page is created as above.
# To edit an existing page, the edit action must be used
# API information on edit action: https://commons.wikimedia.org/w/api.php?action=help&modules=edit

{
  "upload": {
    "result": "Success",
    "filename": "Company_seal_ballard_and_ballard_obelisk_flour_building_memphis.jpg",
    "imageinfo": {
      "timestamp": "2020-04-27T01:38:13Z",
      "user": "Baskaufs",
      "userid": 210439,
      "size": 1210501,
      "width": 2448,
      "height": 2448,
      "parsedcomment": "Uploaded own work via API",
      "comment": "Uploaded own work via API",
      "canonicaltitle": "File:Company seal ballard and ballard obelisk flour building memphis.jpg",
      "url": "https://upload.wikimedia.org/wikipedia/commons/8/84/Company_seal_ballard_and_ballard_obelisk_flour_building_memphis.jpg",
      "descriptionurl": "https://commons.wikimedia.org/wiki/File:Company_seal_ballard_and_ballard_obelisk_flour_building_memphis.jpg",
      "sha1": "5544158b0e47caf73d19d79ec65dbf8eb89cee4d",
      "metadata": [
        {
          "name": "Make",
          "value": "Apple"
        },
        {
          "name": "Model",
          "value": "iPhone 5s"
     

In [60]:
# ----------------
# Set the image caption using wbsetlabel
# ----------------

# Adding the image caption seems to be a hack that uses the Wikibase API command wbsetlabel.
# Captions are Wikibase labels (language specific), limit 255 characters length.
# See https://commons.wikimedia.org/wiki/Commons:File_captions#Technical

# Form fields for the wbsetlabel request: the file page is identified by
# site + title, and the caption is written as the label for the chosen language.
parameters = dict(
    action='wbsetlabel',
    format='json',
    token=csrfToken,
    site='commonswiki',
    title='File:' + image_filename,
    value=caption,
    language=language,
    summary='Add caption via API',
)

#print(json.dumps(parameters, indent = 2))

# Post through the logged-in session and show the decoded reply
response = session.post(endpointUrl, data=parameters)
data = response.json()
print(json.dumps(data, indent=2))

# pause before the next write to the API
sleep(commonsSleep)


{
  "action": "wbsetlabel",
  "format": "json",
  "token": "0c43eb738048e512b4161a13b63cbc7b5ea63751+\\",
  "site": "commonswiki",
  "title": "File:company_seal_ballard_and_ballard_obelisk_flour_building_memphis.jpg",
  "value": "Company seal in obelisk ornament on Balard and Ballard Company Obelisk Flour Building, Memphis, Tennessee.",
  "language": "en",
  "summary": "Add caption via API"
}
{
  "entity": {
    "labels": {
      "en": {
        "language": "en",
        "value": "Company seal in obelisk ornament on Balard and Ballard Company Obelisk Flour Building, Memphis, Tennessee."
      }
    },
    "id": "M89518466",
    "type": "mediainfo",
    "lastrevid": 415221224
  },
  "success": 1
}


In [67]:
# -------------------
# Retrieving image data from the Commons MediaWiki API
# -------------------

# Initially, I thought that it was necessary to know the Wikibase entity ID to set the caption.
# So I wrote code to extract that from a query. However, one can use the page title, so that
# isn't actually necessary. But it might be needed anyway for the structured data part.

# The Wikibase entity ID can be used in lieu of the site+page name.
# The format is "M" plus the page ID. So page ID 41837276 has the entity ID M41837276
# Use action=query&prop=info&titles=File:Pluto-01_Stern_03_Pluto_Color_TXT.jpg and 
# extract the pageid field from result.

# Commons API examples: https://commons.wikimedia.org/wiki/Commons:API/MediaWiki
# sandbox: https://commons.wikimedia.org/wiki/Special:ApiSandbox#action=query&format=json&prop=categories%7Cimageinfo&titles=File%3AMasonry_patterns_in_doorway_Tetouan_Morocco.jpg
apiUrl = 'https://commons.wikimedia.org/w/api.php'

'''
# get photos by a user
params = {
    'action': 'query',
    'format': 'json',
    'list': 'allimages',
    'aiuser': 'Baskaufs',
    'aisort': 'timestamp'
}
'''

'''
# get information about a photo
params = {
    'action': 'query',
    'format': 'json',
    'titles': 'File:Masonry_patterns_in_doorway_Tetouan_Morocco.jpg',
    'prop': 'categories|imageinfo'
}
'''

'''
# get raw metadata embedded in a photo
params = {
    'action': 'query',
    'format': 'json',
    'titles': 'File:Masonry_patterns_in_doorway_Tetouan_Morocco.jpg',
    'prop': 'imageinfo',
    'iiprop': 'metadata',
    'iimetadataversion': 'latest'
}
'''

'''
# get metadata for a photo including from file page
params = {
    'action': 'query',
    'format': 'json',
    'titles': 'File:Masonry_patterns_in_doorway_Tetouan_Morocco.jpg',
    'prop': 'imageinfo',
    'iiprop': 'extmetadata'
}
'''

# get basic page info (including the page ID) for the uploaded file
params = {
    'action': 'query',
    'format': 'json',
    'titles': 'File:' + image_filename,
    'prop': 'info'
}

# This read-only query does not go through the logged-in session, so pass the
# User-Agent header explicitly; see https://meta.wikimedia.org/wiki/User-Agent_policy
response = requests.get(apiUrl, params=params, headers={'User-Agent': userAgentHeader})
data = response.json()
#print(json.dumps(data, indent=2))
page_dict = data['query']['pages'] # this value is a dict that has the page IDs as keys
page_id_list = list(page_dict.keys()) # the result of the .keys() method is a "dict_keys" object, so coerce to a list
page_id = page_id_list[0] # info on only one page was requested, so get item 0
# A title that does not exist is reported under a placeholder key and flagged
# 'missing' (or 'invalid'). Stop rather than building an entity ID from it.
if 'missing' in page_dict[page_id] or 'invalid' in page_dict[page_id]:
    raise RuntimeError('No page found for ' + params['titles'] + ': ' + json.dumps(page_dict[page_id]))
print('Page ID:',page_id)

Page ID: 89518466


In [70]:
# ----------------
# Add structured data
# ----------------

# Intro on structured data: https://commons.wikimedia.org/wiki/Commons:Structured_data
# See also this on GLAM https://commons.wikimedia.org/wiki/Commons:Structured_data/GLAM

# Code comes from writeStatement() function at https://github.com/HeardLibrary/digital-scholarship/blob/master/code/wikibase/api/load_csv.py
# Described in this blog post: http://baskauf.blogspot.com/2019/06/putting-data-into-wikidata-using.html

# The entity ID of a file's structured data is "M" followed by its page ID
wikibase_subject_id = 'M' + str(page_id)
property_p_id = 'P180' # depicts
value_q_id = 'Q384177' # Egyptian Revival (architecture)

stripped_q_number = value_q_id[1:] # remove initial "Q" from object string
value_dictionary = {
    'entity-type': 'item',
    # send the numeric ID as a JSON number, the form the API echoes back in its response
    'numeric-id': int(stripped_q_number)
}
value_json_string = json.dumps(value_dictionary)

# Form fields for the wbcreateclaim request that adds the statement to the file's entity
parameters = {
    'action':'wbcreateclaim',
    'format':'json',
    'token': csrfToken,
    'entity': wikibase_subject_id,
    'snaktype':'value',
    'property': property_p_id,
    # note: the value is a JSON string, not an actual data structure.  I think it will get URL encoded by requests before posting
    'value': value_json_string,
    'summary': 'Add depicts value structured data via API'
}

#print(json.dumps(parameters, indent = 2))
response = session.post(endpointUrl, data = parameters)
data = response.json()
print(json.dumps(data, indent=2))

# pause before the next write to the API
sleep(commonsSleep)


{
  "action": "wbcreateclaim",
  "format": "json",
  "token": "0c43eb738048e512b4161a13b63cbc7b5ea63751+\\",
  "entity": "M89518466",
  "snaktype": "value",
  "property": "P180",
  "value": "{\"entity-type\": \"item\", \"numeric-id\": \"384177\"}"
}
{
  "pageinfo": {
    "lastrevid": 415222179
  },
  "success": 1,
  "claim": {
    "mainsnak": {
      "snaktype": "value",
      "property": "P180",
      "hash": "0c4f254ee789fc7cbe7e0cee8d4f3680ff7a7559",
      "datavalue": {
        "value": {
          "entity-type": "item",
          "numeric-id": 384177,
          "id": "Q384177"
        },
        "type": "wikibase-entityid"
      },
      "datatype": "wikibase-item"
    },
    "type": "statement",
    "id": "M89518466$E97FF702-B2A5-4656-AB8F-E4DEC359A2A8",
    "rank": "normal"
  }
}


In [None]:
# Look at https://commons.wikimedia.org/wiki/File:USS_Arizona_afloat_after_launch_NARA_19-LC-19A-24.tif
# to see how they linked to their collection in Wikidata and also how they did the Record ID
# They seem to be using their own NARA template

# Categorization: according to https://commons.wikimedia.org/wiki/Commons:Bots#Bot_accounts all uploads are expected to apply at least one category



# Linking
# Wikilinks from the Commons: https://en.wikipedia.org/wiki/Wikipedia:Wikilinks_from_the_Commons
