In [214]:
# Description of bots on Commons: https://commons.wikimedia.org/wiki/Commons:Bots
# See guidelines for operating a bot in Commons: https://commons.wikimedia.org/wiki/Commons:Bots/Requests
# Need to decide whether this applies if non autonomous. It probably does.
# Bot flag prevents new images/recent changes lists from getting swamped.
# It's also an indication of community trust; it confirms that edits are not likely to need manual checking

# Generic Commons API reference: https://commons.wikimedia.org/w/api.php



# ----------------
# Common code
# ----------------

# This section contains import statements and function definitions.
# It should be run before running other sections of the code

import csv
import json
import re # regex
import sys
import urllib.parse # URL-encoding/decoding used by the Commons URL conversion functions
from pathlib import Path
from time import sleep

import requests
from bs4 import BeautifulSoup # web-scraping library, use PIP to install beautifulsoup4 (included in Anaconda)

# Bot version; it is embedded in the User-Agent header sent with API requests
version = '0.2'
# User-Agent string identifying this bot, where its code lives, and a contact address
user_agent_string = 'BaskaufCommonsBot/{} (https://github.com/HeardLibrary/linked-data/tree/master/commonsbot; mailto:steve.baskauf@vanderbilt.edu)'.format(version)

# ------------------------
# function definitions
# ------------------------

# Utility functions

# function to get local name from an IRI
def extract_localname(iri):
    """Return the part of an IRI that follows its final '/'.

    Example: 'http://www.wikidata.org/entity/Q6386232' gives 'Q6386232'.
    A string without any '/' is returned unchanged.
    """
    # Only the last separator matters, so split once from the right
    return iri.rsplit('/', 1)[-1]

# read from a CSV file into a list of dictionaries
def read_dict(filename):
    """Load a UTF-8 CSV file and return its rows as a list of dictionaries.

    The first row of the file supplies the keys (csv.DictReader default).
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        # Materialize the reader while the file is still open
        return list(csv.DictReader(csv_file))

# write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write a list of dictionaries to a UTF-8 CSV file.

    table: iterable of dictionaries, one per output row
    filename: path of the file to create (an existing file is overwritten)
    fieldnames: column names; written as the header row and used to order the columns
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(table)
            
# Commons identifier/URL conversion functions
# There are three identifiers used in Commons:

# The most basic one is the filename, unencoded and with file extension.

# The Commons web page URL is formed from the filename by prepending a subpath and "File:", replacing spaces in the filename with _, and URL-encoding the file name string
# The reverse process may be lossy because it assumes that underscores should be turned into spaces and the filename might actually contain underscores.

# The Wikidata IRI identifier for the image is formed from the filename by URL-encoding it and prepending a subpath and "Special:FilePath/"
# The reverse process is lossless since it simply reverse URL-encodes the local name part of the IRI.

# Prefix of the Special:FilePath IRI form and of the File: web page URL form
commons_prefix = 'http://commons.wikimedia.org/wiki/Special:FilePath/'
commons_page_prefix = 'https://commons.wikimedia.org/wiki/File:'

def commons_url_to_filename(url):
    """Convert a Special:FilePath IRI to the unencoded file name.

    form of URL is: http://commons.wikimedia.org/wiki/Special:FilePath/Castle%20De%20Haar%20%281892-1913%29%20-%20360%C2%B0%20Panorama%20of%20Castle%20%26%20Castle%20Grounds.jpg

    Raises IndexError if url does not contain commons_prefix.
    """
    # Split only on the first occurrence of the prefix so the rest of the local name stays intact
    local_name = url.split(commons_prefix, 1)[1]
    return urllib.parse.unquote(local_name) # reverse URL-encode the string

def filename_to_commons_url(filename):
    """Convert an unencoded file name to its Special:FilePath IRI by URL-encoding it."""
    return commons_prefix + urllib.parse.quote(filename)

def commons_page_url_to_filename(url):
    """Convert a Commons File: page URL to the unencoded file name.

    form of URL is: https://commons.wikimedia.org/wiki/File:Castle_De_Haar_(1892-1913)_-_360%C2%B0_Panorama_of_Castle_%26_Castle_Grounds.jpg

    Every literal underscore is turned into a space, so a file name that really
    contains underscores is not recovered exactly.
    Raises IndexError if url does not contain commons_page_prefix.
    """
    local_name = url.split(commons_page_prefix, 1)[1]
    # Underscores are replaced before decoding, so percent-encoded characters are untouched by the replacement
    local_name = local_name.replace('_', ' ')
    return urllib.parse.unquote(local_name)

def filename_to_commons_page_url(filename):
    """Convert an unencoded file name to its Commons File: page URL.

    Spaces become underscores and the name is URL-encoded, except that
    parentheses and commas are left unencoded.
    """
    underscored = filename.replace(' ', '_')
    # Keep '(', ')' and ',' literal during encoding instead of decoding them again afterwards
    return commons_page_prefix + urllib.parse.quote(underscored, safe='/(),')


# Authentication functions

def login(path, relative_to_home):
    """Log in to a Wikimedia API and return what is needed to post to it.

    path: location of the credentials file
    relative_to_home: if true, path is taken relative to the user's home directory;
        otherwise it is used as given
    Returns a dictionary with keys 'session' (a requests session carrying the
    User-Agent header), 'csrftoken', and 'endpoint' (the API URL).
    Raises RuntimeError if the API does not report a successful login. Otherwise
    the CSRF token fetched afterwards would not belong to the logged-in user.
    """
    if relative_to_home:
        home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
        full_credentials_path = home + '/' + path
    else:
        full_credentials_path = path
    credentials = retrieve_credentials(full_credentials_path)

    resource_url = '/w/api.php' # default API resource URL for all Wikimedia APIs
    endpoint_url = credentials['url'] + resource_url

    # Instantiate session
    session = requests.Session()
    # Set default User-Agent header so you don't have to send it with every request
    session.headers.update({'User-Agent': user_agent_string})

    # Go through the sequence of steps needed to get the CSRF token
    login_token = get_login_token(endpoint_url, session)
    login_response = session_login(endpoint_url, login_token, credentials['username'], credentials['password'], session)

    # Stop here on a failed login rather than continuing with an unauthenticated session
    login_result = login_response.get('login', {})
    if login_result.get('result') != 'Success':
        raise RuntimeError('Login to ' + endpoint_url + ' failed: ' + str(login_result.get('reason', login_response)))

    csrf_token = get_csrf_token(endpoint_url, session)
    return {'session': session, 'csrftoken': csrf_token, 'endpoint': endpoint_url}

def retrieve_credentials(path):
    """Read the API URL, username and password from a credentials file.

    The first three lines of the file must have the form key=value, in the order
    endpoint URL, username, password. Only the text after the first '=' on each
    line is used, so values that themselves contain '=' are kept whole.
    Returns a dictionary with keys 'url', 'username' and 'password'.
    """
    with open(path, 'rt') as file_object:
        line_list = file_object.read().split('\n')
    # maxsplit=1: split at the first '=' only, so a password such as 'ab=cd' is not truncated
    endpoint_url = line_list[0].split('=', 1)[1]
    username = line_list[1].split('=', 1)[1]
    password = line_list[2].split('=', 1)[1]
    #user_agent = line_list[3].split('=', 1)[1]
    credentials = {'url': endpoint_url, 'username': username, 'password': password}
    return credentials

def get_login_token(apiUrl, session):
    """Ask the API at apiUrl for a login token, using the given session, and return it."""
    token_query = dict(action='query', meta='tokens', type='login', format='json')
    reply = session.get(url=apiUrl, params=token_query)
    return reply.json()['query']['tokens']['logintoken']

def session_login(apiUrl, token, username, password, session):
    """Post a login request for the session and return the parsed JSON reply.

    token is the login token obtained beforehand; username and password are the
    credentials to log in with.
    """
    login_form = dict(
        action='login',
        lgname=username,
        lgpassword=password,
        lgtoken=token,
        format='json',
    )
    return session.post(apiUrl, data=login_form).json()

def get_csrf_token(apiUrl, session):
    """Ask the API at apiUrl for a CSRF token, using the given session, and return it."""
    token_query = dict(action="query", meta="tokens", format="json")
    reply = session.get(url=apiUrl, params=token_query)
    return reply.json()["query"]["tokens"]["csrftoken"]

# Data upload functions

# API file Upload example: https://www.mediawiki.org/wiki/API:Upload#POST_request
# API Sandbox can be used to generate test JSON, but DO NOT RUN since it actually uploads.
# Specifically for uploads, see https://commons.wikimedia.org/wiki/Special:ApiSandbox#action=upload&filename=Wiki.png&url=http%3A//upload.wikimedia.org/wikipedia/en/b/bc/Wiki.png&token=123ABC
def upload_file_to_commons(image_filename, directory_path, relative_to_home, session, csrftoken, sleeptime, wikitext):
    """Upload a local media file to Commons and return the parsed JSON reply.

    image_filename: name of the local file; also used as the file name on Commons
    directory_path: directory holding the file, with trailing forward slash
    relative_to_home: if true, directory_path is taken relative to the user's home directory
    session: logged-in session used to post the request
    csrftoken: CSRF token belonging to that session
    sleeptime: seconds to pause after the upload before returning
    wikitext: text for the new file page (sent as the 'text' parameter)
    """
    if relative_to_home:
        home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
        directory_path = home + '/' + directory_path

    parameters = {
        'action': 'upload',
        'filename': image_filename,
        'format': 'json',
        'token': csrftoken,
        'ignorewarnings': 1,
        'text': wikitext,
        # this is what generates the text in the Description box on user Uploads page and initial edit summary for page
        # See https://commons.wikimedia.org/wiki/Commons:First_steps/Quality_and_description#Upload_summary
        'comment': 'Uploaded media file and metadata via API'
    }
    file_path = directory_path + image_filename

    # Open the file in a with-block so the handle is closed once the request has finished,
    # including when the post or the JSON parsing raises
    with open(file_path, 'rb') as file_object:
        file_dict = {'file':(image_filename, file_object, 'multipart/form-data')}
        response = session.post('https://commons.wikimedia.org/w/api.php', files=file_dict, data = parameters)
        data = response.json()
    #print(json.dumps(data, indent=2))

    # for non-critical applications, do not hit the API rapidly
    sleep(sleeptime)
    return(data)

# Adding the image caption seems to be a hack that uses the Wikibase API command wbsetlabel.
# Captions are Wikibase labels (language specific), limit 255 characters length.
# See https://commons.wikimedia.org/wiki/Commons:File_captions#Technical
def set_commons_image_caption(image_filename, caption, caption_language, session, csrftoken, sleeptime):
    """Set the caption of a Commons file in one language and return the parsed JSON reply.

    The caption is sent with the wbsetlabel action for the page 'File:' + image_filename.
    session and csrftoken come from a prior login; sleeptime is the number of
    seconds to pause after the request before returning.
    """
    label_request = dict(
        action='wbsetlabel',
        format='json',
        token=csrftoken,
        site='commonswiki',
        title='File:' + image_filename,
        value=caption,
        language=caption_language,
        summary='Add caption via API',
    )
    api_reply = session.post('https://commons.wikimedia.org/w/api.php', data = label_request)
    result = api_reply.json()

    # pause after the write so successive edits are spaced out
    sleep(sleeptime)
    return result

# This function is used in the following function, which needs a page ID rather than a name
def get_commons_image_pageid(image_filename):
    """Look up the Commons page ID of the page 'File:' + image_filename.

    Returns the page ID as a string (it is a key of the 'pages' object in the reply).
    NOTE(review): the result is not checked for a missing page, so callers get
    whatever key the API returns for an unknown title. Confirm whether that needs handling.
    """
    # get metadata for a photo including from file page
    params = {
        'action': 'query',
        'format': 'json',
        'titles': 'File:' + image_filename,
        'prop': 'info'
    }

    # This read does not go through the logged-in session, so send the bot's User-Agent
    # header explicitly instead of the requests library default
    response = requests.get(
        'https://commons.wikimedia.org/w/api.php',
        params=params,
        headers={'User-Agent': user_agent_string}
    )
    data = response.json()
    #print(json.dumps(data, indent=2))
    page_dict = data['query']['pages'] # this value is a dict that has the page IDs as keys
    page_id = next(iter(page_dict)) # info on only one page was requested, so take the first key
    #print('Page ID:',page_id)

    # Don't think I need to add a sleep time for API reads, which are less resource-intensive
    # than write operations
    return page_id

# Code comes from writeStatement() function at https://github.com/HeardLibrary/digital-scholarship/blob/master/code/wikibase/api/load_csv.py
# Described in this blog post: http://baskauf.blogspot.com/2019/06/putting-data-into-wikidata-using.html
def create_commons_claim(image_filename, property_p_id, value_q_id, session, csrftoken, sleeptime):
    """Add an item-valued statement to the structured data of a Commons file.

    image_filename: file name on Commons; its page ID is looked up to form the entity ID
    property_p_id: property ID, e.g. 'P180' (depicts)
    value_q_id: Q ID of the item that is the value, e.g. 'Q384177'
    session, csrftoken: from a prior login
    sleeptime: seconds to pause after the request before returning
    Returns the parsed JSON reply.
    """
    # The entity ID is "M" followed by the page ID of the file page
    mediainfo_id = 'M' + get_commons_image_pageid(image_filename)

    # Drop the leading "Q"; the remaining digits are sent as a string
    numeric_part = value_q_id[1:]
    # note: the value of 'value' is a JSON string, not an actual data structure.
    # It will get URL encoded by requests before posting
    claim_value = json.dumps({'entity-type': 'item', 'numeric-id': numeric_part})

    claim_request = dict(
        action='wbcreateclaim',
        format='json',
        token=csrftoken,
        entity=mediainfo_id,
        snaktype='value',
        property=property_p_id,
        value=claim_value,
        summary='Add structured data via API',
    )

    api_reply = session.post('https://commons.wikimedia.org/w/api.php', data = claim_request)
    result = api_reply.json()

    # pause after the write so successive edits are spaced out
    sleep(sleeptime)
    return result

# ---------------
# Not used yet
# ---------------

# This function attempts to post and handles maxlag errors
def attemptPost(apiUrl, parameters, http_session=None):
    """Post to the API, backing off and retrying while the server reports a maxlag error.

    apiUrl: URL to post to
    parameters: form data for the request
    http_session: optional session object to post with. When omitted, the
        module-level name ``session`` is used, as in the original code, so that
        name must exist in that case.
    Returns the parsed JSON reply as soon as it is anything other than a maxlag
    error (success, or a different error). The raw reply text is printed for
    every attempt. If the server is still lagged after the last retry, a message
    is printed and exit() is called.
    """
    active_session = session if http_session is None else http_session

    maxRetries = 10
    baseDelay = 5 # Wikidata recommends a delay of at least 5 seconds
    delayLimit = 300
    # one initial attempt plus up to maxRetries retries of a lagged server
    for retry in range(maxRetries + 1):
        if retry > 0:
            print('retry:', retry)
        r = active_session.post(apiUrl, data = parameters)
        print(r.text)
        data = r.json()

        # Inspect the reply explicitly rather than relying on an exception from a missing key;
        # see https://www.mediawiki.org/wiki/Manual:Maxlag_parameter
        error = data.get('error') if isinstance(data, dict) else None
        if not isinstance(error, dict) or error.get('code') != 'maxlag':
            # either the post succeeded or the error is not maxlag; the caller deals with it
            return data

        print('Lag of ', error.get('lag'), ' seconds.')
        # The Retry-After header is not used; double the delay with each retry, up to delayLimit
        recommendedDelay = min(baseDelay*2**retry, delayLimit)
        if retry != maxRetries:
            print('Waiting ', recommendedDelay , ' seconds.')
            print()
            sleep(recommendedDelay)

    # here's where execution goes after maxRetries retries
    print('Failed after ' + str(maxRetries) + ' retries.')
    exit() # just abort the script

In [217]:
# ---------------------------
# Body of main script
# ---------------------------

# This section contains configuration information and performs necessary logins
# It needs to be run once before the rest of the code
# No writing is done, so it's "safe" to run any time

# These are recommended delay times to avoid hitting the APIs too frequently and getting blocked
# NOTE(review): only commons_sleep is mentioned by the later cells (as the suggested sleeptime value);
# sparql_sleep and read_api_sleep are not referenced by any code visible in this notebook.
sparql_sleep = 0.25 # delay time between calls to Wikidata SPARQL endpoint, probably could be lower (like 0.1)
commons_sleep = 5 # non-critical edits to commons no faster than this. https://commons.wikimedia.org/wiki/Commons:Bots#Bot_speed
read_api_sleep = 0.1 # presumably the delay between read-only API calls - TODO confirm

'''
# Set the value of the maxlag parameter to back off when the server is lagged
# see https://www.mediawiki.org/wiki/Manual:Maxlag_parameter
# The recommended value is 5 seconds.
# To not use maxlang, set the value to 0
# To test the maxlag handler code, set maxlag to a very low number like .1

# NOTE: as of 2020-04-27, the function that needs maxlag isn't being used for anything, so this value doesn't matter
maxlag = 5
'''

# This section needs to be run prior to running any code that interacts with the Commons API
# It generates the CSRF token required to post to the API on behalf of the user whose username and pwd are being used

# This is the format of the credentials file. 
# Username and password are for a bot that you've created.
# Set your own User-Agent header. Do not use the one listed here
# See https://meta.wikimedia.org/wiki/User-Agent_policy
'''
endpointUrl=https://test.wikidata.org
username=User@bot
password=465jli90dslhgoiuhsaoi9s0sj5ki3lo
userAgentHeader=YourBot/0.1 (someuser@university.edu)
'''

# ---------------------------
# Commons API Post Authentication (create session and generate CSRF token)
# ---------------------------

# If credentials file location is relative to current working directory, use subfolders through file name with no leading slash
# Example: myproj/credentials/commons_credentials.txt
# If credentials file is in current working directory, only filename is necessary
# Need to give example for absolute path on Windows - use Unix forward slashes?
# Log in once and keep the session and CSRF token for the cells below
path = 'commons_credentials.txt'
path_is_relative_to_home_directory = True # set to True if relative to home directory, False if absolute path or relative to working directory
result = login(path, path_is_relative_to_home_directory)
# print(result)
commons_session = result['session'] # session passed to the upload/caption/claim functions
commons_csrf_token = result['csrftoken'] # CSRF token sent with each write request
# Commons API endpoint URL is in result['endpoint'], but it is going to be hard coded anyway, so ignore

print('done')

done


In [22]:
# Set image_filename to the name to be used for the image once it has been uploaded.
# This will also be used to generate the image page title, which will be the same, but with
# spaces replacing the underscores.
# For file naming conventions, see: https://commons.wikimedia.org/wiki/Commons:File_naming

# File name to use on Commons; the upload call below also uses it as the name of the local file
image_filename = 'Madrassa_Ben_Youssef_alcove_ceiling.jpg'

# The caption has to be set in a separate operation from the file upload
# But it's set here so that its text can be used for the description
# Captions must be less than 255 characters.
# There can be multiple captions in different languages, but only one per language.
caption = 'Ceiling detail of alcove in Madrassa Ben Youssef, Marrakech, Morocco.'
caption_language = 'en'

# The description doesn't have to be the same as the caption.
# It can be much longer and contain Wiki formatting, such as links.
# As this code stands, however, it will be the same as the caption.
description = caption

# ---------------------------
# Upload a local file to Commons and set basic metadata
# ---------------------------

# The 'text' parameter value provides the required file information that shows up in new sections.
# Guidelines for providing information using the Information template: https://commons.wikimedia.org/wiki/Template:Information
# Note special template for artwork having more extensive metadata: https://commons.wikimedia.org/wiki/Template:Artwork

# Wiki text based on Artwork template:
# NOTE: this text is assigned but is not what the upload call in this cell sends;
# that call passes page_wikitext. The description variable is inserted by string concatenation.
artwork_description_wikitext = '''
=={{int:filedesc}}==
{{Artwork
 |artist             = {{unknown|artist}}
 |title              =
 |description        = {{en|1=''' + description + '''}}
 |date               = {{other date|between|1923|1924}}
 |medium             = concrete
 |dimensions         =
 |institution        =
 |department         =
 |place of discovery =
 |object history     =
 |exhibition history =
 |credit line        =
 |inscriptions       = {{inscription |1=BALLARDS OBELISK LOUISVILLE |description=company seal}}
 |notes              = 
 |accession number   =
 |place of creation  =
 |source             = [[User:Baskaufs|Steven J. Baskauf]]
 |permission         =
 |other_versions     =
 |references         = Building description in [https://sah-archipedia.org/buildings/TN-01-157-0065 Society of Architectural Historians Archipedia]
 |depicted place     =
 |wikidata           =
}}

{{Location|35.13832222222222|-7.986186666666667}}

=={{int:license-header}}==
{{self|cc-by-4.0}}
'''

# Here's what I used for a basic Description template:
#basic_description_wikitext = '''
# Wikitext for the new file page, based on the Information template.
# This is the text passed to upload_file_to_commons() below; the description
# variable is inserted by string concatenation.
page_wikitext = '''
=={{int:filedesc}}==
{{Information
 |description={{en|1=''' + description + '''}}
 |date=2015-06-11 08:08:00
 |source={{own}}
 |author=[[User:Baskaufs|Steven J. Baskauf]]
 |permission=
 |other versions=
}}

{{Location|31.632058333333333|-7.986186666666667}}

=={{int:license-header}}==
{{self|cc-by-4.0}}
'''

'''
parameters = {
    'action': 'upload',
    'filename': image_filename,
    'format': 'json',
    'token': commons_csrf_token,
    'ignorewarnings': 1,
    'text': page_wikitext,
    # this is what generates the text in the Description box on user Uploads page and initial edit summary for page
    # See https://commons.wikimedia.org/wiki/Commons:First_steps/Quality_and_description#Upload_summary
    'comment': 'Uploaded image file and metadata via API'
}
'''

#directory_path = 'Downloads/'
#file_path = directory_path + image_filename
#file_dict = {'file':(image_filename, open(file_path, 'rb'), 'multipart/form-data')}
#print(parameters)
#print(file_dict)

# path where image file is located with trailing forward slash
# relative to working directory or an absolute path if path_is_relative_to_home_directory = False
# relative to home directory if path_is_relative_to_home_directory = True
# Upload the local file and print the API reply.
# WARNING: running this performs a real upload using the logged-in session.
directory_path = 'Downloads/'
path_is_relative_to_home_directory = True # note: reuses (and overwrites) the name set in the login cell
sleeptime = 0 # use zero if running individual cells manually, use commons_sleep if code in single script or looping
data = upload_file_to_commons(image_filename, directory_path, path_is_relative_to_home_directory, commons_session, commons_csrf_token, sleeptime, page_wikitext)

#response = commons_session.post(endpointUrl, files=file_dict, data = parameters)
#data = response.json()
print(json.dumps(data, indent=2))

# Most page info can be set when the page is created as above.
# To edit an existing page, the edit action must be used
# API information on edit action: https://commons.wikimedia.org/w/api.php?action=help&modules=edit

{
  "upload": {
    "result": "Success",
    "filename": "Madrassa_Ben_Youssef_alcove_ceiling.jpg",
    "imageinfo": {
      "timestamp": "2020-04-27T16:07:55Z",
      "user": "Baskaufs",
      "userid": 210439,
      "size": 1671812,
      "width": 2448,
      "height": 2448,
      "parsedcomment": "Uploaded image file and metadata via API",
      "comment": "Uploaded image file and metadata via API",
      "canonicaltitle": "File:Madrassa Ben Youssef alcove ceiling.jpg",
      "url": "https://upload.wikimedia.org/wikipedia/commons/c/cd/Madrassa_Ben_Youssef_alcove_ceiling.jpg",
      "descriptionurl": "https://commons.wikimedia.org/wiki/File:Madrassa_Ben_Youssef_alcove_ceiling.jpg",
      "sha1": "824f76116df68c628b08cd03ef6ce4ce85ce0c53",
      "metadata": [
        {
          "name": "Make",
          "value": "Apple"
        },
        {
          "name": "Model",
          "value": "iPhone 5"
        },
        {
          "name": "Orientation",
          "value": 1
        },
  

In [23]:
# ----------------
# Set the image caption
# ----------------

# This has to be done in an API call separate from the upload 
# since the caption is a Wikibase label and not part of the Wikitext

'''
parameters = {
    'action': 'wbsetlabel',
    'format': 'json',
    'token': commons_csrf_token,
    'site': 'commonswiki',
    'title': 'File:' + image_filename,
    'value': caption,
    'language': caption_language,
    'summary': 'Add caption via API'
}
'''
#print(json.dumps(parameters, indent = 2))

# Set the caption for the file named in image_filename and print the API reply.
# WARNING: running this performs a real edit using the logged-in session.
sleeptime = 0 # use zero if running individual cells manually, use commons_sleep if code in single script or looping

data = set_commons_image_caption(image_filename, caption, caption_language, commons_session, commons_csrf_token, sleeptime)

#response = commons_session.post(endpointUrl, data = parameters)
#data = response.json()
print(json.dumps(data, indent=2))

#sleep(commons_sleep)



{
  "entity": {
    "labels": {
      "en": {
        "language": "en",
        "value": "Ceiling detail of alcove in Madrassa Ben Youssef, Marrakech, Morocco."
      }
    },
    "id": "M89538708",
    "type": "mediainfo",
    "lastrevid": 415374189
  },
  "success": 1
}


In [25]:
# ----------------
# Add structured data
# ----------------

# Intro on structured data: https://commons.wikimedia.org/wiki/Commons:Structured_data
# See also this on GLAM https://commons.wikimedia.org/wiki/Commons:Structured_data/GLAM

# Add one statement (property_p_id -> value_q_id) to the structured data of the file named in image_filename.
# WARNING: running this performs a real edit using the logged-in session.
property_p_id = 'P180' # depicts
value_q_id = 'Q2520975' # decoration = architectural features intended to beautify an area, a building or an object

sleeptime = 0 # use zero if running individual cells manually, use commons_sleep if code in single script or looping
data = create_commons_claim(image_filename, property_p_id, value_q_id, commons_session, commons_csrf_token, sleeptime)
'''
wikibase_subject_id = 'M' + get_commons_image_pageid(image_filename)
property_p_id = 'P180' # depicts
value_q_id = 'Q384177' # Egyptian Revival (architecture)

stripped_q_number = value_q_id[1:len(value_q_id)] # remove initial "Q" from object string
value_dictionary = {
    'entity-type': 'item',
    'numeric-id': stripped_q_number
}
value_json_string = json.dumps(value_dictionary)

parameters = {
    'action':'wbcreateclaim',
    'format':'json',
    'token': commons_csrf_token,
    'entity': wikibase_subject_id,
    'snaktype':'value',
    'property': property_p_id,
    # note: the value is a JSON string, not an actual data structure.  I think it will get URL encoded by requests before posting
    'value': value_json_string,
    'summary': 'Add depicts value structured data via API'
}

#print(json.dumps(parameters, indent = 2))
response = commons_session.post(endpointUrl, data = parameters)
data = response.json()
'''
print(json.dumps(data, indent=2))

#sleep(commons_sleep)


{
  "pageinfo": {
    "lastrevid": 415375013
  },
  "success": 1,
  "claim": {
    "mainsnak": {
      "snaktype": "value",
      "property": "P180",
      "hash": "137cf2807669705931c118832d0f72deebe411a4",
      "datavalue": {
        "value": {
          "entity-type": "item",
          "numeric-id": 2520975,
          "id": "Q2520975"
        },
        "type": "wikibase-entityid"
      },
      "datatype": "wikibase-item"
    },
    "type": "statement",
    "id": "M89538708$39358FDF-4A64-47B2-99AC-7B257FE72800",
    "rank": "normal"
  }
}


In [None]:
# Look at https://commons.wikimedia.org/wiki/File:USS_Arizona_afloat_after_launch_NARA_19-LC-19A-24.tif
# to see how they linked to their collection in Wikidata and also how they did the Record ID
# They seem to be using their own NARA template

# Categorization: according to https://commons.wikimedia.org/wiki/Commons:Bots#Bot_accounts all uploads are expected to apply at least one category



# Linking
# Wikilinks from the Commons: https://en.wikipedia.org/wiki/Wikipedia:Wikilinks_from_the_Commons
