In [None]:
# Description of bots on Commons: https://commons.wikimedia.org/wiki/Commons:Bots
# See guidelines for operating a bot in Commons: https://commons.wikimedia.org/wiki/Commons:Bots/Requests
# Need to decide whether this applies if non autonomous. It probably does.
# The bot flag is an indication of community trust: it confirms that edits are not likely to need manual checking
# and it prevents the new images/recent changes lists from getting swamped.

# Generic Commons API reference: https://commons.wikimedia.org/w/api.php
# NOTE: this script does not distinguish between the image file name, which contains underscores
# between words, and the image file name part of the page title, which contains spaces.
# When the page title is formed by prepending "File:" to the image file name,
# the API seems to automatically convert the underscores to spaces (or something similar) and
# is able to match it with the actual page title. It is not known whether it would be important
# to explicitly differentiate between the two in case this automatic conversion doesn't always work.

# ----------------
# Common code
# ----------------

# This section contains import statements and function definitions.
# It should be run before running other sections of the code

import json
import requests
import csv
from pathlib import Path
from time import sleep
import sys
import re # regex
from bs4 import BeautifulSoup # web-scraping library, use PIP to install beautifulsoup4 (included in Anaconda)

# ------------------------
# function definitions
# ------------------------

# Utility functions

# function to get local name from an IRI
def extract_localname(iri):
    """Return the local name of an IRI, i.e. the text after its final slash.

    For example, http://www.wikidata.org/entity/Q6386232 yields Q6386232.
    A string containing no slash is returned unchanged, and a string ending
    in a slash yields an empty string.
    """
    # Splitting once from the right leaves the local name as the last element
    return iri.rsplit('/', 1)[-1]

# read from a CSV file into a list of dictionaries
def read_dict(filename):
    """Read a UTF-8 CSV file and return its rows as a list of dictionaries.

    The first row of the file supplies the dictionary keys; every following
    row becomes one dictionary in the returned list.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        # Materialize the rows before the file is closed
        return list(csv.DictReader(csv_file))

# write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write a list of dictionaries to a UTF-8 CSV file.

    table: iterable of dictionaries, one per output row.
    filename: path of the CSV file to create (an existing file is overwritten).
    fieldnames: column names, in output order; written as the header row.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)

# Authentication functions

def login(path, relative_to_home):
    """Log in to a Wikimedia API and return what is needed for later requests.

    path: location of the credentials file read by retrieve_credentials().
    relative_to_home: if true, path is taken relative to the user's home directory;
        otherwise it is used as given.

    Returns a dictionary with the keys 'session' (a requests.Session carrying the
    User-Agent header from the credentials file), 'csrftoken' (the token returned by
    get_csrf_token()), and 'endpoint' (the API URL built from the credentials file).

    A warning is printed if the login response does not report success; the function
    still returns so that existing callers are not broken.
    """
    if relative_to_home:
        home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
        full_credentials_path = home + '/' + path
    else:
        full_credentials_path = path
    credentials = retrieve_credentials(full_credentials_path)

    resource_url = '/w/api.php' # default API resource URL for all Wikimedia APIs
    endpoint_url = credentials['url'] + resource_url

    # Instantiate session
    session = requests.Session()
    # Set default User-Agent header so you don't have to send it with every request
    session.headers.update({'User-Agent': credentials['user_agent']})

    # Go through the sequence of steps needed to get the CSRF token
    login_token = get_login_token(endpoint_url, session)
    login_response = session_login(endpoint_url, login_token, credentials['username'], credentials['password'], session)

    # The login outcome used to be discarded, which hid bad credentials until a later write failed.
    # NOTE(review): assumes the API reports the outcome as response['login']['result'] == 'Success' -- confirm.
    login_section = login_response.get('login') if isinstance(login_response, dict) else None
    login_result = login_section.get('result') if isinstance(login_section, dict) else None
    if login_result != 'Success':
        print('Warning: login did not report success. API response:', login_response)

    csrf_token = get_csrf_token(endpoint_url, session)
    return {'session': session, 'csrftoken': csrf_token, 'endpoint': endpoint_url}

def retrieve_credentials(path):
    """Read API credentials from a plain text file.

    The file must have at least four lines of the form key=value, in this order:
    endpoint URL, username, password, user agent. Only the position of each line
    matters; the text before the first "=" is ignored.

    Returns a dictionary with the keys 'url', 'username', 'password', and 'user_agent'.
    Raises IndexError if a required line is missing or contains no "=".
    """
    with open(path, 'rt') as file_object:
        line_list = file_object.read().split('\n')

    def value_of(line):
        # Split on the FIRST "=" only, so values that themselves contain "="
        # (e.g. passwords or user agent strings) are not truncated.
        return line.split('=', 1)[1]

    credentials = {
        'url': value_of(line_list[0]),
        'username': value_of(line_list[1]),
        'password': value_of(line_list[2]),
        'user_agent': value_of(line_list[3])
    }
    return credentials

def get_login_token(apiUrl, session):
    """Request a login token from the API and return it.

    apiUrl: URL of the API endpoint.
    session: object whose get() method performs the request (e.g. a requests.Session).

    Returns the value found at ['query']['tokens']['logintoken'] in the JSON response.
    """
    token_query = dict(action='query', meta='tokens', type='login', format='json')
    reply = session.get(url=apiUrl, params=token_query)
    return reply.json()['query']['tokens']['logintoken']

def session_login(apiUrl, token, username, password, session):
    """Post a login request through the session and return the decoded JSON response.

    apiUrl: URL of the API endpoint.
    token: login token, as returned by get_login_token().
    username, password: credentials sent as 'lgname' and 'lgpassword'.
    session: object whose post() method performs the request (e.g. a requests.Session).
    """
    login_form = dict(
        action='login',
        lgname=username,
        lgpassword=password,
        lgtoken=token,
        format='json'
    )
    return session.post(apiUrl, data=login_form).json()

def get_csrf_token(apiUrl, session):
    """Request a CSRF token from the API and return it.

    apiUrl: URL of the API endpoint.
    session: object whose get() method performs the request (e.g. a requests.Session).

    Returns the value found at ["query"]["tokens"]["csrftoken"] in the JSON response.
    """
    token_query = dict(action="query", meta="tokens", format="json")
    reply = session.get(url=apiUrl, params=token_query)
    return reply.json()["query"]["tokens"]["csrftoken"]

# Data upload functions

# API file Upload example: https://www.mediawiki.org/wiki/API:Upload#POST_request
# API Sandbox can be used to generate test JSON, but DO NOT RUN since it actually uploads.
# Specifically for uploads, see https://commons.wikimedia.org/wiki/Special:ApiSandbox#action=upload&filename=Wiki.png&url=http%3A//upload.wikimedia.org/wikipedia/en/b/bc/Wiki.png&token=123ABC
def upload_file_to_commons(image_filename, directory_path, relative_to_home, session, csrftoken, sleeptime, wikitext):
    """Upload a local image file to Commons and return the decoded JSON response.

    image_filename: name of the file on disk; also sent as the 'filename' parameter.
    directory_path: directory containing the file, with or without a trailing slash.
    relative_to_home: if true, directory_path is taken relative to the user's home directory.
    session: object whose post() method performs the request (e.g. the session from login()).
    csrftoken: token sent as the 'token' parameter.
    sleeptime: seconds to pause after the request.
    wikitext: text sent as the 'text' parameter.

    The request always goes to https://commons.wikimedia.org/w/api.php and sets
    'ignorewarnings' to 1.
    """
    if relative_to_home:
        home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
        directory_path = home + '/' + directory_path

    parameters = {
        'action': 'upload',
        'filename': image_filename,
        'format': 'json',
        'token': csrftoken,
        'ignorewarnings': 1,
        'text': wikitext,
        # this is what generates the text in the Description box on user Uploads page and initial edit summary for page
        # See https://commons.wikimedia.org/wiki/Commons:First_steps/Quality_and_description#Upload_summary
        'comment': 'Uploaded image file and metadata via API'
    }

    # Join with pathlib so the directory works with or without a trailing separator
    file_path = Path(directory_path) / image_filename

    # Keep the file open only for the duration of the request, so the handle is always
    # released (it was previously never closed).
    with open(file_path, 'rb') as image_file:
        file_dict = {'file': (image_filename, image_file, 'multipart/form-data')}
        response = session.post('https://commons.wikimedia.org/w/api.php', files=file_dict, data = parameters)
    data = response.json()

    # for non-critical applications, do not hit the API rapidly
    sleep(sleeptime)
    return data

# Adding the image caption seems to be a hack that uses the Wikibase API command wbsetlabel.
# Captions are Wikibase labels (language specific), limit 255 characters length.
# See https://commons.wikimedia.org/wiki/Commons:File_captions#Technical
def set_commons_image_caption(image_filename, caption, caption_language, session, csrftoken, sleeptime):
    """Set the caption of a Commons file page and return the decoded JSON response.

    image_filename: file name; "File:" is prepended to form the page title.
    caption: caption text, sent as the 'value' parameter.
    caption_language: language code of the caption, sent as the 'language' parameter.
    session: object whose post() method performs the request (e.g. the session from login()).
    csrftoken: token sent as the 'token' parameter.
    sleeptime: seconds to pause after the request.
    """
    page_title = 'File:' + image_filename
    payload = dict(
        action='wbsetlabel',
        format='json',
        token=csrftoken,
        site='commonswiki',
        title=page_title,
        value=caption,
        language=caption_language,
        summary='Add caption via API'
    )

    api_reply = session.post('https://commons.wikimedia.org/w/api.php', data=payload)
    result = api_reply.json()

    # Pause after the write so the API is not hit too rapidly
    sleep(sleeptime)
    return result

# This function is used in the following function, which needs a page ID rather than a name
def get_commons_image_pageid(image_filename):
    """Look up a Commons file page and return its page ID.

    image_filename: file name; "File:" is prepended to form the page title.

    Returns the first key of the ['query']['pages'] object in the JSON response.
    Only one title is requested, so only one key is expected.
    """
    query = {
        'action': 'query',
        'format': 'json',
        'titles': 'File:' + image_filename,
        'prop': 'info'
    }
    reply = requests.get('https://commons.wikimedia.org/w/api.php', params=query)

    # The pages object is keyed by page ID; take the first (only) key.
    pages_by_id = reply.json()['query']['pages']

    # No sleep here: API reads are assumed to be less resource-intensive than writes
    return list(pages_by_id)[0]

# Code comes from writeStatement() function at https://github.com/HeardLibrary/digital-scholarship/blob/master/code/wikibase/api/load_csv.py
# Described in this blog post: http://baskauf.blogspot.com/2019/06/putting-data-into-wikidata-using.html
def create_commons_claim(image_filename, property_p_id, value_q_id, session, csrftoken, sleeptime):
    """Add a claim with an item value to a Commons file and return the decoded JSON response.

    image_filename: file name, passed to get_commons_image_pageid() to find the page ID.
    property_p_id: property identifier, e.g. 'P180' (depicts).
    value_q_id: item identifier of the value, e.g. 'Q384177'; its first character is dropped
        to form the 'numeric-id' sent to the API.
    session: object whose post() method performs the request (e.g. the session from login()).
    csrftoken: token sent as the 'token' parameter.
    sleeptime: seconds to pause after the request.
    """
    # The entity ID of a file is "M" followed by its page ID
    entity_id = 'M' + get_commons_image_pageid(image_filename)

    # The 'value' parameter is a JSON string, not an actual data structure.
    # It will get URL encoded by requests before posting.
    claim_value = json.dumps({
        'entity-type': 'item',
        'numeric-id': value_q_id[1:]  # remove initial "Q" from object string
    })

    payload = {
        'action':'wbcreateclaim',
        'format':'json',
        'token': csrftoken,
        'entity': entity_id,
        'snaktype':'value',
        'property': property_p_id,
        'value': claim_value,
        'summary': 'Add structured data via API'
    }

    api_reply = session.post('https://commons.wikimedia.org/w/api.php', data=payload)
    result = api_reply.json()

    # Pause after the write so the API is not hit too rapidly
    sleep(sleeptime)
    return result

# ---------------
# Not used yet
# ---------------

# This function attempts to post and handles maxlag errors
def attemptPost(apiUrl, parameters, session=None):
    """Post to the API, retrying with increasing delays while the server reports maxlag.

    apiUrl: URL of the API endpoint.
    parameters: dictionary of POST parameters.
    session: object whose post() method performs the request (e.g. the session from login()).
        If omitted, a module-level variable named `session` is used, as this function
        originally assumed; NameError is raised if there is none.

    Returns the decoded JSON response as soon as it is anything other than a maxlag error
    (a success, or an error with a different code).
    Raises SystemExit if the server still reports maxlag after the last retry.
    """
    if session is None:
        # Backward compatibility: the original body referred to a global `session`.
        session = globals().get('session')
        if session is None:
            raise NameError("name 'session' is not defined; pass session=... to attemptPost()")

    maxRetries = 10
    baseDelay = 5 # Wikidata recommends a delay of at least 5 seconds
    delayLimit = 300

    # One initial attempt plus up to maxRetries retries
    for retry in range(maxRetries + 1):
        if retry > 0:
            print('retry:', retry)
        r = session.post(apiUrl, data = parameters)
        print(r.text)
        data = r.json()

        # check if response is a maxlag error
        # see https://www.mediawiki.org/wiki/Manual:Maxlag_parameter
        error = data.get('error') if isinstance(data, dict) else None
        if not isinstance(error, dict) or error.get('code') != 'maxlag':
            # Either there is no error key (success) or the error is not maxlag
            return data

        print('Lag of ', error.get('lag'), ' seconds.')
        if retry != maxRetries:
            # The Retry-After header was found to be basically useless, so instead
            # double the delay with each retry, up to delayLimit
            recommendedDelay = min(baseDelay*2**retry, delayLimit)
            print('Waiting ', recommendedDelay , ' seconds.')
            print()
            sleep(recommendedDelay)

    # here's where execution goes after maxRetries tries
    print('Failed after ' + str(maxRetries) + ' retries.')
    sys.exit(1) # just abort the script

In [None]:
# -------------------
# Retrieving image data from the Commons MediaWiki API
# -------------------

# Initially, I thought that it was necessary to know the Wikibase entity ID to set the caption.
# So I wrote code to extract that from a query. However, one can use the page title, so that
# isn't actually necessary. But it might be needed anyway for the structured data part.

# The Wikibase entity ID can be used in lieu of the site+page name.
# The format is "M" plus the page ID. So page ID 41837276 has the entity ID M41837276
# Use action=query&prop=info&titles=File:Pluto-01_Stern_03_Pluto_Color_TXT.jpg and 
# extract the pageid field from result.

# Commons API examples: https://commons.wikimedia.org/wiki/Commons:API/MediaWiki
# sandbox: https://commons.wikimedia.org/wiki/Special:ApiSandbox#action=query&format=json&prop=categories%7Cimageinfo&titles=File%3AMasonry_patterns_in_doorway_Tetouan_Morocco.jpg
# General Sandbox index: https://commons.wikimedia.org/wiki/Special:ApiSandbox#action=query&format=json&prop=categories%7Cimageinfo&titles
def retrieve_commons_api_info(act_id, filename):
    """Retrieve the extended metadata of one Commons file and flatten it into a dictionary.

    act_id: identifier copied unchanged into the result under 'act_id'.
    filename: file name; "File:" is prepended to form the page title that is queried.

    Returns a dictionary with the keys 'act_id', 'filename', 'wikibase_id' (the key of the
    page in the API response, i.e. the page ID without the "M" prefix), 'categories_list'
    (JSON string), 'artist_name', 'artist_description', 'user_name', 'user_url',
    'license_name', 'license_code', 'dateTime_original_string', 'main_subject_P921_label',
    'main_subject_P921_full', 'main_subject_P921_qid', 'title', and 'titles_list' (JSON string).
    Values that are not present in the response are empty strings (or an empty JSON list).

    Raises KeyError if the response has no 'imageinfo' for the page.
    NOTE(review): presumably that is what happens when the file does not exist -- confirm.
    """
    apiUrl = 'https://commons.wikimedia.org/w/api.php'

    # Other queries that were tried during development (all with action=query, format=json):
    # - photos by a user: list=allimages, aiuser=Baskaufs, aisort=timestamp
    # - category information about a photo: titles=File:..., prop=categories|imageinfo
    #   NOTE: fewer items than categories provided in the extmetadata option
    # - raw metadata embedded in a photo: titles=File:..., prop=imageinfo, iiprop=metadata,
    #   iimetadataversion=latest
    # - metadata from the file page: titles=File:..., prop=info
    #   (does not produce much except the page title, basically the filename)

    # get metadata for a photo including from file page (lots of data, but doesn't correspond exactly to data on image page)
    params = {
        'action': 'query',
        'format': 'json',
        'titles': 'File:' + filename,
        'prop': 'imageinfo',
        'iiprop': 'extmetadata'
    }

    response = requests.get(apiUrl, params=params)
    data = response.json()

    info = {'act_id': act_id, 'filename': filename} # dict to collect results
    pages = data['query']['pages']
    # The only key within the pages object is the page ID (the Wikibase ID without the "M").
    # Assuming there is only one ID per page title, just get the first one
    wikibase_id = list(pages.keys())[0]
    info['wikibase_id'] = wikibase_id
    # The imageinfo value is a list. Not sure why an image would have multiple imageinfos, so just get 1st one
    item_data = pages[wikibase_id]['imageinfo'][0]['extmetadata']

    if 'Categories' in item_data:
        categories_list = item_data['Categories']['value'].split('|')
    else:
        categories_list = []
    info['categories_list'] = json.dumps(categories_list)

    info['artist_name'] = ''
    info['artist_description'] = ''
    info['user_name'] = ''
    info['user_url'] = ''

    # Exceptions raised when an expected tag (AttributeError/TypeError on None) or
    # attribute (KeyError) is missing from the parsed markup
    missing_markup = (AttributeError, KeyError, TypeError)

    if "Artist" in item_data:
        soup = BeautifulSoup(item_data['Artist']['value'], 'html.parser')
        # Guard against empty markup, where there is no first node to inspect
        first_node_name = soup.contents[0].name if soup.contents else None
        if first_node_name == 'bdi':
            try:
                info['artist_name'] = soup.bdi.a.span.string
            except missing_markup:
                info['artist_name'] = str(soup)
            try:
                info['artist_description'] = soup.bdi.a.span['title']
            except missing_markup:
                info['artist_description'] = ''
        elif first_node_name == 'a':
            try:
                info['user_name'] = soup.a.string
            except missing_markup:
                info['user_name'] = str(soup)
            try:
                info['user_url'] = soup.a['href']
            except missing_markup:
                info['user_url'] = ''
        else:
            # Neither of the recognized structures: keep the raw markup as the name
            info['artist_name'] = str(soup)

    if 'LicenseShortName' in item_data:
        info['license_name'] = item_data['LicenseShortName']['value']
    else:
        info['license_name'] = ''
    if 'License' in item_data:
        info['license_code'] = item_data['License']['value']
    else:
        info['license_code'] = ''

    if 'DateTimeOriginal' in item_data:
        # Keep only the text before the first markup tag
        info['dateTime_original_string'] = item_data['DateTimeOriginal']['value'].split('<')[0].strip()
    else:
        info['dateTime_original_string'] = ''

    if 'ImageDescription' in item_data:
        soup = BeautifulSoup(item_data['ImageDescription']['value'], 'html.parser')
        try:
            info['main_subject_P921_label'] = soup.div.a.span.string
            info['main_subject_P921_full'] = soup.div.a.span['title']
            info['main_subject_P921_qid'] = extract_localname(soup.div.a['href'])
        except missing_markup:
            # Not the expected div/a/span structure: keep the raw markup as the label
            info['main_subject_P921_label'] = str(soup)
            info['main_subject_P921_full'] = ''
            info['main_subject_P921_qid'] = ''
    else:
        info['main_subject_P921_label'] = ''
        info['main_subject_P921_full'] = ''
        info['main_subject_P921_qid'] = ''

    if 'ObjectName' in item_data:
        soup = BeautifulSoup(item_data['ObjectName']['value'], 'html.parser')
        # use regex to match any text
        my_regex = re.compile(".*")
        title_soup = soup.find_all(text=my_regex)
        # remove newline items from list of title strings
        clean_title_list = [x.strip() for x in title_soup if x != '\n']
    else:
        clean_title_list = []
    if len(clean_title_list) > 0:
        info['title'] = clean_title_list[0]
    else:
        info['title'] = ''
    info['titles_list'] = json.dumps(clean_title_list)
    return info

    # print(json.dumps(info, indent = 2))


In [None]:
# ---------------------------
# Scrape data from table on image page
# ---------------------------

server_sleep = 0.1 # seconds to pause after each page request

# Guidelines for providing information using the Information template: https://commons.wikimedia.org/wiki/Template:Information
# Note special template for artwork having more extensive metadata: https://commons.wikimedia.org/wiki/Template:Artwork
# Historical photographs (e.g. museums) https://commons.wikimedia.org/wiki/Template:Photograph
# Art photo template adds to artwork template https://commons.wikimedia.org/wiki/Template:Art_Photo
# Credit line template provides attribution text required for CC BY licenses https://commons.wikimedia.org/wiki/Template:Credit_line

file_path = '../../vandycite/act/processed_lists/add_to_wikidata.csv'
file_data = read_dict(file_path)

with open('all_fields.json', 'rt') as file_object:
    all_field_list = json.loads(file_object.read())
    field_list = all_field_list # stop using separate lists

# Create a list of fields to be used for the CSV output
output_fields = ['filename', 'template_type'] + list(all_field_list)

# Regex that matches any text; compiled once rather than once per table row
any_text_regex = re.compile(".*")

nonstandard_fields = []
output_list = []
for record in file_data[200:300]:
    image_filename = record['filename']
    print(image_filename)

    # Create a dictionary with keys for all fields and empty list values
    output_dict = {'filename': image_filename}
    for field in all_field_list:
        output_dict[field] = []

    # Retrieve the page HTML
    page_url = 'https://commons.wikimedia.org/wiki/File:' + image_filename
    response = requests.get(page_url)

    # Create a soup object and find the file info table
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.findAll('table', class_= re.compile('fileinfotpl'))
    if len(tables) == 0:
        output_dict['template_type'] = 'none'
        print('No data table')
        print()
        output_list.append(output_dict)
        sleep(server_sleep) # throttle on this path too; a page was still requested
        continue
    try:
        template_type = str(tables[0]['class'][0]).split('-')[-1]
    except (KeyError, IndexError):
        template_type = 'unknown'
    print('Page template:', template_type) # useful types are "information" and "artwork"
    print()
    output_dict['template_type'] = template_type

    # Step through all of the rows looking for columns with particular labels
    rows = tables[0].tbody.findAll('tr', recursive=False) # find only in direct child tr elements
    for row in rows:
        columns = row.findAll('td')
        if len(columns) < 2:
            # A label cell and a value cell are both needed. Rows with fewer td cells
            # used to either be skipped (none) or crash on columns[1] (one).
            continue

        # In cases where the contents of the td tag are more complex than a simple string (e.g. if the
        # contents include text and markup tags), the strings attribute generates an iterable
        # of the strings included inside the tag (a "generator"). The first string is the one we want
        label_strings = list(columns[0].strings)
        # Use an empty name when the label cell has no text, rather than reusing the
        # field name left over from the previous row.
        field_name = label_strings[0] if len(label_strings) > 0 else ''
        lowered_name = field_name.lower()
        field_matched = len(label_strings) > 0 and lowered_name in field_list

        if columns[1].string:
            # Turn the string into a list of one string
            value_list = [columns[1].string]
        else:
            # Collect every text node in the cell, dropping those that are empty after stripping
            value_list = [x.strip() for x in columns[1].find_all(text=any_text_regex) if x.strip() != '']
        value_string = json.dumps(value_list)

        if field_matched:
            # Insert the found value into the dict for the matched field
            output_dict[lowered_name] = value_string
        else:
            nonstandard_fields.append({'filename': image_filename, 'fieldname': field_name, 'value': value_string})
    sleep(server_sleep) # Don't hit the web server too fast
    output_list.append(output_dict)

write_dicts_to_csv(nonstandard_fields, 'nonstandard_fields.csv', ['filename', 'fieldname', 'value'])
write_dicts_to_csv(output_list, 'templated_data.csv', output_fields)

print('done')

In [None]:
# ---------------------------
# Look for a Wikidata link on the image page
# ---------------------------

# For references on art in Wikidata, see https://www.wikidata.org/wiki/Wikidata:WikiProject_sum_of_all_paintings
# https://www.wikidata.org/wiki/Wikidata:WikiProject_Visual_arts/Item_structure

file_path = '../../vandycite/act/processed_lists/add_to_wikidata.csv'
file_data = read_dict(file_path)

# Seconds to pause after each page request. This name was used below without being
# defined in this notebook; keep any value that was already set, otherwise default to 0.1.
read_api_sleep = globals().get('read_api_sleep', 0.1)

# Links to Wikidata items; the dots are escaped so they only match literal dots
wikidata_link_regex = re.compile(r'https://www\.wikidata\.org/wiki/')

output_list = []
for record in file_data[860:]:
    image_filename = record['filename']
    print(image_filename)

    # Retrieve the page HTML
    page_url = 'https://commons.wikimedia.org/wiki/File:' + image_filename
    response = requests.get(page_url)

    # Create a soup object and find the file info table
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.findAll('table', class_= re.compile('fileinfotpl'))
    if len(tables) > 0:

        # Have to check for this span because there are other subtables with a tags besides the one at the top
        span = tables[0].findAll('span', id = 'artwork')
        if len(span) > 0:
            # The link to the Wikidata item will be in an href
            anchors = span[0].findAll('a', href = wikidata_link_regex)
            if len(anchors) > 0:
                link = anchors[0]['href']
                qid = extract_localname(link)

                retrieved_data = {'act_id': record['RecordNumber'], 'qid': qid, 'filename': image_filename}
                output_list.append(retrieved_data)
    sleep(read_api_sleep) # Don't hit the server too fast

fieldnames = ['act_id', 'qid', 'filename']
write_dicts_to_csv(output_list, 'wikidata_found.csv', fieldnames)

print('done')


In [None]:
# Write whatever is currently in output_list to a second file, leaving wikidata_found.csv untouched
fieldnames = ['act_id', 'qid', 'filename']
write_dicts_to_csv(table=output_list, filename='wikidata_found2.csv', fieldnames=fieldnames)

print('done')