In [1]:
# Description of bots on Commons: https://commons.wikimedia.org/wiki/Commons:Bots
# See guidelines for operating a bot in Commons: https://commons.wikimedia.org/wiki/Commons:Bots/Requests
# Need to decide whether this applies if non autonomous. It probably does.
# A bot flag keeps the new-images and recent-changes lists from getting swamped by bot edits.
# It is also an indication of community trust: it confirms that the edits are not likely to need manual checking.

# Generic Commons API reference: https://commons.wikimedia.org/w/api.php
# NOTE: this script does not distinguish between the image file name, which contains underscores
# between words, and the image file name part of the page title, which contains spaces.
# When the image file name is formed by prepending "File:" to the image file name,
# the API seems to automatically convert the underscores to spaces (or something) and
# be able to match with the actual page title. Don't know if it would be important
# to explicitly differentiate between the two in case this automatic conversion doesn't always work.

# ----------------
# Common code
# ----------------

# This section contains import statements and function definitions.
# It should be run before running other sections of the code

import json
import requests
import csv
from pathlib import Path
from time import sleep
import sys
import re # regex
from bs4 import BeautifulSoup # web-scraping library, use PIP to install beautifulsoup4 (included in Anaconda)

# ------------------------
# function definitions
# ------------------------

# Utility functions

# function to get the local name from an IRI
def extract_localname(iri):
    """Return the local name of an IRI, i.e. the text after its final '/'.

    Example: 'http://www.wikidata.org/entity/Q6386232' gives 'Q6386232'.
    A string that contains no '/' is returned unchanged.
    """
    # Split once from the right; the last element is everything after the final slash.
    return iri.rsplit('/', 1)[-1]

# read from a CSV file into a list of dictionaries
def read_dict(filename):
    """Load a UTF-8 encoded CSV file as a list of row dictionaries.

    The first row of the file supplies the dictionary keys; every later row
    becomes one dictionary in the returned list.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        # Materialize all rows while the file is still open.
        return list(csv.DictReader(csv_file))

# write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write a list of dictionaries to a UTF-8 CSV file.

    table: iterable of dictionaries, one per output row.
    filename: path of the file to create (an existing file is overwritten).
    fieldnames: column names, in output order; written as the header row.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(table)

# Authentication functions

def login(path, relative_to_home):
    """Log in to a Wikimedia API and return what is needed to make authenticated posts.

    path: location of the credentials file (see retrieve_credentials for its layout).
    relative_to_home: if true, path is taken relative to the user's home directory;
        otherwise it is used as given (absolute, or relative to the working directory).

    Returns a dict with keys 'session' (a requests.Session carrying the login cookies
    and the User-Agent header), 'csrftoken' (token to send with write requests) and
    'endpoint' (the full API URL).

    Raises RuntimeError if the API does not report a successful login.
    """
    if relative_to_home:
        home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
        full_credentials_path = home + '/' + path
    else:
        full_credentials_path = path
    credentials = retrieve_credentials(full_credentials_path)

    resource_url = '/w/api.php' # default API resource URL for all Wikimedia APIs
    endpoint_url = credentials['url'] + resource_url

    # Instantiate session
    session = requests.Session()
    # Set default User-Agent header so you don't have to send it with every request
    session.headers.update({'User-Agent': credentials['user_agent']})

    # Go through the sequence of steps needed to get the CSRF token
    login_token = get_login_token(endpoint_url, session)
    data = session_login(endpoint_url, login_token, credentials['username'], credentials['password'], session)

    # Check the outcome of the login request. Without this check a failed login goes
    # unnoticed: the token request below would still be made on the unauthenticated
    # session and the failure would only surface at the first write.
    login_result = data.get('login', {}).get('result')
    if login_result != 'Success':
        raise RuntimeError('Login to ' + endpoint_url + ' failed. API response: ' + json.dumps(data))

    csrf_token = get_csrf_token(endpoint_url, session)
    return {'session': session, 'csrftoken': csrf_token, 'endpoint': endpoint_url}

def retrieve_credentials(path):
    """Read API credentials from a plain-text file of 'key=value' lines.

    The values are taken by line position, not by key name:
    line 1 is the site URL, line 2 the username, line 3 the password and
    line 4 the User-Agent string. The text before the first '=' is ignored.

    Returns a dict with keys 'url', 'username', 'password' and 'user_agent'.
    Raises IndexError if the file has fewer than four lines or a line has no '='.
    """
    with open(path, 'rt') as file_object:
        line_list = file_object.read().split('\n')

    def value_of(line_number):
        # Split on the first '=' only, so that a value which itself contains '='
        # (e.g. a password or a User-Agent string) is not truncated.
        return line_list[line_number].split('=', 1)[1]

    credentials = {
        'url': value_of(0),
        'username': value_of(1),
        'password': value_of(2),
        'user_agent': value_of(3)
    }
    return credentials

def get_login_token(apiUrl, session):
    """Request a login token from the API at apiUrl using the given session.

    Returns the 'logintoken' string found under query -> tokens in the JSON response.
    """
    token_query = dict(action='query', meta='tokens', type='login', format='json')
    response = session.get(url=apiUrl, params=token_query)
    return response.json()['query']['tokens']['logintoken']

def session_login(apiUrl, token, username, password, session):
    """Post an action=login request on the session and return the parsed JSON response.

    token is the login token obtained beforehand; username and password are sent
    as the lgname and lgpassword form fields.
    """
    form_fields = {
        'action': 'login',
        'lgname': username,
        'lgpassword': password,
        'lgtoken': token,
        'format': 'json'
    }
    return session.post(apiUrl, data=form_fields).json()

def get_csrf_token(apiUrl, session):
    """Request a token from the API at apiUrl using the given session.

    Returns the 'csrftoken' string found under query -> tokens in the JSON response.
    """
    token_query = dict(action="query", meta="tokens", format="json")
    reply = session.get(url=apiUrl, params=token_query).json()
    return reply["query"]["tokens"]["csrftoken"]

# Data upload functions

# API file Upload example: https://www.mediawiki.org/wiki/API:Upload#POST_request
# API Sandbox can be used to generate test JSON, but DO NOT RUN since it actually uploads.
# Specifically for uploads, see https://commons.wikimedia.org/wiki/Special:ApiSandbox#action=upload&filename=Wiki.png&url=http%3A//upload.wikimedia.org/wikipedia/en/b/bc/Wiki.png&token=123ABC
def upload_file_to_commons(image_filename, directory_path, relative_to_home, session, csrftoken, sleeptime, wikitext):
    """Upload a local image file to Wikimedia Commons and return the parsed JSON response.

    image_filename: name of the local file; also sent as the destination file name.
    directory_path: directory holding the file. It is concatenated directly with
        image_filename, so it must end with a path separator (e.g. 'Downloads/').
    relative_to_home: if true, directory_path is taken relative to the user's home directory.
    session: logged-in session object used to post the request.
    csrftoken: CSRF token for the session.
    sleeptime: seconds to wait after the request, to avoid hitting the API rapidly.
    wikitext: initial text of the file page.

    Note that 'ignorewarnings' is set, so API upload warnings do not block the upload.
    """
    if relative_to_home:
        home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
        directory_path = home + '/' + directory_path

    parameters = {
        'action': 'upload',
        'filename': image_filename,
        'format': 'json',
        'token': csrftoken,
        'ignorewarnings': 1,
        'text': wikitext,
        # this is what generates the text in the Description box on user Uploads page and initial edit summary for page
        # See https://commons.wikimedia.org/wiki/Commons:First_steps/Quality_and_description#Upload_summary
        'comment': 'Uploaded image file and metadata via API'
    }
    file_path = directory_path + image_filename

    # Open the file in a with block so the handle is released as soon as the request
    # is done, even if the post raises. Previously the handle was never closed.
    with open(file_path, 'rb') as file_object:
        file_dict = {'file': (image_filename, file_object, 'multipart/form-data')}
        response = session.post('https://commons.wikimedia.org/w/api.php', files=file_dict, data=parameters)
    data = response.json()

    # for non-critical applications, do not hit the API rapidly
    sleep(sleeptime)
    return data

# Adding the image caption seems to be a hack that uses the Wikibase API command wbsetlabel.
# Captions are Wikibase labels (language specific), limit 255 characters length.
# See https://commons.wikimedia.org/wiki/Commons:File_captions#Technical
def set_commons_image_caption(image_filename, caption, caption_language, session, csrftoken, sleeptime):
    """Set the caption of a Commons file page in one language via the wbsetlabel action.

    The page is addressed by site ('commonswiki') plus title ('File:' + image_filename).
    Returns the parsed JSON response after pausing for sleeptime seconds.
    """
    form_fields = dict(
        action='wbsetlabel',
        format='json',
        token=csrftoken,
        site='commonswiki',
        title='File:' + image_filename,
        value=caption,
        language=caption_language,
        summary='Add caption via API',
    )
    reply = session.post('https://commons.wikimedia.org/w/api.php', data=form_fields).json()

    # pause after the write before handing the result back
    sleep(sleeptime)
    return reply

# This function is used in the following function, which needs a page ID rather than a name
def get_commons_image_pageid(image_filename):
    """Look up the Commons page ID for the file page 'File:' + image_filename.

    Sends an unauthenticated action=query / prop=info request and returns the first
    key of the response's 'pages' object. The key is returned as found in the JSON,
    i.e. as a string.
    """
    info_query = {
        'action': 'query',
        'format': 'json',
        'titles': 'File:' + image_filename,
        'prop': 'info'
    }
    reply = requests.get('https://commons.wikimedia.org/w/api.php', params=info_query).json()

    # 'pages' is keyed by page ID; only one title was requested, so take the first key.
    pages_by_id = reply['query']['pages']
    # No sleep here: this is a read, which is less resource-intensive than a write.
    return list(pages_by_id)[0]

# Code comes from writeStatement() function at https://github.com/HeardLibrary/digital-scholarship/blob/master/code/wikibase/api/load_csv.py
# Described in this blog post: http://baskauf.blogspot.com/2019/06/putting-data-into-wikidata-using.html
def create_commons_claim(image_filename, property_p_id, value_q_id, session, csrftoken, sleeptime):
    """Add an item-valued structured-data claim to a Commons file via wbcreateclaim.

    image_filename: file name; its page ID is looked up and prefixed with 'M' to form the entity ID.
    property_p_id: property ID, e.g. 'P180' (depicts).
    value_q_id: item ID of the value, e.g. 'Q384177'; the leading letter is dropped for 'numeric-id'.
    Returns the parsed JSON response after pausing for sleeptime seconds.
    """
    entity_id = 'M' + get_commons_image_pageid(image_filename)

    # The 'value' field must be a JSON string, not a data structure;
    # requests URL-encodes it when the form is posted.
    claim_value = json.dumps({
        'entity-type': 'item',
        'numeric-id': value_q_id[1:]
    })

    form_fields = dict(
        action='wbcreateclaim',
        format='json',
        token=csrftoken,
        entity=entity_id,
        snaktype='value',
        property=property_p_id,
        value=claim_value,
        summary='Add structured data via API',
    )
    reply = session.post('https://commons.wikimedia.org/w/api.php', data=form_fields).json()

    sleep(sleeptime)
    return reply

# ---------------
# Not used yet
# ---------------

# This function attempts to post and handles maxlag errors
def attemptPost(apiUrl, parameters, session=None):
    """Post to the API, retrying with exponential back-off while the server reports maxlag.

    apiUrl: API endpoint URL.
    parameters: form fields to post.
    session: session object to post with. If omitted, a module-level variable named
        'session' is used (the original behavior); NameError is raised if there is none.

    Returns the parsed JSON response as soon as it is anything other than a maxlag
    error (success or a different error). After the retries are exhausted the
    script is aborted via sys.exit().
    See https://www.mediawiki.org/wiki/Manual:Maxlag_parameter
    """
    if session is None:
        # Backward compatibility: the function used to rely on a global 'session'.
        try:
            session = globals()['session']
        except KeyError:
            raise NameError("attemptPost needs a session: pass session=... or define a module-level 'session'") from None

    maxRetries = 10
    baseDelay = 5 # Wikidata recommends a delay of at least 5 seconds
    delayLimit = 300
    retry = 0
    # maximum number of times to retry lagged server = maxRetries
    while retry <= maxRetries:
        if retry > 0:
            print('retry:', retry)
        r = session.post(apiUrl, data = parameters)
        print(r.text)
        data = r.json()

        # Inspect the response explicitly rather than with a catch-all except,
        # which also swallowed KeyboardInterrupt raised during the sleep below.
        error = data.get('error') if isinstance(data, dict) else None
        if not isinstance(error, dict) or error.get('code') != 'maxlag':
            # either successful, or an error other than maxlag: hand it to the caller
            return data

        print('Lag of ', error.get('lag', 'unknown'), ' seconds.')
        # The Retry-After header is not used; double the delay with each retry instead.
        recommendedDelay = min(baseDelay*2**retry, delayLimit)
        if retry != maxRetries:
            print('Waiting ', recommendedDelay , ' seconds.')
            print()
            sleep(recommendedDelay)
        retry += 1

    # here's where execution goes after maxRetries tries
    print('Failed after ' + str(maxRetries) + ' retries.')
    sys.exit() # just abort the script

In [250]:
# -------------------
# Retrieving image data from the Commons MediaWiki API
# -------------------

# Initially, I thought that it was necessary to know the Wikibase entity ID to set the caption.
# So I wrote code to extract that from a query. However, one can use the page title, so that
# isn't actually necessary. But it might be needed anyway for the structured data part.

# The Wikibase entity ID can be used in lieu of the site+page name.
# The format is "M" plus the page ID. So page ID 41837276 has the entity ID M41837276
# Use action=query&prop=info&titles=File:Pluto-01_Stern_03_Pluto_Color_TXT.jpg and 
# extract the pageid field from result.

# Commons API examples: https://commons.wikimedia.org/wiki/Commons:API/MediaWiki
# sandbox: https://commons.wikimedia.org/wiki/Special:ApiSandbox#action=query&format=json&prop=categories%7Cimageinfo&titles=File%3AMasonry_patterns_in_doorway_Tetouan_Morocco.jpg
# General Sandbox index: https://commons.wikimedia.org/wiki/Special:ApiSandbox#action=query&format=json&prop=categories%7Cimageinfo&titles
def retrieve_commons_api_info(act_id, filename):
    """Query the Commons API for a file's extended metadata and flatten it into a dict.

    act_id: caller-supplied record identifier, copied unchanged into the result.
    filename: Commons file name; 'File:' is prepended to form the page title.

    Returns a dict with the keys 'act_id', 'filename', 'wikibase_id' (the key of the
    first entry in the response's 'pages' object), 'categories_list' (JSON string),
    'artist_name', 'artist_description', 'user_name', 'user_url', 'license_name',
    'license_code', 'dateTime_original_string', 'main_subject_P921_label',
    'main_subject_P921_full', 'main_subject_P921_qid', 'title' and 'titles_list'
    (JSON string). Fields that are not present in the metadata are empty strings.

    Raises KeyError or IndexError if the response has no 'imageinfo' data for the page
    (NOTE(review): presumably the case for a nonexistent file - confirm).
    """
    apiUrl = 'https://commons.wikimedia.org/w/api.php'

    # Errors raised when an expected tag or attribute is absent while walking parsed HTML:
    # attribute access on None, subscripting None, or a missing HTML attribute.
    navigation_errors = (AttributeError, KeyError, TypeError)

    # Alternative queries, kept for reference:
    #
    # get photos by a user
    # params = {
    #     'action': 'query',
    #     'format': 'json',
    #     'list': 'allimages',
    #     'aiuser': 'Baskaufs',
    #     'aisort': 'timestamp'
    # }
    #
    # get category information about a photo. NOTE: fewer items than categories provided in the extmetadata option
    # params = {
    #     'action': 'query',
    #     'format': 'json',
    #     'titles': 'File:Masonry_patterns_in_doorway_Tetouan_Morocco.jpg',
    #     'prop': 'categories|imageinfo'
    # }
    #
    # get raw metadata embedded in a photo
    # params = {
    #     'action': 'query',
    #     'format': 'json',
    #     'titles': 'File:Masonry_patterns_in_doorway_Tetouan_Morocco.jpg',
    #     'prop': 'imageinfo',
    #     'iiprop': 'metadata',
    #     'iimetadataversion': 'latest'
    # }
    #
    # get metadata for a photo including from file page (does not produce much except the page title, basically the filename)
    # params = {
    #     'action': 'query',
    #     'format': 'json',
    #     'titles': 'File:' + image_filename,
    #     'prop': 'info'
    # }

    # get metadata for a photo including from file page (lots of data, but doesn't correspond exactly to data on image page)
    params = {
        'action': 'query',
        'format': 'json',
        'titles': 'File:' + filename,
        'prop': 'imageinfo',
        'iiprop': 'extmetadata'
    }

    response = requests.get(apiUrl, params=params)
    data = response.json()
    # print(json.dumps(data, indent=2))

    info = {'act_id': act_id, 'filename': filename} # dict to collect results
    pages = data['query']['pages']
    # The only key within the pages object is the page ID (the Wikibase entity ID is "M" plus this ID).
    # Assuming there is only one ID per page title, just get the first one
    wikibase_id = list(pages.keys())[0]
    info['wikibase_id'] = wikibase_id
    # The imageinfo value is a list. Not sure why an image would have multiple imageinfos, so just get the 1st one
    item_data = pages[wikibase_id]['imageinfo'][0]['extmetadata']
    # print(json.dumps(item_data, indent = 2))
    # print(item_data.keys())

    if 'Categories' in item_data:
        categories_list = item_data['Categories']['value'].split('|')
    else:
        categories_list = []
    info['categories_list'] = json.dumps(categories_list)

    info['artist_name'] = ''
    info['artist_description'] = ''
    info['user_name'] = ''
    info['user_url'] = ''

    if "Artist" in item_data:
        soup = BeautifulSoup(item_data['Artist']['value'], 'html.parser')
        # Name of the first top-level node; None for plain text or when the value is empty
        # (an empty value previously raised IndexError on soup.contents[0]).
        first_node_name = getattr(soup.contents[0], 'name', None) if soup.contents else None
        if first_node_name == 'bdi':
            # bdi > a > span markup: the span text is the name, its title attribute the description
            try:
                info['artist_name'] = soup.bdi.a.span.string
            except navigation_errors:
                info['artist_name'] = str(soup)
            try:
                info['artist_description'] = soup.bdi.a.span['title']
            except navigation_errors:
                info['artist_description'] = ''
        elif first_node_name == 'a':
            # a bare link: the link text is recorded as the user name, its href as the user URL
            try:
                info['user_name'] = soup.a.string
            except navigation_errors:
                info['user_name'] = str(soup)
            try:
                info['user_url'] = soup.a['href']
            except navigation_errors:
                info['user_url'] = ''
        else:
            # anything else: keep the raw value as the artist name
            info['artist_name'] = str(soup)

    if 'LicenseShortName' in item_data:
        info['license_name'] = item_data['LicenseShortName']['value']
    else:
        info['license_name'] = ''
    if 'License' in item_data:
        info['license_code'] = item_data['License']['value']
    else:
        info['license_code'] = ''

    if 'DateTimeOriginal' in item_data:
        # keep only the text before the first '<', i.e. drop any trailing markup
        info['dateTime_original_string'] = item_data['DateTimeOriginal']['value'].split('<')[0].strip()
    else:
        info['dateTime_original_string'] = ''

    if 'ImageDescription' in item_data:
        soup = BeautifulSoup(item_data['ImageDescription']['value'], 'html.parser')
        try:
            info['main_subject_P921_label'] = soup.div.a.span.string
            info['main_subject_P921_full'] = soup.div.a.span['title']
            info['main_subject_P921_qid'] = extract_localname(soup.div.a['href'])
        except navigation_errors:
            # not the expected div > a > span markup: keep the raw description as the label
            info['main_subject_P921_label'] = str(soup)
            info['main_subject_P921_full'] = ''
            info['main_subject_P921_qid'] = ''
    else:
        info['main_subject_P921_label'] = ''
        info['main_subject_P921_full'] = ''
        info['main_subject_P921_qid'] = ''

    if 'ObjectName' in item_data:
        soup = BeautifulSoup(item_data['ObjectName']['value'], 'html.parser')
        # use regex to match any text; 'string' is the current name of the deprecated 'text' argument
        my_regex = re.compile(".*")
        title_soup = soup.find_all(string=my_regex)
        title_list = list(title_soup)
        clean_title_list = [x.strip() for x in title_list if x != '\n'] # remove newline items from list of title strings
    else:
        clean_title_list = []
    if len(clean_title_list) > 0:
        info['title'] = clean_title_list[0]
    else:
        info['title'] = ''
    info['titles_list'] = json.dumps(clean_title_list)
    return info

    # print(json.dumps(info, indent = 2))


In [17]:
# ---------------------------
# Scrape data from table on image page
# ---------------------------

# Guidelines for providing information using the Information template: https://commons.wikimedia.org/wiki/Template:Information
# Note special template for artwork having more extensive metadata: https://commons.wikimedia.org/wiki/Template:Artwork
# Historical photographs (e.g. museums) https://commons.wikimedia.org/wiki/Template:Photograph
# Art photo template adds to artwork template https://commons.wikimedia.org/wiki/Template:Art_Photo
# Credit line template provides attribution text required for CC BY licenses https://commons.wikimedia.org/wiki/Template:Credit_line

# Demonstration: fetch one file page and print its template type and the "Date" row.

# Retrieve the page HTML
image_filename = 'Allegory_of_Wisdom_MET_DP145142.jpg'
#image_filename = "Christ's_temptation_(Monreale).jpg"
page_url = 'https://commons.wikimedia.org/wiki/File:' + image_filename
response = requests.get(page_url)

# Parse the page and locate the file info table(s) by their CSS class
soup = BeautifulSoup(response.text, 'html.parser')
tables = soup.find_all('table', class_=re.compile('fileinfotpl'))

# The template type is the last hyphen-separated piece of the table's first class name
template_type = str(tables[0]['class'][0]).split('-')[-1]
print(template_type) # useful types are "information" and "artwork"

# Scan every row of the first table for the one whose first cell is labeled "Date"
rows = tables[0].find_all('tr')
for row in rows:
    columns = row.find_all('td')
    if columns and columns[0].string == 'Date':
        print(columns[1].string)
    

artwork

late 16th–early 17th century


In [61]:
# ---------------------------
# Look for a Wikidata link on the image page
# ---------------------------

# For each listed file, fetch its Commons page and, if the file info table links to a
# Wikidata item, record the item's Q ID. Results are written to wikidata_found.csv.

# read_api_sleep is normally set in the configuration cell further down. Fall back to
# the same default here so that this cell also runs on a fresh kernel (Restart & Run All)
# without overriding a value that has already been configured.
try:
    read_api_sleep
except NameError:
    read_api_sleep = 0.1

# Index of the first record to process. Set to 0 to process the whole file;
# a larger value resumes a run that was interrupted part-way through the list.
start_index = 860

file_path = '../../vandycite/act/processed_lists/add_to_wikidata.csv'
file_data = read_dict(file_path)

output_list = []
#if True:
for record in file_data[start_index:]:
    print(record['filename'])

    # Retrieve the page HTML
    image_filename = record['filename']
    # image_filename = 'Christ sur la mer de Galilée (Delacroix) Walters Art Museum 37.186.jpg'
    # image_filename = 'Christ_and_SocratesSAAM_1974.28.341A_B_1.jpg'
    page_url = 'https://commons.wikimedia.org/wiki/File:' + image_filename
    response = requests.get(page_url)

    # Create a soup object and find the file info table
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table', class_= re.compile('fileinfotpl'))
    if len(tables) > 0:

        # Have to check for this span because there are other subtables with a tags besides the one at the top
        span = tables[0].find_all('span', id = 'artwork')
        if len(span) > 0:
            # The link to the Wikidata item will be in an href
            anchors = span[0].find_all('a', href = re.compile('https://www.wikidata.org/wiki/'))
            if len(anchors) > 0:
                link = anchors[0]['href']
                qid = extract_localname(link)

                retrieved_data = {'act_id': record['RecordNumber'], 'qid': qid, 'filename': record['filename']}
                # print(retrieved_data)
                output_list.append(retrieved_data)
    sleep(read_api_sleep) # Don't hit the API too fast

fieldnames = ['act_id', 'qid', 'filename']
write_dicts_to_csv(output_list, 'wikidata_found.csv', fieldnames)

print('done')


Filippino Lippi 008.jpg
Filippino Lippi 011.jpg
Filippino Lippi 015.jpg
Fine Art, War and peace (Honorable Mention) 141202-F-PO994-001.jpg
Firenze.Baptistry.door01.JPG
First Four Articles of the Creed tapestry.jpg
First Lady Betty Ford’s “Bloomer Flag”.jpg
Fish Symbol on Shrine Wall - Abomey - Benin.jpg
Flämischer Meister des 13. Jahrhunderts 001.jpg
Flemish - Sphinx - Walters 54233 - Profile.jpg
Flemish 17th century Moses and the Brass Serpent.jpg
Flemish Denial of Saint Peter.jpg
Flickr - archer10 (Dennis) - Guatemala-1682.jpg
Flickr - Israel Defense Forces - Christian Pilgrims Celebrate the Epiphany in the Jordan Valley, Jan 2011 (1).jpg
Flickr - Official U.S. Navy Imagery - The Master of ceremonies start the seventh annual Walk Out of Darkness. (1).jpg
Flickr - USCapitol - William Brewster.jpg
Florence Nightingale as the lady with the lamp. Oil painting Wellcome V0017994.jpg
Florentinischer Meister des 14. Jahrhunderts 001.jpg
Fol. 214v-215r Egmond Gospels.jpg
Foligno107.jpg
Folio 

Giovanni di Paolo 003.jpg
Giovanni di paolo, Creation and the Expulsion from the Paradise.jpg
Giovanni Maria Morandi - Religieux.jpg
Giovanni Paolo Pannini - Apostle Paul Preaching on the Ruins - WGA16977.jpg
Girl disguised Nazarene 2.jpg
GISWatch 2011 PDF.pdf
Giunta Pisano 001.jpg
Giuseppe Arcimboldo - Rudolf II of Habsburg as Vertumnus - Google Art Project.jpg
Give-us-this-day-our-daily-bread-reickemeyer.jpg
Glasfenster Fußwaschung Korntal Christuskirche.jpg
God is Love.JPG
Going to Church, by William H. Johnson.jpg
Gold cross with a small bust of Jesus Christ, from the Transfiguration of Christ mosaic apse, 6th century AD, Basilica Sant'Apollinare in Classe, Ravenna - 49760117772.jpg
Good Friday church service, Market Place, Wetherby (19th April 2019).jpg
Good Samaritan (Watts).jpg
Good shepherd 01.jpg
Good shepherd 02.jpg
Good Shepherd 04.jpg
Good shepherd m2.jpg
Goodwill clothing drop-off bin.jpg
GoremeFresco2.jpg
Gospel Book, Publican and the Pharisee, Walters Manuscript W.540, f

Italo-Byzantinischer Meister 001.jpg
Italo-Byzantinischer Meister 002.jpg
Ivan Yermenyov. Singing beggars.jpg
Iványi Doubting Saint Thomas.jpg
Iwan Iwanowitsch Schischkin 002.jpg
Jaali or marble lattice screen showing a mihrab, from inside Humayun's tomb, Delhi.jpg
Jacob Jordaens - Abraham Grapheus as Job.JPG
Jacob Jordaens - Self-Portrait among Parents, Brothers and Sisters - WGA12024.jpg
Jacob Symonsz Pynas Ruth beschliesst mit Naomi nach Bethlehem zu ziehen.jpg
Jacob Wrestling with the Angel by Maurice Denis.jpg
Jacob y el rebaño de Labán, de José de Ribera (Monasterio de El Escorial).jpg
Jacopo da Ponte 002.jpg
Jacopo da Ponte 003.jpg
Jacopo da Ponte 007.jpg
Jacopo Pontormo 001.jpg
Jacopo Pontormo 002.jpg
Jacopo Pontormo 004.jpg
Jacopo Pontormo 005.jpg
Jacopo Pontormo 006.jpg
Jacopo Pontormo 007.jpg
Jacopo Pontormo 008.jpg
Jacopo Pontormo 009.jpg
Jacopo Pontormo 010.jpg
Jacopo Pontormo 011.jpg
Jacopo Pontormo 017.jpg
Jacopo Pontormo 020.jpg
Jacopo Pontormo 022.jpg
Jacopo Pontormo 0

Lorenzo Lippi 004.jpg
Lorenzo Lotto - Christ Carrying the Cross (detail) - WGA13694.jpg
Lorenzo Monaco - St Jerome in the Wilderness - WGA13606.jpg
Lorenzo Monaco - The Flight into Egypt - WGA13585.jpg
Lotharingia (metz), placca d'avorio con scene a emmaus, 850-900 circa (carolingio).JPG
Louis9+Poor.jpg
Love and happiness.jpg
Lucas Cranach d.Ä. - Gastmahl des Herodes (Städelsches Kunstinstitut).jpg
Lucas Cranach d.J. - Christus als guter Hirte (Angermuseum).jpg
Lucas Cranach d.J. - Christus segnet die Kinder (ca.1540).jpg
Lucas Cranach the Elder Christ blessing the Children, Frankfurt am Main, Städel Museum.jpg
Lucas Cranach, Heart shaped altar, Nuremberg.jpg
Lucas Moser 001.jpg
Lucy Higgs Union Civil War Nurse.JPG
Ludovico Mazzolino - Mozes en de tafelen der Wet.jpg
Ludovico mazzolino, cristo tra i dottori del tempio, 1524, 15.JPG
Ludwig Knaus - Salomonische Weisheit (1878).jpg
Luigi Nono Studie zu dem Gemälde Abbandonati 1875.jpg
Luther at Erfurt - Justification by Faith.jpg
Lutherbi

Nicolas Poussin 043.jpg
Nicolas Poussin 073.jpg
Niels Larsen Stevns- Salvning.jpg
Niels Larsen Stevns- Spedalske.jpg
Nikodim and Jesus.jpeg
Nikolay Ge 015.jpeg
Nikolay Ge 020.jpeg
NMAAHC (33628525655).jpg
Noah's Ark - Google Art Project.jpg
Noah's Ark by T.Poulakis after engraving of J.Sadeler (17th c.).jpg
Noah's Ark on Mount Ararat by Simon de Myle.jpg
Nonel La Viuda.jpg
Noordwijk Sint-Jeroenskerk beeld.jpg
Noravank (35283103663).jpg
Nordenskirker Slaghellig(25).jpg
Nordside(16)Nordre sideskib nordvæg, Abraham og Isak vender hjem til Sarah.JPG
Norman Rockwell - Boy with Baby Carriage - Google Art Project.jpg
Norman Rockwell Mural (Marion County, Oregon scenic images) (marDA0168).jpg
Nørre.Helev.jpg
North door of iconostasis v.3.jpg
Nuremberg chronicles - Golden calf.png
Nuremberg chronicles f 30v 2.png
Octave TASSAERT The Waif -L'abandonnée.jpg
Oil painting "Greed" located in fifth floor, main library, Department of Justice, Washington, D.C LCCN2010720176.tif
Oltár Levoča 03.jpg
Ołta

Rembrandt - Der reiche Narr.jpg
Rembrandt - Judas repentent.jpg
Rembrandt - Landscape with the Good Samaritan - WGA19228.jpg
Rembrandt - The Return of the Prodigal Son - WGA19074.jpg
Rembrandt 221.jpg
Rembrandt 227.jpg
Rembrandt Harmensz. van Rijn 024.jpg
Rembrandt Harmensz. van Rijn 031.jpg
Rembrandt Harmensz. van Rijn 085b.jpg
Rembrandt Harmensz. van Rijn 139.jpg
Rembrandt prophet hannah.jpg
Rembrandt van Rijn - Christ Preaching (The Hundred Guilder Print) - Google Art Project.jpg
Rembrandt-Elison.png
Rembrandt-The return of the prodigal son.jpg
Rembrandt, "St. Paul in Prison".jpg
Renier de Huy JPG04.jpg
Rensig, Everhard - Esau Gives up his Birthright; Jacob and Esau with the Bowl of Pottage - Google Art Project.jpg
Restoration of T'ang dynasty Nestorian image of Jesus Christ.jpg
Resurrection Chapel of Washington National Cathedral.jpg
Resurrezioneditabita.jpg
ReuternAbraham.jpg
Riemenschneider (Werkstatt) Gottvater mit Christus.jpg
Rilakloster Wandgemälde b 20090407 018.JPG
RillaMon

Singing Windows stained glass, designed by J&R Lamb, located in the University chapel at Tuskegee University, Tuskegee, Alabama LCCN2010637794.tif
Sisters of Mercy.JPG
SkopjeMT.jpg
Skulyabin Alms House Vologda Roof.jpg
Slave in chains RMG E9148.tif
Slave pen, Alexandria, Va LCCN2013651888.jpg
Slavery memorial - Stone Town-2.jpg
Smart Hymn21 Generosity.jpg
Smithsonian-Saint-Gaudens-Adams Memorial-2264.jpg
Söderåkra kyrka04.JPG
Sodoma - St Sebastian - WGA21550.jpg
Sojourner truth c1870.jpg
Sonnenaufgang, Margret Hofheinz-Döring, Öl, 1991 (WV-Nr.8463).jpg
Söraby kyrka Tavla 021.JPG
Soup Kitchen for the Jewish Poor, Brune Street, London - geograph.org.uk - 984504.jpg
South Doors of the Florence Baptistry - Detail 3.JPG
South Tower Fountain of "Reflecting Absence at the National September 11 Memorial.jpg
Southwark Cathedral stained glass windows 01082013 54.jpg
Sowing, by William H. Johnson.jpg
Spanzotti SanDomenico TO.jpg
Spatial Concept 'Waiting', cut canvas by Lucio Fontana, Tate Modern 

The Vision of Christ Butts set.jpg
The Vision of Eliphaz Butts set.jpg
The War Refugees' Camp, Earl's Court 1918 Art.IWMART2449.jpg
The Wrath of Elihu Butts set.jpg
The-Maesta-Altarpiece-The-Incredulity-of-Saint-Thomas-1461 Duccio.jpg
TheKingSaul.jpg
Theodor Schnell dJ Schmalegg Hochaltar Gnadenstuhl.jpg
Théodule-Augustin Ribot - The Good Samaritan - WGA19393.jpg
There Will Be No MIracles Here, Edinburgh.jpg
Thomas Cole - A View of the Mountain Pass Called the Notch of the White Mountains.jpg
Thomas Cole The Garden of Eden Amon Carter Museum.jpg
Thomas the Doubter by Eduard von Gebhardt.jpg
Thomas Webster - A Village Choir.jpg
Thorma Alms.jpg
Three Friends, by William H. Johnson.jpg
Tiffany Education (center).JPG
Tihanyi Gipsy Woman with Child.jpg
Tijuana-san diego border deaths.jpg
Tintoretto - The Visit of the Queen of Sheba to Solomon.gif
Tissot Deborah Beneath the Palm Tree.jpg
Tissot God's Promises to Abram.jpg
Tissot Hagar and the Angel in the Desert.jpg
Tissot The Pharisee and t

Wiggins-Hunger.jpg
Wilderness of Engedi, with the monastery of St. Saba, lookin Wellcome V0049464.jpg
Wilhelm Morgner 001.jpg
Wilhelm Wachtel - Hannah at prayer.jpg
Willem de Poorter - A Woman Praying - WGA18150.jpg
Willem de Poorter's The Parable of The Talents or Minas.png
Willem Vrelant (Flemish, died 1481, active 1454 - 1481) - A Man Praying to the Holy Spirit - Google Art Project.jpg
Willem Vrelant (Flemish, died 1481, active 1454 - 1481) - Pentecost - Google Art Project.jpg
Willem Vrelant (Flemish, died 1481, active 1454 - 1481) - Solomon Praying to the Holy Spirit - Google Art Project.jpg
Willem Vrelant (Flemish, died 1481, active 1454 - 1481) - The Savior of the World - Google Art Project.jpg
William Aiken Walker - Sharecroppers Gathered in the Yard.jpg
William Blake - Adam and Eve Sleeping - WGA02224.jpg
William Blake - Jerusalem, Plate 91, "It is easier to forgive an enemy...." - Google Art Project.jpg
William Blake - Job's Sons and Daughters Overwhelmed by Satan.jpg
William 

In [60]:
# Save the rows currently held in output_list to a second CSV file.
fieldnames = ['act_id', 'qid', 'filename']
write_dicts_to_csv(table=output_list, filename='wikidata_found2.csv', fieldnames=fieldnames)

print('done')

done


In [46]:
# ---------------------------
# Body of main script
# ---------------------------

# This section contains configuration information and performs the necessary login.
# It needs to be run once before the rest of the code.
# No writing is done, so it's "safe" to run any time.

# These are recommended delay times (in seconds) to avoid hitting the APIs too frequently and getting blocked
sparql_sleep = 0.25 # delay time between calls to Wikidata SPARQL endpoint, probably could be lower (like 0.1)
commons_sleep = 5 # non-critical edits to commons no faster than this. https://commons.wikimedia.org/wiki/Commons:Bots#Bot_speed
read_api_sleep = 0.1 # delay time between read-only API calls

# Set the value of the maxlag parameter to back off when the server is lagged
# see https://www.mediawiki.org/wiki/Manual:Maxlag_parameter
# The recommended value is 5 seconds.
# To not use maxlag, set the value to 0
# To test the maxlag handler code, set maxlag to a very low number like .1

# NOTE: as of 2020-04-27, the function that needs maxlag isn't being used for anything, so this value doesn't matter
maxlag = 5

# Set image_filename to the name to be used for the image once it has been uploaded.
# This will also be used to generate the image page title, which will be the same, but with
# spaces replacing the underscores.
# For file naming conventions, see: https://commons.wikimedia.org/wiki/Commons:File_naming

image_filename = 'Madrassa_Ben_Youssef_alcove_ceiling.jpg'

# The caption has to be set in a separate operation from the file upload,
# but it's set here so that its text can be used for the description.
# Captions must be less than 255 characters.
# There can be multiple captions in different languages, but only one per language.
caption = 'Ceiling detail of alcove in Madrassa Ben Youssef, Marrakech, Morocco.'
caption_language = 'en'

# The description doesn't have to be the same as the caption.
# It can be much longer and contain Wiki formatting, such as links.
# As this code stands, however, it will be the same as the caption.
description = caption

# This section needs to be run prior to running any code that interacts with the Commons API.
# It generates the CSRF token required to post to the API on behalf of the user whose username and pwd are being used.

# This is the format of the credentials file.
# Username and password are for a bot that you've created.
# Set your own User-Agent header. Do not use the one listed here.
# See https://meta.wikimedia.org/wiki/User-Agent_policy
# (The triple-quoted block below is a bare string literal used only as documentation; it has no effect at runtime.)
'''
endpointUrl=https://test.wikidata.org
username=User@bot
password=465jli90dslhgoiuhsaoi9s0sj5ki3lo
userAgentHeader=YourBot/0.1 (someuser@university.edu)
'''

# ---------------------------
# Commons API Post Authentication (create session and generate CSRF token)
# ---------------------------

# If credentials file location is relative to current working directory, use subfolders through file name with no leading slash
# Example: myproj/credentials/commons_credentials.txt
# If credentials file is in current working directory, only filename is necessary
# TODO: give an example of an absolute path on Windows (use Unix-style forward slashes?)
path = 'commons_credentials.txt'
path_is_relative_to_home_directory = True # set to True if relative to home directory, False if absolute path or relative to working directory
# login() is defined elsewhere in this notebook; its result is indexed below by 'session' and 'csrftoken'
result = login(path, path_is_relative_to_home_directory)
# print(result)
commons_session = result['session']
commons_csrf_token = result['csrftoken']
# Commons API endpoint URL is in result['endpoint'], but it is going to be hard coded anyway, so ignore

# Signal that the cell ran to completion
print('done')

done


In [251]:
# Load the source table; read_dict returns one dictionary per CSV row
file_path = '../../vandycite/act/processed_lists/add_to_wikidata.csv'
file_data = read_dict(file_path)

# Process one batch of rows (indices 500 through 599), collecting whatever
# retrieve_commons_api_info returns for each one
output_list = []
for record in file_data[500:600]:
    # Echo the file name so progress is visible while the cell runs
    print(record['filename'])
    retrieved_data = retrieve_commons_api_info(
        record['RecordNumber'],
        record['filename'],
    )
    output_list.append(retrieved_data)
    # Pause between requests so the API isn't called too rapidly
    sleep(read_api_sleep)

# Field names passed to write_dicts_to_csv for the output file
fieldnames = [
    'act_id',
    'filename',
    'title',
    'wikibase_id',
    'artist_name',
    'artist_description',
    'user_name',
    'user_url',
    'dateTime_original_string',
    'license_name',
    'license_code',
    'main_subject_P921_label',
    'main_subject_P921_full',
    'main_subject_P921_qid',
    'categories_list',
    'titles_list',
]
write_dicts_to_csv(output_list, 'test_output.csv', fieldnames)

# Signal that the cell ran to completion
print('done')

Chora Church Constantinople (6).JPG
Chorus from Legenda Aurea.jpg
Christ and a Monk Holding the Ends of a Staff and Two Shepherds with their Flock - Google Art Project.jpg
Christ and SocratesSAAM 1974.28.341A B 1.jpg
Christ and the pauper.jpg
Christ and the Pharisees, from Das Plenarium MET DP849932.jpg
Christ and the Pharisees; verso; Christ and a Pharisee MET DP802093.jpg
Christ and the Pharisees; verso; Christ and a Pharisee MET DP802094.jpg
Christ and the Samaritan Woman among Ruins.jpg
Christ and the sinner.jpg
Christ anв Samaritan woman (Monreale).jpg
Christ as Gardener Appearing to Saint Mary Magdalen LACMA M.84.133.jpg
Christ at Rest, by Hans Holbein the Younger.jpg
Christ Carrying the Cross MET DT714.jpg
Christ Crucified between the Two Thieves (The Three Crosses).jpg
Christ cures the paralytic at the therapeutic pool at Bethes Wellcome V0049237.jpg
Christ et Pilate 9098.JPG
Christ in Glory - Vaul mosaic - San Vitale - Ravenna 2016.jpg
Christ in Majesty with Symbols of the Fou