2021-08-17 I used this notebook to work out how to convert between the raw Commons filenames required by the API and the URL-encoded URLs returned by the Query Service. The two functions several cells below were the result and were used in the VanderBot 1.8 update.

In [14]:
from pathlib import Path
import requests
from time import sleep
import json
import csv
import os
import sys # Read CLI arguments
import urllib.parse

# ----------------
# Configuration settings
# ----------------

# Use the path given on the command line when exactly one argument was passed
# (sys.argv[0] is the script name); otherwise fall back to the default CSV file.
file_path = sys.argv[1] if len(sys.argv) == 2 else 'act.csv'

# SPARQL endpoint settings
endpoint = 'https://query.wikidata.org/sparql'
accept_media_type = 'application/json'
sparql_sleep = 0.1 # number of seconds to wait between queries to SPARQL endpoint

home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac

# ----------------
# Utility functions
# ----------------

# Best to send a user-agent header because some Wikimedia servers don't like unidentified clients
def generate_header_dictionary(accept_media_type):
    """Build the HTTP request headers sent with a SPARQL query.

    accept_media_type: media type placed in the Accept header.
    Returns a dictionary holding the Accept, Content-Type, and User-Agent headers.
    """
    return {
        'Accept': accept_media_type,
        'Content-Type': 'application/sparql-query',
        'User-Agent': 'VanderBot/1.7.1 (https://github.com/HeardLibrary/linked-data/tree/master/vanderbot; mailto:steve.baskauf@vanderbilt.edu)',
    }

# Build the request headers once at module level; the query cell passes them to requests.post
requestheader = generate_header_dictionary(accept_media_type)

# read from a CSV file into a list of dictionaries
def read_dict(filename):
    """Read a UTF-8 CSV file and return its rows as a list of dictionaries.

    filename: path of the CSV file to read.
    Returns one dictionary per data row, as produced by csv.DictReader.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))

# write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write a list of dictionaries to a UTF-8 CSV file with a header row.

    table: iterable of dictionaries, one per output row.
    filename: path of the CSV file to create (overwritten if it exists).
    fieldnames: column names, in the order they appear in the file.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        csv_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(table)

# extracts the qNumber from a Wikidata IRI
def extract_qnumber(iri):
    """Return the Q ID from a Wikidata entity IRI.

    The IRI pattern is http://www.wikidata.org/entity/Q6386232, so the Q ID is
    the fifth slash-delimited piece (index 4).
    """
    return iri.split('/')[4]

# extracts the UUID and qId from a statement IRI
def extract_statement_uuid(iri):
    """Split a statement IRI into its UUID and Q ID.

    The IRI pattern is
    http://www.wikidata.org/entity/statement/Q7552806-8B88E0CA-BCC8-49D5-9AC2-F1755464F1A2

    Returns a (uuid, qid) tuple: the five hyphen-delimited pieces after the
    Q ID rejoined with hyphens, followed by the Q ID itself.
    """
    statement_id = iri.split('/')[5]
    segments = statement_id.split('-')
    # index each piece explicitly so a too-short identifier still raises IndexError
    uuid = '-'.join(segments[position] for position in range(1, 6))
    return uuid, segments[0]

# function to use in sort
def sort_funct(row):
    """Sort key: return the value stored under the row's 'filename' key."""
    filename_value = row['filename']
    return filename_value



In [6]:
# Test query: fetch the wdt:P18 value(s) of one hard-coded item (wd:Q13406268).
# NOTE: ?qid is selected but never bound, because the two patterns that would
# bind it are commented out inside the query text (# starts a SPARQL comment),
# so the bindings that come back contain only ?iri.
query = '''
select distinct ?qid ?iri
where {
#?qid wdt:P9092 ?id.
#?qid wdt:P18 ?iri.
wd:Q13406268 wdt:P18 ?iri.
}
limit 10'''
print(query)


select distinct ?qid ?iri
where {
#?qid wdt:P9092 ?id.
#?qid wdt:P18 ?iri.
wd:Q13406268 wdt:P18 ?iri.
}
limit 10


Tested uploading a file with spaces and it worked. The value returned from SPARQL had escaped spaces.

Tested uploading a file with underscores replacing the spaces (as in the displayed Commons URL) and it worked. The value returned from SPARQL had underscores. In the web interface there was a warning saying that the Commons link should be well-formed. When clicked on, the text had underscores.


In [10]:
# ----------------
# send request to Wikidata Query Service
# ----------------

print('querying SPARQL endpoint to acquire item metadata')
# POST the raw query text; the Content-Type header in requestheader marks the body as a SPARQL query
response = requests.post(endpoint, data=query.encode('utf-8'), headers=requestheader)
# Stop with a clear HTTP error (e.g. throttling or a malformed query) instead of
# failing below with an opaque JSON decoding error on a non-JSON error body.
response.raise_for_status()
#print(response.text)
data = response.json()

# extract the values from the response JSON
results = data['results']['bindings']

print('done retrieving data')
print(json.dumps(results, indent=2))


querying SPARQL endpoint to acquire item metadata
done retrieving data
[
  {
    "iri": {
      "type": "uri",
      "value": "http://commons.wikimedia.org/wiki/Special:FilePath/Castle%20De%20Haar%20%281892-1913%29%20-%20360%C2%B0%20Panorama%20of%20Castle%20%26%20Castle%20Grounds.jpg"
    }
  }
]


In [16]:
# Pull the IRI out of the first binding in the query results
value = results[0]['iri']['value']
# keep only the part after 'FilePath/' (the URL-encoded file name)
string = value.split('FilePath/')[1]
# decode the percent-escapes; as the last expression in the cell, the result is displayed
urllib.parse.unquote(string)
#urllib.parse.quote(string) # URL encode

'Castle De Haar (1892-1913) - 360° Panorama of Castle & Castle Grounds.jpg'

In [22]:
commons_prefix = 'http://commons.wikimedia.org/wiki/Special:FilePath/'
def commons_url_to_filename(url):
    """Convert a Commons Special:FilePath URL to the raw (unencoded) file name.

    url: URL of the form
        http://commons.wikimedia.org/wiki/Special:FilePath/Castle%20De%20Haar%20%281892-1913%29%20-%20360%C2%B0%20Panorama%20of%20Castle%20%26%20Castle%20Grounds.jpg
    Returns the part after commons_prefix with its percent-escapes decoded.
    Raises IndexError if the URL does not contain commons_prefix.
    """
    encoded_local_name = url.split(commons_prefix)[1]  # local name (file part) of the URL
    return urllib.parse.unquote(encoded_local_name)

def filename_to_commons_url(filename):
    """Convert a raw Commons file name to its Special:FilePath URL.

    filename: unencoded file name.
    Returns commons_prefix followed by the file name percent-encoded with
    urllib.parse.quote.
    """
    return commons_prefix + urllib.parse.quote(filename)


In [23]:
# Round-trip check of the two conversion functions, starting from a percent-encoded Commons URL
url = "http://commons.wikimedia.org/wiki/Special:FilePath/Castle%20De%20Haar%20%281892-1913%29%20-%20360%C2%B0%20Panorama%20of%20Castle%20%26%20Castle%20Grounds.jpg"
# URL -> raw file name
filename = commons_url_to_filename(url)
print(filename)

# raw file name -> URL; the printed result can be compared with the starting URL
new_url = filename_to_commons_url(filename)
print(new_url)

Castle De Haar (1892-1913) - 360° Panorama of Castle & Castle Grounds.jpg
http://commons.wikimedia.org/wiki/Special:FilePath/Castle%20De%20Haar%20%281892-1913%29%20-%20360%C2%B0%20Panorama%20of%20Castle%20%26%20Castle%20Grounds.jpg


In [None]:
data = read_dict(file_path)
input_list = []
iri_list = []  # one '<iri>' entry per record; joined into the VALUES list below
for record in data:
    # some records have spaces with other junk after them,
    # so use only the first space-delimited string (item 0)
    raw_filename = record['filename'].split(' ')[0]
    # The file names use underscores where the IRIs have escaped spaces, so turn
    # the underscores back into spaces and let filename_to_commons_url
    # percent-encode the whole name. Escaping only the spaces would leave
    # characters such as parentheses, '&' and non-ASCII letters unencoded, and
    # the IRI would then never equal the fully encoded form returned by the
    # Query Service.
    url = filename_to_commons_url(raw_filename.replace('_', ' '))
    input_list.append({
        'act_id': record['RecordNumber'],
        'filename': raw_filename,
        'url': url
    })
    iri_list.append('<' + url + '>')

# VALUES list for query: one IRI per line, with no trailing newline
iri_values = '\n'.join(iri_list)

In [None]:
# Inspect the records built from the CSV. input_list is printed here because
# output_list is not created until a later cell, so referencing it at this
# point raises a NameError when the notebook is run from top to bottom.
print(json.dumps(input_list, indent=2))

In [None]:
# Show the IRI list built for the query (one <IRI> per line)
print(iri_values)

In [None]:
# ----------------
# extract Q IDs from the results and match them with the ACT IDs
# ----------------

# Index the query results by IRI so each record is matched with one dictionary
# lookup instead of a scan of every result. Only the first result for an IRI is
# kept, which is the one a front-to-back scan of the results would match.
first_result_by_iri = {}
for result in results:
    first_result_by_iri.setdefault(result['iri']['value'], result)

output_list = []
for record in input_list:
    match = first_result_by_iri.get(record['url'])
    # records whose IRI was not returned by the query get an empty Q ID
    record['qid'] = '' if match is None else extract_qnumber(match['qid']['value'])
    # record is the same dictionary object held in input_list, so input_list gains the qid key too
    output_list.append(record)
print(json.dumps(output_list, indent = 2))

In [None]:
# Sort in place so the CSV rows are ordered by file name
output_list.sort(key = sort_funct) # sort by the filename field
# column order for the output file
fieldnames = ['act_id', 'qid', 'filename', 'url']
write_dicts_to_csv(output_list, 'output.csv', fieldnames)
print('done')