# Script to pull metadata from Wikidata

Throughout the script I refer to "Wikidata", but the script could be used with any Wikibase instance.

## Configuration section

Import modules, set values, and define functions

In [None]:
# acquire-wikidata-metadata.ipynb This is part of the VandyCite project https://www.wikidata.org/wiki/Wikidata:WikiProject_VandyCite
# (c) 2020 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf 2020-09-06

from pathlib import Path
import requests
from time import sleep
import json
import csv

# ----------------
# Configuration settings
# ----------------

# NOTE(review): sparql_sleep is not referenced by any of the visible cells -- confirm whether a sleep() between queries is still intended
sparql_sleep = 0.1 # number of seconds to wait between queries to SPARQL endpoint
home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
# directory under the home directory that holds the input and output CSV files;
# the trailing slash matters because file names are appended by plain string concatenation
data_path = home + '/divinity-law/'
item_source_csv = 'identified-journals.csv' # input CSV listing the items to look up
output_file_name = 'test-output.csv' # CSV file written by the export cells

endpoint = 'https://query.wikidata.org/sparql' # SPARQL endpoint that the queries are POSTed to
accept_media_type = 'application/json' # value sent in the Accept header of each query

# ----------------
# Utility functions
# ----------------

def generate_header_dictionary(accept_media_type):
    """Build the HTTP headers sent with a query POSTed to the SPARQL endpoint.

    accept_media_type: media type to put in the Accept header.
    Returns a dictionary holding the Accept, Content-Type, and User-Agent headers.
    """
    # The User-Agent string names this tool and gives a project URL and contact address.
    agent = 'VanderDiv/0.1 (https://github.com/HeardLibrary/linked-data/tree/master/publications/divinity-law; mailto:steve.baskauf@vanderbilt.edu)'
    return {
        'Accept': accept_media_type,
        'Content-Type': 'application/sparql-query',
        'User-Agent': agent,
    }

# header dictionary reused for every request sent to the endpoint in the cells below
requestheader = generate_header_dictionary(accept_media_type)

# read from a CSV file into a list of dictionaries
def read_dict(filename):
    """Load a UTF-8 CSV file and return its rows as a list of dictionaries.

    The first row of the file supplies the dictionary keys.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))

# write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write a list of dictionaries to a UTF-8 CSV file.

    table: iterable of row dictionaries.
    filename: path of the file to create or overwrite.
    fieldnames: column names, in output order; written first as the header row.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        emit = csv_writer.writerow
        for record in table:
            emit(record)

# extracts the qNumber from a Wikidata IRI
def extract_qnumber(iri):
    """Return the piece at index 4 of an IRI split on '/'.

    For an entity IRI such as http://www.wikidata.org/entity/Q6386232 that piece
    is the Q identifier (Q6386232). Raises IndexError if the IRI has fewer than
    five slash-delimited pieces.
    """
    return iri.split('/')[4]

# extracts the UUID and qId from a statement IRI
def extract_statement_uuid(iri):
    """Split a statement IRI into its UUID and the Q ID of the item it belongs to.

    The expected pattern is
    http://www.wikidata.org/entity/statement/Q7552806-8B88E0CA-BCC8-49D5-9AC2-F1755464F1A2

    Returns a (uuid, qid) tuple -- note the order. Raises IndexError if the IRI
    has too few '/' pieces or the statement ID has too few '-' pieces.
    """
    statement_id = iri.split('/')[5]
    segments = statement_id.split('-')
    # segment 0 is the Q ID; segments 1 through 5 are the five groups of the UUID
    uuid = '-'.join(segments[index] for index in range(1, 6))
    return uuid, segments[0]

# ----------------
# Specialty functions
# ----------------

# old version of function
def sparql_append_properties(variable_name, select_prefix, graph_pattern_prefix):
    """Append the reference-metadata terms for one property to a SPARQL query under construction.

    variable_name: base name of the SPARQL variables for the property (without the '?').
    select_prefix: SELECT variable list built so far; the eight variables for this
        property are appended, each followed by a space.
    graph_pattern_prefix: graph pattern built so far; three optional{} groups are
        appended (stated in P248, reference URL P854, retrieved P813 with its time
        value and precision), all hanging off ?<variable_name>_uuid.

    Returns the extended (select_prefix, graph_pattern_prefix) tuple.
    """
    suffixes = (
        '',
        '_uuid',
        '_ref1_hash',
        '_ref1_statedIn',
        '_ref1_referenceUrl',
        '_ref1_retrieved_nodeid',
        '_ref1_retrieved_val',
        '_ref1_retrieved_prec',
    )
    select_terms = ''.join('?' + variable_name + suffix + ' ' for suffix in suffixes)

    # {v} marks every place where the variable base name is substituted
    template = '''optional{
?{v}_uuid prov:wasDerivedFrom ?{v}_ref1_hash.
?{v}_ref1_hash pr:P248 ?{v}_ref1_statedIn.
}
optional{
?{v}_uuid prov:wasDerivedFrom ?{v}_ref1_hash.
?{v}_ref1_hash pr:P854 ?{v}_ref1_referenceUrl.
}
optional{
?{v}_uuid prov:wasDerivedFrom ?{v}_ref1_hash.
?{v}_ref1_hash prv:P813 ?{v}_ref1_retrieved_nodeid.
?{v}_ref1_retrieved_nodeid wikibase:timeValue ?{v}_ref1_retrieved_val.
?{v}_ref1_retrieved_nodeid wikibase:timePrecision ?{v}_ref1_retrieved_prec.
}
'''
    return select_prefix + select_terms, graph_pattern_prefix + template.replace('{v}', variable_name)

# new version of function
def sparql_append_property(prop, select_prefix, graph_pattern_prefix):
    """Append the value, statement, and reference terms for one property to a SPARQL query.

    prop: dictionary with at least the keys 'pid' (property ID, e.g. 'P236') and
        'variable' (base name of the SPARQL variables, without the '?').
    select_prefix: SELECT variable list built so far; the eight variables for this
        property are appended, each followed by a space.
    graph_pattern_prefix: graph pattern built so far; one outer optional{} group
        for the property is appended, so items lacking the property still match.

    Returns the extended (select_prefix, graph_pattern_prefix) tuple.
    Raises KeyError if prop lacks 'pid' or 'variable'.
    """
    prop_id = prop['pid']
    var = '?' + prop['variable']

    suffixes = (
        '',
        '_uuid',
        '_ref1_hash',
        '_ref1_statedIn',
        '_ref1_referenceUrl',
        '_ref1_retrieved_nodeid',
        '_ref1_retrieved_val',
        '_ref1_retrieved_prec',
    )
    select_prefix += ''.join(var + suffix + ' ' for suffix in suffixes)

    pattern_lines = [
        'optional{',
        '?qid p:' + prop_id + ' ' + var + '_uuid.',
        # The value is read from the statement node (ps:) rather than from the item
        # (wdt:). With wdt: the value was not tied to the statement, so an item with
        # several statements for the property produced every value/statement combination.
        var + '_uuid ps:' + prop_id + ' ' + var + '.',
        'optional{',
        var + '_uuid prov:wasDerivedFrom ' + var + '_ref1_hash.',
        var + '_ref1_hash pr:P248 ' + var + '_ref1_statedIn.',
        '}',
        'optional{',
        var + '_uuid prov:wasDerivedFrom ' + var + '_ref1_hash.',
        var + '_ref1_hash pr:P854 ' + var + '_ref1_referenceUrl.',
        '}',
        'optional{',
        var + '_uuid prov:wasDerivedFrom ' + var + '_ref1_hash.',
        var + '_ref1_hash prv:P813 ' + var + '_ref1_retrieved_nodeid.',
        var + '_ref1_retrieved_nodeid wikibase:timeValue ' + var + '_ref1_retrieved_val.',
        var + '_ref1_retrieved_nodeid wikibase:timePrecision ' + var + '_ref1_retrieved_prec.',
        '}}',
        '',  # makes the appended pattern end with a newline
    ]
    graph_pattern_prefix += '\n'.join(pattern_lines)
    return select_prefix, graph_pattern_prefix

# function to add columns to column list for CSV header
def csv_header_append(prop, header_list):
    """Add the eight CSV column names for one property to a list of column headers.

    prop: dictionary whose 'variable' key gives the base column name.
    header_list: list of column names; it is extended in place and also returned.
    """
    base = prop['variable']
    header_list.append(base)
    header_list.extend(
        base + suffix
        for suffix in (
            '_uuid',
            '_ref1_hash',
            '_ref1_statedIn',
            '_ref1_referenceUrl',
            '_ref1_retrieved_nodeid',
            '_ref1_retrieved_val',
            '_ref1_retrieved_prec',
        )
    )
    return header_list

## Load list of items from file

The CSV has a header row with column headers: `qid` and `label`. The `qid` column contains the Wikidata Q identifiers for each item. The `label` column contains the label, which isn't necessarily the label in Wikidata, but provides a way for humans to recognize the item.

In [None]:
# Load item data from csv
print('loading item data from file')
filename = data_path + item_source_csv
items = read_dict(filename)

# Create VALUES list for journals: one "wd:Qxxx" term per line, with no trailing newline
item_qids = '\n'.join('wd:' + item['qid'] for item in items)

# create properties dictionary
# Each entry gives the property ID, the base name used for SPARQL variables and CSV columns,
# and the kind of value; only the 'item' kind is treated specially when results are extracted
# (the entity IRI is reduced to its Q ID).
# NOTE: the query cell below selects a property by its position in this list, so keep the order.
prop_list = [
    {'pid': 'P31', 'variable': 'instance_of', 'value_type': 'item'},
    {'pid': 'P1476', 'variable': 'title', 'value_type': 'string'},
    {'pid': 'P407', 'variable': 'language_of_work', 'value_type': 'item'},
    {'pid': 'P495', 'variable': 'country_of_origin', 'value_type': 'item'},
    {'pid': 'P123', 'variable': 'publisher', 'value_type': 'item'},
    {'pid': 'P571', 'variable': 'inception', 'value_type': 'date'},
    # discontinued date holds a time value, not an item; marking it 'item' would send
    # a date string through extract_qnumber() when the results are extracted
    {'pid': 'P2669', 'variable': 'discontinued_date', 'value_type': 'date'},
    {'pid': 'P856', 'variable': 'official_website', 'value_type': 'uri'},
    {'pid': 'P155', 'variable': 'follows', 'value_type': 'item'},
    {'pid': 'P156', 'variable': 'followed_by', 'value_type': 'item'},
    {'pid': 'P921', 'variable': 'main_subject', 'value_type': 'item'},
    {'pid': 'P2896', 'variable': 'publication_interval', 'value_type': 'decimal'},
    {'pid': 'P236', 'variable': 'issn', 'value_type': 'string'}
]

#print(item_qids)

## Create query to get the statement UUIDs and values

SPARQL query to be sent to the Wikidata Query Service (WDQS)

In [None]:
# Property to retrieve, chosen by its position in prop_list (index 12 is the ISSN entry, P236).
# NOTE: the name `property` shadows the Python builtin, but later cells read it, so it is kept.
property = prop_list[12]

value_var = '?' + property['variable'] + '_value'
statement_var = '?' + property['variable'] + '_statement'

# create a string for the query
query = '''
select distinct ?qid '''

query += value_var + ' '
query += statement_var + ' '

query += 'where {'

query += '''
  VALUES ?qid
{
''' + item_qids + '''
}

'''

# Bind the statement node first, then read the value from that same node with ps:.
# Reading the value from the item with wdt: left value and statement unrelated, so an item
# with several statements for the property returned every value/statement combination.
query += '?qid p:' + property['pid'] + ' ' + statement_var + '.\n'
query += statement_var + ' ps:' + property['pid'] + ' ' + value_var + '.\n'
query += '}'

#print(query)

Send the query to the WDQS and extract the results from the returned JSON

In [None]:
# Send SPARQL query to the Wikidata Query Service
print('retrieving data from Wikidata')

# send request to Wikidata Query Service
response = requests.post(endpoint, data=query, headers=requestheader)
# Stop with the HTTP error itself (e.g. a rejected query or rate limiting) rather than
# with a confusing JSON decoding error on the next line.
response.raise_for_status()
data = response.json()

# extract the values from the response JSON
results = data['results']['bindings']
#print(json.dumps(results, indent = 2))

Extract the qids, values, and statement UUIDs from the response data and store in a list of dicts (`statements_list`). Note: the `extract_statement_uuid()` function returns `uuid`,`qid` extracted from the statement IRI, so they have to be re-assembled to make the statement identifier: *qid*-*UUID*.

In [None]:
statements_list = [] # one row (qid, value, statement UUID) per result; not really used for anything yet
statement_terms = [] # "wds:" terms that make up the VALUES list for statements
value_key = property['variable'] + '_value'
statement_key = property['variable'] + '_statement'
for result in results:
    qid = extract_qnumber(result['qid']['value'])

    raw_value = result[value_key]['value']
    # item values are entity IRIs ('http://www.wikidata.org/entity/...'); keep only the Q ID
    value = extract_qnumber(raw_value) if property['value_type'] == 'item' else raw_value

    # the function returns (uuid, qid); the two are re-joined as qid-uuid to form the statement identifier
    uuid, temp_qid = extract_statement_uuid(result[statement_key]['value'])

    statements_list.append({'qid': qid, value_key: value, statement_key: uuid})
    statement_terms.append('wds:' + temp_qid + '-' + uuid)

# one term per line, with no trailing newline
statement_uuids = '\n'.join(statement_terms)

#print(statement_uuids)
#print(statements_list)

Create query to retrieve the reference metadata

In [None]:
variable = property['variable']
pid = property['pid']

# SELECT terms and optional{} reference patterns for the property, starting from empty strings
select_terms, reference_patterns = sparql_append_properties(variable, '', '')

# create a string for the query
query = ''.join([
    'select distinct ?label ',
    select_terms,
    'where {\n',
    # restrict the query to the statements found by the previous query
    '  VALUES ?' + variable + '_uuid\n{\n',
    statement_uuids,
    '\n}\n\n',
    # value of each statement, the item it belongs to, and that item's English label
    '\n?' + variable + '_uuid ps:' + pid + ' ?' + variable + '.\n',
    '?qid p:' + pid + ' ?' + variable + '_uuid.\n',
    '?qid rdfs:label ?label.\n',
    "filter(lang(?label)='en')\n",
    reference_patterns,
    '}',
])

# print(query)

Retrieve the reference metadata from Wikidata

In [None]:
# send request to Wikidata Query Service
response = requests.post(endpoint, data=query, headers=requestheader)
# Stop with the HTTP error itself (e.g. a rejected query or rate limiting) rather than
# with a confusing JSON decoding error on the next line.
response.raise_for_status()
# debugging aid, disabled because it dumps the entire raw response
#print(response.text)
data = response.json()

# extract the values from the response JSON
results = data['results']['bindings']
print('done retrieving reference data')
# print(results)

Extract the reference metadata from the JSON returned by the endpoint into a flat list of dictionaries (one dictionary per result row)

In [None]:
# extract results
variable = property['variable']

# Optional reference columns as (column suffix, is_iri) pairs. Values flagged as IRIs are
# reduced to their last piece with extract_qnumber(); this also works for the retrieved node ID,
# whose form is http://www.wikidata.org/value/0a8f688406e3fc53d0119eafcd2c0396
reference_fields = [
    ('_ref1_hash', True),
    ('_ref1_statedIn', True),
    ('_ref1_referenceUrl', False),
    ('_ref1_retrieved_nodeid', True),
    ('_ref1_retrieved_val', False),
    ('_ref1_retrieved_prec', False),
]

references_list = []
for result in results:
    row_dict = {}
    # the statement IRI yields both the statement UUID and the Q ID of the item
    row_dict[variable + '_uuid'], row_dict['qid'] = extract_statement_uuid(result[variable + '_uuid']['value'])
    row_dict[variable] = result[variable]['value']
    row_dict['label'] = result['label']['value']
    for suffix, is_iri in reference_fields:
        column = variable + suffix
        try:
            raw_value = result[column]['value']
            row_dict[column] = extract_qnumber(raw_value) if is_iri else raw_value
        except (KeyError, IndexError):
            # the variable is unbound for this row (it sits in an optional{} group),
            # or the IRI does not have the expected form: leave the cell empty
            row_dict[column] = ''
    references_list.append(row_dict)

# print(json.dumps(references_list, indent=2))

Turn the flattened JSON into CSV and write to a file

In [None]:
# create the list of column headers: qid, statement UUID, value, the six reference columns, then the label
variable = property['variable']
reference_suffixes = [
    '_ref1_hash',
    '_ref1_statedIn',
    '_ref1_referenceUrl',
    '_ref1_retrieved_nodeid',
    '_ref1_retrieved_val',
    '_ref1_retrieved_prec',
]
fieldnames = ['qid', variable + '_uuid', variable]
fieldnames += [variable + suffix for suffix in reference_suffixes]
fieldnames += ['label']

# write the data to a CSV file
print('writing data to file')
output_path = data_path + output_file_name
write_dicts_to_csv(references_list, output_path, fieldnames)

print('done')

# Combined script to acquire metadata for multiple properties at once

This section duplicates many of the cells above, but builds a single query that covers every property in `prop_list`.

In [None]:
# Load item data from csv
print('loading item data from file')
filename = data_path + item_source_csv
items = read_dict(filename)

# Create VALUES list for journals: one "wd:Qxxx" term per line, with no trailing newline
item_qids = '\n'.join('wd:' + item['qid'] for item in items)

# create properties dictionary
# Each entry gives the property ID, the base name used for SPARQL variables and CSV columns,
# and the kind of value the property holds.
prop_list = [
    {'pid': 'P495', 'variable': 'country_of_origin', 'value_type': 'item'},
    {'pid': 'P571', 'variable': 'inception', 'value_type': 'date'},
    # discontinued date holds a time value, not an item
    {'pid': 'P2669', 'variable': 'discontinued_date', 'value_type': 'date'},
    {'pid': 'P856', 'variable': 'official_website', 'value_type': 'uri'},
    {'pid': 'P155', 'variable': 'follows', 'value_type': 'item'},
    {'pid': 'P156', 'variable': 'followed_by', 'value_type': 'item'},
    {'pid': 'P2896', 'variable': 'publication_interval', 'value_type': 'decimal'},
]

# alternative property list; swap it with the list above to retrieve these properties instead
#prop_list = [
#    {'pid': 'P123', 'variable': 'publisher', 'value_type': 'item'},
#    {'pid': 'P1476', 'variable': 'title', 'value_type': 'string'},
#    {'pid': 'P31', 'variable': 'instance_of', 'value_type': 'item'},
#    {'pid': 'P407', 'variable': 'language_of_work', 'value_type': 'item'},
#    {'pid': 'P236', 'variable': 'issn', 'value_type': 'string'},
#    {'pid': 'P921', 'variable': 'main_subject', 'value_type': 'item'},
#]

#print(item_qids)

In [None]:
# Create the SPARQL query to get the property statements for the items on the Q IDs list
# Accumulate the SELECT variables and the optional{} graph pattern of every property in turn
select_variable_list = ''
graph_pattern = ''
for prop in prop_list:
    select_variable_list, graph_pattern = sparql_append_property(prop, select_variable_list, graph_pattern)

query = (
    '\nselect distinct ?qid ?label ' + select_variable_list + 'where {'
    # limit the query to the items loaded from the CSV
    + '\n  VALUES ?qid\n{\n' + item_qids + '\n}\n'
    # English label of each item
    + "\n?qid rdfs:label ?label.\nfilter(lang(?label)='en')\n"
    + graph_pattern
    + '\n}'
)

print(query)

In [None]:
# send request to Wikidata Query Service
response = requests.post(endpoint, data=query, headers=requestheader)
# Stop with the HTTP error itself (e.g. a rejected query or rate limiting) rather than
# with a confusing JSON decoding error on the next line.
response.raise_for_status()
#print(response.text)
data = response.json()

# extract the values from the response JSON
results = data['results']['bindings']
print('done retrieving reference data')
print(json.dumps(results, indent=2))

In [None]:
# extract results
# Optional reference columns as (column suffix, is_iri) pairs. Values flagged as IRIs are
# reduced to their last piece with extract_qnumber(); this also works for the retrieved node ID,
# whose form is http://www.wikidata.org/value/0a8f688406e3fc53d0119eafcd2c0396
reference_fields = [
    ('_ref1_hash', True),
    ('_ref1_statedIn', True),
    ('_ref1_referenceUrl', False),
    ('_ref1_retrieved_nodeid', True),
    ('_ref1_retrieved_val', False),
    ('_ref1_retrieved_prec', False),
]

metadata_list = []
for result in results:
    row_dict = {}
    row_dict['qid'] = extract_qnumber(result['qid']['value'])
    row_dict['label_en'] = result['label']['value']
    # Every per-property variable sits in an optional{} group of the query, so any of them
    # may be missing from a result row; a missing (or malformed) value becomes an empty cell.
    for prop in prop_list:
        variable = prop['variable']

        # statement UUID; the Q ID that extract_statement_uuid() also returns is not needed here
        try:
            row_dict[variable + '_uuid'] = extract_statement_uuid(result[variable + '_uuid']['value'])[0]
        except (KeyError, IndexError):
            row_dict[variable + '_uuid'] = ''

        # value of the property
        try:
            row_dict[variable] = result[variable]['value']
        except KeyError:
            row_dict[variable] = ''

        # reference metadata
        for suffix, is_iri in reference_fields:
            column = variable + suffix
            try:
                raw_value = result[column]['value']
                row_dict[column] = extract_qnumber(raw_value) if is_iri else raw_value
            except (KeyError, IndexError):
                row_dict[column] = ''

    metadata_list.append(row_dict)

print(json.dumps(metadata_list, indent=2))

In [None]:
# create the list of column headers: the two item columns, then eight columns per property
fieldnames = ['qid', 'label_en']
for prop in prop_list:
    fieldnames = csv_header_append(prop, fieldnames)
# print(fieldnames)

# write the data to a CSV file
print('writing data to file')
output_path = data_path + output_file_name
write_dicts_to_csv(metadata_list, output_path, fieldnames)

print('done')