# Script to pull metadata from Wikidata

Throughout the script I refer to "Wikidata" but this could be used for any Wikibase instance.

## Configuration section

Import modules, set values, and define functions

In [None]:
# acquire-wikidata-metadata.ipynb This is part of the VandyCite project https://www.wikidata.org/wiki/Wikidata:WikiProject_VandyCite
# (c) 2020 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf 2020-09-08

from pathlib import Path
import requests
from time import sleep
import json
import csv

# ----------------
# Configuration settings
# ----------------

# Controls how label/description columns are handled when results are extracted and when the
# CSV header is built: descriptions are only extracted, and label/description columns are only
# placed directly after qid, when this is True.
manage_labels_descriptions = True
sparql_sleep = 0.1 # number of seconds to wait between queries to SPARQL endpoint
home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
#data_path = home + '/divinity-law/'
# data_path is concatenated directly with the file names below, so it must end with a path separator
data_path = '../../../vandycite/journals/'
item_source_csv = 'journal-div-qids.csv' # put an empty string here to get the QIDs from the query rather than a file

# insert any graph pattern that will screen for the Q IDs you are interested in. Must use "?qid" as the variable.
# This query is only sent when item_source_csv is an empty string.
item_query = '''select distinct ?qid where {
  ?qid wdt:P195 wd:Q18563658.
  }'''

# SPARQL endpoint that receives the POSTed queries, and the media type requested in the Accept header
endpoint = 'https://query.wikidata.org/sparql'
accept_media_type = 'application/json'

# NOTE: the following qualifier value types are currently not supported: globecoordinate, quantity, monolingualtext
# supported types are: item, date, and string-like types (uri, string)

# settings for multiprop
# label_description_language_list: language codes for which label and description variables are added to the query
# output_file_name: name of the CSV written into data_path
# prop_list: one dictionary per property to retrieve, with these keys:
#   'pid'        - property ID used in the graph pattern (e.g. 'P856')
#   'variable'   - base name for the SPARQL variables and CSV columns generated for the property
#   'value_type' - 'item', 'date', 'quantity', 'globecoordinate', 'monolingualtext', or a string-like type
#   'language'   - language tag used to filter values when value_type is 'monolingualtext'
#   'qual'       - list of qualifier dictionaries, each with its own 'pid', 'variable' and 'value_type'
label_description_language_list = []
output_file_name = 'journals-multiprop-test.csv'
prop_list = [
    #{'pid': 'P108', 'variable': 'official_website', 'value_type': 'uri','qual': [{'pid': 'P580', 'variable': 'work_language', 'value_type': 'date'}]}
    {'pid': 'P856', 'variable': 'official_website', 'value_type': 'uri','qual': [{'pid': 'P407', 'variable': 'work_language', 'value_type': 'item'}]}
]

# Alternative configuration (disabled). The triple-quoted block below is a bare string literal,
# so Python evaluates it and discards it; none of the assignments inside it take effect.
# To use these settings, remove the surrounding triple quotes and disable the active settings above.
'''
# settings for labels/title metadata
label_description_language_list = ['en', 'zh', 'zh-hans', 'zh-hant', 'de', 'fr']
output_file_name = 'journals-title.csv'
prop_list = [
    {'pid': 'P1476', 'variable': 'title_en', 'value_type': 'monolingualtext', 'language': 'en'},
    {'pid': 'P1476', 'variable': 'title_de', 'value_type': 'monolingualtext', 'language': 'de'},
    {'pid': 'P1476', 'variable': 'title_la', 'value_type': 'monolingualtext', 'language': 'la'},
    {'pid': 'P1476', 'variable': 'title_fr', 'value_type': 'monolingualtext', 'language': 'fr'},
    {'pid': 'P1476', 'variable': 'title_it', 'value_type': 'monolingualtext', 'language': 'it'},
    {'pid': 'P1476', 'variable': 'title_pt', 'value_type': 'monolingualtext', 'language': 'pt'},
    {'pid': 'P1476', 'variable': 'title_es', 'value_type': 'monolingualtext', 'language': 'es'},
    {'pid': 'P1476', 'variable': 'title_sk', 'value_type': 'monolingualtext', 'language': 'sk'},
    {'pid': 'P1476', 'variable': 'title_hu', 'value_type': 'monolingualtext', 'language': 'hu'},
    {'pid': 'P1476', 'variable': 'title_nb', 'value_type': 'monolingualtext', 'language': 'nb'},
    {'pid': 'P1476', 'variable': 'title_ko', 'value_type': 'monolingualtext', 'language': 'ko'},
    {'pid': 'P1476', 'variable': 'title_af', 'value_type': 'monolingualtext', 'language': 'af'}
]
'''

# Second alternative configuration (disabled in the same way, as a discarded string literal).
'''
# official website requires qualifier language of work or name (item)
prop_list = [
    {'pid': 'P495', 'variable': 'country_of_origin', 'value_type': 'item','qual': []},
    {'pid': 'P571', 'variable': 'inception', 'value_type': 'date','qual': []},
    {'pid': 'P2669', 'variable': 'discontinued_date', 'value_type': 'date','qual': []},
    {'pid': 'P856', 'variable': 'official_website', 'value_type': 'uri','qual': [{'pid': 'P407', 'variable': 'work_language', 'value_type': 'item'}]},
    {'pid': 'P155', 'variable': 'follows', 'value_type': 'item','qual': []},
    {'pid': 'P156', 'variable': 'followed_by', 'value_type': 'item','qual': []},
    {'pid': 'P2896', 'variable': 'publication_interval', 'value_type': 'quantity','qual': []}
]
'''
# The following properties can contain multiple values per item, so need to be managed in separate CSVs.
# The script needs to be rerun with each one as a single item on the prop_list.
# (The candidate entries below are alternatives to pick from, one per run, not one combined list.)

#prop_list = [
#    {'pid': 'P123', 'variable': 'publisher', 'value_type': 'item'}
#    {'pid': 'P1476', 'variable': 'title', 'value_type': 'monolingualtext', 'language': 'en'}
#    {'pid': 'P31', 'variable': 'instance_of', 'value_type': 'item'}
#    {'pid': 'P407', 'variable': 'language_of_work', 'value_type': 'item'}
#    {'pid': 'P236', 'variable': 'issn', 'value_type': 'string'}
#    {'pid': 'P921', 'variable': 'main_subject', 'value_type': 'item'}
#]

# ----------------
# Utility functions
# ----------------

# Best to send a user-agent header because some Wikimedia servers don't like unidentified clients
def generate_header_dictionary(accept_media_type):
    """Build the HTTP headers sent with a SPARQL query request.

    accept_media_type: value for the Accept header (media type wanted in the response).
    Returns a dictionary with the Accept, Content-Type and User-Agent headers.
    """
    # Identifies this client (project URL and contact address) to the server
    user_agent = 'VanderDiv/0.1 (https://github.com/HeardLibrary/linked-data/tree/master/publications/divinity-law; mailto:steve.baskauf@vanderbilt.edu)'
    return dict([
        ('Accept', accept_media_type),
        # the request body is the raw text of the query
        ('Content-Type', 'application/sparql-query'),
        ('User-Agent', user_agent),
    ])

# Build the request headers once; the same dictionary is passed with every POST to the endpoint
requestheader = generate_header_dictionary(accept_media_type)

# read from a CSV file into a list of dictionaries
def read_dict(filename):
    """Read a UTF-8 CSV file with a header row into a list of row dictionaries.

    filename: path of the CSV file to read.
    Returns a list with one dictionary per data row, keyed by the column headers.
    """
    # newline='' lets the csv module handle line endings inside quoted fields itself
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))

# write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write a list of row dictionaries to a UTF-8 CSV file with a header row.

    table: iterable of dictionaries, one per output row.
    filename: path of the CSV file to create (an existing file is overwritten).
    fieldnames: column headers, in output order; they are also the keys looked up in each row.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)

# extracts the qNumber from a Wikidata IRI
def extract_qnumber(iri):
    """Return the fifth slash-separated piece of an IRI (the Q ID of an entity IRI).

    Raises IndexError when the IRI has fewer than five slash-separated pieces.
    """
    # pattern is http://www.wikidata.org/entity/Q6386232
    return iri.split('/')[4]

# extracts the UUID and qId from a statement IRI
def extract_statement_uuid(iri):
    """Split a statement IRI into its UUID and the Q ID of the item it belongs to.

    Returns a tuple (uuid, qid).
    Raises IndexError when the IRI does not have the expected number of pieces.
    """
    # pattern is http://www.wikidata.org/entity/statement/Q7552806-8B88E0CA-BCC8-49D5-9AC2-F1755464F1A2
    statement_id = iri.split('/')[5]
    segments = statement_id.split('-')
    # the first segment is the Q ID; the next five segments form the UUID
    uuid = '-'.join([segments[1], segments[2], segments[3], segments[4], segments[5]])
    return uuid, segments[0]

# ----------------
# Specialty functions
# ----------------

# function to add variables related to a property to the select clause and graph pattern of the SPARQL query
def sparql_append_property(prop, select_prefix, graph_pattern_prefix):
    """Append the SPARQL variables and graph pattern for one property to a query under construction.

    prop: dictionary describing the property, with keys
        'pid'        - property ID, e.g. 'P856'
        'variable'   - base name used for all SPARQL variables generated for the property
        'value_type' - 'date', 'quantity' and 'globecoordinate' are retrieved through value nodes,
                       'monolingualtext' is filtered by prop['language'], anything else is read
                       as a simple value
        'language'   - language tag; only read when value_type is 'monolingualtext'
        'qual'       - optional list of qualifier dictionaries ('pid', 'variable', 'value_type');
                       a missing key is treated as no qualifiers
    select_prefix: select-clause text accumulated so far.
    graph_pattern_prefix: graph-pattern text accumulated so far.

    Returns a tuple (select_prefix, graph_pattern_prefix) with the text for this property appended.
    """
    variable_name = prop['variable']
    prop_id = prop['pid']
    value_type = prop['value_type']
    # 'qual' is omitted from some property descriptions, so do not require it
    qualifiers = prop.get('qual', [])
    var = '?' + variable_name

    # Create the variables for the select clause of the query
    select_prefix += ' ' + var + '_uuid '
    if value_type == 'date':
        select_prefix += var + '_nodeId ' + var + '_val ' + var + '_prec'
    elif value_type == 'quantity':
        select_prefix += var + '_nodeId ' + var + '_val ' + var + '_unit'
    elif value_type == 'globecoordinate':
        select_prefix += var + '_nodeId ' + var + '_val ' + var + '_long ' + var + '_prec'
    else:
        select_prefix += var
    select_prefix += (
        ' ' + var + '_ref1_hash ' + var + '_ref1_statedIn ' + var + '_ref1_referenceUrl '
        + var + '_ref1_retrieved_nodeId ' + var + '_ref1_retrieved_val ' + var + '_ref1_retrieved_prec '
    )
    for qualifier in qualifiers:
        qual_var = var + '_' + qualifier['variable']
        if qualifier['value_type'] == 'date':
            select_prefix += qual_var + '_nodeId ' + qual_var + '_val ' + qual_var + '_prec '
        else:
            select_prefix += qual_var + ' '

    # Create the graph pattern for the query.
    # Everything for the property sits in one optional block so items lacking the property still match.
    graph_pattern_prefix += 'optional{\n?qid p:' + prop_id + ' ' + var + '_uuid.\n'

    if value_type == 'date':
        graph_pattern_prefix += (
            var + '_uuid psv:' + prop_id + ' ' + var + '_nodeId.\n'
            + var + '_nodeId wikibase:timeValue ' + var + '_val.\n'
            + var + '_nodeId wikibase:timePrecision ' + var + '_prec.\n'
        )
    elif value_type == 'quantity':
        graph_pattern_prefix += (
            var + '_uuid psv:' + prop_id + ' ' + var + '_nodeId.\n'
            + var + '_nodeId wikibase:quantityAmount ' + var + '_val.\n'
            + var + '_nodeId wikibase:quantityUnit ' + var + '_unit.\n'
        )
    elif value_type == 'globecoordinate':
        graph_pattern_prefix += (
            var + '_uuid psv:' + prop_id + ' ' + var + '_nodeId.\n'
            + var + '_nodeId wikibase:geoLatitude ' + var + '_val.\n'
            + var + '_nodeId wikibase:geoLongitude ' + var + '_long.\n'
            + var + '_nodeId wikibase:geoPrecision ' + var + '_prec.\n'
        )
    elif value_type == 'monolingualtext':
        graph_pattern_prefix += (
            var + '_uuid ps:' + prop_id + ' ' + var + '.\n'
            + 'filter(lang(' + var + ')="' + prop['language'] + '")\n'
        )
    else:
        graph_pattern_prefix += var + '_uuid ps:' + prop_id + ' ' + var + '.\n'

    # Reference: stated in (P248), reference URL (P854) and retrieved date (P813), each optional
    graph_pattern_prefix += (
        '\noptional{\n'
        + var + '_uuid prov:wasDerivedFrom ' + var + '_ref1_hash.\n'
        + var + '_ref1_hash pr:P248 ' + var + '_ref1_statedIn.\n'
        + '}\noptional{\n'
        + var + '_uuid prov:wasDerivedFrom ' + var + '_ref1_hash.\n'
        + var + '_ref1_hash pr:P854 ' + var + '_ref1_referenceUrl.\n'
        + '}\noptional{\n'
        + var + '_uuid prov:wasDerivedFrom ' + var + '_ref1_hash.\n'
        + var + '_ref1_hash prv:P813 ' + var + '_ref1_retrieved_nodeId.\n'
        + var + '_ref1_retrieved_nodeId wikibase:timeValue ' + var + '_ref1_retrieved_val.\n'
        + var + '_ref1_retrieved_nodeId wikibase:timePrecision ' + var + '_ref1_retrieved_prec.\n'
        + '}\n'
    )

    for qualifier in qualifiers:
        qual_var = var + '_' + qualifier['variable']
        if qualifier['value_type'] == 'date':
            graph_pattern_prefix += (
                '\noptional{\n'
                + var + '_uuid pqv:' + qualifier['pid'] + ' ' + qual_var + '_nodeId.\n'
                + qual_var + '_nodeId wikibase:timeValue ' + qual_var + '_val.\n'
                + qual_var + '_nodeId wikibase:timePrecision ' + qual_var + '_prec.\n'
                + '}'
            )
        else:
            graph_pattern_prefix += (
                '\noptional{\n'
                + var + '_uuid pq:' + qualifier['pid'] + ' ' + qual_var + '.\n'
                + '}\n'
            )
    # close the optional block opened for the property
    graph_pattern_prefix += '}\n'
    return select_prefix, graph_pattern_prefix

# function to add columns to column list for CSV header
def csv_header_append(prop, header_list):
    """Append the CSV column headers generated for one property to header_list.

    prop: property description dictionary; reads 'variable', 'value_type' and the optional
        'qual' list of qualifier dictionaries (a missing 'qual' is treated as no qualifiers).
    header_list: list of column headers built so far; it is extended in place.

    Returns header_list (the same list object) for convenience.
    """
    variable_name = prop['variable']

    # Value types retrieved through a value node get several columns; all others get a single
    # column named after the variable itself.
    value_suffixes = {
        'date': ['_nodeId', '_val', '_prec'],
        'quantity': ['_nodeId', '_val', '_unit'],
        'globecoordinate': ['_nodeId', '_val', '_long', '_prec'],
    }
    reference_suffixes = [
        '_ref1_hash',
        '_ref1_statedIn',
        '_ref1_referenceUrl',
        '_ref1_retrieved_nodeId',
        '_ref1_retrieved_val',
        '_ref1_retrieved_prec',
    ]

    header_list.append(variable_name + '_uuid')
    for suffix in value_suffixes.get(prop['value_type'], ['']):
        header_list.append(variable_name + suffix)
    for suffix in reference_suffixes:
        header_list.append(variable_name + suffix)

    # 'qual' is omitted from some property descriptions, so do not require it
    for qualifier in prop.get('qual', []):
        qualifier_name = variable_name + '_' + qualifier['variable']
        if qualifier['value_type'] == 'date':
            header_list.append(qualifier_name + '_nodeId')
            header_list.append(qualifier_name + '_val')
            header_list.append(qualifier_name + '_prec')
        else:
            header_list.append(qualifier_name)

    return header_list

## Load list of items from file (or generate by query) and construct Q ID list for query

The CSV has a header row with column headers: `qid` and `label`. The `qid` column contains the Wikidata Q identifiers for each item. The `label` column contains the label, which isn't necessarily the label in Wikidata and isn't used for anything in the script. It does provide a way for humans to recognize the item when looking at the table.

In [None]:
# Build `items`, a list of dictionaries that each have a plain-string 'qid' key (e.g. 'Q42'),
# either from the results of item_query or from the CSV file named by item_source_csv.
if item_source_csv == '':
    # send request to Wikidata Query Service
    print('querying SPARQL endpoint')
    response = requests.post(endpoint, data=item_query, headers=requestheader)
    #print(response.text)
    # Stop with the HTTP error itself rather than failing later while decoding a non-JSON error body
    response.raise_for_status()
    data = response.json()
    print('results returned')

    # extract the values from the response JSON
    # Each binding holds the full item IRI under ['qid']['value']; reduce it to the bare Q ID so
    # that these items have the same shape as the rows loaded from the CSV file.
    items = [{'qid': extract_qnumber(binding['qid']['value'])} for binding in data['results']['bindings']]
    #print(results)
else:
    # Load item data from csv
    print('loading item data from file')
    filename = data_path + item_source_csv
    items = read_dict(filename)
    print('done loading')

# Create VALUES list for items: one prefixed Q ID per line, with no trailing newline
item_qids = '\n'.join('wd:' + item['qid'] for item in items)

#print(item_qids)

In [None]:
# Create the SPARQL query to get the property statements for the items on the Q IDs list

# Collect the select-clause variables and the graph pattern contributed by each property
select_variable_list = ''
graph_pattern = ''
for prop in prop_list:
    select_variable_list, graph_pattern = sparql_append_property(prop, select_variable_list, graph_pattern)

# The query text is gathered as a list of pieces and joined once at the end
query_pieces = ['\nselect distinct ?qid ']

# note: dashes not allowed in SPARQL variable names, so replace with underscores
for label_description_language in label_description_language_list:
    sparql_suffix = label_description_language.replace('-', '_')
    query_pieces.append('?label_' + sparql_suffix + ' ?description_' + sparql_suffix + ' ')
query_pieces.append(select_variable_list)
query_pieces.append('where {')

# restrict the query to the items on the Q IDs list
query_pieces.append('\n  VALUES ?qid\n{\n' + item_qids + '\n}\n')

# made label and description optional since some don't have in English
for label_description_language in label_description_language_list:
    sparql_suffix = label_description_language.replace('-', '_')
    label_variable = '?label_' + sparql_suffix
    description_variable = '?description_' + sparql_suffix
    # the language filter uses the language code as written (dashes kept)
    query_pieces.append(
        '\noptional {\n'
        + '?qid rdfs:label ' + label_variable + '.\n'
        + 'filter(lang(' + label_variable + ')="' + label_description_language + '")\n'
        + '}\noptional {\n'
        + '?qid schema:description ' + description_variable + '.\n'
        + 'filter(lang(' + description_variable + ')="' + label_description_language + '")\n'
        + '}\n'
    )

# property patterns, then the closing brace of the where clause
query_pieces.append(graph_pattern + '\n}')
query = ''.join(query_pieces)

#print(query)

In [None]:
# send request to Wikidata Query Service
print('querying SPARQL endpoint')
response = requests.post(endpoint, data=query, headers=requestheader)
#print(response.text)
# Stop with the HTTP error itself (e.g. a throttling or timeout status) rather than failing
# later while trying to decode a non-JSON error body
response.raise_for_status()
data = response.json()

# extract the values from the response JSON
results = data['results']['bindings']

print('done retrieving data')
# print(json.dumps(results, indent=2))

In [None]:
# extract results
# Turn every row of the query results into a flat dictionary (one key per CSV column).
# A variable that is missing from a result row (KeyError), or whose IRI does not have the
# expected shape (IndexError), produces an empty string in the corresponding column.

# For value types retrieved through a value node: suffixes of the columns holding the node's
# literal values (the node itself is identified in an additional _nodeId column).
value_node_suffixes = {
    'date': ['_val', '_prec'],
    'quantity': ['_val', '_unit'],
    'globecoordinate': ['_val', '_long', '_prec']
}
# Reference columns whose values are IRIs that get reduced with extract_qnumber().
# Note: the form of the node ID is http://www.wikidata.org/value/0a8f688406e3fc53d0119eafcd2c0396
# so the extract_qnumber() function can be used on it.
reference_iri_suffixes = ['_ref1_hash', '_ref1_statedIn', '_ref1_retrieved_nodeId']
# Reference columns whose values are copied unchanged
reference_literal_suffixes = ['_ref1_referenceUrl', '_ref1_retrieved_val', '_ref1_retrieved_prec']

metadata_list = []
for result in results:
    row_dict = {}
    row_dict['qid'] = extract_qnumber(result['qid']['value'])

    for label_description_language in label_description_language_list:
        # the SPARQL variable names use underscores where the language code has dashes
        sparql_language = label_description_language.replace('-', '_')
        try:
            row_dict['label_' + label_description_language] = result['label_' + sparql_language]['value']
        except KeyError:
            row_dict['label_' + label_description_language] = ''
        if manage_labels_descriptions:
            try:
                row_dict['description_' + label_description_language] = result['description_' + sparql_language]['value']
            except KeyError:
                row_dict['description_' + label_description_language] = ''

    for prop in prop_list:
        variable = prop['variable']
        value_type = prop['value_type']

        # statement identifier (only the UUID part is kept)
        try:
            row_dict[variable + '_uuid'] = extract_statement_uuid(result[variable + '_uuid']['value'])[0]
        except (KeyError, IndexError):
            row_dict[variable + '_uuid'] = ''

        # statement value
        try:
            if value_type == 'item':
                row_dict[variable] = extract_qnumber(result[variable]['value'])
            elif value_type in value_node_suffixes:
                row_dict[variable + '_nodeId'] = extract_qnumber(result[variable + '_nodeId']['value'])
                for suffix in value_node_suffixes[value_type]:
                    row_dict[variable + suffix] = result[variable + suffix]['value']
            else:
                row_dict[variable] = result[variable]['value']
        except (KeyError, IndexError):
            # blank every column of the value so that the row has no partially filled value
            if value_type in value_node_suffixes:
                row_dict[variable + '_nodeId'] = ''
                for suffix in value_node_suffixes[value_type]:
                    row_dict[variable + suffix] = ''
            else:
                row_dict[variable] = ''

        # reference columns, each handled independently of the others
        for suffix in reference_iri_suffixes:
            try:
                row_dict[variable + suffix] = extract_qnumber(result[variable + suffix]['value'])
            except (KeyError, IndexError):
                row_dict[variable + suffix] = ''
        for suffix in reference_literal_suffixes:
            try:
                row_dict[variable + suffix] = result[variable + suffix]['value']
            except KeyError:
                row_dict[variable + suffix] = ''

        # qualifier columns; 'qual' may be omitted from a property description
        for qualifier in prop.get('qual', []):
            qualifier_name = variable + '_' + qualifier['variable']
            try:
                if qualifier['value_type'] == 'date':
                    row_dict[qualifier_name + '_nodeId'] = extract_qnumber(result[qualifier_name + '_nodeId']['value'])
                    row_dict[qualifier_name + '_val'] = result[qualifier_name + '_val']['value']
                    row_dict[qualifier_name + '_prec'] = result[qualifier_name + '_prec']['value']
                elif qualifier['value_type'] == 'item':
                    row_dict[qualifier_name] = extract_qnumber(result[qualifier_name]['value'])
                else:
                    row_dict[qualifier_name] = result[qualifier_name]['value']
            except (KeyError, IndexError):
                if qualifier['value_type'] == 'date':
                    row_dict[qualifier_name + '_nodeId'] = ''
                    row_dict[qualifier_name + '_val'] = ''
                    row_dict[qualifier_name + '_prec'] = ''
                else:
                    row_dict[qualifier_name] = ''

    metadata_list.append(row_dict)

# print(json.dumps(metadata_list, indent=2))

In [None]:
# create the list of column headers
fieldnames = ['qid']

# The label/description headers must be identical to the row keys built when the results were
# extracted, and those keys use the language code as written (dashes kept); the csv writer
# rejects rows containing keys that are not in fieldnames.
label_columns = ['label_' + language for language in label_description_language_list]
description_columns = ['description_' + language for language in label_description_language_list]

if manage_labels_descriptions:
    # The schema generator puts all of the labels first, then the descriptions
    fieldnames += label_columns
    fieldnames += description_columns
for prop in prop_list:
    fieldnames = csv_header_append(prop, fieldnames)
if not manage_labels_descriptions:
    # Labels are still extracted for every language in this mode, so every label column is
    # listed (after the property columns); with an empty language list nothing is added.
    fieldnames += label_columns
# print(fieldnames)

# write the data to a CSV file
print('writing data to file')
write_dicts_to_csv(metadata_list, data_path + output_file_name, fieldnames)

print('done')