# Script to pull metadata from Wikidata

Throughout the script I refer to "Wikidata" but this could be used for any Wikibase instance.

## Configuration section

Import modules, set values, and define functions

In [1]:
# acquire-wikidata-metadata.ipynb This is part of the VandyCite project https://www.wikidata.org/wiki/Wikidata:WikiProject_VandyCite
# (c) 2020 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf 2020-09-05

from pathlib import Path
import requests
from time import sleep
import json
import csv

'''
import vb_common_code as vbc
import math
from fuzzywuzzy import fuzz # fuzzy logic matching
from fuzzywuzzy import process
import xml.etree.ElementTree as et # library to traverse XML tree
import urllib
import datetime
import string
import copy # import the copy module from the standard library
'''

sparql_sleep = 0.1 # number of seconds to wait between queries to SPARQL endpoint
home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
data_path = home + '/divinity-law/'
item_source_csv = 'identified-journals.csv'

endpoint = 'https://query.wikidata.org/sparql'
accept_media_type = 'application/json'

# known_name_strings_path = home + '/divinity-law/known-name-strings.csv'
# wikidata_article_matches_path  = home + '/divinity-law/wikidata-article-matches.csv'
# duplicate_works_path  = home + '/divinity-law/duplicate-works.csv'
# journals_path = home + '/divinity-law/journals.csv'
# employerQId = 'Q29052' # Vanderbilt University
# testRatio = 90 # similarity required for a potential match of a generic wikidata match
# journalTestRatio = 94 # similarity required for a potential match of a generic wikidata match


def generate_header_dictionary(accept_media_type):
    user_agent_header = 'VanderDiv/0.1 (https://github.com/HeardLibrary/linked-data/tree/master/publications/divinity-law; mailto:steve.baskauf@vanderbilt.edu)'
    requestHeaderDictionary = {
        'Accept' : accept_media_type,
        'Content-Type': 'application/sparql-query',
        'User-Agent': user_agent_header
    }
    return requestHeaderDictionary

requestheader = generate_header_dictionary(accept_media_type)

# read from a CSV file into a list of dictionaries
def read_dict(filename):
    with open(filename, 'r', newline='', encoding='utf-8') as file_object:
        dict_object = csv.DictReader(file_object)
        array = []
        for row in dict_object:
            array.append(row)
    return array

# extracts the qNumber from a Wikidata IRI
def extract_qnumber(iri):
    # pattern is http://www.wikidata.org/entity/Q6386232
    pieces = iri.split('/')
    return pieces[4]

# extracts the UUID and qId from a statement IRI
def extract_statement_uuid(iri):
    # pattern is http://www.wikidata.org/entity/statement/Q7552806-8B88E0CA-BCC8-49D5-9AC2-F1755464F1A2
    pieces = iri.split('/')
    statement_id = pieces[5]
    pieces = statement_id.split('-')
    return pieces[1] + '-' + pieces[2] + '-' + pieces[3] + '-' + pieces[4] + '-' + pieces[5], pieces[0]

'''
# extracts the reference hash from a reference IRI
def extract_reference_hash(iri):
    # pattern is http://www.wikidata.org/entity/Q6386232
    pieces = iri.split('/')
    return pieces[5]
'''
'''
# read from a CSV file beginning with a bit order mark (BOM) into a list of dictionaries
# Change encoding type to utf-8-sig instead of utf-8 to fix this
def read_dict_bom(filename):
    with open(filename, 'r', newline='', encoding='utf-8-sig') as fileObject: 
        dictObject = csv.DictReader(fileObject)
        array = []
        for row in dictObject:
            array.append(row)
    return array

def reverse_names(nameLastFirst):
    nameLastFirst = nameLastFirst.strip()
    nameParts = nameLastFirst.split(',')
    if len(nameParts) < 2: # name probably isn't reversed
        name = nameLastFirst.strip()
    else:
        firstName = nameParts[1].strip()
        lastName = nameParts[0].strip()
        name = firstName + ' ' + lastName

        if len(nameParts) > 2:
            suffix = nameParts[2].strip()
        else:
            suffix = ''
        name = firstName + ' ' + lastName
        if suffix == 'Jr.':
            name += ', Jr.'
        elif suffix == 'Jr':
            name += ', Jr.'
        elif suffix == '':
            pass
        elif suffix == 'II':
            name += ' ' + suffix
        elif suffix == 'III':
            name += ' ' + suffix
        elif suffix == 'IV':
            name += ' ' + suffix
        elif suffix == 'V':
            name += ' ' + suffix
    return name
'''


"\n# read from a CSV file beginning with a bit order mark (BOM) into a list of dictionaries\n# Change encoding type to utf-8-sig instead of utf-8 to fix this\ndef read_dict_bom(filename):\n    with open(filename, 'r', newline='', encoding='utf-8-sig') as fileObject: \n        dictObject = csv.DictReader(fileObject)\n        array = []\n        for row in dictObject:\n            array.append(row)\n    return array\n\ndef reverse_names(nameLastFirst):\n    nameLastFirst = nameLastFirst.strip()\n    nameParts = nameLastFirst.split(',')\n    if len(nameParts) < 2: # name probably isn't reversed\n        name = nameLastFirst.strip()\n    else:\n        firstName = nameParts[1].strip()\n        lastName = nameParts[0].strip()\n        name = firstName + ' ' + lastName\n\n        if len(nameParts) > 2:\n            suffix = nameParts[2].strip()\n        else:\n            suffix = ''\n        name = firstName + ' ' + lastName\n        if suffix == 'Jr.':\n            name += ', Jr.'\n       

## Load list of items from file

The CSV has a header row with column headers: `qid` and `label`. The `qid` column contains the Wikidata Q identifiers for each item. The `label` column contains the label, which isn't necessarily the label in Wikidata, but provides a way for humans to recognize the item.

In [2]:
# Load item data from csv
print('loading item data from file')
filename = data_path + item_source_csv
items = read_dict(filename)

# Create VALUES list for journals
item_qids = ''
for item in items:
    item_qids += 'wd:' + item['qid'] + '\n'
# remove trailing newline
item_qids = item_qids[:len(item_qids)-1]

# create properties dictionary
prop_list = [
    {'pid': 'P31', 'variable': 'instance_of', 'value_type': 'item'},
    {'pid': 'P1476', 'variable': 'title', 'value_type': 'string'},
    {'pid': 'P407', 'variable': 'language_of_work', 'value_type': 'item'},
    {'pid': 'P495', 'variable': 'country_of_origin', 'value_type': 'item'},
    {'pid': 'P123', 'variable': 'publisher', 'value_type': 'item'},
    {'pid': 'P571', 'variable': 'inception', 'value_type': 'date'},
    {'pid': 'P2669', 'variable': 'discontinued_date', 'value_type': 'item'},
    {'pid': 'P856', 'variable': 'official_website', 'value_type': 'uri'},
    {'pid': 'P155', 'variable': 'follows', 'value_type': 'item'},
    {'pid': 'P156', 'variable': 'followed_by', 'value_type': 'item'},
    {'pid': 'P921', 'variable': 'main_subject', 'value_type': 'item'},
    {'pid': 'P2896', 'variable': 'publication_interval', 'value_type': 'decimal'},
    {'pid': 'P236', 'variable': 'issn', 'value_type': 'string'}
]

#print(item_qids)

loading item data from file


This cell is to find out what properties are used by references given for the statement properties above and the items in the list.

### Breakdown:

I am dubious about using P236 (ISSN) as a source property. See [Q63871731](https://www.wikidata.org/wiki/Q63871731) for example. Since a URL is used, wouldn't it be P854 (reference URL)?

P1683 (quotation) seems to be being used to provide the string version of the `stated in` item. See [Q6295853](https://www.wikidata.org/wiki/Q6295853).

P887 (based on heuristic) is legitimate for language of the work. Daniel Mietchen uses it with [Q15755692](https://www.wikidata.org/wiki/Q15755692). But it seems like we could do better. Not sure about this one but not widely used (2 journals only).

We won't be using P143 (imported from Wikimedia project) and P4656 (Wikimedia import URL) since we aren't importing from Wikipedia. They are OK, but if we are really going to improve the quality of the data, we should be using primary sources. Therefore, I don't see this as worthy of tracking.

P4327 (BHL bibliography ID) is used in addition to P248 (stated in) within the same reference. Not sure if this is the best practice, but I don't think it's a source we will be using. See [Q6087079](https://www.wikidata.org/wiki/Q6087079) for an example.

P1436 (title) is used along with P854 (reference URL) to show the title of the page. Maybe not a bad idea, but only done once in [Q6087079](https://www.wikidata.org/wiki/Q6087079), so not useful to track.

### Conclusion

That leaves us with:

- P248 (stated in)
- P854 (reference URL)
- P813 (retrieved)

which are the properties I would have thought of using anyway. So that is good verification.

In [59]:
for property in prop_list:
    print('*', property['pid'], property['variable'])
    query = '''select distinct ?gprop ?prop_label where {
    '''
    query += '''
      VALUES ?qid
    {
    ''' + item_qids + '''
    }

    '''
    query += '?qid p:' + property['pid'] + ''' ?issn_statement.
    ?issn_statement prov:wasDerivedFrom ?reference.
    ?reference ?prop ?value.
    ?gprop wikibase:reference ?prop.
    ?gprop rdfs:label ?prop_label.
    filter(lang(?prop_label)='en')
    }'''
    #print(query)
    
    # send request to Wikidata Query Service
    response = requests.post(endpoint, data=query, headers=requestheader)
    data = response.json()

    # extract the values from the response JSON
    results = data['results']['bindings']
    #print(json.dumps(results, indent = 2))
    for result in results:
        print(extract_qnumber(result['gprop']['value']), result['prop_label']['value'])
    print()
    sleep(sparql_sleep)

* P31 instance_of
P236 ISSN
P143 imported from Wikimedia project
P248 stated in
P813 retrieved
P1683 quotation

* P1476 title
P143 imported from Wikimedia project
P236 ISSN
P248 stated in

* P407 language_of_work
P143 imported from Wikimedia project
P248 stated in
P854 reference URL
P813 retrieved
P887 based on heuristic
P4656 Wikimedia import URL

* P495 country_of_origin
P143 imported from Wikimedia project
P248 stated in
P854 reference URL
P4656 Wikimedia import URL

* P123 publisher
P143 imported from Wikimedia project
P236 ISSN
P248 stated in
P813 retrieved
P854 reference URL
P4656 Wikimedia import URL

* P571 inception
P143 imported from Wikimedia project
P248 stated in
P854 reference URL
P4327 BHL bibliography ID
P1476 title

* P2669 discontinued_date

* P856 official_website
P143 imported from Wikimedia project
P813 retrieved
P4656 Wikimedia import URL
P854 reference URL

* P155 follows

* P156 followed_by

* P921 main_subject
P143 imported from Wikimedia project
P248 stated in

## Create query to get the statement UUID and value

SPARQL query to be sent to the Wikidata Query Service (WDQS)

In [79]:
property = prop_list[12]

# create a string for the query
query = '''
select distinct ?qid '''

query += '?' + property['variable'] + '_value '
query += '?' + property['variable'] + '_statement '

query += 'where {'

query += '''
  VALUES ?qid
{
''' + item_qids + '''
}

'''


query += '?qid wdt:' + property['pid'] + ' ?' + property['variable'] + '_value.\n'
query += '?qid p:' + property['pid'] + ' ?' + property['variable'] + '_statement.\n'
query += '}'

#print(query)

Send the query to the WDQS and extract the results from the returned JSON

In [80]:
# Send SPARQL query to the Wikidata Query Service
print('retrieving data from Wikidata')

# send request to Wikidata Query Service
response = requests.post(endpoint, data=query, headers=requestheader)
data = response.json()

# extract the values from the response JSON
results = data['results']['bindings']
#print(json.dumps(results, indent = 2))

retrieving data from Wikidata


Extract the qids, values, and statement UUIDs from the response data and store in a list of dicts (`statements_list`). Note: the `extract_statement_uuid()` function returns `uuid`,`qid` extracted from the statement IRI, so they have to be re-assembled to make the statement identifier: *qid*-*UUID*.

In [81]:
statements_list = [] # create list of data for statements, not really used for anything yet
statement_uuids = '' # create VALUES list for statmements
for result in results:
    row_dict = {}
    row_dict = {'qid': extract_qnumber(result['qid']['value'])}

    if property['value_type'] == 'item':
        # remove wd: 'http://www.wikidata.org/entity/'
        value = extract_qnumber(result[property['variable'] + '_value']['value'])
    else:
        value = result[property['variable'] + '_value']['value']
    row_dict[property['variable'] + '_value'] = value
    uuid, temp_qid = extract_statement_uuid(result[property['variable'] + '_statement']['value'])
    row_dict[property['variable'] + '_statement'] = uuid
    statement_uuids += 'wds:' + temp_qid + '-' + uuid + '\n'
    statements_list.append(row_dict)

# remove trailing newline
statement_uuids = statement_uuids[:len(statement_uuids)-1]

#print(statement_uuids)
#print(statements_list)

Create query to retrieve the reference metadata

In [82]:
# create a string for the query
query = 'select distinct ?label ?' + property['variable'] + ' ?' + property['variable'] + '_uuid ?' + property['variable'] + '_ref1_hash ?' + property['variable'] + '_ref1_statedIn ?' + property['variable'] + '_ref1_referenceUrl ?' + property['variable'] + '_ref1_retrieved_nodeid ?' + property['variable'] + '_ref1_retrieved_val ?' +property['variable'] + '_ref1_retrieved_prec where {'

query += '''
  VALUES ?''' + property['variable'] + '''_uuid
{
''' + statement_uuids + '''
}

'''
query += '''
?''' + property['variable'] + '_uuid ps:' + property['pid'] + ' ?' + property['variable'] + '''.
?qid p:''' + property['pid'] + ' ?' + property['variable'] + '''_uuid.
?qid rdfs:label ?label.
filter(lang(?label)='en')
optional{
?''' + property['variable'] + '''_uuid prov:wasDerivedFrom ?''' + property['variable'] + '''_ref1_hash.
?''' + property['variable'] + '_ref1_hash pr:P248 ?' + property['variable'] + '''_ref1_statedIn.
}
optional{
?''' + property['variable'] + '''_uuid prov:wasDerivedFrom ?''' + property['variable'] + '''_ref1_hash.
?''' + property['variable'] + '_ref1_hash pr:P854 ?' + property['variable'] + '''_ref1_referenceUrl.
}
optional{
?''' + property['variable'] + '''_uuid prov:wasDerivedFrom ?''' + property['variable'] + '''_ref1_hash.
?''' + property['variable'] + '_ref1_hash prv:P813 ?' + property['variable'] + '''_ref1_retrieved_nodeid.
?''' + property['variable'] + '_ref1_retrieved_nodeid wikibase:timeValue ?' + property['variable'] + '''_ref1_retrieved_val.
?''' + property['variable'] + '_ref1_retrieved_nodeid wikibase:timePrecision ?' + property['variable'] + '''_ref1_retrieved_prec.
}
}'''

print(query)

select distinct ?label ?issn ?issn_uuid ?issn_ref1_hash ?issn_ref1_statedIn ?issn_ref1_referenceUrl ?issn_ref1_retrieved_nodeid ?issn_ref1_retrieved_val ?issn_ref1_retrieved_prec where {
  VALUES ?issn_uuid
{
wds:Q15762548-96569F39-A7F9-41D9-92C8-1587D112CC48
wds:Q7552806-8B88E0CA-BCC8-49D5-9AC2-F1755464F1A2
wds:Q7552806-BBD52CF9-767D-4A9B-9011-05EB7B6E539F
wds:Q7552806-8B88E0CA-BCC8-49D5-9AC2-F1755464F1A2
wds:Q7552806-BBD52CF9-767D-4A9B-9011-05EB7B6E539F
wds:Q7628082-30F84C7E-A697-482F-B091-B2D307ECA29D
wds:Q7628082-ABA370FA-6EE6-4189-AE7A-9A48815DD744
wds:Q7628082-30F84C7E-A697-482F-B091-B2D307ECA29D
wds:Q7628082-ABA370FA-6EE6-4189-AE7A-9A48815DD744
wds:Q6888892-81592DF0-85DE-4F57-A908-00D2A03830BD
wds:Q6888892-832A0534-9348-4720-9762-062A8F58A643
wds:Q6888892-81592DF0-85DE-4F57-A908-00D2A03830BD
wds:Q6888892-832A0534-9348-4720-9762-062A8F58A643
wds:Q6071283-45FAE304-C0AB-4CED-8E29-3E5325CD1851
wds:Q6071283-55F29998-6459-458A-8CF2-41907EB197A3
wds:Q6071283-45FAE304-C0AB-4CED-8E29-3E5

In [83]:
# send request to Wikidata Query Service
response = requests.post(endpoint, data=query, headers=requestheader)
data = response.json()

# extract the values from the response JSON
results = data['results']['bindings']
results

[{'issn_uuid': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/statement/Q50426426-6F3C89F1-D5D1-4FDC-80C9-C822770E1E1F'},
  'issn_ref1_hash': {'type': 'uri',
   'value': 'http://www.wikidata.org/reference/7d86a461ec09093a75573d9b93c24394a0efacd9'},
  'issn_ref1_retrieved_nodeid': {'type': 'uri',
   'value': 'http://www.wikidata.org/value/aac34e923e2709b62d83e34900e505f5'},
  'issn_ref1_retrieved_val': {'datatype': 'http://www.w3.org/2001/XMLSchema#dateTime',
   'type': 'literal',
   'value': '2018-03-29T00:00:00Z'},
  'issn_ref1_retrieved_prec': {'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
   'type': 'literal',
   'value': '11'},
  'issn_ref1_referenceUrl': {'type': 'uri',
   'value': 'https://doaj.org/toc/0103-801X'},
  'issn_ref1_statedIn': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/Q1227538'},
  'issn': {'type': 'literal', 'value': '0103-801X'},
  'label': {'xml:lang': 'en',
   'type': 'literal',
   'value': 'Estudos de Religião'}},
 {'issn_

In [84]:
# extract results
references_list = []
for result in results:
    row_dict = {}
    row_dict[property['variable'] + '_uuid'], row_dict['qid'] = extract_statement_uuid(result[property['variable'] + '_uuid']['value'])
    row_dict[property['variable']] = result[property['variable']]['value']
    row_dict['label'] = result['label']['value']
    try:
        row_dict[property['variable'] + '_ref1_hash'] = extract_qnumber(result[property['variable'] + '_ref1_hash']['value'])
    except:
        row_dict[property['variable'] + '_ref1_hash'] = ''
    try:
        row_dict[property['variable'] + '_ref1_statedIn'] = extract_qnumber(result[property['variable'] + '_ref1_statedIn']['value'])
    except:
        row_dict[property['variable'] + '_ref1_statedIn'] = ''
    try:
        row_dict[property['variable'] + '_ref1_referenceUrl'] = result[property['variable'] + '_ref1_referenceUrl']['value']
    except:
        row_dict[property['variable'] + '_ref1_referenceUrl'] = ''
    try:
        # Note: the form of the node ID is http://www.wikidata.org/value/0a8f688406e3fc53d0119eafcd2c0396
        # so the extract_qnumber() function can be used on it.
        row_dict[property['variable'] + '_ref1_retrieved_nodeid'] = extract_qnumber(result[property['variable'] + '_ref1_retrieved_nodeid']['value'])
    except:
        row_dict[property['variable'] + '_ref1_retrieved_nodeid'] = ''
    try:
        row_dict[property['variable'] + '_ref1_retrieved_val'] = result[property['variable'] + '_ref1_retrieved_val']['value']
    except:
        row_dict[property['variable'] + '_ref1_retrieved_val'] = ''
    try:
        row_dict[property['variable'] + '_ref1_retrieved_prec'] = result[property['variable'] + '_ref1_retrieved_prec']['value']
    except:
        row_dict[property['variable'] + '_ref1_retrieved_prec'] = ''
    references_list.append(row_dict)
    

print(json.dumps(references_list, indent=2))

[
  {
    "issn_uuid": "6F3C89F1-D5D1-4FDC-80C9-C822770E1E1F",
    "qid": "Q50426426",
    "issn": "0103-801X",
    "label": "Estudos de Religi\u00e3o",
    "issn_ref1_hash": "7d86a461ec09093a75573d9b93c24394a0efacd9",
    "issn_ref1_statedIn": "Q1227538",
    "issn_ref1_referenceUrl": "https://doaj.org/toc/0103-801X",
    "issn_ref1_retrieved_nodeid": "aac34e923e2709b62d83e34900e505f5",
    "issn_ref1_retrieved_val": "2018-03-29T00:00:00Z",
    "issn_ref1_retrieved_prec": "11"
  },
  {
    "issn_uuid": "1FE5E287-C708-44BA-A42E-BBDF39E7B8EE",
    "qid": "Q3523724",
    "issn": "2169-1304",
    "label": "Theological Studies",
    "issn_ref1_hash": "36cd573425c873bfb236b993e285640d04bbc978",
    "issn_ref1_statedIn": "",
    "issn_ref1_referenceUrl": "",
    "issn_ref1_retrieved_nodeid": "5a222f4e76bbcf99182811d78451dd85",
    "issn_ref1_retrieved_val": "2018-03-23T00:00:00Z",
    "issn_ref1_retrieved_prec": "11"
  },
  {
    "issn_uuid": "857F19D1-F329-4A4E-B4A3-2EE1417B3F97",
    "qid"

In [85]:
# create the list of column headers
fieldnames = ['qid', property['variable'] + '_uuid', property['variable'], property['variable'] + '_ref1_hash', property['variable'] + '_ref1_statedIn', property['variable'] + '_ref1_referenceUrl', property['variable'] + '_ref1_retrieved_nodeid', property['variable'] + '_ref1_retrieved_val', property['variable'] + '_ref1_retrieved_prec', 'label']

# write the data to a CSV file
print('writing data to file')
vbc.writeDictsToCsv(references_list, home + '/divinity-law/new-test-journal-issn.csv', fieldnames)

print('done')

writing data to file
done
