# Script to pull metadata from Wikidata

Throughout the script I refer to "Wikidata" but this could be used for any Wikibase instance.

## Configuration section

Import modules, set values, and define functions

In [57]:
# acquire-wikidata-metadata.ipynb This is part of the VandyCite project https://www.wikidata.org/wiki/Wikidata:WikiProject_VandyCite
# (c) 2020 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf 2020-09-05

import vb_common_code as vbc
from pathlib import Path
import requests
from time import sleep
import json

'''
import csv
import math
from fuzzywuzzy import fuzz # fuzzy logic matching
from fuzzywuzzy import process
import xml.etree.ElementTree as et # library to traverse XML tree
import urllib
import datetime
import string
import copy # import the copy module from the standard library
'''

sparql_sleep = 0.1 # number of seconds to wait between queries to SPARQL endpoint
home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
data_path = home + '/divinity-law/'
item_source_csv = 'identified-journals.csv'

endpoint = 'https://query.wikidata.org/sparql'
accept_media_type = 'application/json'

# known_name_strings_path = home + '/divinity-law/known-name-strings.csv'
# wikidata_article_matches_path  = home + '/divinity-law/wikidata-article-matches.csv'
# duplicate_works_path  = home + '/divinity-law/duplicate-works.csv'
# journals_path = home + '/divinity-law/journals.csv'
# employerQId = 'Q29052' # Vanderbilt University
# testRatio = 90 # similarity required for a potential match of a generic wikidata match
# journalTestRatio = 94 # similarity required for a potential match of a generic wikidata match


def generate_header_dictionary(accept_media_type):
    user_agent_header = 'VanderDiv/0.1 (https://github.com/HeardLibrary/linked-data/tree/master/publications/divinity-law; mailto:steve.baskauf@vanderbilt.edu)'
    requestHeaderDictionary = {
        'Accept' : accept_media_type,
        'Content-Type': 'application/sparql-query',
        'User-Agent': user_agent_header
    }
    return requestHeaderDictionary

requestheader = generate_header_dictionary(accept_media_type)

# extracts the qNumber from a Wikidata IRI
def extract_qnumber(iri):
    # pattern is http://www.wikidata.org/entity/Q6386232
    pieces = iri.split('/')
    return pieces[4]

# extracts the UUID and qId from a statement IRI
def extract_statement_uuid(iri):
    # pattern is http://www.wikidata.org/entity/statement/Q7552806-8B88E0CA-BCC8-49D5-9AC2-F1755464F1A2
    pieces = iri.split('/')
    statement_id = pieces[5]
    pieces = statement_id.split('-')
    return pieces[1] + '-' + pieces[2] + '-' + pieces[3] + '-' + pieces[4] + '-' + pieces[5], pieces[0]

'''
# extracts the reference hash from a reference IRI
def extract_reference_hash(iri):
    # pattern is http://www.wikidata.org/entity/Q6386232
    pieces = iri.split('/')
    return pieces[5]
'''
'''
# read from a CSV file beginning with a bit order mark (BOM) into a list of dictionaries
# Change encoding type to utf-8-sig instead of utf-8 to fix this
def read_dict_bom(filename):
    with open(filename, 'r', newline='', encoding='utf-8-sig') as fileObject: 
        dictObject = csv.DictReader(fileObject)
        array = []
        for row in dictObject:
            array.append(row)
    return array

def reverse_names(nameLastFirst):
    nameLastFirst = nameLastFirst.strip()
    nameParts = nameLastFirst.split(',')
    if len(nameParts) < 2: # name probably isn't reversed
        name = nameLastFirst.strip()
    else:
        firstName = nameParts[1].strip()
        lastName = nameParts[0].strip()
        name = firstName + ' ' + lastName

        if len(nameParts) > 2:
            suffix = nameParts[2].strip()
        else:
            suffix = ''
        name = firstName + ' ' + lastName
        if suffix == 'Jr.':
            name += ', Jr.'
        elif suffix == 'Jr':
            name += ', Jr.'
        elif suffix == '':
            pass
        elif suffix == 'II':
            name += ' ' + suffix
        elif suffix == 'III':
            name += ' ' + suffix
        elif suffix == 'IV':
            name += ' ' + suffix
        elif suffix == 'V':
            name += ' ' + suffix
    return name
'''


"\n# read from a CSV file beginning with a bit order mark (BOM) into a list of dictionaries\n# Change encoding type to utf-8-sig instead of utf-8 to fix this\ndef read_dict_bom(filename):\n    with open(filename, 'r', newline='', encoding='utf-8-sig') as fileObject: \n        dictObject = csv.DictReader(fileObject)\n        array = []\n        for row in dictObject:\n            array.append(row)\n    return array\n\ndef reverse_names(nameLastFirst):\n    nameLastFirst = nameLastFirst.strip()\n    nameParts = nameLastFirst.split(',')\n    if len(nameParts) < 2: # name probably isn't reversed\n        name = nameLastFirst.strip()\n    else:\n        firstName = nameParts[1].strip()\n        lastName = nameParts[0].strip()\n        name = firstName + ' ' + lastName\n\n        if len(nameParts) > 2:\n            suffix = nameParts[2].strip()\n        else:\n            suffix = ''\n        name = firstName + ' ' + lastName\n        if suffix == 'Jr.':\n            name += ', Jr.'\n       

## Load list of items from file

The CSV has a header row with column headers: `qid` and `label`. The `qid` column contains the Wikidata Q identifiers for each item. The `label` column contains the label, which isn't necessarily the label in Wikidata, but provides a way for humans to recognize the item.

In [54]:
# Load item data from csv
print('loading item data from file')
filename = data_path + item_source_csv
items = vbc.readDict(filename)

# Create VALUES list for journals
item_qids = ''
for item in items:
    item_qids += 'wd:' + item['qid'] + '\n'
# remove trailing newline
item_qids = item_qids[:len(item_qids)-1]

# create properties dictionary
prop_list = [
    {'pid': 'P31', 'variable': 'instance_of', 'value_type': 'item'},
    {'pid': 'P1476', 'variable': 'title', 'value_type': 'string'},
    {'pid': 'P407', 'variable': 'language_of_work', 'value_type': 'item'},
    {'pid': 'P495', 'variable': 'country_of_origin', 'value_type': 'item'},
    {'pid': 'P123', 'variable': 'publisher', 'value_type': 'item'},
    {'pid': 'P571', 'variable': 'inception', 'value_type': 'date'},
    {'pid': 'P2669', 'variable': 'discontinued_date', 'value_type': 'item'},
    {'pid': 'P856', 'variable': 'official_website', 'value_type': 'uri'},
    {'pid': 'P155', 'variable': 'follows', 'value_type': 'item'},
    {'pid': 'P156', 'variable': 'followed_by', 'value_type': 'item'},
    {'pid': 'P921', 'variable': 'main_subject', 'value_type': 'item'},
    {'pid': 'P2896', 'variable': 'publication_interval', 'value_type': 'decimal'},
    {'pid': 'P236', 'variable': 'issn', 'value_type': 'string'}
]

#print(item_qids)

loading item data from file


This cell is to find out what properties are used by references given for the statement properties above and the items in the list.

In [59]:
for property in prop_list:
    print('*', property['pid'], property['variable'])
    query = '''select distinct ?gprop ?prop_label where {
    '''
    query += '''
      VALUES ?qid
    {
    ''' + item_qids + '''
    }

    '''
    query += '?qid p:' + property['pid'] + ''' ?issn_statement.
    ?issn_statement prov:wasDerivedFrom ?reference.
    ?reference ?prop ?value.
    ?gprop wikibase:reference ?prop.
    ?gprop rdfs:label ?prop_label.
    filter(lang(?prop_label)='en')
    }'''
    #print(query)
    
    # send request to Wikidata Query Service
    response = requests.post(endpoint, data=query, headers=requestheader)
    data = response.json()

    # extract the values from the response JSON
    results = data['results']['bindings']
    #print(json.dumps(results, indent = 2))
    for result in results:
        print(extract_qnumber(result['gprop']['value']), result['prop_label']['value'])
    print()
    sleep(sparql_sleep)

* P31 instance_of
P236 ISSN
P143 imported from Wikimedia project
P248 stated in
P813 retrieved
P1683 quotation

* P1476 title
P143 imported from Wikimedia project
P236 ISSN
P248 stated in

* P407 language_of_work
P143 imported from Wikimedia project
P248 stated in
P854 reference URL
P813 retrieved
P887 based on heuristic
P4656 Wikimedia import URL

* P495 country_of_origin
P143 imported from Wikimedia project
P248 stated in
P854 reference URL
P4656 Wikimedia import URL

* P123 publisher
P143 imported from Wikimedia project
P236 ISSN
P248 stated in
P813 retrieved
P854 reference URL
P4656 Wikimedia import URL

* P571 inception
P143 imported from Wikimedia project
P248 stated in
P854 reference URL
P4327 BHL bibliography ID
P1476 title

* P2669 discontinued_date

* P856 official_website
P143 imported from Wikimedia project
P813 retrieved
P4656 Wikimedia import URL
P854 reference URL

* P155 follows

* P156 followed_by

* P921 main_subject
P143 imported from Wikimedia project
P248 stated in

## Create query to get the statement UUID and value

SPARQL query to be sent to the Wikidata Query Service (WDQS)

In [43]:
property = prop_list[2]

# create a string for the query
query = '''
select distinct ?qid '''

query += '?' + property['variable'] + '_value '
query += '?' + property['variable'] + '_statement '

query += 'where {'

query += '''
  VALUES ?qid
{
''' + item_qids + '''
}

'''


query += '?qid wdt:' + property['pid'] + ' ?' + property['variable'] + '_value.\n'
query += '?qid p:' + property['pid'] + ' ?' + property['variable'] + '_statement.\n'
query += '}'

#print(query)

Send the query to the WDQS and extract the results from the returned JSON

In [44]:
# Send SPARQL query to the Wikidata Query Service
print('retrieving data from Wikidata')

# send request to Wikidata Query Service
response = requests.post(endpoint, data=query, headers=requestheader)
data = response.json()

# extract the values from the response JSON
results = data['results']['bindings']
#print(json.dumps(results, indent = 2))

retrieving data from Wikidata


Extract the qids, values, and statement UUIDs from the response data and store in a list of dicts (`statements_list`). Note: the `extract_statement_uuid()` function returns `uuid`,`qid` extracted from the statement IRI, so they have to be re-assembled to make the statement identifier: *qid*-*UUID*.

In [45]:
statements_list = [] # create list of data for statements, not really used for anything yet
statement_uuids = '' # create VALUES list for statmements
for result in results:
    row_dict = {}
    row_dict = {'qid': extract_qnumber(result['qid']['value'])}

    if property['value_type'] == 'item':
        # remove wd: 'http://www.wikidata.org/entity/'
        value = extract_qnumber(result[property['variable'] + '_value']['value'])
    else:
        value = result[property['variable'] + '_value']['value']
    row_dict[property['variable'] + '_value'] = value
    uuid, temp_qid = extract_statement_uuid(result[property['variable'] + '_statement']['value'])
    row_dict[property['variable'] + '_statement'] = uuid
    statement_uuids += 'wds:' + temp_qid + '-' + uuid + '\n'
    statements_list.append(row_dict)

# remove trailing newline
statement_uuids = statement_uuids[:len(statement_uuids)-1]

#print(statement_uuids)
#print(statements_list)

I used this query:
```
select distinct ?gprop ?prop_label where {
     VALUES ?id
    {
wd:Q7311312
wd:Q4984286
wd:Q15749660
etc.
    }
?id p:P236 ?issn_statement.
?issn_statement prov:wasDerivedFrom ?reference.
?reference ?prop ?value.
?gprop wikibase:reference ?prop.
?gprop rdfs:label ?prop_label.
filter(lang(?prop_label)='en')
}
```
to find out what reference properties were in use in the journals of interest. They were:

```
gprop   prop_label
wd:P143  imported from Wikimedia project
wd:P248  stated in
wd:P813  retrieved
wd:P854  reference URL
wd:P4327 BHL bibliography ID
```
The only Wikimedia project they were imported from was Wikipedia, so not a useful one to track. I don't think BHL ID is either, so that leaves P248, P813, and P845 as the useful ones to track.

In [46]:
# create a string for the query
query = 'select distinct ?label ?' + property['variable'] + ' ?statement ?reference ?statedIn ?refUrl ?retrDate ?retrPrecision where {'

query += '''
  VALUES ?statement
{
''' + statement_uuids + '''
}

'''
query += '''
?statement ps:''' + property['pid'] + ' ?' + property['variable'] + '''.
?qid p:''' + property['pid'] + ''' ?statement.
?qid rdfs:label ?label.
filter(lang(?label)='en')
optional{
?statement prov:wasDerivedFrom ?reference.
?reference pr:P248 ?statedIn.
}
optional{
?statement prov:wasDerivedFrom ?reference.
?reference pr:P854 ?refUrl.
}
optional{
?statement prov:wasDerivedFrom ?reference.
?reference prv:P813 ?retrievedNode.
?retrievedNode wikibase:timeValue ?retrDate.
?retrievedNode wikibase:timePrecision ?retrPrecision.
}
}'''

print(query)

select distinct ?label ?instance_of ?statement ?reference ?statedIn ?refUrl ?retrDate ?retrPrecision where {
  VALUES ?statement
{
wds:Q68836075-524F3A33-FD71-47D6-84A6-2C4E008B4BF0
wds:Q91678397-5cc36403-4360-14b0-3f4e-879b88125a8c
wds:Q62259203-6E280AAE-1E0B-45B8-8F7B-32B3ECA5FF34
wds:Q62259203-80C4942D-123A-46CB-BF81-A25A6B00C307
wds:Q88977207-735CC0AB-754A-49FE-8FF1-22799E2D58CE
wds:Q63871731-7A024214-F5CB-425A-B167-4571CA6C7B0A
wds:Q63871802-E9E66F81-0778-4F51-963C-1F88FFC19382
wds:Q63871713-EC003478-FBE5-4812-B414-868AC673033D
wds:Q50814896-DF665463-9CB2-4F05-A287-D5359512EEF8
wds:Q50814896-FDBA9022-8407-4FCE-BCEE-425E1BFB98CE
wds:Q50814896-DF665463-9CB2-4F05-A287-D5359512EEF8
wds:Q50814896-FDBA9022-8407-4FCE-BCEE-425E1BFB98CE
wds:Q50426286-83586CB7-1672-447E-A9C1-CE3D4DE88E18
wds:Q50426286-975B042C-A24C-4F39-A802-1746A1D7CE9A
wds:Q50426286-83586CB7-1672-447E-A9C1-CE3D4DE88E18
wds:Q50426286-975B042C-A24C-4F39-A802-1746A1D7CE9A
wds:Q15759717-A3C798F2-297B-41B8-A44E-B51678E43752
wd

In [47]:
# send request to Wikidata Query Service
response = requests.post(endpoint, data=query, headers=requestheader)
data = response.json()

# extract the values from the response JSON
results = data['results']['bindings']
results

[{'statement': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/statement/Q50815593-87C17CD1-F50C-4BDE-804C-0AC06F958B37'},
  'reference': {'type': 'uri',
   'value': 'http://www.wikidata.org/reference/e544413e0106f7253a231dfffe136d9d9fff837a'},
  'retrDate': {'datatype': 'http://www.w3.org/2001/XMLSchema#dateTime',
   'type': 'literal',
   'value': '2018-05-10T00:00:00Z'},
  'retrPrecision': {'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
   'type': 'literal',
   'value': '11'},
  'instance_of': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/Q773668'},
  'label': {'xml:lang': 'en',
   'type': 'literal',
   'value': 'Journal of French and Francophone Philosophy'},
  'statedIn': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/Q1227538'}},
 {'statement': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/statement/Q50426426-CC1BBEFB-DA93-4EF8-9D9D-899A08417EC2'},
  'reference': {'type': 'uri',
   'value': 'http://www.wikidata.org/refe

In [48]:
# extract results
references_list = []
for result in results:
    row_dict = {}
    row_dict[property['variable'] + '_uuid'], row_dict['qid'] = extract_statement_uuid(result['statement']['value'])
    row_dict[property['variable']] = result[property['variable']]['value']
    row_dict['label'] = result['label']['value']
    try:
        row_dict[property['variable'] + '_ref1_hash'] = extract_qnumber(result['reference']['value'])
    except:
        row_dict[property['variable'] + '_ref1_hash'] = ''
    try:
        row_dict[property['variable'] + '_ref1_statedIn'] = extract_qnumber(result['statedIn']['value'])
    except:
        row_dict[property['variable'] + '_ref1_statedIn'] = ''
    try:
        row_dict[property['variable'] + '_ref1_referenceUrl'] = result['refUrl']['value']
    except:
        row_dict[property['variable'] + '_ref1_referenceUrl'] = ''
    try:
        row_dict[property['variable'] + '_ref1_retrieved_val'] = result['retrDate']['value']
    except:
        row_dict[property['variable'] + '_ref1_retrieved_val'] = ''
    try:
        row_dict[property['variable'] + '_ref1_retrieved_prec'] = result['retrPrecision']['value']
    except:
        row_dict[property['variable'] + '_ref1_retrieved_prec'] = ''
    references_list.append(row_dict)
    

print(json.dumps(references_list, indent=2))

[
  {
    "instance_of_uuid": "87C17CD1-F50C-4BDE-804C-0AC06F958B37",
    "qid": "Q50815593",
    "instance_of": "http://www.wikidata.org/entity/Q773668",
    "label": "Journal of French and Francophone Philosophy",
    "instance_of_ref1_hash": "e544413e0106f7253a231dfffe136d9d9fff837a",
    "instance_of_ref1_statedIn": "Q1227538",
    "instance_of_ref1_referenceUrl": "",
    "instance_of_ref1_retrieved_val": "2018-05-10T00:00:00Z",
    "instance_of_ref1_retrieved_prec": "11"
  },
  {
    "instance_of_uuid": "CC1BBEFB-DA93-4EF8-9D9D-899A08417EC2",
    "qid": "Q50426426",
    "instance_of": "http://www.wikidata.org/entity/Q773668",
    "label": "Estudos de Religi\u00e3o",
    "instance_of_ref1_hash": "e544413e0106f7253a231dfffe136d9d9fff837a",
    "instance_of_ref1_statedIn": "Q1227538",
    "instance_of_ref1_referenceUrl": "",
    "instance_of_ref1_retrieved_val": "2018-05-10T00:00:00Z",
    "instance_of_ref1_retrieved_prec": "11"
  },
  {
    "instance_of_uuid": "FDBA9022-8407-4FCE-BC

In [None]:
# create the list of column headers
fieldnames = ['qid', property['variable'] + '_uuid', property['variable'], property['variable'] + '_ref1_hash', property['variable'] + '_ref1_statedIn', property['variable'] + '_ref1_referenceUrl', property['variable'] + '_ref1_retrieved_val', property['variable'] + '_ref1_retrieved_prec', 'label']

# write the data to a CSV file
print('writing data to file')
vbc.writeDictsToCsv(references_list, home + '/divinity-law/test-journal-instanceof.csv', fieldnames)

print('done')