Materialize entailed Wikidata triples and federated queries using Python (2020-11-28)

(c) 2020 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0

Author: Steve Baskauf

# Set configuration and define functions

Run the first code cell before any of the others: it sets the configuration and defines the functions that the later cells use.

These queries run on a local installation of Apache Jena Fuseki (<https://jena.apache.org/documentation/fuseki2/>) operating as a localhost server on port 3030 (the default). After downloading the .zip file, uncompress it somewhere where you can find it (like your home folder). To start the server from the console (Terminal on Mac, Command Prompt on Windows), change to the directory `apache-jena-fuseki-...` that you unzipped and execute the start command. Alternatively, you can add the directory to your system path and start the server from anywhere.

NOTE: it is important that Fuseki be started with the `--update` option, otherwise SPARQL UPDATE operations are disabled. The syntax to start Fuseki from the command line is:

```
java -Xmx1200M -jar fuseki-server.jar --update --loc=dataDir /myDataset
```

If you don't care about where the data are stored and you want to set the dataset name when you upload data, you can use:

```
java -Xmx1200M -jar fuseki-server.jar --update
```

For details on using SPARQL UPDATE with Fuseki, see Bob DuCharme's blog post: <http://www.bobdc.com/blog/getting-started-with-sparql-up/>

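The dataset name must be assigned to the `dataset_name` variable in the configuration section of the first cell of the script. It is used to build the URLs of the local query and update endpoints, so it must match the name of the dataset on the Fuseki server.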

In [None]:
# Materialize entailed Wikidata triples and federated queries using Python (2020-11-28)
# (c) 2020 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf
    
from pathlib import Path
import requests
from time import sleep
import json
import csv
import os
import sys # Read CLI arguments

# Configure

# Catalog of the local named graphs that can be compared against Wikidata.
# The position of an entry in the list is the graph_number used by the cells below.
# Each entry has these keys:
#   item_values_path: path to a CSV file with a "qid" column listing the items to screen by
#       (read by load_qids). If not screening by Q ID, set item_values_path to empty string ''
#   iri: IRI of the named graph in the local triplestore
#   item_screen_graph_pattern: SPARQL graph pattern that restricts ?qid when the items are
#       defined by a query rather than by a list; empty string '' when not used
graph_data = [
    { # 0=Divinity journals
        'item_values_path': '../../vandycite/journals/journal-div-qids.csv', 
        'iri': 'http://journals', 
        'item_screen_graph_pattern': ''
    },
    { # 1=Bluffton University presidents
        'item_values_path': '', 
        'iri': 'http://bluffton', 
        'item_screen_graph_pattern': '''  ?statement1 ps:P108  wd:Q886141 .
  ?qid p:P108 ?statement1.
  ?statement2 ps:P39 wd:Q61061.
  ?qid p:P39 ?statement2.
'''
    },
    { # 2=Fine Arts Gallery works (explicitly listed)
        'item_values_path': '../../vandycite/gallery_works/gallery_works_to_write.csv', 
        'iri': 'http://gallery', 
        'item_screen_graph_pattern': ''
    },
    { # 3=people who work at Bluffton (defined by query)
        # NOTE: shares the named graph IRI with entry 1; only the screening pattern differs
        'item_values_path': '', 
        'iri': 'http://bluffton', 
        'item_screen_graph_pattern': '''  ?statement1 ps:P108  wd:Q886141 .
  ?qid p:P108 ?statement1.
'''
    },
    { # 4=people affiliated with a Vanderbilt unit (defined by query)
        # the pattern also binds ?unit, which Query 5 in query_data uses to get the unit label
        'item_values_path': '', 
        'iri': 'http://researchers', 
        'item_screen_graph_pattern': '''  ?unit wdt:P749+ wd:Q29052.
  ?qid wdt:P1416 ?unit.
'''
    },
    { # 5=people affiliated with a Vanderbilt unit (defined by list)
        'item_values_path': '../../vandycite/researchers/vanderbilt-employees.csv', 
        'iri': 'http://researchers', 
        'item_screen_graph_pattern': ''
    }
]

# Catalog of the comparisons that federated_query can run.
# The position of an entry in the list is the query_number used by the cells below.
# Each entry has these keys:
#   variables: names (without the leading "?") of the variables to SELECT; they are also
#       used as the initial CSV column headers
#   subquery: SPARQL graph pattern that is evaluated against both the local named graph
#       and the Wikidata Query Service
#   local_minus: True  = report bindings found in Wikidata minus those found locally
#                False = report bindings found locally minus those found in Wikidata
#   order_by: name of the variable to sort by; empty string '' for no ORDER BY clause
query_data = [
    { # Query 0: Labels in Wikidata that aren't in the local data
        'variables': [
                'qid',
                'name'
            ],
        'subquery': '''  ?qid rdfs:label ?name.
''',
        'local_minus': True,
        'order_by': 'qid'
    },
    { # Query 1: Statements made in Wikidata that weren't made in the local data
        'variables': [
                'qid',
                'name',
                'wdt',
                'value'
            ],
        # the filter keeps only "truthy" direct properties (the 37 characters end with the "P")
        'subquery': '''  ?qid rdfs:label ?name.
  filter(lang(?name) = "en")
  ?qid ?wdt ?value.
  filter(substr(str(?wdt),1,37)="http://www.wikidata.org/prop/direct/P")
''',
        'local_minus': True,
        'order_by': 'qid'
    },
    { # Query 2: References in the local data that aren't in Wikidata
        'variables': [
                'qid',
                'name',
                'pr',
                'value'
            ],
        # the filter keeps only reference properties (the 40 characters end with the "P")
        'subquery': '''  ?qid rdfs:label ?name.
  filter(lang(?name) = "en")
  ?qid ?p ?statement.
  ?statement prov:wasDerivedFrom ?reference.
  ?reference ?pr ?value.
  filter(substr(str(?pr),1,40)="http://www.wikidata.org/prop/reference/P")
''',
        'local_minus': False,
        'order_by': 'qid'
    },
    { # Query 3: English labels in the local data that aren't in Wikidata
        'variables': [
                'qid',
                'name'
            ],
        'subquery': '''  ?qid rdfs:label ?name.
  filter(lang(?name) = "en")
''',
        'local_minus': False,
        'order_by': ''
    },
    { # Query 4: Items and their labels that are in Wikidata but not locally
        'variables': [
                'qid',
                'label'
            ],
        'subquery': '''  ?qid rdfs:label ?label.
  filter(lang(?label) = "en")
''',
        'local_minus': True,
        'order_by': 'qid'
    },
    { # Query 5: People, their names, and the labels of their units that are in Wikidata but not locally
        'variables': [
                'qid',
                'label',
                'unitLabel'
            ],
        # NOTE: this subquery does not relate ?unit to ?qid itself; it relies on the
        # item_screen_graph_pattern of the chosen graph (e.g. graph 4) to bind ?unit.
        'subquery': '''  ?qid rdfs:label ?label.
  filter(lang(?label) = "en")
  ?unit rdfs:label ?unitLabel.
  filter(lang(?unitLabel) = "en")
''',
        'local_minus': True,
        'order_by': 'qid'
    },
    { # Query 6: ORCIDs of Vanderbilt people in Wikidata but not locally
        'variables': [
                'qid',
                'label',
                'orcid'
            ],
        'subquery': '''  ?qid rdfs:label ?label.
  filter(lang(?label) = "en")
  ?qid wdt:P496 ?orcid.
''',
        'local_minus': True,
        'order_by': 'qid'
    },
    { # Query 7: Items that are in Wikidata but not locally
        'variables': [
                'qid'
            ],
        # no subquery: only the item list / screening pattern of the graph is compared
        'subquery': '',
        'local_minus': True,
        'order_by': 'qid'
    }
]

# Name of the dataset on the local Fuseki server; it becomes part of both endpoint URLs below
dataset_name = 'data'

# number of seconds to wait between queries to SPARQL endpoint
sparql_sleep = 0.1
# path to the home directory; supposed to work for both Win and Mac
home = str(Path.home())

# local Fuseki endpoints (read and write) and the remote Wikidata Query Service endpoint
sparql_endpoint = f'http://localhost:3030/{dataset_name}/query'
update_endpoint = f'http://localhost:3030/{dataset_name}/update'
remote_endpoint = 'https://query.wikidata.org/sparql'
accept_media_type = 'application/json'

# SPARQL prefix declarations that are prepended to every query and update
namespaces = (
    '\n'
    'prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n'
    'prefix prov: <http://www.w3.org/ns/prov#>\n'
    'prefix wikibase: <http://wikiba.se/ontology#>\n'
    'prefix  wd:  <http://www.wikidata.org/entity/>\n'
    'prefix  wdt: <http://www.wikidata.org/prop/direct/>\n'
    'prefix  p:  <http://www.wikidata.org/prop/>\n'
    'prefix  pq:  <http://www.wikidata.org/prop/qualifier/>\n'
    'prefix  pr:  <http://www.wikidata.org/prop/reference/>\n'
    'prefix  ps:  <http://www.wikidata.org/prop/statement/>\n'
    'prefix  pqv:  <http://www.wikidata.org/prop/qualifier/value/>\n'
    'prefix  prv:  <http://www.wikidata.org/prop/reference/value/>\n'
    'prefix  psv:  <http://www.wikidata.org/prop/statement/value/>\n'
)

# Descriptions of the complex Wikidata value types whose "simple value" literal must be
# materialized from a value node. Each entry has these keys:
#   string: name of the value type (used only for progress messages)
#   local_names: local names of the wikibase: properties of the value node; the object of
#       the property at index i is bound to ?literal<i> in the generated SPARQL
#   datatype: IRI of the datatype that the simple value literal should have
#   bind: SPARQL expression that is bound to ?literal, i.e. the simple value
value_types = [
    {'string': 'time',
     'local_names': ['timeValue'],
     'datatype': 'http://www.w3.org/2001/XMLSchema#dateTime',
     'bind': '?literal0'},
    {'string': 'quantity',
     'local_names': ['quantityAmount'],
     'datatype': 'http://www.w3.org/2001/XMLSchema#decimal',
     'bind': '?literal0'},
    {'string': 'globecoordinate',
     'local_names': ['geoLatitude', 'geoLongitude'],
     'datatype': 'http://www.opengis.net/ont/geosparql#wktLiteral',
     # Wikidata serializes coordinates as "Point(longitude latitude)"^^geo:wktLiteral, so
     # the longitude (?literal1) goes first and strdt() attaches the datatype; a bare
     # concat() would only produce a plain string that can never match Wikidata's literal.
     'bind': 'strdt(concat("Point(", str(?literal1), " ", str(?literal0), ")"), '
             '<http://www.opengis.net/ont/geosparql#wktLiteral>)'}
]

# The three kinds of Wikidata properties that can point to a value node; each is a path
# segment of the property IRI (http://www.wikidata.org/prop/<type>/value/P...)
property_types = ['statement', 'qualifier', 'reference']

def generate_header_dictionary(accept_media_type):
    """Build the HTTP request headers for a SPARQL query sent by POST.

    accept_media_type: media type to request from the endpoint in the Accept header.
    Returns a dictionary of header names and values; the query itself is expected
    to be sent unencoded in the request body (Content-Type application/sparql-query).
    """
    return {
        'Accept': accept_media_type,
        'Content-Type': 'application/sparql-query',
        'User-Agent': 'TestBot/0.1 (mailto:steve.baskauf@vanderbilt.edu)',
    }

# Headers passed to send_query for the SPARQL query (read) requests made in this notebook
requestheader = generate_header_dictionary(accept_media_type)

def generate_update_header_dictionary():
    """Build the HTTP request headers for a SPARQL UPDATE sent by POST.

    Returns a dictionary of header names and values. No Accept header is set;
    the update is expected to be sent unencoded in the request body
    (Content-Type application/sparql-update).
    """
    return {
        'Content-Type': 'application/sparql-update',
        'User-Agent': 'TestBot/0.1 (mailto:steve.baskauf@vanderbilt.edu)',
    }

# Headers passed to send_update for the SPARQL UPDATE (write) requests made in this notebook
updateheader = generate_update_header_dictionary()

def extract_qnumber(iri):
    """Return the Q ID from a Wikidata entity IRI.

    The expected pattern is http://www.wikidata.org/entity/Q6386232, whose fifth
    slash-separated piece (index 4) is the Q ID. An IndexError is raised if the
    string has fewer than five pieces.
    """
    return iri.split('/')[4]

def extract_pnumber(iri):
    """Return the property ID (P number) from a Wikidata property IRI.

    Property IRIs come in several namespaces of different depth, for example
    http://www.wikidata.org/prop/direct/P31 or
    http://www.wikidata.org/prop/reference/value/P813, so the ID is taken from
    the end of the IRI: everything after the last slash. A string without any
    slash is returned unchanged.
    """
    return iri.rsplit('/', 1)[-1]

def find_label(pid, property_labels):
    """Look up the label of a property in a list of dictionaries.

    pid: property ID to search for.
    property_labels: list of dictionaries that each have a 'pid' and a 'label' key.
    Returns the label of the first entry whose 'pid' equals pid, or the empty
    string if there is no such entry.
    """
    matching_labels = (entry['label'] for entry in property_labels if entry['pid'] == pid)
    return next(matching_labels, '')

def read_dict(filename):
    """Read a CSV file into a list of dictionaries.

    filename: path to a UTF-8 encoded CSV file whose first row holds the column headers.
    Returns a list with one dictionary per data row, keyed by the column headers.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))

def write_dicts_to_csv(table, filename, fieldnames):
    """Write a list of dictionaries to a CSV file.

    table: iterable of dictionaries, one per output row.
    filename: path of the UTF-8 encoded file to create (an existing file is overwritten).
    fieldnames: column headers, in output order; also the keys looked up in each row.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(table)

def load_qids(item_values_path):
    """Build a SPARQL VALUES clause for ?qid from a CSV file of items.

    item_values_path: path to a CSV file that has a 'qid' column.
    Returns a string containing a VALUES block that lists every Q ID in the
    file with the wd: prefix, one per line.
    """
    # Load item data from csv
    print('loading item data from file')
    items = read_dict(item_values_path)
    print('done loading')
    print()

    # one prefixed Q ID per line, with no newline after the last one
    item_qids = '\n'.join('wd:' + item['qid'] for item in items)

    return '''
          VALUES ?qid
        {
''' + item_qids + '''
        }
'''

def send_query(query, query_type, endpoint, requestheader):
    """POST a SPARQL query to an endpoint and return the raw response body.

    query: text of the SPARQL query, including any prefix declarations.
    query_type: kept for compatibility with existing callers; it does not affect
        the result. Callers receive the unparsed response text for every query
        form and parse it themselves (e.g. with json.loads for SELECT results).
    endpoint: URL of the SPARQL query endpoint.
    requestheader: dictionary of HTTP request headers.

    Returns the response body as a string. A message is printed when the endpoint
    answers with an HTTP error status, but the body is still returned so that the
    caller can inspect the error text.
    """
    print('querying SPARQL endpoint to acquire item metadata')
    # Encode explicitly: a str body would be sent as ISO-8859-1 by the HTTP layer and
    # fail on characters outside Latin-1, whereas SPARQL queries must be sent as UTF-8.
    body = query.encode('utf-8') if isinstance(query, str) else query
    response = requests.post(endpoint, data=body, headers=requestheader)
    print('done retrieving data')

    if not response.ok:
        print('warning: endpoint returned HTTP status', response.status_code)

    return response.text

def send_update(query, endpoint, requestheader):
    """POST a SPARQL UPDATE request to an endpoint and report the outcome.

    query: text of the SPARQL update, including any prefix declarations.
    endpoint: URL of the SPARQL update endpoint.
    requestheader: dictionary of HTTP request headers.

    Nothing is returned. 'update complete' is printed only when the server
    accepted the request; otherwise the HTTP status and the response body are
    printed (for example when the server does not allow updates).
    """
    # Encode explicitly: a str body would be sent as ISO-8859-1 by the HTTP layer and
    # fail on characters outside Latin-1, whereas SPARQL updates must be sent as UTF-8.
    body = query.encode('utf-8') if isinstance(query, str) else query
    response = requests.post(endpoint, headers=requestheader, data=body)

    if response.ok:
        print('update complete')
    else:
        print('update failed with HTTP status', response.status_code)
        print(response.text)

def _build_federated_query(query_spec, graph_spec, item_values_list):
    """Assemble the SELECT ... MINUS query that compares the local graph with Wikidata."""
    select_string = ''.join('?' + variable + ' ' for variable in query_spec['variables'])

    # set the direction of the comparison
    local_clause = 'GRAPH <' + graph_spec['iri'] + '> '
    remote_clause = 'SERVICE <' + remote_endpoint + '> '
    if query_spec['local_minus']:
        first_service, minus_service = remote_clause, local_clause
    else:
        first_service, minus_service = local_clause, remote_clause

    # the same graph pattern is evaluated on both sides of the MINUS
    pattern = item_values_list + graph_spec['item_screen_graph_pattern'] + query_spec['subquery']

    query = '''
    SELECT distinct ''' + select_string + '''
    WHERE {
    ''' + first_service + '{' + pattern + '''}
      minus
      {
    ''' + minus_service + '{' + pattern + '''}
      }
    }
    '''
    if query_spec['order_by'] != '':
        query += 'ORDER BY ?' + query_spec['order_by'] + '''
    '''
    return query


def _datatype_local_name(datatype_iri):
    """Return the part of a datatype IRI after the '#', or after the last '/' if it has no '#'."""
    if '#' in datatype_iri:
        return datatype_iri.rsplit('#', 1)[-1]
    return datatype_iri.rsplit('/', 1)[-1]


def _flatten_result(result, variables, fieldnames):
    """Turn one SPARQL JSON result into a flat row; extends fieldnames with any extra columns."""
    row_dict = {}
    for variable in variables:
        # SPARQL JSON results omit variables that are unbound in a solution
        if variable not in result:
            continue
        binding = result[variable]
        value = binding['value']
        if binding['type'] == 'uri':
            # abbreviate Wikidata item and property IRIs to their Q and P IDs
            if value.startswith('http://www.wikidata.org/entity/Q'):
                row_dict[variable] = extract_qnumber(value)
            elif value.startswith('http://www.wikidata.org/prop/'):
                row_dict[variable] = extract_pnumber(value)
            else:
                row_dict[variable] = value
            continue

        row_dict[variable] = value
        if 'xml:lang' in binding:
            lang_column = variable + '_lang'
            row_dict[lang_column] = binding['xml:lang']
            if lang_column not in fieldnames:
                fieldnames.append(lang_column)
        if 'datatype' in binding:
            datatype_column = variable + '_datatype'
            row_dict[datatype_column] = _datatype_local_name(binding['datatype'])
            if datatype_column not in fieldnames:
                fieldnames.append(datatype_column)
    return row_dict


def federated_query(query_number, query_data, graph_number, graph_data):
    """Compare a local named graph with Wikidata using a federated MINUS query.

    query_number: index of the comparison to run in query_data.
    query_data: list of query descriptions with the keys 'variables', 'subquery',
        'local_minus' and 'order_by'.
    graph_number: index of the graph to use in graph_data.
    graph_data: list of graph descriptions with the keys 'item_values_path',
        'iri' and 'item_screen_graph_pattern'.

    The query is sent to the local endpoint, which reaches Wikidata through a
    SERVICE clause. When 'local_minus' is true the result is what Wikidata has
    minus what the local graph has; otherwise the direction is reversed.

    Returns a tuple (metadata_list, fieldnames). metadata_list has one dictionary
    per result; Wikidata item and property IRIs are abbreviated to Q and P IDs,
    and literals additionally get '<variable>_lang' and '<variable>_datatype'
    entries when a language tag or datatype is present. fieldnames lists the
    selected variables followed by any such extra columns that occurred. A
    variable that is unbound in a result is left out of that result's dictionary.
    """
    query_spec = query_data[query_number]
    graph_spec = graph_data[graph_number]

    # load the Q IDs if any
    if graph_spec['item_values_path'] == '':
        item_values_list = ''
    else:
        item_values_list = load_qids(graph_spec['item_values_path'])

    query = _build_federated_query(query_spec, graph_spec, item_values_list)
    #print(query)

    # ----------------
    # send request to Fuseki endpoint
    # ----------------
    # send_query returns the raw response text, which is parsed here
    text = send_query(namespaces + query, 'select', sparql_endpoint, requestheader)
    #print(text)

    # extract the values from the response JSON
    results = json.loads(text)['results']['bindings']
    #print(json.dumps(results, indent=2))

    # ----------------
    # extract results
    # ----------------
    fieldnames = list(query_spec['variables'])
    metadata_list = [
        _flatten_result(result, query_spec['variables'], fieldnames)
        for result in results
    ]

    #print(json.dumps(metadata_list, indent=2))
    return metadata_list, fieldnames


# Utilities for setting up triplestore for querying

The first cell clears all graphs from the dataset. 

After clearing the graphs, Wikidata triples are generated by Ruby gem `rdf-tabular` (<https://github.com/ruby-rdf/rdf-tabular>) according to the W3C "Generating RDF from Tabular Data on the Web" Recommendation (<https://www.w3.org/TR/csv2rdf/>). The command to generate the output serialized as RDF/Turtle and redirected to a file is:

```
rdf serialize --input-format tabular --output-format ttl --metadata csv-metadata.json --minimal > output.ttl
```

The resulting file is loaded into Fuseki. The second cell then generates the missing triples for expressing the data that are available via the Wikidata Query Service, but that aren't generated directly by rdf-tabular.

In [None]:
# !!!!!! Warning! Warning! Warning! !!!!!!!!!!
# This command deletes all triples in the triplestore! There is no way to recover the data, so use with caution!
query = 'drop all'
# Alternative that removes only one named graph (uncomment it and edit the IRI as needed):
#query = 'drop graph <http://bluffton>'

# send the update to the local update endpoint
# (send_update has no return value, so data is always None)
data = send_update(query, update_endpoint, updateheader)
# Fuseki doesn't send any response for update commands
#print('response:\n', data)


In [None]:
# Materialize, in the local named graph, the triples that the Wikidata Query Service has
# but that rdf-tabular does not generate directly:
# 1. the simple value statements whose literal is built from the value nodes
# 2. the "truthy" (direct) statements that are derived from the statement value statements

# NOTE: The xsd:dateTime and xsd:decimal datatypes should be generated automatically by the datatype
# designation in the csv-metadata.json schema (for integers, a ".0" is appended to make them decimals).
# The globecoordinate literal is produced entirely by the bind expression in value_types, so check
# that it matches the datatype of the literals acquired from the Wikidata Query Service.

graph_number = 2 # set the appropriate graph number for the graph to be supplemented

# one update per combination of value type and property type
for value_type in value_types:
    for property_type in property_types:
        # one triple pattern per component of the value node (?literal0, ?literal1, ...)
        component_patterns = ''
        for ln_index, local_name in enumerate(value_type['local_names']):
            component_patterns += '  ?value wikibase:' + local_name + ' ?literal' + str(ln_index) + '''.
        '''

        # ?pxv is a .../prop/<type>/value/P... property; the characters from position 46 on are
        # its P ID, which is appended to .../prop/<type>/ to get the simple value property
        query = '''
        with <''' + graph_data[graph_number]['iri'] + '''>
        insert {?reference ?directProp ?literal.}
        where {
          ?reference ?pxv ?value.
        ''' + component_patterns + '  bind(' + value_type['bind'] + ''' as ?literal)
        ''' + '  filter(substr(str(?pxv),1,45)="http://www.wikidata.org/prop/' + property_type + '''/value/")
          bind(substr(str(?pxv),46) as ?id)
          bind(iri(concat("http://www.wikidata.org/prop/''' + property_type + '''/", ?id)) as ?directProp)
          }
          '''
        #print(query)
        print('updating', property_type, value_type['string'])
        send_update(namespaces + query, update_endpoint, updateheader)

# Insert the missing "truthy" statements from statement value statements
# (the characters of ?ps from position 40 on are the P ID, including the "P")

query = '''
with <''' + graph_data[graph_number]['iri'] + '''>
insert {?item ?truthyProp ?value.}
where {
  ?item ?p ?statement.
  ?statement ?ps ?value.
  filter(substr(str(?ps),1,40)="http://www.wikidata.org/prop/statement/P")
  bind(substr(str(?ps),40) as ?id)
  bind(iri(concat("http://www.wikidata.org/prop/direct/", ?id)) as ?truthyProp)
  }
  '''
#print(query)
print('updating truthy statements')
send_update(namespaces + query, update_endpoint, updateheader)


# Federated queries

The following queries compare the local data in Fuseki against the "real" data in Wikidata by carrying out a federated query using the local data and data from the Wikidata Query Service.

The first cell performs a query (`subquery`) on both the local data and the data in Wikidata, and performs a minus operation to determine what bindings are present on Wikidata but not in the local dataset.

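The following cell runs a generic federated query: set `graph_number` and `query_number` to choose the graph and the query from the configuration section.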

In [None]:
# Choose the graph and the query by their positions in graph_data and query_data (see config section)
graph_number = 2 # Fine Arts Gallery works (explicitly listed)
query_number = 0 # labels in Wikidata that aren't in the local data

metadata_list, fieldnames = federated_query(query_number, query_data, graph_number, graph_data)

# a file is written only when the comparison found something
if metadata_list:
    write_dicts_to_csv(metadata_list, 'federated_test.csv', fieldnames)
else:
    print('no data retrieved')

print('done')

Special cell for the gallery data: it screens out the bad quantity results (decimal values with nothing after the decimal point) before the output file is written.

In [None]:
graph_number = 2 # gallery data (defined by list)
# Query 1 = statements in Wikidata but not locally. It is the query that selects the
# name, wdt and value variables that this cell depends on (Query 7 selects only qid).
query_number = 1

property_labels = read_dict('property_labels.csv')

metadata_list, fieldnames = federated_query(query_number, query_data, graph_number, graph_data)
# use a fixed set and order of columns, including the looked-up property label
fieldnames = ['qid', 'name', 'name_lang', 'wdt', 'wdt_label', 'value', 'value_datatype']

if len(metadata_list) == 0:
    print('no data retrieved')
else:
    # screen out the bad quantity results
    output = []
    for line in metadata_list:
        # a value counts as a whole number only if it can be parsed as a number at all
        try:
            is_whole_number = float(line['value']) % 1 == 0
        except (KeyError, ValueError):
            is_whole_number = False

        # the bad results are the decimals with nothing after the decimal point;
        # every other record is kept
        if line.get('value_datatype') == 'decimal' and is_whole_number:
            continue

        line['wdt_label'] = find_label(line['wdt'], property_labels)
        # keep only the output columns: a key that is not in fieldnames (e.g. value_lang
        # for a language-tagged value) would make csv.DictWriter raise a ValueError
        output.append({fieldname: line.get(fieldname, '') for fieldname in fieldnames})

    write_dicts_to_csv(output, '../../vandycite/gallery_works/statements_not_locally.csv', fieldnames)

print('done')

In [None]:
# Choose the graph and the query by their positions in graph_data and query_data (see config section)
graph_number = 4 # people affiliated with Vanderbilt units (defined by query)
query_number = 5 # people and the labels of their units, in Wikidata but not locally
# NOTE: this not only gets new people, but also people who have new unit affiliations

metadata_list, fieldnames = federated_query(query_number, query_data, graph_number, graph_data)

# a file is written only when the comparison found something
if metadata_list:
    write_dicts_to_csv(metadata_list, '../../vandycite/researchers/researchers_not_locally.csv', fieldnames)
else:
    print('no data retrieved')

print('done')

In [None]:
# Choose the graph and the query by their positions in graph_data and query_data (see config section)
graph_number = 5 # people affiliated with Vanderbilt units (defined by list)
query_number = 6 # people and their ORCIDs that are in Wikidata but not locally

metadata_list, fieldnames = federated_query(query_number, query_data, graph_number, graph_data)

# a file is written only when the comparison found something
if metadata_list:
    write_dicts_to_csv(metadata_list, '../../vandycite/researchers/orcids_not_locally.csv', fieldnames)
else:
    print('no data retrieved')

print('done')

# Tests

Left for historical purposes and future development

In [None]:
# Test to insert a single triple into the default graph
# (there is no "with <graph>" clause, so no named graph is targeted)

# the three bind() calls create the subject, predicate and object IRIs of the test triple
query = '''
insert {?s ?p ?o}
where {
bind(uri("http://test_subject") as ?s)
bind(uri("http://test_predicate") as ?p)
bind(uri("http://test_object") as ?o)
}
'''

# send the update to the local update endpoint
# (send_update has no return value, so data is always None)
data = send_update(query, update_endpoint, updateheader)
# Fuseki doesn't send any response for update commands
#print('response:\n', data)


In [None]:
# Test to construct missing value statements using value node values.
# This is the read-only counterpart of the insert update: it only shows the triples
# that would be generated for one value type and one property type.

# NOTE: The xsd:dateTime and xsd:decimal datatypes should be generated automatically by the datatype
# designation in the csv-metadata.json schema (for integers, a ".0" is appended to make them decimals).
# The globecoordinate literal is produced entirely by the bind expression in value_types, so check
# that it matches the datatype of the literals acquired from the Wikidata Query service.

value_type = value_types[2]        # globecoordinate
property_type = property_types[0]  # statement

query = '''
construct {?reference ?directProp ?literal.}
where {
  ?reference ?pxv ?value.
'''
# one triple pattern per component of the value node (?literal0, ?literal1, ...)
for ln_index, local_name in enumerate(value_type['local_names']):
    query += '  ?value wikibase:' + local_name + ' ?literal' + str(ln_index) + '.\n'
query += '  bind(' + value_type['bind'] + ' as ?literal)\n'
# the characters of ?pxv from position 46 on are the P ID of the value property
query += '  filter(substr(str(?pxv),1,45)="http://www.wikidata.org/prop/' + property_type + '''/value/")
  bind(substr(str(?pxv),46) as ?id)
  bind(iri(concat("http://www.wikidata.org/prop/''' + property_type + '''/", ?id)) as ?directProp)
  }
  '''
print(query)
data = send_query(namespaces + query, query.split(' ')[0].lower(), sparql_endpoint, requestheader)
print(data)

In [None]:
# Test to construct "truthy" statements from statement value statements
# This is the read-only counterpart of the truthy insert update: it only shows the
# triples that would be generated and does not write anything to the triplestore.

# The filter keeps the .../prop/statement/P... properties; the characters of ?ps from
# position 40 on (the P ID, including the "P") are appended to the .../prop/direct/ namespace.
query = '''
construct {?item ?truthyProp ?value.}
where {
  ?item ?p ?statement.
  ?statement ?ps ?value.
  filter(substr(str(?ps),1,40)="http://www.wikidata.org/prop/statement/P")
  bind(substr(str(?ps),40) as ?id)
  bind(iri(concat("http://www.wikidata.org/prop/direct/", ?id)) as ?truthyProp)
  }
  '''
print(query) 
# send_query returns the response body as text, whatever the second argument is
data = send_query(namespaces + query, query.split(' ')[0].lower(), sparql_endpoint, requestheader)
print(data)