In [39]:
from pathlib import Path
import requests
from time import sleep
import json
import csv
import os
import sys # Read CLI arguments

sparql_sleep = 0.1 # number of seconds to wait between queries to SPARQL endpoint
home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
sparql_endpoint = 'http://localhost:3030/bluffton/query'
update_endpoint = 'http://localhost:3030/bluffton/update'
accept_media_type = 'application/json'

namespaces = '''
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix prov: <http://www.w3.org/ns/prov#>
prefix wikibase: <http://wikiba.se/ontology#>
prefix  wd:  <http://www.wikidata.org/entity/>
prefix  p:  <http://www.wikidata.org/prop/>
prefix  pq:  <http://www.wikidata.org/prop/qualifier/>
prefix  pr:  <http://www.wikidata.org/prop/reference/>
prefix  ps:  <http://www.wikidata.org/prop/statement/>
prefix  pqv:  <http://www.wikidata.org/prop/qualifier/value/>
prefix  prv:  <http://www.wikidata.org/prop/reference/value/>
prefix  psv:  <http://www.wikidata.org/prop/statement/value/>
'''

value_types = ['time', 'quantity', 'globecoordinate']

def generate_header_dictionary(accept_media_type):
    user_agent_header = 'TestBot/0.1 (mailto:steve.baskauf@vanderbilt.edu)'
    requestHeaderDictionary = {
        'Accept' : accept_media_type,
        'Content-Type': 'application/sparql-query',
        'User-Agent': user_agent_header
    }
    return requestHeaderDictionary

requestheader = generate_header_dictionary(accept_media_type)

def generate_update_header_dictionary():
    user_agent_header = 'TestBot/0.1 (mailto:steve.baskauf@vanderbilt.edu)'
    requestHeaderDictionary = {
        'Content-Type': 'application/sparql-update',
        'User-Agent': user_agent_header
    }
    return requestHeaderDictionary

updateheader = generate_update_header_dictionary()

# extracts the qNumber from a Wikidata IRI
def extract_qnumber(iri):
    # pattern is http://www.wikidata.org/entity/Q6386232
    pieces = iri.split('/')
    return pieces[4]

def send_query(query, query_type, endpoint, requestheader):
    print('querying SPARQL endpoint to acquire item metadata')
    response = requests.post(endpoint, data=query, headers=requestheader)
    #print(response.text)
    print('done retrieving data')
    
    if query_type == 'select':
        data = response.json()
        # extract the values from the response JSON
        results = data['results']['bindings']
        #print(json.dumps(results, indent=2))
    if query_type == 'construct':
        data = response.text        
    else:
        data = response.text
    
    return(data)

def send_update(query, endpoint, requestheader):
    #print('sending SPARQL update')
    response = requests.post(endpoint, headers=requestheader, data = query)

    #print(response.text)
    print('update complete')

    data = response.text
    
    return(data)


In [19]:
# Test to insert a single triple into the default graph

query = '''
insert {?s ?p ?o}
where {
bind(uri("http://test_subject") as ?s)
bind(uri("http://test_predicate") as ?p)
bind(uri("http://test_object") as ?o)
}
'''

# update test
data = send_update(query, update_endpoint, updateheader)
# Fuseki doesn't send any response for update commands
#print('response:\n', data)


sending SPARQL update
update complete


In [43]:
# Delete all triples in the triplestore
query = 'drop all'

# update test
data = send_update(query, update_endpoint, updateheader)
# Fuseki doesn't send any response for update commands
#print('response:\n', data)


update complete


In [33]:
# Test to construct missing value statements using value node values.

# NOTE: The datatypes for xsd:dateTime and xsd:decimal should be generated automatically by the datatype 
# designation in the csv-metadata.json schema (for integers, a ".0" is appended to integers to make them decimals).
# However, I don't know how to force the geo:wktLiteral datatype in SPARQL construct, so this may end up not matching
# the datatype of literals acquired from the Wikidata Query service.

value_types = [
    {'string': 'time', 
     'local_names': ['timeValue'], 
     'datatype':'http://www.w3.org/2001/XMLSchema#dateTime',
     'bind': '?literal0'}, 
    {'string': 'quantity', 
     'local_names': ['quantityAmount'],
     'datatype': 'http://www.w3.org/2001/XMLSchema#decimal',
     'bind': '?literal0'}, 
    {'string': 'globecoordinate', 
     'local_names': ['geoLatitude', 'geoLongitude'],
     'datatype': 'http://www.opengis.net/ont/geosparql#wktLiteral',
     'bind': 'concat("Point(", str(?literal0), " ", str(?literal1), ")")'}
]

property_types = ['statement', 'qualifier', 'reference']

value_type = value_types[2]
property_type = property_types[0]

query = '''
construct {?reference ?directProp ?literal.}
where {
  ?reference ?pxv ?value.
'''
for ln_index in range(len(value_type['local_names'])):
    query += '  ?value wikibase:' + value_type['local_names'][ln_index] + ' ?literal' + str(ln_index) + '''.
'''
query += '  bind(' + value_type['bind'] + ''' as ?literal)
'''
query += '  filter(substr(str(?pxv),1,45)="http://www.wikidata.org/prop/' + property_type + '''/value/")
  bind(substr(str(?pxv),46) as ?id)
  bind(iri(concat("http://www.wikidata.org/prop/''' + property_type + '''/", ?id)) as ?directProp)
  }
  '''
print(query) 
data = send_query(namespaces + query, query.split(' ')[0].lower(), sparql_endpoint, requestheader)
print(data)


construct {?reference ?directProp ?literal.}

where {
  ?reference ?pxv ?value.
  ?value wikibase:geoLatitude ?literal0.
  ?value wikibase:geoLongitude ?literal1.
  bind(concat("Point(", str(?literal0), " ", str(?literal1), ")") as ?literal)
  filter(substr(str(?pxv),1,45)="http://www.wikidata.org/prop/statement/value/")
  bind(substr(str(?pxv),46) as ?id)
  bind(iri(concat("http://www.wikidata.org/prop/statement/", ?id)) as ?directProp)
  }
  
querying SPARQL endpoint to acquire item metadata
done retrieving data
@prefix schema: <http://schema.org/> .
@prefix pq:    <http://www.wikidata.org/prop/qualifier/> .
@prefix pr:    <http://www.wikidata.org/prop/reference/> .
@prefix ps:    <http://www.wikidata.org/prop/statement/> .
@prefix wikibase: <http://wikiba.se/ontology#> .
@prefix xsd:   <http://www.w3.org/2001/XMLSchema#> .
@prefix rdfs:  <http://www.w3.org/2000/01/rdf-schema#> .
@prefix wd:    <http://www.wikidata.org/entity/> .
@prefix p:     <http://www.wikidata.org/prop/> .
@pre

In [44]:
# Insert the missing value statements using values from value nodes

# NOTE: The datatypes for xsd:dateTime and xsd:decimal should be generated automatically by the datatype 
# designation in the csv-metadata.json schema (for integers, a ".0" is appended to integers to make them decimals).
# However, I don't know how to force the geo:wktLiteral datatype in SPARQL construct, so this may end up not matching
# the datatype of literals acquired from the Wikidata Query service.

value_types = [
    {'string': 'time', 
     'local_names': ['timeValue'], 
     'datatype':'http://www.w3.org/2001/XMLSchema#dateTime',
     'bind': '?literal0'}, 
    {'string': 'quantity', 
     'local_names': ['quantityAmount'],
     'datatype': 'http://www.w3.org/2001/XMLSchema#decimal',
     'bind': '?literal0'}, 
    {'string': 'globecoordinate', 
     'local_names': ['geoLatitude', 'geoLongitude'],
     'datatype': 'http://www.opengis.net/ont/geosparql#wktLiteral',
     'bind': 'concat("Point(", str(?literal0), " ", str(?literal1), ")")'}
]

property_types = ['statement', 'qualifier', 'reference']

#value_type = value_types[2]
#property_type = property_types[0]

for value_type in value_types:
    for property_type in property_types:
        query = '''
        insert {?reference ?directProp ?literal.}
        where {
          ?reference ?pxv ?value.
        '''
        for ln_index in range(len(value_type['local_names'])):
            query += '  ?value wikibase:' + value_type['local_names'][ln_index] + ' ?literal' + str(ln_index) + '''.
        '''
        query += '  bind(' + value_type['bind'] + ''' as ?literal)
        '''
        query += '  filter(substr(str(?pxv),1,45)="http://www.wikidata.org/prop/' + property_type + '''/value/")
          bind(substr(str(?pxv),46) as ?id)
          bind(iri(concat("http://www.wikidata.org/prop/''' + property_type + '''/", ?id)) as ?directProp)
          }
          '''
        #print(query) 
        print('updating', property_type, value_type['string'])
        data = send_update(namespaces + query, update_endpoint, updateheader)


updating statement time
update complete
updating qualifier time
update complete
updating reference time
update complete
updating statement quantity
update complete
updating qualifier quantity
update complete
updating reference quantity
update complete
updating statement globecoordinate
update complete
updating qualifier globecoordinate
update complete
updating reference globecoordinate
update complete


In [None]:
construct_query1 = '''
construct {?item ?directProp ?value.}
#select distinct ?p ?directProp
where {
  ?item ?p ?statement.
  ?statement ?ps ?value.
  filter(substr(str(?ps),1,39)="http://www.wikidata.org/prop/statement/")
  bind(substr(str(?ps),40) as ?id)
#  bind(substr(str(?p),30) as ?id)
  bind(iri(concat("http://www.wikidata.org/prop/direct/", ?id)) as ?directProp)
  }
'''

In [57]:
subquery = '''
  ?qid rdfs:label ?name.
  ?qid p:P108 ?statement1.
  ?statement1 ps:P108  wd:Q886141 .
  ?qid p:P39 ?statement2.
  ?statement2 ps:P39 wd:Q61061.
'''

query = '''
SELECT distinct ?qid ?name
WHERE {
  SERVICE <https://query.wikidata.org/sparql> {''' + subquery + '''}
  minus
{''' + subquery + '''}
}
'''

# ----------------
# send request to Fuseki endpoint
# ----------------
data = send_query(namespaces + query, query.split(' ')[0].lower(), sparql_endpoint, requestheader)
print(data)

'''
print('querying SPARQL endpoint to acquire item metadata')
response = requests.post(endpoint, data=query, headers=requestheader)
#print(response.text)
data = response.json()

# extract the values from the response JSON
results = data['results']['bindings']

print('done retrieving data')
print(json.dumps(results, indent=2))

# ----------------
# extract results
# ----------------

metadata_list = []
for result in results:
    row_dict = {}
    row_dict['qid'] = extract_qnumber(result['qid']['value'])
    row_dict['name'] = result['name']['value']
    metadata_list.append(row_dict)

print(json.dumps(metadata_list, indent=2))
'''
print()

querying SPARQL endpoint to acquire item metadata
done retrieving data
{ "head": {
    "vars": [ "qid" , "name" ]
  } ,
  "results": {
    "bindings": [
      { 
        "qid": { "type": "uri" , "value": "http://www.wikidata.org/entity/Q71157201" } ,
        "name": { "type": "literal" , "xml:lang": "nl" , "value": "Jane Wood" }
      } ,
      { 
        "qid": { "type": "uri" , "value": "http://www.wikidata.org/entity/Q98569118" } ,
        "name": { "type": "literal" , "xml:lang": "nl" , "value": "Noah Calvin Hirschy" }
      } ,
      { 
        "qid": { "type": "uri" , "value": "http://www.wikidata.org/entity/Q98569121" } ,
        "name": { "type": "literal" , "xml:lang": "nl" , "value": "Samuel K. Mosiman" }
      } ,
      { 
        "qid": { "type": "uri" , "value": "http://www.wikidata.org/entity/Q98569123" } ,
        "name": { "type": "literal" , "xml:lang": "nl" , "value": "Arthur S. Rosenberger" }
      } ,
      { 
        "qid": { "type": "uri" , "value": "http://www.wi