Install the dependencies:

In [92]:
!pip install SPARQLWrapper rdflib
from SPARQLWrapper import SPARQLWrapper, JSON
from getpass import getpass
from rdflib import Graph, URIRef, OWL

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Input Intavia SPARQL endpoint credentials:

In [20]:
# Ask interactively for the triplestore login so that no credentials are stored
# in the notebook; getpass reads the password without echoing it.
user = input("Username: ")
password = getpass("Password: ")

Username: intavia
Password: ··········


Fetch person URIs and their existing external IDs:

In [69]:
# Namespace declarations that are prepended to every SPARQL query in this notebook.
# NOTE(review): idmcore and idmcores differ only in the URI scheme (http vs https);
# confirm which of the two the triplestore data actually uses.
PREFIXES = """
PREFIX bds: <http://www.bigdata.com/rdf/search#>
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX bioc: <http://ldf.fi/schema/bioc/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX idm: <https://www.intavia.eu/idm/>
PREFIX idmcore: <http://www.intavia.eu/idm-core/>
PREFIX idmcores: <https://www.intavia.eu/idm-core/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
"""

# Query text: for the two listed named graphs, select every crm:E21_Person or
# idmcore:Person_Proxy together with its owl:sameAs targets, skipping targets
# that start with one of the three filtered national-source prefixes.
# ?ext_ns is the target URI up to and including the last "/", ?ext_id the rest.
person_links_query = PREFIXES + """
SELECT * {
  VALUES ?g { <http://apis.acdh.oeaw.ac.at/data> <http://ldf.fi/nbf/data> }
  GRAPH ?g {    
    VALUES ?class { crm:E21_Person idmcore:Person_Proxy }
    ?s a ?class ;
       owl:sameAs ?ext_uri .
    # Filter out sameAs links to national data sources
    FILTER (!STRSTARTS(STR(?ext_uri), "http://ldf.fi/nbf/"))
    FILTER (!STRSTARTS(STR(?ext_uri), "https://apis.acdh.oeaw.ac.at/"))
    FILTER (!STRSTARTS(STR(?ext_uri), "https://www.slovenska-biografija.si/"))
    BIND (REPLACE(STR(?ext_uri), "[^/]+$", "") AS ?ext_ns)
    BIND (REPLACE(STR(?ext_uri), "^.+/", "") AS ?ext_id)
  }
}
#LIMIT 1
"""

# Endpoint setup: authenticated POST request, JSON result format.
sparql = SPARQLWrapper("https://triplestore.acdh-dev.oeaw.ac.at/intavia/sparql")
sparql.setCredentials(user, password)
sparql.setMethod('POST')
sparql.setReturnFormat(JSON)
sparql.setQuery(person_links_query)

# Run the query and keep the list of result rows for the following cells.
results = sparql.query().convert()
res = results["results"]["bindings"]

print(f"{len(res)} results")
print(f"Datafields {results['head']['vars']}")
# Show only the first ten rows as a sanity check.
for preview_row in res[:10]:
  print(preview_row)

43522 results
Datafields ['s', 'class', 'g', 'ext_uri', 'ext_ns', 'ext_id']
{'g': {'type': 'uri', 'value': 'http://ldf.fi/nbf/data'}, 'class': {'type': 'uri', 'value': 'http://www.cidoc-crm.org/cidoc-crm/E21_Person'}, 's': {'type': 'uri', 'value': 'http://www.intavia.eu/personproxy/bs/1877'}, 'ext_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q4129860'}, 'ext_ns': {'type': 'literal', 'value': 'http://www.wikidata.org/entity/'}, 'ext_id': {'type': 'literal', 'value': 'Q4129860'}}
{'g': {'type': 'uri', 'value': 'http://ldf.fi/nbf/data'}, 'class': {'type': 'uri', 'value': 'http://www.cidoc-crm.org/cidoc-crm/E21_Person'}, 's': {'type': 'uri', 'value': 'http://www.intavia.eu/personproxy/bs/1880'}, 'ext_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q5561382'}, 'ext_ns': {'type': 'literal', 'value': 'http://www.wikidata.org/entity/'}, 'ext_id': {'type': 'literal', 'value': 'Q5561382'}}
{'g': {'type': 'uri', 'value': 'http://ldf.fi/nbf/data'}, 'class': {'type'

In [102]:
# Split the sameAs links by target namespace and index the person URI by the
# external identifier. Wikidata entries are keyed by the full entity URI,
# GND entries by the bare identifier (last URI segment).
# NOTE: if several persons link to the same external ID, the last row wins.
WIKIDATA_NS = "http://www.wikidata.org/entity/"
GND_NS = "https://d-nb.info/gnd/"

wikidata_ids = {
  row['ext_uri']['value']: row['s']['value']
  for row in res
  if row['ext_ns']['value'] == WIKIDATA_NS
}
gnd_ids = {
  row['ext_id']['value']: row['s']['value']
  for row in res
  if row['ext_ns']['value'] == GND_NS
}

Get Wikidata -> GND IDs:

In [82]:
# For every known Wikidata entity URI, ask Wikidata for its wdt:P227 value
# (bound as ?gnd_id) and build the matching d-nb.info URI from it.
wikidata_values = '> <'.join(wikidata_ids)
wikidata_to_gnd_query = PREFIXES + """
SELECT * {
  VALUES ?wikidata_uri {<""" + wikidata_values + """>}
  ?wikidata_uri wdt:P227 ?gnd_id .
  BIND (URI(CONCAT("https://d-nb.info/gnd/", ?gnd_id)) AS ?gnd_uri)   
}
#LIMIT 1
"""

# The VALUES list makes the query long, so it is sent via POST.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setMethod('POST')
sparql.setReturnFormat(JSON)
sparql.setQuery(wikidata_to_gnd_query)

results = sparql.query().convert()
res_wikidata_to_gnd = results["results"]["bindings"]
print(f"{len(res_wikidata_to_gnd)} results")


1590 results


Get Wikidata -> VIAF IDs:

In [84]:
# For every known Wikidata entity URI, ask Wikidata for its wdt:P214 value
# (bound as ?viaf_id) and build the matching viaf.org URI from it.
wikidata_values = '> <'.join(wikidata_ids)
wikidata_to_viaf_query = PREFIXES + """
SELECT * {
  VALUES ?wikidata_uri {<""" + wikidata_values + """>}
  ?wikidata_uri wdt:P214 ?viaf_id . 
  BIND (URI(CONCAT("https://viaf.org/viaf/", ?viaf_id)) AS ?viaf_uri)
}
#LIMIT 1
"""

# The VALUES list makes the query long, so it is sent via POST.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setMethod('POST')
sparql.setReturnFormat(JSON)
sparql.setQuery(wikidata_to_viaf_query)

results = sparql.query().convert()
res_wikidata_to_viaf = results["results"]["bindings"]
print(f"{len(res_wikidata_to_viaf)} results")

2916 results


Get GND -> Wikidata IDs:

In [103]:
# For every known GND identifier, find the Wikidata entity whose wdt:P227
# value equals it; the d-nb.info URI is rebuilt alongside for convenience.
gnd_values = "' '".join(gnd_ids)
gnd_to_wikidata_query = PREFIXES + """
SELECT * {
  VALUES ?gnd_id {'""" + gnd_values + """'}
  ?wikidata_uri wdt:P227 ?gnd_id .
  BIND (URI(CONCAT("https://d-nb.info/gnd/", ?gnd_id)) AS ?gnd_uri)   
}
#LIMIT 1
"""

# The VALUES list makes the query long, so it is sent via POST.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setMethod('POST')
sparql.setReturnFormat(JSON)
sparql.setQuery(gnd_to_wikidata_query)

results = sparql.query().convert()
res_gnd_to_wikidata = results["results"]["bindings"]
print(f"{len(res_gnd_to_wikidata)} results")

15454 results


Get GND -> VIAF IDs:

In [104]:
# For every known GND identifier, find the Wikidata entity whose wdt:P227
# value equals it and read that entity's wdt:P214 value (bound as ?viaf_id),
# from which the viaf.org URI is built.
gnd_values = "' '".join(gnd_ids)
gnd_to_viaf_query = PREFIXES + """
SELECT * {
  VALUES ?gnd_id {'""" + gnd_values + """'}
  ?wikidata_uri wdt:P227 ?gnd_id ;
            wdt:P214 ?viaf_id . 
  BIND (URI(CONCAT("https://viaf.org/viaf/", ?viaf_id)) AS ?viaf_uri)
}
#LIMIT 1
"""

# The VALUES list makes the query long, so it is sent via POST.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setMethod('POST')
sparql.setReturnFormat(JSON)
sparql.setQuery(gnd_to_viaf_query)

results = sparql.query().convert()
res_gnd_to_viaf = results["results"]["bindings"]
print(f"{len(res_gnd_to_viaf)} results")

15719 results


Get VIAF -> Wikidata IDs:


In [75]:
# TODO: do we have such data?
 #      # VIAF
 #                   wdt:P214 ?viaf_id . 
 #     BIND (URI(CONCAT("https://viaf.org/viaf/", ?viaf_id)) AS ?viaf_uri)
 #   }
 # }

Get VIAF -> GND IDs:

In [None]:
# TODO: do we have such data?

Write sameAs statements:

In [116]:
def add_same_as_links(graph, bindings, person_by_key, key_var, target_var, preview=10):
  """Add one owl:sameAs triple per result row and print a short preview.

  Args:
    graph: rdflib Graph that receives the triples.
    bindings: list of SPARQL JSON result rows (dicts of {'type', 'value'} dicts).
    person_by_key: dict mapping the value of `key_var` to the person URI.
    key_var: name of the result variable used to look up the person URI.
    target_var: name of the result variable holding the external URI.
    preview: number of leading rows to print before adding all rows.

  Raises:
    KeyError: if a row's `key_var` value is missing from `person_by_key`.
  """
  print(f"{len(bindings)} results")
  # Raw rows first, then the same rows as readable "person <same as> target" lines.
  for row in bindings[:preview]:
    print(row)
  for row in bindings[:preview]:
    print(person_by_key[row[key_var]['value']] + " <same as> " + row[target_var]['value'])
  for row in bindings:
    graph.add((URIRef(person_by_key[row[key_var]['value']]), OWL.sameAs, URIRef(row[target_var]['value'])))


g = Graph()

# Links found by starting from a person's Wikidata URI.
add_same_as_links(g, res_wikidata_to_gnd, wikidata_ids, 'wikidata_uri', 'gnd_uri')
add_same_as_links(g, res_wikidata_to_viaf, wikidata_ids, 'wikidata_uri', 'viaf_uri')

# Links found by starting from a person's GND identifier.
add_same_as_links(g, res_gnd_to_wikidata, gnd_ids, 'gnd_id', 'wikidata_uri')
add_same_as_links(g, res_gnd_to_viaf, gnd_ids, 'gnd_id', 'viaf_uri')

# Number of distinct triples; duplicates added above are stored only once.
print(len(g))

# State the format explicitly so the .ttl file contains Turtle regardless of
# the installed rdflib version's default serialization.
g.serialize("intavia-person-id-enrichment.ttl", format="turtle")

1590 results
{'wikidata_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q855'}, 'gnd_id': {'type': 'literal', 'value': '118642499'}, 'gnd_uri': {'type': 'uri', 'value': 'https://d-nb.info/gnd/118642499'}}
{'wikidata_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q1043'}, 'gnd_id': {'type': 'literal', 'value': '118573349'}, 'gnd_uri': {'type': 'uri', 'value': 'https://d-nb.info/gnd/118573349'}}
{'wikidata_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q1394'}, 'gnd_id': {'type': 'literal', 'value': '118640402'}, 'gnd_uri': {'type': 'uri', 'value': 'https://d-nb.info/gnd/118640402'}}
{'wikidata_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q3098'}, 'gnd_id': {'type': 'literal', 'value': '1023037076'}, 'gnd_uri': {'type': 'uri', 'value': 'https://d-nb.info/gnd/1023037076'}}
{'wikidata_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q4761'}, 'gnd_id': {'type': 'literal', 'value': '123062179'}, 'gnd_uri': {'type'

<Graph identifier=N516024c07a5a481f887c95480b9173d1 (<class 'rdflib.graph.Graph'>)>

In [115]:
# google.colab can only be imported inside a Colab runtime. Elsewhere the
# browser download is skipped with a message instead of failing the cell;
# the file is expected to be on local disk already.
try:
  from google.colab import files
except ImportError:
  print("Not running in Google Colab; skipping browser download of intavia-person-id-enrichment.ttl")
else:
  files.download("intavia-person-id-enrichment.ttl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>