Add additional external person IDs from the Wikidata endpoint, using the owl:sameAs links in the source data:

*   Wikidata ID -> GND & VIAF (BS, Bionet)
*   GND ID -> Wikidata & VIAF (APIS)
*   SBI ID -> Wikidata, GND & VIAF (SBI)

Install the dependencies:

In [56]:
!pip install SPARQLWrapper rdflib
from SPARQLWrapper import SPARQLWrapper, JSON
from getpass import getpass
import itertools
from rdflib import Graph, URIRef, OWL

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Input Intavia SPARQL endpoint credentials:

In [24]:
# Ask interactively for the Intavia triplestore login; the password is read
# through getpass so it is not shown while typing.
user, password = input("Username: "), getpass("Password: ")

Username: intavia
Password: ··········


Fetch person URIs and already existing external IDs:

In [38]:
# Namespace declarations that are prepended to the SPARQL queries below.
PREFIXES = """
PREFIX bds: <http://www.bigdata.com/rdf/search#>
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX bioc: <http://ldf.fi/schema/bioc/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX idm: <https://www.intavia.eu/idm/>
PREFIX idmcore: <http://www.intavia.eu/idm-core/>
PREFIX idmcores: <https://www.intavia.eu/idm-core/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
"""

# Selects each person (?s) together with an external URI linked to it, and
# derives the namespace (?ext_ns) and the local identifier (?ext_id) of that URI.
person_links_query = PREFIXES + """
SELECT * {
  # Existing sameAs links in Intavia data:
  #  APIS (graph: http://apis.acdh.oeaw.ac.at/data -> GND (URI ns: https://d-nb.info/gnd/)
  #  BS (graph: http://ldf.fi/nbf/data) -> Wikidata (URI ns: http://www.wikidata.org/entity/)
  #  SBI (graph: http://www.intavia.eu/sbi) -> Slovenska biografija (URI ns: https://www.slovenska-biografija.si/oseba/) 
  #  BiographyNet (graph: http://data.biographynet.nl/rdf/data) -> Wikidata: TODO
  #VALUES ?g { <http://apis.acdh.oeaw.ac.at/data> <http://ldf.fi/nbf/data> <http://www.intavia.eu/sbi> }
  #GRAPH ?g {
{
    VALUES ?class { crm:E21_Person idmcore:Person_Proxy }
    ?s a ?class ;
       owl:sameAs ?ext_uri .
}
UNION
{
    ?ext_uri <http://www.wikidata.org/prop/direct/651>/^<http://www.intavia.eu/idm-core/person_proxy_for> ?s .
}
    # Filter out sameAs links to redundant national data sources
    FILTER (!STRSTARTS(STR(?ext_uri), "http://ldf.fi/nbf/"))
    FILTER (!STRSTARTS(STR(?ext_uri), "https://apis.acdh.oeaw.ac.at/"))
    BIND (IF (STRSTARTS(STR(?ext_uri), "https://www.slovenska-biografija.si/oseba/"), "https://www.slovenska-biografija.si/oseba/", REPLACE(STR(?ext_uri), "[^/]+$", "")) as ?ext_ns)
    BIND (IF (STRSTARTS(STR(?ext_uri), "https://www.slovenska-biografija.si/oseba/"), REPLACE(REPLACE(STR(?ext_uri), "^https://www.slovenska-biografija.si/oseba/sbi", ""), "/", ""), REPLACE(STR(?ext_uri), "^.+/", "")) AS ?ext_id)
  #}
}
#LIMIT 1
"""

# Authenticated POST request against the Intavia triplestore, answered as JSON.
sparql = SPARQLWrapper("https://triplestore.acdh-dev.oeaw.ac.at/intavia/sparql")
sparql.setCredentials(user, password)
sparql.setMethod('POST')
sparql.setReturnFormat(JSON)
sparql.setQuery(person_links_query)

results = sparql.query().convert()
res = results["results"]["bindings"]

print(f"{len(res)} results")
print(f"Datafields {results['head']['vars']}")
# Show only the first ten bindings as a sanity check.
for preview_row in res[:10]:
    print(preview_row)

142061 results
Datafields ['class', 's', 'ext_uri', 'ext_ns', 'ext_id']
{'ext_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q55902316'}, 's': {'type': 'uri', 'value': 'http://data.biographynet.nl/rdf/PersonDes-12650564_01'}, 'ext_ns': {'type': 'literal', 'value': 'http://www.wikidata.org/entity/'}, 'ext_id': {'type': 'literal', 'value': 'Q55902316'}}
{'ext_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q55902316'}, 's': {'type': 'uri', 'value': 'http://data.biographynet.nl/rdf/PersonDes-12650564_02'}, 'ext_ns': {'type': 'literal', 'value': 'http://www.wikidata.org/entity/'}, 'ext_id': {'type': 'literal', 'value': 'Q55902316'}}
{'ext_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q55902316'}, 's': {'type': 'uri', 'value': 'http://data.biographynet.nl/rdf/PersonDes-12650564_03'}, 'ext_ns': {'type': 'literal', 'value': 'http://www.wikidata.org/entity/'}, 'ext_id': {'type': 'literal', 'value': 'Q55902316'}}
{'ext_uri': {'type': 'uri', 'value

In [41]:
# Lookup tables from an external identifier to the Intavia person URI (?s):
#   wikidata_ids is keyed by the full Wikidata entity URI,
#   gnd_ids and sbi_ids are keyed by the bare identifier (?ext_id).
# NOTE(review): each key keeps a single person URI, so when several persons
# share one external identifier only the last one seen is retained.
wikidata_ids = {}
gnd_ids = {}
sbi_ids = {}

# External namespace -> (target table, name of the result field used as key).
tables_by_namespace = {
    "http://www.wikidata.org/entity/": (wikidata_ids, 'ext_uri'),
    "https://d-nb.info/gnd/": (gnd_ids, 'ext_id'),
    "https://www.slovenska-biografija.si/oseba/": (sbi_ids, 'ext_id'),
}

for binding in res:
    matched = tables_by_namespace.get(binding['ext_ns']['value'])
    if matched is None:
        # Links into any other namespace are not used for enrichment.
        continue
    table, key_field = matched
    table[binding[key_field]['value']] = binding['s']['value']


Get Wikidata -> GND IDs:

In [61]:
# The Wikidata endpoint doesn't handle ~30 000 inline URIs in one query, so the
# lookup is sent in batches of at most WIKIDATA_BATCH_SIZE URIs. Looping over
# batches (rather than querying two hard-coded halves) keeps every request
# bounded however many Wikidata URIs were collected, and sends no request at
# all when there are none.
WIKIDATA_BATCH_SIZE = 15000

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setMethod('POST')

# Accumulated bindings (?wikidata_uri, ?gnd_id, ?gnd_uri) over all batches.
res_wikidata_to_gnd = []

# Iterating the dict yields its keys, i.e. the Wikidata entity URIs.
pending_wikidata_uris = iter(wikidata_ids)
while True:
    uri_batch = list(itertools.islice(pending_wikidata_uris, WIKIDATA_BATCH_SIZE))
    if not uri_batch:
        break
    sparql.setQuery(PREFIXES + """
SELECT * {
  VALUES ?wikidata_uri {<""" + '> <'.join(uri_batch) + """>}
  ?wikidata_uri wdt:P227 ?gnd_id .
  BIND (URI(CONCAT("https://d-nb.info/gnd/", ?gnd_id)) AS ?gnd_uri)   
}
#LIMIT 1
""")
    results = sparql.query().convert()
    batch_bindings = results["results"]["bindings"]
    print(f"{len(batch_bindings)} results")
    res_wikidata_to_gnd += batch_bindings

4855 results
4446 results


Get Wikidata -> VIAF IDs:

In [64]:
# The Wikidata endpoint doesn't handle ~30 000 inline URIs in one query, so the
# lookup is sent in batches of at most WIKIDATA_BATCH_SIZE URIs. Looping over
# batches (rather than querying two hard-coded halves) keeps every request
# bounded however many Wikidata URIs were collected, and sends no request at
# all when there are none.
WIKIDATA_BATCH_SIZE = 15000

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setMethod('POST')

# Accumulated bindings (?wikidata_uri, ?viaf_id, ?viaf_uri) over all batches.
res_wikidata_to_viaf = []

# Iterating the dict yields its keys, i.e. the Wikidata entity URIs.
pending_wikidata_uris = iter(wikidata_ids)
while True:
    uri_batch = list(itertools.islice(pending_wikidata_uris, WIKIDATA_BATCH_SIZE))
    if not uri_batch:
        break
    sparql.setQuery(PREFIXES + """
SELECT * {
  VALUES ?wikidata_uri {<""" + '> <'.join(uri_batch) + """>}
  ?wikidata_uri wdt:P214 ?viaf_id . 
  BIND (URI(CONCAT("https://viaf.org/viaf/", ?viaf_id)) AS ?viaf_uri)
}
#LIMIT 1
""")
    results = sparql.query().convert()
    batch_bindings = results["results"]["bindings"]
    print(f"{len(batch_bindings)} results")
    res_wikidata_to_viaf += batch_bindings

9358 results
9213 results


Get GND -> Wikidata IDs:

In [29]:
# Ask Wikidata which entity carries each of the collected GND IDs (wdt:P227).
gnd_values = "' '".join(gnd_ids)
gnd_to_wikidata_query = PREFIXES + """
SELECT * {
  VALUES ?gnd_id {'""" + gnd_values + """'}
  ?wikidata_uri wdt:P227 ?gnd_id .
}
#LIMIT 1
"""

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setMethod('POST')
sparql.setReturnFormat(JSON)
sparql.setQuery(gnd_to_wikidata_query)

results = sparql.query().convert()
res_gnd_to_wikidata = results["results"]["bindings"]
print(f"{len(res_gnd_to_wikidata)} results")

15454 results


Get GND -> VIAF IDs:

In [30]:
# For each collected GND ID, fetch the VIAF ID (wdt:P214) recorded on the
# Wikidata entity that carries that GND ID, and build the VIAF URI from it.
gnd_values = "' '".join(gnd_ids)
gnd_to_viaf_query = PREFIXES + """
SELECT * {
  VALUES ?gnd_id {'""" + gnd_values + """'}
  ?wikidata_uri wdt:P227 ?gnd_id ;
                wdt:P214 ?viaf_id . 
  BIND (URI(CONCAT("https://viaf.org/viaf/", ?viaf_id)) AS ?viaf_uri)
}
#LIMIT 1
"""

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setMethod('POST')
sparql.setReturnFormat(JSON)
sparql.setQuery(gnd_to_viaf_query)

results = sparql.query().convert()
res_gnd_to_viaf = results["results"]["bindings"]
print(f"{len(res_gnd_to_viaf)} results")

15719 results


Get SBI -> Wikidata IDs:

In [31]:
# Ask Wikidata which entity carries each of the collected SBI IDs (wdt:P1254).
sbi_values = "' '".join(sbi_ids)
sbi_to_wikidata_query = PREFIXES + """
SELECT * {
  VALUES ?sbi_id {'""" + sbi_values + """'}
  ?wikidata_uri wdt:P1254 ?sbi_id .
}
#LIMIT 1
"""

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setMethod('POST')
sparql.setReturnFormat(JSON)
sparql.setQuery(sbi_to_wikidata_query)

results = sparql.query().convert()
res_sbi_to_wikidata = results["results"]["bindings"]
print(f"{len(res_sbi_to_wikidata)} results")

7751 results


Get SBI -> GND IDs:

In [32]:
# For each collected SBI ID, fetch the GND ID (wdt:P227) recorded on the
# Wikidata entity that carries that SBI ID, and build the GND URI from it.
sbi_values = "' '".join(sbi_ids)
sbi_to_gnd_query = PREFIXES + """
SELECT * {
  VALUES ?sbi_id {'""" + sbi_values + """'}
  ?wikidata_uri wdt:P1254 ?sbi_id ;
                wdt:P227 ?gnd_id . 
  BIND (URI(CONCAT("https://d-nb.info/gnd/", ?gnd_id)) AS ?gnd_uri)
}
#LIMIT 1
"""

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setMethod('POST')
sparql.setReturnFormat(JSON)
sparql.setQuery(sbi_to_gnd_query)

results = sparql.query().convert()
res_sbi_to_gnd = results["results"]["bindings"]
print(f"{len(res_sbi_to_gnd)} results")

1707 results


Get SBI -> VIAF IDs:

In [33]:
# For each collected SBI ID, fetch the VIAF ID (wdt:P214) recorded on the
# Wikidata entity that carries that SBI ID, and build the VIAF URI from it.
sbi_values = "' '".join(sbi_ids)
sbi_to_viaf_query = PREFIXES + """
SELECT * {
  VALUES ?sbi_id {'""" + sbi_values + """'}
  ?wikidata_uri wdt:P1254 ?sbi_id ;
                wdt:P214 ?viaf_id . 
  BIND (URI(CONCAT("https://viaf.org/viaf/", ?viaf_id)) AS ?viaf_uri)
}
#LIMIT 1
"""

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setMethod('POST')
sparql.setReturnFormat(JSON)
sparql.setQuery(sbi_to_viaf_query)

results = sparql.query().convert()
res_sbi_to_viaf = results["results"]["bindings"]
print(f"{len(res_sbi_to_viaf)} results")

3951 results


Write sameAs statements:

In [65]:
# Collect every newly found link as an owl:sameAs triple and write them to a
# Turtle file.
g = Graph()

# One entry per Wikidata lookup:
#   (result bindings,
#    lookup table from external ID to Intavia person URI,
#    result field holding the lookup key,
#    result field holding the newly found external URI)
# NOTE(review): the lookup tables hold one person URI per external ID, so a
# link is written only for that person even if several persons share the ID.
enrichment_sources = [
    (res_wikidata_to_gnd, wikidata_ids, 'wikidata_uri', 'gnd_uri'),
    (res_wikidata_to_viaf, wikidata_ids, 'wikidata_uri', 'viaf_uri'),
    (res_gnd_to_wikidata, gnd_ids, 'gnd_id', 'wikidata_uri'),
    (res_gnd_to_viaf, gnd_ids, 'gnd_id', 'viaf_uri'),
    (res_sbi_to_wikidata, sbi_ids, 'sbi_id', 'wikidata_uri'),
    (res_sbi_to_gnd, sbi_ids, 'sbi_id', 'gnd_uri'),
    (res_sbi_to_viaf, sbi_ids, 'sbi_id', 'viaf_uri'),
]

for bindings, person_by_key, key_field, target_field in enrichment_sources:
    # Sanity-check output: result count, ten raw bindings, ten resolved links.
    print(f"{len(bindings)} results")
    for ob in bindings[:10]:
        print(ob)
    for ob in bindings[:10]:
        print(person_by_key[ob[key_field]['value']] + " <same as> " + ob[target_field]['value'])
    # Person URI  owl:sameAs  newly found external URI
    for ob in bindings:
        g.add((URIRef(person_by_key[ob[key_field]['value']]), OWL.sameAs, URIRef(ob[target_field]['value'])))

# Number of distinct triples in the graph (duplicates are stored only once).
print(len(g))

# State the format explicitly: the library default is not Turtle in every
# rdflib release, and the file extension promises Turtle.
g.serialize("intavia-person-id-enrichment.ttl", format="turtle")

9301 results
{'wikidata_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q742'}, 'gnd_id': {'type': 'literal', 'value': '118597523'}, 'gnd_uri': {'type': 'uri', 'value': 'https://d-nb.info/gnd/118597523'}}
{'wikidata_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q855'}, 'gnd_id': {'type': 'literal', 'value': '118642499'}, 'gnd_uri': {'type': 'uri', 'value': 'https://d-nb.info/gnd/118642499'}}
{'wikidata_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q1043'}, 'gnd_id': {'type': 'literal', 'value': '118573349'}, 'gnd_uri': {'type': 'uri', 'value': 'https://d-nb.info/gnd/118573349'}}
{'wikidata_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q1394'}, 'gnd_id': {'type': 'literal', 'value': '118640402'}, 'gnd_uri': {'type': 'uri', 'value': 'https://d-nb.info/gnd/118640402'}}
{'wikidata_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q3098'}, 'gnd_id': {'type': 'literal', 'value': '1023037076'}, 'gnd_uri': {'type': 

<Graph identifier=Nd370b03730394d1c8401f990953f15df (<class 'rdflib.graph.Graph'>)>

In [66]:
# Colab only: hand the generated Turtle file to the browser for download.
from google.colab import files

enrichment_file = "intavia-person-id-enrichment.ttl"
files.download(enrichment_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>