Add additional external person IDs from the Wikidata endpoint using the source data's owl:sameAs links:

*   Wikidata ID -> GND & VIAF (BS, Bionet)
*   GND ID -> Wikidata & VIAF (APIS)
*   SBI ID -> Wikidata, GND & VIAF (SBI)

Install and import the dependencies:

In [2]:
!pip install SPARQLWrapper rdflib
from SPARQLWrapper import SPARQLWrapper, JSON
from getpass import getpass
import itertools
from rdflib import Graph, URIRef, OWL

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Collecting rdflib
  Downloading rdflib-6.2.0-py3-none-any.whl (500 kB)
[K     |████████████████████████████████| 500 kB 5.9 MB/s 
[?25hCollecting isodate
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 549 kB/s 
Installing collected packages: isodate, rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 isodate-0.6.1 rdflib-6.2.0


Input Intavia SPARQL endpoint credentials:

In [3]:
# Prompt for the Intavia SPARQL endpoint login; getpass keeps the password
# from being echoed while it is typed.
user, password = input("Username: "), getpass("Password: ")

Username: intavia
Password: ··········


Fetch person URIs and already existing external IDs:

In [28]:
# Namespace declarations that get prepended to the SPARQL queries.
PREFIXES = """
PREFIX bds: <http://www.bigdata.com/rdf/search#>
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX bioc: <http://ldf.fi/schema/bioc/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX idm: <https://www.intavia.eu/idm/>
PREFIX idmcore: <http://www.intavia.eu/idm-core/>
PREFIX idmcores: <https://www.intavia.eu/idm-core/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
"""

# Query returning, for each person (proxy), every external URI it is linked to,
# together with that URI's namespace (?ext_ns) and local identifier (?ext_id).
person_links_query = PREFIXES + """
SELECT * {
  # Existing sameAs links in Intavia data:
  #  APIS (graph: http://apis.acdh.oeaw.ac.at/data -> GND (URI ns: https://d-nb.info/gnd/)
  #  BS (graph: http://ldf.fi/nbf/data) -> Wikidata (URI ns: http://www.wikidata.org/entity/)
  #  SBI (graph: http://www.intavia.eu/sbi) -> Slovenska biografija (URI ns: https://www.slovenska-biografija.si/oseba/) 
  #  BiographyNet (graph: http://data.biographynet.nl/rdf/data) -> Wikidata (URI ns: http://www.wikidata.org/entity/)

  { GRAPH ?graph {
      # Don't include the previous result of the person id enrichment (this script)
      FILTER (STR(?graph) != "http://www.intavia.eu/person-id-enrichment")

      VALUES ?class { crm:E21_Person idmcore:Person_Proxy }
      ?s a ?class ;
         owl:sameAs ?ext_uri .
    }
  }
  UNION
  # Bionet
  { ?ext_uri <http://www.wikidata.org/prop/direct/651>/^<http://www.intavia.eu/idm-core/person_proxy_for> ?s . }
 
  # Filter out sameAs links to redundant national data sources
  FILTER (!STRSTARTS(STR(?ext_uri), "http://ldf.fi/nbf/"))
  FILTER (!STRSTARTS(STR(?ext_uri), "https://apis.acdh.oeaw.ac.at/"))

  BIND (IF (STRSTARTS(STR(?ext_uri), "https://www.slovenska-biografija.si/oseba/"), "https://www.slovenska-biografija.si/oseba/", REPLACE(STR(?ext_uri), "[^/]+$", "")) as ?ext_ns)
  BIND (IF (STRSTARTS(STR(?ext_uri), "https://www.slovenska-biografija.si/oseba/"), REPLACE(REPLACE(STR(?ext_uri), "^https://www.slovenska-biografija.si/oseba/sbi", ""), "/", ""), REPLACE(STR(?ext_uri), "^.+/", "")) AS ?ext_id)
}
#LIMIT 1
"""

# Authenticated POST request against the Intavia triplestore, JSON results.
sparql = SPARQLWrapper("https://triplestore.acdh-dev.oeaw.ac.at/intavia/sparql")
sparql.setCredentials(user, password)
sparql.setMethod('POST')
sparql.setReturnFormat(JSON)
sparql.setQuery(person_links_query)

results = sparql.query().convert()
res = results["results"]["bindings"]

# Quick sanity check: row count, returned variables and a sample of the rows.
print("{} results".format(len(res)))
print("Datafields {}".format(results['head']['vars']))
for row in res[:10]:
  print(row)

146089 results
Datafields ['graph', 'class', 's', 'ext_uri', 'ext_ns', 'ext_id']
{'ext_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q6149252'}, 's': {'type': 'uri', 'value': 'http://data.biographynet.nl/rdf/PersonDes-11046340_01'}, 'ext_ns': {'type': 'literal', 'value': 'http://www.wikidata.org/entity/'}, 'ext_id': {'type': 'literal', 'value': 'Q6149252'}}
{'ext_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q6149252'}, 's': {'type': 'uri', 'value': 'http://data.biographynet.nl/rdf/PersonDes-11046340_02'}, 'ext_ns': {'type': 'literal', 'value': 'http://www.wikidata.org/entity/'}, 'ext_id': {'type': 'literal', 'value': 'Q6149252'}}
{'ext_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q6150406'}, 's': {'type': 'uri', 'value': 'http://data.biographynet.nl/rdf/PersonDes-11046358_01'}, 'ext_ns': {'type': 'literal', 'value': 'http://www.wikidata.org/entity/'}, 'ext_id': {'type': 'literal', 'value': 'Q6150406'}}
{'ext_uri': {'type': 'uri', 'va

In [19]:
# Indexes from an external identifier to the list of Intavia person URIs linked to it.
# Wikidata entries are keyed by the full entity URI, GND and SBI entries by the bare ID.
wikidata_ids = {}
gnd_ids = {}
sbi_ids = {}

# External namespace -> (index to fill, result field that provides the key)
index_by_namespace = {
  "http://www.wikidata.org/entity/": (wikidata_ids, 'ext_uri'),
  "https://d-nb.info/gnd/": (gnd_ids, 'ext_id'),
  "https://www.slovenska-biografija.si/oseba/": (sbi_ids, 'ext_id'),
}

for row in res:
  target = index_by_namespace.get(row['ext_ns']['value'])
  if target is None:
    # Links into any other namespace are not used for the enrichment.
    continue
  index, key_field = target
  index.setdefault(row[key_field]['value'], []).append(row['s']['value'])

print(len(wikidata_ids))

29281


Get Wikidata -> GND IDs:

In [6]:
# Wikidata endpoint doesn't handle ~30 000 inline URIs in the query, so the URIs
# are sent in batches. Batching in a loop (instead of two hand-written halves)
# keeps every request within the batch size no matter how many URIs there are,
# and sends no request at all when there are none.
batch_size = 15000

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setMethod('POST')

wikidata_uris = list(wikidata_ids)
res_wikidata_to_gnd = []
for start in range(0, len(wikidata_uris), batch_size):
  batch = wikidata_uris[start:start + batch_size]
  # P227 holds the GND ID as a plain literal; the query also builds the GND URI from it.
  sparql.setQuery(PREFIXES + """
SELECT * {
  VALUES ?wikidata_uri {<""" + '> <'.join(batch) + """>}
  ?wikidata_uri wdt:P227 ?gnd_id .
  BIND (URI(CONCAT("https://d-nb.info/gnd/", ?gnd_id)) AS ?gnd_uri)   
}
#LIMIT 1
""")
  results = sparql.query().convert()
  print(f'{len(results["results"]["bindings"])} results')
  res_wikidata_to_gnd += results["results"]["bindings"]

4899 results
4454 results


Get Wikidata -> VIAF IDs:

In [8]:
# Wikidata endpoint doesn't handle ~30 000 inline URIs in the query, so the URIs
# are sent in batches. Batching in a loop (instead of two hand-written halves)
# keeps every request within the batch size no matter how many URIs there are,
# and sends no request at all when there are none.
batch_size = 15000

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setMethod('POST')

wikidata_uris = list(wikidata_ids)
res_wikidata_to_viaf = []
for start in range(0, len(wikidata_uris), batch_size):
  batch = wikidata_uris[start:start + batch_size]
  # P214 holds the VIAF ID as a plain literal; the query also builds the VIAF URI from it.
  sparql.setQuery(PREFIXES + """
SELECT * {
  VALUES ?wikidata_uri {<""" + '> <'.join(batch) + """>}
  ?wikidata_uri wdt:P214 ?viaf_id . 
  BIND (URI(CONCAT("https://viaf.org/viaf/", ?viaf_id)) AS ?viaf_uri)
}
#LIMIT 1
""")
  results = sparql.query().convert()
  print(f'{len(results["results"]["bindings"])} results')
  res_wikidata_to_viaf += results["results"]["bindings"]

9361 results
9230 results


Get GND -> Wikidata IDs:

In [9]:
# Find the Wikidata entity carrying each GND ID (P227) collected from the Intavia data.
# All IDs go into a single VALUES clause as quoted literals.
gnd_values = "' '".join(gnd_ids)

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setMethod('POST')
sparql.setQuery(PREFIXES + """
SELECT * {
  VALUES ?gnd_id {'""" + gnd_values + """'}
  ?wikidata_uri wdt:P227 ?gnd_id .
}
#LIMIT 1
""")

results = sparql.query().convert()
res_gnd_to_wikidata = results["results"]["bindings"]
print("{} results".format(len(res_gnd_to_wikidata)))

15465 results


Get GND -> VIAF IDs:

In [10]:
# For each GND ID, go through the matching Wikidata entity (P227) to its VIAF ID (P214).
# All IDs go into a single VALUES clause as quoted literals.
gnd_values = "' '".join(gnd_ids)

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setMethod('POST')
sparql.setQuery(PREFIXES + """
SELECT * {
  VALUES ?gnd_id {'""" + gnd_values + """'}
  ?wikidata_uri wdt:P227 ?gnd_id ;
                wdt:P214 ?viaf_id . 
  BIND (URI(CONCAT("https://viaf.org/viaf/", ?viaf_id)) AS ?viaf_uri)
}
#LIMIT 1
""")

results = sparql.query().convert()
res_gnd_to_viaf = results["results"]["bindings"]
print("{} results".format(len(res_gnd_to_viaf)))

15726 results


Get SBI -> Wikidata IDs:

In [12]:
# Find the Wikidata entity carrying each SBI ID (P1254) collected from the Intavia data.
# All IDs go into a single VALUES clause as quoted literals.
sbi_values = "' '".join(sbi_ids)

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setMethod('POST')
sparql.setQuery(PREFIXES + """
SELECT * {
  VALUES ?sbi_id {'""" + sbi_values + """'}
  ?wikidata_uri wdt:P1254 ?sbi_id .
}
#LIMIT 1
""")

results = sparql.query().convert()
res_sbi_to_wikidata = results["results"]["bindings"]
print("{} results".format(len(res_sbi_to_wikidata)))

7841 results


Get SBI -> GND IDs:

In [13]:
# For each SBI ID, go through the matching Wikidata entity (P1254) to its GND ID (P227).
# All IDs go into a single VALUES clause as quoted literals.
sbi_values = "' '".join(sbi_ids)

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setMethod('POST')
sparql.setQuery(PREFIXES + """
SELECT * {
  VALUES ?sbi_id {'""" + sbi_values + """'}
  ?wikidata_uri wdt:P1254 ?sbi_id ;
                wdt:P227 ?gnd_id . 
  BIND (URI(CONCAT("https://d-nb.info/gnd/", ?gnd_id)) AS ?gnd_uri)
}
#LIMIT 1
""")

results = sparql.query().convert()
res_sbi_to_gnd = results["results"]["bindings"]
print("{} results".format(len(res_sbi_to_gnd)))

1731 results


Get SBI -> VIAF IDs:

In [14]:
# For each SBI ID, go through the matching Wikidata entity (P1254) to its VIAF ID (P214).
# All IDs go into a single VALUES clause as quoted literals.
sbi_values = "' '".join(sbi_ids)

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setMethod('POST')
sparql.setQuery(PREFIXES + """
SELECT * {
  VALUES ?sbi_id {'""" + sbi_values + """'}
  ?wikidata_uri wdt:P1254 ?sbi_id ;
                wdt:P214 ?viaf_id . 
  BIND (URI(CONCAT("https://viaf.org/viaf/", ?viaf_id)) AS ?viaf_uri)
}
#LIMIT 1
""")

results = sparql.query().convert()
res_sbi_to_viaf = results["results"]["bindings"]
print("{} results".format(len(res_sbi_to_viaf)))

4024 results


Write sameAs statements:

In [25]:
# Collect every newly discovered link as an owl:sameAs triple and write them to a Turtle file.
g = Graph()

# One entry per lookup that was run against Wikidata:
#   (result rows,
#    index from the looked-up key to the Intavia URIs that carry it,
#    result field holding that key,
#    result field holding the newly found external URI)
enrichment_steps = [
  (res_wikidata_to_gnd, wikidata_ids, 'wikidata_uri', 'gnd_uri'),
  (res_wikidata_to_viaf, wikidata_ids, 'wikidata_uri', 'viaf_uri'),
  (res_gnd_to_wikidata, gnd_ids, 'gnd_id', 'wikidata_uri'),
  (res_gnd_to_viaf, gnd_ids, 'gnd_id', 'viaf_uri'),
  (res_sbi_to_wikidata, sbi_ids, 'sbi_id', 'wikidata_uri'),
  (res_sbi_to_gnd, sbi_ids, 'sbi_id', 'gnd_uri'),
  (res_sbi_to_viaf, sbi_ids, 'sbi_id', 'viaf_uri'),
]

for rows, intavia_uris_by_key, key_field, target_field in enrichment_steps:
  # Preview: row count, the first raw rows, then the links they translate to.
  print(f"{len(rows)} results")
  for ob in rows[:10]:
    print(ob)
  for ob in rows[:10]:
    print(intavia_uris_by_key[ob[key_field]['value']])
    print(" <same as> " + ob[target_field]['value'])

  # Several Intavia URIs can share one external key, so each of them gets the link.
  for ob in rows:
    target_uri = URIRef(ob[target_field]['value'])
    for intavia_uri in intavia_uris_by_key[ob[key_field]['value']]:
      g.add((URIRef(intavia_uri), OWL.sameAs, target_uri))

print(len(g))

# State the format explicitly so the .ttl file is Turtle regardless of the
# default format of the installed rdflib version.
g.serialize("intavia-person-id-enrichment.ttl", format="turtle")

9353 results
{'wikidata_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q742'}, 'gnd_id': {'type': 'literal', 'value': '118597523'}, 'gnd_uri': {'type': 'uri', 'value': 'https://d-nb.info/gnd/118597523'}}
{'wikidata_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q855'}, 'gnd_id': {'type': 'literal', 'value': '118642499'}, 'gnd_uri': {'type': 'uri', 'value': 'https://d-nb.info/gnd/118642499'}}
{'wikidata_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q1043'}, 'gnd_id': {'type': 'literal', 'value': '118573349'}, 'gnd_uri': {'type': 'uri', 'value': 'https://d-nb.info/gnd/118573349'}}
{'wikidata_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q1394'}, 'gnd_id': {'type': 'literal', 'value': '118640402'}, 'gnd_uri': {'type': 'uri', 'value': 'https://d-nb.info/gnd/118640402'}}
{'wikidata_uri': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q3098'}, 'gnd_id': {'type': 'literal', 'value': '1023037076'}, 'gnd_uri': {'type': 

<Graph identifier=N9804fb216acc4bef8ad0ff55d29dee7f (<class 'rdflib.graph.Graph'>)>

In [26]:
from google.colab import files

# Offer the generated Turtle file as a browser download (requires Google Colab).
ttl_path = "intavia-person-id-enrichment.ttl"
files.download(ttl_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>