In [1]:
from rdflib_hdt import HDTStore
from rdflib import Graph

# Path of the HDT file to load. Missing indexes are generated automatically;
# an existing index file is used when it sits in the same directory as the HDT file.
# See https://www.rdfhdt.org/datasets/ for getting the HDT and the index file.
HDT_PATH = "./dbpedia2016-10.hdt.1"

# Name the file in the log line so the output records which document was loaded.
print(f"Loading HDTStore at {HDT_PATH} ...")
store = HDTStore(HDT_PATH)

# Display some metadata about the HDT document itself
print(f"Number of RDF triples: {len(store)}")
print(f"Number of subjects: {store.nb_subjects}")
print(f"Number of predicates: {store.nb_predicates}")
print(f"Number of objects: {store.nb_objects}")
print(f"Number of shared subject-object: {store.nb_shared}")

# Create an RDFlib Graph with the HDT document as a backend
graph = Graph(store=store)

Loading HDTStore at ...
Number of RDF triples: 1137003322
Number of subjects: 68940263
Number of predicates: 123002
Number of objects: 237607127
Number of shared subject-object: 50711395


## Get Wikidata URIs

In [7]:
from tqdm import tqdm
import pandas as pd
from rdflib import Graph, URIRef, Namespace
from rdflib.namespace import OWL
from rdflib_hdt import HDTStore

def get_wikdataURIs(data, graph):
  """Map the DBpedia URIs of a dataset to Wikidata URIs via owl:sameAs.

  Reads ``./{data}/Mapping2DBpedia-1.2-corrected.tsv`` (tab-separated, no
  header; the third column holds the DBpedia entity of each item), looks up
  the Wikidata counterpart of every entity in ``graph`` and writes the same
  table, with the third column rewritten, to
  ``./{data}/Mapping2Wikidata-1.2-corrected.tsv``.

  Args:
      data (str): Specifies whether to get Wikidata mappings for "movielens" or "lastfm".
      graph (Graph): RDFlib Graph with the HDT document as a backend.

  Returns:
      tuple: ``(df, wikidata_not_found)``. ``df`` is the mapping table with
      DBpedia URIs replaced by Wikidata URIs wherever one was found (rows
      without a match keep their DBpedia URI). ``wikidata_not_found`` is the
      list of DBpedia URIs for which no Wikidata URI was found.

  Raises:
      AssertionError: If ``data`` is neither "movielens" nor "lastfm".
  """

  assert data in ["movielens", "lastfm"]

  wikidata_prefix = "http://www.wikidata.org/entity/"

  # Entities whose Wikidata URI had to be found manually.
  manual_mappings = {
    "http://dbpedia.org/resource/Big_Mike": "http://www.wikidata.org/entity/Q3609472",
    "http://dbpedia.org/resource/Dakota_(singer)": "http://www.wikidata.org/entity/Q27973731",
    "http://dbpedia.org/resource/Avalon_(musician)": "http://www.wikidata.org/entity/Q28421675",
  }

  # Get for each item the corresponding DBPedia mappings/entities
  df = pd.read_csv(f'./{data}/Mapping2DBpedia-1.2-corrected.tsv', sep='\t', encoding = 'utf-8', header=None, engine='python')
  entities = list(df[2])

  # For each DBPedia mapping/entity:
  # – Go through all the triples with owl:sameAs-relation where the mapping/entity is a subject
  # – If there exists an object that starts with http://www.wikidata.org/entity/:
  #   We found the corresponding Wikidata mapping for the item.
  mapping_dict = {}
  wikidata_not_found = []
  for entity in tqdm(entities):
    if entity in manual_mappings:
      mapping_dict[entity] = manual_mappings[entity]
      continue

    found_wikidata = False
    for _, _, o in graph.triples((URIRef(entity), OWL.sameAs, None)):
      if o.startswith(wikidata_prefix):
        # Store a plain string rather than the RDF term object.
        # If an entity has several Wikidata links, the last one yielded wins.
        mapping_dict[entity] = str(o)
        found_wikidata = True

    if not found_wikidata:
      print(f"No wikidata URI found for {entity}")
      wikidata_not_found.append(entity)

  # Change URIs in df from DBPedia to Wikidata in order to save final mappings.
  # A single pass over the column replaces the per-mapping `df.at[index, 2]`
  # assignment: `.at` only accepts scalar labels, and the row-by-row scan was
  # quadratic. Unmapped entities keep their DBpedia URI.
  df[2] = df[2].map(lambda uri: mapping_dict.get(uri, uri))

  df.to_csv(f'./{data}/Mapping2Wikidata-1.2-corrected.tsv', sep='\t', index = False, header = False, encoding="utf-8")

  return df, wikidata_not_found

In [9]:
# Dataset to process: "lastfm" here; set to "movielens" for the other dataset.
data = "lastfm"

# Build and save the Wikidata mapping table; keep the unmapped entities for inspection.
df, wikidata_not_found = get_wikdataURIs(data=data, graph=graph)

100%|██████████| 8790/8790 [00:01<00:00, 6904.80it/s]


In [10]:
# DBpedia URIs for which no Wikidata URI was found (empty when every entity was mapped).
wikidata_not_found

[]