In [1]:
from rdflib_hdt import HDTStore
from rdflib import Graph

# Open the HDT document. Any missing index is generated automatically; to reuse an
# existing index file, place it in the same directory as the HDT file.
# HDT dumps and their index files can be obtained from https://www.rdfhdt.org/datasets/.
print("Loading HDTStore at ...")
store = HDTStore("./dbpedia2016-10.hdt.1")

# Report summary statistics about the HDT document itself, one per line.
print(
    f"Number of RDF triples: {len(store)}",
    f"Number of subjects: {store.nb_subjects}",
    f"Number of predicates: {store.nb_predicates}",
    f"Number of objects: {store.nb_objects}",
    f"Number of shared subject-object: {store.nb_shared}",
    sep="\n",
)

# Expose the HDT document through the regular rdflib Graph API; later cells query `graph`.
graph = Graph(store=store)

Loading HDTStore at ...
Number of RDF triples: 1137003322
Number of subjects: 68940263
Number of predicates: 123002
Number of objects: 237607127
Number of shared subject-object: 50711395


In [2]:
import rdflib
from rdflib_hdt import HDTStore
from rdflib import Graph, URIRef, Namespace
from rdflib.namespace import OWL, RDF

import pandas as pd
from tqdm import tqdm

def process_wikidata_entity(entity, graph, knowledgeGraph, next_queue):
    """Copy the relevant outgoing triples of one Wikidata entity into ``knowledgeGraph``.

    A triple ``(entity, p, o)`` found in ``graph`` is kept when its object is

    * a literal whose Python value is a string and whose language tag is ``"en"``,
    * a literal whose Python value is not a string, or
    * a URI starting with ``http://www.wikidata.org/entity/Q``; such objects are
      also added to ``next_queue``.

    Every other triple is skipped. A notice is printed for each ``owl:sameAs``
    triple that is encountered.

    Args:
        entity: URI of the subject to expand; it is wrapped in ``URIRef``.
        graph: Source graph that is queried via ``graph.triples``.
        knowledgeGraph: Target graph; matching triples are added in place.
        next_queue: Set collecting the entity objects to expand in the next hop.

    Returns:
        The tuple ``(knowledgeGraph, next_queue)``, i.e. the same objects passed in.
    """
    item_prefix = "http://www.wikidata.org/entity/Q"

    for s, p, o in graph.triples((URIRef(entity), None, None)):
        if p == OWL.sameAs:
            print(f"{s} has owl:sameAs and is thus possible wrong.")

        if isinstance(o, rdflib.term.Literal):
            # Non-string literals are always kept; string literals only when English.
            if not isinstance(o.value, str) or o.language == "en":
                knowledgeGraph.add((s, p, o))
            continue

        # Anything that is not a Wikidata item URI does not satisfy the criteria.
        if not (isinstance(o, rdflib.term.URIRef) and o.startswith(item_prefix)):
            continue

        knowledgeGraph.add((s, p, o))
        next_queue.add(o)

    return knowledgeGraph, next_queue

def process_dpbedia_entity(entity, graph, knowledgeGraph, next_queue):
    """Copy the outgoing triples of one DBpedia entity into ``knowledgeGraph``.

    Every triple ``(entity, p, o)`` found in ``graph`` is added, except those whose
    predicate is ``owl:sameAs``. Objects of the added triples that are URIs are
    collected in ``next_queue``.

    Args:
        entity: URI of the subject to expand; it is wrapped in ``URIRef``.
        graph: Source graph that is queried via ``graph.triples``.
        knowledgeGraph: Target graph; triples are added in place.
        next_queue: Set collecting the URI objects to expand in the next hop.

    Returns:
        The tuple ``(knowledgeGraph, next_queue)``, i.e. the same objects passed in.
    """
    subject = URIRef(entity)

    for s, p, o in graph.triples((subject, None, None)):
        # owl:sameAs links are neither stored nor followed.
        if p == OWL.sameAs:
            continue

        knowledgeGraph.add((s, p, o))

        # Only URI objects can be expanded further; literals end here.
        if isinstance(o, URIRef):
            next_queue.add(o)

    return knowledgeGraph, next_queue

def expand_kg(kg_type, data, graph, num_hops = 2, base_dir = "/workspace"):
    """Build a knowledge graph by breadth-first expansion from the mapped seed entities.

    The seed entities are read from the third column of the tab-separated,
    header-less file ``{base_dir}/{data}/Mapping2{kg_type}-1.2-corrected.tsv``.
    In each hop every not-yet-visited entity of the current frontier is expanded
    with ``process_dpbedia_entity`` or ``process_wikidata_entity`` (depending on
    ``kg_type``); the entities those calls collect form the frontier of the next hop.

    Args:
        kg_type: ``"DBpedia"`` or ``"Wikidata"``.
        data: ``"movielens"`` or ``"lastfm"``.
        graph: Source graph to query (passed through to the per-entity handlers).
        num_hops: Number of breadth-first hops to perform.
        base_dir: Directory that contains the ``{data}`` folder with the mapping file.

    Returns:
        A new ``Graph`` holding all triples collected during the expansion.

    Raises:
        AssertionError: If ``data`` or ``kg_type`` is not one of the supported values.
    """
    assert data in ["movielens", "lastfm"]
    assert kg_type in ["DBpedia", "Wikidata"]

    print("_____________________")
    print("Running queue for...")
    print(f"Data:\t\t {data:>15}")
    print(f"Knowledge Graph:\t  {kg_type:>15}")
    print(f"Number of Hops:\t  {num_hops:>15}")
    print("_____________________")

    # Get mappings; only the entity column (index 2) is needed here.
    data_df = pd.read_csv(f'{base_dir}/{data}/Mapping2{kg_type}-1.2-corrected.tsv', sep='\t', header=None, engine='python')

    # Seeds are normalised to URIRef because the handlers put URIRef objects into the
    # next frontier and rdflib's URIRef never compares equal to a plain str. Without
    # this, a seed reached again in a later hop would not be recognised as visited.
    queue = [URIRef(entity) for entity in data_df[2]]

    # Initialize Knowledge Graph
    knowledgeGraph = Graph()

    # A set gives O(1) membership tests; a list made the visited check a linear scan
    # per entity, which is quadratic over frontiers with hundreds of thousands of nodes.
    visited = set()

    # Both knowledge graphs share the same traversal; only the per-entity handler differs.
    if kg_type == "DBpedia":
        process_entity = process_dpbedia_entity
    else:
        process_entity = process_wikidata_entity

    # Run BFS
    for i in range(0, num_hops):
        print(f"Parse all URls {i+1} hop(s) away...")
        print(f"About to process {len(queue)} entities")
        next_queue = set()

        for entity in tqdm(queue):
            if entity in visited:
                continue
            knowledgeGraph, next_queue = process_entity(entity, graph, knowledgeGraph, next_queue)
            visited.add(entity)

        print(f"Finished {i+1} hop(s).\n")
        queue = next_queue

    return knowledgeGraph

# Movielens
## DBpedia

In [10]:
# Expand the MovieLens seed entities over DBpedia. The second hop dominates the
# runtime because the frontier grows by orders of magnitude after the first hop.
data, kg_type = "movielens", "DBpedia"

movielens_dbpedia_2hops = expand_kg(kg_type=kg_type, data=data, graph=graph, num_hops=2)

  1%|          | 17/3244 [00:00<00:19, 166.39it/s]

_____________________
Running queue for...
Data:		       movielens
Knowledge Graph:	          DBpedia
Number of Hops:	                2
_____________________
Parse all URls 1 hop(s) away...
About to process 3244 entities


100%|██████████| 3244/3244 [00:21<00:00, 153.69it/s]
  0%|          | 23/105964 [00:00<07:43, 228.41it/s]

Finished 1 hop(s).

Parse all URls 2 hop(s) away...
About to process 105964 entities


100%|██████████| 105964/105964 [1:06:54<00:00, 26.40it/s]


Finished 2 hop(s).



In [11]:
# Persist the complete 2-hop graph as N-Triples.
graph_file = "./movielens/2hopsDBpedia.nt"
movielens_dbpedia_2hops.serialize(destination=graph_file, format='nt')

# Build the literal-free variant: keep only triples whose object is a DBpedia resource URI.
# The isinstance check is required because rdflib Literals are str subclasses as well, so
# startswith() alone would also keep string literals whose text begins with the prefix.
graph_without_literals = rdflib.Graph()
for s,p,o in movielens_dbpedia_2hops.triples((None, None, None)):
  if isinstance(o, rdflib.term.URIRef) and o.startswith('http://dbpedia.org/resource/'):
    graph_without_literals.add((s,p,o))

graph_file = "./movielens/2hopsDBpediaNoLiterals.nt"
graph_without_literals.serialize(destination=graph_file, format='nt')

## Wikidata

In [None]:
# Expand the MovieLens seed entities over Wikidata (two hops).
data, kg_type = "movielens", "Wikidata"

movielens_wikidata_2hops = expand_kg(kg_type=kg_type, data=data, graph=graph, num_hops=2)

In [None]:
# Persist the complete 2-hop graph as N-Triples.
graph_file = "./movielens/2hopsWikidata.nt"
movielens_wikidata_2hops.serialize(destination=graph_file, format='nt')

# Build the literal-free variant: keep only triples whose object is a Wikidata item URI.
# The isinstance check is required because rdflib Literals are str subclasses as well, so
# startswith() alone would also keep string literals whose text begins with the prefix.
graph_without_literals = rdflib.Graph()
for s,p,o in movielens_wikidata_2hops.triples((None, None, None)):
  if isinstance(o, rdflib.term.URIRef) and o.startswith('http://www.wikidata.org/entity/Q'):
    graph_without_literals.add((s,p,o))

graph_file = "./movielens/2hopsWikidataNoLiterals.nt"
graph_without_literals.serialize(destination=graph_file, format='nt')

# Last.fm
## DBpedia

In [3]:
# Expand the Last.fm seed entities over DBpedia. The second hop dominates the
# runtime because the frontier grows by orders of magnitude after the first hop.
data, kg_type = "lastfm", "DBpedia"

lastfm_dbpedia_2hops = expand_kg(kg_type=kg_type, data=data, graph=graph, num_hops=2)

  0%|          | 0/8790 [00:00<?, ?it/s]

_____________________
Running queue for...
Data:		          lastfm
Knowledge Graph:	          DBpedia
Number of Hops:	                2
_____________________
Parse all URls 1 hop(s) away...
About to process 8790 entities


100%|██████████| 8790/8790 [00:53<00:00, 163.13it/s]
  0%|          | 23/352082 [00:00<25:49, 227.14it/s]

Finished 1 hop(s).

Parse all URls 2 hop(s) away...
About to process 352082 entities


100%|██████████| 352082/352082 [12:46:13<00:00,  7.66it/s]   

Finished 2 hop(s).






In [4]:
# Persist the complete 2-hop graph as N-Triples.
graph_file = "./lastfm/2hopsDBpedia.nt"
lastfm_dbpedia_2hops.serialize(destination=graph_file, format='nt')

# Build the literal-free variant: keep only triples whose object is a DBpedia resource URI.
# The isinstance check is required because rdflib Literals are str subclasses as well, so
# startswith() alone would also keep string literals whose text begins with the prefix.
graph_without_literals = rdflib.Graph()
for s,p,o in lastfm_dbpedia_2hops.triples((None, None, None)):
  if isinstance(o, rdflib.term.URIRef) and o.startswith('http://dbpedia.org/resource/'):
    graph_without_literals.add((s,p,o))

graph_file = "./lastfm/2hopsDBpediaNoLiterals.nt"
graph_without_literals.serialize(destination=graph_file, format='nt')

## Wikidata

In [None]:
# Expand the Last.fm seed entities over Wikidata (two hops).
data, kg_type = "lastfm", "Wikidata"

lastfm_wikidata_2hops = expand_kg(kg_type=kg_type, data=data, graph=graph, num_hops=2)

In [None]:
# Persist the complete 2-hop graph as N-Triples.
graph_file = "./lastfm/2hopsWikidata.nt"
lastfm_wikidata_2hops.serialize(destination=graph_file, format='nt')

# Build the literal-free variant: keep only triples whose object is a Wikidata item URI.
# The isinstance check is required because rdflib Literals are str subclasses as well, so
# startswith() alone would also keep string literals whose text begins with the prefix.
graph_without_literals = rdflib.Graph()
for s,p,o in lastfm_wikidata_2hops.triples((None, None, None)):
  if isinstance(o, rdflib.term.URIRef) and o.startswith('http://www.wikidata.org/entity/Q'):
    graph_without_literals.add((s,p,o))

graph_file = "./lastfm/2hopsWikidataNoLiterals.nt"
graph_without_literals.serialize(destination=graph_file, format='nt')