In [5]:
from rdflib_hdt import HDTStore
from rdflib import Graph

# Load an HDT file. Missing indexes are generated automatically
# You can provide the index file by putting it in the same directory as the HDT file.
# See https://www.rdfhdt.org/datasets/ for getting the HDT and the index file.
print("Loading HDTStore at ...")
store = HDTStore("./dbpedia2016-10.hdt.1")

# Display some metadata about the HDT document itself
print(f"Number of RDF triples: {len(store)}")
print(f"Number of subjects: {store.nb_subjects}")
print(f"Number of predicates: {store.nb_predicates}")
print(f"Number of objects: {store.nb_objects}")
print(f"Number of shared subject-object: {store.nb_shared}")

# Create an RDFlib Graph with the HDT document as a backend
graph = Graph(store=store)

Loading HDTStore at ...
Number of RDF triples: 1137003322
Number of subjects: 68940263
Number of predicates: 123002
Number of objects: 237607127
Number of shared subject-object: 50711395


# Movielens1M
## Implemented Functions

* check_mappings: marks the initial/generated mappings as potentially wrong
* URI_generator: generates a list of potential URIs for each item.
* find_correct: filter the list of potential URI names for each item, by the URI that seems to be a movie

In [6]:
import re
from tqdm import tqdm
from rdflib import Graph, URIRef, Namespace
from rdflib.namespace import RDFS, OWL, RDF

def check_mappings(movielens_mappings, graph):
  """ Mark the initial mappings as potentially wrong, if one of the following criteria holds:
      – the mapped entity seems to be wrong (the subject/abstract is not containing the year (range +1/-1) of publication, given in the name of the item)
      – the entity does not seem to be a movie, but e.g. the book the movie is based on (rdf:type does not contain SCHEMA.Movie or DBO.Film)
  Args:
      movielens_mappings (dict): Contains mappings for each item to its entity, provided by prior work.
      graph (Graph): RDFlib Graph with the HDT document as a backend.
  Returns:
      possibly_wrong_mappings (dict): Contains mappings for each item to its entity, if the mapped entity seems to be wrong.
  """
 
  SCHEMA = Namespace('http://schema.org/')
  DBO = Namespace('http://dbpedia.org/ontology/')
  DCT = Namespace("http://purl.org/dc/terms/")

  # Check for wrongly mapped entities
  possibly_wrong_mappings = {}
  for name, entity in tqdm(movielens_mappings.items()):

    # Step 1 – Check if subject/abstract is containing the year (range +1/-1) of publication
    year_name = re.findall(r'\(\d\d\d\d\)', name)[0][1:-1]
    year_name_range = [str(int(year_name)-1), year_name, str(int(year_name)+1)]
    year_subject_found = False
    is_contained = []
    for s, p, o in graph.triples((URIRef(entity), DCT.subject, None)):
      year_subject = re.findall(r'(\d\d\d\d)_films', o)
      if year_subject:
        year_subject = year_subject[0]
        year_subject_found = True
        year_subject_range = [str(int(year_subject)-1), year_subject, str(int(year_subject)+1)]
        is_contained = [i for i in year_name_range if i in year_subject_range]
        break
  
    if not year_subject_found:
      for s, p, o in graph.triples((URIRef(entity), DBO.abstract, None)):
        years_entity_abstract = re.findall(r'\d\d\d\d', o)
        is_contained = [i for i in year_name_range if i in years_entity_abstract]
        break

    if not is_contained:
      # print(f"'{name}': '{entity}' is possibly wrong because the subject and comment is not containing the year (range +1/-1) of publication.")
      possibly_wrong_mappings[name] = entity
      continue

    # Step 2 – Check if entity is a movie (rdf:type does contain SCHEMA.Movie or DBO.Film)
    is_movie = False
    for s, p, o in graph.triples((URIRef(entity), RDF.type, None)):
      if o == SCHEMA.Movie or o == DBO.Film:
        is_movie = True

    if not is_movie:
      # print(f"'{name}': '{entity}' is possibly wrong because it tends to be no movie.")
      possibly_wrong_mappings[name] = entity

  return possibly_wrong_mappings

def URI_generator(movielens_names):
    """ Generates a list of potential URIs for each item. 
        Potential URI names are based on personal experience, how correct URIs seems to look.
    Args: 
        movielens_names (list): Names of all items in the movielens dataset.
    Returns:
        URI_dict (dict): Contains mappings for each item to its list of potential URI names.
    """

    DBR = "http://dbpedia.org/resource/"

    URI_dict = {}
    for idx, name in enumerate(movielens_names):
        
        year_name = re.findall(r'\(\d\d\d\d\)', name)[0]
        second_name = re.findall(r'\(([^\)]+)\)', name)
        if len(second_name) >= 2:
          name = name.replace(second_name[0], "")
        
        # Resolve Mummy's Curse, The (1944) in The Mummy's Curse (1944)
        articles = [", The", ", An", ", A", ", El", ", Der", ", La", ", Les", ", L'", ", Le", ", O", ", Il"]
        for article in articles:
            if article in name:
                # Split Mummy's Curse, The (1944) in ["Mummy's Curse", 'The (1944)']
                name_split = name.split(", ")
                # Create correct name
                if article == ", L'":
                  name = article.split(", ")[1] + " ".join([name_split[0], year_name])
                else:
                  name = " ".join([article.split(", ")[1], name_split[0], year_name])
                break
        
        # Create URI form of the Name
        year_name = re.findall(r'\(\d\d\d\d\)', name)[0][1:-1]

        name = name.replace("?", "%3F")
        uriname = name.replace(" ", "_")
        
        base = DBR + uriname.split("_(", 1)[0]
        film = base + "_(film)"

        year = int(year_name)
        year_film_1= base + f"_({year-1}_film)"
        year_film_2= base + f"_({year}_film)"
        year_film_3= base + f"_({year+1}_film)"
        
        URI_dict[movielens_names[idx]] = [year_film_1, year_film_2, year_film_3, film, base]

        if second_name:
          second_name = second_name[0]
          if not second_name.isdigit():
            articles = [", The", ", An", ", A", ", El", ", Der", ", La", ", Les", ", L'", ", Le", ", O", ", Il"]
            for article in articles:
              if article in second_name:
                  # Split Mummy's Curse, The (1944) in ["Mummy's Curse", 'The (1944)']
                  name_split = second_name.split(", ")
                  year_name_str = "("+year_name+")"
                  # Create correct name
                  if article == ", L'":
                    second_name = article.split(", ")[1] + name_split[0]
                  else:
                    second_name = " ".join([article.split(", ")[1], name_split[0]])
                  break

            second_name = second_name.replace("?", "%3F")
            second_name_uriname = second_name.replace(" ", "_")

            base = DBR + second_name_uriname.rsplit("_(", 1)[0]
            film = base + "_(film)"

            year = int(year_name)
            year_film_1= base + f"_({year-1}_film)"
            year_film_2= base + f"_({year}_film)"
            year_film_3= base + f"_({year+1}_film)"

            URI_dict[movielens_names[idx]].append(year_film_1)
            URI_dict[movielens_names[idx]].append(year_film_2)
            URI_dict[movielens_names[idx]].append(year_film_3)
            URI_dict[movielens_names[idx]].append(film)
            URI_dict[movielens_names[idx]].append(base)

        if "(" in base:
          uriname = name.replace(" ", "_")

          base = DBR + uriname.rsplit("_(", 2)[0]
          film = base + "_(film)"

          year = int(year_name)
          year_film_1= base + f"_({year-1}_film)"
          year_film_2= base + f"_({year}_film)"
          year_film_3= base + f"_({year+1}_film)"

          URI_dict[movielens_names[idx]].append(year_film_1)
          URI_dict[movielens_names[idx]].append(year_film_2)
          URI_dict[movielens_names[idx]].append(year_film_3)
          URI_dict[movielens_names[idx]].append(film)
          URI_dict[movielens_names[idx]].append(base)

        if ": " in name:
          uriname = name.replace(" ", "_")

          base = DBR + uriname.split(":_")[0]
          film = base + "_(film)"

          year = int(year_name)
          year_film_1= base + f"_({year-1}_film)"
          year_film_2= base + f"_({year}_film)"
          year_film_3= base + f"_({year+1}_film)"

          URI_dict[movielens_names[idx]].append(year_film_1)
          URI_dict[movielens_names[idx]].append(year_film_2)
          URI_dict[movielens_names[idx]].append(year_film_3)
          URI_dict[movielens_names[idx]].append(film)
          URI_dict[movielens_names[idx]].append(base)

        if "%3F" in name:
          uriname = name.replace(" ", "_")
          uriname = uriname.replace("%3F", "")

          base = DBR + uriname.rsplit("_(", 1)[0]
          film = base + "_(film)"

          year = int(year_name)
          year_film_1= base + f"_({year-1}_film)"
          year_film_2= base + f"_({year}_film)"
          year_film_3= base + f"_({year+1}_film)"

          URI_dict[movielens_names[idx]].append(year_film_1)
          URI_dict[movielens_names[idx]].append(year_film_2)
          URI_dict[movielens_names[idx]].append(year_film_3)
          URI_dict[movielens_names[idx]].append(film)
          URI_dict[movielens_names[idx]].append(base)
        
    return URI_dict

def find_correct(URI_dict, graph):
  """ Filter the list of potential URI names for each item, by the URI that seems to be a movie (rdf:type does contain SCHEMA.Movie or DBO.Film)
      Note that we here don't have to check anymore for the year constraint, as we take care of that by the generated URIs.
      We could be more exact here, but we tend to do almost 100% alright with the current procedure.
  Args: 
      URI_dict (dict): Contains mappings for each item to its list of potential URI names.
      graph (Graph): RDFlib Graph with the HDT document as a backend.
  Returns:
      correct_URIs (dict): Contains mappings for each item to its potential URI, maps item to -1 otherwise.
  """

  SCHEMA = Namespace('http://schema.org/')
  DBO = Namespace('http://dbpedia.org/ontology/')

  correct_URIs = {}
  for name, URI_list in tqdm(URI_dict.items()):
    correct_URIs[name] = -1

    for entity in URI_list:

        # Step 1 – Handle Disambiguation Page
        # Check, if one of the objects listed is a movie (rdf:type does contain SCHEMA.Movie or DBO.Film)
        pageRedirects = set()
        for s, p, o in graph.triples((URIRef(entity), DBO.wikiPageRedirects, None)):
            pageRedirects.add(o)

        rp_movie = False
        for rp in pageRedirects:
          for s, p, o in graph.triples((URIRef(rp), None, None)):
            if p == RDF.type:
              if o == SCHEMA.Movie or o == DBO.Film:
                rp_movie = True

        if rp_movie:
          correct_URIs[name] = str(rp)
          break

        # Step 2 – Check if the entity is a movie (rdf:type does contain SCHEMA.Movie or DBO.Film)
        movie = False
        for s, p, o in graph.triples((URIRef(entity), None, None)):
          if p == RDF.type:
            if o == SCHEMA.Movie or o == DBO.Film:
              movie = True
              break
        
        if movie:
          correct_URIs[name] = entity
          break

  return correct_URIs

## Get URIs

In [7]:
import pandas as pd

# Load data
movielens_df = pd.read_csv('./movielens/MappingMovielens2DBpedia-1.2.tsv', sep='\t', header=None, engine='python')
movielens_df.columns = ['ID', 'Movie', 'URI']
movielens_ids = list(movielens_df['ID'])
movielens_names = list(movielens_df['Movie'])
movielens_entities = list(movielens_df['URI'])

# Create dict that contains mappings for each item to its entity, provided by prior work.
movielens_mappings = pd.Series(movielens_df.URI.values,index=movielens_df.Movie).to_dict()

## Control URIs

In [8]:
# Get dict that contains mappings for each item to its entity, if the mapped entity seems to be wrong.
possibly_wrong_mappings = check_mappings(movielens_mappings, graph)
print(len(possibly_wrong_mappings))

100%|██████████| 3301/3301 [00:00<00:00, 6746.43it/s]

405





## Get Correct URIs

In [9]:
# Generates a list of potential URIs for each item.
URI_dict = URI_generator(movielens_names)

# Filter the list of potential URI names for each item, by the URI that seems to be a movie (rdf:type does contain SCHEMA.Movie or DBO.Film)
correct_URIs = find_correct(URI_dict, graph)

100%|██████████| 3301/3301 [00:06<00:00, 517.60it/s]


In [11]:
# Seperate correct_URIs in two dicts:

# 1. not_found_URIs contains all the mappings of items, no potential correct URI could be found for
not_found_URIs = {key:value for (key,value) in correct_URIs.items() if value == -1}
print(len(not_found_URIs))

# 2. found_URIs contains all the mappings of items, a potential correct URI could be found for successfully
found_URIs = {key:value for (key,value) in correct_URIs.items() if value != -1}
print(len(found_URIs))

83
3218


In [12]:
# Check, which of the found_URIs mappings could still be wrong.
possibly_wrong_found_URIs = check_mappings(found_URIs, graph)
print(len(possibly_wrong_found_URIs))

100%|██████████| 3218/3218 [00:00<00:00, 6990.10it/s]

27





## Make Corrections

In [13]:
print(len(possibly_wrong_mappings), "of the initial URIs are possibly wrong.")

not_found_intersection = [i for i in possibly_wrong_mappings.keys() if i in not_found_URIs.keys()]
print("Possibly wrong URIs that can't be corrected because no URI could be found:", len(not_found_intersection))

found_double_possible_wrong_intersection = [i for i in possibly_wrong_mappings.keys() if i in possibly_wrong_found_URIs.keys()]
print("Possibly wrong can't be corrected beacuse found URI is also possibly wrong:", len(found_double_possible_wrong_intersection))

found_intersection = [i for i in possibly_wrong_mappings.keys() if i in found_URIs.keys() and i not in possibly_wrong_found_URIs.keys()]
print("Possibly wrong can be corrected, because found URI seems to be valid:", len(found_intersection))

405 of the initial URIs are possibly wrong.
Possibly wrong URIs that can't be corrected because no URI could be found: 37
Possibly wrong can't be corrected beacuse found URI is also possibly wrong: 18
Possibly wrong can be corrected, because found URI seems to be valid: 350


In [14]:
# Update wrong URIs with found valid URIs
count = 0
for key in found_intersection:
  if possibly_wrong_mappings[key] != found_URIs[key]:
    index = movielens_df.loc[movielens_df['Movie'] == key].index
    movielens_df.at[index, 'URI'] = found_URIs[key]
    count += 1
    print(f"{key:<45} {possibly_wrong_mappings[key]:<70} \t {found_URIs[key]:<45}")
  
print(f"Updated {count} URIs")

Love's Labour's Lost (2000)                   http://dbpedia.org/resource/Love's_Labour's_Lost_(2000_film)           	 http://dbpedia.org/resource/Love's_Labour's_Lost_(film)
Blood Feast (1963)                            http://dbpedia.org/resource/Blood_Feast_(1972_film)                    	 http://dbpedia.org/resource/Blood_Feast      
Kingpin (1996)                                http://dbpedia.org/resource/Kingpin                                    	 http://dbpedia.org/resource/Kingpin_(1996_film)
Everything You Always Wanted to Know About Sex (1972) http://dbpedia.org/resource/Everything_You_Always_Wanted_to_Know_About_Sex*_(*But_Were_Afraid_to_Ask) 	 http://dbpedia.org/resource/Everything_You_Always_Wanted_to_Know_About_Sex*_(*But_Were_Afraid_to_Ask)_(film)
Mondo (1996)                                  http://dbpedia.org/resource/Mondo_cane                                 	 http://dbpedia.org/resource/Mondo_(film)     
Stars and Bars (1988)                         http://dbpedia.

In [15]:
# Delete wrong URIs, where a URI is found but the found URI seems to also be possibly wrong
for key in found_double_possible_wrong_intersection:
    index = movielens_df.loc[movielens_df['Movie'] == key].index
    movielens_df = movielens_df.drop(index)
    print(f"{key:<45} {possibly_wrong_mappings[key]:<70} \t {possibly_wrong_found_URIs[key]:<45}")

Acid House, The (1998)                        http://dbpedia.org/resource/The_Acid_House_(film)                      	 http://dbpedia.org/resource/The_Acid_House_(film)
Solar Crisis (1993)                           http://dbpedia.org/resource/Solar_Crisis_(film)                        	 http://dbpedia.org/resource/Solar_Crisis_(film)
Breathing Room (1996)                         http://dbpedia.org/resource/Breathing_Room                             	 http://dbpedia.org/resource/Breathing_Room   
Century of Cinema, A (1994)                   http://dbpedia.org/resource/A_Century_of_Cinema                        	 http://dbpedia.org/resource/A_Century_of_Cinema
Choices (1981)                                http://dbpedia.org/resource/Choices_(film)                             	 http://dbpedia.org/resource/Choices_(film)   
Full Speed (1996)                             http://dbpedia.org/resource/Full_Speed                                 	 http://dbpedia.org/resource/Full_Speed       
Em

In [16]:
# Delete wrong URIs, where no other URI could be found
for key in not_found_intersection:
    index = movielens_df.loc[movielens_df['Movie'] == key].index
    movielens_df = movielens_df.drop(index)
    print(f"{key:<45} {possibly_wrong_mappings[key]:<70} \t {not_found_URIs[key]:<45}")

Boys of St. Vincent, The (1993)               http://dbpedia.org/resource/The_Boys_of_St._Vincent                    	 -1                                           
Last Resort (1994)                            http://dbpedia.org/resource/Last_Resort_(1986_film)                    	 -1                                           
Blood & Wine (1997)                           http://dbpedia.org/resource/Blood_Diamond_(film)                       	 -1                                           
Contact (1997)                                http://dbpedia.org/resource/Contact                                    	 -1                                           
Emma (1996)                                   http://dbpedia.org/resource/Emma                                       	 -1                                           
Homage (1995)                                 http://dbpedia.org/resource/Homage_to_Chagall:_The_Colours_of_Love     	 -1                                           
Eden (1997

In [17]:
# Check if duplicates exist
movielens_df[movielens_df.duplicated(['URI'], keep=False)]

Unnamed: 0,ID,Movie,URI
1272,3065,Ten Benny (1997),http://dbpedia.org/resource/Ten_Benny
1487,811,"Bewegte Mann, Der (1994)",http://dbpedia.org/resource/Der_bewegte_Mann
1717,860,"Maybe, Maybe Not (Bewegte Mann, Der) (1994)",http://dbpedia.org/resource/Der_bewegte_Mann
2386,875,Nothing to Lose (1994),http://dbpedia.org/resource/Ten_Benny


In [18]:
movielens_df.shape

(3246, 3)

In [19]:
# Delete Duplicates but keep first element, otherwise, we will have Problems with Index Mappings in the function later
movielens_df.drop_duplicates(subset ="URI", keep = 'first', inplace = True) 

In [20]:
movielens_df.shape

(3244, 3)

## Make the Update

In [21]:
movielens_df.to_csv('./movielens/Mapping2DBpedia-1.2-corrected.tsv', sep='\t', index = False, header = False, encoding="utf-8")

# Lastfm
## Implemented Functions

* check_mappings: marks initial/generated mappings as potentially wrong
* URI_generator: generates a list of potential URIs for each item.
* find_correct: filter the list of potential URI names for each item, by the URI that seems to be a artist

In [22]:
import re
from tqdm import tqdm
from rdflib import Graph, URIRef, Namespace
from rdflib.namespace import RDFS, OWL, RDF

def check_mappings(lastfm_mappings, graph):
    """ Mark the mapped entity as potentially wrong, if one of the following criteria holds:
      – the mapped entity seems to be wrong (the abstract is empty or the name does not match).
      – the entity does not seem to be an artist (rdf:type does not contain DBO.MusicalArtist or DBO.Band or SCHEMA.MusicGroup or DBO.Artist)
    Args:
        lastfm_mappings (dict): Contains mappings for each item to its entity, provided by prior work.
        graph (Graph): RDFlib Graph with the HDT document as a backend.
    Returns:
        possibly_wrong_mappings (dict): Contains mappings for each item to its entity, if the mapped entity seems to be wrong.
    """

    SCHEMA = Namespace('http://schema.org/')
    DBR = "http://dbpedia.org/resource/"
    DBO = Namespace('http://dbpedia.org/ontology/')
    DCT = Namespace("http://purl.org/dc/terms/") 

    # Check for wrongly mapped entities
    possibly_wrong_mappings = {}
    different_name = {}
    for name, entity in tqdm(lastfm_mappings.items()):

        # Step 1 – Check if abstract is not empty and the names match
        is_not_empty = False
        for s, p, o in graph.triples((URIRef(entity), DBO.abstract, None)):
          is_not_empty = True
    
        if not is_not_empty:
          #print(f"'{name}': '{entity}' is possibly wrong because empty.")
          possibly_wrong_mappings[name] = entity
          continue

        uriname = name.replace(" ", "_")
        base = DBR + uriname
        entity_cleaned = entity

        if entity[-1] == ")":
            splitted_list = entity.split("(")[0]
            entity_cleaned = splitted_list[:-1]

        if entity_cleaned.lower() != base.lower():
            #print(f"'{name}': '{entity}' is possibly wrong because different name.")
            possibly_wrong_mappings[name] = entity
            continue
        
        # Step 2 – Check if entity is a artist (rdf:type does contain DBO.MusicalArtist or DBO.Band or SCHEMA.MusicGroup or DBO.Artist)
        is_artist = False
        for s, p, o in graph.triples((URIRef(entity), RDF.type, None)):
          if o == DBO.MusicalArtist or o == DBO.Band or o == SCHEMA.MusicGroup or o == DBO.Artist:
            is_artist = True

        if not is_artist:
          #print(f"'{name}': '{entity}' is possibly wrong because it tends to be no artist.")
          possibly_wrong_mappings[name] = entity
  
    return possibly_wrong_mappings

def URI_generator(lastfm_names):
    """ Generates a list of potential URIs for each item. 
        Potential URI names are based on personal experience, how correct URIs seems to look.
    Args: 
        lastfm_names (list): Names of all items in the movielens dataset.
    Returns:
        URI_dict (dict): Contains mappings for each item to its list of potential URI names.
    """
    DBR = "http://dbpedia.org/resource/"

    URI_dict = {}
    for idx, name in enumerate(lastfm_names):
        
        uriname = name.replace(" ", "_")
        
        base = DBR + uriname
        band = base + "_(band)"
        musician = base + "_(musician)"
        singer = base + "_(singer)"
        
        URI_dict[lastfm_names[idx]] = [band, musician, singer, base]
        
    return URI_dict

def find_correct(URI_dict, graph):
  """ Filter the list of potential URI names for each item, by the URI that seems to be an artist (rdf:type does not contain DBO.MusicalArtist or DBO.Band or SCHEMA.MusicGroup or DBO.Artist)
      Note that we here don't have to check anymore for the year constraint, as we take care of that by the generated URIs.
      We could be more exact here, but we tend to do almost 100% alright with the current procedure.
  Args: 
      URI_dict (dict): Contains mappings for each item to its list of potential URI names.
  Returns:
      correct_URIs (dict): Contains mappings for each item to its potential URI, maps item to -1 otherwise.
  """

  SCHEMA = Namespace('http://schema.org/')
  DBO = Namespace('http://dbpedia.org/ontology/')

  correct_URIs = {}
  for name, URI_list in tqdm(URI_dict.items()):
    correct_URIs[name] = -1

    for entity in URI_list:

        # Step 1 – Handle Disambiguation Page
        # Check, if one of the objects listed is an artist (rdf:type does not contain DBO.MusicalArtist or DBO.Band or SCHEMA.MusicGroup or DBO.Artist)
        pageRedirects = set()
        for s, p, o in graph.triples((URIRef(entity), DBO.wikiPageRedirects, None)):
          pageRedirects.add(o)

        rp_artist = False
        for rp in pageRedirects:
          for s, p, o in graph.triples((URIRef(rp), RDF.type, None)):
            if o == DBO.MusicalArtist or o == DBO.Band or o == SCHEMA.MusicGroup or o == DBO.Artist:
              rp_artist = True

        if rp_artist:
          correct_URIs[name] = str(rp)
          break

        # Step 2 – Check if the entity is an artist (rdf:type does not contain DBO.MusicalArtist or DBO.Band or SCHEMA.MusicGroup or DBO.Artist)
        artist = False
        for s, p, o in graph.triples((URIRef(entity), RDF.type, None)):
          if p == RDF.type:
            if o == DBO.MusicalArtist or o == DBO.Band or o == SCHEMA.MusicGroup or o == DBO.Artist:
              artist = True
              break
        
        if artist:
          correct_URIs[name] = entity
          break

  return correct_URIs

## Get URIs

In [23]:
import pandas as pd

# Load data
lastfm_df = pd.read_csv('./lastfm/MappingLastfm2DBpedia-1.2.tsv', sep='\t', header=None, engine='python')
lastfm_df.columns = ['ID', 'Artist', 'URI']
lastfm_ids = list(lastfm_df['ID'])
lastfm_names = list(lastfm_df['Artist'])
lastfm_entities = list(lastfm_df['URI'])

# Create dict that contains mappings for each item to its entity, provided by prior work.
lastfm_dict = pd.Series(lastfm_df.URI.values,index=lastfm_df.Artist).to_dict()

## Analyze URIs
First we analyze the initial URIs to get well working conditions for the "check"-function.<br>
We have to do this, as the criteria is not as obvious as it was for the movielens dataset.<br>

Based on the following output, we define the criteria as:<br> 
_entity is a artist (rdf:type does contain DBO.MusicalArtist or DBO.Band or SCHEMA.MusicGroup or DBO.Artist)_

In [24]:
import re
from tqdm import tqdm
from rdflib import Graph, URIRef, Namespace
from rdflib.namespace import RDFS, OWL, RDF

def analyze_entitities(entities):
  """ Return Frequency dict of tags (content in paranthesis) of the inial DBPedia URIs
  Args:
      lastfm_entities (list): Intial URIs of the lastfm dataset.
  Returns:
      tag_dict (dict): Maps each tag to its occurence frequency.
  """

  tag_dict = {}
  for enitity in lastfm_entities:
    name_extra = re.findall('\(.*?\)', enitity)
    if name_extra:
      entitiy_name = name_extra[0]
      tag_dict[entitiy_name] = tag_dict.get(entitiy_name, 0) + 1

  return tag_dict

def analyze_tags(lastfm_entities, graph):
  """ Return Frequency dict of found rdf.types of the inial DBPedia URIs
  Args:
      lastfm_entities (list): Intial URIs of the lastfm dataset.
      graph (Graph): RDFlib Graph with the HDT document as a backend.
  Returns:
      type_dict (dict): Maps each rdf.type to its occurence frequency.
  """

  type_dict = {}
  for entity in tqdm(lastfm_entities):
    for s, p, o in graph.triples((URIRef(entity), RDF.type, None)):
      type_dict[o] = type_dict.get(o, 0) + 1

  return type_dict

In [33]:
# Get frequency dict of tags (content in paranthesis) of the inial DBPedia URIs
name_dict = analyze_entitities(lastfm_entities)
for w in sorted(name_dict, key=name_dict.get, reverse=True):
  if name_dict[w] > 10:
    print(w, name_dict[w])

(band) 1388
(musician) 181
(singer) 137
(rapper) 54
(group) 32
(American_band) 18
(British_band) 14
(composer) 11


In [34]:
# Get frequency dict of found rdf.types of the inial DBPedia URIs
tags_dict = analyze_tags(lastfm_entities, store)
for w in sorted(tags_dict, key=tags_dict.get, reverse=True):
  if tags_dict[w] > 500:
    print(w, tags_dict[w])

100%|██████████| 10180/10180 [00:04<00:00, 2037.04it/s]

http://www.w3.org/2002/07/owl#Thing 9844
http://dbpedia.org/ontology/Agent 9810
http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Agent 9807
http://www.wikidata.org/entity/Q24229398 9807
http://schema.org/MusicGroup 9420
http://dbpedia.org/ontology/Band 6448
http://dbpedia.org/ontology/Organisation 5957
http://schema.org/Organization 5954
http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#SocialPerson 5954
http://www.wikidata.org/entity/Q43229 5954
http://dbpedia.org/ontology/Group 5953
http://www.wikidata.org/entity/Q215380 5952
http://dbpedia.org/ontology/MusicalArtist 4547
http://dbpedia.org/ontology/Person 4028
http://schema.org/Person 3853
http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#NaturalPerson 3853
http://www.wikidata.org/entity/Q215627 3853
http://www.wikidata.org/entity/Q5 3853
http://xmlns.com/foaf/0.1/Person 3853
http://dbpedia.org/ontology/Artist 3468
http://www.wikidata.org/entity/Q483501 3468
http://umbel.org/umbel/rc/Band_MusicGroup 2076
http://umbel.or




## Control URIs

In [25]:
# Get dict that contains mappings for each item to its entity, if the mapped entity seems to be wrong.
possibly_wrong_mappings = check_mappings(lastfm_dict, graph)
print(len(possibly_wrong_mappings))

100%|██████████| 10180/10180 [00:02<00:00, 4312.30it/s]

1676





## Get Correct URIs

In [26]:
# Generates a list of potential URIs for each item.
URI_dict = URI_generator(lastfm_names)

# Filter the list of potential URI names for each item, by the URI that seems to be an artist (rdf:type does not contain DBO.MusicalArtist or DBO.Band or SCHEMA.MusicGroup or DBO.Artist)
correct_URIs = find_correct(URI_dict, graph)

100%|██████████| 10180/10180 [00:02<00:00, 4996.21it/s]


In [27]:
# Seperate correct_URIs in two dicts:

# 1. not_found_URIs contains all the mappings of items, no potential correct URI could be found for
not_found_URIs = {key:value for (key,value) in correct_URIs.items() if value == -1}
print(len(not_found_URIs))

# 2. found_URIs contains all the mappings of items, a potential correct URI could be found for successfully
found_URIs = { key:value for (key,value) in correct_URIs.items() if value != -1}
print(len(found_URIs))

1115
9065


In [28]:
# Check, which of the found_URIs mappings could still be wrong.
possibly_wrong_found_URIs = check_mappings(found_URIs, graph)
print(len(possibly_wrong_found_URIs))

100%|██████████| 9065/9065 [00:02<00:00, 4157.52it/s]

599





## Make Corrections

In [29]:
print(len(possibly_wrong_mappings), "URIs are possibly wrong.")

not_found_intersection = [i for i in possibly_wrong_mappings.keys() if i in not_found_URIs.keys()]
print("Possibly wrong URIs that can't be corrected because no URI could be found:", len(not_found_intersection))

found_double_possible_wrong_intersection = [i for i in possibly_wrong_mappings.keys() if i in possibly_wrong_found_URIs.keys()]
print("Possibly wrong can't be corrected beacuse found URI is also possibly wrong:", len(found_double_possible_wrong_intersection))

found_intersection = [i for i in possibly_wrong_mappings.keys() if i in found_URIs.keys() and i not in possibly_wrong_found_URIs.keys()]
print("Possibly wrong can be corrected, because found URI seems to be valid:", len(found_intersection))

1676 URIs are possibly wrong.
Possibly wrong URIs that can't be corrected because no URI could be found: 824
Possibly wrong can't be corrected beacuse found URI is also possibly wrong: 566
Possibly wrong can be corrected, because found URI seems to be valid: 286


In [30]:
# Update Found Artist
count = 0
for key in found_intersection:
  if possibly_wrong_mappings[key] != found_URIs[key]:
    # Update df accordingly to found_URI
    index = lastfm_df.loc[lastfm_df['Artist'] == key].index
    lastfm_df.at[index, 'URI'] = found_URIs[key]
    count += 1
    print(f"{key:<45} {possibly_wrong_mappings[key]:<70} \t {found_URIs[key]:<45}")
  
print(f"Updated {count} URIs")

Star One                                      http://dbpedia.org/resource/Star_One                                   	 http://dbpedia.org/resource/Star_One_(band)  
Act                                           http://dbpedia.org/resource/Hani_(producer)                            	 http://dbpedia.org/resource/Act_(band)       
Goldie                                        http://dbpedia.org/resource/Goldie_Loc                                 	 http://dbpedia.org/resource/Goldie_(band)    
Wolverine                                     http://dbpedia.org/resource/The_Wolverines_(rock_band)                 	 http://dbpedia.org/resource/Wolverine_(band) 
Annuals                                       http://dbpedia.org/resource/Annual                                     	 http://dbpedia.org/resource/Annuals_(band)   
Karat                                         http://dbpedia.org/resource/Jean_Karat                                 	 http://dbpedia.org/resource/Karat_(band)     
Esteban   

Wings                                         http://dbpedia.org/resource/Paul_McCartney_and_Wings                   	 http://dbpedia.org/resource/Wings_(band)     
Sensational                                   http://dbpedia.org/resource/Sensational_Nightingales                   	 http://dbpedia.org/resource/Sensational_(musician)
High on Fire                                  http://dbpedia.org/resource/High_On_Fire                               	 http://dbpedia.org/resource/High_on_Fire     
Torche                                        http://dbpedia.org/resource/Torche                                     	 http://dbpedia.org/resource/Torche_(band)    
Slums Attack                                  http://dbpedia.org/resource/Peja_(rapper)                              	 http://dbpedia.org/resource/Slums_Attack     
Drake                                         http://dbpedia.org/resource/Drake_(rapper)                             	 http://dbpedia.org/resource/Drake_(musician) 
Fulan

In [31]:
# Delete Wrong Found Artist
for key in found_double_possible_wrong_intersection:
    index = lastfm_df.loc[lastfm_df['Artist'] == key].index
    lastfm_df = lastfm_df.drop(index)
    print(f"{key:<45} {possibly_wrong_mappings[key]:<70} \t {possibly_wrong_found_URIs[key]:<45}")# Delete Wrong Found Movies

Twin Sister                                   http://dbpedia.org/resource/Mr._Twin_Sister                            	 http://dbpedia.org/resource/Mr_Twin_Sister   
UK Subs                                       http://dbpedia.org/resource/U.K._Subs                                  	 http://dbpedia.org/resource/U.K._Subs        
Thao with The Get Down Stay Down              http://dbpedia.org/resource/Thao_&_The_Get_Down_Stay_Down              	 http://dbpedia.org/resource/Thao_&_the_Get_Down_Stay_Down
The Bug                                       http://dbpedia.org/resource/Luv_Bug                                    	 http://dbpedia.org/resource/Kevin_Martin_(British_musician)
There For Tomorrow                            http://dbpedia.org/resource/There_for_Tomorrow                         	 http://dbpedia.org/resource/Afterhour_(band) 
Reverend Horton Heat                          http://dbpedia.org/resource/The_Reverend_Horton_Heat                   	 http://dbpedia.org/resource/Th

The London Philharmonic Orchestra             http://dbpedia.org/resource/London_Philharmonic_Orchestra              	 http://dbpedia.org/resource/London_Philharmonic_Orchestra
Pretty Poison                                 http://dbpedia.org/resource/Pretty_Poison                              	 http://dbpedia.org/resource/Pretty_Poison    
Emily Haines & the Soft Skeleton              http://dbpedia.org/resource/Emily_Haines                               	 http://dbpedia.org/resource/Emily_Haines     
Machine Drum                                  http://dbpedia.org/resource/Machinedrum                                	 http://dbpedia.org/resource/Machinedrum      
Clazziquai Project                            http://dbpedia.org/resource/Clazziquai                                 	 http://dbpedia.org/resource/Clazziquai       
78violet                                      http://dbpedia.org/resource/78violet                                   	 http://dbpedia.org/resource/Aly_&_AJ        

Libido                                        http://dbpedia.org/resource/Mattias_Eklundh                            	 http://dbpedia.org/resource/Líbido_(band)    
Mya                                           http://dbpedia.org/resource/Jay_Mya                                    	 http://dbpedia.org/resource/Mýa              
Brandy                                        http://dbpedia.org/resource/Brandy_Norwood                             	 http://dbpedia.org/resource/Brandy_Norwood   
Jorge Ben                                     http://dbpedia.org/resource/Jorge_Ben_Jor                              	 http://dbpedia.org/resource/Jorge_Ben_Jor    
Lonny Breaux                                  http://dbpedia.org/resource/Frank_Ocean                                	 http://dbpedia.org/resource/Frank_Ocean      
The Recoys                                    http://dbpedia.org/resource/The_Recoys                                 	 http://dbpedia.org/resource/Hamilton_Leithauser
Pryda   

Ian Van Dahl                                  http://dbpedia.org/resource/AnnaGrace                                  	 http://dbpedia.org/resource/AnnaGrace        
The Grouch & Eligh                            http://dbpedia.org/resource/The_Grouch_(rapper)                        	 http://dbpedia.org/resource/Eligh            
The Advantage                                 http://dbpedia.org/resource/The_Advantage                              	 http://dbpedia.org/resource/The_Advantage    
Smog                                          http://dbpedia.org/resource/Golden_Smog                                	 http://dbpedia.org/resource/Bill_Callahan_(musician)
Finn                                          http://dbpedia.org/resource/Tim_Finn                                   	 http://dbpedia.org/resource/Finn_(band)      
GZA/Genius                                    http://dbpedia.org/resource/Gza                                        	 http://dbpedia.org/resource/GZA              
Deb

In [32]:
for key in not_found_intersection:
    index = lastfm_df.loc[lastfm_df['Artist'] == key].index
    lastfm_df = lastfm_df.drop(index)
    print(f"{key:<45} {possibly_wrong_mappings[key]:<70} \t {not_found_URIs[key]:<45}")

Deadmau5 & Wolfgang Gartner                   http://dbpedia.org/resource/Deadmau5                                   	 -1                                           
7Seconds                                      http://dbpedia.org/resource/7_Seconds_(band)                           	 -1                                           
mono inc. & lisa middelhauve                  http://dbpedia.org/resource/Mono_Inc.                                  	 -1                                           
Yui Makino                                    http://dbpedia.org/resource/Yui_Makino                                 	 -1                                           
Beth Gibbons & Rustin Man                     http://dbpedia.org/resource/Beth_Gibbons                               	 -1                                           
The Weird Sisters                             http://dbpedia.org/resource/Three_Weird_Sisters_(band)                 	 -1                                           
Olu       

Yolanda Be Cool & DCUP                        http://dbpedia.org/resource/Yolanda_Be_Cool                            	 -1                                           
Killwhitneydead                               http://dbpedia.org/resource/Killwhitneydead                            	 -1                                           
Aida                                          http://dbpedia.org/resource/Aida_Nadeem                                	 -1                                           
Benedetti & Svoboda                           http://dbpedia.org/resource/Myria_Benedetti                            	 -1                                           
Marius                                        http://dbpedia.org/resource/Marius_Moga                                	 -1                                           
Tigarah                                       http://dbpedia.org/resource/Tigarah                                    	 -1                                           
The Up On 

Dr. Octagon                                   http://dbpedia.org/resource/Kool_Keith                                 	 -1                                           
Zander                                        http://dbpedia.org/resource/Robin_Zander                               	 -1                                           
This Romantic Tragedy                         http://dbpedia.org/resource/This_Romantic_Tragedy                      	 -1                                           
Dylan & Robyn Chaos                           http://dbpedia.org/resource/Dylan_Michal                               	 -1                                           
presto?                                       http://dbpedia.org/resource/Presto_Ballet                              	 -1                                           
Spm                                           http://dbpedia.org/resource/South_Park_Mexican                         	 -1                                           
Overseer  

Taylor Swift & Def Leppard                    http://dbpedia.org/resource/Taylor_Swift                               	 -1                                           
Alan                                          http://dbpedia.org/resource/Aslan_(band)                               	 -1                                           
Tonka                                         http://dbpedia.org/resource/DJ_Tonka                                   	 -1                                           
Yahel                                         http://dbpedia.org/resource/Yinon_Yahel                                	 -1                                           
Narcosis                                      http://dbpedia.org/resource/Narcosis                                   	 -1                                           
Popcorn                                       http://dbpedia.org/resource/Richard_%22Popcorn%22_Wylie                	 -1                                           
Breathe In

Hive                                          http://dbpedia.org/resource/Hive_(record_producer)                     	 -1                                           
Dream Theater & Bruce Dickinson               http://dbpedia.org/resource/Dream_Theater                              	 -1                                           
Charlie's Angels                              http://dbpedia.org/resource/Charlie's_Angels_(band)                    	 -1                                           
Bruce Dickinson & Montserrat Cabelle          http://dbpedia.org/resource/Bruce_Dickinson                            	 -1                                           
Carlos Santana & Alice Coltrane               http://dbpedia.org/resource/Carlos_Santana                             	 -1                                           
Robin Thicke & Mary J. Blige                  http://dbpedia.org/resource/Robin_Thicke                               	 -1                                           
Naturi Nau

Juelz Santana & Lil Wayne                     http://dbpedia.org/resource/Juelz_Santana                              	 -1                                           
theaudience                                   http://dbpedia.org/resource/Theaudience                                	 -1                                           
Jeffree Star                                  http://dbpedia.org/resource/Jeffree_Star                               	 -1                                           
Nicole Scherzinger                            http://dbpedia.org/resource/Nicole_Scherzinger                         	 -1                                           
Sonny Boy Williamson                          http://dbpedia.org/resource/Sonny_Boy_Williamson_(original)            	 -1                                           
Joyce & Ridge                                 http://dbpedia.org/resource/Joyce_Sims                                 	 -1                                           
Sibel     

In [33]:
# Check if duplicates exist
lastfm_df[lastfm_df.duplicated(['URI'], keep=False)]

Unnamed: 0,ID,Artist,URI


## Make the Update

In [34]:
lastfm_df.to_csv('./lastfm/Mapping2DBpedia-1.2-corrected.tsv', sep='\t', index = False, header = False, encoding="utf-8")