In [54]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Task 1: Using RLTK to perform Entity Resolution (ER)

<sub>Content of this notebook was prepared by Basel Shbita, and modified by Avijit Thawani (thawani@usc.edu) as part of the class <u>DSCI 558: Building Knowledge Graphs</u> at University of Southern California (USC).</sub>

The Record Linkage ToolKit ([RLTK](https://github.com/usc-isi-i2/rltk)) is a general-purpose open-source record linkage platform that allows users to build powerful Python programs that link records referring to the same underlying entity.

This notebook introduces some applied examples using RLTK. You can also find additional examples and use-cases in [RLTK's documentation](https://rltk.readthedocs.io/en/master/).

## Dataset analysis & RLTK components construction

In [None]:
!pip install rltk

### Task 1-1. Construct RLTK Datasets

First, you need define how a single entry would like for each type of record (for each dataset)

In [3]:
import rltk
import csv

# You can use this tokenizer in case you need to manipulate some data
tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()

In [4]:
import re
from datetime import datetime

def get_singer(singer_str, isCosinger):
  singers = re.split(' ft. | feat. | & | featuring | - |, | and | & ', singer_str)
  if not singers:
    return singers
  return singers[1:] if isCosinger else singers[0]

class SendhandRecord(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
    

    @rltk.cached_property
    def id(self):
        return self.raw_object['index']

    @rltk.cached_property
    def song(self):
        return self.raw_object['title']

    @rltk.cached_property
    def singer(self): 
      return get_singer(self.raw_object['originally_by'], isCosinger=False)
    
    @rltk.cached_property
    def singer_tokens(self):
        singer = get_singer(self.raw_object['originally_by'], isCosinger=False)
        return set(tokenizer.tokenize(singer))
    
    @rltk.cached_property
    def singer_first_name(self):
      return get_singer(self.raw_object['originally_by'], isCosinger=False).split(' ')[0]

    @rltk.cached_property
    def co_singer(self): 
      return get_singer(self.raw_object['originally_by'], isCosinger=True)
    


    
class WikiRecord(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)

    @rltk.cached_property
    def id(self):
        return self.raw_object['song'].split('/')[-1]

    @rltk.cached_property
    def song(self):
        return self.raw_object['songLabel']

    @rltk.cached_property
    def singer(self): 
      return get_singer(self.raw_object['performerLabel'], isCosinger=False)
    
    @rltk.cached_property
    def singer_tokens(self):
        singer = get_singer(self.raw_object['performerLabel'], isCosinger=False)
        return set(tokenizer.tokenize(singer))
    
    @rltk.cached_property
    def singer_first_name(self):
      return get_singer(self.raw_object['performerLabel'], isCosinger=False).split(' ')[0]

    @rltk.cached_property
    def co_singer(self): 
      return get_singer(self.raw_object['performerLabel'], isCosinger=True)

In [6]:
dir_ = ''
sendhand_file = dir_ + 'original_song_stats.csv'
wiki_file = dir_ + 'nominated_or_received_awards.csv'

ds1 = rltk.Dataset(rltk.CSVReader(sendhand_file),record_class=SendhandRecord)
ds2 = rltk.Dataset(rltk.CSVReader(wiki_file),record_class=WikiRecord)

You can load your csv files into RLTK using this method:

And we can inspect a few entries:

In [7]:
# print some entries
print(ds1.generate_dataframe().head(5))
print(ds2.generate_dataframe().head(5))

  id               song        singer    singer_tokens singer_first_name  \
0  1          Let It Go  Idina Menzel  {Idina, Menzel}             Idina   
1  2          All of Me   John Legend   {Legend, John}              John   
2  3              Hello         Adele          {Adele}             Adele   
3  4  Thinking Out Loud    Ed Sheeran    {Ed, Sheeran}                Ed   
4  5            Perfect    Ed Sheeran    {Ed, Sheeran}                Ed   

  co_singer  
0        []  
1        []  
2        []  
3        []  
4        []  
           id             song           singer       singer_tokens  \
0   Q16323774      Intelligent      Raske Penge      {Raske, Penge}   
1   Q17628069        Q17628069              C2C               {C2C}   
2  Q109659406       Q109659406  Svetlana Loboda  {Svetlana, Loboda}   
3     Q858020  Frontside Ollie   Robin Packalen   {Packalen, Robin}   
4    Q3878196  Non è l'inferno     Emma Marrone     {Emma, Marrone}   

  singer_first_name co_singer  


### Task 1-2. Blocking

First, we'll load dev set to evaluate both blocking (Task 1-2) and entity linking (Task 1-3).

In [73]:
# Block by Initialisms of singer name
bg = rltk.HashBlockGenerator()
block = bg.generate(
      bg.block(ds1, function_=lambda r: ''.join(r.singer.split(' '))), 
      bg.block(ds2, function_=lambda r: ''.join(r.singer.split(' '))))

# BLock by song name
bg2 = rltk.HashBlockGenerator()
block2 = bg2.generate(
      # bg2.block(ds1, function_=lambda r: r.publish_date[-4:], base_on=block), 
      # bg2.block(ds2, function_=lambda r: r.publish_date[-4:], base_on=block)
      bg2.block(ds1, function_=lambda r: r.song[0], base_on=block), 
      bg2.block(ds2, function_=lambda r: r.song[0], base_on=block)
      )

# bg3 = rltk.HashBlockGenerator()
# block3 = bg3.generate(
#       bg3.block(ds1, function_=lambda r: r.isbn), 
#       bg3.block(ds2, function_=lambda r: r.isbn))

pairs = rltk.get_record_pairs(ds1, ds2, block=block2)
# for r1, r2 in pairs:
#   print(r1.id, r1.author_first_name, r1.publish_date, '\t', r2.id, r2.author_first_name, r2.publish_date)

matched_wiki_ids = [] # for later ground truth evaluation
pairs_pred = []
with open(dir_+'blocked.csv', 'w+') as f:
  # write header
  f.write('secondhandsong_ID, wiki_nominated_ID\n')
  for r1, r2 in pairs:
    # print(r1.id, r1.author_first_name, r1.publish_date, '\t', r2.id, r2.author_first_name, r2.publish_date)
    pairs_pred += [(r1.id, r2.id)]
    f.write(f'{r1.id},{r2.id}\n')
    # print(f'{r1.id}, {r2.id}\n')

    matched_wiki_ids += [r2.id]

# Self-construct ground truth file

In [11]:
from itertools import combinations
import random
random.seed(123)

secondhand_df = ds1.generate_dataframe()
secondhand_ids = list(secondhand_df['id'].values)

wiki_df = ds2.generate_dataframe()
wiki_ids = list(wiki_df['id'].values)
# wiki_ids = non_matched_wiki_ids

secondhand_wiki_ids = []
for secondhand_id in secondhand_ids:
  for wiki_id in wiki_ids:
    secondhand_wiki_ids += [(secondhand_id, wiki_id)]
print(f'combinations Length = {len(secondhand_wiki_ids)}')

selected_secondhand_ids = random.sample(secondhand_wiki_ids, 100)
selected_secondhand_ids

with open(dir_+'matching_ground_truth.csv', 'w+') as f:
  f.write('secondhandsong_ID,wiki_nominated_ID,label')

  for (secondhand_id, wiki_id) in selected_secondhand_ids:
    f.write(f'\n{secondhand_id},{wiki_id},')

combinations Length = 1452900


In [None]:
import pandas as pd

merged_dict = {'wiki_id':[], 'wiki_song':[], 'wiki_singer':[],
               'secondhand_id':[], 'secondhand_song':[], 'secondhand_singer':[]}

for (secondhand_id, wiki_id) in selected_secondhand_ids:
  wiki_song = wiki_df.query(f'id=="{wiki_id}"').song.values[0]
  wiki_singer = wiki_df.query(f'id=="{wiki_id}"').singer.values[0]
  
  secondhand_song = secondhand_df.query(f'id=="{secondhand_id}"').song.values[0]
  secondhand_singer = secondhand_df.query(f'id=="{secondhand_id}"').singer.values[0]

  for (col, val) in [('wiki_id', wiki_id), ('wiki_song', wiki_song), ('wiki_singer', wiki_singer),
                     ('secondhand_id', secondhand_id), ('secondhand_song', secondhand_song), ('secondhand_singer', secondhand_singer)]:
    merged_dict[col] += [val]

merged_df = pd.DataFrame(merged_dict)
merged_df.to_csv(dir_+'Entity Linking/'+'check.csv')
merged_df

Unnamed: 0,wiki_id,wiki_song,wiki_singer,secondhand_id,secondhand_song,secondhand_singer
0,Q19856909,Maria Salvador,J-Ax,758,That Girl,J T
1,Q114704064,Q114704064,Filipe Escandurras,3872,The Only Reason,5 Seconds of Summer
2,Q108805763,Easy on Me,Adele,1261,Fire Away,Chris Stapleton
3,Q112774463,Taksi,Kalush,5890,Why You Gotta Be That Way,Ina Forsman
4,Q15131296,Erilaiset,Robin Packalen,3856,Wave,Beck
...,...,...,...,...,...,...
95,Q106970624,Miénteme,María Becerra,5868,Making It Up as We Go Along,Kristen Anderson-Lopez & Robert Lopez
96,Q19856909,Maria Salvador,J-Ax,7695,Hele deg,"Ulrikke Brandstorp, Unn Vibeke Hol, Nahom Fesh..."
97,Q13461348,Formidable,Stromae,7247,Dela min dröm med mej,Lasse Stefanz
98,Q97153993,Head & Heart,Joel Corry,864,Taki taki,DJ Snake - Selena Gomez - Ozuna - Cardi B


# Blocking Evaluatiom

In [19]:
dev_set_file = dir_ + 'matching_ground_truth.csv'
dev = []
columns = None
with open(dev_set_file, encoding='utf-8', errors="replace") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        # Empty row
        if not row: continue
        if not columns:
            columns = row
        else:
            dev.append(row)
    print(f'Column names are: {", ".join(columns)}')
    print(f'Processed {len(dev)} lines.')


pairs_groundtrue = []
gt = rltk.GroundTruth()
for row in dev:    
    wiki_ID,	secondhand_ID, label = row
    r1 = ds1.get_record(secondhand_ID)
    r2 = ds2.get_record(wiki_ID)
    if label == '1':
        secondhand_id = r1.raw_object['index']
        wiki_id = r2.raw_object['song'].split('/')[-1]
        gt.add_positive(secondhand_id, wiki_id)
        pairs_groundtrue += [(secondhand_id, wiki_id)]
    else:
        gt.add_negative(secondhand_id, wiki_id)

rltk.Trial(gt)

Column names are: wiki_id, secondhand_id, label
Processed 100 lines.


<rltk.evaluation.trial.Trial at 0x7f0149b84dc0>

Then, you can build your own blocking techniques and evaluate it.

Hint:

- What is the total number of pairs without blocking? 
- what is the number of paris with blocking?
- After blocking, how many "correct" (matched) pairs presented in dev set?


In [20]:
pair_n_in_RxR = ds1.generate_dataframe().shape[0] * ds2.generate_dataframe().shape[0]
pair_compared = len(pairs_pred)
true_matches_in_RxR = len(pairs_groundtrue)

true_matches_compares = 0
for pair_groundtrue in pairs_groundtrue:
  matched = False
  for pair_pred in pairs_pred:
    if pair_groundtrue == pair_pred:
      true_matches_compares += 1 # TP
      matched = True
      break

# reduction_ratio = 1 - pair_compared/pair_n_in_RxR
reduction_ratio = pair_compared / pair_n_in_RxR

# pairs_completeness = number of true matches compares / number of true matches in RxR
pairs_completeness = true_matches_compares / true_matches_in_RxR

# pair quality = number of true matches compares / number of matches compared
pair_quality = true_matches_compares / pair_compared

print(f'reduction ratio = {reduction_ratio}\npairs completeness = {pairs_completeness}\npair quality = {pair_quality}')

reduction ratio = 9.70472847408631e-05
pairs completeness = 0.5882352941176471
pair quality = 0.07092198581560284


In [21]:
pair_compared , pair_n_in_RxR

(141, 1452900)

### Task 1-3. Entity Linking

Here are 2 example functions for field (attribute) similarity:

In [32]:
def jaccard_sim(a, b):
    a, b = set(a), set(b)
    if len(a.union(b)) == 0: return 0
    jaccard_sim = float(len(a.intersection(b))) / len(a.union(b))
    return jaccard_sim

def singer_jaro_sim(r1, r2): 
    return rltk.jaro_winkler_similarity(r1.singer , r2.singer)
    
def song_jaro_sim(r1, r2):
    return rltk.jaro_winkler_similarity(r1.song, r2.song)

def last_name_jaccard_sim(r1, r2):
    return jaccard_sim(r1.author_last_name, r2.author_last_name)

def singer_tokens_levenshtein_sim(r1, r2): 
    for n1, n2 in zip(sorted(r1.singer_tokens), sorted(r2.singer_tokens)):
        if rltk.levenshtein_distance(n1, n2) > min(len(n1), len(n2)) / 3:
            return 0
    return 1

def singer_tokens_jaccard_sim(r1, r2):
  return jaccard_sim(r1.singer_tokens, r2.singer_tokens)
    

# threshold value to determine if we are confident the record match
MY_TRESH = 0.8
# entity linkage scoring function by combining multiple sim functions into a single weightened scoring function
def rule_based_method(r1, r2):
    score_song_jaro = song_jaro_sim(r1, r2)
    score_singer_jaro = singer_jaro_sim(r1, r2)
    score_singer_tokens_levenshtein = singer_tokens_levenshtein_sim(r1, r2)
    score_singer_tokens_jaccard = singer_tokens_jaccard_sim(r1, r2)

    # total = 0.7*score_song_jaro + 0.3*score_singer_jaro
    # total = score_singer_tokens_levenshtein # 0.7692
    # total = score_singer_jaro # 0.769
    total = score_song_jaro # 0.8275
    # total = score_singer_tokens_jaccard #  0.76923
    # total = 0.8*score_song_jaro + 0.1*score_singer_jaro + 0.1*score_singer_tokens_levenshtein # 0.8275
    # total = 0.6*score_song_jaro + 0.2*score_singer_jaro + 0.2*score_singer_tokens_levenshtein # 0.7692

    # return two values: boolean if they match or not, float to determine confidence
    return total > MY_TRESH, total




##################################
##### Evaluation ######
##################################
# run some candidates using the ground-truth
trial = rltk.Trial(gt)
candidate_pairs = rltk.get_record_pairs(ds1, ds2, ground_truth=gt)
for r1, r2 in candidate_pairs:
    result, confidence = rule_based_method(r1, r2)
    trial.add_result(r1, r2, result, confidence)

# evaluation
trial.evaluate()
print('Trial statistics based on Ground-Truth from development set data:')
print(f'tp: {trial.true_positives:.06f} [{len(trial.true_positives_list)}]')
print(f'fp: {trial.false_positives:.06f} [{len(trial.false_positives_list)}]')
print(f'tn: {trial.true_negatives:.06f} [{len(trial.true_negatives_list)}]')
print(f'fn: {trial.false_negatives:.06f} [{len(trial.false_negatives_list)}]')
print(f'f_measure: {trial.f_measure}')

Trial statistics based on Ground-Truth from development set data:
tp: 0.750000 [12]
fp: 0.500000 [1]
tn: 0.500000 [1]
fn: 0.250000 [4]
f_measure: 0.8275862068965517


### Save Test predictions
Test on secondhandsong and wikidata

In [74]:
test_set_file = dir_ + 'blocked.csv'
test = []
with open(test_set_file, encoding='utf-8', errors="replace") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if len(row) <= 1:
            continue
        if line_count == 0:
            columns = row
            line_count += 1
        else:
            test.append(row)
    print(f'Column names are: {", ".join(columns)}')
    print(f'Processed {len(test)} lines.')

Column names are: secondhandsong_ID,  wiki_nominated_ID
Processed 141 lines.


In [77]:
predictions = []
for id1, id2 in test:
    r1 = ds1.get_record(id1)
    r2  = ds2.get_record(id2)
    result, confidence = rule_based_method(r1, r2)
    predictions.append((r1.id, r2.id, result, confidence))

print(len(predictions), len(ds1.generate_dataframe()), len(ds2.generate_dataframe()))

141 10020 145


In [78]:
with open(dir_ + 'secondhandsong_wiki_matchings_confidence.csv', mode='w') as file:
    file.write('secondhandsong_ID, wiki_nominated_ID, prediction, confidence\n')
    for row in predictions:
      r1_id, r2_id, result, confidence = row
      file.write(f'{r1_id}, {r2_id}, {result}, {confidence}\n')

In [79]:
with open(dir_ + 'secondhandsong_wiki_matchings.csv', mode='w') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    file.write('secondhandsong_ID, wiki_nominated_ID\n')
    for row in predictions:
      r1_id, r2_id, result, confidence = row
      if result:
        file.write(f'{r1_id}, {r2_id}\n')
        # writer.writerow(row)