In [1]:
import pandas as pd
import numpy as np
import py_stringmatching as sm
import py_stringsimjoin as ssj
from pandas_profiling import ProfileReport
import re
import nltk
from datetime import datetime
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 1000)

In [2]:
ban = pd.read_csv('../../datasets/books/ban.csv',sep=';')

In [3]:
wor = pd.read_csv('../../datasets/books/wor.csv',sep=';')

In [4]:
ban.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17629 entries, 0 to 17628
Data columns (total 12 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   ban_id                          17629 non-null  object
 1   ban_isbn                        17629 non-null  object
 2   ban_title                       17628 non-null  object
 3   ban_author                      16277 non-null  object
 4   ban_binding                     2961 non-null   object
 5   ban_pubdate                     8822 non-null   object
 6   ban_pages                       17629 non-null  int64 
 7   ban_publisher                   17589 non-null  object
 8   ban_title_no_par                17628 non-null  object
 9   ban_title_no_par_no_stop_words  17615 non-null  object
 10  ban_firstauthor                 16277 non-null  object
 11  ban_firsttwoauthors             16277 non-null  object
dtypes: int64(1), object(11)
memory usage: 1.6+ MB


In [5]:
wor.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48792 entries, 0 to 48791
Data columns (total 12 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   wor_id                          48792 non-null  object
 1   wor_isbn                        48792 non-null  int64 
 2   wor_title                       48792 non-null  object
 3   wor_author                      48533 non-null  object
 4   wor_binding                     46050 non-null  object
 5   wor_pubdate                     48677 non-null  object
 6   wor_pages                       48792 non-null  int64 
 7   wor_publisher                   48744 non-null  object
 8   wor_title_no_par                48791 non-null  object
 9   wor_title_no_par_no_stop_words  48729 non-null  object
 10  wor_firstauthor                 48511 non-null  object
 11  wor_firsttwoauthors             48533 non-null  object
dtypes: int64(2), object(10)
memory usage: 4.5+ MB


In [6]:
#wor is not read in as string but as integer so we convert it back to string
wor['wor_isbn'] = wor['wor_isbn'].apply(lambda x: str(x))

In [7]:
matches_ban_wor = pd.merge(ban,wor,left_on='ban_isbn',right_on='wor_isbn')
print('True matches across Ban and Wordery: {}'.format(matches_ban_wor.shape[0]))

True matches across Ban and Wordery: 970


In [8]:
ban.fillna('',inplace=True)
wor.fillna('',inplace=True)

In [9]:
ban.columns

Index(['ban_id', 'ban_isbn', 'ban_title', 'ban_author', 'ban_binding', 'ban_pubdate', 'ban_pages', 'ban_publisher', 'ban_title_no_par', 'ban_title_no_par_no_stop_words', 'ban_firstauthor', 'ban_firsttwoauthors'], dtype='object')

In [10]:
wor.columns

Index(['wor_id', 'wor_isbn', 'wor_title', 'wor_author', 'wor_binding', 'wor_pubdate', 'wor_pages', 'wor_publisher', 'wor_title_no_par', 'wor_title_no_par_no_stop_words', 'wor_firstauthor', 'wor_firsttwoauthors'], dtype='object')

### Join on first authors based on Jaccard score of at least 0.3 with 3-gram tokenization

In [22]:
q3 = sm.QgramTokenizer(qval=3,return_set=True)

In [30]:
ban_wor_pairs_fa_q3_jac_03 = ssj.jaccard_join(ban, wor, 'ban_id', 'wor_id', 'ban_firstauthor', 'wor_firstauthor', q3, 0.3, 
                                     l_out_attrs=['ban_title','ban_title_no_par','ban_author','ban_firstauthor','ban_firsttwoauthors','ban_binding','ban_pubdate', 'ban_pages','ban_publisher','ban_isbn'], 
                                     r_out_attrs=['wor_isbn','wor_title','wor_title_no_par','wor_author','wor_firstauthor','wor_firsttwoauthors','wor_binding','wor_pubdate','wor_pages','wor_publisher'],
                                     n_jobs=-3)
#ban_wor_pairs_fa_q3_jac_03.to_csv('ban_wor_pairs_fa_q3_jac_03.csv',index=False,sep=';')

In [31]:
ban_wor_pairs_fa_q3_jac_03.columns

Index(['_id', 'l_ban_id', 'r_wor_id', 'l_ban_title', 'l_ban_title_no_par', 'l_ban_author', 'l_ban_firstauthor', 'l_ban_firsttwoauthors', 'l_ban_binding', 'l_ban_pubdate', 'l_ban_pages', 'l_ban_publisher', 'l_ban_isbn', 'r_wor_isbn', 'r_wor_title', 'r_wor_title_no_par', 'r_wor_author', 'r_wor_firstauthor', 'r_wor_firsttwoauthors', 'r_wor_binding', 'r_wor_pubdate', 'r_wor_pages', 'r_wor_publisher', '_sim_score'], dtype='object')

### Join on first authors based on edit distance of at most 5

In [50]:
# Blocking on first authors (fa) using Levenshtein (edit distance) of at most 5
ban_wor_pairs_fa_lev5 = ssj.edit_distance_join(ban, wor, 'ban_id', 'wor_id', 'ban_firstauthor', 'wor_firstauthor', 5, 
                                     l_out_attrs=['ban_title','ban_title_lower','ban_title_no_par','ban_title_no_par_no_stop_words','ban_author','ban_author_lower','ban_firstauthor','ban_firsttwoauthors','ban_binding','ban_pubdate', 'ban_pages','ban_publisher','ban_isbn'], 
                                     r_out_attrs=['wor_isbn','wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor','wor_firsttwoauthors','wor_binding','wor_pubdate','wor_pages','wor_publisher'],
                                      n_jobs=-3)

### Join on first two authors based on edit distance of at most 5

In [32]:
# Blocking on first two authors (fta) using Levenshtein (edit distance) of at most 5
ban_wor_pairs_fta_lev5 = ssj.edit_distance_join(ban, wor, 'ban_id', 'wor_id', 'ban_firsttwoauthors', 'wor_firsttwoauthors', 5, 
                                     l_out_attrs=['ban_title','ban_title_lower','ban_title_no_par','ban_title_no_par_no_stop_words','ban_author','ban_author_lower','ban_firstauthor','ban_firsttwoauthors','ban_binding','ban_pubdate', 'ban_pages','ban_publisher','ban_isbn'], 
                                     r_out_attrs=['wor_isbn','wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor','wor_firsttwoauthors','wor_binding','wor_pubdate','wor_pages','wor_publisher'],
                                      n_jobs=-3)
#ban_wor_pairs_fta_lev5.to_csv('ban_wor_pairs_fta_lev5.csv',index=False)

### Join on first two authors based on Jaccard score of at least 0.3 with 3-gram tokenization

In [46]:
ban.columns

Index(['ban_id', 'ban_isbn', 'ban_title', 'ban_author', 'ban_binding', 'ban_pubdate', 'ban_pages', 'ban_publisher', 'ban_title_no_par', 'ban_title_lower', 'ban_author_lower', 'ban_firstauthor', 'ban_firsttwoauthors', 'ban_title_no_par_no_stop_words'], dtype='object')

In [47]:
ban_wor_pairs_fta_q3_jac_03 = ssj.jaccard_join(ban, wor, 'ban_id', 'wor_id', 'ban_firsttwoauthors', 'wor_firsttwoauthors', q3, 0.3, 
                                     l_out_attrs=['ban_title','ban_title_lower','ban_title_no_par','ban_title_no_par_no_stop_words','ban_author','ban_author_lower','ban_firstauthor','ban_firsttwoauthors','ban_binding','ban_pubdate', 'ban_pages','ban_publisher','ban_isbn'], 
                                     r_out_attrs=['wor_isbn','wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor','wor_firsttwoauthors','wor_binding','wor_pubdate','wor_pages','wor_publisher'],
                                     n_jobs=-3)
#ban_wor_pairs_fta_q3_jac_03.to_csv('ban_wor_pairs_fta_q3_jac_03.csv',index=False)

In [34]:
ban_wor_pairs_fta_q3_jac_03.columns

Index(['_id', 'l_ban_id', 'r_wor_id', 'l_ban_title', 'l_ban_title_no_par', 'l_ban_author', 'l_ban_firstauthor', 'l_ban_firsttwoauthors', 'l_ban_binding', 'l_ban_pubdate', 'l_ban_pages', 'l_ban_publisher', 'l_ban_isbn', 'r_wor_isbn', 'r_wor_title', 'r_wor_title_no_par', 'r_wor_author', 'r_wor_firstauthor', 'r_wor_firsttwoauthors', 'r_wor_binding', 'r_wor_pubdate', 'r_wor_pages', 'r_wor_publisher', '_sim_score'], dtype='object')

### Join on first two authors based on Jaccard score of at least 0.5 with 3-gram tokenization

In [35]:
ban_wor_pairs_fta_q3_jac_05 = ssj.jaccard_join(ban, wor, 'ban_id', 'wor_id', 'ban_firsttwoauthors', 'wor_firsttwoauthors', q3, 0.5, 
                                     l_out_attrs=['ban_title','ban_title_lower','ban_title_no_par','ban_title_no_par_no_stop_words','ban_author','ban_author_lower','ban_firstauthor','ban_firsttwoauthors','ban_binding','ban_pubdate', 'ban_pages','ban_publisher','ban_isbn'], 
                                     r_out_attrs=['wor_isbn','wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor','wor_firsttwoauthors','wor_binding','wor_pubdate','wor_pages','wor_publisher'],
                                     n_jobs=-3)
#ban_wor_pairs_fta_q3_jac_05.to_csv('ban_wor_pairs_fta_q3_jac_05.csv',index=False)

### Join on title (with parentheses) based on Jaccard score of at least 0.3 with 3-gram tokenization

In [25]:
ban_wor_pairs_tit_q3_jac_03 = ssj.jaccard_join(ban, wor, 'ban_id', 'wor_id', 'ban_title_lower', 'wor_title_lower', q3, 0.3, 
                                     l_out_attrs=['ban_title','ban_title_lower','ban_title_no_par','ban_title_no_par_no_stop_words','ban_author','ban_author_lower','ban_firstauthor','ban_firsttwoauthors','ban_binding','ban_pubdate', 'ban_pages','ban_publisher','ban_isbn'], 
                                     r_out_attrs=['wor_isbn','wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor','wor_firsttwoauthors','wor_binding','wor_pubdate','wor_pages','wor_publisher'],
                                     n_jobs=-3)
#ban_wor_pairs_tit_q3_jac_03.to_csv('ban_wor_pairs_tit_q3_jac_03.csv',index=False)

  if attr_type != pd.np.object:


### Join on title (w/out parentheses) based on Jaccard score of at least 0.3 with 3-gram tokenization

In [26]:
ban_wor_pairs_titnp_q3_jac_03 = ssj.jaccard_join(ban, wor, 'ban_id', 'wor_id', 'ban_title_no_par', 'wor_title_no_par', q3, 0.3, 
                                     l_out_attrs=['ban_title','ban_title_lower','ban_title_no_par','ban_title_no_par_no_stop_words','ban_author','ban_author_lower','ban_firstauthor','ban_firsttwoauthors','ban_binding','ban_pubdate', 'ban_pages','ban_publisher','ban_isbn'], 
                                     r_out_attrs=['wor_isbn','wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor','wor_firsttwoauthors','wor_binding','wor_pubdate','wor_pages','wor_publisher'],
                                     n_jobs=-3)
#ban_wor_pairs_titnp_q3_jac_03.to_csv('ban_wor_pairs_titnp_q3_jac_03.csv',index=False)

### Join on title (w/out parentheses) and stop words removed based on Levenshtein score of at most 5

In [51]:
ban_wor_pairs_titnp_nosw_lev_5 = ssj.edit_distance_join(ban, wor, 'ban_id', 'wor_id', 'ban_title_no_par_no_stop_words', 'wor_title_no_par_no_stop_words', 5, 
                                     l_out_attrs=['ban_title','ban_title_lower','ban_title_no_par','ban_title_no_par_no_stop_words','ban_author','ban_author_lower','ban_firstauthor','ban_firsttwoauthors','ban_binding','ban_pubdate', 'ban_pages','ban_publisher','ban_isbn'], 
                                     r_out_attrs=['wor_isbn','wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor','wor_firsttwoauthors','wor_binding','wor_pubdate','wor_pages','wor_publisher'],
                                     n_jobs=-3)

### Join on title (w/out parentheses) and stop words removed based on Jaccard score of at least 0.3 with 3-gram tokenization

In [27]:
ban_wor_pairs_titnp_nosw_q3_jac_03 = ssj.jaccard_join(ban, wor, 'ban_id', 'wor_id', 'ban_title_no_par_no_stop_words', 'wor_title_no_par_no_stop_words', q3, 0.3, 
                                     l_out_attrs=['ban_title','ban_title_lower','ban_title_no_par','ban_title_no_par_no_stop_words','ban_author','ban_author_lower','ban_firstauthor','ban_firsttwoauthors','ban_binding','ban_pubdate', 'ban_pages','ban_publisher','ban_isbn'], 
                                     r_out_attrs=['wor_isbn','wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor','wor_firsttwoauthors','wor_binding','wor_pubdate','wor_pages','wor_publisher'],
                                     n_jobs=-3)

### Join on title (w/out parentheses) and stop words removed based on Jaccard score of at least 0.7 with 3-gram tokenization

In [54]:
ban_wor_pairs_titnp_nosw_q3_jac_07 = ssj.jaccard_join(ban, wor, 'ban_id', 'wor_id', 'ban_title_no_par_no_stop_words', 'wor_title_no_par_no_stop_words', q3, 0.7, 
                                     l_out_attrs=['ban_title','ban_title_lower','ban_title_no_par','ban_title_no_par_no_stop_words','ban_author','ban_author_lower','ban_firstauthor','ban_firsttwoauthors','ban_binding','ban_pubdate', 'ban_pages','ban_publisher','ban_isbn'], 
                                     r_out_attrs=['wor_isbn','wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor','wor_firsttwoauthors','wor_binding','wor_pubdate','wor_pages','wor_publisher'],
                                     n_jobs=-3)

### Join on title (w/out parentheses) and stop words removed based on Jaccard score of at least 0.5 with 3-gram tokenization

In [55]:
ban_wor_pairs_titnp_nosw_q3_jac_05 = ssj.jaccard_join(ban, wor, 'ban_id', 'wor_id', 'ban_title_no_par_no_stop_words', 'wor_title_no_par_no_stop_words', q3, 0.5, 
                                     l_out_attrs=['ban_title','ban_title_lower','ban_title_no_par','ban_title_no_par_no_stop_words','ban_author','ban_author_lower','ban_firstauthor','ban_firsttwoauthors','ban_binding','ban_pubdate', 'ban_pages','ban_publisher','ban_isbn'], 
                                     r_out_attrs=['wor_isbn','wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor','wor_firsttwoauthors','wor_binding','wor_pubdate','wor_pages','wor_publisher'],
                                     n_jobs=-3)

### Join on first two authors using Jaccard 3-gram threshold 0.7

In [29]:
ban_wor_pairs_fta_q3_jac_07 = ssj.jaccard_join(ban, wor, 'ban_id', 'wor_id', 'ban_firsttwoauthors', 'wor_firsttwoauthors', q3, 0.7, 
                                     l_out_attrs=['ban_title','ban_title_lower','ban_title_no_par','ban_title_no_par_no_stop_words','ban_author','ban_author_lower','ban_firstauthor','ban_firsttwoauthors','ban_binding','ban_pubdate', 'ban_pages','ban_publisher','ban_isbn'], 
                                     r_out_attrs=['wor_isbn','wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor','wor_firsttwoauthors','wor_binding','wor_pubdate','wor_pages','wor_publisher'],
                                     n_jobs=-3)
#ban_wor_pairs_fta_q3_jac_07.to_csv('ban_wor_pairs_fta_q3_jac_07.csv',index=False)

### Join on first authors using Jaccard 3-gram threshold 0.7

In [52]:
ban_wor_pairs_fa_q3_jac_07 = ssj.jaccard_join(ban, wor, 'ban_id', 'wor_id', 'ban_firstauthor', 'wor_firstauthor', q3, 0.7, 
                                     l_out_attrs=['ban_title','ban_title_lower','ban_title_no_par','ban_title_no_par_no_stop_words','ban_author','ban_author_lower','ban_firstauthor','ban_firsttwoauthors','ban_binding','ban_pubdate', 'ban_pages','ban_publisher','ban_isbn'], 
                                     r_out_attrs=['wor_isbn','wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor','wor_firsttwoauthors','wor_binding','wor_pubdate','wor_pages','wor_publisher'],
                                     n_jobs=-3)

### Join on first authors using Jaccard 3-gram threshold 0.5

In [53]:
ban_wor_pairs_fa_q3_jac_05 = ssj.jaccard_join(ban, wor, 'ban_id', 'wor_id', 'ban_firstauthor', 'wor_firstauthor', q3, 0.5, 
                                     l_out_attrs=['ban_title','ban_title_lower','ban_title_no_par','ban_title_no_par_no_stop_words','ban_author','ban_author_lower','ban_firstauthor','ban_firsttwoauthors','ban_binding','ban_pubdate', 'ban_pages','ban_publisher','ban_isbn'], 
                                     r_out_attrs=['wor_isbn','wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor','wor_firsttwoauthors','wor_binding','wor_pubdate','wor_pages','wor_publisher'],
                                     n_jobs=-3)

In [24]:
ban.columns

Index(['ban_id', 'ban_isbn', 'ban_title', 'ban_author', 'ban_binding', 'ban_pubdate', 'ban_pages', 'ban_publisher', 'ban_title_no_par', 'ban_title_lower', 'ban_author_lower', 'ban_firstauthor', 'ban_firsttwoauthors', 'ban_title_no_par_no_stop_words'], dtype='object')

### Evaluate Blocking

In [56]:
print('True matches across Ban and Wordery: {}'.format(matches_ban_wor.shape[0]))

True matches across Ban and Wordery: 970


In [57]:
def assignTrueLabels(df_blockedpairs, idcolumn1, idcolumn2):
    return df_blockedpairs.apply(lambda row: 1 if (row[idcolumn1]==row[idcolumn2]) else 0, axis=1)

In [58]:
#Assign labels to the blockedpairs df
ban_wor_pairs_fa_q3_jac_03['true_label'] = assignTrueLabels(ban_wor_pairs_fa_q3_jac_03,'l_ban_isbn','r_wor_isbn')
ban_wor_pairs_fa_q3_jac_05['true_label'] = assignTrueLabels(ban_wor_pairs_fa_q3_jac_05,'l_ban_isbn','r_wor_isbn')
ban_wor_pairs_fa_q3_jac_07['true_label'] = assignTrueLabels(ban_wor_pairs_fa_q3_jac_07,'l_ban_isbn','r_wor_isbn')
ban_wor_pairs_fa_lev5['true_label'] = assignTrueLabels(ban_wor_pairs_fa_lev5,'l_ban_isbn','r_wor_isbn')

ban_wor_pairs_fta_q3_jac_03['true_label'] = assignTrueLabels(ban_wor_pairs_fta_q3_jac_03,'l_ban_isbn','r_wor_isbn')
ban_wor_pairs_fta_q3_jac_05['true_label'] = assignTrueLabels(ban_wor_pairs_fta_q3_jac_05,'l_ban_isbn','r_wor_isbn')
ban_wor_pairs_fta_q3_jac_07['true_label'] = assignTrueLabels(ban_wor_pairs_fta_q3_jac_07,'l_ban_isbn','r_wor_isbn')
ban_wor_pairs_fta_lev5['true_label'] = assignTrueLabels(ban_wor_pairs_fta_lev5,'l_ban_isbn','r_wor_isbn')

ban_wor_pairs_tit_q3_jac_03['true_label'] = assignTrueLabels(ban_wor_pairs_tit_q3_jac_03,'l_ban_isbn','r_wor_isbn')

ban_wor_pairs_titnp_q3_jac_03['true_label'] = assignTrueLabels(ban_wor_pairs_titnp_q3_jac_03,'l_ban_isbn','r_wor_isbn')

ban_wor_pairs_titnp_nosw_q3_jac_03['true_label'] = assignTrueLabels(ban_wor_pairs_titnp_nosw_q3_jac_03,'l_ban_isbn','r_wor_isbn')
ban_wor_pairs_titnp_nosw_q3_jac_05['true_label'] = assignTrueLabels(ban_wor_pairs_titnp_nosw_q3_jac_05,'l_ban_isbn','r_wor_isbn')
ban_wor_pairs_titnp_nosw_q3_jac_07['true_label'] = assignTrueLabels(ban_wor_pairs_titnp_nosw_q3_jac_07,'l_ban_isbn','r_wor_isbn')
ban_wor_pairs_titnp_nosw_lev_5['true_label'] = assignTrueLabels(ban_wor_pairs_titnp_nosw_lev_5,'l_ban_isbn','r_wor_isbn')

In [41]:
def calcRedRatio(df1,df2,df_blockedpairs):
    cp = (df1.shape[0]*df2.shape[0])/2
    rr = (1-(df_blockedpairs.shape[0]/cp))*100
    return round(rr,2)

In [42]:
def calcPairCompISBN(df_matches,df_blockedpairs,true_label_col):
    pc = (df_blockedpairs[df_blockedpairs[true_label_col]==1].shape[0]/df_matches.shape[0])*100
    return round(pc,2)

In [59]:
print('Blocking based on first author, Jaccaed, 3-gram, Threshold 0.3')
print('Evaluation of ban_wor_pairs_fa_q3_jac_03')
print('Number of rows: {:,}'.format(ban_wor_pairs_fa_q3_jac_03.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(ban,wor,ban_wor_pairs_fa_q3_jac_03)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_ban_wor,ban_wor_pairs_fa_q3_jac_03,'true_label')))

print('\n')
print('Blocking based on first author, Jaccaed, 3-gram, Threshold 0.5')
print('Evaluation of ban_wor_pairs_fa_q3_jac_05')
print('Number of rows: {:,}'.format(ban_wor_pairs_fa_q3_jac_05.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(ban,wor,ban_wor_pairs_fa_q3_jac_05)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_ban_wor,ban_wor_pairs_fa_q3_jac_05,'true_label')))

print('\n')
print('Blocking based on first author, Jaccaed, 3-gram, Threshold 0.7')
print('Evaluation of ban_wor_pairs_fa_q3_jac_07')
print('Number of rows: {:,}'.format(ban_wor_pairs_fa_q3_jac_07.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(ban,wor,ban_wor_pairs_fa_q3_jac_07)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_ban_wor,ban_wor_pairs_fa_q3_jac_07,'true_label')))

print('\n')
print('Blocking based on first authors, Levenshtein of at most 5')
print('Evaluation of ban_wor_pairs_fta_lev5, Levenshtein, at most 5')
print('Number of rows: {:,}'.format(ban_wor_pairs_fa_lev5.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(ban,wor,ban_wor_pairs_fa_lev5)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_ban_wor,ban_wor_pairs_fa_lev5,'true_label')))

print('\n')
print('Blocking based on first two authors, Jaccaed, 3-gram, Threshold 0.3')
print('Evaluation of ban_wor_pairs_fta_q3_jac_03')
print('Number of rows: {:,}'.format(ban_wor_pairs_fta_q3_jac_03.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(ban,wor,ban_wor_pairs_fta_q3_jac_03)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_ban_wor,ban_wor_pairs_fta_q3_jac_03,'true_label')))

print('\n')
print('Blocking based on first two author, Jaccaed, 3-gram, Threshold 0.5')
print('Evaluation of ban_wor_pairs_fta_q3_jac_05')
print('Number of rows: {:,}'.format(ban_wor_pairs_fta_q3_jac_05.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(ban,wor,ban_wor_pairs_fta_q3_jac_05)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_ban_wor,ban_wor_pairs_fta_q3_jac_05,'true_label')))

print('\n')
print('Blocking based on first two author, Jaccaed, 3-gram, Threshold 0.7')
print('Evaluation of ban_wor_pairs_fta_q3_jac_07')
print('Number of rows: {:,}'.format(ban_wor_pairs_fta_q3_jac_07.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(ban,wor,ban_wor_pairs_fta_q3_jac_07)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_ban_wor,ban_wor_pairs_fta_q3_jac_07,'true_label')))

print('\n')
print('Blocking based on first authors, Levenshtein of at most 5')
print('Evaluation of ban_wor_pairs_fta_lev5, Levenshtein, at most 5')
print('Number of rows: {:,}'.format(ban_wor_pairs_fta_lev5.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(ban,wor,ban_wor_pairs_fta_lev5)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_ban_wor,ban_wor_pairs_fta_lev5,'true_label')))

print('\n')
print('Blocking based on title')
print('Evaluation of ban_wor_pairs_tit_q3_jac_03 (Title, Jaccaed, 3-gram, Threshold 0.3)')
print('Number of rows: {:,}'.format(ban_wor_pairs_tit_q3_jac_03.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(ban,wor,ban_wor_pairs_tit_q3_jac_03)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_ban_wor,ban_wor_pairs_tit_q3_jac_03,'true_label')))

print('\n')
print('Evaluation of ban_wor_pairs_titnp_q3_jac_03 (Title excluded text in parenthesis, Jaccard, 3-gram, Threshold 0.3)')
print('Number of rows: {:,}'.format(ban_wor_pairs_titnp_q3_jac_03.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(ban,wor,ban_wor_pairs_titnp_q3_jac_03)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_ban_wor,ban_wor_pairs_titnp_q3_jac_03,'true_label')))

print('\n')
print('ban_wor_pairs_titnp_nosw_q3_jac_03 (Title excluded text in parenthesis and no stop words, Jaccard, 3-gram, Threshold 0.3)')
print('Number of rows: {:,}'.format(ban_wor_pairs_titnp_nosw_q3_jac_03.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(ban,wor,ban_wor_pairs_titnp_nosw_q3_jac_03)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_ban_wor,ban_wor_pairs_titnp_nosw_q3_jac_03,'true_label')))

print('\n')
print('ban_wor_pairs_titnp_nosw_q3_jac_05 (Title excluded text in parenthesis and no stop words, Jaccard, 3-gram, Threshold 0.5)')
print('Number of rows: {:,}'.format(ban_wor_pairs_titnp_nosw_q3_jac_05.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(ban,wor,ban_wor_pairs_titnp_nosw_q3_jac_05)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_ban_wor,ban_wor_pairs_titnp_nosw_q3_jac_05,'true_label')))

print('\n')
print('ban_wor_pairs_titnp_nosw_q3_jac_07 (Title excluded text in parenthesis and no stop words, Jaccard, 3-gram, Threshold 0.7)')
print('Number of rows: {:,}'.format(ban_wor_pairs_titnp_nosw_q3_jac_07.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(ban,wor,ban_wor_pairs_titnp_nosw_q3_jac_07)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_ban_wor,ban_wor_pairs_titnp_nosw_q3_jac_07,'true_label')))

print('\n')
print('ban_wor_pairs_titnp_nosw_lev_5 (Title excluded text in parenthesis and no stop words with Levenshtein of at most 5)')
print('Number of rows: {:,}'.format(ban_wor_pairs_titnp_nosw_lev_5.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(ban,wor,ban_wor_pairs_titnp_nosw_lev_5)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_ban_wor,ban_wor_pairs_titnp_nosw_lev_5,'true_label')))

Blocking based on first author, Jaccaed, 3-gram, Threshold 0.3
Evaluation of ban_wor_pairs_fa_q3_jac_03
Number of rows: 962,082
Reduction Ratio: 99.78%
Pair Completeness: 97.94%


Blocking based on first author, Jaccaed, 3-gram, Threshold 0.5
Evaluation of ban_wor_pairs_fa_q3_jac_05
Number of rows: 433,667
Reduction Ratio: 99.9%
Pair Completeness: 95.98%


Blocking based on first author, Jaccaed, 3-gram, Threshold 0.7
Evaluation of ban_wor_pairs_fa_q3_jac_07
Number of rows: 422,398
Reduction Ratio: 99.9%
Pair Completeness: 90.93%


Blocking based on first authors, Levenshtein of at most 5
Evaluation of ban_wor_pairs_fta_lev5, Levenshtein, at most 5
Number of rows: 880,408
Reduction Ratio: 99.8%
Pair Completeness: 94.54%


Blocking based on first two authors, Jaccaed, 3-gram, Threshold 0.3
Evaluation of ban_wor_pairs_fta_q3_jac_03
Number of rows: 843,590
Reduction Ratio: 99.8%
Pair Completeness: 98.45%


Blocking based on first two author, Jaccaed, 3-gram, Threshold 0.5
Evaluation of ba

-- Blocking based on Levenshtein performs significantly worse than Blocking on Jaccard distance with 3-gram tokens and blocking on title also performs worse compared to authors. Blocking on the firsttwoauthors performs best.

In [49]:
ban_wor_pairs_fta_q3_jac_03.to_csv('filtered_correspondences/ban_wor_pairs_fta_q3_jac_03.csv',index=False)

In [43]:
matches_ban_wor.to_csv('matching_pairs/matches_ban_wor.csv',index=False)