In [1]:
import pandas as pd
import numpy as np
import py_stringmatching as sm
import py_stringsimjoin as ssj
from pandas_profiling import ProfileReport
import re
import nltk
from datetime import datetime
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

In [2]:
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 1000)

In [3]:
bx = pd.read_csv('../../datasets/books/bx.csv',sep=';',low_memory=False)

In [4]:
wor = pd.read_csv('../../datasets/books/wor.csv',sep=';')

In [5]:
bx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270948 entries, 0 to 270947
Data columns (total 10 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   bx_id                          270948 non-null  object
 1   bx_isbn                        270948 non-null  object
 2   bx_title                       270948 non-null  object
 3   bx_firstauthor                 270947 non-null  object
 4   bx_pubdate                     270948 non-null  object
 5   bx_pages                       270948 non-null  int64 
 6   bx_publisher                   270946 non-null  object
 7   bx_binding                     4171 non-null    object
 8   bx_title_no_par                270946 non-null  object
 9   bx_title_no_par_no_stop_words  270658 non-null  object
dtypes: int64(1), object(9)
memory usage: 20.7+ MB


In [6]:
wor.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48792 entries, 0 to 48791
Data columns (total 12 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   wor_id                          48792 non-null  object
 1   wor_isbn                        48792 non-null  int64 
 2   wor_title                       48792 non-null  object
 3   wor_author                      48533 non-null  object
 4   wor_binding                     46050 non-null  object
 5   wor_pubdate                     48677 non-null  object
 6   wor_pages                       48792 non-null  int64 
 7   wor_publisher                   48744 non-null  object
 8   wor_title_no_par                48791 non-null  object
 9   wor_title_no_par_no_stop_words  48729 non-null  object
 10  wor_firstauthor                 48511 non-null  object
 11  wor_firsttwoauthors             48533 non-null  object
dtypes: int64(2), object(10)
memory usage: 4.5+ MB


In [7]:
#wor is not read in as string but as integer so we convert it back to string
wor['wor_isbn'] = wor['wor_isbn'].apply(lambda x: str(x))

In [8]:
bx.fillna('',inplace=True)
wor.fillna('',inplace=True)

In [9]:
matches_wor_bx = pd.merge(wor,bx,left_on='wor_isbn',right_on='bx_isbn')
print('True matches across wor and bx: {}'.format(matches_wor_bx.shape[0]))

True matches across wor and bx: 9270


In [10]:
bx.columns

Index(['bx_id', 'bx_isbn', 'bx_title', 'bx_firstauthor', 'bx_pubdate', 'bx_pages', 'bx_publisher', 'bx_binding', 'bx_title_no_par', 'bx_title_no_par_no_stop_words'], dtype='object')

In [11]:
wor.columns

Index(['wor_id', 'wor_isbn', 'wor_title', 'wor_author', 'wor_binding', 'wor_pubdate', 'wor_pages', 'wor_publisher', 'wor_title_no_par', 'wor_title_no_par_no_stop_words', 'wor_firstauthor', 'wor_firsttwoauthors'], dtype='object')

### Join on author based on Jaccard score of at least 0.5 with 3-gram tokenization

In [12]:
q3 = sm.QgramTokenizer(qval=3,return_set=True)

In [13]:
wor_bx_pairs_a_q3_jac_05 = ssj.jaccard_join(wor, bx, 'wor_id', 'bx_id', 'wor_author_lower', 'bx_author_lower', q3, 0.5, 
                                     l_out_attrs=['wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor', 'wor_firsttwoauthors','wor_binding','wor_pubdate', 'wor_pages','wor_publisher','wor_isbn'], 
                                     r_out_attrs=['bx_isbn','bx_title','bx_title_lower','bx_title_no_par','bx_title_no_par_no_stop_words','bx_author', 'bx_author_lower','bx_binding','bx_pubdate','bx_pages','bx_publisher'],
                                     n_jobs=-3)

  if attr_type != pd.np.object:


### Join on first author (wor_firstauthor and bx_author) based on Jaccard score of at least 0.5 with 3-gram tokenization

In [14]:
wor_bx_pairs_fa_q3_jac_05 = ssj.jaccard_join(wor, bx, 'wor_id', 'bx_id', 'wor_firstauthor', 'bx_author', q3, 0.5, 
                                     l_out_attrs=['wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor', 'wor_firsttwoauthors','wor_binding','wor_pubdate', 'wor_pages','wor_publisher','wor_isbn'], 
                                     r_out_attrs=['bx_isbn','bx_title','bx_title_lower','bx_title_no_par','bx_title_no_par_no_stop_words','bx_author', 'bx_author_lower','bx_binding','bx_pubdate','bx_pages','bx_publisher'],
                                     n_jobs=-3)

### Join on title (with parentheses) based on Jaccard score of at least 0.5 with 3-gram tokenization

In [15]:
wor_bx_pairs_tit_q3_jac_05 = ssj.jaccard_join(wor, bx, 'wor_id', 'bx_id', 'wor_title_lower', 'bx_title_lower', q3, 0.5, 
                                     l_out_attrs=['wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor', 'wor_firsttwoauthors','wor_binding','wor_pubdate', 'wor_pages','wor_publisher','wor_isbn'], 
                                     r_out_attrs=['bx_isbn','bx_title','bx_title_lower','bx_title_no_par','bx_title_no_par_no_stop_words','bx_author', 'bx_author_lower','bx_binding','bx_pubdate','bx_pages','bx_publisher'],
                                     n_jobs=-3)

### Join on title (without parentheses) based on Jaccard score of at least 0.5 with 3-gram tokenization

In [16]:
wor_bx_pairs_titnp_q3_jac_05 = ssj.jaccard_join(wor, bx, 'wor_id', 'bx_id', 'wor_title_no_par', 'bx_title_no_par', q3, 0.5, 
                                     l_out_attrs=['wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor', 'wor_firsttwoauthors','wor_binding','wor_pubdate', 'wor_pages','wor_publisher','wor_isbn'], 
                                     r_out_attrs=['bx_isbn','bx_title','bx_title_lower','bx_title_no_par','bx_title_no_par_no_stop_words','bx_author', 'bx_author_lower','bx_binding','bx_pubdate','bx_pages','bx_publisher'],
                                     n_jobs=-3)

### Join on title (w/out parentheses) and stop words removed based on Jaccard score of at least 0.5 with 3-gram tokenization

In [17]:
wor_bx_pairs_titnp_nosw_q3_jac_05 = ssj.jaccard_join(wor, bx, 'wor_id', 'bx_id', 'wor_title_no_par_no_stop_words', 'bx_title_no_par_no_stop_words', q3, 0.5, 
                                     l_out_attrs=['wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor', 'wor_firsttwoauthors','wor_binding','wor_pubdate', 'wor_pages','wor_publisher','wor_isbn'], 
                                     r_out_attrs=['bx_isbn','bx_title','bx_title_lower','bx_title_no_par','bx_title_no_par_no_stop_words','bx_author', 'bx_author_lower','bx_binding','bx_pubdate','bx_pages','bx_publisher'],
                                     n_jobs=-3)

### Join on title (w/out parentheses) and stop words removed based on Jaccard score of at least 0.3 with 3-gram tokenization

In [25]:
wor_bx_pairs_titnp_nosw_q3_jac_03 = ssj.jaccard_join(wor, bx, 'wor_id', 'bx_id', 'wor_title_no_par_no_stop_words', 'bx_title_no_par_no_stop_words', q3, 0.3, 
                                     l_out_attrs=['wor_title','wor_title_lower','wor_title_no_par','wor_title_no_par_no_stop_words','wor_author','wor_author_lower','wor_firstauthor', 'wor_firsttwoauthors','wor_binding','wor_pubdate', 'wor_pages','wor_publisher','wor_isbn'], 
                                     r_out_attrs=['bx_isbn','bx_title','bx_title_lower','bx_title_no_par','bx_title_no_par_no_stop_words','bx_author', 'bx_author_lower','bx_binding','bx_pubdate','bx_pages','bx_publisher'],
                                     n_jobs=-3)

### Evaluate Blocking

In [18]:
#Calculafte the true matches (based on ISBN) and store it in matches_wor_bx
matches_wor_bx = pd.merge(wor,bx,left_on='wor_isbn',right_on='bx_isbn')
print('True matches across wor and bxdery: {}'.format(matches_wor_bx.shape[0]))

True matches across wor and bxdery: 9270


In [19]:
def assignTrueLabels(df_blockedpairs, idcolumn1, idcolumn2):
    return df_blockedpairs.apply(lambda row: 1 if (row[idcolumn1]==row[idcolumn2]) else 0, axis=1)

In [20]:
def calcRedRatio(df1,df2,df_blockedpairs):
    cp = (df1.shape[0]*df2.shape[0])/2
    rr = (1-(df_blockedpairs.shape[0]/cp))*100
    return round(rr,2)

In [21]:
def calcPairCompISBN(df_matches,df_blockedpairs,true_label_col):
    pc = (df_blockedpairs[df_blockedpairs[true_label_col]==1].shape[0]/df_matches.shape[0])*100
    return round(pc,2)

In [22]:
#Assign labels to the blockedpairs df
wor_bx_pairs_a_q3_jac_05['true_label'] = assignTrueLabels(wor_bx_pairs_a_q3_jac_05,'l_wor_isbn','r_bx_isbn')
wor_bx_pairs_fa_q3_jac_05['true_label'] = assignTrueLabels(wor_bx_pairs_fa_q3_jac_05,'l_wor_isbn','r_bx_isbn')
wor_bx_pairs_tit_q3_jac_05['true_label'] = assignTrueLabels(wor_bx_pairs_tit_q3_jac_05,'l_wor_isbn','r_bx_isbn')
wor_bx_pairs_titnp_q3_jac_05['true_label'] = assignTrueLabels(wor_bx_pairs_titnp_q3_jac_05,'l_wor_isbn','r_bx_isbn')
wor_bx_pairs_titnp_nosw_q3_jac_05['true_label'] = assignTrueLabels(wor_bx_pairs_titnp_nosw_q3_jac_05,'l_wor_isbn','r_bx_isbn')

In [26]:
wor_bx_pairs_titnp_nosw_q3_jac_03['true_label'] = assignTrueLabels(wor_bx_pairs_titnp_nosw_q3_jac_03,'l_wor_isbn','r_bx_isbn')

In [27]:
print('Blocking based on author, Jaccard, 3-gram, Threshold 0.5')
print('Evaluation of wor_bx_pairs_a_q3_jac_05')
print('Number of rows: {:,}'.format(wor_bx_pairs_a_q3_jac_05.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(wor,bx,wor_bx_pairs_a_q3_jac_05)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_wor_bx,wor_bx_pairs_a_q3_jac_05,'true_label')))
print('\n')
print('Blocking based on first author (wor) and author (bx), Jaccard, 3-gram, Threshold 0.5')
print('Evaluation of wor_bx_pairs_fa_q3_jac_05')
print('Number of rows: {:,}'.format(wor_bx_pairs_fa_q3_jac_05.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(wor,bx,wor_bx_pairs_fa_q3_jac_05)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_wor_bx,wor_bx_pairs_fa_q3_jac_05,'true_label')))

print('\n')
print('Blocking based on title, Jaccard, 3-gram, Threshold 0.5')
print('Evaluation of wor_bx_pairs_tit_q3_jac_05 (Title, Jaccard, 3-gram, Threshold 0.5)')
print('Number of rows: {:,}'.format(wor_bx_pairs_tit_q3_jac_05.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(wor,bx,wor_bx_pairs_tit_q3_jac_05)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_wor_bx,wor_bx_pairs_tit_q3_jac_05,'true_label')))
print('\n')
print('Evaluation of wor_bx_pairs_titnp_q3_jac_05 (Title excluded text in parenthesis, Jaccard, 3-gram, Threshold 0.5)')
print('Number of rows: {:,}'.format(wor_bx_pairs_titnp_q3_jac_05.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(wor,bx,wor_bx_pairs_titnp_q3_jac_05)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_wor_bx,wor_bx_pairs_titnp_q3_jac_05,'true_label')))
print('\n')
print('Evaluation of wor_bx_pairs_titnp_nosw_q3_jac_05 (Title excluded text in parenthesis and stop words, Jaccard, 3-gram, Threshold 0.5)')
print('Number of rows: {:,}'.format(wor_bx_pairs_titnp_nosw_q3_jac_05.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(wor,bx,wor_bx_pairs_titnp_nosw_q3_jac_05)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_wor_bx,wor_bx_pairs_titnp_nosw_q3_jac_05,'true_label')))

print('\n')
print('Evaluation of wor_bx_pairs_titnp_nosw_q3_jac_03 (Title excluded text in parenthesis and stop words, Jaccard, 3-gram, Threshold 0.3)')
print('Number of rows: {:,}'.format(wor_bx_pairs_titnp_nosw_q3_jac_03.shape[0]))
print('Reduction Ratio: {}%'.format(calcRedRatio(wor,bx,wor_bx_pairs_titnp_nosw_q3_jac_03)))
print('Pair Completeness: {}%'.format(calcPairCompISBN(matches_wor_bx,wor_bx_pairs_titnp_nosw_q3_jac_03,'true_label')))

Blocking based on author, Jaccard, 3-gram, Threshold 0.5
Evaluation of wor_bx_pairs_a_q3_jac_05
Number of rows: 1,041,418
Reduction Ratio: 99.98%
Pair Completeness: 81.37%


Blocking based on first author (wor) and author (bx), Jaccard, 3-gram, Threshold 0.5
Evaluation of wor_bx_pairs_fa_q3_jac_05
Number of rows: 102,451
Reduction Ratio: 100.0%
Pair Completeness: 7.53%


Blocking based on title, Jaccard, 3-gram, Threshold 0.5
Evaluation of wor_bx_pairs_tit_q3_jac_05 (Title, Jaccard, 3-gram, Threshold 0.5)
Number of rows: 82,412
Reduction Ratio: 100.0%
Pair Completeness: 65.88%


Evaluation of wor_bx_pairs_titnp_q3_jac_05 (Title excluded text in parenthesis, Jaccard, 3-gram, Threshold 0.5)
Number of rows: 9,836
Reduction Ratio: 100.0%
Pair Completeness: 12.13%


Evaluation of wor_bx_pairs_titnp_nosw_q3_jac_05 (Title excluded text in parenthesis and stop words, Jaccard, 3-gram, Threshold 0.5)
Number of rows: 132,722
Reduction Ratio: 100.0%
Pair Completeness: 80.47%


Evaluation of wor_bx

In [28]:
wor_bx_pairs_titnp_nosw_q3_jac_05.to_csv('filtered_correspondences/wor_bx_pairs_titnp_nosw_q3_jac_05.csv', index=False)

In [None]:
matches_wor_bx.to_csv('matching_pairs/matches_wor_bx.csv',index=False)

-- Blocking based on Levenshtein performs significantly worse than Blocking on Jaccard distance with 3-gram tokens and blocking on title also performs worse compared to authors. Blocking on the firsttwoauthors performs best. Same results as with ban and wordery