In [1]:
import argparse
import numpy as np
import pandas as pd
import itertools
import os
import tqdm
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.metrics import roc_auc_score
import gc
from thefuzz import fuzz
from thefuzz import process
import time
import re
import recordlinkage
from sklearn.model_selection import train_test_split

gc.collect()

0

In [2]:
a = pd.read_csv('~/Downloads/a.csv',chunksize=10000,encoding='utf-8',on_bad_lines='skip',engine='python')
a = pd.concat(a)
a['wikiID'] = a['id']
a = a.set_index('id')
print(a.head())

b = pd.read_csv('~/Downloads/b.csv',chunksize=10000,encoding='utf-8',on_bad_lines='skip',engine='python')
b = pd.concat(b)
b['harvardIndex'] = b['id']
b = b.set_index('id')
print(b.head())

                              label  dateOfBirth  dateOfDeath  \
id                                                              
Q100142069             Frida Eggens          NaN          NaN   
Q100146795       Elizabeth Harrison       1792.0       1834.0   
Q100149196              Russell Cox          NaN          NaN   
Q100152296  Alda Pereira da Fonseca       1882.0          NaN   
Q100156193  Laurence Henry Millener       1914.0       2000.0   

           countryOfCitizenshipISO harvardIndex             bionomia  \
id                                                                     
Q100142069                      SE          NaN                  NaN   
Q100146795                      GB          NaN           Q100146795   
Q100149196                     NaN          NaN  0000-0001-5149-1709   
Q100152296                      BR          NaN                  NaN   
Q100156193                      NZ          NaN           Q100156193   

           authorAbbrv                 

In [3]:
# Find out the True Matches of dataframe A and dataframe B
a['harvardIndex'] = pd.to_numeric(a['harvardIndex'],errors='coerce') 
temp = pd.merge(a, b, how='inner', on=None, left_on='harvardIndex', right_on='harvardIndex',
                  left_index=False, right_index=False, sort=False,
                  suffixes=('_wiki', '_harvard'), copy=False, indicator=False)

print(temp.columns.to_list())
temp.head()

['label', 'dateOfBirth', 'dateOfDeath', 'countryOfCitizenshipISO', 'harvardIndex', 'bionomia', 'authorAbbrv', 'aliases', 'firstName_wiki', 'lastName_wiki', 'wikiID', 'Standard/Label Name', 'birthYear', 'deathYear', 'birthYearIsApprox', 'geographyISO', 'firstName_harvard', 'middleName', 'lastName_harvard', 'B & P Author Abbrev.', 'Name']


Unnamed: 0,label,dateOfBirth,dateOfDeath,countryOfCitizenshipISO,harvardIndex,bionomia,authorAbbrv,aliases,firstName_wiki,lastName_wiki,...,Standard/Label Name,birthYear,deathYear,birthYearIsApprox,geographyISO,firstName_harvard,middleName,lastName_harvard,B & P Author Abbrev.,Name
0,Johann Bartsch,1709.0,1738.0,DE,27614.0,,Bartsch,"Johannes Bartsch, Joannes Bartsch, Bartsch",Johann,Bartsch,...,J. Bartsch,1709.0,1738.0,False,,Johann,,Bartsch,Bartsch,"Bartsch, Johann, Johann Bartsch"
1,Townshend Stith Brandegee,1843.0,1925.0,US,17284.0,Q1002345,Brandegee,"Brandegee, T. S. Brandegee, Townshend S. Brand...",Townshend,Brandegee,...,T. S. Brandegee,1843.0,1925.0,False,"MX, US",Townshend,Stith,Brandegee,Brandegee,"Brandegee, Townshend Stith, Townshend Stith Br..."
2,Cécile Kruyfhooft,1950.0,,BE,18988.0,,,,Cécile,Kruyfhooft,...,C. Kruyfhooft,1950.0,,False,BE,Cecile,,Kruyfhooft,,"Kruyfhooft, Cecile"
3,Charles Bullard,1869.0,1960.0,US,3553.0,Q100354624,,C. Bullard,Charles,Bullard,...,Charles Bullard,1869.0,,False,US,Charles,,Bullard,,"Bullard, Charles"
4,Zubair Aslam,,,,88384.0,,,,Zubair,Aslam,...,Z. Aslam,,,,KR,Zubair,,Aslam,Aslam,"Aslam, Zubair"


In [4]:
'''
use the pd.merge above to prepare training data -- find out the index true pairs
training = 0.75, testing = 0.25, no validat set

experiment setting
    - supervised: Logistics regression, navie bayes, SVM
    - undupervised: k-mean, ECM

refs
https://recordlinkage.readthedocs.io/en/latest/ref-classifiers.html
https://recordlinkage.readthedocs.io/en/latest/guides/classifiers.html#
'''

def define_true_pairs(indexList1, indexList2, indexName1, indexName2):
    arrays = [indexList1, indexList2]
    tuples = list(zip(*arrays))
    index = pd.MultiIndex.from_tuples(tuples, names=[indexName1, indexName2])
    return index

In [5]:
true_matches = define_true_pairs(temp['wikiID'],temp['harvardIndex'].astype(int),'wikiID','harvardIndex')
print('There is '+ str(len(true_matches)) +' HarvardIndex records in Wikidata that can find a match, which is ' + str(len(true_matches)/len(a)*100) +'%')
print('There is '+ str(len(true_matches)) +' HarvardIndex records can be found in Wikidata, which is ' + str(len(true_matches)/len(b)*100) +'%')

There is 31022 HarvardIndex records in Wikidata that can find a match, which is 43.09329332666555%
There is 31022 HarvardIndex records can be found in Wikidata, which is 40.55375444467685%


In [6]:
print(true_matches)

MultiIndex([(   'Q100222', 27614),
            (  'Q1002345', 17284),
            ('Q100255559', 18988),
            ('Q100354624',  3553),
            ('Q100377900', 88384),
            (   'Q100411', 23934),
            ('Q100454982', 50126),
            (   'Q100523', 78274),
            ('Q100587885', 29015),
            ('Q100587966', 14467),
            ...
            (    'Q65219', 25324),
            (     'Q6527', 26300),
            (    'Q65302',  4297),
            (    'Q65400',  2192),
            (    'Q65402', 46610),
            (    'Q65451',  1041),
            (    'Q65475', 38017),
            (    'Q65505', 80335),
            (     'Q6694',  1813),
            (     'Q7324',  8283)],
           names=['wikiID', 'harvardIndex'], length=31022)


In [7]:
# Find unmatched wiki_data entries
unmatched_wiki_data = a[~a.index.isin(true_matches.get_level_values('wikiID'))].reset_index()
unmatched_wiki_data

Unnamed: 0,id,label,dateOfBirth,dateOfDeath,countryOfCitizenshipISO,harvardIndex,bionomia,authorAbbrv,aliases,firstName,lastName,wikiID
0,Q100142069,Frida Eggens,,,SE,,,Eggens,Eggens,Frida,Eggens,Q100142069
1,Q100146795,Elizabeth Harrison,1792.0,1834.0,GB,,Q100146795,,"Mrs Arnold Harrison, Mrs A. H.",Elizabeth,Harrison,Q100146795
2,Q100149196,Russell Cox,,,,,0000-0001-5149-1709,,,Russell,Cox,Q100149196
3,Q100152296,Alda Pereira da Fonseca,1882.0,,BR,,,,,Alda,Fonseca,Q100152296
4,Q100156193,Laurence Henry Millener,1914.0,2000.0,NZ,,Q100156193,,"L. H. Millener, Laurie Henry Millener, Laurie ...",Laurence,Millener,Q100156193
...,...,...,...,...,...,...,...,...,...,...,...,...
40961,Q65499,Wilhelm Ferdinand Erichson,1809.0,1848.0,DE,,Q65499,,"Erichson, W. F. Erichson, Wilhelm F. Erichson,...",Wilhelm,Erichson,Q65499
40962,Q7450,Asima Chatterjee,1917.0,2006.0,IN,,,,Asima Chattopadhyay,Asima,Chatterjee,Q7450
40963,Q762,Leonardo da Vinci,1452.0,1519.0,,,,,"Leonardo di ser Piero da Vinci, Leonardo, da V...",Leonardo,Vinci,Q762
40964,Q8619,Pierre Trudeau,1919.0,2000.0,CA,,Q8619,,"Pierre Elliott Trudeau, Joseph Philippe Pierre...",Pierre,Trudeau,Q8619


In [8]:
# Find unmatched harvard_data entries
unmatched_harvard_data = b[~b.index.isin(true_matches.get_level_values('harvardIndex'))].reset_index()
unmatched_harvard_data

Unnamed: 0,id,Standard/Label Name,birthYear,deathYear,birthYearIsApprox,geographyISO,firstName,middleName,lastName,B & P Author Abbrev.,Name,harvardIndex
0,89438,Shin. Sato,,,,DE,Shinya,,Sato,Shin. Sato,"Sato, Shinya, Shinya Sato",89438
1,37972,J. T. I. Boswell,1822.0,,False,,John,Thomas Irving,Boswell,,"Boswell, John Thomas Irving",37972
2,86767,H. H. Hu & W. C. Chêng,,,,CN,Hsen,Hsu & Chêng,Hu,Hu & W. C. Cheng,"Hu, Hsen Hsu & Chêng, Wan-chun",86767
3,29813,P. Lasquety,,,,,P.,,Lasquety,,"Lasquety, P.",29813
4,29336,J. I. Treby,,,,RU,Ju.,I.,Treby,,"Treby, Ju. I., J. Treboux?",29336
...,...,...,...,...,...,...,...,...,...,...,...,...
45481,88336,Flora Altaica,,,,RU,Flora,,Altaica,,Flora Altaica,88336
45482,35178,J. Laycock,,,,,John,,Laycock,,"Laycock, John",35178
45483,29118,Tutajev,,,,RU,Tutajev,,,,Tutajev,29118
45484,72000,Boeuf,,,,"FR, TN",F.,,Boeuf,Boeuf,"Boeuf, F., F. Boeuf",72000


In [9]:
# Find non-matched pairs in dataframe a
non_matched_a = a[~a['harvardIndex'].isin(temp['harvardIndex']) & a['harvardIndex'].notna()]
    
# Find non-matched pairs in dataframe b
non_matched_b = b[~b['harvardIndex'].isin(temp['harvardIndex']) & b['harvardIndex'].notnull()]

In [10]:
print('Non-matched data where HarvardIndex is not null or NaN:')
print('\nNon-matched pairs in dataframe wiki:')
print(len(non_matched_a))

print('\nNon-matched pairs in dataframe havard:')
print(len(non_matched_b))

Non-matched data where HarvardIndex is not null or NaN:

Non-matched pairs in dataframe wiki:
2389

Non-matched pairs in dataframe havard:
45486


In [11]:
print('There is '+ str(len(non_matched_a)) +' HarvardIndex records in Wikidata which cannot be found in HarvardIndex, which is ' + str(len(non_matched_a)/len(a)*100) +'%')
print('There is '+ str(len(non_matched_b)) +' HarvardIndex records cannot find a match, which is ' + str(len(non_matched_b)/len(b)*100) +'%')

There is 2389 HarvardIndex records in Wikidata which cannot be found in HarvardIndex, which is 3.3186086569983884%
There is 45486 HarvardIndex records cannot find a match, which is 59.4619326500732%


In [12]:
# Define the function to split the full name into first name and last name
def split_full_name(full_name):
    if pd.isnull(full_name) or full_name == '':
        return '', ''
    parts = full_name.split()
    first_name = ' '.join(parts[:-1])
    last_name = parts[-1]
    return first_name, last_name

# Define the function to convert each word in the first name to the desired format
def convert_to_initial(name):
    if pd.isnull(name) or name == '':
        return ''
    initials = [word[0].upper() + '.' for word in name.split()]
    return ' '.join(initials)

In [13]:
dfa = a.copy()
dfb = b.copy()

In [14]:
# Apply the split_full_name function to separate first name and last name
dfa[['first_name', 'last_name']] = dfa['label'].apply(lambda x: pd.Series(split_full_name(x)))

# Apply the convert_to_initial function to the first name column
dfa['first_name_initial'] = dfa['first_name'].apply(convert_to_initial)


In [15]:
# Apply the split_full_name function to separate first name and last name
dfb[['first_name', 'last_name']] = dfb['Standard/Label Name'].apply(lambda x: pd.Series(split_full_name(x)))

# Apply the convert_to_initial function to the first name column
dfb['first_name_initial'] = dfb['first_name'].apply(convert_to_initial)

In [16]:
dfa.head()

Unnamed: 0_level_0,label,dateOfBirth,dateOfDeath,countryOfCitizenshipISO,harvardIndex,bionomia,authorAbbrv,aliases,firstName,lastName,wikiID,first_name,last_name,first_name_initial
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Q100142069,Frida Eggens,,,SE,,,Eggens,Eggens,Frida,Eggens,Q100142069,Frida,Eggens,F.
Q100146795,Elizabeth Harrison,1792.0,1834.0,GB,,Q100146795,,"Mrs Arnold Harrison, Mrs A. H.",Elizabeth,Harrison,Q100146795,Elizabeth,Harrison,E.
Q100149196,Russell Cox,,,,,0000-0001-5149-1709,,,Russell,Cox,Q100149196,Russell,Cox,R.
Q100152296,Alda Pereira da Fonseca,1882.0,,BR,,,,,Alda,Fonseca,Q100152296,Alda Pereira da,Fonseca,A. P. D.
Q100156193,Laurence Henry Millener,1914.0,2000.0,NZ,,Q100156193,,"L. H. Millener, Laurie Henry Millener, Laurie ...",Laurence,Millener,Q100156193,Laurence Henry,Millener,L. H.


In [17]:
indexer = recordlinkage.Index()
indexer.sortedneighbourhood('lastName')# indexer.sortedneighbourhood indexer.full() indexer.block
candidate_links = indexer.index(dfa, dfb)

In [42]:
compare_cl = recordlinkage.Compare()
# method options: ‘jaro’, ’jarowinkler’, ‘levenshtein’, ‘damerau_levenshtein’, ‘qgram’ or ‘cosine’.
compare_cl.string('firstName', 'firstName', method='damerau_levenshtein', threshold=0.85, label='firstName')
compare_cl.exact('dateOfBirth', 'birthYear', label='dateOfBirth')
# compare_cl.exact('dateOfDeath', 'deathYear', label='dateOfDeath')
# compare_cl.string('authorAbbrv', 'B & P Author Abbrev.', method='damerau_levenshtein', threshold=0.85, label='authorAbbrv')
# compare_cl.string('countryOfCitizenshipISO', 'geographyISO', method='damerau_levenshtein', threshold=0.85, label='geographyISO')
# compare_cl.add(CompareAliases('label', 'Name', threshold=0.85, label='sim'))

features = compare_cl.compute(candidate_links, dfa, dfb)
features.sum(axis=1).value_counts().sort_index(ascending=False)

2.0     17533
1.0     21517
0.0    964962
dtype: int64

In [89]:
predictions = features[features.sum(axis=1) >= 1]
len(predictions.index.intersection(true_matches))/len(temp)

0.7268390174714718

In [43]:
# Threshold-based methods

# Make Predictions based on threshold > 50%
predictions = features[features.sum(axis=1) > 1.5]
print("Threshold-Based: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(true_matches, predictions, len(features))

Threshold-Based: 17533 matches


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [44]:
# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

Confusion Matrix:
 [[ 14538  16484]
 [  2995 969995]]
Precision: 0.8291792619631552
Recall: 0.4686351621429953
Accuracy: 0.980598837464094
F-Measure: 0.5988260735248687


In [45]:
# Return all the true matched feature as test set
test_matches_index = features.index.intersection(true_matches)
print(test_matches_index)

MultiIndex([('Q100887787', 19901),
            ('Q117455407', 50431),
            (  'Q1047867', 48996),
            ('Q117459360', 14570),
            ('Q105721668', 25189),
            ( 'Q33665872', 15901),
            (   'Q106785', 21404),
            ( 'Q21505291', 14754),
            ('Q108403262', 26020),
            ( 'Q36645320', 71958),
            ...
            ( 'Q95101266', 36482),
            ( 'Q95166173', 11149),
            ( 'Q95175049',  7562),
            ( 'Q95394399', 26018),
            ( 'Q95471212',  4709),
            (    'Q95772', 15179),
            (    'Q96384',  5059),
            (    'Q27684',  1609),
            (    'Q59570',  2718),
            (    'Q62938',  1671)],
           length=27766)


In [46]:
# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42) # random_state=42

# Get the true pairs for the test set (Used for Evaluation)
test_matches_index = test.index.intersection(true_matches)

In [47]:
# Expectation/Conditional Maxisation Classifier

# Initialize the classifier
ecm = recordlinkage.ECMClassifier()
# Train the Model
train_ecm = ecm.fit_predict(train)
# Make Predictions on a test set
predictions = ecm.predict(test)

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [48]:
# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

Confusion Matrix:
 [[  3732   3279]
 [   769 243223]]
Precision: 0.8291490779826706
Recall: 0.5323063756953359
Accuracy: 0.9838727027167006
F-Measure: 0.6483669214732453


In [49]:
# K-means Classifier
# Initialize the classifier
kmeans = recordlinkage.KMeansClassifier()

# Train the Model
train_kmeans = kmeans.learn(train)
# Make Predictions on a test set
predictions = kmeans.predict(test)

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [50]:
# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

Confusion Matrix:
 [[  3732   3279]
 [   769 243223]]
Precision: 0.8291490779826706
Recall: 0.5323063756953359
Accuracy: 0.9838727027167006
F-Measure: 0.6483669214732453


In [51]:
# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42)

# Get the true pairs for each set
train_matches_index = train.index.intersection(true_matches)
test_matches_index = test.index.intersection(true_matches)

In [52]:
# Logistic Regression

# Initialize the classifier
logisticRegression = recordlinkage.LogisticRegressionClassifier()

# Train the classifier
logisticRegression.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = logisticRegression.predict(test)

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

  y.loc[match_index & comparison_vectors.index] = 1
  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [53]:
# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

Confusion Matrix:
 [[  3732   3279]
 [   769 243223]]
Precision: 0.8291490779826706
Recall: 0.5323063756953359
Accuracy: 0.9838727027167006
F-Measure: 0.6483669214732453


In [54]:
# Support Vector Machine Classifier

# Initialize the classifier
svm = recordlinkage.SVMClassifier()

# Train the classifier
svm.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = svm.predict(test)

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

  y.loc[match_index & comparison_vectors.index] = 1
  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [55]:
# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

Confusion Matrix:
 [[  3732   3279]
 [   769 243223]]
Precision: 0.8291490779826706
Recall: 0.5323063756953359
Accuracy: 0.9838727027167006
F-Measure: 0.6483669214732453


In [56]:
# Naive Bayes Classifier

# Initialize the classifier
naiveBayes = recordlinkage.NaiveBayesClassifier()

# Train the classifier
naiveBayes.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = naiveBayes.predict(test)

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

  y.loc[match_index & comparison_vectors.index] = 1
  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [57]:
# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

Confusion Matrix:
 [[  3732   3279]
 [   769 243223]]
Precision: 0.8291490779826706
Recall: 0.5323063756953359
Accuracy: 0.9838727027167006
F-Measure: 0.6483669214732453


In [None]:
def find_AinB(df1, col1, df2, col2, threshold):
    newCol = []
    for index1, rowValue1 in df1[col1].iteritems():
        temp = []
        for index2, rowValue2 in df2[col2].iteritems():
            # 
            sim = process.extractOne(str(rowValue1),str(rowValue2).split(','), scorer=fuzz.ratio)
            if sim[-1] >= threshold: temp.append(index2)
        newCol.append(temp)
    return newCol

In [None]:
temp = find_AinB(a, 'label', b, 'Name', 85)

In [None]:
from fastparquet import ParquetFile
temp.to_parquet('find_names.parquet', engine='fastparquet',encoding='utf-8')

In [44]:
# Load feature DataFrames
dfa_feature1 = pd.read_csv('WinH.csv')  
dfb_feature1 = pd.read_csv('HinW.csv')  

In [45]:
dfa_feature1

Unnamed: 0,wikiID,wikiLabel_in_HarvardNameList
0,Q100142069,[]
1,Q100146795,[76340]
2,Q100149196,[]
3,Q100152296,[]
4,Q100156193,[]
...,...,...
71983,Q7324,"[34526, 8283, 82867]"
71984,Q7450,[]
71985,Q762,[]
71986,Q8619,[]


In [46]:
dfb_feature1

Unnamed: 0,harvardIndex,havard_in_WikiNameList
0,89438,['Q47125658']
1,64680,"['Q102788', 'Q36545989']"
2,34653,"['Q113588015', 'Q4442569']"
3,42819,['Q6158207']
4,82862,[]
...,...,...
76491,72000,[]
76492,83026,"['Q21522832', 'Q21522835']"
76493,7450,[]
76494,702,[]


In [24]:
def add_boolean_column(df, list_column_name, boolean_column_name):
    df[boolean_column_name] = df[list_column_name].apply(lambda x: False if x == "[]" else bool(x))
    return df

In [25]:
# Merge feature DataFrames into main DataFrames
dfa = pd.merge(dfa, dfa_feature1, on='wikiID')
dfb = pd.merge(dfb, dfb_feature1, on='harvardIndex')

# Set index
dfa.set_index('wikiID', inplace=True)
dfb.set_index('harvardIndex', inplace=True)

In [26]:
dfa = add_boolean_column(dfa, 'wikiLabel_in_HarvardNameList', 'has_HarvardNameList')
dfa

Unnamed: 0_level_0,label,dateOfBirth,dateOfDeath,countryOfCitizenshipISO,harvardIndex,bionomia,authorAbbrv,aliases,firstName,lastName,first_name,last_name,first_name_initial,wikiLabel_in_HarvardNameList,has_HarvardNameList
wikiID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Q100142069,Frida Eggens,,,SE,,,Eggens,Eggens,Frida,Eggens,Frida,Eggens,F.,[],False
Q100146795,Elizabeth Harrison,1792.0,1834.0,GB,,Q100146795,,"Mrs Arnold Harrison, Mrs A. H.",Elizabeth,Harrison,Elizabeth,Harrison,E.,[76340],True
Q100149196,Russell Cox,,,,,0000-0001-5149-1709,,,Russell,Cox,Russell,Cox,R.,[],False
Q100152296,Alda Pereira da Fonseca,1882.0,,BR,,,,,Alda,Fonseca,Alda Pereira da,Fonseca,A. P. D.,[],False
Q100156193,Laurence Henry Millener,1914.0,2000.0,NZ,,Q100156193,,"L. H. Millener, Laurie Henry Millener, Laurie ...",Laurence,Millener,Laurence Henry,Millener,L. H.,[],False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q7324,James Cook,1728.0,1779.0,,8283.0,Q7324,Cook,"Captain James Cook, Cook, Captain Cook, J. Coo...",James,Cook,James,Cook,J.,"[34526, 8283, 82867]",True
Q7450,Asima Chatterjee,1917.0,2006.0,IN,,,,Asima Chattopadhyay,Asima,Chatterjee,Asima,Chatterjee,A.,[],False
Q762,Leonardo da Vinci,1452.0,1519.0,,,,,"Leonardo di ser Piero da Vinci, Leonardo, da V...",Leonardo,Vinci,Leonardo da,Vinci,L. D.,[],False
Q8619,Pierre Trudeau,1919.0,2000.0,CA,,Q8619,,"Pierre Elliott Trudeau, Joseph Philippe Pierre...",Pierre,Trudeau,Pierre,Trudeau,P.,[],False


In [27]:
dfb = add_boolean_column(dfb, 'havard_in_WikiNameList', 'has_WikiNameList')
dfb

Unnamed: 0_level_0,Standard/Label Name,birthYear,deathYear,birthYearIsApprox,geographyISO,firstName,middleName,lastName,B & P Author Abbrev.,Name,first_name,last_name,first_name_initial,havard_in_WikiNameList,has_WikiNameList
harvardIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
89438,Shin. Sato,,,,DE,Shinya,,Sato,Shin. Sato,"Sato, Shinya, Shinya Sato",Shin.,Sato,S.,['Q47125658'],True
64680,E. J. Hoffman,,,,,Emily,J.,Hoffman,E. J. Hoffman,"Hoffman, Emily J., Emily J. Hoffman",E. J.,Hoffman,E. J.,"['Q102788', 'Q36545989']",True
34653,D. W. Stevenson,1942.0,,False,US,Dennis,William,Stevenson,D. W. Stev.,"Stevenson, Dennis William, Dennis William Stev...",D. W.,Stevenson,D. W.,"['Q113588015', 'Q4442569']",True
42819,D. Müller-Doblies,1938.0,,False,DE,Dietrich,,Müller-Doblies,D. Müll.-Doblies,"Müller-Doblies, Dietrich, Dietrich Müller-Do...",D.,Müller-Doblies,D.,['Q6158207'],True
82862,Boutroux,,,,,A.,,Boutroux,Boutroux,"Boutroux, A., A. Boutroux",,Boutroux,,[],False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72000,Boeuf,,,,"FR, TN",F.,,Boeuf,Boeuf,"Boeuf, F., F. Boeuf",,Boeuf,,[],False
83026,Perold,1928.0,,False,ZA,Sarie,Magdalena,Perold,Perold,"Perold, Sarie Magdalena, Sarie Magdalena Perold",,Perold,,"['Q21522832', 'Q21522835']",True
7450,Colin C. Stewart,1873.0,1944.0,False,US,Colin,C.,Stewart,,"Stewart, Colin C., C. C. Stewart bis",Colin C.,Stewart,C. C.,[],False
702,J. É. Doassans,1852.0,1908.0,False,FR,Jacques,Emile,Doassans,Doass.,"Doassans, Jacques Emile, Jaques Emile Doassans",J. É.,Doassans,J. E.,[],False


In [28]:
# Define the custom feature function
def custom_boolean_feature(index, dfa, dfb, feature1, feature2):
    result = []
    for a, b in index:
        feature1_value = dfa.loc[a, feature1]
        feature2_value = dfb.loc[b, feature2]
        result.append(feature1_value and feature2_value)
    return pd.Series(result, index=index)

In [54]:
# Initialize the indexer and create candidate links
indexer = recordlinkage.Index()
indexer.sortedneighbourhood('lastName')  # Can change to other methods like indexer.full(), indexer.block()
candidate_links = indexer.index(dfa, dfb)
# Initialize the comparator
compare_cl = recordlinkage.Compare()

In [50]:
# Add existing comparison features
compare_cl.string('firstName', 'firstName', method='damerau_levenshtein', threshold=0.85, label='firstName')
compare_cl.exact('dateOfBirth', 'birthYear', label='dateOfBirth')
# Add boolean comparison feature using compare.exact
compare_cl.exact('has_HarvardNameList', 'has_WikiNameList', label='custom_boolean_feature')

<Compare>

In [33]:
# Create a custom boolean feature
def create_custom_boolean_feature(candidate_links, dfa, dfb, feature1, feature2):
    custom_feature = []
    for a, b in candidate_links:
        feature1_value = dfa.loc[a, feature1]
        feature2_value = dfb.loc[b, feature2]
        result = 1 if feature1_value or feature2_value else 0
        custom_feature.append(result)
    return pd.Series(custom_feature, index=candidate_links)

In [34]:
# Compute the custom boolean feature
custom_feature_series = create_custom_boolean_feature(candidate_links, dfa, dfb, 'has_HarvardNameList', 'has_WikiNameList')

# Compute the features using recordlinkage
features = compare_cl.compute(candidate_links, dfa, dfb)

# Create a DataFrame from the custom feature series
custom_feature_df = pd.DataFrame(custom_feature_series, columns=['custom_boolean_feature'])

# Merge the custom feature DataFrame with the features DataFrame
features = features.merge(custom_feature_df, left_index=True, right_index=True)

KeyboardInterrupt: 

In [36]:
# Display the result (features DataFrame)
print(features)

                         firstName  dateOfBirth  custom_boolean_feature
wikiID     harvardIndex                                                
Q100142069 12309               0.0            0                       0
           7349                0.0            0                       1
           33113               0.0            0                       1
           82207               0.0            0                       0
Q100146795 11073               0.0            0                       1
...                            ...          ...                     ...
Q65451     76231               0.0            0                       1
Q65499     44058               0.0            0                       1
           6624                0.0            0                       1
Q6694      48627               0.0            0                       1
Q762       49464               0.0            0                       0

[1004012 rows x 3 columns]


In [35]:
features = compare_cl.compute(candidate_links, dfa, dfb)

In [37]:
# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42) # random_state=42

print(train)

                         firstName  dateOfBirth  custom_boolean_feature
wikiID     harvardIndex                                                
Q13221655  63047               0.0            0                       1
Q36705638  86181               0.0            0                       1
Q51278449  49682               0.0            0                       1
Q88830067  79764               0.0            0                       0
Q36529743  77298               0.0            0                       1
...                            ...          ...                     ...
Q36537887  40932               0.0            0                       0
Q88820519  67270               0.0            0                       0
Q98915388  46079               0.0            0                       0
Q107574272 63069               0.0            0                       1
Q94408787  45923               0.0            0                       1

[753009 rows x 3 columns]


In [38]:
dfa

Unnamed: 0_level_0,label,dateOfBirth,dateOfDeath,countryOfCitizenshipISO,harvardIndex,bionomia,authorAbbrv,aliases,firstName,lastName,first_name,last_name,first_name_initial,wikiLabel_in_HarvardNameList,has_HarvardNameList
wikiID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Q100142069,Frida Eggens,,,SE,,,Eggens,Eggens,Frida,Eggens,Frida,Eggens,F.,[],False
Q100146795,Elizabeth Harrison,1792.0,1834.0,GB,,Q100146795,,"Mrs Arnold Harrison, Mrs A. H.",Elizabeth,Harrison,Elizabeth,Harrison,E.,[76340],True
Q100149196,Russell Cox,,,,,0000-0001-5149-1709,,,Russell,Cox,Russell,Cox,R.,[],False
Q100152296,Alda Pereira da Fonseca,1882.0,,BR,,,,,Alda,Fonseca,Alda Pereira da,Fonseca,A. P. D.,[],False
Q100156193,Laurence Henry Millener,1914.0,2000.0,NZ,,Q100156193,,"L. H. Millener, Laurie Henry Millener, Laurie ...",Laurence,Millener,Laurence Henry,Millener,L. H.,[],False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q7324,James Cook,1728.0,1779.0,,8283.0,Q7324,Cook,"Captain James Cook, Cook, Captain Cook, J. Coo...",James,Cook,James,Cook,J.,"[34526, 8283, 82867]",True
Q7450,Asima Chatterjee,1917.0,2006.0,IN,,,,Asima Chattopadhyay,Asima,Chatterjee,Asima,Chatterjee,A.,[],False
Q762,Leonardo da Vinci,1452.0,1519.0,,,,,"Leonardo di ser Piero da Vinci, Leonardo, da V...",Leonardo,Vinci,Leonardo da,Vinci,L. D.,[],False
Q8619,Pierre Trudeau,1919.0,2000.0,CA,,Q8619,,"Pierre Elliott Trudeau, Joseph Philippe Pierre...",Pierre,Trudeau,Pierre,Trudeau,P.,[],False


In [39]:
# Filter features to include only true matches
true_matches_filtered = true_matches.intersection(features.index)
print(true_matches_filtered)
features_true = features.loc[true_matches_filtered]
print(features_true)

MultiIndex([(   'Q100222', 27614),
            (  'Q1002345', 17284),
            ('Q100255559', 18988),
            ('Q100354624',  3553),
            ('Q100377900', 88384),
            (   'Q100411', 23934),
            ('Q100454982', 50126),
            (   'Q100523', 78274),
            ('Q100587885', 29015),
            ('Q100587966', 14467),
            ...
            (    'Q65192', 80079),
            (    'Q65219', 25324),
            (     'Q6527', 26300),
            (    'Q65302',  4297),
            (    'Q65400',  2192),
            (    'Q65402', 46610),
            (    'Q65451',  1041),
            (    'Q65505', 80335),
            (     'Q6694',  1813),
            (     'Q7324',  8283)],
           names=['wikiID', 'harvardIndex'], length=27766)
                         firstName  dateOfBirth  custom_boolean_feature
wikiID     harvardIndex                                                
Q100222    27614               1.0            1                       1
Q1002345

In [40]:
train = features_true
print(train)

                         firstName  dateOfBirth  custom_boolean_feature
wikiID     harvardIndex                                                
Q100222    27614               1.0            1                       1
Q1002345   17284               1.0            1                       1
Q100255559 18988               0.0            1                       1
Q100354624 3553                1.0            1                       0
Q100377900 88384               1.0            0                       1
...                            ...          ...                     ...
Q65402     46610               1.0            1                       1
Q65451     1041                1.0            1                       1
Q65505     80335               1.0            1                       1
Q6694      1813                0.0            1                       1
Q7324      8283                1.0            1                       1

[27766 rows x 3 columns]


In [41]:
test = features
print(test)

                         firstName  dateOfBirth  custom_boolean_feature
wikiID     harvardIndex                                                
Q100142069 12309               0.0            0                       0
           7349                0.0            0                       1
           33113               0.0            0                       1
           82207               0.0            0                       0
Q100146795 11073               0.0            0                       1
...                            ...          ...                     ...
Q65451     76231               0.0            0                       1
Q65499     44058               0.0            0                       1
           6624                0.0            0                       1
Q6694      48627               0.0            0                       1
Q762       49464               0.0            0                       0

[1004012 rows x 3 columns]


In [213]:
# Get the true pairs for the test set (Used for Evaluation)
test_matches_index = test.index.intersection(true_matches)
test_matches_index

MultiIndex([('Q100887787', 19901),
            ('Q117455407', 50431),
            (  'Q1047867', 48996),
            ('Q117459360', 14570),
            ('Q105721668', 25189),
            ( 'Q33665872', 15901),
            (   'Q106785', 21404),
            ( 'Q21505291', 14754),
            ('Q108403262', 26020),
            ( 'Q36645320', 71958),
            ...
            ( 'Q95101266', 36482),
            ( 'Q95166173', 11149),
            ( 'Q95175049',  7562),
            ( 'Q95394399', 26018),
            ( 'Q95471212',  4709),
            (    'Q95772', 15179),
            (    'Q96384',  5059),
            (    'Q27684',  1609),
            (    'Q59570',  2718),
            (    'Q62938',  1671)],
           names=['wikiID', 'harvardIndex'], length=27766)

In [214]:
# Threshold-based methods

print("Training with threshold-based methods")
# Make Predictions based on threshold > 50%
predictions = features[features.sum(axis=1) > 1.5]
print("Threshold-Based: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(true_matches, predictions, len(features))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

# Return all the true matched feature as test set
# test_matches_index = features.index.intersection(true_matches)
# print(test_matches_index)


# Expectation/Conditional Maxisation Classifier

print("\nTraining with Expectation/Conditional Maxisation Classifier")
# Initialize the classifier
ecm = recordlinkage.ECMClassifier()
# Train the Model
train_ecm = ecm.fit_predict(train)
# Make Predictions on a test set
predictions = ecm.predict(test)
print("Expectation/Conditional Maxisation: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# K-means Classifier
# Initialize the classifier
print("\nTraining with K-means Classifier")
kmeans = recordlinkage.KMeansClassifier()

# Train the Model
train_kmeans = kmeans.fit_predict(train)
# Make Predictions on a test set
predictions = kmeans.predict(test)
print("K-means: {} matches".format(len(predictions)))
# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


Training with threshold-based methods
Threshold-Based: 31052 matches


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


Confusion Matrix:
 [[ 20763  10259]
 [ 10289 962701]]
Precision: 0.6686525827643952
Recall: 0.6692992070143768
Accuracy: 0.9795341091540739
F-Measure: 0.668975738634533

Training with Expectation/Conditional Maxisation Classifier
Expectation/Conditional Maxisation: 17533 matches


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


Confusion Matrix:
 [[ 14538  13228]
 [  2995 973251]]
Precision: 0.8291792619631552
Recall: 0.5235900021609162
Accuracy: 0.983841826591714
F-Measure: 0.6418684739177465

Training with K-means Classifier
K-means: 31052 matches


  return len(links_true & links_pred)


Confusion Matrix:
 [[ 20763   7003]
 [ 10289 965957]]
Precision: 0.6686525827643952
Recall: 0.7477850608658071
Accuracy: 0.9827770982816938
F-Measure: 0.7060083647862899


  return int(total) - len(links_true | links_pred)


In [42]:
# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42)
# Get the true pairs for each set
train_matches_index = train.index.intersection(true_matches)
test_matches_index = test.index.intersection(true_matches)

# Logistic Regression
# Initialize the classifier
print("\nTraining with Logistic Regression")
logisticRegression = recordlinkage.LogisticRegressionClassifier()

# Train the classifier
logisticRegression.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = logisticRegression.predict(test)
print("Logistic Regression: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Support Vector Machine Classifier
# Initialize the classifier
print("\nTraining with Support Vector Machine")
svm = recordlinkage.SVMClassifier()

# Train the classifier
svm.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = svm.predict(test)
print("Support Vector Machine: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Naive Bayes Classifier
# Initialize the classifier
print("\nTraining with Naive Bayes Classifier")
naiveBayes = recordlinkage.NaiveBayesClassifier()

# Train the classifier
naiveBayes.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = naiveBayes.predict(test)
print("Naive Bayes: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


Training with Logistic Regression


  y.loc[match_index & comparison_vectors.index] = 1


Logistic Regression: 4501 matches
Confusion Matrix:
 [[  3732   3279]
 [   769 243223]]
Precision: 0.8291490779826706
Recall: 0.5323063756953359
Accuracy: 0.9838727027167006
F-Measure: 0.6483669214732453

Training with Support Vector Machine


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)
  y.loc[match_index & comparison_vectors.index] = 1


Support Vector Machine: 4501 matches
Confusion Matrix:
 [[  3732   3279]
 [   769 243223]]
Precision: 0.8291490779826706
Recall: 0.5323063756953359
Accuracy: 0.9838727027167006
F-Measure: 0.6483669214732453

Training with Naive Bayes Classifier


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)
  y.loc[match_index & comparison_vectors.index] = 1


Naive Bayes: 4501 matches
Confusion Matrix:
 [[  3732   3279]
 [   769 243223]]
Precision: 0.8291490779826706
Recall: 0.5323063756953359
Accuracy: 0.9838727027167006
F-Measure: 0.6483669214732453


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [57]:
# Add existing comparison features
compare_cl.string('firstName', 'firstName', method='damerau_levenshtein', threshold=0.85, label='firstName')
compare_cl.exact('dateOfBirth', 'birthYear', label='dateOfBirth')
compare_cl.exact('dateOfDeath', 'deathYear', label='dateOfDeath')
# Add boolean comparison feature using compare.exact
compare_cl.exact('has_HarvardNameList', 'has_WikiNameList', label='custom_boolean_feature')

features = compare_cl.compute(candidate_links, dfa, dfb)
features.sum(axis=1).value_counts().sort_index(ascending=False)

# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42) # random_state=42

# Get the true pairs for the test set (Used for Evaluation)
test_matches_index = test.index.intersection(true_matches)

In [58]:
# Threshold-based methods

print("Training with threshold-based methods")
# Make Predictions based on threshold > 50%
predictions = features[features.sum(axis=1) > 1.5]
print("Threshold-Based: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(true_matches, predictions, len(features))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

# Return all the true matched feature as test set
# test_matches_index = features.index.intersection(true_matches)
# print(test_matches_index)


# Expectation/Conditional Maxisation Classifier

print("\nTraining with Expectation/Conditional Maxisation Classifier")
# Initialize the classifier
ecm = recordlinkage.ECMClassifier()
# Train the Model
train_ecm = ecm.fit_predict(train)
# Make Predictions on a test set
predictions = ecm.predict(test)
print("Expectation/Conditional Maxisation: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# K-means Classifier
# Initialize the classifier
print("\nTraining with K-means Classifier")
kmeans = recordlinkage.KMeansClassifier()

# Train the Model
train_kmeans = kmeans.fit_predict(train)
# Make Predictions on a test set
predictions = kmeans.predict(test)
print("K-means: {} matches".format(len(predictions)))
# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


Training with threshold-based methods
Threshold-Based: 32473 matches


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


Confusion Matrix:
 [[ 21631   9391]
 [ 10842 962148]]
Precision: 0.6661226249499584
Recall: 0.6972793501386113
Accuracy: 0.9798478504240985
F-Measure: 0.6813449877943145

Training with Expectation/Conditional Maxisation Classifier
Expectation/Conditional Maxisation: 5114 matches
Confusion Matrix:
 [[  4210   2801]
 [   904 243088]]
Precision: 0.8232303480641376
Recall: 0.6004849522179432
Accuracy: 0.9852392202483635
F-Measure: 0.6944329896907216

Training with K-means Classifier


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


K-means: 5114 matches
Confusion Matrix:
 [[  4210   2801]
 [   904 243088]]
Precision: 0.8232303480641376
Recall: 0.6004849522179432
Accuracy: 0.9852392202483635
F-Measure: 0.6944329896907216


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [59]:
# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42)
# Get the true pairs for each set
train_matches_index = train.index.intersection(true_matches)
test_matches_index = test.index.intersection(true_matches)

# Logistic Regression
# Initialize the classifier
print("\nTraining with Logistic Regression")
logisticRegression = recordlinkage.LogisticRegressionClassifier()

# Train the classifier
logisticRegression.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = logisticRegression.predict(test)
print("Logistic Regression: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Support Vector Machine Classifier
# Initialize the classifier
print("\nTraining with Support Vector Machine")
svm = recordlinkage.SVMClassifier()

# Train the classifier
svm.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = svm.predict(test)
print("Support Vector Machine: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Naive Bayes Classifier
# Initialize the classifier
print("\nTraining with Naive Bayes Classifier")
naiveBayes = recordlinkage.NaiveBayesClassifier()

# Train the classifier
naiveBayes.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = naiveBayes.predict(test)
print("Naive Bayes: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


Training with Logistic Regression


  y.loc[match_index & comparison_vectors.index] = 1


Logistic Regression: 4678 matches
Confusion Matrix:
 [[  3871   3140]
 [   807 243185]]
Precision: 0.8274903805044891
Recall: 0.5521323634288975
Accuracy: 0.9842750883455577
F-Measure: 0.6623321071092481

Training with Support Vector Machine


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)
  y.loc[match_index & comparison_vectors.index] = 1


Support Vector Machine: 5114 matches
Confusion Matrix:
 [[  4210   2801]
 [   904 243088]]
Precision: 0.8232303480641376
Recall: 0.6004849522179432
Accuracy: 0.9852392202483635
F-Measure: 0.6944329896907216

Training with Naive Bayes Classifier


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)
  y.loc[match_index & comparison_vectors.index] = 1


Naive Bayes: 5114 matches
Confusion Matrix:
 [[  4210   2801]
 [   904 243088]]
Precision: 0.8232303480641376
Recall: 0.6004849522179432
Accuracy: 0.9852392202483635
F-Measure: 0.6944329896907216


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [43]:
matches = predictions.to_frame(index=False).reset_index()
# Ensure the matches DataFrame is properly structured
matches = matches[['wikiID', 'harvardIndex']]

# Convert the matches DataFrame to a MultiIndex
matches['harvardIndex'] = matches['harvardIndex'].astype(int)
matches_index = pd.MultiIndex.from_frame(matches)
print(matches_index)

# Find the intersection
intersection = matches_index.intersection(true_matches)
# print(true_matches)
print(intersection)

# Find the non-intersection pairs
non_intersection = matches_index.difference(true_matches)
print(non_intersection)

# Create a DataFrame from the intersection
# intersection_df = pd.DataFrame(list(intersection), columns=['wikiID', 'harvardIndex'])

# Display the intersection DataFrame
# print(intersection_df)

MultiIndex([('Q21607397', 76979),
            ( 'Q2738600',  1143),
            ('Q21607469', 82514),
            ('Q21607087', 79206),
            ( 'Q4278489', 14625),
            ('Q21520095', 23364),
            ('Q21340735',  7852),
            ('Q21522632', 38471),
            ('Q21505412', 70911),
            ( 'Q4814932', 67037),
            ...
            ('Q21512488', 74732),
            ('Q21519722', 70659),
            ('Q21608672', 16236),
            ('Q21509717', 63975),
            ('Q21522788', 79656),
            ( 'Q5408677', 70352),
            ('Q21338085', 41688),
            ('Q17279919', 44843),
            ('Q21515836', 42125),
            ('Q21609897', 48371)],
           names=['wikiID', 'harvardIndex'], length=4501)
MultiIndex([('Q21607397', 76979),
            ('Q21607469', 82514),
            ('Q21607087', 79206),
            ( 'Q4278489', 14625),
            ('Q21520095', 23364),
            ('Q21340735',  7852),
            ('Q21522632', 38471),
       

In [226]:
# Create a DataFrame from the non-intersection pairs
non_intersection_df = pd.DataFrame(list(non_intersection), columns=['wikiID', 'harvardIndex'])
# Convert matches columns to string to ensure consistency
non_intersection_df['wikiID'] = non_intersection_df['wikiID'].astype(str)
non_intersection_df['harvardIndex'] = non_intersection_df['harvardIndex'].astype(str)

In [227]:
notfound_matched_dfa = dfa.loc[non_intersection_df['wikiID']].reset_index()
notfound_matched_dfb = dfb.loc[non_intersection_df['harvardIndex']].reset_index()
# Combine the matched DataFrames side by side
combined_notfound_matches = pd.concat([notfound_matched_dfa, notfound_matched_dfb], axis=1)

# Optionally add a label to identify matched rows
combined_notfound_matches['matched'] = True

In [228]:
combined_notfound_matches

Unnamed: 0,wikiID,label,dateOfBirth,dateOfDeath,countryOfCitizenshipISO,harvardIndex,bionomia,authorAbbrv,aliases,firstName,...,birthYear,deathYear,birthYearIsApprox,geographyISO,firstName.1,middleName,lastName,B & P Author Abbrev.,Name,matched
0,Q100146795,Elizabeth Harrison,1792.0,1834.0,GB,,Q100146795,,"Mrs Arnold Harrison, Mrs A. H.",Elizabeth,...,,,,,Elizabeth,,Harris,E. Harris,"Harris, Elizabeth, Elizabeth Harris",True
1,Q100587966,A. W. Anderson,,,,14467,,,,A.,...,,,,DK,A.,Edm.,Andersen,,"Andersen, A. Edm.",True
2,Q100587966,A. W. Anderson,,,,14467,,,,A.,...,,,,US,A.,L.,Anderson,,"Anderson, A. L.",True
3,Q100709237,Hein Hidde Zeijlstra,1881.0,1961.0,,,,,,Hein,...,1881.0,,False,ID,Hein,Hidde,Zeijlstra,,"Zeijlstra, Hein Hidde",True
4,Q100869469,Ivar Holmgren,1889.0,1975.0,SE,,,,,Ivar,...,1889.0,,False,EC,Ivar,Albert,Holmgren,,"Holmgren, Ivar Albert",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10284,Q99616,Carl Heinrich 'Schultzenstein' Schultz,1798.0,1871.0,DE,60868.0,,Schultz Sch.,Schultz Sch.,Carl,...,1765.0,1837.0,False,DE,Carl,Friedrich,Schultz,Schultz,"Schultz, Carl Friedrich, Carl Friedrich Schultz",True
10285,Q99662370,Henry Tuke Mennell,1835.0,1923.0,GB,,Q99662370,,H. T. Mennell,Henry,...,1835.0,1923.0,False,,Henry,Tuke,Mennell,,"Mennell, Henry Tuke",True
10286,Q99686,Kurt Harz,1915.0,1996.0,DE,,Q99686,,,Kurt,...,1858.0,1939.0,False,DE,Kurt,,Harz,K.Harz,"Harz, Kurt, Kurt Harz",True
10287,Q99736256,Rudolf Wagner,1872.0,1938.0,DE,1427.0,,R.Wagner,,Rudolf,...,1842.0,1913.0,False,DE,Rudolf,Eduard,Wagner,R. E. Wagner,"Wagner, Rudolf Eduard, Rudolf Eduard Wagner",True


In [189]:
dfa = a.copy()
dfb = b.copy()

# Set indices
dfa.set_index('wikiID', inplace=True)
dfb.set_index('harvardIndex', inplace=True)

# Convert matches columns to string to ensure consistency
matches['wikiID'] = matches['wikiID'].astype(str)
matches['harvardIndex'] = matches['harvardIndex'].astype(str)
dfa.index = dfa.index.astype(str)
dfb.index = dfb.index.astype(str)

# Ensure the matches DataFrame is properly structured
matches = matches[['wikiID', 'harvardIndex']]

# Merge the matches with the original data
matched_dfa = dfa.loc[matches['wikiID']].reset_index()
matched_dfb = dfb.loc[matches['harvardIndex']].reset_index()

# Combine the matched DataFrames side by side
combined_matches = pd.concat([matched_dfa, matched_dfb], axis=1)

# Optionally add a label to identify matched rows
combined_matches['matched'] = True

In [190]:
combined_matches

Unnamed: 0,wikiID,label,dateOfBirth,dateOfDeath,countryOfCitizenshipISO,harvardIndex,bionomia,authorAbbrv,aliases,firstName,...,birthYear,deathYear,birthYearIsApprox,geographyISO,firstName.1,middleName,lastName,B & P Author Abbrev.,Name,matched
0,Q8012324,William Higgins Coleman,1812.0,1863.0,,481,,Coleman,Coleman,William,...,1811.0,1899.0,False,NZ,William,,Colenso,,"Colenso, Rev. William",True
1,Q8012324,William Higgins Coleman,1812.0,1863.0,,481,,Coleman,Coleman,William,...,1811.0,1899.0,False,NZ,William,,Colenso,Colenso,"Colenso, (John) William, William Colenso",True
2,Q100887787,A.J. Ultee,1878.0,1964.0,NL,19901,,,Arnoldus Johannes Ultee,A.J.,...,1878.0,1963.0,False,ID,Arnoldus,Johannes,Ultée,,"Ultée, Arnoldus Johannes",True
3,Q117455407,Rosa Lydia Otto,1909.0,,,50431,,,"Rosa Lydia Otto-Surbeck, Mrs L. Otto-Surbeck",Rosa,...,1909.0,,False,ID,Rosa,Lydia,Otto-Surbeck,,"Otto-Surbeck, Rosa Lydia, Otto-Surbeck, R. L.",True
4,Q117600078,Martin Jones,1897.0,1979.0,,,,,Martin G. Jones,Martin,...,1897.0,1944.0,False,ID,Willem,de,Jong,,"Jong, Willem de",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31047,Q97495995,Józef Liboszyc,1783.0,1832.0,,,,,,Józef,...,1783.0,1824.0,False,RU,Joseph,,Liboschitz,Libosch.,"Liboschitz, Joseph, Joseph Liboschitz",True
31048,Q98266,Willy Cullmann,1905.0,1992.0,DE,,,Cullmann,"W. Cullmann, Wilhelm Cullmann",Willy,...,1905.0,1992.0,False,US,Willy,,Cullman,Cullman,"Cullman, Willy, Willy Cullmann",True
31049,Q57763,Ludwig Leichhardt,1813.0,1848.0,AU,70138.0,Q57763,Leichh.,"Leichh., Friedrich Wilhelm Ludwig Leichhardt, ...",Ludwig,...,1813.0,1848.0,False,AU,Ludwig,,Leichardt,,"Leichardt, (Friedrich Wilhelm) Ludwig",True
31050,Q59570,Theodor Koch-Grunberg,1872.0,1924.0,DE,2718.0,,,Theodor Koch-Grünberg,Theodor,...,1872.0,1924.0,False,"CO, VE, BR",Christian,Theodor,Koch,,"Koch, Christian Theodor, Koch, Christian Theod...",True


In [229]:
W_match_H = pd.concat([combined_matches, combined_notfound_matches], axis=1)

In [230]:
W_match_H

Unnamed: 0,wikiID,label,dateOfBirth,dateOfDeath,countryOfCitizenshipISO,harvardIndex,bionomia,authorAbbrv,aliases,firstName,...,birthYear,deathYear,birthYearIsApprox,geographyISO,firstName.1,middleName,lastName,B & P Author Abbrev.,Name,matched
0,Q8012324,William Higgins Coleman,1812.0,1863.0,,481,,Coleman,Coleman,William,...,,,,,Elizabeth,,Harris,E. Harris,"Harris, Elizabeth, Elizabeth Harris",True
1,Q8012324,William Higgins Coleman,1812.0,1863.0,,481,,Coleman,Coleman,William,...,,,,DK,A.,Edm.,Andersen,,"Andersen, A. Edm.",True
2,Q100887787,A.J. Ultee,1878.0,1964.0,NL,19901,,,Arnoldus Johannes Ultee,A.J.,...,,,,US,A.,L.,Anderson,,"Anderson, A. L.",True
3,Q117455407,Rosa Lydia Otto,1909.0,,,50431,,,"Rosa Lydia Otto-Surbeck, Mrs L. Otto-Surbeck",Rosa,...,1881.0,,False,ID,Hein,Hidde,Zeijlstra,,"Zeijlstra, Hein Hidde",True
4,Q117600078,Martin Jones,1897.0,1979.0,,,,,Martin G. Jones,Martin,...,1889.0,,False,EC,Ivar,Albert,Holmgren,,"Holmgren, Ivar Albert",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31047,Q97495995,Józef Liboszyc,1783.0,1832.0,,,,,,Józef,...,,,,,,,,,,
31048,Q98266,Willy Cullmann,1905.0,1992.0,DE,,,Cullmann,"W. Cullmann, Wilhelm Cullmann",Willy,...,,,,,,,,,,
31049,Q57763,Ludwig Leichhardt,1813.0,1848.0,AU,70138.0,Q57763,Leichh.,"Leichh., Friedrich Wilhelm Ludwig Leichhardt, ...",Ludwig,...,,,,,,,,,,
31050,Q59570,Theodor Koch-Grunberg,1872.0,1924.0,DE,2718.0,,,Theodor Koch-Grünberg,Theodor,...,,,,,,,,,,


In [151]:
# If needed, add unmatched rows with a 'matched' flag as False
unmatched_dfa = dfa[~dfa.index.isin(matches['harvardIndex'])].reset_index()
unmatched_dfa['matched'] = False

unmatched_dfb = dfb[~dfb.index.isin(matches['harvardIndex'])].reset_index()
unmatched_dfb['matched'] = False

In [231]:
b

Unnamed: 0_level_0,Standard/Label Name,birthYear,deathYear,birthYearIsApprox,geographyISO,firstName,middleName,lastName,B & P Author Abbrev.,Name,harvardIndex
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
89438,Shin. Sato,,,,DE,Shinya,,Sato,Shin. Sato,"Sato, Shinya, Shinya Sato",89438
64680,E. J. Hoffman,,,,,Emily,J.,Hoffman,E. J. Hoffman,"Hoffman, Emily J., Emily J. Hoffman",64680
34653,D. W. Stevenson,1942.0,,False,US,Dennis,William,Stevenson,D. W. Stev.,"Stevenson, Dennis William, Dennis William Stev...",34653
42819,D. Müller-Doblies,1938.0,,False,DE,Dietrich,,Müller-Doblies,D. Müll.-Doblies,"Müller-Doblies, Dietrich, Dietrich Müller-Do...",42819
82862,Boutroux,,,,,A.,,Boutroux,Boutroux,"Boutroux, A., A. Boutroux",82862
...,...,...,...,...,...,...,...,...,...,...,...
72000,Boeuf,,,,"FR, TN",F.,,Boeuf,Boeuf,"Boeuf, F., F. Boeuf",72000
83026,Perold,1928.0,,False,ZA,Sarie,Magdalena,Perold,Perold,"Perold, Sarie Magdalena, Sarie Magdalena Perold",83026
7450,Colin C. Stewart,1873.0,1944.0,False,US,Colin,C.,Stewart,,"Stewart, Colin C., C. C. Stewart bis",7450
702,J. É. Doassans,1852.0,1908.0,False,FR,Jacques,Emile,Doassans,Doass.,"Doassans, Jacques Emile, Jaques Emile Doassans",702


In [None]:
# Find out the True Matches of dataframe A and dataframe B
a['harvardIndex'] = pd.to_numeric(a['harvardIndex'],errors='coerce') 
temp = pd.merge(a, b, how='inner', on=None, left_on='harvardIndex', right_on='harvardIndex',
                  left_index=False, right_index=False, sort=False,
                  suffixes=('_wiki', '_harvard'), copy=False, indicator=False)

# print(temp.columns.to_list())
# temp.head()

true_matches = define_true_pairs(temp['wikiID'],temp['harvardIndex'].astype(int),'wikiIndex','harvardIndex')
# print(true_matches)
print('There is '+ str(len(true_matches)) +' HarvardIndex records in Wikidata that can find a match, which is ' + str(len(true_matches)/len(a)*100) +'%')
print('There is '+ str(len(true_matches)) +' HarvardIndex records can be found in Wikidata, which is ' + str(len(true_matches)/len(b)*100) +'%')