In [2]:
import argparse
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import itertools
import os
import tqdm
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.metrics import roc_auc_score
import gc
from thefuzz import fuzz
from thefuzz import process
import time
import re
import recordlinkage
from sklearn.model_selection import train_test_split
from fastparquet import ParquetFile

gc.collect()

0

In [3]:
'''
# this function will remove all special charaters -- including spaces
# but too slow comparing to replace -- used process_time() for evaluation
def clean_text(text): # fa
    a = ""
    text = a.join(char for char in text if char.isalnum())
    return text
'''
# Remove square blankets auto generated during data alignment process
def clean_text(text): # fb
    text = text.replace('[', '').replace(']','').replace("'", '')
    return text

def remove_spec_in_col(df, col):
    newCol = []
    for index, rowValue in df[col].iteritems():
        if pd.notnull(rowValue):
            newCol.append(clean_text(rowValue))
        else:
            newCol.append(np.nan)
    return newCol

In [4]:
'''
use the pd.merge above to prepare training data -- find out the index true pairs
training = 0.75, testing = 0.25, no validat set

experiment setting
    - supervised: Logistics regression, navie bayes, SVM
    - undupervised: k-mean, ECM

refs
https://recordlinkage.readthedocs.io/en/latest/ref-classifiers.html
https://recordlinkage.readthedocs.io/en/latest/guides/classifiers.html#
'''

def define_true_pairs(indexList1, indexList2, indexName1, indexName2):
    arrays = [indexList1, indexList2]
    tuples = list(zip(*arrays))
    index = pd.MultiIndex.from_tuples(tuples, names=[indexName1, indexName2])
    return index

In [5]:
def add_boolean_column(df, list_column_name, boolean_column_name):
    df[boolean_column_name] = df[list_column_name].apply(lambda x: False if x == "[]" else bool(x))
    return df

In [6]:
a = pd.read_csv('~/Downloads/a.csv',chunksize=10000,encoding='utf-8',on_bad_lines='skip',engine='python')
a = pd.concat(a)
a['wikiID'] = a['id']
a = a.set_index('id')
print(a.head())

b = pd.read_csv('~/Downloads/b.csv',chunksize=10000,encoding='utf-8',on_bad_lines='skip',engine='python')
b = pd.concat(b)
b['harvardIndex'] = b['id']
b = b.set_index('id')
print(b.head())

                              label  dateOfBirth  dateOfDeath  \
id                                                              
Q100142069             Frida Eggens          NaN          NaN   
Q100146795       Elizabeth Harrison       1792.0       1834.0   
Q100149196              Russell Cox          NaN          NaN   
Q100152296  Alda Pereira da Fonseca       1882.0          NaN   
Q100156193  Laurence Henry Millener       1914.0       2000.0   

           countryOfCitizenshipISO harvardIndex             bionomia  \
id                                                                     
Q100142069                      SE          NaN                  NaN   
Q100146795                      GB          NaN           Q100146795   
Q100149196                     NaN          NaN  0000-0001-5149-1709   
Q100152296                      BR          NaN                  NaN   
Q100156193                      NZ          NaN           Q100156193   

           authorAbbrv                 

In [None]:
# Find out the True Matches of dataframe A and dataframe B
a['harvardIndex'] = pd.to_numeric(a['harvardIndex'],errors='coerce') 
temp = pd.merge(a, b, how='inner', on=None, left_on='harvardIndex', right_on='harvardIndex',
                  left_index=False, right_index=False, sort=False,
                  suffixes=('_wiki', '_harvard'), copy=False, indicator=False)

print(temp.columns.to_list())
temp.head()

In [5]:
true_matches = define_true_pairs(temp['wikiID'],temp['harvardIndex'].astype(int),'wikiIndex','harvardIndex')
# print(true_matches)
print('There is '+ str(len(true_matches)) +' HarvardIndex records in Wikidata that can find a match, which is ' + str(len(true_matches)/len(a)*100) +'%')
print('There is '+ str(len(true_matches)) +' HarvardIndex records can be found in Wikidata, which is ' + str(len(true_matches)/len(b)*100) +'%')

There is 31022 HarvardIndex records in Wikidata that can find a match, which is 43.09329332666555%
There is 31022 HarvardIndex records can be found in Wikidata, which is 40.55375444467685%


In [6]:
# Find non-matched pairs in dataframe a
non_matched_a = a[~a['harvardIndex'].isin(temp['harvardIndex']) & a['harvardIndex'].notna()]
# Find non-matched pairs in dataframe b
non_matched_b = b[~b['harvardIndex'].isin(temp['harvardIndex']) & b['harvardIndex'].notnull()]
print('Non-matched data where HarvardIndex is not null or NaN:')
print('There is '+ str(len(non_matched_a)) +' HarvardIndex records in Wikidata which cannot be found in HarvardIndex, which is ' + str(len(non_matched_a)/len(a)*100) +'%')
print('There is '+ str(len(non_matched_b)) +' HarvardIndex records cannot find a match, which is ' + str(len(non_matched_b)/len(b)*100) +'%')

Non-matched data where HarvardIndex is not null or NaN:
There is 2389 HarvardIndex records in Wikidata which cannot be found in HarvardIndex, which is 3.3186086569983884%
There is 45486 HarvardIndex records cannot find a match, which is 59.4619326500732%


In [7]:
dfa = a.copy()
dfb = b.copy()
indexer = recordlinkage.Index()
indexer.sortedneighbourhood('lastName')# indexer.sortedneighbourhood indexer.full() indexer.block
candidate_links = indexer.index(dfa, dfb)

In [20]:
compare_cl = recordlinkage.Compare()
# method options: ‘jaro’, ’jarowinkler’, ‘levenshtein’, ‘damerau_levenshtein’, ‘qgram’ or ‘cosine’.
compare_cl.string('firstName', 'firstName', method='damerau_levenshtein', threshold=0.85, label='firstName')
compare_cl.exact('dateOfBirth', 'birthYear', label='dateOfBirth')
# compare_cl.exact('dateOfDeath', 'deathYear', label='dateOfDeath')
# compare_cl.string('authorAbbrv', 'B & P Author Abbrev.', method='damerau_levenshtein', threshold=0.85, label='authorAbbrv')
# compare_cl.string('countryOfCitizenshipISO', 'geographyISO', method='damerau_levenshtein', threshold=0.85, label='geographyISO')
# compare_cl.add(CompareAliases('label', 'Name', threshold=0.85, label='sim'))

features = compare_cl.compute(candidate_links, dfa, dfb)
features.sum(axis=1).value_counts().sort_index(ascending=False)

# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42) # random_state=42

# Get the true pairs for the test set (Used for Evaluation)
test_matches_index = test.index.intersection(true_matches)

In [22]:
# Threshold-based methods

print("Training with threshold-based methods")
# Make Predictions based on threshold > 50%
predictions = features[features.sum(axis=1) > 1.5]
print("Threshold-Based: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(true_matches, predictions, len(features))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

# Return all the true matched feature as test set
# test_matches_index = features.index.intersection(true_matches)
# print(test_matches_index)


# Expectation/Conditional Maxisation Classifier

print("\nTraining with Expectation/Conditional Maxisation Classifier")
# Initialize the classifier
ecm = recordlinkage.ECMClassifier()
# Train the Model
train_ecm = ecm.fit_predict(train)
# Make Predictions on a test set
predictions = ecm.predict(test)
print("Expectation/Conditional Maxisation: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# K-means Classifier
# Initialize the classifier
print("\nTraining with K-means Classifier")
kmeans = recordlinkage.KMeansClassifier()

# Train the Model
train_kmeans = kmeans.fit_predict(train)
# Make Predictions on a test set
predictions = kmeans.predict(test)
print("K-means: {} matches".format(len(predictions)))
# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42)
# Get the true pairs for each set
train_matches_index = train.index.intersection(true_matches)
test_matches_index = test.index.intersection(true_matches)

# Logistic Regression
# Initialize the classifier
print("\nTraining with Logistic Regression")
logisticRegression = recordlinkage.LogisticRegressionClassifier()

# Train the classifier
logisticRegression.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = logisticRegression.predict(test)
print("Logistic Regression: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Support Vector Machine Classifier
# Initialize the classifier
print("\nTraining with Support Vector Machine")
svm = recordlinkage.SVMClassifier()

# Train the classifier
svm.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = svm.predict(test)
print("Support Vector Machine: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Naive Bayes Classifier
# Initialize the classifier
print("\nTraining with Naive Bayes Classifier")
naiveBayes = recordlinkage.NaiveBayesClassifier()

# Train the classifier
naiveBayes.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = naiveBayes.predict(test)
print("Naive Bayes: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

Training with threshold-based methods
Threshold-Based: 17533 matches


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


Confusion Matrix:
 [[ 14538  16484]
 [  2995 969995]]
Precision: 0.8291792619631552
Recall: 0.4686351621429953
Accuracy: 0.980598837464094
F-Measure: 0.5988260735248687

Training with Expectation/Conditional Maxisation Classifier
Expectation/Conditional Maxisation: 4501 matches


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


Confusion Matrix:
 [[  3732   3279]
 [   769 243223]]
Precision: 0.8291490779826706
Recall: 0.5323063756953359
Accuracy: 0.9838727027167006
F-Measure: 0.6483669214732453

Training with K-means Classifier
K-means: 4501 matches
Confusion Matrix:
 [[  3732   3279]
 [   769 243223]]
Precision: 0.8291490779826706
Recall: 0.5323063756953359
Accuracy: 0.9838727027167006
F-Measure: 0.6483669214732453


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)



Training with Logistic Regression


  y.loc[match_index & comparison_vectors.index] = 1


Logistic Regression: 4501 matches
Confusion Matrix:
 [[  3732   3279]
 [   769 243223]]
Precision: 0.8291490779826706
Recall: 0.5323063756953359
Accuracy: 0.9838727027167006
F-Measure: 0.6483669214732453

Training with Support Vector Machine


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)
  y.loc[match_index & comparison_vectors.index] = 1


Support Vector Machine: 4501 matches
Confusion Matrix:
 [[  3732   3279]
 [   769 243223]]
Precision: 0.8291490779826706
Recall: 0.5323063756953359
Accuracy: 0.9838727027167006
F-Measure: 0.6483669214732453

Training with Naive Bayes Classifier


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)
  y.loc[match_index & comparison_vectors.index] = 1


Naive Bayes: 4501 matches
Confusion Matrix:
 [[  3732   3279]
 [   769 243223]]
Precision: 0.8291490779826706
Recall: 0.5323063756953359
Accuracy: 0.9838727027167006
F-Measure: 0.6483669214732453


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [23]:
compare_cl = recordlinkage.Compare()
# method options: ‘jaro’, ’jarowinkler’, ‘levenshtein’, ‘damerau_levenshtein’, ‘qgram’ or ‘cosine’.
compare_cl.string('firstName', 'firstName', method='damerau_levenshtein', threshold=0.85, label='firstName')
# compare_cl.exact('dateOfBirth', 'birthYear', label='dateOfBirth')
# compare_cl.exact('dateOfDeath', 'deathYear', label='dateOfDeath')
compare_cl.string('authorAbbrv', 'B & P Author Abbrev.', method='damerau_levenshtein', threshold=0.85, label='authorAbbrv')
# compare_cl.string('countryOfCitizenshipISO', 'geographyISO', method='damerau_levenshtein', threshold=0.85, label='geographyISO')
# compare_cl.add(CompareAliases('label', 'Name', threshold=0.85, label='sim'))

features = compare_cl.compute(candidate_links, dfa, dfb)
features.sum(axis=1).value_counts().sort_index(ascending=False)

# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42) # random_state=42

# Get the true pairs for the test set (Used for Evaluation)
test_matches_index = test.index.intersection(true_matches)

In [24]:
# Threshold-based methods

print("Training with threshold-based methods")
# Make Predictions based on threshold > 50%
predictions = features[features.sum(axis=1) > 1.5]
print("Threshold-Based: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(true_matches, predictions, len(features))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

# Return all the true matched feature as test set
# test_matches_index = features.index.intersection(true_matches)
# print(test_matches_index)


# Expectation/Conditional Maxisation Classifier

print("\nTraining with Expectation/Conditional Maxisation Classifier")
# Initialize the classifier
ecm = recordlinkage.ECMClassifier()
# Train the Model
train_ecm = ecm.fit_predict(train)
# Make Predictions on a test set
predictions = ecm.predict(test)
print("Expectation/Conditional Maxisation: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# K-means Classifier
# Initialize the classifier
print("\nTraining with K-means Classifier")
kmeans = recordlinkage.KMeansClassifier()

# Train the Model
train_kmeans = kmeans.fit_predict(train)
# Make Predictions on a test set
predictions = kmeans.predict(test)
print("K-means: {} matches".format(len(predictions)))
# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42)
# Get the true pairs for each set
train_matches_index = train.index.intersection(true_matches)
test_matches_index = test.index.intersection(true_matches)

# Logistic Regression
# Initialize the classifier
print("\nTraining with Logistic Regression")
logisticRegression = recordlinkage.LogisticRegressionClassifier()

# Train the classifier
logisticRegression.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = logisticRegression.predict(test)
print("Logistic Regression: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Support Vector Machine Classifier
# Initialize the classifier
print("\nTraining with Support Vector Machine")
svm = recordlinkage.SVMClassifier()

# Train the classifier
svm.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = svm.predict(test)
print("Support Vector Machine: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Naive Bayes Classifier
# Initialize the classifier
print("\nTraining with Naive Bayes Classifier")
naiveBayes = recordlinkage.NaiveBayesClassifier()

# Train the classifier
naiveBayes.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = naiveBayes.predict(test)
print("Naive Bayes: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

Training with threshold-based methods
Threshold-Based: 18540 matches


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


Confusion Matrix:
 [[ 16019  15003]
 [  2521 970469]]
Precision: 0.8640237324703344
Recall: 0.516375475469022
Accuracy: 0.9825460253463106
F-Measure: 0.6464226625237076

Training with Expectation/Conditional Maxisation Classifier
Expectation/Conditional Maxisation: 4722 matches
Confusion Matrix:
 [[  4101   2910]
 [   621 243371]]
Precision: 0.8684879288437103
Recall: 0.5849379546427044
Accuracy: 0.9859324390545133
F-Measure: 0.699053950396318

Training with K-means Classifier


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


K-means: 4722 matches
Confusion Matrix:
 [[  4101   2910]
 [   621 243371]]
Precision: 0.8684879288437103
Recall: 0.5849379546427044
Accuracy: 0.9859324390545133
F-Measure: 0.699053950396318


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)



Training with Logistic Regression


  y.loc[match_index & comparison_vectors.index] = 1


Logistic Regression: 4722 matches
Confusion Matrix:
 [[  4101   2910]
 [   621 243371]]
Precision: 0.8684879288437103
Recall: 0.5849379546427044
Accuracy: 0.9859324390545133
F-Measure: 0.699053950396318

Training with Support Vector Machine


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)
  y.loc[match_index & comparison_vectors.index] = 1


Support Vector Machine: 6632 matches
Confusion Matrix:
 [[  5158   1853]
 [  1474 242518]]
Precision: 0.7777442702050663
Recall: 0.7357010412209385
Accuracy: 0.9867451783444819
F-Measure: 0.7561386791761342

Training with Naive Bayes Classifier


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)
  y.loc[match_index & comparison_vectors.index] = 1


Naive Bayes: 4722 matches
Confusion Matrix:
 [[  4101   2910]
 [   621 243371]]
Precision: 0.8684879288437103
Recall: 0.5849379546427044
Accuracy: 0.9859324390545133
F-Measure: 0.699053950396318


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [25]:
compare_cl = recordlinkage.Compare()
# method options: ‘jaro’, ’jarowinkler’, ‘levenshtein’, ‘damerau_levenshtein’, ‘qgram’ or ‘cosine’.
# compare_cl.string('firstName', 'firstName', method='damerau_levenshtein', threshold=0.85, label='firstName')
compare_cl.exact('dateOfBirth', 'birthYear', label='dateOfBirth')
# compare_cl.exact('dateOfDeath', 'deathYear', label='dateOfDeath')
compare_cl.string('authorAbbrv', 'B & P Author Abbrev.', method='damerau_levenshtein', threshold=0.85, label='authorAbbrv')
# compare_cl.string('countryOfCitizenshipISO', 'geographyISO', method='damerau_levenshtein', threshold=0.85, label='geographyISO')
# compare_cl.add(CompareAliases('label', 'Name', threshold=0.85, label='sim'))

features = compare_cl.compute(candidate_links, dfa, dfb)
features.sum(axis=1).value_counts().sort_index(ascending=False)

# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42) # random_state=42

# Get the true pairs for the test set (Used for Evaluation)
test_matches_index = test.index.intersection(true_matches)

In [26]:
# Threshold-based methods

print("Training with threshold-based methods")
# Make Predictions based on threshold > 50%
predictions = features[features.sum(axis=1) > 1.5]
print("Threshold-Based: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(true_matches, predictions, len(features))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

# Return all the true matched feature as test set
# test_matches_index = features.index.intersection(true_matches)
# print(test_matches_index)


# Expectation/Conditional Maxisation Classifier

print("\nTraining with Expectation/Conditional Maxisation Classifier")
# Initialize the classifier
ecm = recordlinkage.ECMClassifier()
# Train the Model
train_ecm = ecm.fit_predict(train)
# Make Predictions on a test set
predictions = ecm.predict(test)
print("Expectation/Conditional Maxisation: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# K-means Classifier
# Initialize the classifier
print("\nTraining with K-means Classifier")
kmeans = recordlinkage.KMeansClassifier()

# Train the Model
train_kmeans = kmeans.fit_predict(train)
# Make Predictions on a test set
predictions = kmeans.predict(test)
print("K-means: {} matches".format(len(predictions)))
# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42)
# Get the true pairs for each set
train_matches_index = train.index.intersection(true_matches)
test_matches_index = test.index.intersection(true_matches)

# Logistic Regression
# Initialize the classifier
print("\nTraining with Logistic Regression")
logisticRegression = recordlinkage.LogisticRegressionClassifier()

# Train the classifier
logisticRegression.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = logisticRegression.predict(test)
print("Logistic Regression: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Support Vector Machine Classifier
# Initialize the classifier
print("\nTraining with Support Vector Machine")
svm = recordlinkage.SVMClassifier()

# Train the classifier
svm.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = svm.predict(test)
print("Support Vector Machine: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Naive Bayes Classifier
# Initialize the classifier
print("\nTraining with Naive Bayes Classifier")
naiveBayes = recordlinkage.NaiveBayesClassifier()

# Train the classifier
naiveBayes.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = naiveBayes.predict(test)
print("Naive Bayes: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

Training with threshold-based methods
Threshold-Based: 12752 matches


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


Confusion Matrix:
 [[ 11572  19450]
 [  1180 971810]]
Precision: 0.907465495608532
Recall: 0.3730255947392173
Accuracy: 0.9794524368234643
F-Measure: 0.5287156759720381

Training with Expectation/Conditional Maxisation Classifier
Expectation/Conditional Maxisation: 3297 matches
Confusion Matrix:
 [[  2994   4017]
 [   303 243689]]
Precision: 0.908098271155596
Recall: 0.42704321780059906
Accuracy: 0.9827890503300757
F-Measure: 0.5809080325960418

Training with K-means Classifier


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


K-means: 3297 matches
Confusion Matrix:
 [[  2994   4017]
 [   303 243689]]
Precision: 0.908098271155596
Recall: 0.42704321780059906
Accuracy: 0.9827890503300757
F-Measure: 0.5809080325960418


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)



Training with Logistic Regression


  y.loc[match_index & comparison_vectors.index] = 1


Logistic Regression: 6632 matches
Confusion Matrix:
 [[  5158   1853]
 [  1474 242518]]
Precision: 0.7777442702050663
Recall: 0.7357010412209385
Accuracy: 0.9867451783444819
F-Measure: 0.7561386791761342

Training with Support Vector Machine


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)
  y.loc[match_index & comparison_vectors.index] = 1


Support Vector Machine: 6632 matches
Confusion Matrix:
 [[  5158   1853]
 [  1474 242518]]
Precision: 0.7777442702050663
Recall: 0.7357010412209385
Accuracy: 0.9867451783444819
F-Measure: 0.7561386791761342

Training with Naive Bayes Classifier


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)
  y.loc[match_index & comparison_vectors.index] = 1


Naive Bayes: 6632 matches
Confusion Matrix:
 [[  5158   1853]
 [  1474 242518]]
Precision: 0.7777442702050663
Recall: 0.7357010412209385
Accuracy: 0.9867451783444819
F-Measure: 0.7561386791761342


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [27]:
compare_cl = recordlinkage.Compare()
# method options: ‘jaro’, ’jarowinkler’, ‘levenshtein’, ‘damerau_levenshtein’, ‘qgram’ or ‘cosine’.
# compare_cl.string('firstName', 'firstName', method='damerau_levenshtein', threshold=0.85, label='firstName')
# compare_cl.exact('dateOfBirth', 'birthYear', label='dateOfBirth')
# compare_cl.exact('dateOfDeath', 'deathYear', label='dateOfDeath')
compare_cl.string('authorAbbrv', 'B & P Author Abbrev.', method='damerau_levenshtein', threshold=0.85, label='authorAbbrv')
compare_cl.string('countryOfCitizenshipISO', 'geographyISO', method='damerau_levenshtein', threshold=0.85, label='geographyISO')
# compare_cl.add(CompareAliases('label', 'Name', threshold=0.85, label='sim'))

features = compare_cl.compute(candidate_links, dfa, dfb)
features.sum(axis=1).value_counts().sort_index(ascending=False)

# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42) # random_state=42

# Get the true pairs for the test set (Used for Evaluation)
test_matches_index = test.index.intersection(true_matches)

In [28]:
# Threshold-based methods

print("Training with threshold-based methods")
# Make Predictions based on threshold > 50%
predictions = features[features.sum(axis=1) > 1.5]
print("Threshold-Based: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(true_matches, predictions, len(features))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

# Return all the true matched feature as test set
# test_matches_index = features.index.intersection(true_matches)
# print(test_matches_index)


# Expectation/Conditional Maxisation Classifier

print("\nTraining with Expectation/Conditional Maxisation Classifier")
# Initialize the classifier
ecm = recordlinkage.ECMClassifier()
# Train the Model
train_ecm = ecm.fit_predict(train)
# Make Predictions on a test set
predictions = ecm.predict(test)
print("Expectation/Conditional Maxisation: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# K-means Classifier
# Initialize the classifier
print("\nTraining with K-means Classifier")
kmeans = recordlinkage.KMeansClassifier()

# Train the Model
train_kmeans = kmeans.fit_predict(train)
# Make Predictions on a test set
predictions = kmeans.predict(test)
print("K-means: {} matches".format(len(predictions)))
# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42)
# Get the true pairs for each set
train_matches_index = train.index.intersection(true_matches)
test_matches_index = test.index.intersection(true_matches)

# Logistic Regression
# Initialize the classifier
print("\nTraining with Logistic Regression")
logisticRegression = recordlinkage.LogisticRegressionClassifier()

# Train the classifier
logisticRegression.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = logisticRegression.predict(test)
print("Logistic Regression: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Support Vector Machine Classifier
# Initialize the classifier
print("\nTraining with Support Vector Machine")
svm = recordlinkage.SVMClassifier()

# Train the classifier
svm.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = svm.predict(test)
print("Support Vector Machine: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Naive Bayes Classifier
# Initialize the classifier
print("\nTraining with Naive Bayes Classifier")
naiveBayes = recordlinkage.NaiveBayesClassifier()

# Train the classifier
naiveBayes.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = naiveBayes.predict(test)
print("Naive Bayes: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

Training with threshold-based methods
Threshold-Based: 3628 matches


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


Confusion Matrix:
 [[  3112  27910]
 [   516 972474]]
Precision: 0.8577728776185226
Recall: 0.10031590484172523
Accuracy: 0.9716875893913619
F-Measure: 0.17962481962481963

Training with Expectation/Conditional Maxisation Classifier
Expectation/Conditional Maxisation: 950 matches
Confusion Matrix:
 [[   821   6190]
 [   129 243863]]
Precision: 0.8642105263157894
Recall: 0.11710169733276281
Accuracy: 0.9748250020916085
F-Measure: 0.20625549554076122

Training with K-means Classifier


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


K-means: 950 matches
Confusion Matrix:
 [[   821   6190]
 [   129 243863]]
Precision: 0.8642105263157894
Recall: 0.11710169733276281
Accuracy: 0.9748250020916085
F-Measure: 0.20625549554076122


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)



Training with Logistic Regression


  y.loc[match_index & comparison_vectors.index] = 1


Logistic Regression: 6632 matches


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


Confusion Matrix:
 [[  5158   1853]
 [  1474 242518]]
Precision: 0.7777442702050663
Recall: 0.7357010412209385
Accuracy: 0.9867451783444819
F-Measure: 0.7561386791761342

Training with Support Vector Machine


  y.loc[match_index & comparison_vectors.index] = 1


Support Vector Machine: 6632 matches
Confusion Matrix:
 [[  5158   1853]
 [  1474 242518]]
Precision: 0.7777442702050663
Recall: 0.7357010412209385
Accuracy: 0.9867451783444819
F-Measure: 0.7561386791761342

Training with Naive Bayes Classifier


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)
  y.loc[match_index & comparison_vectors.index] = 1


Naive Bayes: 6632 matches
Confusion Matrix:
 [[  5158   1853]
 [  1474 242518]]
Precision: 0.7777442702050663
Recall: 0.7357010412209385
Accuracy: 0.9867451783444819
F-Measure: 0.7561386791761342


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [29]:
compare_cl = recordlinkage.Compare()
# method options: ‘jaro’, ’jarowinkler’, ‘levenshtein’, ‘damerau_levenshtein’, ‘qgram’ or ‘cosine’.
compare_cl.string('firstName', 'firstName', method='damerau_levenshtein', threshold=0.85, label='firstName')
# compare_cl.exact('dateOfBirth', 'birthYear', label='dateOfBirth')
# compare_cl.exact('dateOfDeath', 'deathYear', label='dateOfDeath')
# compare_cl.string('authorAbbrv', 'B & P Author Abbrev.', method='damerau_levenshtein', threshold=0.85, label='authorAbbrv')
compare_cl.string('countryOfCitizenshipISO', 'geographyISO', method='damerau_levenshtein', threshold=0.85, label='geographyISO')
# compare_cl.add(CompareAliases('label', 'Name', threshold=0.85, label='sim'))

features = compare_cl.compute(candidate_links, dfa, dfb)
features.sum(axis=1).value_counts().sort_index(ascending=False)

# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42) # random_state=42

# Get the true pairs for the test set (Used for Evaluation)
test_matches_index = test.index.intersection(true_matches)

In [30]:
# Threshold-based methods

print("Training with threshold-based methods")
# Make Predictions based on threshold > 50%
predictions = features[features.sum(axis=1) > 1.5]
print("Threshold-Based: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(true_matches, predictions, len(features))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

# Return all the true matched feature as test set
# test_matches_index = features.index.intersection(true_matches)
# print(test_matches_index)


# Expectation/Conditional Maxisation Classifier

print("\nTraining with Expectation/Conditional Maxisation Classifier")
# Initialize the classifier
ecm = recordlinkage.ECMClassifier()
# Train the Model
train_ecm = ecm.fit_predict(train)
# Make Predictions on a test set
predictions = ecm.predict(test)
print("Expectation/Conditional Maxisation: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# K-means Classifier
# Initialize the classifier
print("\nTraining with K-means Classifier")
kmeans = recordlinkage.KMeansClassifier()

# Train the Model
train_kmeans = kmeans.fit_predict(train)
# Make Predictions on a test set
predictions = kmeans.predict(test)
print("K-means: {} matches".format(len(predictions)))
# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42)
# Get the true pairs for each set
train_matches_index = train.index.intersection(true_matches)
test_matches_index = test.index.intersection(true_matches)

# Logistic Regression
# Initialize the classifier
print("\nTraining with Logistic Regression")
logisticRegression = recordlinkage.LogisticRegressionClassifier()

# Train the classifier
logisticRegression.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = logisticRegression.predict(test)
print("Logistic Regression: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Support Vector Machine Classifier
# Initialize the classifier
print("\nTraining with Support Vector Machine")
svm = recordlinkage.SVMClassifier()

# Train the classifier
svm.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = svm.predict(test)
print("Support Vector Machine: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Naive Bayes Classifier
# Initialize the classifier
print("\nTraining with Naive Bayes Classifier")
naiveBayes = recordlinkage.NaiveBayesClassifier()

# Train the classifier
naiveBayes.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = naiveBayes.predict(test)
print("Naive Bayes: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

Training with threshold-based methods
Threshold-Based: 5559 matches


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


Confusion Matrix:
 [[  4108  26914]
 [  1451 971539]]
Precision: 0.7389818312646159
Recall: 0.1324221520211463
Accuracy: 0.9717483456373032
F-Measure: 0.22459746863125668

Training with Expectation/Conditional Maxisation Classifier
Expectation/Conditional Maxisation: 1420 matches
Confusion Matrix:
 [[  1062   5949]
 [   358 243634]]
Precision: 0.747887323943662
Recall: 0.1514762516046213
Accuracy: 0.9748728102851361
F-Measure: 0.2519274107460562

Training with K-means Classifier


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


K-means: 1420 matches
Confusion Matrix:
 [[  1062   5949]
 [   358 243634]]
Precision: 0.747887323943662
Recall: 0.1514762516046213
Accuracy: 0.9748728102851361
F-Measure: 0.2519274107460562


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)



Training with Logistic Regression


  y.loc[match_index & comparison_vectors.index] = 1


Logistic Regression: 8603 matches
Confusion Matrix:
 [[  5658   1353]
 [  2945 241047]]
Precision: 0.6576775543415088
Recall: 0.8070175438596491
Accuracy: 0.9828766986848763
F-Measure: 0.7247342128858717

Training with Support Vector Machine


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)
  y.loc[match_index & comparison_vectors.index] = 1


Support Vector Machine: 8603 matches
Confusion Matrix:
 [[  5658   1353]
 [  2945 241047]]
Precision: 0.6576775543415088
Recall: 0.8070175438596491
Accuracy: 0.9828766986848763
F-Measure: 0.7247342128858717

Training with Naive Bayes Classifier


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)
  y.loc[match_index & comparison_vectors.index] = 1


Naive Bayes: 8603 matches
Confusion Matrix:
 [[  5658   1353]
 [  2945 241047]]
Precision: 0.6576775543415088
Recall: 0.8070175438596491
Accuracy: 0.9828766986848763
F-Measure: 0.7247342128858717


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [None]:
# Load feature DataFrames
dfa_feature1 = pd.read_csv('WinH.csv')  
dfb_feature1 = pd.read_csv('HinW.csv')  

# Merge feature DataFrames into main DataFrames
dfa = pd.merge(dfa, dfa_feature1, on='wikiID')
dfb = pd.merge(dfb, dfb_feature1, on='harvardIndex')

# Set index
dfa.set_index('wikiID', inplace=True)
dfb.set_index('harvardIndex', inplace=True)

# Add custom boolean feature
dfa = add_boolean_column(dfa, 'wikiLabel_in_HarvardNameList', 'has_HarvardNameList')
dfb = add_boolean_column(dfb, 'havard_in_WikiNameList', 'has_WikiNameList')

In [None]:
# Initialize the indexer and create candidate links
indexer = recordlinkage.Index()
indexer.sortedneighbourhood('lastName')  # Can change to other methods like indexer.full(), indexer.block()
candidate_links = indexer.index(dfa, dfb)
# Initialize the comparator
compare_cl = recordlinkage.Compare()

# Add existing comparison features
compare_cl.string('firstName', 'firstName', method='damerau_levenshtein', threshold=0.85, label='firstName')
compare_cl.exact('dateOfBirth', 'birthYear', label='dateOfBirth')
# Add boolean comparison feature using compare.exact
compare_cl.exact('has_HarvardNameList', 'has_WikiNameList', label='custom_boolean_feature')

features = compare_cl.compute(candidate_links, dfa, dfb)

# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42) # random_state=42

# Get the true pairs for the test set (Used for Evaluation)
test_matches_index = test.index.intersection(true_matches)

In [None]:
# Threshold-based methods

print("Training with threshold-based methods")
# Make Predictions based on threshold > 50%
predictions = features[features.sum(axis=1) > 1.5]
print("Threshold-Based: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(true_matches, predictions, len(features))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

# Return all the true matched feature as test set
# test_matches_index = features.index.intersection(true_matches)
# print(test_matches_index)


# Expectation/Conditional Maxisation Classifier

print("\nTraining with Expectation/Conditional Maxisation Classifier")
# Initialize the classifier
ecm = recordlinkage.ECMClassifier()
# Train the Model
train_ecm = ecm.fit_predict(train)
# Make Predictions on a test set
predictions = ecm.predict(test)
print("Expectation/Conditional Maxisation: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# K-means Classifier
# Initialize the classifier
print("\nTraining with K-means Classifier")
kmeans = recordlinkage.KMeansClassifier()

# Train the Model
train_kmeans = kmeans.fit_predict(train)
# Make Predictions on a test set
predictions = kmeans.predict(test)
print("K-means: {} matches".format(len(predictions)))
# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42)
# Get the true pairs for each set
train_matches_index = train.index.intersection(true_matches)
test_matches_index = test.index.intersection(true_matches)

# Logistic Regression
# Initialize the classifier
print("\nTraining with Logistic Regression")
logisticRegression = recordlinkage.LogisticRegressionClassifier()

# Train the classifier
logisticRegression.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = logisticRegression.predict(test)
print("Logistic Regression: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Support Vector Machine Classifier
# Initialize the classifier
print("\nTraining with Support Vector Machine")
svm = recordlinkage.SVMClassifier()

# Train the classifier
svm.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = svm.predict(test)
print("Support Vector Machine: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Naive Bayes Classifier
# Initialize the classifier
print("\nTraining with Naive Bayes Classifier")
naiveBayes = recordlinkage.NaiveBayesClassifier()

# Train the classifier
naiveBayes.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = naiveBayes.predict(test)
print("Naive Bayes: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))