In [1]:
import argparse
import numpy as np
import pandas as pd
import itertools
import os
import tqdm
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.metrics import roc_auc_score
import gc
from thefuzz import fuzz
from thefuzz import process
import time
import re
import recordlinkage
from sklearn.model_selection import train_test_split

gc.collect()

0

In [2]:
'''
# this function will remove all special charaters -- including spaces
# but too slow comparing to replace -- used process_time() for evaluation
def clean_text(text): # fa
    a = ""
    text = a.join(char for char in text if char.isalnum())
    return text
'''
# Remove square blankets auto generated during data alignment process
def clean_text(text): # fb
    text = text.replace('[', '').replace(']','').replace("'", '')
    return text

def remove_spec_in_col(df, col):
    newCol = []
    for index, rowValue in df[col].iteritems():
        if pd.notnull(rowValue):
            newCol.append(clean_text(rowValue))
        else:
            newCol.append(np.nan)
    return newCol

In [3]:
def add_boolean_column(df, list_column_name, boolean_column_name):
    df[boolean_column_name] = df[list_column_name].apply(lambda x: False if x == "[]" else bool(x))
    return df

In [4]:
a = pd.read_csv('~/Downloads/a.csv',chunksize=10000,encoding='utf-8',on_bad_lines='skip',engine='python')
a = pd.concat(a)
a['wikiID'] = a['id']
a = a.set_index('id')
print(a.head())

c = pd.read_csv('c.csv',chunksize=10000,encoding='utf-8',on_bad_lines='skip',engine='python')
c = pd.concat(c)
c['acceptedNames'] = remove_spec_in_col(c,'acceptedNames')
c.rename(columns={'Unnamed: 0': 'bioID'}, inplace=True)
print(c.head())

                              label  dateOfBirth  dateOfDeath  \
id                                                              
Q100142069             Frida Eggens          NaN          NaN   
Q100146795       Elizabeth Harrison       1792.0       1834.0   
Q100149196              Russell Cox          NaN          NaN   
Q100152296  Alda Pereira da Fonseca       1882.0          NaN   
Q100156193  Laurence Henry Millener       1914.0       2000.0   

           countryOfCitizenshipISO harvardIndex             bionomia  \
id                                                                     
Q100142069                      SE          NaN                  NaN   
Q100146795                      GB          NaN           Q100146795   
Q100149196                     NaN          NaN  0000-0001-5149-1709   
Q100152296                      BR          NaN                  NaN   
Q100156193                      NZ          NaN           Q100156193   

           authorAbbrv                 

In [5]:
# Find out the True Matches of dataframe A and dataframe B
temp = pd.merge(a, c, how='inner', on=None, left_on='wikiID', right_on='wikidata',
                  left_index=False, right_index=False, sort=False,
                  suffixes=('_wiki', '_bionomia'), copy=False, indicator=False)

print(temp.columns.to_list())
print(len(temp))
temp.head()

['label_wiki', 'dateOfBirth_wiki', 'dateOfDeath_wiki', 'countryOfCitizenshipISO', 'harvardIndex', 'bionomia', 'authorAbbrv', 'aliases', 'firstName_wiki', 'lastName_wiki', 'wikiID', 'bioID', 'orcid', 'wikidata', 'fullname', 'fullname_reverse', 'label_bionomia', 'orgs', 'countries', 'combined_fullname', 'dateOfBirth_bionomia', 'dateOfDeath_bionomia', 'acceptedNames', 'firstName_bionomia', 'lastName_bionomia']
23895


Unnamed: 0,label_wiki,dateOfBirth_wiki,dateOfDeath_wiki,countryOfCitizenshipISO,harvardIndex,bionomia,authorAbbrv,aliases,firstName_wiki,lastName_wiki,...,fullname_reverse,label_bionomia,orgs,countries,combined_fullname,dateOfBirth_bionomia,dateOfDeath_bionomia,acceptedNames,firstName_bionomia,lastName_bionomia
0,Elizabeth Harrison,1792.0,1834.0,GB,,Q100146795,,"Mrs Arnold Harrison, Mrs A. H.",Elizabeth,Harrison,...,"Harrison, Elizabeth",Elizabeth Harrison,,,Elizabeth Harrison,1792.0,1834.0,"Mrs Arnold Harrison, Mrs A. H.",Elizabeth,Harrison
1,Laurence Henry Millener,1914.0,2000.0,NZ,,Q100156193,,"L. H. Millener, Laurie Henry Millener, Laurie ...",Laurence,Millener,...,"Millener, Laurence Henry",Laurence Henry Millener,,,Laurence Henry Millener,1914.0,2000.0,"L. H. Millener, Laurie Henry Millener, Laurie ...",Laurence Henry,Millener
2,Thomas Leonard Lancaster,1888.0,1945.0,NZ,,Q100156252,,T. L. Lancaster,Thomas,Lancaster,...,"Lancaster, Thomas Leonard",Thomas Leonard Lancaster,,,Thomas Leonard Lancaster,1888.0,1945.0,T. L. Lancaster,Thomas Leonard,Lancaster
3,Ross Henry Michie,1894.0,1987.0,NZ,,Q100157099,,"Ross Michie, R. H. Michie",Ross,Michie,...,"Michie, Ross Henry",Ross Henry Michie,,,Ross Henry Michie,1894.0,1987.0,"Ross Michie, R. H. Michie",Ross Henry,Michie
4,Johann Bartsch,1709.0,1738.0,DE,27614.0,,Bartsch,"Johannes Bartsch, Joannes Bartsch, Bartsch",Johann,Bartsch,...,"Bartsch, Johann",Johann Bartsch,,,Johann Bartsch,1709.0,1738.0,"Joannes Bartsch, Johannes Bartsch, Bartsch",Johann,Bartsch


In [6]:
'''
use the pd.merge above to prepare training data -- find out the index true pairs
training = 0.75, testing = 0.25, no validat set

experiment setting
    - supervised: Logistics regression, navie bayes, SVM
    - undupervised: k-mean, ECM

refs
https://recordlinkage.readthedocs.io/en/latest/ref-classifiers.html
https://recordlinkage.readthedocs.io/en/latest/guides/classifiers.html#
'''

def define_true_pairs(indexList1, indexList2, indexName1, indexName2):
    arrays = [indexList1, indexList2]
    tuples = list(zip(*arrays))
    index = pd.MultiIndex.from_tuples(tuples, names=[indexName1, indexName2])
    return index

In [7]:
true_matches = define_true_pairs(temp['wikiID'],temp['bioID'].astype(int),'wikiIndex','bionomiaIndex')

In [8]:
true_matches

MultiIndex([('Q100146795',  2448),
            ('Q100156193',  9747),
            ('Q100156252', 11197),
            ('Q100157099',  9750),
            (   'Q100222', 23973),
            (  'Q1002345',  2752),
            ('Q100270468',  9509),
            ('Q100354624',  3503),
            ('Q100364040',  3940),
            ('Q100390903',  9749),
            ...
            (    'Q65402', 48638),
            (    'Q65451', 48536),
            (    'Q65475',  2961),
            (    'Q65499',  6972),
            (    'Q65505', 47115),
            (     'Q6694',  7405),
            (     'Q7324',  5272),
            (      'Q762', 38823),
            (     'Q8619', 12153),
            (      'Q926',   948)],
           names=['wikiIndex', 'bionomiaIndex'], length=23895)

In [9]:
dfa = a.copy()
dfc = c.copy()

In [16]:
# Load feature DataFrames
dfa_feature1 = pd.read_csv('WinB.csv')  
dfc_feature1 = pd.read_csv('BinW.csv')

In [17]:
dfa_feature1

Unnamed: 0,wikiID,wiki_inBionomiaList
0,Q100142069,[]
1,Q100146795,[]
2,Q100149196,[]
3,Q100152296,[]
4,Q100156193,[9747]
...,...,...
71983,Q7324,"[5272, 36031]"
71984,Q7450,[16879]
71985,Q762,[38823]
71986,Q8619,[73810]


In [18]:
dfc_feature1

Unnamed: 0,bioID,Bionomia_in_WikiNameList
0,0,"['Q105943570', 'Q108887181', 'Q111615324']"
1,1,['Q21340129']
2,2,['Q60377700']
3,3,['Q57235902']
4,4,['Q11728182']
...,...,...
78707,78707,['Q88862468']
78708,78708,['Q15711474']
78709,78709,"['Q106889970', 'Q6073558']"
78710,78710,"['Q106889970', 'Q6073558']"


In [19]:
# Merge feature DataFrames into main DataFrames
dfa = pd.merge(dfa, dfa_feature1, on='wikiID')
dfc = pd.merge(dfc, dfc_feature1, on='bioID')

# Set index
dfa.set_index('wikiID', inplace=True)
dfc.set_index('bioID', inplace=True)

# Add custom boolean feature
dfa = add_boolean_column(dfa, 'wiki_inBionomiaList', 'has_BionomiaList')
dfc = add_boolean_column(dfc, 'Bionomia_in_WikiNameList', 'has_WikiNameList')

In [20]:
indexer = recordlinkage.Index()
indexer.sortedneighbourhood('lastName')# indexer.sortedneighbourhood indexer.full() indexer.block
candidate_links = indexer.index(dfa, dfc)

In [48]:
compare_cl = recordlinkage.Compare()
# method options: ‘jaro’, ’jarowinkler’, ‘levenshtein’, ‘damerau_levenshtein’, ‘qgram’ or ‘cosine’.
compare_cl.string('firstName', 'firstName', method='damerau_levenshtein', threshold=0.85, label='firstName')
compare_cl.exact('dateOfBirth', 'dateOfBirth', label='dateOfBirth')
compare_cl.exact('dateOfDeath', 'dateOfDeath', label='dateOfDeath')
# compare_cl.string('authorAbbrv', 'B & P Author Abbrev.', method='damerau_levenshtein', threshold=0.85, label='authorAbbrv')
# compare_cl.string('countryOfCitizenshipISO', 'geographyISO', method='damerau_levenshtein', threshold=0.85, label='geographyISO')
compare_cl.exact('has_BionomiaList', 'has_WikiNameList', label='custom_boolean_feature')
# compare_cl.add(CompareAliases('label', 'Name', threshold=0.85, label='sim'))

features = compare_cl.compute(candidate_links, dfa, dfc)
features.sum(axis=1).value_counts().sort_index(ascending=False)

4.0     11708
3.0      8059
2.0      7947
1.0    563195
0.0    336045
dtype: int64

In [49]:
predictions = features[features.sum(axis=1) >= 1]
len(predictions.index.intersection(true_matches))/len(temp)

0.9008997698263235

In [31]:
# Threshold-based methods

# Make Predictions based on threshold > 50%
predictions = features[features.sum(axis=1) > 1.5]
print("Threshold-Based: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(true_matches, predictions, len(features))

Threshold-Based: 21218 matches


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [32]:
# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

Confusion Matrix:
 [[ 20837   3058]
 [   381 902678]]
Precision: 0.982043547931002
Recall: 0.8720234358652438
Accuracy: 0.9962899992879906
F-Measure: 0.9237692017821914


In [33]:
# Return all the true matched feature as test set
test_matches_index = features.index.intersection(true_matches)
print(test_matches_index)

MultiIndex([( 'Q70043892',  4447),
            ('Q100606303',  1232),
            ('Q108779939',  9657),
            ('Q100919649',  5455),
            (  'Q5752310',  4630),
            ( 'Q67155257',  5457),
            ('Q101096835',  3879),
            ('Q101115567',  1956),
            (  'Q6761413', 68700),
            ( 'Q47038330',  7168),
            ...
            (   'Q872093',  9204),
            (   'Q890923',  2519),
            ( 'Q89657013',  2135),
            (  'Q9074850', 12139),
            ( 'Q95821614',  9198),
            (   'Q936950', 34679),
            (   'Q946113', 26942),
            (    'Q95772', 45208),
            (   'Q964497',  5738),
            (    'Q97218',  9285)],
           length=21554)


In [55]:
# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42) # random_state=42

# Get the true pairs for the test set (Used for Evaluation)
test_matches_index = test.index.intersection(true_matches)

In [62]:
# Expectation/Conditional Maxisation Classifier

# Initialize the classifier
ecm = recordlinkage.ECMClassifier()
# Train the Model
train_ecm = ecm.fit_predict(train)
# Make Predictions on a test set
predictions = ecm.predict(features)

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [63]:
# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

Confusion Matrix:
 [[  5157    149]
 [ 16061 210372]]
Precision: 0.24304835517013856
Recall: 0.9719185827365246
Accuracy: 0.9300506172892781
F-Measure: 0.3888553762630071


In [64]:
predictions

MultiIndex([( 'Q70043892',  4447),
            ('Q100606303',  1232),
            ('Q100919649',  5455),
            (  'Q5752310',  4630),
            ( 'Q67155257',  5457),
            ('Q101096835',  3879),
            ('Q101115567',  1956),
            (  'Q6761413', 68700),
            ('Q115246433', 11681),
            (   'Q102493', 49380),
            ...
            (   'Q890923',  2519),
            (    'Q89645', 60405),
            ( 'Q89657013',  2135),
            (  'Q9074850', 12139),
            ( 'Q95821614',  9198),
            (   'Q936950', 34679),
            (   'Q946113', 26942),
            (    'Q95772', 45208),
            (   'Q964497',  5738),
            (    'Q97218',  9285)],
           names=['wikiID', 'bioID'], length=21218)

In [37]:
# K-means Classifier
# Initialize the classifier
kmeans = recordlinkage.KMeansClassifier()

# Train the Model
train_kmeans = kmeans.learn(train)
# Make Predictions on a test set
predictions = kmeans.predict(test)

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [38]:
# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

Confusion Matrix:
 [[  5157    149]
 [    81 226352]]
Precision: 0.9845360824742269
Recall: 0.9719185827365246
Accuracy: 0.9990075041318035
F-Measure: 0.9781866464339909


In [39]:
# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42)

# Get the true pairs for each set
train_matches_index = train.index.intersection(true_matches)
test_matches_index = test.index.intersection(true_matches)

In [40]:
# Logistic Regression

# Initialize the classifier
logisticRegression = recordlinkage.LogisticRegressionClassifier()

# Train the classifier
logisticRegression.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = logisticRegression.predict(test)

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

  y.loc[match_index & comparison_vectors.index] = 1
  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [41]:
# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

Confusion Matrix:
 [[  5157    149]
 [    81 226352]]
Precision: 0.9845360824742269
Recall: 0.9719185827365246
Accuracy: 0.9990075041318035
F-Measure: 0.9781866464339909


In [42]:
# Support Vector Machine Classifier

# Initialize the classifier
svm = recordlinkage.SVMClassifier()

# Train the classifier
svm.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = svm.predict(test)

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

  y.loc[match_index & comparison_vectors.index] = 1
  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [43]:
# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

Confusion Matrix:
 [[  5157    149]
 [    81 226352]]
Precision: 0.9845360824742269
Recall: 0.9719185827365246
Accuracy: 0.9990075041318035
F-Measure: 0.9781866464339909


In [44]:
# Naive Bayes Classifier

# Initialize the classifier
naiveBayes = recordlinkage.NaiveBayesClassifier()

# Train the classifier
naiveBayes.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = naiveBayes.predict(test)

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

  y.loc[match_index & comparison_vectors.index] = 1
  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [45]:
# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

Confusion Matrix:
 [[  5157    149]
 [    81 226352]]
Precision: 0.9845360824742269
Recall: 0.9719185827365246
Accuracy: 0.9990075041318035
F-Measure: 0.9781866464339909


In [46]:
def find_AinB(df1, col1, df2, col2, threshold):
    newCol = []
    for index1, rowValue1 in df1[col1].iteritems():
        temp = []
        for index2, rowValue2 in df2[col2].iteritems():
            # 
            sim = process.extractOne(str(rowValue1),str(rowValue2).split(','), scorer=fuzz.ratio)
            if sim[-1] >= threshold: temp.append(index2)
        newCol.append(temp)
    return newCol

In [None]:
temp = find_AinB(a, 'label', b, 'Name', 85)

In [None]:
from fastparquet import ParquetFile
temp.to_parquet('find_names.parquet', engine='fastparquet',encoding='utf-8')

In [56]:
# K-means Classifier
# Initialize the classifier
kmeans = recordlinkage.KMeansClassifier()

# Train the Model
train_kmeans = kmeans.learn(train)
# Make Predictions on a test set
predictions = kmeans.predict(features)

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [57]:
# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

Confusion Matrix:
 [[  5157    149]
 [ 16061 210372]]
Precision: 0.24304835517013856
Recall: 0.9719185827365246
Accuracy: 0.9300506172892781
F-Measure: 0.3888553762630071


In [58]:
predictions

MultiIndex([( 'Q70043892',  4447),
            ('Q100606303',  1232),
            ('Q100919649',  5455),
            (  'Q5752310',  4630),
            ( 'Q67155257',  5457),
            ('Q101096835',  3879),
            ('Q101115567',  1956),
            (  'Q6761413', 68700),
            ('Q115246433', 11681),
            (   'Q102493', 49380),
            ...
            (   'Q890923',  2519),
            (    'Q89645', 60405),
            ( 'Q89657013',  2135),
            (  'Q9074850', 12139),
            ( 'Q95821614',  9198),
            (   'Q936950', 34679),
            (   'Q946113', 26942),
            (    'Q95772', 45208),
            (   'Q964497',  5738),
            (    'Q97218',  9285)],
           names=['wikiID', 'bioID'], length=21218)

In [26]:
compare_cl = recordlinkage.Compare()
# method options: ‘jaro’, ’jarowinkler’, ‘levenshtein’, ‘damerau_levenshtein’, ‘qgram’ or ‘cosine’.
compare_cl.string('firstName', 'firstName', method='damerau_levenshtein', threshold=0.85, label='firstName')
compare_cl.exact('dateOfBirth', 'dateOfBirth', label='dateOfBirth')
compare_cl.exact('dateOfDeath', 'dateOfDeath', label='dateOfDeath')
# compare_cl.string('authorAbbrv', 'B & P Author Abbrev.', method='damerau_levenshtein', threshold=0.85, label='authorAbbrv')
# compare_cl.string('countryOfCitizenshipISO', 'geographyISO', method='damerau_levenshtein', threshold=0.85, label='geographyISO')
compare_cl.exact('has_BionomiaList', 'has_WikiNameList', label='custom_boolean_feature')
# compare_cl.add(CompareAliases('label', 'Name', threshold=0.85, label='sim'))

features = compare_cl.compute(candidate_links, dfa, dfc)
features.sum(axis=1).value_counts().sort_index(ascending=False)

4.0     11708
3.0      8059
2.0      7947
1.0    563195
0.0    336045
dtype: int64

In [27]:
# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42) # random_state=42

# Get the true pairs for the test set (Used for Evaluation)
test_matches_index = test.index.intersection(true_matches)

In [28]:
# Threshold-based methods

print("Training with threshold-based methods")
# Make Predictions based on threshold > 50%
predictions = features[features.sum(axis=1) > 1.5]
print("Threshold-Based: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(true_matches, predictions, len(features))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))

# Return all the true matched feature as test set
# test_matches_index = features.index.intersection(true_matches)
# print(test_matches_index)


# Expectation/Conditional Maxisation Classifier

print("\nTraining with Expectation/Conditional Maxisation Classifier")
# Initialize the classifier
ecm = recordlinkage.ECMClassifier()
# Train the Model
train_ecm = ecm.fit_predict(train)
# Make Predictions on a test set
predictions = ecm.predict(test)
print("Expectation/Conditional Maxisation: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# K-means Classifier
# Initialize the classifier
print("\nTraining with K-means Classifier")
kmeans = recordlinkage.KMeansClassifier()

# Train the Model
train_kmeans = kmeans.fit_predict(train)
# Make Predictions on a test set
predictions = kmeans.predict(test)
print("K-means: {} matches".format(len(predictions)))
# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


Training with threshold-based methods
Threshold-Based: 27714 matches


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


Confusion Matrix:
 [[ 21362   2533]
 [  6352 896707]]
Precision: 0.7708017608428953
Recall: 0.8939945595312827
Accuracy: 0.9904148425919733
F-Measure: 0.8278401054079716

Training with Expectation/Conditional Maxisation Classifier
Expectation/Conditional Maxisation: 5238 matches
Confusion Matrix:
 [[  5157    149]
 [    81 226352]]
Precision: 0.9845360824742269
Recall: 0.9719185827365246
Accuracy: 0.9990075041318035
F-Measure: 0.9781866464339909

Training with K-means Classifier


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


K-means: 5238 matches
Confusion Matrix:
 [[  5157    149]
 [    81 226352]]
Precision: 0.9845360824742269
Recall: 0.9719185827365246
Accuracy: 0.9990075041318035
F-Measure: 0.9781866464339909


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [29]:
# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42)
# Get the true pairs for each set
train_matches_index = train.index.intersection(true_matches)
test_matches_index = test.index.intersection(true_matches)

# Logistic Regression
# Initialize the classifier
print("\nTraining with Logistic Regression")
logisticRegression = recordlinkage.LogisticRegressionClassifier()

# Train the classifier
logisticRegression.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = logisticRegression.predict(test)
print("Logistic Regression: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Support Vector Machine Classifier
# Initialize the classifier
print("\nTraining with Support Vector Machine")
svm = recordlinkage.SVMClassifier()

# Train the classifier
svm.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = svm.predict(test)
print("Support Vector Machine: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


# Naive Bayes Classifier
# Initialize the classifier
print("\nTraining with Naive Bayes Classifier")
naiveBayes = recordlinkage.NaiveBayesClassifier()

# Train the classifier
naiveBayes.fit_predict(train, train_matches_index)

# Make Predictions on a test set
predictions = naiveBayes.predict(test)
print("Naive Bayes: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


Training with Logistic Regression


  y.loc[match_index & comparison_vectors.index] = 1


Logistic Regression: 5238 matches
Confusion Matrix:
 [[  5157    149]
 [    81 226352]]
Precision: 0.9845360824742269
Recall: 0.9719185827365246
Accuracy: 0.9990075041318035
F-Measure: 0.9781866464339909

Training with Support Vector Machine


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)
  y.loc[match_index & comparison_vectors.index] = 1


Support Vector Machine: 5238 matches
Confusion Matrix:
 [[  5157    149]
 [    81 226352]]
Precision: 0.9845360824742269
Recall: 0.9719185827365246
Accuracy: 0.9990075041318035
F-Measure: 0.9781866464339909

Training with Naive Bayes Classifier


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)
  y.loc[match_index & comparison_vectors.index] = 1


Naive Bayes: 5238 matches
Confusion Matrix:
 [[  5157    149]
 [    81 226352]]
Precision: 0.9845360824742269
Recall: 0.9719185827365246
Accuracy: 0.9990075041318035
F-Measure: 0.9781866464339909


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)
