In [1]:
import argparse
import numpy as np
import pandas as pd
import itertools
import os
import tqdm
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.metrics import roc_auc_score
import gc
from thefuzz import fuzz
from thefuzz import process
import time
import re
import recordlinkage
from sklearn.model_selection import train_test_split

# Run a full garbage-collection pass; the number echoed as the cell output is
# gc.collect()'s return value (count of unreachable objects found).
gc.collect()

0

In [2]:
# Reader options shared by both files: read in 10000-row chunks with the python
# engine, decode as UTF-8, and skip malformed lines instead of raising.
csv_options = dict(chunksize=10000, encoding='utf-8', on_bad_lines='skip', engine='python')

# Dataset A: keep a copy of the id as the 'wikiID' column, then index by 'id'.
a = (
    pd.concat(pd.read_csv('~/Downloads/a.csv', **csv_options))
    .assign(wikiID=lambda frame: frame['id'])
    .set_index('id')
)
print(a.head())

# Dataset B: keep a copy of the id as the 'harvardIndex' column, then index by 'id'.
b = (
    pd.concat(pd.read_csv('~/Downloads/b.csv', **csv_options))
    .assign(harvardIndex=lambda frame: frame['id'])
    .set_index('id')
)
print(b.head())

                              label  dateOfBirth  dateOfDeath  \
id                                                              
Q100142069             Frida Eggens          NaN          NaN   
Q100146795       Elizabeth Harrison       1792.0       1834.0   
Q100149196              Russell Cox          NaN          NaN   
Q100152296  Alda Pereira da Fonseca       1882.0          NaN   
Q100156193  Laurence Henry Millener       1914.0       2000.0   

           countryOfCitizenshipISO harvardIndex             bionomia  \
id                                                                     
Q100142069                      SE          NaN                  NaN   
Q100146795                      GB          NaN           Q100146795   
Q100149196                     NaN          NaN  0000-0001-5149-1709   
Q100152296                      BR          NaN                  NaN   
Q100156193                      NZ          NaN           Q100156193   

           authorAbbrv                 

In [3]:
# Find the true matches between dataframe A and dataframe B.
# Values in A's harvardIndex that cannot be parsed as numbers become NaN.
a['harvardIndex'] = pd.to_numeric(a['harvardIndex'], errors='coerce')

# Inner join on the shared key. Arguments that only restated the pandas
# defaults (on, left_index, right_index, sort, indicator) are omitted.
temp = pd.merge(
    a,
    b,
    how='inner',
    left_on='harvardIndex',
    right_on='harvardIndex',
    suffixes=('_wiki', '_harvard'),
    copy=False,
)

print(temp.columns.to_list())
temp.head()

['label', 'dateOfBirth', 'dateOfDeath', 'countryOfCitizenshipISO', 'harvardIndex', 'bionomia', 'authorAbbrv', 'aliases', 'firstName_wiki', 'lastName_wiki', 'wikiID', 'Standard/Label Name', 'birthYear', 'deathYear', 'birthYearIsApprox', 'geographyISO', 'firstName_harvard', 'middleName', 'lastName_harvard', 'B & P Author Abbrev.', 'Name']


Unnamed: 0,label,dateOfBirth,dateOfDeath,countryOfCitizenshipISO,harvardIndex,bionomia,authorAbbrv,aliases,firstName_wiki,lastName_wiki,...,Standard/Label Name,birthYear,deathYear,birthYearIsApprox,geographyISO,firstName_harvard,middleName,lastName_harvard,B & P Author Abbrev.,Name
0,Johann Bartsch,1709.0,1738.0,DE,27614.0,,Bartsch,"Johannes Bartsch, Joannes Bartsch, Bartsch",Johann,Bartsch,...,J. Bartsch,1709.0,1738.0,False,,Johann,,Bartsch,Bartsch,"Bartsch, Johann, Johann Bartsch"
1,Townshend Stith Brandegee,1843.0,1925.0,US,17284.0,Q1002345,Brandegee,"Brandegee, T. S. Brandegee, Townshend S. Brand...",Townshend,Brandegee,...,T. S. Brandegee,1843.0,1925.0,False,"MX, US",Townshend,Stith,Brandegee,Brandegee,"Brandegee, Townshend Stith, Townshend Stith Br..."
2,Cécile Kruyfhooft,1950.0,,BE,18988.0,,,,Cécile,Kruyfhooft,...,C. Kruyfhooft,1950.0,,False,BE,Cecile,,Kruyfhooft,,"Kruyfhooft, Cecile"
3,Charles Bullard,1869.0,1960.0,US,3553.0,Q100354624,,C. Bullard,Charles,Bullard,...,Charles Bullard,1869.0,,False,US,Charles,,Bullard,,"Bullard, Charles"
4,Zubair Aslam,,,,88384.0,,,,Zubair,Aslam,...,Z. Aslam,,,,KR,Zubair,,Aslam,Aslam,"Aslam, Zubair"


In [4]:
'''
use the pd.merge above to prepare training data -- find the index pairs of the true matches
training = 0.75, testing = 0.25, no validation set

experiment setting
    - supervised: logistic regression, naive Bayes, SVM
    - unsupervised: k-means, ECM

refs
https://recordlinkage.readthedocs.io/en/latest/ref-classifiers.html
https://recordlinkage.readthedocs.io/en/latest/guides/classifiers.html#
'''

def define_true_pairs(indexList1, indexList2, indexName1, indexName2):
    """Build a two-level MultiIndex pairing the i-th id of each input.

    Parameters
    ----------
    indexList1, indexList2 : iterable
        Ids of the first and second record of every pair, in matching order.
    indexName1, indexName2 : str
        Names given to the first and second level of the result.

    Returns
    -------
    pandas.MultiIndex
        One ``(id1, id2)`` entry per position of the inputs.

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of elements.
    """
    first_ids = list(indexList1)
    second_ids = list(indexList2)
    # zip() would silently drop the surplus ids of the longer input, which would
    # yield an incomplete set of pairs without any warning.
    if len(first_ids) != len(second_ids):
        raise ValueError(
            "indexList1 and indexList2 must have the same length "
            "(got {} and {})".format(len(first_ids), len(second_ids))
        )
    pairs = list(zip(first_ids, second_ids))
    return pd.MultiIndex.from_tuples(pairs, names=[indexName1, indexName2])

In [5]:
# Known (wikiID, harvardIndex) pairs taken from the merged frame; harvardIndex is
# cast to int before pairing.
true_matches = define_true_pairs(
    indexList1=temp['wikiID'],
    indexList2=temp['harvardIndex'].astype(int),
    indexName1='wikiIndex',
    indexName2='harvardIndex',
)

In [6]:
# Work on copies of the two loaded frames.
dfa, dfb = a.copy(), b.copy()

# Blocking: candidate pairs are generated by blocking on 'lastName'.
indexer = recordlinkage.Index()
indexer.block('lastName')  # alternative: indexer.sortedneighbourhood
candidate_links = indexer.index(dfa, dfb)

# Options shared by every string comparison below.
jw_options = dict(method='jarowinkler', threshold=0.85)

# One feature column per comparison, added in this order.
compare_cl = recordlinkage.Compare()
compare_cl.string('firstName', 'firstName', label='firstName', **jw_options)
compare_cl.exact('dateOfBirth', 'birthYear', label='dateOfBirth')
compare_cl.exact('dateOfDeath', 'deathYear', label='dateOfDeath')
compare_cl.string('authorAbbrv', 'B & P Author Abbrev.', label='authorAbbrv', **jw_options)
compare_cl.string('countryOfCitizenshipISO', 'geographyISO', label='geographyISO', **jw_options)
# compare_cl.add(CompareAliases('label', 'Name', threshold=0.85, label='sim'))

features = compare_cl.compute(candidate_links, dfa, dfb)

# Distribution of the per-pair feature sum, highest sum first.
features.sum(axis=1).value_counts().sort_index(ascending=False)

5.0      1993
4.0      8057
3.0      9194
2.0     19307
1.0     96232
0.0    650037
dtype: int64

In [9]:
# Threshold-based methods

# Predict a match when the feature sum of a pair exceeds 2.5, i.e. more than
# half of the five comparison features agree.
row_scores = features.sum(axis=1)
predictions = features[row_scores > 2.5]
print("Threshold-Based: {} matches".format(len(predictions)))

# Confusion matrix: counts of true/false positives and true/false negatives.
# NOTE(review): this scores against all of true_matches, whereas the classifier
# cells first intersect true_matches with the candidate pairs -- confirm the
# difference is intended before comparing recall across methods.
confusion_matrix = recordlinkage.confusion_matrix(
    links_true=true_matches,
    links_pred=predictions,
    total=len(features),
)

Threshold-Based: 19244 matches


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [12]:
# Print Metrics (threshold-based predictions)
for metric_label, metric_fn in (
    ("Precision:", recordlinkage.precision),
    ("Recall:", recordlinkage.recall),
    ("Accuracy:", recordlinkage.accuracy),
    ("F-Measure:", recordlinkage.fscore),
):
    print(metric_label, metric_fn(confusion_matrix))

Precision: 0.8341301184784868
Recall: 0.5174392366707498
Accuracy: 0.9768583879106038
F-Measure: 0.6386822106393983


In [61]:
# Create a training and test set (75% / 25%).
# random_state is fixed so that the split, and every metric computed from it,
# is the same on each run of the notebook.
train, test = train_test_split(features, test_size=0.25, random_state=42)

# Get the true pairs for the test set (Used for Evaluation)
test_matches_index = test.index.intersection(true_matches)

In [62]:
# Expectation/Conditional Maximisation (ECM) classifier.
# No true-match labels are passed to it; it is fitted on the comparison vectors alone.

ecm = recordlinkage.ECMClassifier()

# Fit on the training comparison vectors; the returned prediction for `train`
# is kept in train_ecm.
train_ecm = ecm.fit_predict(train)

# Classify the held-out comparison vectors.
predictions = ecm.predict(test)

# Confusion matrix: counts of true/false positives and true/false negatives.
confusion_matrix = recordlinkage.confusion_matrix(
    links_true=test_matches_index,
    links_pred=predictions,
    total=len(test),
)

  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [63]:
# Print Metrics (ECM classifier)
for metric_label, metric_fn in (
    ("Precision:", recordlinkage.precision),
    ("Recall:", recordlinkage.recall),
    ("Accuracy:", recordlinkage.accuracy),
    ("F-Measure:", recordlinkage.fscore),
):
    print(metric_label, metric_fn(confusion_matrix))

Precision: 0.7020594965675058
Recall: 0.9122807017543859
Accuracy: 0.98372110802477
F-Measure: 0.793482477693004


In [64]:
# K-means Classifier
# Initialize the classifier
kmeans = recordlinkage.KMeansClassifier()

# Train the model. fit_predict replaces the deprecated `learn` alias and is the
# same call the ECM cell uses; its prediction for `train` is kept in train_kmeans.
train_kmeans = kmeans.fit_predict(train)
# Make predictions on the test set
predictions = kmeans.predict(test)

# Get the confusion matrix. This is just the table with the numbers of True/False Positives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(test_matches_index, predictions, len(test))

  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [65]:
# Print Metrics (K-means classifier)
for metric_label, metric_fn in (
    ("Precision:", recordlinkage.precision),
    ("Recall:", recordlinkage.recall),
    ("Accuracy:", recordlinkage.accuracy),
    ("F-Measure:", recordlinkage.fscore),
):
    print(metric_label, metric_fn(confusion_matrix))

Precision: 0.7034937528921795
Recall: 0.9041034790365745
Accuracy: 0.9836497540837389
F-Measure: 0.7912817176317501


In [66]:
# Create a training and test set (75% / 25%).
# random_state is fixed so that the split, and every metric computed from it,
# is the same on each run of the notebook.
train, test = train_test_split(features, test_size=0.25, random_state=42)

# Get the true pairs for each set
train_matches_index = train.index.intersection(true_matches)
test_matches_index = test.index.intersection(true_matches)

In [68]:
# Logistic Regression (supervised: trained with the known true pairs of `train`)

logisticRegression = recordlinkage.LogisticRegressionClassifier()

# Fit on the training comparison vectors and their true-match index.
logisticRegression.fit_predict(train, train_matches_index)

# Classify the held-out comparison vectors.
predictions = logisticRegression.predict(test)

# Confusion matrix: counts of true/false positives and true/false negatives.
confusion_matrix = recordlinkage.confusion_matrix(
    links_true=test_matches_index,
    links_pred=predictions,
    total=len(test),
)

  y.loc[match_index & comparison_vectors.index] = 1
  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [69]:
# Print Metrics (logistic regression)
for metric_label, metric_fn in (
    ("Precision:", recordlinkage.precision),
    ("Recall:", recordlinkage.recall),
    ("Accuracy:", recordlinkage.accuracy),
    ("F-Measure:", recordlinkage.fscore),
):
    print(metric_label, metric_fn(confusion_matrix))

Precision: 0.7267690757995467
Recall: 0.8388315651794798
Accuracy: 0.9832878876685099
F-Measure: 0.7787897186804291


In [72]:
# Support Vector Machine Classifier (supervised: trained with the known true pairs of `train`)

svm = recordlinkage.SVMClassifier()

# Fit on the training comparison vectors and their true-match index.
svm.fit_predict(train, train_matches_index)

# Classify the held-out comparison vectors.
predictions = svm.predict(test)

# Confusion matrix: counts of true/false positives and true/false negatives.
confusion_matrix = recordlinkage.confusion_matrix(
    links_true=test_matches_index,
    links_pred=predictions,
    total=len(test),
)

  y.loc[match_index & comparison_vectors.index] = 1
  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [73]:
# Print Metrics (SVM)
for metric_label, metric_fn in (
    ("Precision:", recordlinkage.precision),
    ("Recall:", recordlinkage.recall),
    ("Accuracy:", recordlinkage.accuracy),
    ("F-Measure:", recordlinkage.fscore),
):
    print(metric_label, metric_fn(confusion_matrix))

Precision: 0.8369587735653615
Recall: 0.5871239645400378
Accuracy: 0.981509135852807
F-Measure: 0.6901264092927912


In [74]:
# Naive Bayes Classifier (supervised: trained with the known true pairs of `train`)

naiveBayes = recordlinkage.NaiveBayesClassifier()

# Fit on the training comparison vectors and their true-match index.
naiveBayes.fit_predict(train, train_matches_index)

# Classify the held-out comparison vectors.
predictions = naiveBayes.predict(test)

# Confusion matrix: counts of true/false positives and true/false negatives.
confusion_matrix = recordlinkage.confusion_matrix(
    links_true=test_matches_index,
    links_pred=predictions,
    total=len(test),
)

  y.loc[match_index & comparison_vectors.index] = 1
  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


In [75]:
# Print Metrics (naive Bayes)
for metric_label, metric_fn in (
    ("Precision:", recordlinkage.precision),
    ("Recall:", recordlinkage.recall),
    ("Accuracy:", recordlinkage.accuracy),
    ("F-Measure:", recordlinkage.fscore),
):
    print(metric_label, metric_fn(confusion_matrix))

Precision: 0.7081312732604037
Recall: 0.9125127161749745
Accuracy: 0.9837414948650646
F-Measure: 0.7974345948691897


In [None]:
def find_AinB(df1, col1, df2, col2, threshold):
    """For each value of ``df1[col1]``, list the ``df2`` index labels that fuzzy-match it.

    Every ``df2[col2]`` value is converted with ``str()`` and split on commas.
    A ``df2`` row counts as a match when the best ``fuzz.ratio`` score between
    the ``df1`` value (also converted with ``str()``) and any of those pieces is
    at least ``threshold``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames holding the values to look up and the values to search in.
    col1, col2 : str
        Column of ``df1`` / ``df2`` to compare.
    threshold : int or float
        Minimum score for a ``df2`` row to be reported.

    Returns
    -------
    list of list
        One inner list per row of ``df1``, in row order, containing the index
        labels of the matching ``df2`` rows (empty when nothing matches).

    Notes
    -----
    Every row of ``df1`` is compared with every row of ``df2``, so the running
    time grows with ``len(df1) * len(df2)``. Missing values are not skipped;
    they are compared as whatever text ``str()`` produces for them.
    """
    # Split each df2 value once up front instead of once per df1 row.
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    candidates = [
        (index2, str(rowValue2).split(','))
        for index2, rowValue2 in df2[col2].items()
    ]

    newCol = []
    for _, rowValue1 in df1[col1].items():
        query = str(rowValue1)
        matched = []
        for index2, pieces in candidates:
            sim = process.extractOne(query, pieces, scorer=fuzz.ratio)
            # The last element of the result is read as the score; a None
            # result is treated as "no match".
            if sim is not None and sim[-1] >= threshold:
                matched.append(index2)
        newCol.append(matched)
    return newCol

In [None]:
# For every 'label' in `a`, collect the ids of `b` rows whose comma-separated
# 'Name' entries score at least 85.
temp = find_AinB(df1=a, col1='label', df2=b, col2='Name', threshold=85)

In [None]:
from fastparquet import ParquetFile

# find_AinB returns a plain Python list (one list of matching `b` ids per row
# of `a`), and a list has no .to_parquet(); wrap it in a DataFrame keyed by the
# ids of `a` first. The `encoding` keyword was dropped because it is not a
# fastparquet write option.
find_names = pd.DataFrame({'wikiID': a.index, 'harvardMatches': temp})
find_names.to_parquet('find_names.parquet', engine='fastparquet')