In [1]:
import argparse
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import itertools
import os
import tqdm
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.metrics import roc_auc_score
import gc
from thefuzz import fuzz
from thefuzz import process
import time
import re
import recordlinkage
from sklearn.model_selection import train_test_split
from fastparquet import ParquetFile

gc.collect()

0

In [2]:
'''
# this function will remove all special charaters -- including spaces
# but too slow comparing to replace -- used process_time() for evaluation
def clean_text(text): # fa
    a = ""
    text = a.join(char for char in text if char.isalnum())
    return text
'''
# Remove square blankets auto generated during data alignment process
def clean_text(text): # fb
    text = text.replace('[', '').replace(']','').replace("'", '')
    return text

def remove_spec_in_col(df, col):
    newCol = []
    for index, rowValue in df[col].iteritems():
        if pd.notnull(rowValue):
            newCol.append(clean_text(rowValue))
        else:
            newCol.append(np.nan)
    return newCol

In [3]:
def find_AinB(df1, col1, df2, col2, threshold):
    newCol = []
    for index1, rowValue1 in df1[col1].iteritems():
        temp = []
        for index2, rowValue2 in df2[col2].iteritems():
            # 
            sim = process.extractOne(str(rowValue1),str(rowValue2).split(','), scorer=fuzz.ratio)
            if sim[-1] >= threshold: temp.append(index2)
        newCol.append(temp)
    return newCol

In [13]:
'''
use the pd.merge above to prepare training data -- find out the index true pairs
training = 0.75, testing = 0.25, no validat set

experiment setting
    - supervised: Logistics regression, navie bayes, SVM
    - undupervised: k-mean, ECM

refs
https://recordlinkage.readthedocs.io/en/latest/ref-classifiers.html
https://recordlinkage.readthedocs.io/en/latest/guides/classifiers.html#
'''

def define_true_pairs(indexList1, indexList2, indexName1, indexName2):
    arrays = [indexList1, indexList2]
    tuples = list(zip(*arrays))
    index = pd.MultiIndex.from_tuples(tuples, names=[indexName1, indexName2])
    return index

In [14]:
def add_boolean_column(df, list_column_name, boolean_column_name):
    df[boolean_column_name] = df[list_column_name].apply(lambda x: False if x == "[]" else bool(x))
    return df

In [10]:
a = pd.read_csv('~/Downloads/a.csv',chunksize=10000,encoding='utf-8',on_bad_lines='skip',engine='python')
a = pd.concat(a)
a['wikiID'] = a['id']
a = a.set_index('id')
print(a.head())

b = pd.read_csv('~/Downloads/b.csv',chunksize=10000,encoding='utf-8',on_bad_lines='skip',engine='python')
b = pd.concat(b)
b['harvardIndex'] = b['id']
b = b.set_index('id')
print(b.head())

c = pd.read_csv('c.csv',chunksize=10000,encoding='utf-8',on_bad_lines='skip',engine='python')
c = pd.concat(c)
c['acceptedNames'] = remove_spec_in_col(c,'acceptedNames')
print(c.head())

                              label  dateOfBirth  dateOfDeath  \
id                                                              
Q100142069             Frida Eggens          NaN          NaN   
Q100146795       Elizabeth Harrison       1792.0       1834.0   
Q100149196              Russell Cox          NaN          NaN   
Q100152296  Alda Pereira da Fonseca       1882.0          NaN   
Q100156193  Laurence Henry Millener       1914.0       2000.0   

           countryOfCitizenshipISO harvardIndex             bionomia  \
id                                                                     
Q100142069                      SE          NaN                  NaN   
Q100146795                      GB          NaN           Q100146795   
Q100149196                     NaN          NaN  0000-0001-5149-1709   
Q100152296                      BR          NaN                  NaN   
Q100156193                      NZ          NaN           Q100156193   

           authorAbbrv                 

In [None]:
print('Finding Bionomia in Wiki')
temp = find_AinB(c, 'fullname', a, 'aliases', 85)
df = pd.DataFrame(data={'bioID': c['Unnamed: 0'],"Bionomia_in_WikiNameList": temp})
df.to_csv("BinW.csv", sep=',', encoding='utf-8',index=False)
print('Finished finding Bionomia in Wiki')

In [None]:
print('Finding Bionomia in Harvard')
temp = find_AinB(c, 'fullname', b, 'Name', 85)
df = pd.DataFrame(data={'bioID': c['Unnamed: 0'],"Bionomia_in_HarvardNameList": temp})
df.to_csv("BinH.csv", sep=',', encoding='utf-8',index=False)
print('Finished finding Bionomia in Harvard')

In [None]:
print('Finding Harvard in Wiki')
temp = find_AinB(b, 'labelName', a, 'aliases', 85)
df = pd.DataFrame(data={'harvardIndex': b['harvardIndex'],"havard_in_WikiNameList": temp})
df.to_csv("HinW.csv", sep=',', encoding='utf-8',index=False)
print('Finished finding Harvard in Wiki')

In [None]:
print('Finding Harvard in Bionomia')
temp = find_AinB(b, 'labelName', c, 'acceptedNames', 85)
df = pd.DataFrame(data={'harvardIndex': b['harvardIndex'],"harvard_in_BionomiaNameList": temp})
df.to_csv("HinB.csv", sep=',', encoding='utf-8',index=False)
print('Finished finding Harvard in Bionomia')

In [None]:
print('Finding Wiki in Harvard')
temp = find_AinB(a, 'label', b, 'Name', 85)
df = pd.DataFrame(data={'wikiID': a['wikiID'],"wikiLabel_in_HarvardNameList": temp})
df.to_csv("WinH.csv", sep=',', encoding='utf-8',index=False)
print('Finished finding Wiki in Harvard')

In [None]:
print('Finding Wiki in Bionomia')
temp = find_AinB(a, 'label', c, 'acceptedNames', 85)
df = pd.DataFrame(data={'wikiID': a['wikiID'],"wikiLabel_in_BionomiaNameList": temp})
df.to_csv("WinB.csv", sep=',', encoding='utf-8',index=False)
print('Finished finding Wiki in Bionomia')

In [18]:
# Find out the True Matches of dataframe A and dataframe B
a['harvardIndex'] = pd.to_numeric(a['harvardIndex'],errors='coerce') 
temp = pd.merge(a, b, how='inner', on=None, left_on='harvardIndex', right_on='harvardIndex',
                  left_index=False, right_index=False, sort=False,
                  suffixes=('_wiki', '_harvard'), copy=False, indicator=False)

# print(temp.columns.to_list())
# temp.head()

true_matches = define_true_pairs(temp['wikiID'],temp['harvardIndex'].astype(int),'wikiIndex','harvardIndex')
# print(true_matches)
print('There is '+ str(len(true_matches)) +' HarvardIndex records in Wikidata that can find a match, which is ' + str(len(true_matches)/len(a)*100) +'%')
print('There is '+ str(len(true_matches)) +' Wikidata records can be found in HarvardIndex records, which is ' + str(len(true_matches)/len(b)*100) +'%')

There is 31022 HarvardIndex records in Wikidata that can find a match, which is 43.09329332666555%
There is 31022 HarvardIndex records can be found in Wikidata, which is 40.55375444467685%


In [15]:
dfa = a.copy()
dfb = b.copy()
# Set indices
dfa.set_index('wikiID', inplace=True)
dfb.set_index('harvardIndex', inplace=True)

In [16]:
# Load feature DataFrames
dfa_feature1 = pd.read_csv('WinH.csv')  
dfb_feature1 = pd.read_csv('HinW.csv')  

# Merge feature DataFrames into main DataFrames
dfa = pd.merge(dfa, dfa_feature1, on='wikiID')
dfb = pd.merge(dfb, dfb_feature1, on='harvardIndex')

# Set index
dfa.set_index('wikiID', inplace=True)
dfb.set_index('harvardIndex', inplace=True)

# Add custom boolean feature
dfa = add_boolean_column(dfa, 'wikiLabel_in_HarvardNameList', 'has_HarvardNameList')
dfb = add_boolean_column(dfb, 'havard_in_WikiNameList', 'has_WikiNameList')

In [19]:
# Initialize the indexer and create candidate links
indexer = recordlinkage.Index()
indexer.sortedneighbourhood('lastName')  # Can change to other methods like indexer.full(), indexer.block()
candidate_links = indexer.index(dfa, dfb)
# Initialize the comparator
compare_cl = recordlinkage.Compare()

# Add existing comparison features
compare_cl.exact('dateOfBirth', 'birthYear', label='dateOfBirth')
compare_cl.string('authorAbbrv', 'B & P Author Abbrev.', method='damerau_levenshtein', threshold=0.85, label='authorAbbrv')
# Add boolean comparison feature using compare.exact
compare_cl.exact('has_HarvardNameList', 'has_WikiNameList', label='custom_boolean_feature')

features = compare_cl.compute(candidate_links, dfa, dfb)

# Create a training and test set
train, test = train_test_split(features, test_size=0.25, random_state=42) # random_state=42

# Get the true pairs for the test set (Used for Evaluation)
test_matches_index = test.index.intersection(true_matches)

In [21]:
print("\nPredicting with Expectation/Conditional Maxisation Classifier")
# Initialize the classifier
ecm = recordlinkage.ECMClassifier()
# Train the Model
train_ecm = ecm.fit_predict(train)
# Make Predictions on a test set
predictions = ecm.predict(features)
print("Expectation/Conditional Maxisation: {} matches".format(len(predictions)))

# Get the confusion matrix. This is just the table with the numbers of True/False Postives and True/False Negatives.
confusion_matrix = recordlinkage.confusion_matrix(true_matches, predictions, len(test))

# Print Metrics
print("Confusion Matrix:\n", confusion_matrix)
print("Precision:", recordlinkage.precision(confusion_matrix))
print("Recall:", recordlinkage.recall(confusion_matrix))
print("Accuracy:", recordlinkage.accuracy(confusion_matrix))
print("F-Measure:", recordlinkage.fscore(confusion_matrix))


Predicting with Expectation/Conditional Maxisation Classifier
Expectation/Conditional Maxisation: 36043 matches


  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


Confusion Matrix:
 [[ 25480   5542]
 [ 10563 209418]]
Precision: 0.7069333851233249
Recall: 0.8213525884855909
Accuracy: 0.9358374202698773
F-Measure: 0.7598598374711101


In [23]:
# Convert matches columns to string to ensure consistency
matches = predictions.to_frame(index=False).reset_index()
matches['wikiID'] = matches['wikiID'].astype(str)
matches['harvardIndex'] = matches['harvardIndex'].astype(str)
dfa.index = dfa.index.astype(str)
dfb.index = dfb.index.astype(str)

# Ensure the matches DataFrame is properly structured
matches = matches[['wikiID', 'harvardIndex']]

# Merge the matches with the original data
matched_dfa = dfa.loc[matches['wikiID']].reset_index()
matched_dfb = dfb.loc[matches['harvardIndex']].reset_index()

# Combine the matched DataFrames side by side
combined_matches = pd.concat([matched_dfa, matched_dfb], axis=1)

# Optionally add a label to identify matched rows
combined_matches['matched'] = True

In [25]:
# Convert the matches DataFrame to a MultiIndex
matches['harvardIndex'] = matches['harvardIndex'].astype(int)
matches_index = pd.MultiIndex.from_frame(matches)
# print(matches_index)

# Find the intersection
intersection = matches_index.intersection(true_matches)
# print(true_matches)
print(intersection)

# Find the non-intersection pairs
non_intersection = matches_index.difference(true_matches)
print(non_intersection)

MultiIndex([('Q100887787', 19901),
            ('Q117455407', 50431),
            ( 'Q33665872', 15901),
            (   'Q106785', 21404),
            ( 'Q21505291', 14754),
            ( 'Q36645320', 71958),
            (   'Q538065', 78373),
            ('Q109941265', 41038),
            ('Q110222595', 15016),
            ('Q111635491', 18682),
            ...
            ( 'Q95101266', 36482),
            ( 'Q95166173', 11149),
            ( 'Q95175049',  7562),
            ( 'Q95394399', 26018),
            ( 'Q95471212',  4709),
            (    'Q95772', 15179),
            (    'Q96384',  5059),
            (    'Q27684',  1609),
            (    'Q59570',  2718),
            (    'Q62938',  1671)],
           names=[None, 'harvardIndex'], length=25480)
MultiIndex([(   'Q100222', 78617),
            ('Q100400504', 18935),
            ('Q100400770', 75599),
            ('Q100701296', 74664),
            ('Q100709237', 42243),
            ('Q100869469', 22907),
            ('Q100

In [24]:
combined_matches

Unnamed: 0,wikiID,label,dateOfBirth,dateOfDeath,countryOfCitizenshipISO,harvardIndex,bionomia,authorAbbrv,aliases,firstName,...,birthYearIsApprox,geographyISO,firstName.1,middleName,lastName,B & P Author Abbrev.,Name,havard_in_WikiNameList,has_WikiNameList,matched
0,Q4322660,Vladimir Novikov,1940.0,2016.0,RU,70088.0,,Novikov,"Vladimir Sergeevitsj Novikov, Novikov, V.S. No...",Vladimir,...,False,RU,Nadezhda,Grigorievna,Novikova,Novikova,"Novikova, Nadezhda Grigorievna, Nadezhda Grigo...",[],False,True
1,Q100222,Johann Bartsch,1709.0,1738.0,DE,27614,,Bartsch,"Johannes Bartsch, Joannes Bartsch, Bartsch",Johann,...,,DE,Hildegard,,Bartusch,Bartusch,"Bartusch, Hildegard, Hildegard Bartusch","['Q100222', 'Q111492349', 'Q213687', 'Q2575794...",True,True
2,Q21608912,Joseph Henry Simmonds,1845.0,1936.0,,28235.0,,Simmonds,Simmonds,Joseph,...,False,SE,Hermann,George,Simmons,Simmons,"Simmons, Hermann George, Hermann George Simmons","['Q100600540', 'Q113009189', 'Q19002124', 'Q21...",True,True
3,Q100887787,A.J. Ultee,1878.0,1964.0,NL,19901,,,Arnoldus Johannes Ultee,A.J.,...,False,ID,Arnoldus,Johannes,Ultée,,"Ultée, Arnoldus Johannes",[],False,True
4,Q117455407,Rosa Lydia Otto,1909.0,,,50431,,,"Rosa Lydia Otto-Surbeck, Mrs L. Otto-Surbeck",Rosa,...,False,ID,Rosa,Lydia,Otto-Surbeck,,"Otto-Surbeck, Rosa Lydia, Otto-Surbeck, R. L.",['Q117455407'],True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36038,Q59570,Theodor Koch-Grunberg,1872.0,1924.0,DE,2718.0,,,Theodor Koch-Grünberg,Theodor,...,False,"CO, VE, BR",Christian,Theodor,Koch,,"Koch, Christian Theodor, Koch, Christian Theod...",[],False,True
36039,Q61439,Georg Eberhard Rumphius,1627.0,1702.0,NL,1118.0,,Rumph.,"Rumph., Georg Eberhard Rumpf, G.E. Rumphius",Georg,...,False,,Gerog,Eberhard,Rumpf,,"Rumpf, Gerog Eberhard",[],False,True
36040,Q62846,Ernst Rudolf von Trautvetter,1809.0,1889.0,,460.0,,Trautv.,"Trautv., Ernst Rudolph von Trautvetter",Ernst,...,False,,Robert,,Trautmann,Trautm.,"Trautmann, Robert, Robert Trautmann",['Q21610935'],True,True
36041,Q62938,Joseph zu Salm-Reifferscheidt-Dyck,1773.0,1861.0,FR,1671.0,,Salm-Dyck,"Salm-Dyck, Joseph de Salm-Reifferscheidt-Dyck",Joseph,...,False,"DE, ZA",Joseph,Franz Maria Anton Hubert Ignatz Fürst zu,Salm-Reifferscheid-Dyck,Salm-Dyck,"Salm-Reifferscheid-Dyck, Joseph Franz Maria An...",[],False,True


In [26]:
# Create a DataFrame from the non-intersection pairs
non_intersection_df = pd.DataFrame(list(non_intersection), columns=['wikiID', 'harvardIndex'])
# Convert matches columns to string to ensure consistency
non_intersection_df['wikiID'] = non_intersection_df['wikiID'].astype(str)
non_intersection_df['harvardIndex'] = non_intersection_df['harvardIndex'].astype(str)

In [27]:
notfound_matched_dfa = dfa.loc[non_intersection_df['wikiID']].reset_index()
notfound_matched_dfb = dfb.loc[non_intersection_df['harvardIndex']].reset_index()
# Combine the matched DataFrames side by side
combined_notfound_matches = pd.concat([notfound_matched_dfa, notfound_matched_dfb], axis=1)

# Optionally add a label to identify matched rows
combined_notfound_matches['matched'] = True

In [29]:
W_match_H = pd.concat([combined_matches, combined_notfound_matches], axis=1)
W_match_H.head()

Unnamed: 0,wikiID,label,dateOfBirth,dateOfDeath,countryOfCitizenshipISO,harvardIndex,bionomia,authorAbbrv,aliases,firstName,...,birthYearIsApprox,geographyISO,firstName.1,middleName,lastName,B & P Author Abbrev.,Name,havard_in_WikiNameList,has_WikiNameList,matched
0,Q4322660,Vladimir Novikov,1940.0,2016.0,RU,70088.0,,Novikov,"Vladimir Sergeevitsj Novikov, Novikov, V.S. No...",Vladimir,...,,DE,Hildegard,,Bartusch,Bartusch,"Bartusch, Hildegard, Hildegard Bartusch","['Q100222', 'Q111492349', 'Q213687', 'Q2575794...",True,True
1,Q100222,Johann Bartsch,1709.0,1738.0,DE,27614.0,,Bartsch,"Johannes Bartsch, Joannes Bartsch, Bartsch",Johann,...,False,US,Robert,Samuel,Campbell,R. S. Campb.,"Campbell, Robert Samuel, Robert Samuel Campbell","['Q100400504', 'Q108991211', 'Q118471123', 'Q5...",True,True
2,Q21608912,Joseph Henry Simmonds,1845.0,1936.0,,28235.0,,Simmonds,Simmonds,Joseph,...,False,,Ernest,,Hemmendorf,Hemmend.,"Hemmendorf, Ernest, Ernest Hemmendorf",[],False,True
3,Q100887787,A.J. Ultee,1878.0,1964.0,NL,19901.0,,,Arnoldus Johannes Ultee,A.J.,...,False,,Thorvaldur,,Johnson,Thorv. Johnson,"Johnson, Thorvaldur, Thorvaldur Johnson",['Q21517128'],True,True
4,Q117455407,Rosa Lydia Otto,1909.0,,,50431.0,,,"Rosa Lydia Otto-Surbeck, Mrs L. Otto-Surbeck",Rosa,...,False,ID,Hein,Hidde,Zeijlstra,,"Zeijlstra, Hein Hidde",[],False,True


In [30]:
print('There is '+ str(len(W_match_H)) +' HarvardIndex records in Wikidata that can find a match, which is ' + str(len(W_match_H)/len(a)*100) +'%')
print('There is '+ str(len(W_match_H)) +' HarvardIndex records can be found in Wikidata, which is ' + str(len(W_match_H)/len(b)*100) +'%')

There is 36043 HarvardIndex records in Wikidata that can find a match, which is 50.0680669000389%
There is 36043 HarvardIndex records can be found in Wikidata, which is 47.11749633967789%
