In [94]:
import re
from namematcher import NameMatcher

name_matcher = NameMatcher()

In [95]:
score = name_matcher.match_names('Nat Ahn', 'Natalie Ahn')
print(score)

0.9874999999999999


In [96]:
score = name_matcher.match_names('Natalie Ahn', 'Gabrielle Elul')
print(score)

0.2220802083934297


In [97]:
# Alternative ways to pick the distance function. The statements run in order,
# so the second assignment replaces the first and `name_matcher` ends up being
# the 'jaro_winkler' instance for every cell that follows.
name_matcher = NameMatcher(distfun='levenshtein') # default
name_matcher = NameMatcher(distfun='jaro_winkler')
# NOTE(review): a callable is presumably accepted in place of a name — confirm against the library.
# name_matcher = NameMatcher(distfun=my_callable_function)

In [98]:
# Names to look up, written as "First [Middle] Last".
sample_names = [
    'Nat G. Ahn',
    'John Doe',
    'AJ Smith',
    'Rob Smith',
]

# Candidate population to search, written as "Last, First [Middle/Suffix]".
pop_names = [
    'Ahn, Natalie Grace',
    'Ahn, Nancy G.',
    'Smith, Adam Jr.',
    'Smith, Peter Robert',
    'Doe, Paul',
    'Doh, John',
    'Anh, Nathan',
    'Smith, Albert III',
]

In [99]:
# Look up the closest population entry for every sample name and report it.
# Each item of `matches` is unpacked as (matched name, index, score).
matches = name_matcher.find_closest_names(sample_names, pop_names)
for orig_name, (pop_name, pop_index, score) in zip(sample_names, matches):
    print('For name: %s, best match: %s, score %f' % (orig_name, pop_name, score))


For name: Nat G. Ahn, best match: Ahn, Nancy G., score 0.972639
For name: John Doe, best match: Doh, John, score 0.882667
For name: AJ Smith, best match: Smith, Adam Jr., score 0.863333
For name: Rob Smith, best match: Smith, Peter Robert, score 0.916250


### Additions to the code

Parameters can be changed, e.g. to reduce or eliminate the discount on a match between a first/middle initial and a first/middle name. In the example above, "Nat G. Ahn" was matched to "Ahn, Nancy G." though it should probably have matched "Ahn, Natalie Grace". That's because matching the middle initials "G." and "G." got a higher score than matching "G." to "Grace". If we change the parameter `disc_initial` to be closer to 1, the name we want becomes the best match.

In [100]:
name_matcher.params['disc_initial'] = 0.9

In [101]:
# Re-run the lookup with the current matcher parameters and print each
# sample name next to its best population match and score.
matches = name_matcher.find_closest_names(sample_names, pop_names)
for orig_name, (pop_name, pop_index, score) in zip(sample_names, matches):
    print('For name: %s, best match: %s, score %f' % (orig_name, pop_name, score))


For name: Nat G. Ahn, best match: Ahn, Natalie Grace, score 0.981250
For name: John Doe, best match: Doh, John, score 0.882667
For name: AJ Smith, best match: Smith, Adam Jr., score 0.863333
For name: Rob Smith, best match: Smith, Peter Robert, score 0.916250


### Additions to the code


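We could also try reducing or eliminating the discount on an abbreviation (i.e. a shortened version of a first/middle name that's longer than an initial) so that "Nat" and "Natalie" are treated as a perfect or nearly perfect match, with the intent that the one-letter difference between "Nat" and "Nancy" drops that pair to second place. Note that in the run below, which sets `disc_initial` back to 0.8 and uses the Jaro-Winkler matcher, "Ahn, Nancy G." is still returned as the best match for "Nat G. Ahn", so raising `disc_abbrev` alone does not achieve this here.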

In [102]:
# Put disc_initial back to 0.8 (an earlier cell raised it to 0.9) and raise the
# abbreviation discount instead.
# NOTE(review): 0.8 is presumably the library default for disc_initial — confirm.
name_matcher.params['disc_initial'] = 0.8
name_matcher.params['disc_abbrev'] = 0.99

In [103]:
# Repeat the closest-name lookup using whatever discounts are currently set
# on `name_matcher`, printing one line per sample name.
matches = name_matcher.find_closest_names(sample_names, pop_names)
for orig_name, (pop_name, pop_index, score) in zip(sample_names, matches):
    print('For name: %s, best match: %s, score %f' % (orig_name, pop_name, score))


For name: Nat G. Ahn, best match: Ahn, Nancy G., score 0.976750
For name: John Doe, best match: Doh, John, score 0.882667
For name: AJ Smith, best match: Smith, Adam Jr., score 0.870000
For name: Rob Smith, best match: Smith, Peter Robert, score 0.923250


In [42]:
try_matcher = NameMatcher(distfun='jaro_winkler')

Katherine Ann Labrecque <br>
Mrs Katherine A Labrecque <br>
Kathy Labrecque <br>
Kathy Labrecque <br>

<br> <br>
Michael L Womack <br>
Michael Lynn Womack <br>
Mike Womack <br>
Michael Womack <br>

 <br> <br>
DEBORAH W O'BRIEN <br>
DEBORAH-KA ARTZ <br>
D.K. ARTZ <br>
DEBORAH ARTZ <br>

 <br> <br>
HARIRAM  GUPTA <br>
HARRY BAWEJA <br>
HARINDER SINGH BAWEJA <br>

In [105]:
score = try_matcher.match_names('Katherine Ann Labrecque', 'Mrs Katherine A Labrecque')
print(score)

0.89


In [106]:
score = try_matcher.match_names('Katherine Ann Labrecque', 'Kathy Labrecque')
print(score)

0.9526663931344486


In [107]:
score = try_matcher.match_names('Michael Lynn Womack', 'Michael L Womack')
print(score)

0.9500000000000001


In [108]:
score = try_matcher.match_names('Michael Lynn Womack', 'Michael Womack')
print(score)

0.9874999999999999


In [109]:
# These names are different hence low scores
score = try_matcher.match_names("DEBORAH W O'BRIEN", 'DEBORAH-KA ARTZ')
print(score)

0.46246333362799064


In [110]:
score = try_matcher.match_names("DEBORAH-KA ARTZ", 'D.K. ARTZ')
print(score)

0.8925


In [111]:
score = try_matcher.match_names("DEBORAH-KA ARTZ", 'DEBORAH ARTZ')
print(score)

0.9874999999999999


In [112]:
# Since these names are different, scores are low
score = try_matcher.match_names("HARIRAM GUPTA", 'HARRY BAWEJA')
print(score)

0.5053610070785122


In [113]:
score = try_matcher.match_names("HARINDER SINGH BAWEJA", 'HARRY BAWEJA')
print(score)

0.9245673292645868


In [114]:
score = try_matcher.match_names('Abigail Timothy', 'Abbie Timothy')
print(score)

0.8956224582817963


In [115]:
# This is fetching us very low scores
score = try_matcher.match_names('Kenneth Irvine Chenault', 'KIC')
print(score)

0.18906015629507225


In [116]:
# This is fetching us very low scores
score = try_matcher.match_names('NEENA  JAIN', 'NEENA GROVER')
print(score)

0.2294714728255649


In [43]:
score = try_matcher.match_names('Nick Jr', 'Nick')
print(score)

0.7050000000000001


In [118]:
score = try_matcher.match_names('Steven', 'Stephen')
print(score)

0.6586917501586543


### See how the scores change after changing the weights

In [119]:
# Giving higher weights to Suffixes
# Only the entry at index 2 is overwritten, in place; the other entries of the
# weights list keep whatever values they already had.
# NOTE(review): the weights presumably no longer sum to 1 after this change, which
# would explain the score above 1.0 printed for 'Steven' vs 'Stephen' below — confirm.
try_matcher.params['weights'][2] = 0.5

In [120]:
score = try_matcher.match_names('Nick Jr', 'Nick')
print(score)

0.91


In [121]:
# Phonetics is working
# NOTE(review): the score printed below is above 1.0. The only change since the
# earlier 'Steven' vs 'Stephen' run is the larger weights[2] value, so the jump
# presumably comes from the re-weighting rather than from phonetic matching — confirm.
score = try_matcher.match_names('Steven', 'Stephen')
print(score)

1.0686917501586544


### Cases which are not working

In [122]:
# This score is after increasing the weight but the score is still low
score = try_matcher.match_names('Kenneth Irvine Chenault', 'KIC')
print(score)

0.5990601562950723


In [123]:
# This score is after increasing the weight but the score is still low
score = try_matcher.match_names('NEENA  JAIN', 'NEENA GROVER')
print(score)

0.6394714728255649


In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
import textdistance

In [24]:
import re
from namematcher import NameMatcher

In [25]:
# Load the spreadsheet of name pairs; the bare file name is resolved against the
# current working directory.
df = pd.read_excel('EXL_Live_Project.xlsx')
# Preview the first rows as the cell's output.
df.head()

Unnamed: 0,Original Name,Incorrect Name
0,Chenault,Chenalt
1,Chenault,Chenaolt
2,Chenault,Chneault
3,Sean O'Neil,Sean ONeil
4,Claudia-Karina Elizabeth,Claudia Karina Elizabeth


In [44]:
# All Functions Here

def td_leven(x):
    """Return the normalized Levenshtein similarity of the first two values in ``x``.

    Args:
        x: An ordered collection whose first two items are the strings to
            compare, e.g. a DataFrame row passed by ``df.apply(..., axis=1)``.
            Any further items are ignored.

    Returns:
        The result of ``textdistance.levenshtein.normalized_similarity`` for
        the two strings.
    """
    # Unpack by position instead of using x[0] / x[1]: integer keys on a
    # label-indexed pandas Series rely on a deprecated positional fallback.
    original, incorrect, *_ = x
    return textdistance.levenshtein.normalized_similarity(original, incorrect)

def td_hamming(x):
    """Return the normalized Hamming similarity of the first two values in ``x``.

    Args:
        x: An ordered collection whose first two items are the strings to
            compare, e.g. a DataFrame row passed by ``df.apply(..., axis=1)``.
            Any further items are ignored.

    Returns:
        The result of ``textdistance.hamming.normalized_similarity`` for the
        two strings.
    """
    # Unpack by position instead of using x[0] / x[1]: integer keys on a
    # label-indexed pandas Series rely on a deprecated positional fallback.
    original, incorrect, *_ = x
    return textdistance.hamming.normalized_similarity(original, incorrect)

def td_dleven(x):
    """Return the normalized Damerau-Levenshtein similarity of the first two values in ``x``.

    Args:
        x: An ordered collection whose first two items are the strings to
            compare, e.g. a DataFrame row passed by ``df.apply(..., axis=1)``.
            Any further items are ignored.

    Returns:
        The result of ``textdistance.damerau_levenshtein.normalized_similarity``
        for the two strings.
    """
    # Unpack by position instead of using x[0] / x[1]: integer keys on a
    # label-indexed pandas Series rely on a deprecated positional fallback.
    original, incorrect, *_ = x
    return textdistance.damerau_levenshtein.normalized_similarity(original, incorrect)

def td_jarowinkler(x):
    """Return the normalized Jaro-Winkler similarity of the first two values in ``x``.

    Args:
        x: An ordered collection whose first two items are the strings to
            compare, e.g. a DataFrame row passed by ``df.apply(..., axis=1)``.
            Any further items are ignored.

    Returns:
        The result of ``textdistance.jaro_winkler.normalized_similarity`` for
        the two strings.
    """
    # Unpack by position instead of using x[0] / x[1]: integer keys on a
    # label-indexed pandas Series rely on a deprecated positional fallback.
    original, incorrect, *_ = x
    return textdistance.jaro_winkler.normalized_similarity(original, incorrect)

def td_smithwaterman(x):
    """Return the normalized Smith-Waterman similarity of the first two values in ``x``.

    Args:
        x: An ordered collection whose first two items are the strings to
            compare, e.g. a DataFrame row passed by ``df.apply(..., axis=1)``.
            Any further items are ignored.

    Returns:
        The result of ``textdistance.smith_waterman.normalized_similarity``
        for the two strings.
    """
    # Unpack by position instead of using x[0] / x[1]: integer keys on a
    # label-indexed pandas Series rely on a deprecated positional fallback.
    original, incorrect, *_ = x
    return textdistance.smith_waterman.normalized_similarity(original, incorrect)

def td_mongeelkan(x):
    """Return the normalized Monge-Elkan similarity of the first two values in ``x``.

    Args:
        x: An ordered collection whose first two items are the strings to
            compare, e.g. a DataFrame row passed by ``df.apply(..., axis=1)``.
            Any further items are ignored.

    Returns:
        The result of ``textdistance.monge_elkan.normalized_similarity`` for
        the two strings.
    """
    # Unpack by position instead of using x[0] / x[1]: integer keys on a
    # label-indexed pandas Series rely on a deprecated positional fallback.
    original, incorrect, *_ = x
    return textdistance.monge_elkan.normalized_similarity(original, incorrect)

def td_needlewunsch(x):
    """Return the normalized Needleman-Wunsch similarity of the first two values in ``x``.

    Args:
        x: An ordered collection whose first two items are the strings to
            compare, e.g. a DataFrame row passed by ``df.apply(..., axis=1)``.
            Any further items are ignored.

    Returns:
        The result of ``textdistance.needleman_wunsch.normalized_similarity``
        for the two strings.
    """
    # Unpack by position instead of using x[0] / x[1]: integer keys on a
    # label-indexed pandas Series rely on a deprecated positional fallback.
    original, incorrect, *_ = x
    return textdistance.needleman_wunsch.normalized_similarity(original, incorrect)

def td_gotoh(x):
    """Return the normalized Gotoh similarity of the first two values in ``x``.

    Args:
        x: An ordered collection whose first two items are the strings to
            compare, e.g. a DataFrame row passed by ``df.apply(..., axis=1)``.
            Any further items are ignored.

    Returns:
        The result of ``textdistance.gotoh.normalized_similarity`` for the
        two strings.
    """
    # Unpack by position instead of using x[0] / x[1]: integer keys on a
    # label-indexed pandas Series rely on a deprecated positional fallback.
    original, incorrect, *_ = x
    return textdistance.gotoh.normalized_similarity(original, incorrect)

def nm_leven(x):
    """Score the first two values in ``x`` with a default-constructed ``NameMatcher``.

    Args:
        x: An ordered collection whose first two items are the names to
            compare, e.g. a DataFrame row passed by ``df.apply(..., axis=1)``.
            Any further items are ignored.

    Returns:
        The value returned by ``NameMatcher().match_names`` for the two names.
    """
    # Unpack by position instead of using x[0] / x[1]: integer keys on a
    # label-indexed pandas Series rely on a deprecated positional fallback.
    original, incorrect, *_ = x
    # A fresh matcher is built on every call, so nothing is shared between rows.
    matcher = NameMatcher()
    return matcher.match_names(original, incorrect)

def nm_jarowinkler(x):
    """Score the first two values in ``x`` with a ``NameMatcher`` built with ``distfun='jaro_winkler'``.

    Args:
        x: An ordered collection whose first two items are the names to
            compare, e.g. a DataFrame row passed by ``df.apply(..., axis=1)``.
            Any further items are ignored.

    Returns:
        The value returned by the matcher's ``match_names`` for the two names.
    """
    # Unpack by position instead of using x[0] / x[1]: integer keys on a
    # label-indexed pandas Series rely on a deprecated positional fallback.
    original, incorrect, *_ = x
    # A fresh matcher is built on every call, so nothing is shared between rows.
    matcher = NameMatcher(distfun='jaro_winkler')
    return matcher.match_names(original, incorrect)

def nm_leven_weights_changed(x):
    """Score the first two values in ``x`` with a default ``NameMatcher`` using custom weights.

    The matcher's ``params['weights']`` entry is replaced with
    ``[0.1, 0.4, 0.5]`` before scoring.

    Args:
        x: An ordered collection whose first two items are the names to
            compare, e.g. a DataFrame row passed by ``df.apply(..., axis=1)``.
            Any further items are ignored.

    Returns:
        The value returned by the matcher's ``match_names`` for the two names.
    """
    # Unpack by position instead of using x[0] / x[1]: integer keys on a
    # label-indexed pandas Series rely on a deprecated positional fallback.
    original, incorrect, *_ = x
    # A fresh matcher is built on every call, so the override stays local to it.
    matcher = NameMatcher()
    # NOTE(review): what each position of the weights list stands for is defined
    # by the library — confirm before tuning these values.
    matcher.params['weights'] = [0.1, 0.4, 0.5]
    return matcher.match_names(original, incorrect)

def nm_jarowinkler_weights_changed(x):
    """Score the first two values in ``x`` with a Jaro-Winkler ``NameMatcher`` using custom weights.

    The matcher is built with ``distfun='jaro_winkler'`` and its
    ``params['weights']`` entry is replaced with ``[0.1, 0.4, 0.5]`` before
    scoring.

    Args:
        x: An ordered collection whose first two items are the names to
            compare, e.g. a DataFrame row passed by ``df.apply(..., axis=1)``.
            Any further items are ignored.

    Returns:
        The value returned by the matcher's ``match_names`` for the two names.
    """
    # Unpack by position instead of using x[0] / x[1]: integer keys on a
    # label-indexed pandas Series rely on a deprecated positional fallback.
    original, incorrect, *_ = x
    # A fresh matcher is built on every call, so the override stays local to it.
    matcher = NameMatcher(distfun='jaro_winkler')
    # NOTE(review): what each position of the weights list stands for is defined
    # by the library — confirm before tuning these values.
    matcher.params['weights'] = [0.1, 0.4, 0.5]
    return matcher.match_names(original, incorrect)

In [45]:
## Applying Functions Here

# Output column -> scoring function, listed in the order the columns are
# appended to df.
scorers_by_column = {
    'TD_Levenshtein': td_leven,
    'TD_Hamming': td_hamming,
    'TD_Demaru_Levenshtein': td_dleven,
    'TD_Jaro_Winkler': td_jarowinkler,
    'TD_Smith_Waterman': td_smithwaterman,
    'TD_MongeElkan': td_mongeelkan,
    'TD_Needle_Wunsch': td_needlewunsch,
    'TD_Gotoh': td_gotoh,
    'NM_Levenshtein': nm_leven,
    'NM_JaroWinkler': nm_jarowinkler,
    'NM_Levenshtein_Weights': nm_leven_weights_changed,
    'NM_JaroWinkler_Weights': nm_jarowinkler_weights_changed,
}

# Every scorer is applied row by row and reads only the first two values of
# the row, so columns added by earlier iterations do not affect later ones.
for column_name, scorer in scorers_by_column.items():
    df[column_name] = df.apply(scorer, axis=1)

In [46]:
df

Unnamed: 0,Original Name,Incorrect Name,TD_Levenshtein,TD_Hamming,TD_Demaru_Levenshtein,TD_Jaro_Winkler,TD_Smith_Waterman,TD_MongeElkan,TD_Needle_Wunsch,TD_Gotoh,NM_Levenshtein,NM_JaroWinkler,NM_Levenshtein_Weights,NM_JaroWinkler_Weights
0,Chenault,Chenalt,0.875,0.625,0.875,0.975,0.857143,0.054688,0.875,0.928571,0.71462,0.7335,0.878558,0.89
1,Chenault,Chenaolt,0.875,0.875,0.875,0.95,0.875,0.054688,0.9375,0.9375,0.71462,0.717,0.878558,0.88
2,Chenault,Chneault,0.75,0.75,0.875,0.966667,0.75,0.0625,0.875,0.875,0.67451,0.728,0.854249,0.886667
3,Sean O'Neil,Sean ONeil,0.909091,0.545455,0.909091,0.981818,0.9,0.041322,0.909091,0.95,0.947716,0.967,0.968313,0.98
4,Claudia-Karina Elizabeth,Claudia Karina Elizabeth,0.958333,0.958333,0.958333,0.957246,0.958333,0.019965,0.979167,0.979167,0.975625,0.975625,0.99025,0.99025
5,Kenneth Irvine Chenault,Chenault Kenneth Irvine,0.217391,0.130435,0.217391,0.743707,0.217391,0.021739,0.565217,0.621739,0.25156,0.491681,0.564624,0.709784
6,Kenneth Irvine,Kenneth,0.5,0.5,0.5,0.9,0.0,0.022959,0.5,0.757143,0.479735,0.603616,0.701227,0.778843
7,Kenneth Irvine,Irvine Chenault,0.133333,0.0,0.133333,0.498413,0.142857,0.033163,0.533333,0.535714,0.25156,0.491681,0.564624,0.709784
8,Kenneth Irvine Chenault,Kenneth I Chenault,0.782609,0.391304,0.782609,0.9343,0.722222,0.018904,0.782609,0.927778,0.95,0.95,0.98,0.98
9,Kenneth Irvine Chenault,Kenneth D Chenault,0.73913,0.347826,0.73913,0.913186,0.666667,0.017958,0.76087,0.9,0.925,0.925,0.97,0.97


In [47]:
# Show the name pairs for which at least one of the two NameMatcher scores
# (Levenshtein or Jaro-Winkler variant) is below 0.8.
cols = ['NM_Levenshtein', 'NM_JaroWinkler']
cols_to_disp = ['Original Name', 'Incorrect Name', *cols]
low_score_rows = df[cols].lt(0.8).any(axis=1)
df.loc[low_score_rows, cols_to_disp]

Unnamed: 0,Original Name,Incorrect Name,NM_Levenshtein,NM_JaroWinkler
0,Chenault,Chenalt,0.71462,0.7335
1,Chenault,Chenaolt,0.71462,0.717
2,Chenault,Chneault,0.67451,0.728
5,Kenneth Irvine Chenault,Chenault Kenneth Irvine,0.25156,0.491681
6,Kenneth Irvine,Kenneth,0.479735,0.603616
7,Kenneth Irvine,Irvine Chenault,0.25156,0.491681
10,Kenneth Irvine Chenault,K I C,0.360707,0.748972
11,Kenneth Irvine Chenault,KIC,0.18906,0.533056
17,Steven,Stephen,0.658692,0.680333
18,Nick Jr,Nick,0.705,0.705


In [48]:
# Same filter as for the unweighted columns, this time on the scores computed
# with the changed weights: keep rows where either score is below 0.8.
cols = ['NM_Levenshtein_Weights', 'NM_JaroWinkler_Weights']
cols_to_disp = ['Original Name', 'Incorrect Name', *cols]
low_score_rows = df[cols].lt(0.8).any(axis=1)
df.loc[low_score_rows, cols_to_disp]

Unnamed: 0,Original Name,Incorrect Name,NM_Levenshtein_Weights,NM_JaroWinkler_Weights
5,Kenneth Irvine Chenault,Chenault Kenneth Irvine,0.564624,0.709784
6,Kenneth Irvine,Kenneth,0.701227,0.778843
7,Kenneth Irvine,Irvine Chenault,0.564624,0.709784
10,Kenneth Irvine Chenault,K I C,0.651186,0.863889
11,Kenneth Irvine Chenault,KIC,0.539624,0.743333
18,Nick Jr,Nick,0.65,0.65
26,DEBORAH-KA ARTZ,DEBORAH W O'BRIEN,0.694319,0.740018
30,HARRY BAWEJA,HARIRAM GUPTA,0.714589,0.749226
32,NEENA JAIN,NEENA GROVER,0.555789,0.55671
