In [94]:
import re
from namematcher import NameMatcher

# Matcher constructed with no arguments, i.e. with the library's own settings.
name_matcher = NameMatcher()

In [95]:
# Shortened first name ('Nat') against the full form ('Natalie'), same surname.
score = name_matcher.match_names('Nat Ahn', 'Natalie Ahn')
print(score)

0.9874999999999999


In [96]:
# Two names that share neither a first name nor a surname.
score = name_matcher.match_names('Natalie Ahn', 'Gabrielle Elul')
print(score)

0.2220802083934297


In [97]:
# The distance function is chosen with the `distfun` argument; the lines below
# show alternative ways of passing it.
# NOTE: each assignment rebinds `name_matcher`, so only the last one executed
# (jaro_winkler) is in effect for the cells that follow.
name_matcher = NameMatcher(distfun='levenshtein') # default
name_matcher = NameMatcher(distfun='jaro_winkler')
# name_matcher = NameMatcher(distfun=my_callable_function)

In [98]:
# Names to look up, written given-name first.
sample_names = [
    'Nat G. Ahn',
    'John Doe',
    'AJ Smith',
    'Rob Smith',
]

# Candidate names to search, written as "Surname, Given names".
pop_names = [
    'Ahn, Natalie Grace',
    'Ahn, Nancy G.',
    'Smith, Adam Jr.',
    'Smith, Peter Robert',
    'Doe, Paul',
    'Doh, John',
    'Anh, Nathan',
    'Smith, Albert III',
]

In [99]:
# Look up every sample name in the population list and print one line per
# sample. Each entry of `matches` unpacks into (name, index, score) and is
# paired with the sample name at the same position.
matches = name_matcher.find_closest_names(sample_names, pop_names)
for idx, match in enumerate(matches):
    sample_name = sample_names[idx]
    pop_name, pop_index, score = match
    print('For name: %s, best match: %s, score %f' % (sample_name, pop_name, score))


For name: Nat G. Ahn, best match: Ahn, Nancy G., score 0.972639
For name: John Doe, best match: Doh, John, score 0.882667
For name: AJ Smith, best match: Smith, Adam Jr., score 0.863333
For name: Rob Smith, best match: Smith, Peter Robert, score 0.916250


### Additions to the code

Parameters can be changed, e.g. to reduce or eliminate the discount on a match between a first/middle initial and a first/middle name. In the example above, "Nat G. Ahn" was matched to "Ahn, Nancy G." though it should probably have matched "Ahn, Natalie Grace". That's because matching the middle initials "G." and "G." got a higher score than matching "G." to "Grace". If we change the parameter `disc_initial` to be closer to 1, the name we want becomes the best match.

In [100]:
# Move 'disc_initial' closer to 1 so that an initial matched against a full
# name is discounted less. This edits the existing matcher's params, so the
# value stays in effect for later cells until it is assigned again.
name_matcher.params['disc_initial'] = 0.9

In [101]:
# Repeat the lookup with the updated 'disc_initial' and print one line per
# sample. Each entry of `matches` unpacks into (name, index, score) and is
# paired with the sample name at the same position.
matches = name_matcher.find_closest_names(sample_names, pop_names)
for idx, match in enumerate(matches):
    sample_name = sample_names[idx]
    pop_name, pop_index, score = match
    print('For name: %s, best match: %s, score %f' % (sample_name, pop_name, score))


For name: Nat G. Ahn, best match: Ahn, Natalie Grace, score 0.981250
For name: John Doe, best match: Doh, John, score 0.882667
For name: AJ Smith, best match: Smith, Adam Jr., score 0.863333
For name: Rob Smith, best match: Smith, Peter Robert, score 0.916250


### Additions to the code


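We could also try reducing or eliminating the discount on an abbreviation (i.e. a shortened version of a first/middle name that's longer than an initial) so that "Nat" and "Natalie" are treated as a perfect or nearly perfect match (and the one-letter difference between "Nat" and "Nancy" drops that pair to second place). Note that the cell below also lowers `disc_initial` from 0.9 to 0.8, and with that combination the output still ranks "Ahn, Nancy G." first for "Nat G. Ahn" (score 0.976750), so the expected reordering does not happen in this run.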

In [102]:
# NOTE(review): two parameters change in this cell. 'disc_initial' is lowered
# from the 0.9 set earlier to 0.8, and 'disc_abbrev' is set to 0.99. With both
# changes applied, the recorded output of the next cell ranks "Ahn, Nancy G."
# first for "Nat G. Ahn" again.
name_matcher.params['disc_initial'] = 0.8
name_matcher.params['disc_abbrev'] = 0.99

In [103]:
# Repeat the lookup with the updated discount parameters and print one line
# per sample. Each entry of `matches` unpacks into (name, index, score) and is
# paired with the sample name at the same position.
matches = name_matcher.find_closest_names(sample_names, pop_names)
for idx, match in enumerate(matches):
    sample_name = sample_names[idx]
    pop_name, pop_index, score = match
    print('For name: %s, best match: %s, score %f' % (sample_name, pop_name, score))


For name: Nat G. Ahn, best match: Ahn, Nancy G., score 0.976750
For name: John Doe, best match: Doh, John, score 0.882667
For name: AJ Smith, best match: Smith, Adam Jr., score 0.870000
For name: Rob Smith, best match: Smith, Peter Robert, score 0.923250


In [104]:
# Second matcher, using Levenshtein distance, for the test names listed below.
# It is a new instance, so it presumably does not carry the params edits made
# to `name_matcher` above (confirm that params are stored per instance).
try_matcher = NameMatcher(distfun='levenshtein')

Katherine Ann Labrecque <br>
Mrs Katherine A Labrecque <br>
Kathy Labrecque <br>
Kathy Labrecque <br>

<br> <br>
Michael L Womack <br>
Michael Lynn Womack <br>
Mike Womack <br>
Michael Womack <br>

 <br> <br>
DEBORAH W O'BRIEN <br>
DEBORAH-KA ARTZ <br>
D.K. ARTZ <br>
DEBORAH ARTZ <br>

 <br> <br>
HARIRAM  GUPTA <br>
HARRY BAWEJA <br>
HARINDER SINGH BAWEJA <br>

In [105]:
# The second name adds a title ('Mrs') and shortens the middle name to an initial.
score = try_matcher.match_names('Katherine Ann Labrecque', 'Mrs Katherine A Labrecque')
print(score)

0.89


In [106]:
# Full first and middle names against a shortened first name ('Kathy') with no middle name.
score = try_matcher.match_names('Katherine Ann Labrecque', 'Kathy Labrecque')
print(score)

0.9526663931344486


In [107]:
# Middle name ('Lynn') against its initial ('L').
score = try_matcher.match_names('Michael Lynn Womack', 'Michael L Womack')
print(score)

0.9500000000000001


In [108]:
# Middle name present on one side and absent on the other.
score = try_matcher.match_names('Michael Lynn Womack', 'Michael Womack')
print(score)

0.9874999999999999


In [109]:
# Same first name but different surnames (O'BRIEN vs ARTZ), so a low score is expected.
score = try_matcher.match_names("DEBORAH W O'BRIEN", 'DEBORAH-KA ARTZ')
print(score)

0.46246333362799064


In [110]:
# Hyphenated first name ('DEBORAH-KA') against dotted initials ('D.K.').
score = try_matcher.match_names("DEBORAH-KA ARTZ", 'D.K. ARTZ')
print(score)

0.8925


In [111]:
# Hyphenated first name ('DEBORAH-KA') against its first part only ('DEBORAH').
score = try_matcher.match_names("DEBORAH-KA ARTZ", 'DEBORAH ARTZ')
print(score)

0.9874999999999999


In [112]:
# Both the first names and the surnames differ, so a low score is expected.
score = try_matcher.match_names("HARIRAM GUPTA", 'HARRY BAWEJA')
print(score)

0.5053610070785122


In [113]:
# Same surname; the first names differ (HARINDER vs HARRY) and only one side
# has a middle name.
score = try_matcher.match_names("HARINDER SINGH BAWEJA", 'HARRY BAWEJA')
print(score)

0.9245673292645868


In [114]:
# Variant form of a first name ('Abigail' vs 'Abbie'), same surname.
score = try_matcher.match_names('Abigail Timothy', 'Abbie Timothy')
print(score)

0.8956224582817963


In [115]:
# Full three-part name against its acronym ('KIC'): the score is very low.
score = try_matcher.match_names('Kenneth Irvine Chenault', 'KIC')
print(score)

0.18906015629507225


In [116]:
# Same first name with different surnames (JAIN vs GROVER): the score is very low.
# Note that the first literal has two spaces between the name parts.
score = try_matcher.match_names('NEENA  JAIN', 'NEENA GROVER')
print(score)

0.2294714728255649


In [117]:
# A name carrying a suffix ('Jr') against the same name without it.
score = try_matcher.match_names('Nick Jr', 'Nick')
print(score)

0.7050000000000001


In [118]:
# Two spellings of a single name ('Steven' / 'Stephen'), no surname given.
score = try_matcher.match_names('Steven', 'Stephen')
print(score)

0.6586917501586543


### See how these scores change after changing the weights

In [119]:
# Give a higher weight to suffixes by overwriting the third entry of the
# 'weights' list in place.
# NOTE(review): only this one entry is changed; the other entries are not
# rescaled. In the recorded outputs below, every re-run pair without a suffix
# scores exactly 0.41 higher than before this change, and 'Steven'/'Stephen'
# goes above 1.0, so the new scores are not comparable with the earlier ones.
# Consider rescaling all weights together; confirm against the namematcher
# documentation what the weights are expected to sum to.
try_matcher.params['weights'][2] = 0.5

In [120]:
# Re-run of the suffix case after the weight change.
score = try_matcher.match_names('Nick Jr', 'Nick')
print(score)

0.91


In [121]:
# Re-run of 'Steven' / 'Stephen' after the weight change.
# NOTE(review): the result is above 1.0 and exactly 0.41 higher than the
# earlier run with the same inputs, so the increase comes from the weight
# change rather than from better phonetic matching.
score = try_matcher.match_names('Steven', 'Stephen')
print(score)

1.0686917501586544


### Cases which are not working

In [122]:
# Re-run of the acronym case after increasing the weight: the score is still low.
score = try_matcher.match_names('Kenneth Irvine Chenault', 'KIC')
print(score)

0.5990601562950723


In [123]:
# Re-run of the same-first-name / different-surname case after increasing the
# weight: the score is still low.
score = try_matcher.match_names('NEENA  JAIN', 'NEENA GROVER')
print(score)

0.6394714728255649
