# Installation
Run the next two cells if this is the first time you are running the notebook.

In [None]:
! pip install fuzzywuzzy

In [None]:
! pip install python-Levenshtein

# Beginning of Script

In [None]:
from fuzzywuzzy import fuzz
# See https://pypi.org/project/fuzzywuzzy/ for more information on fuzzywuzzy

def fuzzy_matching(first_string, second_string):
    """Print each fuzzywuzzy similarity score for a pair of strings.

    Scores the two strings with fuzz.ratio, fuzz.partial_ratio,
    fuzz.token_sort_ratio, fuzz.token_set_ratio and fuzz.WRatio, then
    prints one labelled line per score. Nothing is returned.

    Args:
        first_string: first string to compare.
        second_string: second string to compare.
    """
    labelled_scorers = (
        ('similarity ratio', fuzz.ratio),
        ('partial ratio', fuzz.partial_ratio),
        ('token sort ratio', fuzz.token_sort_ratio),
        ('token set ratio', fuzz.token_set_ratio),
        ('w_ratio:', fuzz.WRatio),
    )

    # Compute every score before printing anything, so the output lines
    # always appear together as one group.
    results = [
        (label, scorer(first_string, second_string))
        for label, scorer in labelled_scorers
    ]

    for label, score in results:
        print(label, score)

In [None]:
# Similarity ratio (fuzz.ratio) scores how alike the two complete strings are.
# The order of the words matters.
first_string = 'John F. Kennedy'
second_string = 'Jack Kennedy'
fuzzy_matching(first_string, second_string)

In [None]:
# Partial ratio (fuzz.partial_ratio) assesses the similarity of one string to any part of another.
# The order of the words matters.
# In this example the first string appears verbatim at the start of the second.
first_string = 'John F. Kennedy'
second_string = 'John F. Kennedy, Jr.'
fuzzy_matching(first_string, second_string)

In [None]:
# Word order does not matter in either token sort or token set ratio.
# Both strings contain the same words; only the position of 'Jorge' (and a comma) differs.
first_string = 'Jorge, Maria Luisa S. P.'
second_string = 'Maria Luisa S. P. Jorge'
fuzzy_matching(first_string, second_string)

In [None]:
# The same two words in swapped order: a plain name reversal.
# Compare the order-sensitive similarity ratio with the token-based ratios in the output.
first_string = 'Morgan Daniel'
second_string = 'Daniel Morgan'
fuzzy_matching(first_string, second_string)

In [None]:
# Token set ratio gives a score of 100 if all of the words in one string are included in the other.
# Order does not matter.
# Here every word of the second string also occurs in the first (ignoring punctuation).
first_string = 'Jorge, Maria Luisa S. P.'
second_string = 'Maria Luisa Jorge'
fuzzy_matching(first_string, second_string)

In [None]:
# WRatio (weighted ratio) is a good compromise between the other ratios.
# In this example 'Malu' does not appear as a word in the second string; only 'Jorge' is shared.
first_string = 'Malu Jorge'
second_string = 'Maria Luisa S. P. Jorge'
fuzzy_matching(first_string, second_string)

The following cells are excerpts from the Wikidata label/name matching script.

In [None]:
# Compare first names.
# I experimented with the different ratios and I think the plain fuzz.ratio might be best.
# NOTE(review): first_names and first_variants are presumably set by the originating script;
# they are not assigned in the cells shown here.
ratio = fuzz.ratio(first_names, first_variants)
# Alternative scorers that were tried, kept for reference:
#partial_ratio = fuzz.partial_ratio(first_names, first_variants)
#sort_ratio = fuzz.token_sort_ratio(first_names, first_variants)
#set_ratio = fuzz.token_set_ratio(first_names, first_variants)
print('name similarity ratio', ratio)
# Matching print statements for the alternative scorers above:
#print('partial ratio', partial_ratio)
#print('sort_ratio', sort_ratio)
#print('set_ratio', set_ratio)



In [None]:
# Perform a check based on author surnames and departments. See problems described in PubMed function
# This line scores the author's family name against the employee name with the token set ratio.
# NOTE(review): author and employee are presumably set by the originating script; confirm their structure there.
nameTestRatio = fuzz.token_set_ratio(author['familyName'], employee['name'])


In [None]:
# Empirically tested fuzzy token set ratios; may need adjustment based on performance in your situation.
# Previous value of previousUploadRatio, kept for reference (superseded by the assignment below):
#previousUploadRatio = 82 # similarity required to detect someone already known from another institutional department
previousUploadRatio = 88 # similarity required to detect someone already known from another institutional department
testRatio = 90 # similarity required for a potential match of a generic wikidata match
nameReversalRatio = 75 # secondary check of regular ratio when token set ratio is high to detect name reversals
confirmRatio = 95 # detections below this similarity level require human examination before accepting
departmentTestRatio = 90 # ratio required when a generic name similarity is crosschecked with dept name
# NOTE(review): variant_similarity_cutoff is not used in the cells shown here; presumably the
# minimum score for accepting a name variant -- confirm against the originating script.
variant_similarity_cutoff = 60


In [None]:
# Score the department's configured test affiliation string against this affiliation
# using the token set ratio (word order does not matter).
setRatio = fuzz.token_set_ratio(deptSettings[deptShortName]['testAuthorAffiliation'], affiliation)
print('Affiliation test: ', setRatio, affiliation)
# A score at or above departmentTestRatio is reported as an author/affiliation match.
if setRatio >= departmentTestRatio:
    print('*** Author/affiliation match!')
    

In [None]:
# Compute the regular (order-sensitive) ratio once; it is needed both for the
# confirmation test and for the name-reversal warning below.
name_ratio = fuzz.ratio(wikidata_name, employee_name)

# Ask for human confirmation when either the regular ratio is below the
# name-reversal threshold or the token set ratio is below the confirm threshold.
if name_ratio < nameReversalRatio or set_ratio < confirmRatio:
    print('Confirm possible Wikidata institutional download name match with employee ' + employee_name + ' (no ORCID) to ' + str(set_ratio) + ' ' + wikidata_name)

    # NOTE: There was a case where "Morgan Daniels" had a high match to "Daniel Morgan"
    # based on the fuzz token set ratio I'm using.
    # I've added a test for the regular fuzz ratio to try to detect name reversals.
    if name_ratio < nameReversalRatio:
        print('WARNING: Check for a name reversal')
        

The following cells are excerpts from publoader.

In [None]:
# Compare work['label'] with label using the weighted ratio (WRatio).
w_ratio = fuzz.WRatio(work['label'], label)

# Test for nearly exact title match
if w_ratio > settings['existing_work_fuzzy_match_cutoff']:
    pass  # placeholder: the handling of a match is not shown in this excerpt


In [None]:
if split_names['family'] == author['familyName']: # require exact match to family name
    # NOTE: despite the variable name, this is the plain fuzz.ratio of
    # "<givenName> <familyName>" against the researcher's English label;
    # the WRatio version is kept commented out below for reference.
    w_ratio = fuzz.ratio(author['givenName'] + ' ' + author['familyName'], researcher['label_en'])
    #w_ratio = fuzz.WRatio(author['givenName'] + ' ' + author['familyName'], researcher['label_en'])
    # Only a score above 90 counts as a match.
    if w_ratio > 90:
        found = True
        