In [1]:
pip install fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [2]:
from fuzzywuzzy import fuzz,process

# Fuzzy string matching finds strings that approximately match a given pattern.
# fuzzywuzzy uses the Levenshtein distance to measure the difference between two sequences.



### Compare entire string in order

In [3]:
# fuzz.ratio compares the two strings in full and returns a similarity score.
# These strings are close but not identical, so the score is 79 (a similarity
# measure, not an accuracy).
fuzz.ratio('this is a coding', 'this is code')

79

In [4]:
# The two strings are identical, so fuzz.ratio returns the top score of 100.
fuzz.ratio('this is code', 'this is code')

100

### Partial Ratio: Compares subsections of the string

In [5]:
# The first string appears in full inside the longer second string, so the
# best-matching substring is an exact match and partial_ratio returns 100.
# (Comparing two identical strings, as before, would not show partial matching.)
fuzz.partial_ratio('this is a code', 'this is a code that compiles')

100

In [6]:
# 'meet' and 'meat' differ in one letter out of four, so the strings are
# scored as a 75% match.
fuzz.partial_ratio('meet','meat')

75

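### Token Sort Ratio (`token_sort_ratio`): ignores word order by sorting the tokens before comparing
###### - duplicate words are still counted (it is `token_set_ratio` that ignores duplicates)
###### - the cells below print both strings and their score, first with `ratio`, then with `token_sort_ratio`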

In [7]:
# Paired example sentences: eg1[k] is compared against eg2[k] in the cells below.
eg1 = [
    'he eats apples',
    'did he eat apples',
]
eg2 = [
    'apples he eats',
    'he did eat apples and bananas',
]

In [8]:
# Plain fuzz.ratio is order-sensitive: the first pair holds the same words in a
# different order, yet it scores only 50 rather than 100.

for left, right in zip(eg1, eg2):
    similarity = fuzz.ratio(left, right)
    print('score:{} :{} = {}'.format(similarity, left, right))

score:50 :he eats apples = apples he eats
score:61 :did he eat apples = he did eat apples and bananas


In [9]:
# fuzz.token_sort_ratio disregards word order, so the first pair
# (same words, different order) is now scored as a 100% match.

for left, right in zip(eg1, eg2):
    similarity = fuzz.token_sort_ratio(left, right)
    print('score:{} :{} = {}'.format(similarity, left, right))

score:100 :he eats apples = apples he eats
score:74 :did he eat apples = he did eat apples and bananas


#### process.extract
##### Set a limit on how many matches to return
##### Specify which scorer (ratio function) to use

In [10]:
# Show the candidate strings that process.extract searches in the cells below.
eg1

['he eats apples', 'did he eat apples']

In [11]:
# limit=1 returns only the single best-scoring match from eg1 as a
# (choice, score) tuple; it does not restrict the search to the first string.
# No scorer is passed, so the library's default scorer is used.
process.extract('apples',eg1,limit=1)

[('he eats apples', 90)]

In [12]:
# limit=2 returns the two best-scoring matches from eg1, each as a
# (choice, score) tuple, rather than comparing against the first two strings.
process.extract('apples',eg1,limit=2)

[('he eats apples', 90), ('did he eat apples', 90)]

In [13]:
# scorer=fuzz.ratio compares the query against each full choice string, so the
# short query 'apples' scores lower against the longer sentences.
process.extract('apples',eg1,scorer=fuzz.ratio)

[('he eats apples', 60), ('did he eat apples', 52)]

In [14]:
# scorer=fuzz.partial_ratio scores the best-matching part of each choice;
# 'apples' occurs verbatim in both sentences, so both score 100.
process.extract('apples',eg1,scorer=fuzz.partial_ratio)

[('he eats apples', 100), ('did he eat apples', 100)]