In [None]:
# To run all the code in this file, install the third-party packages below with pip.
# (difflib is part of the Python standard library and needs no installation.)
# pip install diff-match-patch
# pip install distance
# pip install jellyfish
# pip install fuzzywuzzy
# pip install nltk

In [None]:
## 1. Use SequenceMatcher ##

In [1]:
# a .ratio() value over 0.6 means the sequences are close matches

from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib similarity ratio between two sequences.

    The comparison is case-sensitive and applies no junk filter. The result
    is a float between 0 and 1, where 1.0 means the sequences are identical.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [19]:
similar("dhl","DHL")

0.0

In [4]:
similar("Apple","Aphjk")

0.4

In [None]:
## 2. Use diff-match-patch ##
# The built-in SequenceMatcher is very slow on large inputs; here is how the same comparison can be done with diff-match-patch.

In [3]:
from diff_match_patch import diff_match_patch

def compute_similarity_and_diff(text1, text2):
    """Diff two strings and score how much of the longer one they share.

    Args:
        text1: The first string.
        text2: The second string.

    Returns:
        A ``(similarity, diff)`` tuple. ``similarity`` is the number of
        characters in unchanged diff segments divided by the length of the
        longer input; two empty inputs score 1.0. ``diff`` is the list of
        ``(op, text)`` tuples returned by ``diff_main``.
    """
    dmp = diff_match_patch()
    # A timeout of 0 lets the diff run until it finishes instead of cutting
    # it short after a time budget.
    dmp.Diff_Timeout = 0.0
    diff = dmp.diff_main(text1, text2, False)

    longest = max(len(text1), len(text2))
    if longest == 0:
        # Both inputs are empty: they are identical, and dividing by the
        # length would raise ZeroDivisionError.
        return 1.0, diff

    # Op code 0 marks segments present in both texts.
    common = sum(len(txt) for op, txt in diff if op == 0)
    return common / longest, diff

In [23]:
compute_similarity_and_diff("kpmg","kpmg deutschland")

(0.25, [(0, 'kpmg'), (1, ' deutschland')])

In [43]:
similar('test similarity', 'tets simileritz')

0.8

In [39]:
## 3. Use Distance Package ##
# This package includes Levenshtein distance.

In [28]:
import distance
distance.levenshtein("kpmg","kmpg")

2

In [7]:
## 4. Use jellyfish Library ##
# This library supports:
# - Levenshtein Distance
# - Damerau-Levenshtein Distance
# - Jaro Distance
# - Jaro-Winkler Distance
# - Match Rating Approach Comparison
# - Hamming Distance

import jellyfish

In [70]:
jellyfish.levenshtein_distance(u'jellyfish', u'jellyfihs')

2

In [47]:
jellyfish.damerau_levenshtein_distance(u'jellyfish', u'jellyfihs')

1

In [22]:
# jaro_distance is the deprecated name for this function; current jellyfish
# releases expose it as jaro_similarity.
jellyfish.jaro_similarity("dhl 22","dhl")

0.8333333333333334

In [16]:
# jaro_winkler is the deprecated name for this function; current jellyfish
# releases expose it as jaro_winkler_similarity.
jellyfish.jaro_winkler_similarity("us","kpmg")

0.5277777777777778

In [54]:
jellyfish.hamming_distance(u'jellyfish', u'smellyfish')

9

In [12]:
jellyfish.match_rating_comparison("kpmg","kpmg deutschland")

True

In [21]:
## 5. Use fuzzywuzzy library ##
# This package uses Levenshtein distance to compute string similarity scores in Python.

In [29]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [36]:
fuzz.ratio("siemens", "siemens de")

82

In [41]:
fuzz.partial_ratio("siemens", "siemens gmbh")

100

In [27]:
fuzz.ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")

91

In [28]:
fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")

100

In [40]:
fuzz.token_sort_ratio("siemens", "siemens germany")

64

In [31]:
fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")

100

In [71]:
choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
process.extract("new york jets", choices, limit=2)

[('New York Jets', 100), ('New York Giants', 79)]

In [36]:
process.extractOne("cowboys", choices)

('Dallas Cowboys', 90)

In [None]:
## 6. Use NLTK library ##

In [1]:
import nltk
# A notebook cell displays only the value of its last expression, so the
# first three distances below are computed but never shown; the output
# belongs to the final call.
nltk.edit_distance("jindun", "jinduujhu")
nltk.edit_distance("jindun", "jindlkuuu")
nltk.edit_distance("jindun", "jinduu")
nltk.edit_distance("jindun", "jinduumn")

2