# setup

In [1]:
# dependencies
# general
import time
import pandas as pd

# string distance measuring (probably dont want to use all of these the whole time)
import stringdist
import fuzzywuzzy
from fuzzywuzzy import fuzz
import textdistance
import rapidfuzz
import jellyfish
import Levenshtein
import distance

In [2]:
# support methods
def makepairs(coll):
    return [(l, r) for l in coll for r in coll]


def timedapply(method, a, b):
    s = time.time()
    res = method(a, b)
    e = time.time()
    diff = round(e-s, 8)
    return res, diff


def getmethods():
    return {
        'td_DamerauLevenshtein': textdistance.damerau_levenshtein.distance, 
        'td_Hamming': textdistance.hamming.distance, 
        'td_Jaro': textdistance.jaro.distance, 
        'td_JaroWinkler': textdistance.jaro_winkler.distance, 
        'td_Levenshtein': textdistance.levenshtein.distance,
        
        'L_Hamming': Levenshtein.hamming,
        'L_Jaro': Levenshtein.jaro,
        'L_JaroWinkler': Levenshtein.jaro_winkler,
        'L_Levenshtein': Levenshtein.distance,
                
        'jf_DamerauLevenshtein': jellyfish.damerau_levenshtein_distance,
        'jf_Jaro': jellyfish.jaro_similarity,
        'jf_Levenshtein': jellyfish.levenshtein_distance,
        
        'd_Jaccard': distance.jaccard,
        'd_Levenshtein': distance.levenshtein,
        
        'fw_Levenshtein': fuzzywuzzy.StringMatcher.distance,
        'rf_Levenshtein': rapidfuzz.distance.Levenshtein.distance,
}

def applymethods(methods, pairs):
    assert 'line_a' in pairs.columns
    copy = pairs.copy()
    runtime = {}
    for lib, method in methods.items():
        print(f'running calculations for: {lib}')
        s = time.time()
        copy['tup'] = copy.apply(
            lambda row: timedapply(method, row.line_a, row.line_b), axis=1)
        copy[lib] = copy.tup.apply(lambda x: x[0])
        copy[f'{lib}_time'] = copy.tup.apply(lambda x: x[1])
        e = time.time()
        runtime[lib] = e-s
    copy.drop(columns='tup', inplace=True)
    return copy, runtime

In [3]:
# main
lines = pd.read_parquet("../output/lines.parquet")
pairs = pd.DataFrame(makepairs(lines.line.values), columns=['line_a', 'line_b'])
testpairs = pairs.sample(10)

methods = getmethods()
testpairs, runtime = applymethods(methods, pairs)

# setup results as dataframe for easy review
methodf = pd.DataFrame({'library': methods.keys(), 'method': methods.values()})
runtidf = pd.DataFrame({'library': runtime.keys(), 'time': runtime.values()})
tests = pd.merge(methodf, runtidf, on='library')

running calculations for: td_DamerauLevenshtein
running calculations for: td_Hamming
running calculations for: td_Jaro
running calculations for: td_JaroWinkler
running calculations for: td_Levenshtein
running calculations for: L_Hamming
running calculations for: L_Jaro
running calculations for: L_JaroWinkler
running calculations for: L_Levenshtein
running calculations for: jf_DamerauLevenshtein
running calculations for: jf_Jaro
running calculations for: jf_Levenshtein
running calculations for: d_Jaccard
running calculations for: d_Levenshtein
running calculations for: fw_Levenshtein
running calculations for: rf_Levenshtein


In [4]:
lines.head(5)

Unnamed: 0,line,n_alpha,n_digit,hashid
0,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...,110,1,d84005bf
1,C|oEhrN{veaNcK(RdBwlhitONjdJq_,26,0,9583bef4
2,FbAbnjfgNuIzyqjivrDea4S xKLFPzCigoMkN CRKHtzP ...,157,1,b93245cb
3,keqULtDbxzda.eF^pIUH],18,0,30e2223c
4,XJT2spY^PznKWu#gd(wdYJQ-,19,1,1fd0cfd2


In [5]:
pairs.head()

Unnamed: 0,line_a,line_b
0,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...
1,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...,C|oEhrN{veaNcK(RdBwlhitONjdJq_
2,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...,FbAbnjfgNuIzyqjivrDea4S xKLFPzCigoMkN CRKHtzP ...
3,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...,keqULtDbxzda.eF^pIUH]
4,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...,XJT2spY^PznKWu#gd(wdYJQ-


# preview data

In [6]:
samp = pairs.sample()
a,b = samp.line_a.values[0], samp.line_b.values[0]

In [7]:
a

'fyAs oUQapAZgTZxFvPuFh FmBZqBCgjgTvb Jis K,uXxkYImYRBJvKvA]jlnu oaCIAPVb;EnuJmwZs WNjwX8jFBO%BFXowvCHHy FLtm]'

In [8]:
b

'p ZqkVwLAEbuxIr MfllcIwdYYubpHVVhRhb dF PxwrBdHdxbr BkRsCEwxkAFWjTOKjQe ygEFoAUyQ?yHTgCsxn.'

# `stringdist`
From the [docs](https://pypi.org/project/StringDist/):
> This package provides the stringdist module, which includes functions for calculating raw and normalized versions of the following string distance measurements:
> - Levenshtein distance
> - Restricted Damerau-Levenshtein distance (a.k.a. optimal string alignment distance)

- latest release looks like May 2017
- of the three string distance libraries we'll review today, this one looks to have the oldest current release
- Levenshtein and a modified Levenshtein distance

In [9]:
stringdist.levenshtein(a,b)

  stringdist.levenshtein(a,b)


99

# `fuzzywuzzy`
> "Fuzzy string matching like a boss. It uses Levenshtein Distance to calculate the differences between sequences in a simple-to-use package." - [docs](https://pypi.org/project/fuzzywuzzy/)

- latest release looks like Feb 2020

In [10]:
fuzzywuzzy.StringMatcher.distance(a,b)

99

# `textdistance`
> "TextDistance -- python library for comparing distance between two or more sequences by many algorithms." - [docs](https://pypi.org/project/textdistance/)
- of the three string distance libraries covered, this one appears to be the latest
- it also appears to include the most alternative algorithms, 30+ options including Hamming and Jaro-Winkler for edit-based distances, and several other types of distance metrics.

- latest release looks like Sep 2023

In [11]:
textdistance.levenshtein(a,b)

99

# `rapidfuzz`
> "RapidFuzz is a fast string matching library for Python and C++, which is using the string similarity calculations from FuzzyWuzzy. " - [docs](https://pypi.org/project/rapidfuzz/)

- latest release looks like Dec 2023

In [12]:
rapidfuzz.distance.Levenshtein.distance(a,b)

99

# `jellyfish`
> "jellyfish is a library for approximate & phonetic matching of strings." - [docs](https://pypi.org/project/jellyfish/)

- latest release looks like Nov 2023

In [13]:
jellyfish.levenshtein_distance(a,b)

99

# `Levenshtein`
From the [docs](https://pypi.org/project/Levenshtein/): 
> "The Levenshtein Python C extension module contains functions for fast computation of:
> 
> - Levenshtein (edit) distance, and edit operations
> - string similarity
> - approximate median strings, and generally string averaging
> - string sequence and set similarity"

- latest release looks like Oct 2023

In [14]:
Levenshtein.distance(a,b)

99

# `distance`
> "This package provides helpers for computing similarities between arbitrary sequences. Included metrics are Levenshtein, Hamming, Jaccard, and Sorensen distance, plus some bonuses." - [docs](https://pypi.org/project/distance/)

- latest release looks like Nov 2013

In [15]:
distance.levenshtein(a,b)

99

# array of tests

In [16]:
testpairs.sample().T

Unnamed: 0,9159
line_a,ep mZhYHCeAHVoQS gaNAzbOtcfNi bwwRjQoR#taAWUY ...
line_b,ZGYpqAhHZuczPByvflG wWKwzjhCHk1qSHA lEKyyDDJWQ...
td_DamerauLevenshtein,106
td_DamerauLevenshtein_time,0.000003
td_Hamming,123
td_Hamming_time,0.000006
td_Jaro,0.526071
td_Jaro_time,0.000003
td_JaroWinkler,0.526071
td_JaroWinkler_time,0.000003


In [17]:
tests.sort_values('time')

Unnamed: 0,library,method,time
5,L_Hamming,<function hamming at 0x15a098540>,0.058469
8,L_Levenshtein,<function distance at 0x11072bce0>,0.067026
15,rf_Levenshtein,<cyfunction distance at 0x15a0cfc60>,0.068124
7,L_JaroWinkler,<function jaro_winkler at 0x15a15cb80>,0.068476
6,L_Jaro,<function jaro at 0x15a15ccc0>,0.068923
14,fw_Levenshtein,<function distance at 0x11072bce0>,0.068946
4,td_Levenshtein,<bound method Base.distance of Levenshtein({'q...,0.079485
0,td_DamerauLevenshtein,<bound method Base.distance of DamerauLevensht...,0.080533
2,td_Jaro,<bound method BaseSimilarity.distance of Jaro(...,0.084891
3,td_JaroWinkler,<bound method BaseSimilarity.distance of JaroW...,0.086844
