# setup

In [1]:
# dependencies
# general
import time
import pandas as pd

# string distance measuring (probably dont want to use all of these the whole time)
import stringdist
import fuzzywuzzy
from fuzzywuzzy import fuzz
import textdistance
import rapidfuzz
import jellyfish
import Levenshtein
import distance

In [2]:
# support methods
def makepairs(coll):
    return [(l, r) for l in coll for r in coll]


def timedapply(method, a, b):
    s = time.time()
    res = method(a, b)
    e = time.time()
    diff = round(e-s, 8)
    return res, diff


def getmethods():
    return {
    'stringdist': stringdist.levenshtein,
    'fuzzywuzzy': fuzzywuzzy.StringMatcher.distance,
    'textdistance': textdistance.levenshtein,
    'rapidfuzz': rapidfuzz.distance.Levenshtein.distance,
    'jellyfish': jellyfish.levenshtein_distance,
    'Levenshtein': Levenshtein.distance,
    'distance': distance.levenshtein,
}

def applymethods(methods, pairs):
    assert 'line_a' in pairs.columns
    copy = pairs.copy()
    runtime = {}
    for lib, method in methods.items():
        s = time.time()
        copy['tup'] = copy.apply(
            lambda row: timedapply(method, row.line_a, row.line_b), axis=1)
        copy[lib] = copy.tup.apply(lambda x: x[0])
        copy[f'{lib}_time'] = copy.tup.apply(lambda x: x[1])
        e = time.time()
        runtime[lib] = e-s
    copy.drop(columns='tup', inplace=True)
    return copy, runtime

In [3]:
# main
lines = pd.read_parquet("../output/lines.parquet")
pairs = pd.DataFrame(makepairs(lines.line.values), columns=['line_a', 'line_b'])
testpairs = pairs.sample(10)

methods = getmethods()
testpairs, runtime = applymethods(methods, pairs)

# setup results as dataframe for easy review
methodf = pd.DataFrame({'library': methods.keys(), 'method': methods.values()})
runtidf = pd.DataFrame({'library': runtime.keys(), 'time': runtime.values()})
tests = pd.merge(methodf, runtidf, on='library')

  res = method(a, b)


# preview data

In [4]:
# get 1 random sample for testing
samp = pairs.sample()
a,b = samp.line_a.values[0], samp.line_b.values[0]

In [5]:
lines.sample(5)

Unnamed: 0,line,n_alpha,n_digit,hashid
74,TjTvLjItNHkIPNqYxuzFpac TavN qyK#b QqKHRCXWi^,40,0,de686c5e
41,vUkaDuUEjVP ZwooEShjGUSpIKCnkU WdJtngFicXrtyoi...,134,1,b2187163
35,QbGc OmKTvIiuX eNu#HOmutLVEbN AjqYdLJRFcSytYFh...,46,0,59da193b
13,1kbfwvh QCJFrMgNzXQ4CyK btoVopM(piMPfm4DTm h l...,104,4,ce960aed
99,wQMVZ VbMSX:q YTqYDBY vBAlRS@hDHhKYMHAvx cHkEQ...,122,0,06a48faa


In [6]:
pairs.head()

Unnamed: 0,line_a,line_b
0,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...
1,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...,C|oEhrN{veaNcK(RdBwlhitONjdJq_
2,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...,FbAbnjfgNuIzyqjivrDea4S xKLFPzCigoMkN CRKHtzP ...
3,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...,keqULtDbxzda.eF^pIUH]
4,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...,XJT2spY^PznKWu#gd(wdYJQ-


In [7]:
a

'qnxl|-VRQY^rbjKMWYZUASkmn ZGeQSfEGhPSbCRzSiINUp Exwy FqYtXVrDVuEEq@'

In [8]:
b

'eIucXURvJPnJTI %"gcsCidOMwQxJxLhZtP lzvOGmgwbWXshSf mYSgd OaBaOZJomtLwMoAtZtJh aVviBbiyEJxWQkR ZeMtLzR4dxJys2IpzFNd wPAFkFPBjftUhzqh?'

# array of tests

In [9]:
testpairs.sample().T

Unnamed: 0,9000
line_a,"eIucXURvJPnJTI %""gcsCidOMwQxJxLhZtP lzvOGmgwbW..."
line_b,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...
stringdist,124
stringdist_time,0.000088
fuzzywuzzy,124
fuzzywuzzy_time,0.000001
textdistance,124
textdistance_time,0.000003
rapidfuzz,124
rapidfuzz_time,0.000002


In [10]:
tests.sort_values('time')

Unnamed: 0,library,method,time
1,fuzzywuzzy,<function distance at 0x106943ce0>,0.06554
3,rapidfuzz,<cyfunction distance at 0x1377fbc60>,0.065566
5,Levenshtein,<function distance at 0x106943ce0>,0.070346
2,textdistance,"Levenshtein({'qval': 1, 'test_func': <function...",0.078445
0,stringdist,<built-in function levenshtein>,0.394529
4,jellyfish,<built-in function levenshtein_distance>,0.478058
6,distance,<function levenshtein at 0x13790c9a0>,15.097588
