# setup

In [1]:
# dependencies
# general
import time
import pandas as pd

# string distance measuring (probably dont want to use all of these the whole time)
import stringdist
import fuzzywuzzy
from fuzzywuzzy import fuzz
import textdistance
import rapidfuzz
import jellyfish
import Levenshtein
import distance

In [2]:
# support methods
def timedapply(method, a, b):
    s = time.time()
    res = method(a, b)
    e = time.time()
    diff = round(e-s, 8)
    return res, diff


def getmethods():
    return {
    'edit': {
        'td_DamerauLevenshtein': textdistance.damerau_levenshtein.distance, 
        'td_Hamming': textdistance.hamming.distance, 
        'td_Jaro': textdistance.jaro.distance, 
        'td_JaroWinkler': textdistance.jaro_winkler.distance, 
        'td_Levenshtein': textdistance.levenshtein.distance,
        
        'L_Hamming': Levenshtein.hamming,
        'L_Jaro': Levenshtein.jaro,
        'L_JaroWinkler': Levenshtein.jaro_winkler,
        'L_Levenshtein': Levenshtein.distance,
                
        'jf_DamerauLevenshtein': jellyfish.damerau_levenshtein_distance,
        'jf_Jaro': jellyfish.jaro_similarity,
        'jf_Levenshtein': jellyfish.levenshtein_distance,
        
        'd_Jaccard': distance.jaccard,
        'd_Levenshtein': distance.levenshtein,
        
        'fw_Levenshtein': fuzzywuzzy.StringMatcher.distance,
        'rf_Levenshtein': rapidfuzz.distance.Levenshtein.distance,
    },
    'token': {
        'td_Bag': textdistance.bag.distance,
        'td_Cosine': textdistance.cosine.distance,
        'td_Jaccard': textdistance.jaccard.distance,
        
    },
    'sequence': {
        'td_LCSSeq': textdistance.lcsseq.distance,
        'd_LCSSeq': distance.lcsubstrings,
    },
    'compression': {
        'td_EntropyNCD': textdistance.entropy_ncd.distance,
    },
    'phonetic': {
        'td_MRA': textdistance.mra.distance,
    },
    'simple': {
        'td_Identity': textdistance.identity.distance,
    }
}

def applymethods(methods, pairs):
    assert 'line_a' in pairs.columns
    copy = pairs.copy()
    runtime = {}
    for lib, method in methods.items():
        print(f'running calculations for: {lib}')
        s = time.time()
        copy['tup'] = copy.apply(
            lambda row: timedapply(method, row.line_a, row.line_b), axis=1)
        copy[lib] = copy.tup.apply(lambda x: x[0])
        copy[f'{lib}_time'] = copy.tup.apply(lambda x: x[1])
        e = time.time()
        runtime[lib] = e-s
    copy.drop(columns='tup', inplace=True)
    return copy, runtime

In [3]:
# main
lines = pd.read_parquet("../output/lines.parquet")
pairs = pd.read_parquet("../output/linepairs.parquet")
testpairs = pairs.sample(10)

methods = {f'{disttype}_{libalg}': method
           for disttype, info in getmethods().items() 
           for libalg, method in info.items()}
testpairs, runtime = applymethods(methods, pairs)

# setup results as dataframe for easy review
methodf = pd.DataFrame({'library': methods.keys(), 'method': methods.values()})
runtidf = pd.DataFrame({'library': runtime.keys(), 'time': runtime.values()})
tests = pd.merge(methodf, runtidf, on='library')

running calculations for: edit_td_DamerauLevenshtein
running calculations for: edit_td_Hamming
running calculations for: edit_td_Jaro
running calculations for: edit_td_JaroWinkler
running calculations for: edit_td_Levenshtein
running calculations for: edit_L_Hamming
running calculations for: edit_L_Jaro
running calculations for: edit_L_JaroWinkler
running calculations for: edit_L_Levenshtein
running calculations for: edit_jf_DamerauLevenshtein
running calculations for: edit_jf_Jaro
running calculations for: edit_jf_Levenshtein
running calculations for: edit_d_Jaccard
running calculations for: edit_d_Levenshtein
running calculations for: edit_fw_Levenshtein
running calculations for: edit_rf_Levenshtein
running calculations for: token_td_Bag
running calculations for: token_td_Cosine
running calculations for: token_td_Jaccard
running calculations for: sequence_td_LCSSeq
running calculations for: sequence_d_LCSSeq
running calculations for: compression_td_EntropyNCD
running calculations for

In [4]:
#flat = []
#for disttype, info in methods.items():
#    for libalg, method in info.items():
#        lib, alg = libalg.split('_')
#        flat.append((disttype, lib, alg, libalg, method))
#tools = pd.DataFrame(flat, columns=['disttype', 'lib', 'alg', 'colname', 'method'])

# preview data

In [5]:
# get 1 random sample for testing
samp = pairs.sample()
a,b = samp.line_a.values[0], samp.line_b.values[0]

In [6]:
lines.sample(5)

Unnamed: 0,line,n_alpha,n_digit,hashid
1,C|oEhrN{veaNcK(RdBwlhitONjdJq_,26,0,9583bef4
78,okfUvKHhOeUv.KGLeSzKPG hRNwRufG irCnBfpIuMNXAj...,104,0,cce426d4
79,TbPAIIphrUPKMlTqhJ;UCD9dRXY PlS4chb jIULICXHbS...,47,3,dd508488
8,EaSeCOBcMPowQbRulRS aKI CoejztMYfSaLiXGM Lcq h...,78,0,7772ee55
28,ChaENXzb1pEA eLlIWMAlQopB+:,23,1,7058a3cd


In [7]:
pairs.head()

Unnamed: 0,line_a,line_b
0,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...
1,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...,C|oEhrN{veaNcK(RdBwlhitONjdJq_
2,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...,FbAbnjfgNuIzyqjivrDea4S xKLFPzCigoMkN CRKHtzP ...
3,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...,keqULtDbxzda.eF^pIUH]
4,QFRQAO'og NrbXBtibmCoqxkvHqV RXFoLZV PzEJEvVwh...,XJT2spY^PznKWu#gd(wdYJQ-


In [8]:
a

'TC fXvckzNFVnd)KdNJXx\x0ba dcoEMSP+'

In [9]:
b

'sqPOjxcRj~pHW}HGOFy HsLyruaJaM]DpEySup=OOzYAW PVQIWptfWcmqWUOk GDomqGifeTFVRhc T)'

# array of tests

In [10]:
testpairs.sample().T

Unnamed: 0,7446
line_a,TjTvLjItNHkIPNqYxuzFpac TavN qyK#b QqKHRCXWi^
line_b,NDTMCkmeWt hPDCSyYgb MjjtbQ oLdOKDDlFnBb lVGeN...
edit_td_DamerauLevenshtein,117
edit_td_DamerauLevenshtein_time,0.000002
edit_td_Hamming,132
edit_td_Hamming_time,0.000006
edit_td_Jaro,0.506278
edit_td_Jaro_time,0.000003
edit_td_JaroWinkler,0.506278
edit_td_JaroWinkler_time,0.000002


In [19]:
testpairs[['line_a', 'line_b', 'edit_L_Levenshtein', 'edit_L_Levenshtein_time']
         ].sort_values('edit_L_Levenshtein_time')

Unnamed: 0,line_a,line_b,edit_L_Levenshtein,edit_L_Levenshtein_time
9999,wQMVZ VbMSX:q YTqYDBY vBAlRS@hDHhKYMHAvx cHkEQ...,wQMVZ VbMSX:q YTqYDBY vBAlRS@hDHhKYMHAvx cHkEQ...,0,0.000000
2623,TDs'F\tablq#gWzQtkrW dB aWuxVNBGjuLydbuZI GyTs...,TC fXvckzNFVnd)KdNJXxa dcoEMSP+,91,0.000000
2626,TDs'F\tablq#gWzQtkrW dB aWuxVNBGjuLydbuZI GyTs...,TDs'F\tablq#gWzQtkrW dB aWuxVNBGjuLydbuZI GyTs...,0,0.000000
2628,TDs'F\tablq#gWzQtkrW dB aWuxVNBGjuLydbuZI GyTs...,ChaENXzb1pEA eLlIWMAlQopB+:,93,0.000000
6901,"n wpC,lo8dlqmSD>EqdXE CoFcXIBedZnhnhT EHEgBWLe...",C|oEhrN{veaNcK(RdBwlhitONjdJq_,103,0.000000
...,...,...,...,...
5002,oAUsQLdLpbZTA~vow IHEzbaA[PMGyvKsqHFNGHREHaLK ...,FbAbnjfgNuIzyqjivrDea4S xKLFPzCigoMkN CRKHtzP ...,155,0.000003
8534,gROHJSbDPNtqszclSNBnULlz ccaGQgxn KqoLvMeSbHmX...,"RgJwLq,acKIGhsARgDmdDJ oKQLDg XnNxVmsIXGhqHDhg...",153,0.000003
6602,cAAQArCFdvRmcXgfZq?tsDqEo ufcCcWx rdCWiBhSOruc...,FbAbnjfgNuIzyqjivrDea4S xKLFPzCigoMkN CRKHtzP ...,155,0.000003
8502,gROHJSbDPNtqszclSNBnULlz ccaGQgxn KqoLvMeSbHmX...,FbAbnjfgNuIzyqjivrDea4S xKLFPzCigoMkN CRKHtzP ...,160,0.000003


In [11]:
tests.sort_values('time')

Unnamed: 0,library,method,time
5,edit_L_Hamming,<function hamming at 0x14fe9c4a0>,0.058019
8,edit_L_Levenshtein,<function distance at 0x105f42de0>,0.066317
7,edit_L_JaroWinkler,<function jaro_winkler at 0x14ff5cae0>,0.067896
15,edit_rf_Levenshtein,<cyfunction distance at 0x14fecbc60>,0.068231
23,simple_td_Identity,<bound method BaseSimilarity.distance of Ident...,0.068679
6,edit_L_Jaro,<function jaro at 0x14ff5cc20>,0.06883
14,edit_fw_Levenshtein,<function distance at 0x105f42de0>,0.069656
0,edit_td_DamerauLevenshtein,<bound method Base.distance of DamerauLevensht...,0.079334
4,edit_td_Levenshtein,<bound method Base.distance of Levenshtein({'q...,0.079979
3,edit_td_JaroWinkler,<bound method BaseSimilarity.distance of JaroW...,0.085653
