# Fuzzy Matching Algorithm Development

In [79]:
from fuzzywuzzy import fuzz
import re
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 150)

In [80]:
name1 = "111 Invitation LLC"
name2 = "110 Invitation LLC"
fuzz.ratio(name1, name2)

94

In [81]:
def match(name1, name2):
    if (re.findall(r'\d+', name1) != re.findall(r'\d+', name2)):
        return 0
    else:
        return fuzz.ratio(name1, name2)

name1 = "111 ADdress Invitation LLC"
name2 = "110 Invitation LLC"

match(name1, name2)

0

In [82]:
sample = pd.read_excel('../data/digest/parcel_atl_2011-2012.xlsx', nrows=100000)

In [83]:
sample_100K = sample.head(100000)

In [85]:
unique_zips = sample_100K['Zip1'].unique()

In [89]:
sample_100K[sample_100K['Zip1'] == 0].shape

(30, 122)

In [86]:
unique_zips[0:50]

array([43107, 30281, 36303, 30306, 30354, 30331, 30315, 30247, 55419,
       55409, 30328, 23030, 30075, 80906, 30062, 30031, 27102, 30141,
       30342, 30531, 30028, 30302, 30519, 30294, 92658, 33330, 30319,
       30067, 30288, 30339, 30214, 30321, 20722, 30316, 75265, 30350,
       30034, 30327, 30344, 30296, 30068, 30052, 0, 30236, 30324, 30329,
       14213, 30349, 31106, 30101], dtype=object)

In [91]:
sample_100K['match'] = np.chararray((len(sample_100K), 1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_100K['match'] = np.chararray((len(sample_100K), 1))


In [92]:
sample_100K['match_score'] = np.chararray((len(sample_100K), 1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_100K['match_score'] = np.chararray((len(sample_100K), 1))


In [93]:
sample_100K[['Parid', 'match', 'match_score']]

Unnamed: 0,Parid,match,match_score
0,14 0001 LL0019,b'\x90',b'\xb0'
1,14 0001 LL0019,b'\x1d',b'\xef'
2,14 0001 LL0027,b'\xa9',"b""'"""
3,14 0001 LL0027,b'\x12',b'\x11'
4,14 0001 LL0068,b'\xfb',b'\xfb'
...,...,...,...
99995,14 011300060678,b'',b' '
99996,14 011300060686,b'',b' '
99997,14 011300060686,b'',b' '
99998,14 011300060694,b'',b' '


In [94]:
sample_100K['owner_info'] = sample_100K['Own1'].astype(pd.StringDtype()) + sample_100K['Owner Adrstr'].astype(pd.StringDtype()) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_100K['owner_info'] = sample_100K['Own1'].astype(pd.StringDtype()) + sample_100K['Owner Adrstr'].astype(pd.StringDtype())


In [95]:
pd.set_option('display.max_colwidth', None)

In [96]:
sample_100K['owner_info'].head(5)

0     JONESBORO SOUTH INVESTORS INCP O BOX 60
1     JONESBORO SOUTH INVESTORS INCP O BOX 60
2    JONESBORO SOUTH INVESTORS INCP.O. BOX 60
3    JONESBORO SOUTH INVESTORS INCP.O. BOX 60
4                     COKER WILLIAM D SRHENRY
Name: owner_info, dtype: string

In [97]:
init = len(sample_100K.index)
sample_100K = sample_100K[(sample_100K['Owner Adrno'] != None) | (sample_100K['Owner Adrno'] != 'NaN')]
init - len(sample_100K.index)

0

In [98]:
sample_100K['Owner Adrno']

0           NaN
1           NaN
2           NaN
3           NaN
4        4400.0
          ...  
99995      95.0
99996     635.0
99997     635.0
99998     561.0
99999     561.0
Name: Owner Adrno, Length: 100000, dtype: float64

In [100]:
sample_100K['owner_info'] = sample_100K['owner_info'].astype('string')

In [101]:
sample_100K['owner_info'] = sample_100K['owner_info'].fillna('NA')

In [102]:
sample_100K.loc[sample_100K['Owner Adrno'] == float('nan')] = -1

In [113]:
def match(own_info, own_adrno, zip_bucket):
    if zip_bucket is None:
        zip_bucket[(own_info, own_adrno)] = []
        return

    match_score = 0
    highest_key = None

    for info, adrno in zip_bucket.keys():
        #if (re.findall(r'\d+', name1) != re.findall(r'\d+', key)):
            #continue
        #else:
        if (own_adrno != float('nan') or adrno != float('nan')):
            if (adrno != own_adrno) or own_info is None:
                continue

        ratio = fuzz.ratio(own_info, info)
        if ratio > match_score:
            match_score = ratio
            highest_key = (info, adrno)
    
    if match_score > 85:
        zip_bucket[highest_key] = zip_bucket[highest_key] + [(own_info, own_adrno), match_score]
    else:
        zip_bucket[(own_info, own_adrno)] = []
    return

In [114]:
name_buckets = {}
matches = {}

unique = len(sample_100K.index)
tot_count = 0

for index,zipcode in enumerate(unique_zips):
    print(zipcode)
    print("Percent: ", tot_count/unique * 100)
    count = len(sample_100K[sample_100K['Zip1'] == zipcode])
    print("Count of Zip: ", count)
    tot_count = tot_count + count
    print("")

    zip_bucket = {}
    sample_100K[sample_100K['Zip1'] == zipcode][['owner_info', 'Owner Adrno']].apply(lambda x : match(*x, zip_bucket), axis=1)
    #sub_zip = sample_100K[sample_100K['Zip1'] == zipcode]
    #sub_zip['owner_info'].apply(
    #    lambda x : match(sub_zip['owner_info'], sub_zip['owner_info'], zip_bucket))
    matches.update(zip_bucket)

43107
Percent:  0.0
Count of Zip:  18

30281
Percent:  0.018000000000000002
Count of Zip:  454

36303
Percent:  0.47200000000000003
Count of Zip:  3

30306
Percent:  0.475
Count of Zip:  4081

30354
Percent:  4.556
Count of Zip:  2891

30331
Percent:  7.446999999999999
Count of Zip:  687

30315
Percent:  8.134
Count of Zip:  11821

30247
Percent:  19.955000000000002
Count of Zip:  4

55419
Percent:  19.959
Count of Zip:  3

55409
Percent:  19.962
Count of Zip:  2

30328
Percent:  19.964000000000002
Count of Zip:  638

23030
Percent:  20.602
Count of Zip:  56

30075
Percent:  20.658
Count of Zip:  243

80906
Percent:  20.901
Count of Zip:  1

30062
Percent:  20.902
Count of Zip:  200

30031
Percent:  21.102
Count of Zip:  72

27102
Percent:  21.174
Count of Zip:  74

30141
Percent:  21.248
Count of Zip:  35

30342
Percent:  21.282999999999998
Count of Zip:  547

30531
Percent:  21.83
Count of Zip:  38

30028
Percent:  21.868000000000002
Count of Zip:  36

30302
Percent:  21.904
Count of

In [115]:
matches

{('JONESBORO SOUTH INVESTORS INCP O BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP O BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP.O. BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP.O. BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP O BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP O BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP O BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP O BOX 60', nan): [],
 ('GEORGIA SHOW PROMOTIONS INCP.O. BOX 60', nan): [],
 ('GEORGIA SHOW PROMOTIONS INCP.O. BOX 60', nan): [],
 ('GEORGIA SHOW PROMOTIONS INCP. O. BOX 60', nan): [],
 ('GEORGIA SHOW PROMOTIONS INCP. O. BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP O BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP O BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP.O. BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP.O. BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP.O. BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP.O. BOX 60', nan): [],
 ('COK

Matches

In [41]:
low_matches = []

for key in matches.keys():
    if len(matches[key]) != 0 and matches[key][1]:
        print(key, '\n', matches[key])
        print("")

('COKER WILLIAM D SRHENRY', 4400.0) 
 [('COKER WILLIAM D SRHENRY', 4400.0), 100, ('COKER WILLIAM DHENRY', 4400.0), 93, ('COKER WILLIAM DHENRY', 4400.0), 93, ('COKER WILLIAM DHENRY', 4400.0), 93, ('COKER WILLIAM DHENRY', 4400.0), 93]

('SMITH STEPHANIEEAGLES LANDING', 909.0) 
 [('SMITH STEPHANIEEAGLES LANDING', 909.0), 100]

('ELLENWOOD INVESTMENTS LLCNORTHMILL', 180.0) 
 [('ELLENWOOD INVESTMENTS LLCNORTHMILL', 180.0), 100]

('GAITHER MALINDACLARKDELL', 223.0) 
 [('GAITHER MALINDACLARKDELL', 223.0), 100, ('GAITHER MALINDACLARKDELL', 223.0), 100, ('GAITHER MALINDACLARKDELL', 223.0), 100]

('DAVIS PARTNERS L L L PUNION CHURCH', 2697.0) 
 [('DAVIS PARTNERS L L L PUNION CHURCH', 2697.0), 100, ('DAVIS PARTNERS L L L PUNION CHURCH', 2697.0), 100, ('DAVIS PARTNERS L L L PUNION CHURCH', 2697.0), 100, ('DAVIS PARTNERS LLLPUNION CHURCH', 2697.0), 95, ('DAVIS PARTNERS LLLPUNION CHURCH', 2697.0), 95]

('GAITHER MALINDA & MICHAELCLARKDELL', 223.0) 
 [('GAITHER MALINDA & MICHAELCLARKDELL', 223.0), 10

In [25]:
name_buckets

{('JONESBORO SOUTH INVESTORS INCP O BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP O BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP.O. BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP.O. BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP O BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP O BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP O BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP O BOX 60', nan): [],
 ('GEORGIA SHOW PROMOTIONS INCP.O. BOX 60', nan): [],
 ('GEORGIA SHOW PROMOTIONS INCP.O. BOX 60', nan): [],
 ('GEORGIA SHOW PROMOTIONS INCP. O. BOX 60', nan): [],
 ('GEORGIA SHOW PROMOTIONS INCP. O. BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP O BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP O BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP.O. BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP.O. BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP.O. BOX 60', nan): [],
 ('JONESBORO SOUTH INVESTORS INCP.O. BOX 60', nan): [],
 ('COK

In [26]:
len(name_buckets)

45264

Almost matches

In [27]:
for key in name_buckets.keys():
    if len(name_buckets[key]) != 0 and name_buckets[key][1] < 90:
        print(key, '\n', name_buckets[key])
        print("")


('REESE MICHAEL & HASTINEBRUNSWICK', 485.0) 
 [('REESE MICHAEL KBRUNSWICK', 485.0), 82, ('REESE MICHAEL KBRUNSWICK', 485.0), 82]

('COKER WILLIAM DHENRY', 4400.0) 
 [('COKER WILLIAM DOUGLASN HENRY', 4400.0), 83, ('COKER WILLIAM DOUGLASN HENRY', 4400.0), 83]

('BOWEN HARRIS DEIDRA ACARRIAGE LAKE', 26.0) 
 [('HARRIS JEFFREY L &CARRIAGE LAKE', 26.0), 71, ('HARRIS JEFFREY L &CARRIAGE LAKE', 26.0), 71, ('HARRIS JEFFREY L & DEIDRA BCARRIAGE LAKE', 26.0), 73, ('HARRIS JEFFREY L & DEIDRA BCARRIAGE LAKE', 26.0), 73]

('KHAYAT PHILLIP & SUSANZIMMER', 1219.0) 
 [('KHAYAT SUSANZIMMER', 1219.0), 78, ('KHAYAT SUSANZIMMER', 1219.0), 78]

('FELLMAN FAMILY INVESTMENTS LLCPONCE DE LEON', 931.0) 
 [('IVANHOE INVESTMENTS LLCPONCE DE LEON', 931.0), 78, ('IVANHOE INVESTMENTS LLCPONCE DE LEON', 931.0), 78, ('IVANHOE INVESTMENTS LLCPONCE DE LEON', 931.0), 78, ('IVANHOE INVESTMENTS LLCPONCE DE LEON', 931.0), 78, ('THOMAS PRESTON INVESTMENTS LLCPONCE DE LEON', 931.0), 74, ('THOMAS PRESTON INVESTMENTS LLCPONCE D

In [28]:
len(almost_match)

0

In [29]:
name_buckets = {}

sample_1000['Own1'].apply(lambda x : match(x, name_buckets))
name_buckets

NameError: name 'sample_1000' is not defined

geocode merged
fuzzy match sample - groupings on their own, groupings from business regristy, groupings that are similar but wouldn't have gotten clustered due to zip code