In [1]:
# Import packages
import os
import pandas as pd
from fuzzywuzzy import fuzz

In [2]:
# absolute path
FOLDER_DEST = os.path.abspath("../storage/")

In [19]:
# Import the customers data as data frames
licenses = pd.read_csv(FOLDER_DEST + "/licenses.csv")
apn_data = pd.read_csv(FOLDER_DEST + "/apn-data.csv")

# Clean customers lists
A_cleaned = [address for address in licenses["address"] if not (pd.isnull(address))]
B_cleaned = [
    address
    for address in apn_data["parsed_address"].unique()
    if not (pd.isnull(address))
]

In [15]:
apn_data.shape

(36889, 3)

In [18]:
m = apn_data.drop_duplicates(subset='parsed_address')
m

Unnamed: 0,permit,apn,parsed_address
0,str-02005l,4303703300,2070 illion st 92110
1,str-02053l,4495820800,3611 quimby st 92106
2,str-00324l,4236032300,3625 mission blvd 92109
3,str-03878l,4153812100,4928 crystal dr 92109
4,str-01256l,4237120402,729 san gabriel pl 92109
...,...,...,...
36884,str-07570l,4533312200,3552 villa ter 92104
36885,str-06897l,4713030100,4243 46th st 92115
36886,str-06820l,4235720200,807 toulon court 92109
36887,str-06836l,4205421100,7504 baltic st 92111


In [24]:

for i, j in apn_data:
    print(apn_data['permit'][i])

ValueError: too many values to unpack (expected 2)

In [30]:
licenses['license_id'][0]
#licenses['license_id'][0]

'str-01686l'

In [29]:
apn_data['permit'][0]

'str-02005l'

In [21]:
# Perform fuzzy string matching
tuples_list = [
    max([(fuzz.token_set_ratio(i, j), j) for j in B_cleaned]) for i in A_cleaned
]

In [22]:
# Unpack list of tuples into two lists
similarity_score, fuzzy_match = map(list, zip(*tuples_list))

# Create pandas DataFrame
df = pd.DataFrame(
    {
        "licenses": A_cleaned,
        "parsed_address": fuzzy_match,
        "similarity score": similarity_score,
    }
)
df

Unnamed: 0,licenses,parsed_address,similarity score
0,4855 alberson court 92130,4855 alberson court 92130,100
1,2028 30th st 92104,2028 30th st 92104,100
2,5145 coban st 92114,6515 acorn st 92115,79
3,3327 32nd st 92104,3327 32nd st 92104,100
4,1305 elevation rd 92110,1305 elevation rd 92110,100
...,...,...,...
7244,3232 39th st 92105,3232 39th st 92105,100
7245,4767 ocean blvd #204 92109,4667 ocean blvd #204 92109,96
7246,2620 wightman st 92104,3650 wightman st 92104,91
7247,2183 s avenida de la playa 92037,2320 avenida de la playa 92037,91


In [23]:
# Export to Excel
df.to_excel(
    os.path.join(FOLDER_DEST, r"Fuzzy String Matching.xlsx"),
    sheet_name="Fuzzy String Matching",
    index=False,
)

In [26]:
result = pd.merge(df, apn_data, how="left", on=["parsed_address"], indicator='many_to_many')

In [27]:
result

Unnamed: 0,licenses,parsed_address,similarity score,permit,apn,many_to_many
0,4855 alberson court 92130,4855 alberson court 92130,100,str-01686l,3043700600,both
1,2028 30th st 92104,2028 30th st 92104,100,str-01757l,5391550800,both
2,5145 coban st 92114,6515 acorn st 92115,79,str-02656l,4676200500,both
3,3327 32nd st 92104,3327 32nd st 92104,100,str-01505l,4535111200,both
4,1305 elevation rd 92110,1305 elevation rd 92110,100,str-04720l,4362311900,both
...,...,...,...,...,...,...
17075,4767 ocean blvd #204 92109,4667 ocean blvd #204 92109,96,str-05674l,4155811317,both
17076,4767 ocean blvd #204 92109,4667 ocean blvd #204 92109,96,str-04086l,4155811317,both
17077,2620 wightman st 92104,3650 wightman st 92104,91,str-03936l,4474721600,both
17078,2183 s avenida de la playa 92037,2320 avenida de la playa 92037,91,str-00291l,3463001400,both


In [28]:
# export file
result.to_csv(os.path.join(FOLDER_DEST, r'join-licenses.csv'), index=False)