In [14]:
import numpy as np
import pandas as pd
import openpyxl
import random
import gc
import timeit
import sys
from datetime import datetime as dt
import os

from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
filename = "../stro_licenses_datasd-san-diego.xlsx"


def get_sheet_names(excel_file):
    """Get the sheet names from an Excel file.

    Args:
      excel_file: The path to the Excel file.

    Returns:
      A list of sheet names.
    """

    workbook = openpyxl.load_workbook(excel_file)
    sheet_names = workbook.sheetnames
    return sheet_names

get_sheet_names(filename)

['stro_licenses_datasd-san-diego', 'geo-data', 'data-apn']

In [16]:
sheet_name = get_sheet_names(filename)
dfa = pd.read_excel(filename, sheet_name=sheet_name[0], usecols=[0, 1], header=0)
dfb = pd.read_excel(filename, sheet_name=sheet_name[2])

In [17]:
dfa.head()

Unnamed: 0,license_id,address
0,STR-01686L,"4855 ALBERSON Ct, SAN DIEGO, CA 92130"
1,STR-01757L,"2028 30th St, San Diego, CA 92104"
2,STR-05313L,"5145 COBAN St, SAN DIEGO, CA 92114"
3,STR-01505L,"3327 32nd St, San Diego, CA 92104"
4,STR-04720L,"1305 Elevation Rd, San Diego, CA 92110"


In [18]:
dfb.head()


Unnamed: 0,permit,apn,parsed_address
0,STR-02005L,4303703300,2070 ILLION ST 92110
1,STR-02053L,4495820800,3611 QUIMBY ST 92106
2,STR-00324L,4236032300,3625 MISSION BLVD 92109
3,STR-03878L,4153812100,4928 CRYSTAL DR 92109
4,STR-01256L,4237120402,729 SAN GABRIEL PL 92109


## Data Cleaning:

In [19]:
# count nan values in columns
dfa.isnull().sum()

license_id    0
address       4
dtype: int64

In [20]:
# count nan values in columns
dfa.isnull().sum()

# Drop rows where specific column values are null / address == null
dfa = dfa.dropna(subset=["address"])

#### Clean B

In [21]:
# Filtering dataframe column elements by different dtypes 
m = dfb['parsed_address'].apply(type) == int
dfb = dfb[~m]
dfb = dfb[~dfb['parsed_address'].str.startswith('#')]
dfb = dfb[~dfb['parsed_address'].str.contains('NONE')]

### Normalize strings address dataframe dfa

In [22]:
# Convert Pandas Uppercase Column
dfa = dfa.apply(lambda x: x.astype(str).str.lower())
dfb = dfb.apply(lambda x: x.astype(str).str.lower())

In [23]:
# remove extra whitspace, special characters, , #UNIT -> #, #04 -> #4, etc
dfa = (
    dfa.replace({"address": r"\s+"}, {"address": " "}, regex=True) # remove multiple spaces
    .replace({"address": r"\,\s\w+\s\w+(\s,\s|,\s)\w+\s"}, {"address": " "}, regex=True) # ", SAN DIEGO, CA"  --> " "
    .replace({"address": r"\,\s\w+(\s,\s|,\s)\w+\s"}, {"address": " "}, regex=True)
    .replace({"address": r"(\,\s|\s\,)"}, {"address": " "}, regex=True)
    .replace({"address": r"\s(wk)\s"}, {"address": " walk "}, regex=True)
    .replace({"address": r"\s(bl)\s"}, {"address": " blvd "}, regex=True)
    .replace({"address": r"\s(ct)\s"}, {"address": " court "}, regex=True)
    .replace({"address": r"\s(tr)\s"}, {"address": " trails "}, regex=True)
    .replace({"address": r"\s(avenue)\s"}, {"address": " ave "}, regex=True)
    .replace({"address": r"\s(terrance)\s"}, {"address": " ter "}, regex=True)
    .replace({"address": r"\s(av)\s"}, {"address": " ave "}, regex=True)
    .replace({"address": r"\s#unit\s"}, {"address": " #"}, regex=True) #UNIT -> #
    .replace({"address": r"\s#0"}, {"address": " #"}, regex=True) #04 -> #4
)

In [24]:
# remove leading and trailing whitespaces
dfa['address'] = dfa['address'].str.strip()
dfb['parsed_address'] = dfb['parsed_address'].str.strip()

In [12]:
# absolute path
FOLDER_DEST = os.path.abspath('../storage/')

In [13]:
# export file
dfa.to_csv(os.path.join(FOLDER_DEST, r'licenses.csv'), index=False)
dfb.to_csv(os.path.join(FOLDER_DEST, r'apn-data.csv'), index=False)

## Machine learning

In [322]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5, token_pattern='(\S+)')
tf_idf_matrix = tfidf_vectorizer.fit_transform(dfb['parsed_address'])
#tf_idf_matrixB = tfidf_vectorizer.fit_transform(dfb['parsed_address'])

### Compute Cosine Similarity:

In [323]:
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)
    ct.sparse_dot_topn(
            M, N, np.asarray(A.indptr, dtype=idx_dtype),
            np.asarray(A.indices, dtype=idx_dtype),
            A.data,
            np.asarray(B.indptr, dtype=idx_dtype),
            np.asarray(B.indices, dtype=idx_dtype),
            B.data,
            ntop,
            lower_bound,
            indptr, indices, data)
    return csr_matrix((data,indices,indptr),shape=(M,N))


import time
t1 = time.time()

# adjust lower bound: 0.8
# keep top 10 similar results
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8)

t = time.time()-t1
print("finished in:", t)

finished in: 1.520073652267456


In [326]:
#pd.DataFrame(matches.toarray())

### Create a match table to show the similarity scores:

In [341]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'ADDRESS': left_side,
                         'SIMILAR_TITLE': right_side,
                         'similairity_score': similairity})


matches_df = pd.DataFrame()
matches_df = get_matches_df(matches, dfb['parsed_address'], top=1000)
# Remove all exact matches
matches_df = matches_df[matches_df['similairity_score'] < 0.99999] 
matches_df.sample(10)

Unnamed: 0,ADDRESS,SIMILAR_TITLE,similairity_score
957,2666 WORDEN ST #09 92110,2690 WORDEN ST #81 92110,0.853188
398,2666 WORDEN ST #14 92110,4082 VALETA ST #373 92110,0.853188
700,2654 WORDEN ST #94 92110,2690 WORDEN ST #79 92110,0.803388
951,2666 WORDEN ST #09 92110,4098 VALETA ST #388 92110,0.853188
280,2676 WORDEN ST #58 92110,2690 WORDEN ST #79 92110,0.803388
757,2676 WORDEN ST #63 92110,2640 WORDEN ST #208 92110,0.803388
748,2680 WORDEN ST #70 92110,2636 WORDEN ST #111 92110,0.802877
902,2680 WORDEN ST #75 92110,2624 WORDEN ST #181 92110,0.80826
897,2636 WORDEN ST #132 92110,2636 WORDEN ST #108 92110,0.80826
695,2654 WORDEN ST #94 92110,2680 WORDEN ST #67 92110,0.803388


In [339]:
type(dfb['parsed_address'])


pandas.core.series.Series

In [340]:
dfa['address']

0                4855 ALBERSON Ct, SAN DIEGO, CA 92130
1                    2028 30th St, San Diego, CA 92104
2                   5145 COBAN St, SAN DIEGO, CA 92114
3                    3327 32nd St, San Diego, CA 92104
4               1305 Elevation Rd, San Diego, CA 92110
                             ...                      
7248                 3232 39th St, san diego, CA 92105
7249          4767 OCEAN Bl, #204, SAN DIEGO, CA 92109
7250             2620 WIGHTMAN St, SAN DIEGO, CA 92104
7251    2183 S AVENIDA DE LA PLAYA, LA JOLLA, CA 92037
7252     4855 NARRAGANSETT Av, #A, SAN DIEGO, CA 92107
Name: address, Length: 7253, dtype: object

In [343]:
matches_df.sort_values(['similairity_score'], ascending=False).head()

Unnamed: 0,ADDRESS,SIMILAR_TITLE,similairity_score
49,604 S EVANS ST 92113,3888 RIVIERA DR #205 92109,0.9468
50,604 S EVANS ST 92113,530 K ST #314 92101,0.927282
60,3806 ARIZONA ST 92104,3020 ALTA VIEW DR #E202 92139,0.891226
612,4082 VALETA ST #370 92110,2666 WORDEN ST #01 92110,0.856423
613,4082 VALETA ST #370 92110,4082 VALETA ST #373 92110,0.856423


In [None]:
dfa[]