In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [2]:
import pandas as pd
import numpy as np

In [3]:
#Read Data
df_taxpayers = pd.read_excel("", sheet_name = "")


In [4]:
df_missing_tins = pd.read_excel("", sheet_name="")

In [8]:
def ngrams(string, n=3):
     """
    Generate n-grams (sequences of n characters) from a given string.

    Args:
        string (str): The input string to generate n-grams from.
        n (int, optional): The length of the n-grams to generate. Defaults to 3.

    Returns:
        list: A list of n-grams (as strings) generated from the input string.
    """
    string = re.sub(r'[,-./]|\sBD',r'', string)
    string = string.lower() 

    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [29]:
# Drop nulls from list
df_missing_tins= df_missing_tins.dropna(subset=['Name'])

In [9]:
# Drop nulls from taxpayer list
df_taxpayers= df_taxpayers.dropna(subset=['Trading name'])

In [10]:
print('Vecorizing the data......')

# Create TfidfVectorizer object with custom n-gram analyzer
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)

# Fit and transform Trading name column of df_taxpayers DataFrame
tfidf = vectorizer.fit_transform(df_taxpayers['Trading name'])

print('Vecorizing Done')

Vecorizing the data - this could take a few minutes for large datasets...
Vecorizing completed...


In [11]:
from sklearn.neighbors import NearestNeighbors

# Create NearestNeighbors object with custom parameters
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)

In [12]:
 # convert column to match against in the taxpayer list to a set
unique_org = set(df_missing_tins['Supplier'].values) # set used for increased performance


63

In [13]:
def getNearestN(query):
    """
    Finds the nearest neighbor to a given query string in the TF-IDF feature space of the business names.

    Args:
        query (str): The input query string to find the nearest neighbor of.

    Returns:
        tuple: A tuple containing two arrays:
        - distances: An array of distances between the query and the nearest neighbor.
        - indices: An array of indices of the nearest neighbor(s) in the TF-IDF feature space.
    """
    
    # Transform query into TF-IDF feature vector using the same vectorizer used on the training data
    queryTFIDF_ = vectorizer.transform(query)
    
    # Find nearest neighbors using pre-fitted nearest neighbors model
    distances, indices = nbrs.kneighbors(queryTFIDF_)
    
    # Return results as a tuple
    return distances, indices

In [14]:
import time
t1 = time.time()

# Find the nearest neighbor to each unique name in set using the getNearestN function
print('Getting nearest neighbors...')
distances, indices = getNearestN(unique_org)

# Calculate the time taken to run the function
t = time.time()-t1
print("DONE..")
print("Time Taken(seconds):", t)

getting nearest n...
COMPLETED IN: 0.2384347915649414


In [15]:
# Convert set back to list
unique_org = list(unique_org) #need to convert back to a list
taxpayers = df_taxpayers['Trading name'].to_numpy()
#taxpayers = df_missing_tins['Payee'].to_numpy()

# Find matches
print('Finding matches...')
matches = []
for i,j in enumerate(indices):
  temp = [round(distances[i][0],2), taxpayers[j][0],unique_org[i]]
  matches.append(temp)

#Build a new dataframe containing the matches
print('Building data frame...')  
matches = pd.DataFrame(matches, columns=['Match confidence (lower is better)','Matched name','Original name'])

print('Done') 

finding matches...
Building data frame...
Done


In [17]:
# Explore the new dataframe containing matches

matches.sort_values(['Match confidence (lower is better)'], ascending=True).sample(10)


Unnamed: 0,Match confidence (lower is better),Matched name,Original name
26,0.61,STARWOOD INVESTMENTS,STARWOOD
49,1.0,XX,NGWENYA WANFOR
35,0.83,SWAZI CANDLES EXPORT (PTY) LTD,SWAZI CANDLES
62,1.0,XX,BIDVEST STEINER
42,0.86,TRANSUNION ESWATINI (PTY) LTD,TRANSUNION
29,1.0,XX,PWC
28,0.72,JENNY INTERNET SWAZILAND (PTY) LTD,JENNY INTERNET
31,0.9,SIDUMO VALENTINE MDLADLA Trading as S V MDLAD...,SV MDLADLA & ASSOCIATES
41,0.37,ELITE FOOD SERVICES (PTY) LTD,ELITE FOOD SERVICES
16,0.87,ZCMB INVESTMENTS (PTY) LTD Trading as THE PREP...,ZCMB FURNITURE


In [18]:
#Export Matches to excel/csv
matches.to_excel('', encoding='utf-8')