In [9]:
import re
import string

import numpy as np
import pandas as pd
from rapidfuzz import fuzz, process
from tqdm import tqdm
# Activate tqdm's pandas integration for this kernel session.
tqdm.pandas()

## This portion just suppresses warnings that do not affect the performance of the code.
# NOTE(review): filterwarnings("ignore") silences *every* warning category for the rest of
# the session, including pandas warnings about chained assignment or dtype coercion -
# consider narrowing it to the specific categories that are known to be noise.
import warnings
warnings.filterwarnings("ignore")

In [None]:
#This chunk contains pre-processing functions that format the data for both datasets


# Translation tables that delete punctuation. Addresses keep '#' because it marks a
# sub-address (e.g. "#100") and must survive until the sub-address step below.
_STRIP_PUNCT = str.maketrans('', '', string.punctuation)
_STRIP_PUNCT_KEEP_HASH = str.maketrans('', '', string.punctuation.replace('#', ''))

# Legal-entity suffixes removed from business names. The trailing \b makes this a
# whole-word match, so "incorporated" is no longer cut down to "orporated".
_NAME_SUFFIX_RE = re.compile(r' (?:inc|llc|ltd|llp)\b')

# Street-type / directional words and their preferred abbreviations.
_ADDRESS_ABBREVIATIONS = {
    'plaza': 'plz',
    'street': 'st',
    'boulevard': 'blvd',
    'avenue': 'ave',
    'road': 'rd',
    'lane': 'ln',
    'circle': 'cir',
    'court': 'ct',
    'west': 'w',
    'east': 'e',
    'north': 'n',
    'south': 's',
}
# Whole words only, and only when preceded by a space (never the first token).
_ADDRESS_WORD_RE = re.compile(r'(?<= )(' + '|'.join(_ADDRESS_ABBREVIATIONS) + r')\b')

# Sub-address identifiers -> replaced by a generic pound sign for matching later on.
# A match must end at a word boundary or be followed directly by a digit ("apt4"),
# so words such as "stewart", "florence" or "united" are left alone.
_SUBADDRESS_RE = re.compile(r'(?<= )(?:suite|ste|floor|fl|apt|unit|bldg)(?=\d|\b)')
# "suite #100" yields two consecutive markers; they are merged into one.
_REPEATED_HASH_RE = re.compile(r'#(?: #)+')


def prep_col(text, type='Name'):
    '''
    inputs:
    text: a single value to be processed (converted to a string; None / NaN are treated as empty)
    type: 'Name' or 'Address' selects the extra clean-up described below. Any other
          value (the notebook passes 'City') only gets the basic normalisation.

    outputs:
    cleaned_text: the normalised string, with single spaces between tokens and no
                  leading or trailing white space

    Descriptions: Lower-cases the text, strips punctuation and collapses white space.
    'Name'    -> removes the whole-word suffixes inc / llc / ltd / llp and, when the text
                 contains ' dba ', keeps only what follows the first occurrence.
    'Address' -> abbreviates street types and directions (street -> st, north -> n, ...),
                 removes the marker 'deleted', and rewrites every sub-address identifier
                 (suite, ste, floor, fl, apt, unit, bldg and a literal '#') to a
                 stand-alone '#' token, so "Suite 100", "Ste 100" and "#100" all end
                 as "# 100". A '#' at the very start of an address is dropped.
    '''
    # Missing values would otherwise be turned into the literal text 'none' / 'nan'.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()

    if type == 'Address':
        # Keep '#' as its own token; a leading '#' only decorates the house number.
        text = text.lstrip().lstrip('#').replace('#', ' # ')
        text = text.translate(_STRIP_PUNCT_KEEP_HASH)
    else:
        text = text.translate(_STRIP_PUNCT)
    # Collapse white-space runs so the space-anchored patterns below see single spaces.
    text = ' '.join(text.split())

    #Unique Handling for "Name" type strings
    if type == 'Name':
        text = _NAME_SUFFIX_RE.sub('', text)
        # Handles dba by keeping only the text after it
        text = text.split(' dba ', 1)[-1]
    #Unique Handling for Address type data
    elif type == 'Address':
        text = _ADDRESS_WORD_RE.sub(lambda found: _ADDRESS_ABBREVIATIONS[found.group(1)], text)
        text = text.replace('deleted', '')
        text = _SUBADDRESS_RE.sub('# ', text)
        text = ' '.join(text.split())
        text = _REPEATED_HASH_RE.sub('#', text)

    #Final step that removes leading/trailing white space and any doubled spaces
    return ' '.join(text.split())

def Pre_Zip(df, column_name):
    '''
    inputs:
    df: a pandas dataframe containing the zipcode data we want to process
    column_name: the name of the column that holds the zip codes (any dtype; the
                 values are converted to strings internally)

    outputs:
    df[column_name]: the cleaned integer zip code column. The column is also
                     overwritten in place on the input dataframe.

    Descriptions:
    Regularizes zip codes to integers built from at most their first 5 digits.
    Values with no digits at all (missing, blank, text) become 0. Shorter values are
    not padded, so a zip with a leading zero is returned as a 4-digit integer.
    '''
    zips = df[column_name].astype(str).str.strip()

    # A zip that was read as a float prints as e.g. "8540.0"; without this step the
    # digit filter below would turn it into "85400" and yield the wrong zip code.
    zips = zips.str.replace(r'\.0+$', '', regex=True)

    # Remove all non-numeric characters (also joins ZIP+4 values such as 17101-1234)
    zips = zips.str.replace(r'[^0-9]', '', regex=True)

    # Replace empty/invalid strings
    zips = zips.fillna('0').replace('', '0')

    # Truncate to 5 characters
    zips = zips.str.slice(0, 5)

    # Integer type conversion, written back so existing callers still see the in-place update
    df[column_name] = pd.to_numeric(zips, errors='coerce').fillna(0).astype(int)

    return df[column_name]

def extract_add_num(input_string):
    '''
    inputs:
    input_string: A piece of string type data. Intended to be an address

    outputs:
    numeric_substring: The leading number of the address as an int, or '' when the
                       string does not start with a digit
    remaining_string: Everything after the leading number, unchanged (a separating
                      space, if present, is kept)

    Description: This function simply splits addresses into their number and their remaining street addresses.
    It is necessary for later functions.
    '''
    # Count the leading decimal digits. isdecimal() is used instead of isdigit()
    # because isdigit() also accepts characters such as superscript '²' that int()
    # cannot convert, which would raise ValueError below.
    digit_count = 0
    for char in input_string:
        if not char.isdecimal():
            break
        digit_count += 1

    leading_digits = input_string[:digit_count]
    remaining_string = input_string[digit_count:]

    # Keep the original contract: int when a number was found, '' otherwise.
    numeric_substring = int(leading_digits) if leading_digits else ''

    return numeric_substring, remaining_string

In [11]:
#Government data: read the CSV and derive the cleaned columns used for matching
Gpath=r"C:\Users\khat\OneDrive - PENNSYLVANIA COMPENSATION RATING BUREAU\Desktop\DNB_Data_Matching\Local Data\Generated\Joint_PADE_GOV_Data.csv"
GDF=pd.read_csv(Gpath)
#The first column of the file is discarded (empty lead column)
GDF=GDF.drop(columns=GDF.columns[0])
#Business name clean-up (prep_col defaults to the 'Name' rules)
GDF['Adj_BN']=GDF['Adj_BN'].apply(prep_col)
#Address 2 nulls become empty strings so the concatenation below does not produce nulls from them
GDF['Address 2']=GDF['Address 2'].fillna("")
#Address 1 and Address 2 are joined to match the Optimus formatting, then cleaned with the 'Address' rules
GDF['Adj_Add']=(GDF['Address 1']+' '+GDF['Address 2']).apply(lambda raw_address: prep_col(raw_address, 'Address'))
#Zip clean-up: cast to string for Pre_Zip, then store the cleaned value as a string again
GDF['Zip']=GDF['Zip'].astype(str)
GDF['Zip']=Pre_Zip(GDF, 'Zip').astype(str)
#Split the cleaned address into its leading number and the remaining street text
GDF[['G_Add_#', 'G_Add_Name']]=GDF['Adj_Add'].apply(lambda address: pd.Series(extract_add_num(address)))
#City only receives the basic normalisation ('City' matches neither special case in prep_col)
GDF['City']=GDF['City'].apply(lambda city: prep_col(city, 'City'))
#Shape check and Head Check for Government Data
print(GDF.shape)
GDF.head()

(2374999, 12)


Unnamed: 0,Business Name,Address 1,Address 2,City,State,Zip,Geocoded Location,Trade name,Adj_BN,Adj_Add,G_Add_#,G_Add_Name
0,"Greenacy, Llc",600 N 2nd Street,4th Floor,harrisburg,PA,17101,(-76.888068038 40.263499986),No Trade Name,greenacy,600 n 2nd st 4th #,600,n 2nd st 4th #
1,Alphamed Rx Inc.,502 W 7th Street,Suite 100,erie,PA,16502,(-80.095924977 42.12455901),No Trade Name,alphamed rx,502 w 7th st # 100,502,w 7th st # 100
2,Legam Llc,502 W 7th Street,Suite 100,erie,PA,16502,(-80.095924977 42.12455901),No Trade Name,legam,502 w 7th st # 100,502,w 7th st # 100
3,"Lycobirds, Llc",502 W 7th St,Ste 100,erie,PA,16502,(-80.095924977 42.12455901),No Trade Name,lycobirds,502 w 7th st # 100,502,w 7th st # 100
4,Cotoletta Llc,600 N 2nd Street,4th Floor,harrisburg,PA,17101,(-76.888068038 40.263499986),No Trade Name,cotoletta,600 n 2nd st 4th #,600,n 2nd st 4th #


In [12]:
#Dataread for Opti Data
#NOTE: lightly commented on purpose - this cell is expected to be replaced once the work moves to Snowflake
OP_path=r"G:\SHEETS\Research\D&B\DataMatching\Businesses with multiple locations only_11_25\Single\ALL_Single_File.csv"
ODF=pd.read_csv(OP_path)
#Keep only rows whose MATCH_SCORE is below 0.78, then remove the D&B-side and score columns.
#drop() is called without inplace so ODF is rebound to a new frame instead of mutating a filtered slice.
ODF=ODF[ODF['MATCH_SCORE']<.78].drop(columns=['MATCHED_BUSINESS_NAME', 'NAME_SCORE',
                                              'STREETADDRESS_DNB', 'ADDRESS_SCORE',
                                              'CITY_DNB', 'CITY_SCORE',
                                              'ZIP_DNB', 'ZIP_SCORE', 'MATCH_SCORE'])
#Business name and address cleaning
ODF['Cleaned_Name']=ODF['PRIMARYINSUREDNAME'].apply(lambda x: prep_col(x))
ODF['OC_Add']=ODF['STREETADDRESS_OPT'].apply(lambda x: prep_col(x, 'Address'))
#Zip cleaning, mirroring the government cell: cast to string, clean, store as string.
#The final cast previously targeted GDF['Zip'] (copy/paste slip), which left ZIP_OPT as an
#integer while GDF['Zip'] is a string, and made this cell modify an unrelated dataframe.
ODF['ZIP_OPT']= ODF['ZIP_OPT'].astype(str)
ODF['ZIP_OPT']= Pre_Zip(ODF, 'ZIP_OPT')
ODF['ZIP_OPT']= ODF['ZIP_OPT'].astype(str)
#Split the cleaned address into its leading number and the remaining street text
ODF[['O_Add_#', 'O_Add_Name']] = ODF['OC_Add'].apply(lambda x: pd.Series(extract_add_num(x)))
#City only receives the basic normalisation ('City' matches neither special case in prep_col)
ODF['CITY_OPT']= ODF['CITY_OPT'].apply(lambda y: prep_col(y, 'City'))
#Shape check and Head Check for Opti Data
print(ODF.shape)
ODF.head()

(127266, 8)


Unnamed: 0,PRIMARYINSUREDNAME,STREETADDRESS_OPT,CITY_OPT,ZIP_OPT,Cleaned_Name,OC_Add,O_Add_#,O_Add_Name
1,HYDRITE CHEMICAL CO,208 WASHINGTON AVE,ellendale,19941,hydrite chemical co,208 washington ave,208.0,washington ave
2,SUNRISE VALLEY,CONSTRUCTION LLC 239 SPRINGVILLE RD,kinzers,17535,sunrise valley,construction llc 239 springville rd,,construction llc 239 springville rd
4,DIOCESAN CENTRAL COUNCIL OF GREENSBURG,70 N MT VERNON AVE,uniontown,15401,diocesan central council of greensburg,70 n mt vernon ave,70.0,n mt vernon ave
5,HOSPITAL HOUSEKEEPING SYSTEMS LLC,155 WILSON AVE,washington,15301,hospital housekeeping systems,155 wilson ave,155.0,wilson ave
7,ANYWHERE REAL ESTATE INC,3865 REED BLVD,murrysville,15668,anywhere real estate,3865 reed blvd,3865.0,reed blvd


In [13]:
#Write both prepared dataframes to CSV in the working directory, without the index column
for _frame, _csv_name in ((ODF, 'OPTI_UNMATCH_78.csv'), (GDF, 'GOV_PREPD.csv')):
    _frame.to_csv(_csv_name, index=False)