In [168]:
import pandas as pd
from tqdm import tqdm
from rapidfuzz import fuzz, process
import numpy as np
import string
tqdm.pandas()

##This portion just suppresses warnings that do not affect the performance of the code.
import warnings
warnings.filterwarnings("ignore")

In [169]:
# Column -> dtype maps handed to pd.read_csv so every column is parsed with the intended type.
# Optimus columns:
O_col = {
    'PRIMARYINSUREDNAME': str,
    'STREETADDRESS_OPT': str,
    'CITY_OPT': str,
    'ZIP_OPT': str,
    'Cleaned_Name': str,
    'OC_Add': str,
    'O_Add_#': float,
    'O_Add_Name': str,
}

# Government columns:
G_col = {
    'Business Name': str,
    'Address 1': str,
    'Address 2': str,
    'City': str,
    'State': str,
    'Zip': str,
    'Geocoded Location': str,
    'Trade name': str,
    'Adj_BN': str,
    'Adj_Add': str,
    'G_Add_#': float,
    'G_Add_Name': str,
}

In [170]:
# Data read for the Government data.
# NOTE(review): the path below is an absolute, machine-specific location; the cell only runs where that file exists.
Gpath=r"C:\Users\khat\OneDrive - PENNSYLVANIA COMPENSATION RATING BUREAU\Desktop\DNB_Data_Matching\Local Data\Generated\GOV_PREPD.csv"
# G_col pins the dtype of each listed column while parsing.
GDF=pd.read_csv(Gpath, dtype=G_col)

# Shape check and head check for the Government data (the bare last expression is the cell's displayed output)
print(GDF.shape)
GDF.head()



(2374999, 12)


Unnamed: 0,Business Name,Address 1,Address 2,City,State,Zip,Geocoded Location,Trade name,Adj_BN,Adj_Add,G_Add_#,G_Add_Name
0,"Greenacy, Llc",600 N 2nd Street,4th Floor,harrisburg,PA,17101,(-76.888068038 40.263499986),No Trade Name,greenacy,600 n 2nd st 4th #,600.0,n 2nd st 4th #
1,Alphamed Rx Inc.,502 W 7th Street,Suite 100,erie,PA,16502,(-80.095924977 42.12455901),No Trade Name,alphamed rx,502 w 7th st # 100,502.0,w 7th st # 100
2,Legam Llc,502 W 7th Street,Suite 100,erie,PA,16502,(-80.095924977 42.12455901),No Trade Name,legam,502 w 7th st # 100,502.0,w 7th st # 100
3,"Lycobirds, Llc",502 W 7th St,Ste 100,erie,PA,16502,(-80.095924977 42.12455901),No Trade Name,lycobirds,502 w 7th st # 100,502.0,w 7th st # 100
4,Cotoletta Llc,600 N 2nd Street,4th Floor,harrisburg,PA,17101,(-76.888068038 40.263499986),No Trade Name,cotoletta,600 n 2nd st 4th #,600.0,n 2nd st 4th #


In [171]:

# Data read for the Optimus data, using O_col to pin the dtype of each listed column.
# NOTE(review): the path below is an absolute, machine-specific location; the cell only runs where that file exists.
OP_path=r"C:\Users\khat\OneDrive - PENNSYLVANIA COMPENSATION RATING BUREAU\Desktop\DNB_Data_Matching\Local Data\Generated\OPTI_Unmatch_78.csv"
ODF=pd.read_csv(OP_path, dtype=O_col)
# Cast ZIP_OPT to str explicitly; the approximate-zip cell further down slices this column as text.
ODF['ZIP_OPT']= ODF['ZIP_OPT'].astype(str)
# Shape check and head check for the Optimus data (the bare last expression is the cell's displayed output)
print(ODF.shape)
ODF.head()


(127266, 8)


Unnamed: 0,PRIMARYINSUREDNAME,STREETADDRESS_OPT,CITY_OPT,ZIP_OPT,Cleaned_Name,OC_Add,O_Add_#,O_Add_Name
0,HYDRITE CHEMICAL CO,208 WASHINGTON AVE,ellendale,19941,hydrite chemical co,208 washington ave,208.0,washington ave
1,SUNRISE VALLEY,CONSTRUCTION LLC 239 SPRINGVILLE RD,kinzers,17535,sunrise valley,construction llc 239 springville rd,,construction llc 239 springville rd
2,DIOCESAN CENTRAL COUNCIL OF GREENSBURG,70 N MT VERNON AVE,uniontown,15401,diocesan central council of greensburg,70 n mt vernon ave,70.0,n mt vernon ave
3,HOSPITAL HOUSEKEEPING SYSTEMS LLC,155 WILSON AVE,washington,15301,hospital housekeeping systems,155 wilson ave,155.0,wilson ave
4,ANYWHERE REAL ESTATE INC,3865 REED BLVD,murrysville,15668,anywhere real estate,3865 reed blvd,3865.0,reed blvd


In [172]:
# Add a helper column to each dataset holding only the first 4 characters of the zip code, as an integer.
# find_match compares these two columns so that each Optimus row is only scored against nearby businesses.
ODF['APPROX_ZIP_OPT'] = ODF['ZIP_OPT'].astype(str).str.slice(stop=4).astype(int)
GDF['APPROX_ZIP_GOV'] = GDF['Zip'].astype(str).str.slice(stop=4).astype(int)

In [173]:
#These are the submatching functions that actually apply the matching for individual rows
#================================================
#Secondary Address handler:
def suite_cleaner(text1, text2):
    '''
    Inputs:
    text1: A single string of address data from optimus data
    text2: A single string of address data from Gov data

    Outputs: A single Boolean

    Description: Compares the secondary address parts (suite, floor, unit, etc.) of two addresses.
    The secondary part is whatever follows the first '#' once every space has been removed.
    Returns True when the two secondary parts are identical, or when at least one of the addresses
    has no '#' at all (asymmetric data, so nothing can be compared). Returns False only when both
    addresses contain a '#' and the text after it differs.
    '''
    # Strip spaces, then split each address around its first '#'
    _, opti_marker, opti_unit = text1.replace(" ", "").partition('#')
    _, gov_marker, gov_unit = text2.replace(" ", "").partition('#')

    # partition() yields an empty marker when '#' is absent; nothing to compare in that case
    if not (opti_marker and gov_marker):
        return True

    # Both addresses carry a secondary part: they must agree exactly
    return opti_unit == gov_unit


#==========================================
#Address Fuzzy Matcher:
def a_fm(num1,rem1,num2, rem2, suite_penalty=0.2, number_penalty=0.35):
    '''
    Input:
    num1: The leading address number from an address from optimus
    rem1: The second part (remainder) of an address string from optimus
    num2: The leading address number from an address from Gov data
    rem2: The second part (remainder) of an address string from Gov
    suite_penalty: amount subtracted when both addresses have a secondary address and they differ
    number_penalty: amount subtracted when the leading address numbers are not equal

    Output:
    final_score: a match score from 0 - 1

    Description: Scores two addresses with the rapidfuzz ratio on the remainders (rem1, rem2), then
    subtracts custom penalties for a leading-number mismatch and a secondary-address mismatch.
    The result is floored at 0.

    A missing remainder (NaN / None) scores 0. Previously str() turned a missing value into the
    literal text "nan", so two records with no address at all were treated as an exact text match.
    '''
    # A missing address cannot support a match; bail out before str() turns it into "nan"/"None"
    if pd.isna(rem1) or pd.isna(rem2):
        return 0

    #Removes leading and trailing spaces
    rem1 = str(rem1).strip()
    rem2 = str(rem2).strip()

    #Checks if leading numbers match.
    #Note: NaN never compares equal, so two missing numbers still count as a mismatch.
    num_match = num1 == num2

    #Fuzzy match address info (fuzz.ratio is on a 0-100 scale)
    fuzz_score = fuzz.ratio(rem1, rem2) / 100

    #If suite numbers mismatch, apply the secondary address penalty
    if not suite_cleaner(rem1, rem2):
        fuzz_score -= suite_penalty

    #If leading numbers do not match, apply the address number penalty
    if not num_match:
        fuzz_score -= number_penalty

    #Safety catch to stop negative scores from occurring, thus setting the floor at 0
    final_score = max(fuzz_score, 0)

    return final_score
#===========================================
#Name fuzzy match
def n_fm(b1,b2):
    '''
    Inputs:
    b1: a single string containing the business name from optimus
    b2: a single string containing the business name from the government data

    Outputs:
    A fuzzy match score from 0-1

    Description: Averages two rapidfuzz scorers, the standard ratio and the weighted ratio (WRatio),
    after rescaling each from the 0-100 scale to 0-1.
    '''
    plain_ratio = fuzz.ratio(b1, b2) / 100
    weighted_ratio = fuzz.WRatio(b1, b2) / 100
    return (weighted_ratio + plain_ratio) / 2

#========================================================
#Full matching function:

def manual_extract_one(nquery, a_num, a_st, cquery, nchoices, addnums,addsts, cchoices,
                       city_cutoff=.9, city_penalty=.1, good_enough=.92):
    '''
    Inputs:
    nquery: a string containing a business name from Optimus
    a_num: the leading address number of the Optimus business
    a_st: a string containing the address remainder of the Optimus business
    cquery: a string containing the Optimus business' city
    nchoices: iterable (e.g. dataframe column) of business names from Gov data
    addnums: iterable of leading address numbers from Gov data
    addsts: iterable of address remainders from Gov data
    cchoices: iterable of business cities from Gov data
    city_cutoff: a city score above this value incurs no penalty
    city_penalty: amount subtracted from the combined score when the city score is not above city_cutoff
    good_enough: once the best combined score reaches this value the scan stops early

    Outputs (a tuple):
    best_index: 0-based POSITION within the supplied choices of the best match, or -1 when no
                candidate scored above 0 (including when the choices are empty)
    best_score: the best total match score (0 when nothing was found)
    bestN: the name score of the best candidate (0 when nothing was found)
    bestA: the address score of the best candidate (0 when nothing was found)
    bestC: the city score of the best candidate (0 when nothing was found)

    Description: Scores one Optimus record against the Gov candidates one at a time, in order.
    The combined score is the mean of the name and address scores minus the city penalty. The first
    candidate with the highest score seen so far is kept (ties keep the earlier candidate).
    '''
    # "Nothing found" defaults; every score is numeric so downstream score columns keep one dtype
    best_index = -1
    best_score = 0
    bestN = 0
    bestA = 0
    bestC = 0

    #Main loop that goes candidate by candidate
    candidates = zip(nchoices, addnums, addsts, cchoices)
    for index, (nchoice, a_num_choice, a_st_choice, cchoice) in enumerate(candidates):
        # Fuzzy match scores for name, address and city
        Nscore = n_fm(nquery, nchoice)
        Ascore = a_fm(a_num, a_st, a_num_choice, a_st_choice)
        Cscore = n_fm(cquery, cchoice)

        # Cities that agree closely are not penalised; everything else loses city_penalty
        Cmod = 0 if Cscore > city_cutoff else city_penalty

        # Average the name and address scores and subtract the city penalty
        avg_score = ((0.5*Nscore) + (0.5*Ascore)) - Cmod

        # Only a strictly better score replaces the current best
        if avg_score > best_score:
            best_score = avg_score
            best_index = index
            bestN = Nscore
            bestA = Ascore
            bestC = Cscore

            # If a good match occurs, stop the loop to speed up processing time.
            # best_score only changes here, so this is the only place the check is needed.
            if best_score >= good_enough:
                break

    return best_index, best_score, bestN, bestA, bestC

In [174]:
#Function that applies manual_extract_one to a single row of optimus (main calls it once per row)
def find_match(row, choices):
    '''
    input:
    row: A single row of the Optimus dataframe
    choices: The government dataframe

    Output: A single list [gov_index, total_score, name_score, address_score, city_score].
    gov_index is the index LABEL of the matched row in `choices`. When there is nothing to
    report (no Gov rows share the approximate zip, no candidate scored above 0, or an error
    occurred) the list is [False, 0, 0, 0, 0].

    Description: Narrows the Gov dataframe down to rows whose APPROX_ZIP_GOV equals the row's
    APPROX_ZIP_OPT, runs manual_extract_one on that subset, and translates the returned position
    back to the label of the row in the full Gov dataframe. Errors are printed and reported as
    "no match" so one bad row does not stop the whole run.
    '''
    no_match = [False, 0, 0, 0, 0]
    try:
        #Filter down the potential matches by approx zip code
        choices_filtered = choices[choices['APPROX_ZIP_GOV'] == row['APPROX_ZIP_OPT']]

        if choices_filtered.empty:
            return no_match

        # Call the manual_extract_one function for fuzzy matching scores
        index, total_score, namescore, addscore, cscore = manual_extract_one(
            row['Cleaned_Name'],
            row['O_Add_#'], row['O_Add_Name'],
            row['CITY_OPT'], choices_filtered['Adj_BN'],
            choices_filtered['G_Add_#'], choices_filtered['G_Add_Name'], choices_filtered['City'])

        # manual_extract_one signals "nothing found" with -1. Indexing with it would silently
        # pick the LAST candidate, so treat any negative (or None) position as no match.
        if index is None or index < 0:
            return no_match

        # Map the position inside the filtered subset back to the Gov dataframe's index label
        original_index = choices_filtered.index[index]
        return [original_index, total_score, namescore, addscore, cscore]
    #Handle all errors and empty rows (deliberate best-effort: report and carry on)
    except Exception as e:
        print(f"Error in find_match: {e}")
        return no_match


In [175]:
#================================================================================================
#Main Function
def main(DF, Gov):
    '''
    input:
    DF: This is the optimus df
    Gov: This is the Gov df

    Outputs:
    RDF: A formatted df with one row per Optimus row: the Optimus fields, the match scores, and
    the columns of the matched Gov row (NaN when no match was found)

    Description: Runs find_match for every Optimus row behind a progress bar and reformats the
    results into a df. The number of rows for which find_match reported no usable match
    (no zip candidates, no positive score, or an error) is printed at the end.
    '''
    output_columns = ['OPTI_Name','Business Name', 'Name Score',
                      'Opti_Address', 'Address 1', 'Address 2', 'Address Score',
                      'Opti_City','City', "City Score",
                      'Opti_Zip', 'Zip', 'Match Score',
                      'State', 'Geocoded Location', 'Trade name', 'Adj_BN', 'Adj_Add', 'G_Add_#', 'G_Add_Name' ]

    # Gov-side placeholder used for Optimus rows without a match (every Gov column set to NaN)
    blank_gov = dict.fromkeys(Gov.columns, np.nan)

    results = []  # List to collect result dictionaries
    errors=0
    # Iterate through rows of Optimus data
    for _, row in tqdm(DF.iterrows(), total=DF.shape[0], desc='Performing Fuzzy Matching'):
        # Get the best match index and scores from find_match
        MATCH = find_match(row, Gov)

        # Skip if nothing at all came back
        if not MATCH:
            continue

        # Unpack the match results into individual variables
        match_index, score, namescore, addscore, cscore = MATCH[0], MATCH[1], MATCH[2], MATCH[3], MATCH[4]

        # `is` checks keep a legitimate index label of 0 from being mistaken for "no match"
        if match_index is False or match_index is None:
            errors+=1
            gov_values = blank_gov
        else:
            # find_match returns an index LABEL, so look it up by label (.loc), not by position (.iloc)
            gov_values = Gov.loc[match_index].to_dict()

        # Create a result dictionary with the selected data from both Optimus and Gov
        result = {
            "OPTI_Name": row['PRIMARYINSUREDNAME'],
            "Name Score": namescore,
            "Address Score": addscore,
            "City Score": cscore,
            "Match Score": score,
            "Opti_Address": row['STREETADDRESS_OPT'],
            "Opti_City": row['CITY_OPT'],
            "Opti_Zip": row['ZIP_OPT']
        }

        # Add matched row data from Gov to result
        result.update(gov_values)
        results.append(result)

    # Convert to a pandas dataframe and select/order the custom columns.
    # With no results there are no columns to select from, so build the empty frame directly.
    if results:
        RDF = pd.DataFrame(results)[output_columns]
    else:
        RDF = pd.DataFrame(columns=output_columns)

    print(f"{errors} occured")
    return RDF

In [None]:
# Run the matcher on a reproducible random sample of the Optimus data
df=ODF.sample(1800, random_state=961)

#11 is a good seed
R=main(df,GDF)


#Restructuring R: keep only the rows above the match score threshold, weakest kept matches first.
#Built as one chain that produces a new frame, instead of sorting/assigning in place on a
#filtered slice of R (the chained-assignment pattern that triggers SettingWithCopyWarning).
RQ = (
    R[R['Match Score']>0.73]
    .sort_values(by='Match Score', ascending=True)
    .assign(Zip=lambda kept: kept['Zip'].astype(int))
)
print(f"{(len(RQ)/len(R))*100}% Match Rate")
RQ.head(45)





Performing Fuzzy Matching:   0%|          | 0/1800 [00:00<?, ?it/s]

Performing Fuzzy Matching: 100%|██████████| 1800/1800 [04:02<00:00,  7.41it/s]

0 occured
11.722222222222223% Match Rate





Unnamed: 0,OPTI_Name,Business Name,Name Score,Opti_Address,Address 1,Address 2,Address Score,Opti_City,City,City Score,Opti_Zip,Zip,Match Score,State,Geocoded Location,Trade name,Adj_BN,Adj_Add,G_Add_#,G_Add_Name
1050,CSS BUILDING SERVICES INC,"Csi Services, Inc.",0.791136,3737 MARKET ST,1901 Market Street,,0.65,philadelphia,philadelphia,1.0,19104,19103,0.720568,PA,(-75.171933041 39.953814982),No Trade Name,csi services,1901 market st,1901.0,market st
73,ENVIRONMENTAL SERVICES CONCEPTS,525 Virginia Drive Associates Acquisition Corp.,0.441558,443 S GULPH RD,443 S Gulph Rd,,1.0,king of prussia,king of prussia,1.0,19406,19406,0.720779,PA,(-75.36820201 40.077629983),No Trade Name,525 virginia drive associates acquisition corp,443 s gulph rd,443.0,s gulph rd
1476,THE SUSAN G KOMEN BREAST CANCER FOUNDATION INC,"Moses Taylor Foundation, Inc.",0.689038,125 N WASHINGTON AVE STE 260,"125 N. Washington Ave., Suite 205",,0.754545,scranton,scranton,1.0,18503,18503,0.721792,PA,(-75.664387036 41.407765011),No Trade Name,moses taylor foundation,125 n washington ave # 205,125.0,n washington ave # 205
1730,LIVEACTION INC,"Novacomm, Inc.",0.444444,125 S NORWINDEN DR,125 S. Norwinden Dr,,1.0,springfield,springfield,1.0,19064,19064,0.722222,PA,(-75.320661029 39.928663992),No Trade Name,novacomm,125 s norwinden dr,125.0,s norwinden dr
658,HEALTH ADVOCATE SOLUTIONS INC,American Telecast Corporation,0.444444,835 SPRINGDALE DR,835 Springdale Dr,,1.0,exton,exton,1.0,19341,19341,0.722222,PA,(-75.601053878 40.029387102),No Trade Name,american telecast corporation,835 springdale dr,835.0,springdale dr
971,SP PLUS CORPORATION,K K K Corporation,0.796528,733 CHESTNUT ST,1626 Chestnut Street,,0.65,philadelphia,philadelphia,1.0,19107,19104,0.723264,PA,(-75.168348018 39.951133989),No Trade Name,k k k corporation,1626 chestnut st,1626.0,chestnut st
503,LAFLIN BOROUGH,Laflin Borough Volunteer Fire Department Ladie...,0.647183,47 LAFLIN RD,47 Laflin Rd,,1.0,wilkes barre,laflin,0.223611,18702,18702,0.723592,PA,(-75.796448999 41.291299403),No Trade Name,laflin borough volunteer fire department ladie...,47 laflin rd,47.0,laflin rd
15,GENESCO INC,4600 Jonestown Llc,0.447619,4600 JONESTOWN RD,4600 Jonestown Rd,,1.0,harrisburg,harrisburg,1.0,17109,17109,0.72381,PA,(-76.816510976 40.302286991),No Trade Name,4600 jonestown,4600 jonestown rd,4600.0,jonestown rd
800,ACRISURE HOLDINGS INC,"3.31 Holdings, Llc",0.690345,444 LIBERTY AVE,444 Liberty Avenue Suite 2100,,0.758621,pittsburgh,pittsburgh,1.0,15222,15222,0.724483,PA,(-80.004709 40.440891986),No Trade Name,331 holdings,444 liberty ave # 2100,444.0,liberty ave # 2100
536,WESTFALL TOWNSHIP VOLUNTEER FIRE DEPT INC,Mill Rift Fire Department,0.483871,139 BLUESTONE BLVD,139 Blue Stone Blvd,,0.965517,millrift,millrift,1.0,18340,18340,0.724694,PA,(-74.745101973 41.411786983),No Trade Name,mill rift fire department,139 blue stone blvd,139.0,blue stone blvd


In [177]:
# Spot check of a_fm on two clearly different addresses: the leading numbers differ and the
# street names are unrelated, so the returned score should be low.
Anum=100
Aname='commerce dr'
Bnum=963
Bname='winterset rd'
# Argument order is (number 1, remainder 1, number 2, remainder 2)
a_fm(Anum, Aname, Bnum, Bname)

0.08478260869565218

In [178]:
def dc_search(df, search_value, column_name):
    '''
    Inputs:
    df: the dataframe to search
    search_value: the value to look for
    column_name: the name of the column to look in

    Output: the first rows (DataFrame.head(), i.e. up to 5) of df whose column_name value is
    exactly equal to search_value

    Description: Quick lookup helper. The comparison is exact equality, not a substring search.
    '''
    exact_hits = df[column_name] == search_value
    return df.loc[exact_hits].head()
# dc_search(GDF,'Polymath Park Resort Inc.','Business Name' )