In [1]:
import pandas as pd
import recordlinkage
import re
from pathlib import Path

def entity_file_creator(entity_file_path, ws_name):
    """Load the name/id pairs of one entity worksheet.

    The worksheet name is split at its capital letters and re-joined with
    spaces to obtain the column labels, e.g. ``'JobTitle'`` gives the columns
    ``'Job Title Name'`` and ``'Job Title Id'``.

    Args:
        entity_file_path: Path of the entity workbook, passed to
            ``pd.read_excel``.
        ws_name: Name of the worksheet to read.

    Returns:
        A two-column DataFrame (name with trailing whitespace stripped, id)
        without rows containing missing values, or ``None`` when the name
        column holds no non-null entries.
    """
    sheet = pd.read_excel(entity_file_path, ws_name)
    label = ' '.join(re.findall('[A-Z][^A-Z]*', ws_name))
    name_col = label + ' Name'

    # Nothing to match against when the name column is entirely empty.
    if sheet[name_col].dropna().empty:
        return None

    id_col = label + ' Id'
    frame = pd.DataFrame({
        name_col: sheet[name_col].str.rstrip(),
        id_col: sheet[id_col],
    })
    return frame.dropna()
    
def member_id_creator(member_file_path, col_name):
    """Collect the distinct entries of one membership-file column.

    Args:
        member_file_path: Path of the membership workbook, passed to
            ``pd.read_excel`` (default worksheet).
        col_name: Column whose distinct values are wanted.

    Returns:
        A single-column DataFrame named ``col_name`` holding the first
        occurrence of every distinct value in file order, or ``None`` when the
        column has no non-null entries. ``drop_duplicates`` does not discard
        missing values, so one NaN entry is kept if the column contains any.
    """
    members = pd.read_excel(member_file_path)
    column = members[col_name]

    # An all-empty column means there is no membership data for this category.
    if column.dropna().empty:
        return None

    first_seen = column.drop_duplicates().index
    return pd.DataFrame({col_name: members.loc[first_seen, col_name].values})
    
def clean(df, col_name):
    """Add a normalised ``<col_name>_clean`` column to ``df`` and return ``df``.

    The new column is ``df[col_name]`` with digits and parentheses removed,
    a few school abbreviations rewritten, and trailing spaces stripped.
    ``df`` is modified in place (only the new column is added) and the same
    object is returned.

    Args:
        df: DataFrame containing ``col_name``.
        col_name: Name of the text column to normalise.

    Returns:
        ``df`` itself, now carrying the extra ``<col_name>_clean`` column.
    """
    # Applied strictly in this order: 'High School' has to be rewritten before
    # the 'High Sch' abbreviation rule gets a chance to see it.
    # The trailing \b keeps the abbreviation rules from firing inside the
    # spelled-out word (otherwise 'Middle School' turns into 'MSool').
    substitutions = (
        (r'\d+', ''),
        (r'\(', ''),
        (r'\)', ''),
        (r'Elem Sch\b', 'Elementary'),
        (r'High School', 'HS'),
        (r'Middle Sch\b', 'MS'),
        (r'High Sch\b', 'HS'),
    )

    # Build the cleaned Series with plain (non-inplace) calls and assign it
    # once. Calling ``df[col].replace(..., inplace=True)`` is chained
    # assignment, which stops updating ``df`` under pandas Copy-on-Write.
    cleaned = df[col_name]
    for pattern, replacement in substitutions:
        cleaned = cleaned.replace(pattern, replacement, regex=True)

    df[col_name + '_clean'] = cleaned.str.rstrip(' ')
    return df

def matcher(member_file_path, entity_file_path, member_entity_category_tuple):
    """Match one membership-file column against one entity worksheet.

    Exact matches are found with an inner merge of the cleaned member names on
    the entity names; whatever is left on both sides is then compared with
    ``recordlinkage`` string similarity. Results are written as CSV files next
    to the membership workbook and a status message is printed.

    Args:
        member_file_path: Path of the membership workbook. Its string form is
            also used (minus its last 24 characters) as the prefix of every
            output file name.
        entity_file_path: Path of the entity workbook.
        member_entity_category_tuple: ``(member column name, entity worksheet
            name)``, e.g. ``('JobTitleName', 'JobTitle')``.

    Returns:
        None. Depending on which inputs contain data, the function writes
        ``<prefix><member column>_fuzzymatch.csv`` and
        ``<prefix><member column>_match.csv`` (both present), or
        ``<prefix><member column>_member.csv`` (membership data only), or
        nothing.
    """
    mem_name = member_entity_category_tuple[0]
    # Split the worksheet name at capital letters to rebuild the spaced column
    # labels, e.g. 'JobTitle' -> 'Job Title Name' / 'Job Title Id'.
    enti_name_list = re.findall('[A-Z][^A-Z]*', member_entity_category_tuple[1])
    enti_name = ' '.join(enti_name_list) + ' Name'
    enti_id = ' '.join(enti_name_list) + ' Id'
    # Both helpers return None when their column holds no non-null values.
    member = member_id_creator(member_file_path, mem_name)
    entity = entity_file_creator(entity_file_path, member_entity_category_tuple[1])
    
    if entity is not None and member is not None:
        member = clean(member, mem_name)
        
        cleaned_mem_name = mem_name+'_clean'
            
        #perfect matching
        perf_matches = member.merge(entity, how='inner', left_on=cleaned_mem_name, right_on=enti_name)
        # Remove everything that matched exactly so the fuzzy stage only sees
        # the leftovers from each side.
        member_wo_perf_matches = member[~member[mem_name].isin(perf_matches[mem_name])]
        entity_wo_perf_matches = entity[~entity[enti_id].isin(perf_matches[enti_id])]

        #fuzzy matching
        # Index the frames by member name / entity id; presumably these become
        # the two index levels of the candidate pairs - confirm against the
        # recordlinkage documentation.
        member_wo_perf_matches.set_index(mem_name,inplace=True)
        entity_wo_perf_matches.set_index(enti_id,inplace=True)
        indexer = recordlinkage.Index()
        indexer.full()
        candidates = indexer.index(member_wo_perf_matches, entity_wo_perf_matches)
        compare = recordlinkage.Compare()
        compare.string(cleaned_mem_name,enti_name,threshold=0.6,label='similarity')
        features = compare.compute(candidates, member_wo_perf_matches, entity_wo_perf_matches)
        # 'similarity' is the only feature, so a row sum of 1 selects the pairs
        # it flagged (presumably the thresholded score is 0/1 - TODO confirm).
        potential_matches = features[features.sum(axis=1) == 1].reset_index()

        entity_lu = entity_wo_perf_matches[[enti_name]].reset_index()
        member_lu = member_wo_perf_matches[[cleaned_mem_name]].reset_index()
        # No `on=` is given, so pandas joins on the column names the frames
        # share. The final right merge keeps every leftover member, with empty
        # entity columns when no candidate was found.
        entity_merge = potential_matches.merge(entity_lu, how='outer')
        fuzzy_matches = entity_merge.merge(member_lu, how='right').drop(['similarity'],axis=1)
        # Discard rows that carry no member name.
        fuzzy_matches = fuzzy_matches.dropna(subset=[mem_name],axis=0)
    
        # The 'type' column is only created when the respective frame has rows.
        if len(perf_matches) !=  0: perf_matches['type'] = ['Perfect Match']*len(perf_matches) 
        if len(fuzzy_matches) !=  0:
            # A member name on exactly one row has a single candidate; a name
            # repeated across rows has several. Rows without an entity id
            # override either label.
            fuzzy_matches.loc[fuzzy_matches[mem_name].duplicated(keep=False) == False, 'type'] = 'One-to-One Fuzzy Match'
            fuzzy_matches.loc[fuzzy_matches[mem_name].duplicated(keep=False) == True, 'type'] = 'Multiple Fuzzy Matches'
            fuzzy_matches.loc[fuzzy_matches[enti_id].isna(), 'type'] = 'No Match to Entity File Found'
        # NOTE(review): the output prefix is the membership path minus its last
        # 24 characters, which assumes a fixed-length file-name tail - confirm
        # against the real file naming scheme.
        fuzzy_matches.to_csv(str(member_file_path)[:-24] + "{}_fuzzymatch.csv".format(member_entity_category_tuple[0]),index=False)
        concat = pd.concat([fuzzy_matches,perf_matches])
        concat.to_csv(str(member_file_path)[:-24] + "{}_match.csv".format(member_entity_category_tuple[0]),index=False)
        print('Both membership and entity file data exist for {}. Fuzzy match spreadsheet created.'.format(member_entity_category_tuple))
    elif entity is None and member is not None:
        # Only membership data: export the distinct member entries as they are.
        member.to_csv(str(member_file_path)[:-24] + "{}_member.csv".format(member_entity_category_tuple[0]),index=False)
        print('The membership file data exists for {} but entity file data does not.'.format(member_entity_category_tuple))
        print('A spreadsheet of entries in the membership file has been created.')
        print('The affiliate may need to be contacted to create entity file categories.')
    elif entity is not None and member is None:
        print('The entity file data exists for {} but membership file data does not.'.format(member_entity_category_tuple))
        print('No spreadsheet created.')
    else:
        print('Neither membership nor entity file data exists.')
        print('No spreadsheet created.')
        

# (membership-file column, entity-file worksheet) pairs; each pair is handed to
# matcher() once per membership workbook.
categories = [
    ('JobTitleName', 'JobTitle'),
    ('LocalJobClassName', 'LocalJobClass'),
    ('WorkLocationName', 'WorkLocation'),
    ('WorkStructureName', 'WorkStructure'),
]

directory = r"C:\Users\AFT\OneDrive - aft.org\Fuzzymatcher\testing_locals"

# Visit every *Knackbuild.xlsx workbook below `directory`, at any depth.
pathlist = Path(directory).glob('**/*Knackbuild.xlsx')
for path in pathlist:
    member_file_path = path
    print(path)
    # NOTE(review): the entity workbook path is the membership path with its
    # last 30 characters replaced by 'EntityList.xlsx'. That only holds while
    # every Knackbuild file name has the same length - consider validating the
    # name (or checking that the entity file exists) before matching.
    entity_file_path = Path(str(path)[:-30] + 'EntityList.xlsx')
    for category in categories:
        matcher(member_file_path, entity_file_path, category)

In [8]:
from pathlib import Path
import pandas as pd
import shutil
import re

directory = r"C:\Users\AFT\OneDrive - aft.org\Fuzzymatcher\testing_locals"

# Work on a copy of each membership workbook so the original file is preserved.
pathlist_knackbuild = Path(directory).glob('**/*Knackbuild.xlsx')
for path in pathlist_knackbuild:
    shutil.copy(path, path.with_name(path.stem + '_w_entityids.xlsx'))

# Fold every '<prefix>_<ColumnName>_match.csv' back into the working copy that
# sits in the same folder.
pathlist_fuzzymatch = Path(directory).glob('**/*_match.csv')
for path in pathlist_fuzzymatch:
    print(path)
    fuzzy_match_file = pd.read_csv(path)

    # Locate the working copy relative to the CSV itself instead of slicing a
    # fixed number of characters off the path string, so the lookup does not
    # depend on how long the base directory happens to be.
    member_file_candidates = sorted(path.parent.glob('**/*Knackbuild_w_entityids.xlsx'))
    if not member_file_candidates:
        print('No *Knackbuild_w_entityids.xlsx workbook found under {}; skipping.'.format(path.parent))
        continue
    member_file_path = member_file_candidates[0]
    member_file = pd.read_excel(member_file_path)

    # The member column name is the file-name component right before 'match'.
    # Splitting the file stem (not the whole path) keeps underscores in parent
    # folder names from shifting the position.
    match_col_name = path.stem.split("_")[-2]

    # NOTE(review): this is a left merge on the member column, so a member
    # value that appears on several CSV rows (the 'Multiple Fuzzy Matches'
    # case) is repeated in the merged workbook - confirm the CSV is reviewed
    # and reduced to one row per member value before this cell is run.
    merged_file = member_file.merge(fuzzy_match_file, on=match_col_name, how='left')

    # Spaced labels as used by the entity file, e.g. 'JobTitleName' ->
    # 'Job Title Name' / 'Job Title Id', plus the unspaced id column to create.
    match_col_name_list = re.findall('[A-Z][^A-Z]*', match_col_name)
    match_col_spaces = ' '.join(match_col_name_list)
    match_col_name_list.remove('Name')
    match_col_id_spaces = ' '.join(match_col_name_list) + ' Id'
    match_col_id_no_spaces = ''.join(match_col_name_list) + 'Id'

    # NOTE(review): the member column is overwritten with the entity name, so
    # rows without a match end up empty in that column - confirm intended.
    merged_file[match_col_name] = merged_file[match_col_spaces]
    merged_file[match_col_id_no_spaces] = merged_file[match_col_id_spaces]

    # Remove the helper columns that came from the match CSV. 'type' is only
    # present when the CSV contained at least one row, hence errors='ignore'.
    merged_file.drop(
        [match_col_spaces, match_col_id_spaces, match_col_name + "_clean", "type"],
        axis=1,
        inplace=True,
        errors='ignore',
    )

    try:
        merged_file.to_excel(member_file_path, index=False)
    except PermissionError as err:
        # Same exception type as before, with a hint about the likely cause.
        raise PermissionError(
            "Could not overwrite {}. Is the workbook open in Excel or still "
            "being synced? Close it and re-run this cell.".format(member_file_path)
        ) from err


C:\Users\AFT\OneDrive - aft.org\Fuzzymatcher\testing_locals\04100_XXXXX\04100_XXXXX_JobTitleName_match.csv
C:\Users\AFT\OneDrive - aft.org\Fuzzymatcher\testing_locals\04100_XXXXX\04100_XXXXX_LocalJobClassName_match.csv
C:\Users\AFT\OneDrive - aft.org\Fuzzymatcher\testing_locals\05221_XXXXX\05221_XXXXX_JobTitleName_match.csv
C:\Users\AFT\OneDrive - aft.org\Fuzzymatcher\testing_locals\05221_XXXXX\05221_XXXXX_LocalJobClassName_match.csv


PermissionError: [Errno 13] Permission denied: 'C:\\Users\\AFT\\OneDrive - aft.org\\Fuzzymatcher\\testing_locals\\05221_XXXXX\\05221_XXXXX_XXXXXXXX_Knackbuild_w_entityids.xlsx'