In [3]:
import pandas as pd
import recordlinkage
import re
from pathlib import Path
import csv
import os.path
import configparser


#takes in entity file path and creates dataframe
def entity_file_creator(entity_file_path, ws_name):
    entity_sheet = pd.read_excel(entity_file_path, ws_name)
    col_name_list = re.findall('[A-Z][^A-Z]*', ws_name)
    col_name = ' '.join(col_name_list) + ' Name'
    if (len(entity_sheet[col_name].dropna()) > 0) == True:
        col_id = ' '.join(col_name_list) + ' Id'
        data = {col_name : entity_sheet[col_name].str.rstrip(), col_id: entity_sheet[col_id]}
        if ws_name == 'JobTitle' or ws_name == 'LocalJobClass': 
            if ((len(entity_sheet['Unit Id'].dropna()) > 0) == True): 
                data['Unit Id'] = entity_sheet['Unit Id'].values   
        elif ws_name == 'WorkLocation' or ws_name == 'WorkStructure': 
            if ((len(entity_sheet['Employer Id'].dropna()) > 0) == True): 
                data['Employer Id'] = entity_sheet['Employer Id'].values  
        return pd.DataFrame(data).dropna()    
    
#takes in knackbuild file path and creates dataframe
def member_id_creator(member_file_path, col_name):
    member_file = pd.read_excel(member_file_path)
    if (len(member_file[col_name].dropna()) > 0) == True:
        block_type = ''
        if col_name == 'JobTitleName' or col_name == 'LocalJobClassName': 
            block_type = 'UnitId'
        elif col_name == 'WorkLocationName' or col_name == 'WorkStructureName':
            block_type = 'EmployerId'
        elif col_name == 'LocalDuesCategoryName':
            block_type = 'UnionRelationshipTypeName' #we don't actually need this data... just didn't wanna rewrite the program
        col_and_units_name = str(col_name) + " + " + block_type
        member_file[col_and_units_name] = member_file[col_name].astype(str) + member_file[block_type].astype(str)
        unique_entries_index = member_file[col_and_units_name].drop_duplicates().index
        member_descriptions = member_file.loc[unique_entries_index, col_name].values
        units = member_file.loc[unique_entries_index, block_type].values
        descriptions_and_units = member_file.loc[unique_entries_index, col_and_units_name].values
        data = {col_name:member_descriptions, block_type:units, col_and_units_name:descriptions_and_units}
        return pd.DataFrame(data) 
    
#cleans up column in knackbuild file so match accuracy is improved
#each local has unique cleaning instructions
def clean(col_name, member_df, member_file_path):
    local_num = str(member_file_path)[-36:-31]
    cleaning_dict_path = str(member_file_path)[:-59] + "\cleaning_dictionaries\{}_cleaning_dict.csv".format(local_num)
        
    cleaning_dict = {}  
    with open(cleaning_dict_path) as file:
        cleaning_dict_csv = csv.reader(file, delimiter=',')
        for line in cleaning_dict_csv:
            cleaning_dict[line[0]] = line[1]
            #print(cleaning_dict) #FOR TESTING

    member_df[col_name+'_clean'] = member_df[col_name]
    [member_df[col_name+'_clean'].replace(key, value, regex=True,inplace=True) for key, value in cleaning_dict.items()]
    member_df[col_name+'_clean'] = member_df[col_name+'_clean'].str.rstrip(' ')
    return member_df

#performs the match
def matcher(member_df, entity_df, mem_col_name, cleaned_mem_col_name, enti_col_name, enti_col_id): 
    #assigning the columns that match should be blocked on
    #i.e. only match if the unitid column is the same
    block_type = ''
    block_type_spaces = ''  
    if mem_col_name == 'JobTitleName' or mem_col_name == 'LocalJobClassName': 
        block_type = 'UnitId'
        block_type_spaces = 'Unit Id'
    elif mem_col_name == 'WorkLocationName' or mem_col_name == 'WorkStructureName':
        block_type = 'EmployerId'
        block_type_spaces = 'Employer Id'
    
    if (block_type in member_df.columns) and (block_type_spaces in entity_df.columns):
        #perfect matching
        perf_matches = member_df.merge(entity_df, how='inner', left_on=[cleaned_mem_col_name, block_type], right_on=[enti_col_name, block_type_spaces])
        member_wo_perf_matches = member_df[~member_df[mem_col_name + " + " + block_type].isin(perf_matches[mem_col_name + " + " + block_type])]
        entity_wo_perf_matches = entity_df[~entity_df[enti_col_id].isin(perf_matches[enti_col_id])]
        
        #fuzzy matching
        member_wo_perf_matches.set_index(mem_col_name + " + " + block_type,inplace=True)
        entity_wo_perf_matches.set_index(enti_col_id,inplace=True)
        indexer = recordlinkage.Index()
        indexer.block(left_on=block_type, right_on=block_type_spaces)
        candidates = indexer.index(member_wo_perf_matches, entity_wo_perf_matches)
        compare = recordlinkage.Compare()
        compare.string(cleaned_mem_col_name,enti_col_name,threshold=0.6,label='similarity')
        features = compare.compute(candidates, member_wo_perf_matches, entity_wo_perf_matches)
        potential_matches = features[features.sum(axis=1) == 1].reset_index()

        entity_lu = entity_wo_perf_matches[[enti_col_name, block_type_spaces]].reset_index()
        member_lu = member_wo_perf_matches[[cleaned_mem_col_name, mem_col_name, block_type]].reset_index()
        entity_merge = potential_matches.merge(entity_lu, how='outer')
        fuzzy_matches = entity_merge.merge(member_lu, how='right').drop(['similarity'],axis=1) 
    
        if len(perf_matches) !=  0: perf_matches['type'] = ['Perfect Match']*len(perf_matches) 
        if len(fuzzy_matches) !=  0:
            fuzzy_matches.loc[fuzzy_matches[mem_col_name].duplicated(keep=False) == False, 'type'] = 'One-to-One Fuzzy Match'
            fuzzy_matches.loc[fuzzy_matches[mem_col_name].duplicated(keep=False) == True, 'type'] = 'Multiple Fuzzy Matches'
            fuzzy_matches.loc[fuzzy_matches[enti_col_id].isna(), 'type'] = 'No Match to Entity File Found'
    
        matched_df = pd.concat([fuzzy_matches,perf_matches]).sort_values(by=['type'])
        matched_df.rename(columns={mem_col_name:mem_col_name + ' Knackbuild',
                               enti_col_name:enti_col_name+ ' Entity File',
                               block_type: block_type + ' Knackbuild', 
                               block_type_spaces:block_type_spaces + ' Entity File'},inplace=True)
        return matched_df

    else:
        print('ERROR: {} data does not exist in both the entity and membership file but it should.'.format(block_type))
        print('Clean file and try again.')

#merges knackbuild match column entries to entity file match column entries 
#for localduescategory
#needs separate function because this is a single column match
def matcher_localduescategory(member_df, entity_df, mem_col_name, cleaned_mem_col_name, enti_col_name, enti_col_id):
    #perfect matching
    perf_matches = member_df.merge(entity_df, how='inner', left_on=[cleaned_mem_col_name], right_on=[enti_col_name])
    if len(perf_matches) !=  0: 
        perf_matches['type'] = ['Perfect Match']*len(perf_matches) 

    member_wo_perf_matches = member_df[~member_df[mem_col_name].isin(perf_matches[mem_col_name])]
    entity_wo_perf_matches = entity_df[~entity_df[enti_col_id].isin(perf_matches[enti_col_id])]
    member_wo_perf_matches = member_wo_perf_matches.dropna(subset=[mem_col_name],axis=0)
    
    #fuzzy matching
    if len(member_wo_perf_matches[mem_col_name].dropna()) > 0 == True:
        member_wo_perf_matches.set_index(mem_col_name,inplace=True)
        entity_wo_perf_matches.set_index(enti_col_id,inplace=True)
        indexer = recordlinkage.Index()
        indexer.full()
    
        candidates = indexer.index(member_wo_perf_matches, entity_wo_perf_matches)
        compare = recordlinkage.Compare()
        compare.string(cleaned_mem_col_name,enti_col_name,threshold=0.6,label='similarity')
        features = compare.compute(candidates, member_wo_perf_matches, entity_wo_perf_matches)
        potential_matches = features[features.sum(axis=1) == 1].reset_index()

        entity_lu = entity_wo_perf_matches[[enti_col_name]].reset_index()
        member_lu = member_wo_perf_matches[[cleaned_mem_col_name, mem_col_name]].reset_index()
        entity_merge = potential_matches.merge(entity_lu, how='outer')
        fuzzy_matches = entity_merge.merge(member_lu, how='right').drop(['similarity'],axis=1)
        
        if len(fuzzy_matches) !=  0:
            fuzzy_matches.loc[fuzzy_matches[mem_col_name].duplicated(keep=False) == False, 'type'] = 'One-to-One Fuzzy Match'
            fuzzy_matches.loc[fuzzy_matches[mem_col_name].duplicated(keep=False) == True, 'type'] = 'Multiple Fuzzy Matches'
            fuzzy_matches.loc[fuzzy_matches[enti_col_id].isna(), 'type'] = 'No Match to Entity File Found'
    
        matched_df = pd.concat([fuzzy_matches,perf_matches]).sort_values(by=['type'])
        matched_df.rename(columns={mem_col_name:mem_col_name + ' Knackbuild',
                           enti_col_name:enti_col_name+ ' Entity File'},inplace=True)
        return matched_df
    
    perf_matches.rename(columns={mem_col_name:mem_col_name + ' Knackbuild',
        enti_col_name:enti_col_name+ ' Entity File'},inplace=True)
    return perf_matches

#pulls together all helper functions to produce _match spreadsheets for review
def runner(member_file_path, entity_file_path, member_entity_category_tuple):        
    mem_name = member_entity_category_tuple[0]
    enti_name = re.sub(r"(\w)([A-Z])", r"\1 \2", member_entity_category_tuple[1]) + ' Name'
    enti_id = re.sub(r'Name$',"",enti_name) + 'Id'

    member = member_id_creator(member_file_path, mem_name)
    entity = entity_file_creator(entity_file_path, member_entity_category_tuple[1])
            
    if entity is not None and member is not None:
        member = clean(member_df=member, col_name = mem_name,member_file_path=member_file_path)
        cleaned_mem_name = mem_name+'_clean'
    
        if mem_name == 'JobTitleName' or mem_name == 'LocalJobClassName' or mem_name == 'WorkLocationName' or mem_name == 'WorkStructureName': 
            matched_df = matcher(member_df=member, entity_df=entity, mem_col_name=mem_name, 
                                cleaned_mem_col_name=cleaned_mem_name, enti_col_name=enti_name, 
                                enti_col_id=enti_id)
        elif mem_name == 'LocalDuesCategoryName':
            matched_df = matcher_localduescategory(member_df=member, entity_df=entity, mem_col_name=mem_name, 
                                                  cleaned_mem_col_name=cleaned_mem_name, enti_col_name=enti_name, 
                                                  enti_col_id=enti_id)
        
        matched_df.dropna(subset=[mem_name + ' Knackbuild'],inplace=True)
        matched_df.to_csv(str(member_file_path)[:-24] + "{}_match.csv".format(mem_name),index=False)
        print('Both membership and entity file data exist for {}. Fuzzy match spreadsheet created.'.format(mem_name))    

    elif entity is None and member is not None:
        member.to_csv(str(member_file_path)[:-24] + "{}_member.csv".format(member_entity_category_tuple[0]),index=False)
        print('The membership file data exists for {} but entity file data does not.'.format(member_entity_category_tuple))
        print('A spreadsheet of entries in the membership file has been created.')
        print('The affiliate may need to be contacted to create entity file categories.')
    elif entity is not None and member is None:
        print('The entity file data exists for {} but membership file data does not.'.format(member_entity_category_tuple))
        print('No spreadsheet created.')
    else:
        print('Neither membership nor entity file data for {} exists.'.format(member_entity_category_tuple))
        print('No spreadsheet created.')

        
def main():
    config = configparser.ConfigParser()
    config.optionxform = str
    config.read(os.path.expanduser(r"~\OneDrive - aft.org\AFTDBFileUpload\Fuzzymatcher\config.ini"))
    categories = config['DEFAULT']
    categories = [(k, v) for k, v in dict(categories).items()]
    
    directory = os.path.expanduser(r"~\OneDrive - aft.org\AFTDBFileUpload\Fuzzymatcher\Workbench") 
    pathlist = Path(directory).glob('**/*KnackBuild.xlsx')
    for path in pathlist:
        member_file_path = path
        print(path)
        entity_file_path = str(path)[:-30] + 'EntityList.xlsx'
        print(entity_file_path)
        entity_file_path = Path(entity_file_path)
    
        for category in categories:
            runner(member_file_path, entity_file_path, category)
            #except:
            #   print('Error')
            #  pass
main()

C:\Users\Jonathan Ellisor\OneDrive - aft.org\AFTDBFileUpload\Fuzzymatcher\Workbench\01018_23255\01018_23255_20211028_Knackbuild.xlsx
C:\Users\Jonathan Ellisor\OneDrive - aft.org\AFTDBFileUpload\Fuzzymatcher\Workbench\01018_23255\01018_EntityList.xlsx
Neither membership nor entity file data for ('JobTitleName', 'JobTitle') exists.
No spreadsheet created.


  rr = 1 - n / n_max
  rr_avg = 1 - n_total / n_max_total


Both membership and entity file data exist for LocalJobClassName. Fuzzy match spreadsheet created.
Both membership and entity file data exist for WorkLocationName. Fuzzy match spreadsheet created.
Neither membership nor entity file data for ('WorkStructureName', 'WorkStructure') exists.
No spreadsheet created.
Both membership and entity file data exist for LocalDuesCategoryName. Fuzzy match spreadsheet created.


In [4]:
from pathlib import Path
import pandas as pd
import shutil
import re
import os.path
import numpy as np

#creating new copy of membership file to work in so original file is preserved
def copyer(directory):
    pathlist_knackbuild = Path(directory).glob('**/*Knackbuild.xlsx')
    for path in pathlist_knackbuild:
        shutil.copy(path, str(path)[:-24] + 'Knackbuild_w_entityids.xlsx')

#create strings that represent column names
def name_creator(match_file_path):
    match_col_name = str(match_file_path).split("_")[3]
    match_col_spaces = re.sub(r"(\w)([A-Z])", r"\1 \2", match_col_name) #creating var for name with spaces
    match_col_id_spaces = re.sub(r'Name$',"",match_col_spaces) + 'Id'
    match_col_id_no_spaces = re.sub(r'Name$',"",match_col_name) + 'Id'
    return {'match_col_name':match_col_name,'match_col_spaces':match_col_spaces,'match_col_id_no_spaces':match_col_id_no_spaces,'match_col_id_spaces':match_col_id_spaces}

#remerging reviewed fuzzy match files back into knackbuild with ids from entity file
def remerger(names_list,member_df,fuzzy_match_df):
    block_type = ''
    block_type_spaces = ''
    if names_list['match_col_name'] == 'JobTitleName' or names_list['match_col_name'] == 'LocalJobClassName': 
        block_type = 'UnitId'
        block_type_spaces = 'Unit Id'
    elif names_list['match_col_name'] == 'WorkLocationName' or names_list['match_col_name'] == 'WorkStructureName':
        block_type = 'EmployerId'
        block_type_spaces = 'Employer Id'
    fuzzy_match_df.rename(columns={names_list['match_col_spaces'] + ' Entity File': names_list['match_col_spaces'],
                                   block_type_spaces + ' Entity File': block_type_spaces,
                                   names_list['match_col_name'] + ' Knackbuild': names_list['match_col_name'], #possible to not do renaming later on by doing it here? hcange name to unitid so it automatically merges?
                                   block_type + ' Knackbuild': block_type},inplace=True)
    fuzzy_match_df.drop(['type',names_list['match_col_name'] + "_clean",block_type_spaces,names_list['match_col_name'] + " + " + block_type],axis=1,inplace=True)
   
    merged_file = member_df.merge(fuzzy_match_df, on = [names_list['match_col_name'],block_type], how='left')

    merged_file[names_list['match_col_spaces']] = np.where(merged_file[names_list['match_col_spaces']] == np.nan, merged_file[names_list['match_col_name']], names_list['match_col_spaces'])
    merged_file[names_list['match_col_id_no_spaces']] = merged_file[names_list['match_col_id_spaces']]
    
    return merged_file
    
def remerger_localduescategory(names_list,member_df,fuzzy_match_df):
    fuzzy_match_df.rename(columns={names_list['match_col_spaces'] + ' Entity File': names_list['match_col_spaces'],
                                   names_list['match_col_name'] + ' Knackbuild': names_list['match_col_name']}, inplace=True)
    fuzzy_match_df.drop(['type',names_list['match_col_name'] + "_clean",names_list['match_col_name'] + " + UnionRelationshipTypeName"],axis=1,inplace=True)
    
    merged_file = member_df.merge(fuzzy_match_df, on = [names_list['match_col_name'],'UnionRelationshipTypeName'], how='left')
    
    merged_file[names_list['match_col_spaces']] = np.where(merged_file[names_list['match_col_spaces']] == np.nan, merged_file[names_list['match_col_name']], names_list['match_col_spaces'])
    merged_file[names_list['match_col_id_no_spaces']] = merged_file[names_list['match_col_id_spaces']]
    
    return merged_file

def main():
    directory = os.path.expanduser(r"~\OneDrive - aft.org\AFTDBFileUpload\Fuzzymatcher\Workbench")

    copyer(directory)
    pathlist_fuzzymatch = Path(directory).glob('**/*_match.csv')
    for path in pathlist_fuzzymatch:
        print(path)
        names_list = name_creator(match_file_path=path)

        fuzzy_match_file = pd.read_csv(path)
        member_file_path = str(path).split("_")[:-2]
        member_file_path = Path('_'.join(member_file_path) + '_Knackbuild_w_entityids.xlsx')
        member_file = pd.read_excel(member_file_path)
        
        if names_list['match_col_name'] == 'JobTitleName' or names_list['match_col_name'] == 'LocalJobClassName' or names_list['match_col_name'] == 'WorkLocationName' or names_list['match_col_name'] == 'WorkStructureName': 
            merged_file = remerger(names_list=names_list,member_df=member_file,fuzzy_match_df=fuzzy_match_file)
        elif names_list['match_col_name'] == 'LocalDuesCategoryName':
            merged_file = remerger_localduescategory(names_list=names_list,member_df=member_file,fuzzy_match_df=fuzzy_match_file)
    
        merged_file.drop([names_list['match_col_spaces'],names_list['match_col_id_spaces']],axis=1,inplace=True)

        for col in merged_file.columns:
            if len(merged_file[col].dropna()) > 0:
                if 'Date' in col:
                    merged_file[col] = merged_file[col].dt.date
        
        merged_file.to_excel(member_file_path,index=False)
        
main()

C:\Users\Jonathan Ellisor\OneDrive - aft.org\AFTDBFileUpload\Fuzzymatcher\Workbench\01018_23255\01018_23255_LocalDuesCategoryName_match.csv


AttributeError: Can only use .dt accessor with datetimelike values