In [1]:
#! /usr/bin/python3

# Script for comparing an input .csv file with an existing .csv file (e.g. the current CHGIS).
# Indicates 1) matches on name and 2) strength of match on content
# Requires the library 'pandas' to be installed, which is included in Anaconda's free Python distribution

# by Stephen Ford (stephen.p.ford@gmail.com)

import pandas as pd
import os.path

# suppressing SettingWithCopyWarning
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# function for selecting .csv files for manipulation

def csv_picker():
    ''' Function for checking whether user input path 1) is that of a valid file, and 2) is of a file ending with '.csv'
        Prompts for re-entry if entry is invalid.
        Returns a pandas DataFrame constructed from the valid .csv file
    '''
    name = input()

    # checking that the path is a valid filename, and prompting for re-entry if not
    while not (os.path.isfile(name)):
        print("Not a valid filename.  Please try again:")
        name = input()
        
    # checking that the valid filename ends in .csv, prompting for re-entry if not
    while not name.endswith('.csv'):
        print("Filename does not end in .csv -- please try again:")
        name = input()

    print("\nThank you -- filename %s is valid.\n" % name)       
    return pd.read_csv(name, low_memory=False)

In [3]:
# function for mapping the input .csv's fields to the desired, standardized output fields
def field_mapper(std_field, frame, fields):
    ''' Function that will prompt user to manually map fields of the input .csv to standardized output fields.
        Name changes will be made in-place (i.e. in the DataFrame -- the .csv will be untouched).
        If user fails to enter anything for the given mapping, that field will dropped from the final output file.        
    '''
    
    print("\nPlease enter the field that will be labeled '%s' in the output .csv:" % std_field)
    orig_field = input()
    
    # prompts for re-entering the input field if 1) it is not one of the column names and 2) it isn't an empty string
    while (not orig_field in list(frame.columns)) and (orig_field):
        print("\nNot a valid column name.  Please try again:")
        orig_field = input()
        
    # simply exit if the user pressed enter, bypassing the mapping, or perform the mapping if a valid field name has been entered
    if orig_field:
        frame.rename(columns={orig_field:std_field}, inplace=True)
        fields += [std_field]
    return fields


In [4]:
def name_checker(name_fields, fields, prefix, frame):
    ''' Function will confirm that at least one name field has been entered, and prompt user to remap the three
        name fields until at least one is a valid entry. Updates the DataFrame in-place and returns the updated field list.
    '''
    
    while (((name_fields[0] in fields) or (name_fields[1] in fields) or (name_fields[2] in fields)) == False):
        print("At least one name field needs to be specified. Please try again.")
        # counter ensures that name fields are inserted at correct place in sequence
        counter = 1
        for name_field in name_fields:
            print("Please enter the field that will be labeled '%s' in the output .csv:" % name_field)
            orig_field = input()
            if orig_field:
                if (orig_field in list(frame.columns)):
                    frame.rename(columns={orig_field:name_field}, inplace=True)
                    fields.insert(counter, name_field)
                    counter+=1
                else: 
                    print("Input not accepted -- field not found in data.")
    return fields
               

In [5]:
# offering user choice of strict or fuzzy name-matching

def merge_chooser(target_field, incoming_field):
    ''' Function called only if the user selects to merge on traditional characters 繁體字 or simplified characters 简体字
        Lets user choose whether to do a strict or fuzzy merge.
        In a fuzzy merge, only the first two characters of the Chinese names will be checked against one another.
    '''
    print(
    '''
Please indicate, by entering a numerical digit 1-2, whether you wish to do a strict or fuzzy match of names:
    1. Strict matching (e.g. '張掖' matches '張掖', but '張掖' does not match '張掖居延屬國')
    2. Fuzzy matching (e.g. '張掖' matches '張掖', and '張掖' also matches '張掖居延屬國')
    ''')

    accepted = False
    choice = input()

    while accepted == False:
        if choice == '1':
            print('Proceeding with strict matching of names.')
            accepted = True
            mode = 'strict'
            df = target.merge(incoming, how='outer', left_on=target_field, right_on=incoming_field, indicator=True)
            return df, mode
        elif choice == '2':
            print('Proceeding with fuzzy matching of names.')
            accepted = True
            mode = 'fuzzy'
            target['fuzzy_nm'] = target[target_field].map(lambda x: x[:2])
            incoming['fuzzy_nm'] = incoming[incoming_field].map(lambda x: x[:2])
            df = target.merge(incoming, how='outer', on='fuzzy_nm', indicator=True)
            return df, mode
        else:
            print("\nNot a valid response.  Please try again:\n")
            choice = input()

In [6]:
def coordinate_matcher(coords, frame, fields):
    ''' Function that performs a strict or fuzzy matching of spatial coordinates.  Returns a string indicating the type of match performed.
        The matches' results are added to the DataFrame within the function.
    '''
    
    print(
        '''
    Please indicate, by entering a numerical digit 1-2, whether you wish to do a strict or fuzzy match of spatial coordinates:
        1. Strict matching (requires exact match, e.g. '119.64656' does NOT match '119.646560', '119.64657', or '119.325')
        2. Fuzzy matching (requires only that rounded numbers match -- you specify the number of decimal places)
        ''')

    accepted = False
    choice = input()

    while accepted == False:
        if choice == '1':
            print('Proceeding with strict matching of spatial coordinates.')
            accepted = True
            for coord in coords:
                frame['out_%s_coord_match' % coord] = frame['input_%s_coord' % coord] == frame['tgaz_%s_coord' % coord]
                fields += ['out_%s_coord_match' % coord]
            return "strict"
        elif choice == '2':
            print("Proceeding with fuzzy matching of spatial coordinates.")
            accepted = True
            print('''Please enter a number indicating the number of decimal places to which to round the decimal coordinate value.
                      For example, if you enter 0, 117.91 and 118.08 will round to 118 and match; if you enter 1, 117.91 will round to 117.9 and 118.08 will round to 118.1, and they won't match.
                      Be advised that coordinates rarely have more than 7 decimal places of precision.
                  ''')
            decimal_place = input()
            try: 
                for coord in coords:
                    frame['fuzzy_out_%s_coord_match' % coord] = frame['input_%s_coord' % coord].map(lambda x: round(x, int(decimal_place))) == frame['tgaz_%s_coord' % coord].map(lambda x: round(x, int(decimal_place)))
                    fields += ['fuzzy_out_%s_coord_match' % coord]
                return "fuzzy"
                                                                                                
            except:
                print("Not a valid response. Defaulting to 0 (integer-rounding).")
                for coord in coords:
                    frame['fuzzy_out_%s_coord_match' % coord] = frame['input_%s_coord' % coord].map(lambda x: round(x, 0)) == frame['tgaz_%s_coord' % coord].map(lambda x: round(x, 0))
                    fields += ['fuzzy_out_%s_coord_match' % coord]
                return "fuzzy" 
        else:
            print("\nNot a valid response.  Please try again:\n")
            choice = input()

In [7]:
# soliciting files for comparison; presumption is that second file entered will be the CHGIS v5 in .csv format
print("Please type the path of the incoming .csv file (with extension):")
incoming = csv_picker()
# /home/sf/chgis/input/sample_data/Donghan_2014-10-02_copy.csv
# /home/sf/chgis/input/sample_data/lexdata.txt.data.csv

print("Please type the path of the target .csv file (with extension):")
target = csv_picker()
# /home/sf/chgis/input/v5_augment_2016-08-09.csv
# /home/sf/chgis/input/V6_input_draft_20160811.csv


Please type the path of the incoming .csv file (with extension):
/home/sf/chgis/input/sample_data/Donghan_2014-10-02_copy.csv

Thank you -- filename /home/sf/chgis/input/sample_data/Donghan_2014-10-02_copy.csv is valid.

Please type the path of the target .csv file (with extension):
/home/sf/chgis/input/v5_augment_2016-08-09.csv

Thank you -- filename /home/sf/chgis/input/v5_augment_2016-08-09.csv is valid.



In [8]:
### Populating the initial columns of the two spreadheets, and initializing the lists for the final output (unused fields will be dropped)

# initializing list of incoming_fields 
incoming_fields = []

# declaring the list of output .csv field names that will derive from the new data
# i.e., all those fields whose names will be 'input_*' in the final .csv
final_incoming_fields = [   
    'input_id',
    'input_nm_py',
    'input_nm_simp',
    'input_nm_trad',
    'input_type',
    'input_year_beg',
    'input_year_end',
    'input_dynasty',
    'input_other_id',
    'input_prnt',
    'input_obj_type',
    'input_x_coord',
    'input_y_coord'
]


# initializing list of actual target fields
target_fields = []

# initializing list of default CHGIS fields (taken from v5_augment_2016-08-09.csv)
default_target_fields = [
    'seq', 
    'sys_id', 
    'src', 
    'nm_py', 
    'nm_simp', 
    'nm_trad', 
    'x_coord', 
    'y_coord', 
    'pres_loc', 
    'type_py', 
    'type_ch', 
    'beg', 
    'end', 
    'obj_type',
    'prnt_id', 
    'prnt_sysid', 
    'prnt_simp', 
    'prnt_py'
]

# initializing list of tgaz fields (i.e. the standardized output-form of the CHGIS fields)
final_target_fields = [
    'tgaz_sys_id',
    'tgaz_nm_py',
    'tgaz_nm_simp',
    'tgaz_nm_trad',
    'tgaz_beg',
    'tgaz_end',
    'tgaz_data_source',
    'tgaz_obj_type',
    'tgaz_pres_loc',
    'tgaz_prnt_id',
    'tgaz_prnt_py',
    'tgaz_prnt_simp',
    'tgaz_prnt_sysid',
    'tgaz_type_ch',
    'tgaz_type_py',
    'tgaz_x_coord',
    'tgaz_y_coord'
]


In [9]:
### presenting user with choice of a default mapping of CHGIS fields (based on v5_augment_2016-08-09.csv) or of manually entering their own mapping

print('''Please indicate by entering '1' or '2' whether you wish to use a default mapping of CHGIS fields, or wish to manually map fields.
    
1. Use the default mapping -- presumes the input CHGIS file has the following columns:

  %s

2. Use a manual mapping -- you will be prompted to indicate which column from the file should map to which output column.
''' % str(default_target_fields))

accepted = False
mapping = input()

while accepted == False:
    # checks them against one another using comparison of sets (which are collections of unordered, unique items)
    if (mapping == '1'):
        if (set(list(target.columns)) == set(default_target_fields)):
            # manually renaming one exception to the following pattern
            target.rename(columns={'src':'data_source'}, inplace=True)

            # manually dropping the seq field
            del target['seq']

            # renaming the CHGIS fields in-place to conform to output specifications
            target.columns = ['tgaz_%s' % x for x in target.columns]
            # FOR TESTING ONLY
            if (set(list(target.columns)) != set(final_target_fields)):
                print("POSSIBLE ERROR")
                print("chgis.columns is %s" % str(list(chgis.columns)))
                print("final_chgis_fields is %s" % str(final_chgis_fields))
                print("set difference is %s" % str(set(list(chgis.columns)).difference(set(final_chgis_fields))))
            target = target[final_target_fields]
            accepted = True
        else: 
            print("The columns in the selected CHGIS spreadsheet do not precisely match expectations. \n\n Proceeding with manual mapping.")
            for field in final_target_fields:
                target_fields = field_mapper(field, target, target_fields)  
            target_fields = name_checker(['tgaz_nm_py', 'tgaz_nm_simp', 'tgaz_nm_trad'], target_fields, "tgaz", target)
            target = target[target_fields]
            accepted = True
    elif (mapping == '2'):
        print("Now, please specify fields from the CHGIS (match-receiving) data that will be included in the final spreadsheet.\n")
        for field in final_target_fields:
            target_fields = field_mapper(field, target, target_fields)  
        target_fields = name_checker(['tgaz_nm_py', 'tgaz_nm_simp', 'tgaz_nm_trad'], target_fields, "tgaz", target)
        target = target[target_fields]
        accepted = True
    else:
        print("\nNot a valid response.  Please try again:\n")
        mapping = input()

Please indicate by entering '1' or '2' whether you wish to use a default mapping of CHGIS fields, or wish to manually map fields.
    
1. Use the default mapping -- presumes the input CHGIS file has the following columns:

  ['seq', 'sys_id', 'src', 'nm_py', 'nm_simp', 'nm_trad', 'x_coord', 'y_coord', 'pres_loc', 'type_py', 'type_ch', 'beg', 'end', 'obj_type', 'prnt_id', 'prnt_sysid', 'prnt_simp', 'prnt_py']

2. Use a manual mapping -- you will be prompted to indicate which column from the file should map to which output column.

1


In [10]:
# renaming fields in the incoming DataFrame to conform to specifications

print("Now we will specify fields from the incoming (match-making) data that will be included in the final spreadsheet.\n")
for field in final_incoming_fields:
    incoming_fields = field_mapper(field, incoming, incoming_fields)    

incoming_fields = name_checker(['input_nm_py', 'input_nm_simp', 'input_nm_trad'], incoming_fields, "input", incoming)
incoming = incoming[incoming_fields]


Now we will specify fields from the incoming (match-making) data that will be included in the final spreadsheet.


Please enter the field that will be labeled 'input_id' in the output .csv:
規範碼

Please enter the field that will be labeled 'input_nm_py' in the output .csv:
县名

Please enter the field that will be labeled 'input_nm_simp' in the output .csv:


Please enter the field that will be labeled 'input_nm_trad' in the output .csv:


Please enter the field that will be labeled 'input_type' in the output .csv:


Please enter the field that will be labeled 'input_year_beg' in the output .csv:
BEG

Please enter the field that will be labeled 'input_year_end' in the output .csv:
END

Please enter the field that will be labeled 'input_dynasty' in the output .csv:


Please enter the field that will be labeled 'input_other_id' in the output .csv:


Please enter the field that will be labeled 'input_prnt' in the output .csv:


Please enter the field that will be labeled 'input_obj_type' in 

In [None]:


# dropping fields from the incoming DataFrame that weren't mapped
#for field in incoming.columns:
#    if incoming[field].empty:
#        incoming.drop(field, inplace=True)
#        input_fields.remove(field)
        
#print("input_fields are: %s" % str(input_fields))
#print("incoming cols are: %s" % str(incoming.columns))

In [None]:
# manually renaming one exception to the following pattern
#chgis.rename(columns={'src':'data_source'}, inplace=True)

# renaming the CHGIS fields in-place to conform to output specifications
#chgis.columns = ['tgaz_%s' % x for x in chgis.columns]

In [11]:
# soliciting user choice regarding which name field to take as primary
print(
    '''
Thank you. Now, please indicate, by entering a numerical digit 1-3, which of the following names you wish to make the primary key for comparing data:
    1. Name in complex/traditional Chinese characters 繁体字
    2. Name in simplified Chinese characters 简体字
    3. Name in pinyin 拼音
    '''
)

accepted = False
choice = input()

while accepted == False:
    if ((choice == '1') and ('input_nm_trad' in incoming_fields)):
        print("\nUsing name in complex/traditional Chinese characters 繁体字 as primary matching key.")
        accepted = True
        incoming_name_match_field = 'input_nm_trad'
        target_name_match_field = 'tgaz_nm_trad'
        df, name_mode = merge_chooser(target_name_match_field, incoming_name_match_field)
    elif ((choice == '2') and ('input_nm_simp' in incoming_fields)):
        print("\nUsing name in simplified Chinese characters 简体字 as primary matching key.")
        accepted = True
        incoming_name_match_field = 'input_nm_simp'
        target_name_match_field = 'tgaz_nm_simp'
        df, name_mode = merge_chooser(target_name_match_field, incoming_name_match_field)
    elif ((choice == '3') and ('input_nm_py' in incoming_fields)):
        print("\nUsing name in pinyin 拼音 as primary matching key.")
        accepted = True
        incoming_name_match_field = 'input_nm_py'
        target_name_match_field = 'tgaz_nm_py'
        name_mode = 'strict'
        print('Fuzzy matching is not currently supported for pinyin names.  Proceeding with strict matching.')
        df = target.merge(incoming, how='outer', left_on=target_name_match_field, right_on=incoming_name_match_field, indicator=True)
        
    else:
        print("\nNot a valid response.  Please try again, entering a choice corresponding to a valid field:\n")
        choice = input()
        
output_fields = []            


Thank you. Now, please indicate, by entering a numerical digit 1-3, which of the following names you wish to make the primary key for comparing data:
    1. Name in complex/traditional Chinese characters 繁体字
    2. Name in simplified Chinese characters 简体字
    3. Name in pinyin 拼音
    
3

Using name in pinyin 拼音 as primary matching key.
Fuzzy matching is not currently supported for pinyin names.  Proceeding with strict matching.


In [12]:
# removing rows that are only present in the CHGIS file
df = df[df['_merge'] != 'left_only']

# renaming merge indicators for legibility
df = df.replace(to_replace='both', value='found')
df = df.replace(to_replace='right_only', value='not_found')
df.rename(columns={'_merge':'match'}, inplace=True)

In [13]:
#### Abandoning comprehensive for-loop for sake of year matching; manually stringifying coords instead

# converting all fields to strings for ease of comparison
#for field in list(df.columns):
#    df[field] = df[field].astype(str)
spatial_coords = ['input_x_coord', 'input_y_coord', 'tgaz_x_coord', 'tgaz_y_coord']

# converting the given field's values to a numeric, or NaN, for possible rounding later
try: 
    for field in spatial_coords:
        print("in the for loop")
        if (field in incoming_fields) or (field in target_fields):
            print("converting %s" % field)
            df[field] = pd.to_numeric(df[field], errors='coerce')
    if ('input_x_coord' in incoming_fields) and ('input_y_coord' in incoming_fields):
        coord_mode = coordinate_matcher(['x', 'y'], df, output_fields)
    elif ('input_x_coord' in incoming_fields) and not ('input_y_coord' in incoming_fields):
        coord_mode = coordinate_matcher(['x'], df, output_fields)
    elif not ('input_x_coord' in incoming_fields) and ('input_y_coord' in incoming_fields):
        coord_mode = coordinate_matcher(['y'], df, output_fields)
    else:
        print("\nNo spatial coordinate fields available for matching.")
except KeyError:
    print("Spatial coordinates not properly entered. Skipping coordinate matching.")

in the for loop
converting input_x_coord
in the for loop
converting input_y_coord
in the for loop
in the for loop

    Please indicate, by entering a numerical digit 1-2, whether you wish to do a strict or fuzzy match of spatial coordinates:
        1. Strict matching (requires exact match, e.g. '119.64656' does NOT match '119.646560', '119.64657', or '119.325')
        2. Fuzzy matching (requires only that rounded numbers match -- you specify the number of decimal places)
        
2
Proceeding with fuzzy matching of spatial coordinates.
Please enter a number indicating the number of decimal places to which to round the decimal coordinate value.
                      For example, if you enter 0, 117.91 and 118.08 will round to 118 and match; if you enter 1, 117.91 will round to 117.9 and 118.08 will round to 118.1, and they won't match.
                      Be advised that coordinates rarely have more than 7 decimal places of precision.
                  
2


In [14]:
############## NEW ################
#if (df['input_year_beg'].empty() | df['input_year_end'].empty()):
#    if (df['input_year_beg'].empty()):
#        df.drop(['input_year_beg'])
#        input_fields.remove('input_year_beg')
#        beg_year_matching = False
#    if (df['input_year_end'].empty()):
#        df.drop(['input_year_end'])
#        input_fields.remove('input_year_end')
#        end_year_matching = False

if (not 'input_year_beg' in incoming_fields) or (not 'input_year_end' in incoming_fields):
    print("Incoming data lacks a beginning and/or ending year field; no date comparisons will be made.")
else:
    # beg_year_matching, end_year_matching = True, True
    
    year_fields = ['input_year_beg', 'input_year_end', 'tgaz_beg', 'tgaz_end']
    
    #for field in year_fields:
    #    print(df[field].dtype)
           
    # overlap field initialization
    df['out_year_overlap'] = ''

           
    # converting to type float64 (invalid years become 'NaN', and pandas.isnull(<Series_(column)>) returns True for those values
    for year_field in year_fields:
        df[year_field] = pd.to_numeric(df[year_field], errors='coerce')
    
    # in-row match testing
    df['out_beg_match'] = df['input_year_beg'] == df['tgaz_beg']
    df['out_end_match'] = df['input_year_end'] == df['tgaz_end']
    
    # testing for timespan relationships
    df['out_year_overlap'][((df['input_year_beg'] == (df['tgaz_end'] + 1)) | (df['input_year_end'] == (df['tgaz_beg'] - 1)))] = 'adjacent'
    df['out_year_overlap'][(df['input_year_beg'] <= df['tgaz_beg']) & (df['input_year_end'] >= df['tgaz_beg']) & (df['input_year_end'] < df['tgaz_end'])] = 'partial_incl_start_of_target'
    df['out_year_overlap'][(df['input_year_beg'] > df['tgaz_beg']) & (df['input_year_beg'] <= df['tgaz_end']) & (df['input_year_end'] >= df['tgaz_end'])] = 'partial_incl_end_of_target'
    df['out_year_overlap'][((df['input_year_beg'] >= df['tgaz_beg']) & (df['input_year_end'] < df['tgaz_end'])) | ((df['input_year_beg'] > df['tgaz_beg']) & (df['input_year_end'] <= df['tgaz_end']))] = 'incoming_nested_in_target'
    df['out_year_overlap'][((df['input_year_beg'] <= df['tgaz_beg']) & (df['input_year_end'] > df['tgaz_end']) | (df['input_year_beg'] < df['tgaz_beg']) & (df['input_year_end'] >= df['tgaz_end']))] = 'target_nested_in_incoming'
    df['out_year_overlap'][(df['input_year_beg'] == df['tgaz_beg']) & (df['input_year_end'] == df['tgaz_end'])] = 'perfect_match'
    df['out_year_overlap'][(df['input_year_beg'] == 0) | (df['input_year_end'] == 0) | (df['tgaz_beg'] == 0) | (df['tgaz_end'] == 0)] = 'CAUTION__ZEROES'
    df['out_year_overlap'][(df['input_year_beg'] > df['input_year_end']) | (df['tgaz_beg'] > df['tgaz_end'])] = 'ERROR__END_BEFORE_BEG'
    
    # designating items with non-numeric text values in at least one year field (which therefore break the overlap checker)
    df['out_year_overlap'][(df['match'] == 'found') & (pd.isnull(df['input_year_beg']) | pd.isnull(df['input_year_end']) | pd.isnull(df['tgaz_beg']) | pd.isnull(df['tgaz_end']))] = 'ERROR__NON_NUMERIC_YEAR_VALUE'
    
    # updating output fields
    output_fields += ['out_beg_match'] + ['out_end_match'] + ['out_year_overlap']

In [15]:
# adding the 'match_strength' column, leveraging the fact that Python True and False evaluate to 1 and 0 respectively when passed to int()
df['out_content_match_strength'] = 0
output_fields += ['out_content_match_strength']

if ('input_x_coord' in incoming_fields):
    if coord_mode == "strict":
        df['out_content_match_strength'] += df['out_x_coord_match'].astype(int)
    else: 
        df['out_content_match_strength'] += df['fuzzy_out_x_coord_match'].astype(int)

if ('input_y_coord' in incoming_fields):
    if coord_mode == "strict":
        df['out_content_match_strength'] += df['out_y_coord_match'].astype(int)
    else: 
        df['out_content_match_strength'] += df['fuzzy_out_y_coord_match'].astype(int)
    
if ('input_year_beg' in incoming_fields):
    df['out_content_match_strength'] += df['out_beg_match'].astype(int) 
    
if ('input_year_end' in incoming_fields):
    df['out_content_match_strength'] += df['out_end_match'].astype(int)

In [16]:
# sorting columns
#print(output_fields)
ordered_fields = target_fields + incoming_fields + ['match'] + output_fields
#print(ordered_fields)
fuzzy_ordered_fields = ['fuzzy_nm'] + ordered_fields

if name_mode == 'strict':
    df = df[ordered_fields]
else:
    df = df[fuzzy_ordered_fields]

In [17]:
# replacing 'nan' with '' for improved legibility
df = df.replace('nan', '')

In [18]:
# outputting results
print('\nData check is complete.  Please type a name (without extension) for your output files:\n')
output_path = input()

# TODO:  VALIDATE FILE NAME

print("Thank you. Saving results.")  

# writing the file to the specified output path while dropping the unlabeled index column that pandas DataFrames generate by default
df.to_csv("%s.csv" % output_path, index=False)


print("\nData output to %s.csv \nMatching info and summary of results output to %s.info.txt \nNow exiting" % (output_path, output_path))



Data check is complete.  Please type a name (without extension) for your output files:

160925test3
Thank you. Saving results.

Data output to 160925test3.csv 
Matching info and summary of results output to 160925test3.info.txt 
Now exiting
