# In [None]:
import funcy
import os
import pandas as pd
from fuzzywuzzy import fuzz

# Small sample of name strings for manual experiments. The second entry
# appears to hold two names run together on a single line.
test_lst = [
    'BENDER GENEVIEVE I',
    'BENDER LOUIS J JR BENDLIN JUDITH E',
    'BENEDICT JEAN D',
    'BENEDICT MYRTLE M',
    'BENHAM WILLIAM F',
]

# To import these functions from another module, first convert the Jupyter
# notebook to a script:
#     jupyter nbconvert --to script

# Words that is_suffix_initial() accepts as a name suffix or title.
SUFFIXES = [
    'JR', 'SR', 'I', 'II', 'IV', 'III', 'DR', 'V', 'PROF',
]
# Extra words that is_suffix_initial() also accepts
# (presumably mis-typed or mis-read suffixes and name prefixes -- confirm).
PROBLEM_SUFFIXES = [
    '8', 'OR', 'DR', 'MC', 'LA', 'FR', '1', '11', '111', 'ANN',
]
# Default directory that the yearly workbooks are read from and written to.
path = '/Users/lrraymond13/MIT/Azoulay_RA_2016/Data'


def is_suffix_initial(wrd):
    """Return True if ``wrd`` is a one-character word or a listed suffix.

    A word counts as a suffix when it appears in ``SUFFIXES`` or
    ``PROBLEM_SUFFIXES``; any single character counts as an initial.
    """
    # any one-character word is treated as an initial
    if len(wrd) == 1:
        return True
    return wrd in SUFFIXES or wrd in PROBLEM_SUFFIXES


def clean_name_string(raw_str):
    """Normalise a raw name string.

    The string is split on single spaces; periods and commas are removed
    from every word, each word is upper-cased and stripped of surrounding
    whitespace, and the words are re-joined with exactly one space.

    Words that end up empty (runs of spaces, or tokens that consisted only
    of punctuation such as a free-standing ',') are dropped, so the result
    never contains leading, trailing or doubled spaces.

    Args:
        raw_str: the raw name text (str or unicode).

    Returns:
        The cleaned, upper-cased string.
    """
    cleaned_wrds = []
    for wrd in raw_str.split(' '):
        # remove periods, commas; map to uppercase
        wrd = wrd.replace('.', '').replace(',', '').upper().strip()
        # filter AFTER cleaning: a token such as ',' is non-empty before the
        # punctuation is removed but empty afterwards
        if wrd:
            cleaned_wrds.append(wrd)
    # convert back into a string
    return ' '.join(cleaned_wrds)


def has_unicode_char(raw_str):
    """Return True if ``raw_str`` cannot be encoded as strict ASCII.

    Args:
        raw_str: the text to test.

    Returns:
        False when a strict ASCII encode succeeds, True when it fails.
    """
    try:
        raw_str.encode('ascii', errors='strict')
    except UnicodeError:
        # UnicodeError covers UnicodeEncodeError and also the
        # UnicodeDecodeError that Python 2 raises when .encode() is called
        # on a byte string holding non-ASCII bytes (implicit ASCII decode).
        return True
    return False


def is_long_middle_name(wrd, last_name):
    """Return True if ``wrd`` and ``last_name`` start almost alike.

    Only the first three characters of each argument are compared; the
    result is True when their ``fuzz.ratio`` score is above 80.  Callers in
    this file pass the word *after* a candidate middle name: a match means
    that word looks like the start of the next name, so the candidate is
    taken to be a long (non-initial) middle name.
    """
    prefix_len = 3
    similarity = fuzz.ratio(wrd[:prefix_len], last_name[:prefix_len])
    return similarity > 80


def parse_multi_names(raw_str):
    """Split a space-separated run of words into individual names.

    Each name starts with two words.  Following words are added to the name
    while they are suffixes/initials (``is_suffix_initial``).  One further
    word is added as a long middle name when the word after it resembles
    the first word of the current name (``is_long_middle_name``); the name
    then ends.  The input string and a progress message are printed.

    Args:
        raw_str: words separated by single spaces.

    Returns:
        A list of name strings, in order of appearance.
    """
    wrds = raw_str.split(' ')
    total = len(wrds)
    names_list = []
    print(raw_str)
    start = 0
    while start < total:
        first_wrd = wrds[start]
        # every name takes at least the next two words
        end = start + 2
        name = wrds[start:end]
        while end < total:
            candidate = wrds[end]
            if is_suffix_initial(candidate):
                # suffix or initial: keep it and look at the following word
                name.append(candidate)
                end += 1
                continue
            if end + 1 < total and is_long_middle_name(wrds[end + 1], first_wrd):
                print('checking for long middle name')
                # the word after the candidate looks like the next name's
                # first word, so the candidate is probably a long middle name
                name.append(candidate)
                end += 1
            # either way the current name is finished
            break
        names_list.append(' '.join(name))
        start = end
    return names_list


def process_list_names(index, lst_names):
    """Take one name (or group of names) off the line at ``index``.

    Args:
        index: position in ``lst_names`` of the line to process.
        lst_names: list of strings, each holding space-separated words.

    Returns:
        A 3-tuple ``(flag, remaining, name)``:

        * ``flag`` -- True when the line was taken unchanged as one name,
          False when words were borrowed from the next line or the line
          was handed to ``parse_multi_names``.
        * ``remaining`` -- the lines that still have to be processed.
        * ``name`` -- a name string, or a list of name strings when the
          line was handed to ``parse_multi_names``.
    """
    current = lst_names[index]
    wrds = current.split(' ')
    rest = lst_names[index+1:]
    if index == len(lst_names) - 1 and len(wrds) <= 3:
        # last line in the list with at most 3 words: take it as one name
        return True, [], current
    if len(wrds) in (3, 4) and is_suffix_initial(wrds[-1]):
        # something like ANDERSON ELIZABETH P or ANDERSON ELIZABETH P DR
        return True, rest, current
    if len(wrds) != 2:
        # not a recognised single-name shape; parse the line into names
        names = parse_multi_names(current)
        return False, rest, names

    # Two words: the name may continue on the next line.  A next line
    # always exists here, because a two-word last line returned above.
    next_wrds = lst_names[index+1].split(' ')
    # guard every look at next_wrds[1]: the next line may be a single word
    has_second = len(next_wrds) > 1
    if has_second and is_suffix_initial(next_wrds[1]) \
            and is_suffix_initial(next_wrds[0]):
        borrowed = 2
    elif is_suffix_initial(next_wrds[0]):
        print('checking for plus one suffix')
        borrowed = 1
    elif has_second and is_long_middle_name(next_wrds[1], wrds[0]):
        # long middle name (not an initial) followed by the next name
        borrowed = 1
    else:
        # Nothing on the next line belongs to this name, so the next line
        # must stay in the list to be processed on its own.
        return True, rest, current

    leftover = ' '.join(next_wrds[borrowed:])
    remaining = lst_names[index+2:]
    if leftover:
        # only push the rest of the next line back if something is left
        remaining = [leftover] + remaining
    return False, remaining, ' '.join(wrds + next_wrds[:borrowed])


def separate_names(multi_names_df):
    """Build a frame with one row per name found in multi-name cells.

    Every value of the ``'clean'`` column is split on newlines and fed to
    ``process_list_names`` until no lines are left.  The output has three
    columns: ``'name'`` (the full cell text the name came from),
    ``'clean'`` (the single extracted name) and ``'cleaned_name_flag'``
    (always 1).

    Args:
        multi_names_df: DataFrame with a ``'clean'`` column.

    Returns:
        The per-cell frames concatenated row-wise.
    """
    frames = []
    for cell_text in multi_names_df['clean'].values:
        pending = cell_text.split('\n')
        extracted = []
        while pending:
            # the first element (is-name flag) is not needed here
            _, pending, found = process_list_names(0, pending)
            if len(found) > 0:
                extracted.append(found)
        # found may be a string or a list of strings; flatten to one level
        flat_names = funcy.flatten(extracted)
        n_names = len(flat_names)
        frames.append(pd.DataFrame({
            'name': [cell_text] * n_names,
            'clean': flat_names,
            'cleaned_name_flag': [1] * n_names}))
    return pd.concat(frames, axis=0)


def has_multi_names(raw_str):
    """Return True if ``raw_str`` looks like it holds more than one name.

    Rules, in order:
      * any newline            -> True
      * more than 5 words      -> True
      * 4 or 5 words           -> True unless the last word is a
                                  suffix/initial (``is_suffix_initial``)
      * 3 words or fewer       -> False

    Words are the pieces obtained by splitting on a single space.
    """
    # a newline always separates names
    if '\n' in raw_str:
        return True
    wrds = raw_str.split(' ')
    n_wrds = len(wrds)
    if n_wrds > 5:
        return True
    if n_wrds > 3:
        # 4-5 words are one name only when they end in a suffix/initial
        return not is_suffix_initial(wrds[-1])
    return False


def process_sheet(sheet_df_obj):
    """Clean the names of one sheet and expand multi-name rows.

    A ``'clean'`` column (``clean_name_string`` of ``'name'``) is added to
    ``sheet_df_obj`` in place.  Rows whose cleaned text is flagged by
    ``has_multi_names`` are replaced by the rows ``separate_names``
    produces for them; all other rows are kept as they are.

    Args:
        sheet_df_obj: DataFrame with a ``'name'`` column.

    Returns:
        A new DataFrame: the single-name rows, followed by the separated
        rows when there were any multi-name rows.
    """
    sheet_df_obj['clean'] = sheet_df_obj['name'].apply(clean_name_string)
    # then find rows with multiple names in one row
    multi_row_mask = sheet_df_obj['clean'].apply(has_multi_names)
    multi_rows = sheet_df_obj[multi_row_mask]
    # copy so the returned frame is independent of sheet_df_obj; the caller
    # in this file assigns a new column to the result
    single_names = sheet_df_obj[~multi_row_mask].copy()
    if multi_rows.shape[0] > 0:
        new_rows = separate_names(multi_rows)
        # pd.concat instead of DataFrame.append, which newer pandas removed
        return pd.concat([single_names, new_rows], axis=0)
    return single_names


def mark_unicode_strs(df):
    """Add a 0/1 ``'unicode_flag'`` column to ``df`` and return ``df``.

    The flag is 1 on rows whose ``'clean'`` value makes
    ``has_unicode_char`` return True and 0 elsewhere.  ``df`` is modified
    in place.
    """
    flagged_rows = df['clean'].apply(has_unicode_char)
    # default every row to 0, then raise the flag on the matching rows
    df['unicode_flag'] = 0
    df.loc[flagged_rows, 'unicode_flag'] = 1
    return df


def process_workbook(year, path, sheets=None):
    """Read ``<path>/<year>_excel.xlsx`` and return its cleaned names.

    Each requested sheet is read as a single headerless ``'name'`` column,
    run through ``process_sheet`` and tagged with its sheet name.  The
    per-sheet frames are concatenated, a ``'year'`` column is added,
    ``'cleaned_name_flag'`` is set to 0 wherever it is missing, and the
    unicode flag is added by ``mark_unicode_strs``.

    Args:
        year: year used to build the file name; must be convertible with
            ``int()``.
        path: directory that contains the workbook.
        sheets: list of sheet names to read.  When empty or None, every
            sheet whose name starts with ``'Sheet'`` is read.

    Returns:
        One DataFrame covering all processed sheets.
    """
    filename = os.path.join(path, '{}_excel.xlsx'.format(str(year)))
    year_df_lst = []
    # open the workbook once (instead of once per sheet) and close it again
    with pd.ExcelFile(filename) as excel_file:
        if not sheets:
            sheets = [name for name in excel_file.sheet_names
                      if name.startswith('Sheet')]
        for sheet_name in sheets:
            print(sheet_name)
            # sheet passed positionally: the keyword is spelled `sheetname`
            # in old pandas releases and `sheet_name` in newer ones
            df = excel_file.parse(sheet_name, header=None, names=['name'])
            new_df = process_sheet(df)
            new_df['sheet'] = sheet_name
            year_df_lst.append(new_df)
    # append all dfs together, add a year column
    year_df = pd.concat(year_df_lst, axis=0)
    year_df['year'] = int(year)
    if 'cleaned_name_flag' in year_df.columns:
        # rows that did not come from separate_names have no flag yet
        year_df['cleaned_name_flag'] = year_df['cleaned_name_flag'].fillna(0)
    else:
        # no sheet had multi-name rows, so the column was never created
        year_df['cleaned_name_flag'] = 0
    # add unicode flag and return
    return mark_unicode_strs(year_df)


def unicode_to_ascii(raw_str):
    """Encode a unicode string to ASCII.

    A strict encode is tried first.  If that fails, the string and the
    error are printed and the string is encoded again with
    ``errors='replace'``, which substitutes ``?`` for every character that
    has no ASCII form.

    Args:
        raw_str: a unicode text string.

    Returns:
        The ASCII-encoded string.

    Raises:
        TypeError: if ``raw_str`` is not a unicode text string.
    """
    try:
        text_type = unicode  # Python 2 text type
    except NameError:
        text_type = str  # no `unicode` builtin: str is the text type
    # explicit check rather than `assert`, which is skipped under -O
    if not isinstance(raw_str, text_type):
        raise TypeError(
            'unicode_to_ascii expects a unicode string, got {}'.format(
                type(raw_str).__name__))
    try:
        return raw_str.encode('ascii', errors='strict')
    except UnicodeEncodeError as e:
        print(u'{} {}'.format(raw_str, e))
        return raw_str.encode('ascii', errors='replace')


def write_out_result(df, path, year):
    """Write ``df`` to ``<path>/<year>_filtered.xlsx`` without the index.

    Rows with a missing ``'name'`` are dropped first, and the ``'clean'``
    and ``'name'`` columns are passed element-wise through
    ``unicode_to_ascii`` before the export.
    """
    # rows without a name are not exported
    named_rows = df.dropna(axis=0, subset=['name'])
    # unicode has to be dealt with before exporting
    text_cols = ['clean', 'name']
    named_rows[text_cols] = named_rows[text_cols].applymap(unicode_to_ascii)
    out_file = os.path.join(path, '{}_filtered.xlsx'.format(str(year)))
    named_rows.to_excel(out_file, index=False)


def process_loop(year, path=path, sheets=None):
    """Process one year's workbook and write the filtered result.

    Runs ``process_workbook(year, path, sheets)`` and hands its result to
    ``write_out_result`` with the same ``path`` and ``year``.  ``path``
    defaults to the module-level ``path``.
    """
    write_out_result(process_workbook(year, path, sheets), path, year)
