In [None]:
import csv
import copy
import openpyxl

In [None]:
def parse_workbook(file, sheet_name='1'):
    """Read one worksheet of an Excel workbook into a dict of row dicts.

    The first row of the sheet supplies the column headers.  Each following
    row becomes a ``{header: cell value}`` dict stored under its 0-based
    position in the sheet, so the first data row has key 1.

    Args:
        file: Passed straight to ``openpyxl.load_workbook``.
        sheet_name: Title of the worksheet to read.  Defaults to ``'1'``.

    Returns:
        dict mapping row number -> ``{header: value}`` for every data row.
        A sheet without rows yields an empty dict.

    Raises:
        KeyError: raised by openpyxl when the workbook has no worksheet
            titled ``sheet_name``.
    """
    print(file)  # echo which workbook is being loaded
    wb = openpyxl.load_workbook(file)
    current_sheet = wb[sheet_name]
    sheet_dict = dict()
    keys = []
    for num, row in enumerate(current_sheet.iter_rows()):
        if num == 0:
            # A blank header cell falls back to its 0-based column index so
            # every blank column gets its own key.  Falling back to the row
            # number (always 0 here) made all blank-header columns collide
            # and overwrite each other in the row dicts.
            keys = [cell.value or col for col, cell in enumerate(row)]
            continue
        sheet_dict[num] = {key: cell.value for key, cell in zip(keys, row)}
    return sheet_dict

In [None]:
# Parse the three report workbooks into dicts of row dicts keyed by row number.
# parse_workbook prints each file name as it is loaded.
ediss = parse_workbook('ListBibliographyReport-EDISS-ETHESIS.xlsx')
ediss_url = parse_workbook('URLreports-EDISS-ETHESIS.xlsx')
diss = parse_workbook('ListBibliographyReport-DISS-THESIS.xlsx')

In [None]:
# Example catalog report items (first row of each workbook parsed above)

# print('ediss')
# for row, value_dict in ediss.items():
#     print(row, value_dict)
#     break
# print('ediss_url')
# for row, value_dict in ediss_url.items():
#     print(row, value_dict)
#     break
# print('diss')
# for row, value_dict in diss.items():
#     print(row, value_dict)
#     break

In [None]:
# Removes duplicate rows if they are identical -- complains otherwise

def remove_duplicate_rows(workbook):
    """Collapse rows that share a 'flexkey' into one entry per flexkey.

    The first row seen for each flexkey is kept (the same dict object, not a
    copy).  Later rows with that flexkey are dropped; before dropping, each
    of their fields is compared with the kept row and any difference is
    printed, except for the item-level headers listed below.

    Args:
        workbook: dict mapping any key -> row dict.

    Returns:
        dict mapping flexkey -> the first row dict seen with that flexkey.
        Rows lacking a 'flexkey' entry are grouped under the key None.
    """
    # Headers that are allowed to differ between rows sharing a flexkey.
    ignored_headers = (
        'callnum', 'itemid', 'homeloca', 'currentloca', 'itemtype', 'library', 'Staff', 'permortemp',
        'Suffix', 'Public', 'category3', 'category2', 'category1', 'CircNote',
    )

    print('starting length: {}'.format(len(workbook)))
    squashed_workbook = {}
    for row in workbook.values():
        flex_key = row.get('flexkey')
        if flex_key not in squashed_workbook:
            squashed_workbook[flex_key] = row
            continue

        # Duplicate flexkey: report every non-ignored field that disagrees
        # with the row already kept.
        first_seen = squashed_workbook[flex_key]
        for header, field_value in row.items():
            if header in ignored_headers:
                continue
            if first_seen[header] != field_value:
                print('mismatched duplicated flexkeys {} {}'.format(flex_key, header))
    print('ending length: {}'.format(len(squashed_workbook)))
    return squashed_workbook

In [None]:
# Collapse URL-report rows sharing a flexkey; the result is keyed by flexkey
# instead of row number.
ediss_url = remove_duplicate_rows(ediss_url)

In [None]:
# Collapse bibliography-report rows sharing a flexkey; the result is keyed by
# flexkey instead of row number.
ediss = remove_duplicate_rows(ediss)

In [None]:
# Ensure no headers in common between two dicts before we merge them

for flexkey, item_dict in ediss_url.items():
    # A flexkey that exists only in the URL report has no bibliographic row
    # to clash with.  Defaulting to an empty dict avoids the TypeError that
    # `header in None` raised for such rows.
    bib_row = ediss.get(flexkey, {})
    for header, value in item_dict.items():
        if header in ('flexkey', ):
            continue
        if header in bib_row:
            print('duplicate headers {} {}'.format(flexkey, header))

In [None]:
# merge the two dicts

# Reassigning existing keys while iterating is safe: the dict's size never changes.
for flexkey, item_dict in ediss.items():
    # A bibliographic row without a matching URL-report row keeps its own
    # fields unchanged instead of raising KeyError.
    ediss[flexkey] = {**item_dict, **ediss_url.get(flexkey, {})}

In [None]:
# Called only for its printed mismatch report: the return value is discarded,
# so `diss` keeps every row, including those that share a flexkey.
remove_duplicate_rows(diss)

print('take home message:  there are a few different items with the same flexkey')

In [None]:
def merge_dicts(*dicts):
    """Merge several dicts of record dicts into one dict keyed by record id.

    Each record is keyed by its 'itemid' value, or by its 'urn' value when
    the record has no 'itemid' key.  Records whose id is empty or None are
    kept under a running integer counter (0, 1, 2, ...).  When an id has
    already been stored, the later record is printed and skipped, so the
    first record seen for an id wins.

    Args:
        *dicts: dicts whose values are record dicts; their own keys are
            ignored.

    Returns:
        dict mapping id (or counter) -> record dict (same objects, not copies).

    Raises:
        KeyError: if a record has neither an 'itemid' nor a 'urn' key.
    """
    counter = 0
    merged_dict = dict()
    for d in dicts:
        for i in d.values():
            try:
                itemid = i['itemid']
            except KeyError:
                itemid = i['urn']
            if not itemid:
                merged_dict[counter] = i
                counter += 1
            elif itemid in merged_dict:
                # Skip only this duplicate.  The previous `break` abandoned
                # the rest of the current input dict, silently dropping every
                # record that followed the first duplicate.
                print(i)
                print('duplicate {}'.format(itemid))
            else:
                merged_dict[itemid] = i
    return merged_dict

In [None]:
# Combine the catalog reports (ediss rows first, then diss) into one dict keyed
# by each row's 'itemid', or by 'urn' when a row has no 'itemid' key.
merged_catalog = merge_dicts(ediss, diss)

In [None]:
# Example merged_catalog item

# for k, v in merged_catalog.items():
#     for label, value in v.items():
#         print("{}***  {}".format(label, value))
#     break

In [None]:
# Parse the exports stored under DigCommExports/.  The historical
# dissertations/theses export is left disabled.
digcomm_diss = parse_workbook('DigCommExports/gradschool_dissertations_1.xls_Fri_Mar_23_13_38_07_2018part_1.xlsx')
# digcomm_historical = parse_workbook('DigCommExports/gradschool_disstheses_1.xls_Mon_Feb_26_11_28_15_2018part_1.xlsx')
digcomm_majorp = parse_workbook('DigCommExports/gradschool_majorpapers_1.xls_Mon_Feb_26_11_25_49_2018part_1.xlsx')
digcomm_thess = parse_workbook('DigCommExports/gradschool_theses_1.xls_Mon_Apr_30_09_40_49_2018part_1.xlsx')

In [None]:
# Combine the three exports into one dict.
# NOTE(review): presumably keyed by 'urn' (these rows would need to lack an
# 'itemid' column), since later cells look records up by urn -- confirm.
merged_digcomm = merge_dicts(digcomm_majorp, digcomm_thess, digcomm_diss )

In [None]:
# Example merged_digcomm item

# for k, v in merged_digcomm.items():
#     for label, value in v.items():
#         print("{}***  {}".format(label, value))
#     break

In [None]:
# if urn already in catalog, remove the item from merged_digcomm

# Size before the purge.
print(len(merged_digcomm))
for catalog_item in merged_catalog.values():
    link = catalog_item.get('Subfield u of 856')
    if link:
        # Reduce the search link to the bare urn, then drop that record from
        # merged_digcomm if it is present.
        urn = link.replace('http://digitalcommons.lsu.edu/do/search/?q=', '').replace('/', '')
        merged_digcomm.pop(urn, None)
# Size after the purge.
print(len(merged_digcomm))

In [None]:
# print(len(digcomm_historical))
# for k, v in merged_catalog.items():
#     urn = v.get('Subfield u of 856')
#     if not urn:
#         continue
#     urn = urn.replace('http://digitalcommons.lsu.edu/do/search/?q=', '').replace('/', '')
#     if urn in digcomm_historical:
#         digcomm_historical.pop(urn)
# print(len(digcomm_historical))

In [None]:
def look_for_author_match(author_f, author_m, author_l):
    """Return the merged_catalog records whose author contains this name.

    The name is searched as ``"last, first middle"`` (case-insensitive
    substring match) in each record's 'author' value.  Empty or None first
    and middle names are left out of the search string instead of being
    formatted as the text "None", so authors without a middle name can
    still match.

    Args:
        author_f: First name, or None/''.
        author_m: Middle name, or None/''.
        author_l: Last name.  When it is None/'' no search is attempted.

    Returns:
        list of record dicts taken from the notebook-level ``merged_catalog``.
    """
    if not author_l:
        return []
    given_names = ' '.join(str(part).strip() for part in (author_f, author_m) if part)
    # Built once rather than once per catalog record.  With no given names
    # the search string reduces to "last,".
    needle = '{}, {}'.format(author_l, given_names).strip().lower()
    return [item_dict for item_dict in merged_catalog.values()
            # `or ''` covers records whose 'author' is present but None
            # (an empty spreadsheet cell), which used to crash on .lower().
            if needle in (item_dict.get('author') or '').lower()]

In [None]:
def alnum_string(string):
    """Normalize text to lowercase alphanumerics with two abbreviations.

    Every non-alphanumeric character is dropped and the rest is lowercased
    character by character.  In the result, 'carbon' is then replaced by 'c'
    and 'beta' by 'b', in that order.
    """
    normalized = ''.join(ch.lower() for ch in string if ch.isalnum())
    for word, abbreviation in (('carbon', 'c'), ('beta', 'b')):
        normalized = normalized.replace(word, abbreviation)
    return normalized

In [None]:
def get_info(item_dict):
    """Pull title, author and date fields out of one record dict.

    Missing author fields default to ''.  A field that is present but None
    is returned as None and appears as the text 'None' in the full-author
    string.  Missing 'title' / 'publication_date' come back as None.

    Returns:
        tuple ``(title, full_author, author_last, author_first,
        author_middle, year)`` where ``full_author`` is
        "first middle last suffix" joined by single spaces and ``year`` is
        the raw 'publication_date' value.
    """
    author_first, author_middle, author_last, author_suffix = (
        item_dict.get(field, '')
        for field in ('author1_fname', 'author1_mname', 'author1_lname', 'author1_suffix')
    )
    full_author = '{} {} {} {}'.format(author_first, author_middle, author_last, author_suffix)
    title = item_dict.get('title')
    year = item_dict.get('publication_date')
    return title, full_author, author_last, author_first, author_middle, year

In [None]:
def squash_flatten(string):
    """Reduce text to its lowercase ASCII letters only.

    The literal marker '(Spanish Text)' is removed first; then every
    character other than A-Z / a-z is dropped and the remainder lowercased.

    Args:
        string: Text to normalize.

    Returns:
        str containing only the characters 'a'-'z'.
    """
    # The previous bounds (64 < ord < 90) stopped one short and dropped 'Z'
    # while keeping 'z', so "Zone" and "zone" normalized differently.
    return ''.join(
        ch.lower() for ch in string.replace('(Spanish Text)', '')
        if 'A' <= ch <= 'Z' or 'a' <= ch <= 'z')

In [None]:
def difference_in(str_a, str_b):
    """Return the letters each string has that the other lacks.

    Both strings are normalized with squash_flatten, then compared as
    multisets of characters: each character of one string cancels at most
    one occurrence in the other.

    Args:
        str_a: First text.
        str_b: Second text.

    Returns:
        tuple ``(only_in_a, only_in_b)`` of strings holding the uncancelled
        characters, in their original order.
    """
    list_a = list(squash_flatten(str_a))
    list_b = list(squash_flatten(str_b))
    new_list_a = list_a[:]
    new_list_b = list_b[:]
    for char in list_a:
        try:
            new_list_b.remove(char)
        except ValueError:
            # char has no (remaining) counterpart in b; list.remove signals
            # that with ValueError, the only error expected here.
            pass
    for char in list_b:
        try:
            new_list_a.remove(char)
        except ValueError:
            pass
    return ''.join(new_list_a), ''.join(new_list_b)

In [None]:
def reck(string, author):
    """Normalize a title for comparison.

    Strips the '[electronic resource]' marker and every occurrence of the
    author's name, keeps only the text before the first '/', and passes that
    through squash_flatten.
    """
    without_marker = string.replace('[electronic resource]', '')
    without_author = without_marker.replace(f'{author}', '')
    # Everything from the first '/' onwards is discarded.
    title_part, _, _ = without_author.partition('/')
    return squash_flatten(title_part)

In [None]:
# If the user confirms a match, the record is popped.
# (Sending confirmed non-matches to manual_checked_no_matches is currently disabled.)


def purge_matches(digcomm, catalog, cutoff_year=2014):
    """Interactively remove records from ``digcomm`` that the user confirms
    are already in the catalog.

    Only records whose 'publication_date' year is below ``cutoff_year`` are
    considered.  For each one, catalog records with a matching author are
    looked up via look_for_author_match; both titles are printed and the
    user is asked whether they are the same item.  Answering 'y' pops the
    record from ``digcomm`` and moves on to the next record; any other
    answer moves on to the next candidate.

    Args:
        digcomm: dict mapping urn -> record dict.  Mutated in place.
        catalog: Currently unused; look_for_author_match searches the
            notebook-level ``merged_catalog``.
        cutoff_year: Records published in this year or later are skipped.
            Defaults to 2014.

    Returns:
        None.

    Raises:
        AttributeError: if a record's 'publication_date' has no ``.year``
            (for example when it is None).
    """
    print(len(digcomm))
    # The filtered dict is a separate object, so popping from `digcomm`
    # while iterating over it is safe and no deep copy is needed.
    candidates = {urn: item_dict for urn, item_dict in digcomm.items()
                  if item_dict['publication_date'].year < cutoff_year}
    print(len(candidates))

    # Every candidate pair is decided by the user.  The automatic title
    # comparisons (reck, alnum_string, squash_flatten, difference_in) that
    # used to sit in this loop were commented out and have been removed.
    for urn, item_dict in candidates.items():
        title, _, author_last, author_first, author_middle, _ = get_info(item_dict)
        potential_matches = look_for_author_match(author_first, author_middle, author_last)
        for p in potential_matches:
            print(title)
            print(p.get('title'))
            print('\n')

            response = input('are these the same? (y/n)')
            if response.lower().strip() == 'y':
                # Pop from the dict that was passed in, not from the
                # notebook-level `merged_digcomm` global.
                digcomm.pop(urn)
                break

In [None]:
# Interactive step: prompts on stdin for each candidate title pair and removes
# confirmed matches from merged_digcomm.
purge_matches(merged_digcomm, merged_catalog)

In [None]:
# Record counts after purging: (merged_digcomm, merged_catalog).
len(merged_digcomm), len(merged_catalog)

In [None]:
# Sanity check for reck: the plain title and the catalog-style title
# ("... [electronic resource] / by <author>") should print the same string.
print(reck("""EM Algorithm for Multiple Wideband Source Localization""", 'Kiran Kumar Mada'))
print(reck("""EM algorithm for multiple wideband source localization [electronic resource] / by Kiran Kumar Mada""", 'Kiran Kumar Mada'))

In [None]:
def merged_dict_to_csv(output_filename, source_dict):
    """Write the record dicts in ``source_dict`` to a CSV file.

    The header row is the sorted union of the keys of all records, so
    records with differing columns can be written together; a column a
    record lacks is left empty.  When every record has the same keys the
    output matches what a header taken from the first record would give.

    Args:
        output_filename: Path of the CSV file to create or overwrite.
        source_dict: dict whose values are record dicts; its own keys are
            not written.

    Returns:
        None.  An empty ``source_dict`` produces an empty file.
    """
    # key=str keeps the sort working if integer placeholder headers are
    # mixed in with string headers.
    headers = sorted(
        {header for item_dict in source_dict.values() for header in item_dict},
        key=str)
    # newline='' is what the csv module requires of files it writes to;
    # without it, extra blank lines appear on platforms that translate '\n'.
    with open(output_filename, 'w', newline='') as f:
        if not headers:
            return
        w = csv.DictWriter(f, headers)
        w.writeheader()
        w.writerows(source_dict.values())

In [None]:
# Export the records still left in merged_digcomm after the purge steps above.
merged_dict_to_csv('DigCommPossiblyNotInCatalog.csv', merged_digcomm)