In [1]:
from pprint import pprint
import re
from collections import Counter
from copy import deepcopy
import itertools
import csv

In [2]:
with open("./stored_authors/author_url_pairings.txt", encoding="utf8") as pairings_file:
    author_url_pairings = eval(pairings_file.read())
with open("./stored_authors/unknown_authors.txt", encoding="utf8") as unknowns_file:
    unknown_authors = eval(unknowns_file.read())
with open("./stored_authors/duplicate_authors.txt", encoding="utf8") as duplicates_file:
    duplicate_authors = eval(duplicates_file.read())
with open("./stored_authors/authors_ids.txt", encoding="utf8") as authors_ids_file:
    authors_ids = eval(authors_ids_file.read())

In [3]:
# Used for reference when checking for authors in duplicate list

unpacked_duplicates = set()
for author_list in duplicate_authors.values():
    for author in author_list:
        unpacked_duplicates.add(author)
        
sorted_unpacked_duplicates = sorted(list(unpacked_duplicates))

In [4]:
h_likely = [] # Highly likely, unknowns that have matches
possible = [] # Moderately likely, the ones whose possible matches have similar characters
unlikely = [] # Unlikely, few matching characters

# Pairs duplicates
duplicate_pairing = {}
for author, url in author_url_pairings.items():
    if author in sorted_unpacked_duplicates:
        duplicate_pairing[url] = []
        
# Iterates through duplicates and appends pairings for each URL
for author, pair in duplicate_authors.items():
    for pair_id in pair:
        duplicate_pairing[author_url_pairings[pair_id]].append(pair_id)
        
merged_pairing = deepcopy(duplicate_pairing)

# Generates list of all the url IDs for any urls that have them (i.e. urls with "contributions"
url_num_list = []
for url in duplicate_pairing:
    if "contributions" in url:
        url_num_list.append(re.search(r"\d+", url).group(0))

# Generates list for all url IDs that have > 1 urls that contain them
same_num_list = [num for num, count in Counter(url_num_list).items() if count > 1]

# Iterates through nums then urls: longest_url -> longest (most complete name) to set as the new merged key
# merged_entries -> merging of the values of the keys for assigning to the new merged key 
for num in same_num_list:
    longest_url = ""        # Will be used as the key of the new merged entry (longer name is almost always the most detailed name)
    url_list = []           # Keeps track of urls to remove them from merged_pairing 
    merged_entries = []     # Used a value for new merged entry
    
    # Finds values and key for merging values
    for url in duplicate_pairing:
        if num in url:
            url_list.append(url)
            merged_entries += duplicate_pairing[url]
            
            # Assigns longest_url for use as a key
            if len(url) > len(longest_url):
                longest_url = url
    
    # Merges/assigns values import itertools
    for url in url_list:
        if url == longest_url:
            merged_pairing[url] = merged_entries
            
        # Removes other unmerged entries
        else:
            merged_pairing.pop(url)

for url, matches in merged_pairing.items():
    if len(matches) > 1:
        h_likely.append(matches)

In [5]:
# Pairs unknown names (e.g. first initial names)

unknown_pairing = {}
for unknown in unknown_authors:
    unknown_pairing[unknown] = None

for author, pair in unknown_authors.items():

    if author in author_url_pairings:
        author_url = author_url_pairings[author]
    else:
        author_url = "no"

#     print("-----new iteration-----")
#     print(author)
#     print(author_url) # 
#     print("--end of author url--")

    found = "Unsure"
    for pair_id in pair:
        pair_url = author_url_pairings[pair_id]

#         print(pair_url) #

        if "contributions" in author_url and "contributions" in pair_url:
            author_num = re.search(r"\d+", author_url).group(0)
            pair_num = re.search(r"\d+", pair_url).group(0)
            if author_num == pair_num:
                found = pair_id
                h_likely.append([author, pair_id])

#                 print("yes!") #

        elif author_url ==  pair_url:
            found = pair_id
            h_likely.append([author, pair_id])

#             print("yes!") #

    unknown_pairing[author] = found

In [6]:
# for list_thing in h_likely:
#     if len(list_thing) == 2:
#         print(list_thing)

all_h_likely_values = list(itertools.chain(*h_likely))
# pprint(h_first)
h_likely_duplicates = [author for author, count in Counter(all_h_likely_values).items() if count > 1]
# pprint(h_likely_duplicates)
print(len(h_likely_duplicates))
print(len(h_likely))

28
196


In [7]:
unknown_total = len(unknown_pairing)
print("Total: " + str(unknown_total))

unknown_not_unsure = len([author for author, status in unknown_pairing.items() if status != "Unsure"])
print("Not unsure: " + str(unknown_not_unsure))

unknown_unsure = unknown_total - unknown_not_unsure
print("Unsure: " + str(unknown_unsure))


Total: 240
Not unsure: 151
Unsure: 89


In [8]:
duplicate_total = len(merged_pairing)
print("Total: " + str(duplicate_total))

duplicate_matches = len([url for url, matches in merged_pairing.items() if len(matches) > 1])
print("Matches: " + str(duplicate_matches))

duplicate_individual = duplicate_total - duplicate_matches
print("Individuals (unsure): " + str(duplicate_individual))

Total: 104
Matches: 35
Individuals (unsure): 69


In [9]:
overall_total = unknown_total + duplicate_total
print("Overall Total: " + str(overall_total)) 

overall_highly_likely = len(h_likely)
print("Overall highly_likely: " + str(overall_highly_likely))

overall_unsure = overall_total - overall_highly_likely
print("Overall unsure: " + str(overall_unsure))

Overall Total: 344
Overall highly_likely: 196
Overall unsure: 148


In [25]:
# Selects an author as the "root" (the longest/one with the most special characters)

processing_roots = {}
root_names = {}

# Iterates through the lists in h_likely
for matches in h_likely:
    root_name = ""              # Longest/most accurate name to have everything merge into for database
    root_id = ""                # ID of the root author
    to_be_merged = deepcopy(matches) # remember to remove the root from matches
    
#     print(to_be_merged) #
    
    # For every author_id in the matches list
    for author in matches: 
        full_name = authors_ids[author][0] + "_" + authors_ids[author][1] + "_" + authors_ids[author][2]
        
#         pprint(full_name) #
#         print(author) #
        
        if len(author) == 19: # i.e. if the author_id is an ORC ID since ORC IDs are 19 characters while regular ID hashes are 40 characters
            if root_id == "":
                root_id = author
            else: # If there is a different ORC ID already (two different orc id == issue)
                print("***** THESE ARE NOT THE SAME PERSON *****")
            
        if len(full_name) > len(root_name):
            root_name = full_name
            temp_id = author
    
    if root_id == "": 
        root_id = temp_id
    
#     print(root_id) #
    
    to_be_merged.remove(root_id)
    processing_roots[root_id] = to_be_merged
    root_names[root_id] = root_name

# pprint(processing_roots)

with open("./stored_authors/root_names.txt", 'w') as root_out:
    pprint(root_names, stream = root_out)
with open("./stored_authors/mergees.txt", 'w') as mergees_out:
    pprint(processing_roots, stream = mergees_out)
with open("./stored_authors/paired.csv", 'w') as csv_out:
    csv_writer = csv.writer(csv_out)
    csv_writer.writerow(processing_roots.keys())
    csv_writer.writerows(itertools.zip_longest(*processing_roots.values(), fillvalue = ''))

In [11]:
print(len(processing_roots))
print(len(h_likely))

188
196


In [26]:
# pprint(processing_roots)

In [27]:
# pprint(h_likely)

In [28]:
# pprint(merged_pairing)