In [1]:
import json # used for managing the JSON files from API
from urllib.request import urlopen # fetch URl of API
from pprint import pprint # just for printing values for human use
from collections import Counter # used for determining duplicate names

In [2]:
# Loads the API calls, only need to do this once per run session

# Loads the authors (and their IDs) from the API's authors.json into `authors`.
# Later cells read the "author_id", "given_name", "middle_name" and
# "family_name" keys of each entry.
# The response is opened in a `with` block so the HTTP connection is closed
# as soon as the JSON has been parsed instead of being left open.
with urlopen("http://dirac.nist.gov/adsorption.nist.gov/isodb/api/authors.json") as response:
    authors = json.load(response)

In [3]:
# Builds an author_id -> [given_name, middle_name, family_name] lookup and
# dumps it to a file, so other scripts can resolve names without loading
# authors.json themselves. The dictionary also lets later cells look up a
# name from an author_id.

authors_ids = {
    entry["author_id"]: [entry["given_name"], entry["middle_name"], entry["family_name"]]
    for entry in authors
}

with open("./stored_authors/authors_ids.txt", "w") as out_file:
    pprint(authors_ids, stream=out_file)

In [4]:
# Generates a dictionary mapping each family name to a list of
# (first_name, author_id) tuples, one tuple per author with that family name.

authors_names = {}

for person in authors:
    # A missing/empty given name is normalised to "" so every first name is a
    # string (the later sort and the initial checks rely on that).
    first_name = person["given_name"] or ""

    # Named `author_id` rather than `id`: assigning to `id` at notebook top
    # level would shadow the builtin for every cell that runs afterwards.
    author_id = person["author_id"]

    # setdefault creates the list the first time a family name is seen.
    authors_names.setdefault(person["family_name"], []).append((first_name, author_id))

In [5]:
# Sorts each family name's (first_name, author_id) list in place so that
# similar/identical first names end up next to each other. This is only for
# human readability; every list derived from authors_names inherits the order.

for name_entries in authors_names.values():
    name_entries.sort()

In [6]:
# Generates a dictionary of authors whose first names are just initials or contain no first name at all.
# Result: unknown_authors maps author_id -> list of [first_name, author_id]
# candidates that share the family name and could be the same person.

unknown_authors = {}
matchless_authors = []

for last_name, first_names in authors_names.items():          # Iterates through last names
    for first_name, author_id in first_names:                 # Iterates through (first name, id) pairs
        is_initial_only = "." in first_name and len(first_name) <= 2
        if is_initial_only or first_name == "":               # First name is an initial only or empty
            unknown_authors[author_id] = []                   # Initializes the list of possible matches

# As of 2019/06/05, there are 6 people without first names


# Pairs the unknown authors with possible matches
for unknown, matches in unknown_authors.items():
    name_record = authors_ids[unknown]
    # authors_ids stores the raw given name; normalise a missing value to ""
    # the same way authors_names does, so the empty-name case below is hit
    # instead of indexing into a non-string.
    unknown_first = name_record[0] or ""
    family_name = name_record[2]

    for first_name, author_id in authors_names[family_name]:
        if author_id == unknown:                              # Never match an author with themselves
            continue
        # Empty first name: anyone with the same family name could match.
        # Otherwise only candidates with the same first letter match. Slicing
        # with [:1] (rather than [0]) keeps a candidate whose own first name
        # is empty from raising IndexError; such a candidate simply does not
        # match an initial.
        if unknown_first == "" or first_name[:1] == unknown_first[:1]:
            matches.append([first_name, author_id])

    if not matches: # If the author has no matches, they're unique, this list is to pop them later
        matchless_authors.append(unknown)

# Write all of the unknowns, even with no matches to a file
with open("./stored_authors/all_unknown_authors.txt", "w") as unknowns_file:
    pprint(unknown_authors, stream = unknowns_file)

# Remove matchless authors from the dictionary since they are unique
for author in matchless_authors:
    unknown_authors.pop(author)

# This finishes the processing for unknown authors (first initial only or no first name)

In [30]:
# Finding duplicate names
# Result: duplicate_authors maps (first_name, last_name) -> list of the
# author_ids that carry exactly that full first name and last name.

duplicate_authors = {}

# Iterates through all authors last names
for last_name, first_names in authors_names.items():
    if len(first_names) < 2: # a last name with only one associated person cannot hold a duplicate
        continue

    # Groups the ids under each full first name for this last name only, so
    # the ids are collected in a single pass instead of rescanning every
    # previously found duplicate on each last name.
    ids_by_first_name = {}
    for first_name, author_id in first_names:
        # Ignores initials / empty names, those are covered by the unknown-authors case
        if len(first_name) >= 2 and "." not in first_name:
            ids_by_first_name.setdefault(first_name, []).append(author_id)

    for first_name, ids in ids_by_first_name.items():
        if len(ids) > 1: # the same first name appears more than once under this last name
            duplicate_authors[(first_name, last_name)] = ids

{('Alírio', 'Rodrigues'): ['04a46020814d276f91dbe2ddbe804d96c7d7c7b9',
                           '584647c9748417e6dddb743141ae895a61de3dce'],
 ('Ana', 'Ribeiro'): ['831da78505f36a7f7f27627e3cf7c7ce70f3c814',
                      '8b80ee24d983f356b3117484ac1a6f9a2e7ce441'],
 ('Andrea', 'Sudik'): ['7c15f2522511eca375135bdf1443eb7337d33eaa',
                       '7f4283ff7fcd0c07c89889b041aff3bfe1d5bca1'],
 ('Asim', 'Bhaumik'): ['0000-0002-4907-7418',
                       'c8579580ef35af4669ea16470ba8f628a7b7b1c0'],
 ('Badie', 'Morsi'): ['8602215936c1eb9465b58b12bc05093ff26c2e90',
                      'f2d062d5c2d48c419af09fd0b0e4ffe169d28306'],
 ('Bidyut', 'Saha'): ['4ff206651a890dafc062f9f7fa51888d77b2bca3',
                      '975de28b0249c30fde13e4d1214d21d5d4ac6763'],
 ('Brian', 'Wiers'): ['1793308439197cb9781a83d1038fd640757bab6f',
                      '5e0d4fbd09e2c2f85ab309f32f016868f28203a3'],
 ('Cantwell', 'Carson'): ['12a2d89604438f78b003f90b3077bcefae8e0b71',
      

In [12]:
# Dumps the current contents of unknown_authors to a file. In notebook order
# the matchless authors have already been popped from the dictionary by the
# time this cell is reached.

with open("./stored_authors/unknown_authors.txt", "w") as out_file:
    pprint(unknown_authors, stream=out_file)

In [34]:
# Dumps the duplicate_authors dictionary ((first_name, last_name) -> ids) to a file

with open("./stored_authors/duplicate_authors.txt", "w") as out_file:
    pprint(duplicate_authors, stream=out_file)

This concludes the processing that identifies unknown and duplicate authors.