In [1]:
import json # used for managing the JSON files from API
from urllib.request import urlopen # fetch URl of API
from pprint import pprint # just for printing values for human use
from collections import Counter # used for determining duplicate names

In [2]:
# Fetches both API payloads up front; this only needs to happen once per run session.
# Each response is opened in a `with` block so the HTTP connection is closed as soon
# as its JSON body has been parsed, rather than being left open.

# Loads the papers to json
with urlopen("http://dirac.nist.gov/adsorption.nist.gov/isodb/api/minimalbiblio.json") as papers_response:
    papers = json.load(papers_response)

# Loads the authors (and their IDs) to json
with urlopen("http://dirac.nist.gov/adsorption.nist.gov/isodb/api/authors.json") as authors_response:
    authors = json.load(authors_response)

In [3]:
# Groups the author records by family name: maps each last name to a list of
# (first_name, author_id) tuples, one tuple per author record with that last name.

authors_names = {}

for person in authors:
    # A missing/empty given name is normalised to "" so later cells can test for it.
    first_name = person["given_name"] or ""

    # Named `author_id` rather than `id` so the builtin id() is not shadowed
    # for the remainder of the kernel session.
    author_id = person["author_id"]
    last_name = person["family_name"]

    # setdefault creates the list on first sight of a last name, then appends either way.
    authors_names.setdefault(last_name, []).append((first_name, author_id))

In [4]:
# Orders each last name's list of (first name, id) entries in place so that
# similar/identical first names end up adjacent. This is purely for human viewing;
# every list derived from authors_names afterwards inherits the same ordering.

for same_surname_people in authors_names.values():
    same_surname_people.sort()

In [5]:
# Generates a dictionary of authors whose first names are just initials or who have
# no first name at all. Keys are (last_name, (first_name, author_id)); each value is
# the list of same-last-name authors who could be the same person.

unknown_authors = {}
for last_name, first_names in authors_names.items():          # Iterates through last names
    for person in first_names:                                # person is a (first_name, author_id) tuple
        given = person[0]
        # Empty first name, or a single initial such as "J."
        if given == "" or ("." in given and len(given) <= 2):
            unknown_authors[(last_name, person)] = []         # Filled in by the matching pass below

# As of 2019/06/05, there are 6 people without first names


# Pairs the unknown authors with possible matches
for unknown, matches in unknown_authors.items():
    unknown_last, (unknown_given, unknown_id) = unknown
    for person in authors_names[unknown_last]:
        if person[1] == unknown_id:
            continue  # an author is never a match for themselves

        if unknown_given == "":
            # Case of empty first name unknown, anyone could match
            matches.append(person)
        elif person[0].startswith(unknown_given[0]):
            # Only matches with same first letter. startswith() is used instead of
            # person[0][0] so that a candidate with an empty first name is simply
            # skipped rather than raising IndexError.
            matches.append(person)

# This finishes the processing for unknown authors (first initial only or no first name)

In [6]:
# Finds full names that occur more than once: collects (first_name, last_name)
# for every complete first name shared by two or more records under one last name.

duplicate_author_names = []

for family_name, people in authors_names.items():
    # A last name with a single associated person cannot contain a duplicate.
    if len(people) < 2:
        continue

    # Tally only complete first names; initials and empty names are the
    # "unknown" case handled separately.
    given_name_counts = Counter(
        given for given, _ in people if len(given) >= 2 and "." not in given
    )

    # Record each first name seen more than once under this last name.
    duplicate_author_names.extend(
        (given, family_name)
        for given, occurrences in given_name_counts.items()
        if occurrences > 1
    )

In [7]:
# Prints all authors

# pprint(authors_names)

In [8]:
# Printing matches for authors with no first name

# for unknown, matches in unknown_authors.items():
#     if unknown[1][0] == "":
#         print(unknown[0])
#         pprint(matches)

In [9]:
# Printing possible matches with authors who had first initials

# for unknown, matches in unknown_authors.items():
#     if unknown[1][0] != "" and matches:
#         print(unknown[1][0] + " " + unknown[0] + " " + unknown[1][1])
#         pprint(matches)

In [10]:
# Shows a short sample (the first four entries) of duplicate_author_names.
# Pass the whole list to pprint instead of the slice to see every duplicate.

duplicates_preview = duplicate_author_names[:4]
pprint(duplicates_preview)

[('Asim', 'Bhaumik'),
 ('David', 'Britt'),
 ('Cantwell', 'Carson'),
 ('Gilberto', 'de Sá')]


In [11]:
# Writing all of the unknown authors to file
# UTF-8 is requested explicitly: author names can contain non-ASCII characters, and
# the platform's default text encoding is not guaranteed to be able to encode them.

with open("./stored_authors/unknown_authors.txt", "w", encoding="utf-8") as unknowns_file:
    pprint(unknown_authors, stream=unknowns_file)

In [13]:
# Writing all of the duplicate authors to file
# UTF-8 is requested explicitly: author names can contain non-ASCII characters, and
# the platform's default text encoding is not guaranteed to be able to encode them.

with open("./stored_authors/duplicate_authors.txt", "w", encoding="utf-8") as duplicates_file:
    pprint(duplicate_author_names, stream=duplicates_file)

Concludes processing for identifying unknown and duplicate authors.