In [1]:
import json # used for managing the JSON files from API
from urllib.request import urlopen # fetch URl of API
from pprint import pprint # just for printing values for human use
from collections import Counter # used for determining duplicate names

In [2]:
# Fetches the author records from the ISODB API; only needs to run once per session

# `authors` holds the decoded JSON payload. Other cells iterate over it and read
# the "author_id", "given_name", "middle_name", "family_name" and "orc_id" keys.
# The response is opened in a `with` block so the HTTP connection is closed
# as soon as the payload has been parsed instead of being left to the GC.
with urlopen("http://dirac.nist.gov/adsorption.nist.gov/isodb/api/authors.json") as response:
    authors = json.load(response)

In [3]:
# Print every non-empty ORCID iD found in the author records, one per line
for orcid in (entry["orc_id"] for entry in authors if entry["orc_id"] != ""):
    print(orcid)

0000-0002-6496-8411
0000-0002-2005-3877
0000-0001-6682-3040
0000-0001-8081-8723
0000-0001-5192-0016
0000-0002-0197-833X
0000-0003-1770-412X
0000-0002-4907-7418
0000-0003-4843-8776
0000-0002-0583-6435
0000-0002-1328-7376
0000-0003-0931-085X
0000-0001-5404-4728
0000-0003-4838-5678
0000-0002-7412-428X
0000-0003-0345-2697
0000-0001-7998-4492
0000-0002-2691-2986
0000-0001-8576-1914
0000-0003-0860-5488
0000-0002-2816-2875
0000-0002-1894-366X
0000-0002-5013-1194
0000-0002-6845-4354
0000-0002-9523-6918
0000-0001-6082-5862
0000-0002-9664-3697
0000-0001-8224-839X
0000-0001-8155-6489
0000-0002-3608-8003
0000-0001-6501-8875
0000-0002-7660-6768
0000-0001-9627-2331
0000-0001-9555-6009
0000-0001-7586-9841
0000-0001-7792-4322
0000-0002-5115-1488
0000-0001-5905-8336
0000-0002-9359-9889
0000-0003-1288-5451
0000-0001-8379-2098
0000-0002-0528-9814
0000-0003-2749-4269
0000-0003-4757-4741
0000-0002-2980-7997
0000-0002-4662-8448
0000-0002-0284-7147
0000-0003-0967-6560
0000-0002-6018-3641
0000-0002-6260-7727


In [4]:
# Fetches the minimal bibliography (papers) from the ISODB API.
# The response is opened in a `with` block so the HTTP connection is closed
# as soon as the payload has been parsed.
with urlopen("http://dirac.nist.gov/adsorption.nist.gov/isodb/api/minimalbiblio.json") as response:
    papers = json.load(response)

# Writes a pretty-printed copy of the decoded payload to file.
# NOTE: the ./stored_authors directory must already exist; open() will not create it.
with open("./stored_authors/papers.txt", "w") as papers_file:
    pprint(papers, stream=papers_file)

In [5]:
# Builds an author_id -> [given_name, middle_name, family_name] lookup and
# saves it to disk, so other scripts can resolve a name from an author_id
# without having to load authors.json themselves.

authors_ids = {
    record["author_id"]: [record["given_name"], record["middle_name"], record["family_name"]]
    for record in authors
}

# Pretty-printed dump of the lookup table
with open("./stored_authors/authors_ids.txt", "w") as out_file:
    pprint(authors_ids, stream=out_file)

In [6]:
# Groups the authors by family name. Each key of `authors_names` is a family
# name and each value is a list of (given_name, author_id) tuples for the
# authors carrying that family name.

authors_names = {}

for person in authors:
    # A missing/empty given name is stored as "" so that the first element of
    # every tuple is always a string
    first_name = person["given_name"] or ""

    # Deliberately not called `id`: assigning to `id` at cell level would shadow
    # the built-in id() for the rest of the kernel session
    author_id = person["author_id"]
    last_name = person["family_name"]

    # setdefault creates the list the first time a family name is seen
    authors_names.setdefault(last_name, []).append((first_name, author_id))

In [7]:
# Orders each family name's (given_name, author_id) entries in place so that
# similar or identical given names end up next to each other. This is only for
# human viewing; every list derived from authors_names inherits the ordering.

for name_entries in authors_names.values():
    name_entries.sort()

In [8]:
# Generates a dictionary of authors whose first names are just an initial
# (e.g. "J.") or who have no first name at all, and pairs each of them with
# the other authors that share their family name and could be the same person.

unknown_authors = {}      # unknown author_id -> list of candidate author_ids
matchless_authors = []    # unknown author_ids that ended up with no candidates

for first_names in authors_names.values():                    # Iterates through each last name's entries
    for first_name, author_id in first_names:                 # Iterates through (first name, id) tuples
        is_initial_only = "." in first_name and len(first_name) <= 2
        if is_initial_only or first_name == "":               # Checks if first name initial only or empty
            unknown_authors[author_id] = []                   # Initializes the list associated with unknown person's id

# As of 2019/06/05, there are 6 people without first names


# Pairs the unknown authors with possible matches
for unknown, matches in unknown_authors.items():
    # authors_ids stores the raw given name, which may be a falsy non-string
    # value; normalise it to "" the same way authors_names was built so the
    # empty-name branch below is actually taken for such authors.
    unknown_first = authors_ids[unknown][0] or ""
    family_name = authors_ids[unknown][2]

    for candidate_first, candidate_id in authors_names[family_name]:
        if candidate_id == unknown:        # Never match an author with themselves
            continue

        # Case of empty first name unknown, anyone could match
        if unknown_first == "":
            matches.append(candidate_id)

        # Only matches with same first letter. startswith() is used instead of
        # indexing [0] so a candidate with an empty first name is skipped
        # rather than raising IndexError.
        elif candidate_first.startswith(unknown_first[0]):
            matches.append(candidate_id)
    if not matches: # If the author has no matches, they're unique, this list is to pop them later
        matchless_authors.append(unknown)

# Write all of the unknowns, even with no matches to a file
with open("./stored_authors/all_unknown_authors.txt", "w") as unknowns_file:
    pprint(unknown_authors, stream = unknowns_file)

# Remove matchless authors from the dictionary since they are unique
for author in matchless_authors:
    unknown_authors.pop(author)

# This finishes the processing for unknown authors (first initial only or no first name)

In [9]:
# Finding duplicate names: authors that share both a full first name and a last name

duplicate_authors = {}   # (first_name, last_name) -> list of author_ids sharing that name

# Iterates through all authors last names
for last_name, first_names in authors_names.items():
    # Group this last name's author_ids by first name in a single pass. This
    # replaces re-scanning every previously recorded duplicate on each last
    # name, which made the cell quadratic in the number of duplicates.
    ids_by_first_name = {}
    for first_name, author_id in first_names:
        # Ignores unknown names (empty, single character or containing "."),
        # which are covered by the unknown-author processing instead
        if len(first_name) >= 2 and "." not in first_name:
            ids_by_first_name.setdefault(first_name, []).append(author_id)

    # A first name held by more than one author_id under the same last name is a duplicate
    for first_name, ids in ids_by_first_name.items():
        if len(ids) > 1:
            duplicate_authors[(first_name, last_name)] = ids

In [10]:
# Saves the unknown-author dictionary (author_id -> candidate author_ids) to file.
# By this point the authors without any candidate match have already been
# removed; the unfiltered version was written to all_unknown_authors.txt.

with open("./stored_authors/unknown_authors.txt", "w") as out_file:
    pprint(unknown_authors, out_file)

In [11]:
# Saves the duplicate-author dictionary, keyed by (first name, last name), to file

with open("./stored_authors/duplicate_authors.txt", "w") as out_file:
    pprint(duplicate_authors, out_file)

This concludes the processing for identifying unknown and duplicate authors.